feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution
source_extraction.py:
- New _extract_image_metadata(): extracts figcaption text + copyright/credit
per image URL using 3 strategies (figure+figcaption, data-* attributes,
adjacent credit spans)
- ExtractedArticle gets new image_metadata field
- extracted_article_to_meta() includes image_metadata in stored JSON
pipeline.py:
- After auto image selection, check if selected_url is set
- Articles without usable image → status "no_image" (excluded with Telegram notice)
- PipelineStats and summary report include no_image counter
db.py:
- Add "no_image" to articles status CHECK constraint
- Migration: recreates articles table with updated constraint on existing DBs
workflow.py / main.py:
- Map no_image as own UI status with rewrite/close transitions
wordpress.py:
- _upload_featured_media() accepts image_caption param, sends to WP media
- _get_image_meta_for_url() / _build_image_caption() helpers
- _build_attribution_block(): separator + attribution paragraph at article end
(original link, author, Bildnachweis/credit)
- _build_post_content() appends attribution block
telegram_bot.py:
- notify_pipeline_done() shows 🖼️ no-image count
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1963e32ab4
commit
aaac5def27
7 changed files with 381 additions and 10 deletions
|
|
@ -110,7 +110,7 @@ def init_db() -> None:
|
||||||
publish_last_error TEXT,
|
publish_last_error TEXT,
|
||||||
published_to_wp_at TEXT,
|
published_to_wp_at TEXT,
|
||||||
word_count INTEGER DEFAULT 0,
|
word_count INTEGER DEFAULT 0,
|
||||||
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
|
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
|
||||||
meta_json TEXT,
|
meta_json TEXT,
|
||||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
|
@ -181,6 +181,89 @@ def init_db() -> None:
|
||||||
if column not in existing_columns:
|
if column not in existing_columns:
|
||||||
conn.execute(ddl)
|
conn.execute(ddl)
|
||||||
|
|
||||||
|
# Migration: add 'no_image' to the status CHECK constraint if not present.
|
||||||
|
# SQLite cannot modify CHECK constraints in-place, so we recreate the table.
|
||||||
|
table_sql_row = conn.execute(
|
||||||
|
"SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
|
||||||
|
).fetchone()
|
||||||
|
if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
|
||||||
|
conn.executescript(
|
||||||
|
"""
|
||||||
|
PRAGMA foreign_keys=OFF;
|
||||||
|
|
||||||
|
CREATE TABLE articles_v2 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
feed_id INTEGER,
|
||||||
|
source_article_id TEXT,
|
||||||
|
source_hash TEXT,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
source_url TEXT NOT NULL,
|
||||||
|
canonical_url TEXT,
|
||||||
|
published_at TEXT,
|
||||||
|
author TEXT,
|
||||||
|
summary TEXT,
|
||||||
|
content_raw TEXT,
|
||||||
|
content_rewritten TEXT,
|
||||||
|
image_urls_json TEXT,
|
||||||
|
press_contact TEXT,
|
||||||
|
source_name_snapshot TEXT,
|
||||||
|
source_terms_url_snapshot TEXT,
|
||||||
|
source_license_name_snapshot TEXT,
|
||||||
|
legal_checked INTEGER NOT NULL DEFAULT 0,
|
||||||
|
legal_checked_at TEXT,
|
||||||
|
legal_note TEXT,
|
||||||
|
wp_post_id INTEGER,
|
||||||
|
wp_post_url TEXT,
|
||||||
|
publish_attempts INTEGER NOT NULL DEFAULT 0,
|
||||||
|
publish_last_error TEXT,
|
||||||
|
published_to_wp_at TEXT,
|
||||||
|
word_count INTEGER DEFAULT 0,
|
||||||
|
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
|
||||||
|
meta_json TEXT,
|
||||||
|
relevance_score INTEGER,
|
||||||
|
scheduled_publish_at TEXT,
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
|
||||||
|
UNIQUE(source_url)
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO articles_v2 SELECT
|
||||||
|
id, feed_id, source_article_id, source_hash, title, source_url,
|
||||||
|
canonical_url, published_at, author, summary, content_raw,
|
||||||
|
content_rewritten, image_urls_json, press_contact,
|
||||||
|
source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
|
||||||
|
legal_checked, legal_checked_at, legal_note,
|
||||||
|
wp_post_id, wp_post_url, publish_attempts, publish_last_error,
|
||||||
|
published_to_wp_at, word_count, status, meta_json,
|
||||||
|
relevance_score, scheduled_publish_at, created_at, updated_at
|
||||||
|
FROM articles;
|
||||||
|
|
||||||
|
DROP TABLE articles;
|
||||||
|
ALTER TABLE articles_v2 RENAME TO articles;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
|
||||||
|
ON articles(feed_id, source_article_id)
|
||||||
|
WHERE source_article_id IS NOT NULL;
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
|
||||||
|
ON articles(source_hash)
|
||||||
|
WHERE source_hash IS NOT NULL;
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
|
||||||
|
AFTER UPDATE ON articles
|
||||||
|
FOR EACH ROW
|
||||||
|
BEGIN
|
||||||
|
UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
|
||||||
|
END;
|
||||||
|
|
||||||
|
PRAGMA foreign_keys=ON;
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
table_rows = conn.execute(
|
table_rows = conn.execute(
|
||||||
"SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'"
|
"SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'"
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
|
||||||
|
|
@ -123,7 +123,7 @@ class ArticleUpsertRequest(BaseModel):
|
||||||
publish_last_error: str | None = None
|
publish_last_error: str | None = None
|
||||||
published_to_wp_at: str | None = None
|
published_to_wp_at: str | None = None
|
||||||
word_count: int = 0
|
word_count: int = 0
|
||||||
status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
|
status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
|
||||||
meta_json: str | None = None
|
meta_json: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -132,7 +132,7 @@ class IngestionRunRequest(BaseModel):
|
||||||
|
|
||||||
|
|
||||||
class ArticleTransitionRequest(BaseModel):
|
class ArticleTransitionRequest(BaseModel):
|
||||||
target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
|
target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
|
||||||
note: str | None = None
|
note: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,7 @@ class PipelineStats:
|
||||||
rejected: int = 0
|
rejected: int = 0
|
||||||
warnings: int = 0
|
warnings: int = 0
|
||||||
errors: int = 0
|
errors: int = 0
|
||||||
|
no_image: int = 0
|
||||||
rejected_articles: list[dict[str, Any]] = field(default_factory=list)
|
rejected_articles: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
|
||||||
"processed": stats.processed,
|
"processed": stats.processed,
|
||||||
"drafts_created": stats.drafts_created,
|
"drafts_created": stats.drafts_created,
|
||||||
"rejected": stats.rejected,
|
"rejected": stats.rejected,
|
||||||
|
"no_image": stats.no_image,
|
||||||
"warnings": stats.warnings,
|
"warnings": stats.warnings,
|
||||||
"errors": stats.errors,
|
"errors": stats.errors,
|
||||||
}
|
}
|
||||||
|
|
@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
||||||
# Auto-select image
|
# Auto-select image
|
||||||
_auto_select_image(article)
|
_auto_select_image(article)
|
||||||
|
|
||||||
|
# Reload to get updated image_review
|
||||||
|
article = get_article_by_id(article_id) or article
|
||||||
|
|
||||||
|
# Exclude articles without a usable image
|
||||||
|
try:
|
||||||
|
meta = json.loads(article.get("meta_json") or "{}")
|
||||||
|
except Exception:
|
||||||
|
meta = {}
|
||||||
|
has_image = bool((meta.get("image_review") or {}).get("selected_url"))
|
||||||
|
if not has_image:
|
||||||
|
update_article_status(
|
||||||
|
article_id,
|
||||||
|
"no_image",
|
||||||
|
actor="pipeline",
|
||||||
|
note="Kein Bild vorhanden – Artikel ausgeschlossen",
|
||||||
|
)
|
||||||
|
stats.no_image += 1
|
||||||
|
logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
|
||||||
|
try:
|
||||||
|
tg.send_message(
|
||||||
|
f"🖼️ <b>Kein Bild</b> – Artikel #{article_id} ausgeschlossen\n"
|
||||||
|
f"📰 {(article.get('title') or '')[:80]}"
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
# Score relevance
|
# Score relevance
|
||||||
try:
|
try:
|
||||||
relevance = score_article_relevance(article)
|
relevance = score_article_relevance(article)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from html import unescape
|
from html import unescape
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -21,6 +21,7 @@ class ExtractedArticle:
|
||||||
images: list[str]
|
images: list[str]
|
||||||
press_contact: str | None
|
press_contact: str | None
|
||||||
extraction_error: str | None = None
|
extraction_error: str | None = None
|
||||||
|
image_metadata: dict[str, dict] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
def _clean_text(raw: str | None) -> str | None:
|
def _clean_text(raw: str | None) -> str | None:
|
||||||
|
|
@ -197,6 +198,187 @@ def _extract_press_contact(content_text: str | None) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# CSS class keywords that mark a copyright/credit element (used both inside a
# <figcaption> and for elements directly following an <img>).
# NOTE: "photo-credit", "image-credit" and "fotocredit" are already covered by
# the shorter "credit" alternative; they are kept so the pattern text (which is
# spliced into larger expressions via ``.pattern``) stays unchanged.
_CREDIT_CLASS_RE = re.compile(
    r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
    re.IGNORECASE,
)

# Inline text patterns that signal a credit/copyright notice: either a "©"
# followed by up to 100 characters, or a German/English label such as
# "Foto:" / "Credit:" followed by up to 100 characters on the same line.
_CREDIT_TEXT_RE = re.compile(
    r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
    re.IGNORECASE,
)

# data-* attribute names that carry credit/caption information directly on <img>
_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")

# Class keywords for adjacent sibling credit spans/divs after an <img>.
# This was a verbatim copy of _CREDIT_CLASS_RE; aliasing it keeps the name
# available while guaranteeing the two keyword lists cannot drift apart.
_ADJ_CREDIT_CLASS_RE = _CREDIT_CLASS_RE
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_img_src(img_attrs: str, page_url: str) -> str | None:
    """Return the absolute URL of an <img> tag, or None if it has no usable one.

    ``img_attrs`` is the raw attribute text between ``<img`` and ``>``. A real
    ``src`` attribute is preferred; the lazy-loading ``data-src`` is the fallback.
    """
    src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        return None
    try:
        return urljoin(page_url, src_match.group(1).strip())
    except ValueError:
        # urljoin rejects some malformed URLs (e.g. unmatched "[" in the host);
        # skip just this image instead of losing the metadata of all others.
        return None


def _first_data_attr(img_attrs: str, attr_names: tuple[str, ...]) -> str | None:
    """Return the cleaned value of the first attribute in ``attr_names`` present on the tag."""
    for attr in attr_names:
        attr_match = re.search(
            rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
            img_attrs,
            re.IGNORECASE,
        )
        if attr_match:
            return _clean_text(attr_match.group(1))
    return None


def _find_credit_element(fragment: str, class_re: re.Pattern[str]) -> str | None:
    """Return the cleaned content of the first span/p/div whose class matches ``class_re``."""
    elem_match = re.search(
        r"<(?:span|p|div)[^>]*"
        + class_re.pattern
        + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
        fragment,
        re.IGNORECASE,
    )
    if not elem_match:
        return None
    return _clean_text(elem_match.group(1))


def _image_meta_entry(caption: str | None, credit: str | None) -> dict[str, str]:
    """Build a metadata entry holding only the non-empty fields (may be empty)."""
    entry: dict[str, str] = {}
    if caption:
        entry["caption"] = caption
    if credit:
        entry["credit"] = credit
    return entry


def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
    """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.

    Uses three progressive strategies; an image handled by an earlier strategy
    is never overwritten by a later one:
      1. <figure> with <img> + <figcaption>
      2. data-* attributes on <img> tags not already covered
      3. <img> tags whose immediately following HTML contains a credit element

    All strategies resolve the image URL the same way (``src`` first, then
    ``data-src``) so one image is always stored under one key.

    Args:
        html: Raw HTML of the article page.
        page_url: URL of the page, used to resolve relative image URLs.

    Returns:
        Dict keyed by absolute image URL. Each value contains "caption" and/or
        "credit"; images without any metadata are omitted. The function is
        best-effort and does not raise: on an unexpected error it returns what
        was collected up to that point.
    """
    result: dict[str, dict] = {}

    try:
        # ------------------------------------------------------------------
        # Strategy 1: <figure> blocks containing <img> and <figcaption>
        # ------------------------------------------------------------------
        for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
            fig_html = fig_match.group(1)

            # Use the first <img> in the figure that has a resolvable source.
            img_src: str | None = None
            for fig_img_match in re.finditer(r"<img([^>]+)>", fig_html, re.IGNORECASE):
                img_src = _resolve_img_src(fig_img_match.group(1), page_url)
                if img_src:
                    break
            if not img_src:
                continue

            figcap_match = re.search(
                r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
                fig_html,
                re.IGNORECASE,
            )
            if not figcap_match:
                continue
            figcap_html = figcap_match.group(1)

            # Credit: prefer a dedicated element marked by its CSS class ...
            credit = _find_credit_element(figcap_html, _CREDIT_CLASS_RE)

            # ... and fall back to credit-like wording in the plain caption text.
            if not credit:
                figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
                cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
                if cred_text_match:
                    credit = _clean_text(cred_text_match.group(1))

            # Caption is the full figcaption content (it may include the credit).
            caption = _clean_text(figcap_html)

            # Only store entries that carry at least one piece of metadata
            entry = _image_meta_entry(caption, credit)
            if entry:
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 2: data-* attributes on <img> tags
        # ------------------------------------------------------------------
        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
            img_attrs = img_match.group(1)
            img_src = _resolve_img_src(img_attrs, page_url)

            # Skip unusable tags and images already handled by Strategy 1
            if not img_src or img_src in result:
                continue

            entry = _image_meta_entry(
                _first_data_attr(img_attrs, _IMG_DATA_CAPTION_ATTRS),
                _first_data_attr(img_attrs, _IMG_DATA_CREDIT_ATTRS),
            )
            if entry:
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 3: <img> followed within 200 chars by a credit element
        # ------------------------------------------------------------------
        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
            img_src = _resolve_img_src(img_match.group(1), page_url)

            # Skip unusable tags and images already handled by earlier strategies
            if not img_src or img_src in result:
                continue

            # Look at the 200 characters of HTML immediately after the img tag
            after_start = img_match.end()
            after_html = html[after_start : after_start + 200]

            credit = _find_credit_element(after_html, _ADJ_CREDIT_CLASS_RE)
            if credit:
                result[img_src] = {"credit": credit}

    except Exception:
        # Deliberate best-effort boundary: metadata extraction must never break
        # article extraction. Keep the complete entries gathered so far instead
        # of throwing them all away.
        return result

    return result
|
||||||
|
|
||||||
|
|
||||||
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
|
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
|
||||||
try:
|
try:
|
||||||
req = Request(
|
req = Request(
|
||||||
|
|
@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
|
||||||
summary = _clean_text(content_text[:320])
|
summary = _clean_text(content_text[:320])
|
||||||
images = _extract_images(html, url)
|
images = _extract_images(html, url)
|
||||||
press_contact = _extract_press_contact(content_text)
|
press_contact = _extract_press_contact(content_text)
|
||||||
|
image_metadata = _extract_image_metadata(html, url)
|
||||||
|
|
||||||
return ExtractedArticle(
|
return ExtractedArticle(
|
||||||
title=title,
|
title=title,
|
||||||
|
|
@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
|
||||||
images=images,
|
images=images,
|
||||||
press_contact=press_contact,
|
press_contact=press_contact,
|
||||||
extraction_error=None,
|
extraction_error=None,
|
||||||
|
image_metadata=image_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
|
||||||
"images": article.images,
|
"images": article.images,
|
||||||
"press_contact": article.press_contact,
|
"press_contact": article.press_contact,
|
||||||
"extraction_error": article.extraction_error,
|
"extraction_error": article.extraction_error,
|
||||||
|
"image_metadata": article.image_metadata,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
|
||||||
processed = stats.get("processed", 0)
|
processed = stats.get("processed", 0)
|
||||||
drafts = stats.get("drafts_created", 0)
|
drafts = stats.get("drafts_created", 0)
|
||||||
rejected = stats.get("rejected", 0)
|
rejected = stats.get("rejected", 0)
|
||||||
|
no_image = stats.get("no_image", 0)
|
||||||
warnings = stats.get("warnings", 0)
|
warnings = stats.get("warnings", 0)
|
||||||
errors = stats.get("errors", 0)
|
errors = stats.get("errors", 0)
|
||||||
|
|
||||||
|
|
@ -300,6 +301,8 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
|
||||||
]
|
]
|
||||||
if rejected:
|
if rejected:
|
||||||
lines.append(f"🚫 Abgelehnt: {rejected}")
|
lines.append(f"🚫 Abgelehnt: {rejected}")
|
||||||
|
if no_image:
|
||||||
|
lines.append(f"🖼️ Kein Bild: {no_image}")
|
||||||
if warnings:
|
if warnings:
|
||||||
lines.append(f"⚠️ Warnungen: {warnings}")
|
lines.append(f"⚠️ Warnungen: {warnings}")
|
||||||
if errors:
|
if errors:
|
||||||
|
|
|
||||||
|
|
@ -161,6 +161,32 @@ def _guess_filename(image_url: str, content_type: str) -> str:
|
||||||
return stem
|
return stem
|
||||||
|
|
||||||
|
|
||||||
|
def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict:
|
||||||
|
"""Return the caption/credit dict for a specific image URL from extraction metadata."""
|
||||||
|
if not meta_json or not image_url:
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
meta = json.loads(meta_json)
|
||||||
|
image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {}
|
||||||
|
return image_metadata.get(image_url) or {}
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_image_caption(image_meta: dict, source_url: str) -> str:
    """Build a WP caption string from image metadata and source URL.

    Args:
        image_meta: Dict that may contain "caption" and/or "credit". Anything
            that is not a dict is treated as empty metadata.
        source_url: URL of the source article, used for the fallback caption.

    Returns:
        "<caption> | <credit>" with empty parts left out. The credit is not
        appended when the caption already contains it. When neither caption
        nor credit is available, "Quelle: <source_url>" is returned.
    """
    meta = image_meta if isinstance(image_meta, dict) else {}
    # str() guards against non-string values in stored metadata.
    caption = str(meta.get("caption") or "").strip()
    credit = str(meta.get("credit") or "").strip()

    parts = []
    if caption:
        parts.append(caption)
    # The extractor stores the full figcaption text as caption and takes the
    # credit from that same figcaption, so the credit is frequently already
    # part of the caption; appending it again would print it twice.
    if credit and credit.casefold() not in caption.casefold():
        parts.append(credit)
    if not parts:
        parts.append(f"Quelle: {source_url}")
    return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _upload_featured_media(
|
def _upload_featured_media(
|
||||||
*,
|
*,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
|
|
@ -168,6 +194,7 @@ def _upload_featured_media(
|
||||||
image_url: str,
|
image_url: str,
|
||||||
article_title: str,
|
article_title: str,
|
||||||
source_url: str,
|
source_url: str,
|
||||||
|
image_caption: str = "",
|
||||||
) -> int:
|
) -> int:
|
||||||
image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None)
|
image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None)
|
||||||
filename = _guess_filename(image_url, content_type)
|
filename = _guess_filename(image_url, content_type)
|
||||||
|
|
@ -192,7 +219,6 @@ def _upload_featured_media(
|
||||||
if media_id <= 0:
|
if media_id <= 0:
|
||||||
raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}")
|
raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}")
|
||||||
|
|
||||||
# Optional metadata update for traceability.
|
|
||||||
_wp_request(
|
_wp_request(
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
auth_header=auth_header,
|
auth_header=auth_header,
|
||||||
|
|
@ -200,7 +226,7 @@ def _upload_featured_media(
|
||||||
endpoint=f"media/{media_id}",
|
endpoint=f"media/{media_id}",
|
||||||
payload={
|
payload={
|
||||||
"title": f"{article_title[:120]} - Bild",
|
"title": f"{article_title[:120]} - Bild",
|
||||||
"caption": f"Quelle: {source_url}",
|
"caption": image_caption or f"Quelle: {source_url}",
|
||||||
"alt_text": article_title[:200],
|
"alt_text": article_title[:200],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
@ -289,6 +315,45 @@ def _sanitize_publish_text(text: str) -> str:
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _build_attribution_block(article: dict[str, Any]) -> str:
    """Build a WP Gutenberg attribution block for the bottom of the article.

    The block is a wide separator followed by one italic paragraph listing,
    where available: a link to the original article, the author, and the
    credit of the selected image ("Bildnachweis").

    Args:
        article: Article row as a dict. Reads "canonical_url" (preferred) or
            "source_url", "source_name_snapshot", "author" and "meta_json".

    Returns:
        The block markup, prefixed with a newline, or "" when none of the
        three pieces of information is available.
    """
    # Function-scope import so the attribute escaping below does not depend on
    # which ``escape`` the module imports at file level.
    from html import escape

    source_url = (article.get("canonical_url") or article.get("source_url") or "").strip()
    source_name = (article.get("source_name_snapshot") or "").strip()
    author = (article.get("author") or "").strip()

    # Get the credit of the selected image from the extraction metadata.
    credit = ""
    try:
        meta = json.loads(article.get("meta_json") or "{}")
        selected_url = (meta.get("image_review") or {}).get("selected_url") or ""
        if selected_url:
            img_meta = (meta.get("extraction") or {}).get("image_metadata") or {}
            credit = (img_meta.get(selected_url) or {}).get("credit") or ""
    except (AttributeError, TypeError, ValueError):
        # Malformed or unexpectedly shaped metadata: the credit is optional,
        # so publish the attribution without it.
        credit = ""
    credit = str(credit).strip()

    parts: list[str] = []
    if source_url:
        label = source_name or source_url
        # The URL comes from the source feed/page; escape it (including quotes)
        # so it cannot break out of the href attribute.
        href = escape(source_url, quote=True)
        parts.append(f'Originalartikel: <a href="{href}">{escape(label)}</a>')
    if author:
        parts.append(f"Autor: {escape(author)}")
    if credit:
        parts.append(f"Bildnachweis: {escape(credit)}")

    if not parts:
        return ""

    inner = " | ".join(parts)
    return (
        "\n<!-- wp:separator {\"className\":\"is-style-wide\"} -->"
        "<hr class=\"wp-block-separator is-style-wide\"/><!-- /wp:separator -->\n"
        f'<!-- wp:paragraph {{\"className\":\"article-attribution\"}} -->'
        f'<p class="article-attribution"><em>{inner}</em></p>'
        "<!-- /wp:paragraph -->"
    )
|
||||||
|
|
||||||
|
|
||||||
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
|
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
|
||||||
summary = (article.get("summary") or "").strip()
|
summary = (article.get("summary") or "").strip()
|
||||||
body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
|
body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
|
||||||
|
|
@ -300,7 +365,9 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
|
||||||
body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text)
|
body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text)
|
||||||
if not body_html:
|
if not body_html:
|
||||||
body_html = "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"
|
body_html = "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"
|
||||||
content = body_html.strip()
|
|
||||||
|
attribution = _build_attribution_block(article)
|
||||||
|
content = (body_html + attribution).strip()
|
||||||
return content, None
|
return content, None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -318,6 +385,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||||
featured_media_id = None
|
featured_media_id = None
|
||||||
selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
|
selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
|
||||||
if selected_image_url:
|
if selected_image_url:
|
||||||
|
image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url)
|
||||||
|
image_caption = _build_image_caption(image_meta, source_url)
|
||||||
try:
|
try:
|
||||||
featured_media_id = _upload_featured_media(
|
featured_media_id = _upload_featured_media(
|
||||||
base_url=settings.wordpress_base_url,
|
base_url=settings.wordpress_base_url,
|
||||||
|
|
@ -325,6 +394,7 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||||
image_url=selected_image_url,
|
image_url=selected_image_url,
|
||||||
article_title=title,
|
article_title=title,
|
||||||
source_url=source_url,
|
source_url=source_url,
|
||||||
|
image_caption=image_caption,
|
||||||
)
|
)
|
||||||
except Exception as img_exc:
|
except Exception as img_exc:
|
||||||
import logging
|
import logging
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
UI_STATUSES = ("new", "rewrite", "publish", "published", "close")
|
UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image")
|
||||||
|
|
||||||
|
|
||||||
def internal_to_ui_status(status: str | None) -> str:
|
def internal_to_ui_status(status: str | None) -> str:
|
||||||
|
|
@ -11,7 +11,7 @@ def internal_to_ui_status(status: str | None) -> str:
|
||||||
return "close"
|
return "close"
|
||||||
if value == "review":
|
if value == "review":
|
||||||
return "rewrite"
|
return "rewrite"
|
||||||
if value in {"new", "rewrite", "published"}:
|
if value in {"new", "rewrite", "published", "no_image"}:
|
||||||
return value
|
return value
|
||||||
return value or "new"
|
return value or "new"
|
||||||
|
|
||||||
|
|
@ -22,7 +22,7 @@ def ui_to_internal_status(status: str | None) -> str:
|
||||||
return "approved"
|
return "approved"
|
||||||
if value == "close":
|
if value == "close":
|
||||||
return "error"
|
return "error"
|
||||||
if value in {"new", "rewrite", "published"}:
|
if value in {"new", "rewrite", "published", "no_image"}:
|
||||||
return value
|
return value
|
||||||
if value in {"approved", "error", "review"}:
|
if value in {"approved", "error", "review"}:
|
||||||
return value
|
return value
|
||||||
|
|
@ -35,4 +35,5 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = {
|
||||||
"publish": {"published", "close"},
|
"publish": {"published", "close"},
|
||||||
"published": {"rewrite", "close"},
|
"published": {"rewrite", "close"},
|
||||||
"close": {"rewrite"},
|
"close": {"rewrite"},
|
||||||
|
"no_image": {"rewrite", "close"},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue