diff --git a/backend/app/db.py b/backend/app/db.py index 1b394c3..b6ef898 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -110,7 +110,7 @@ def init_db() -> None: publish_last_error TEXT, published_to_wp_at TEXT, word_count INTEGER DEFAULT 0, - status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), meta_json TEXT, created_at TEXT NOT NULL DEFAULT (datetime('now')), updated_at TEXT NOT NULL DEFAULT (datetime('now')), @@ -181,6 +181,89 @@ def init_db() -> None: if column not in existing_columns: conn.execute(ddl) + # Migration: add 'no_image' to the status CHECK constraint if not present. + # SQLite cannot modify CHECK constraints in-place, so we recreate the table. + table_sql_row = conn.execute( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'" + ).fetchone() + if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""): + conn.executescript( + """ + PRAGMA foreign_keys=OFF; + + CREATE TABLE articles_v2 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + image_urls_json TEXT, + press_contact TEXT, + source_name_snapshot TEXT, + source_terms_url_snapshot TEXT, + source_license_name_snapshot TEXT, + legal_checked INTEGER NOT NULL DEFAULT 0, + legal_checked_at TEXT, + legal_note TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + publish_attempts INTEGER NOT NULL DEFAULT 0, + publish_last_error TEXT, + published_to_wp_at TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), + meta_json TEXT, + relevance_score INTEGER, + scheduled_publish_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + INSERT INTO articles_v2 SELECT + id, feed_id, source_article_id, source_hash, title, source_url, + canonical_url, published_at, author, summary, content_raw, + content_rewritten, image_urls_json, press_contact, + source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, + legal_checked, legal_checked_at, legal_note, + wp_post_id, wp_post_url, publish_attempts, publish_last_error, + published_to_wp_at, word_count, status, meta_json, + relevance_score, scheduled_publish_at, created_at, updated_at + FROM articles; + + DROP TABLE articles; + ALTER TABLE articles_v2 RENAME TO articles; + + CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + PRAGMA foreign_keys=ON; + """ + ) + table_rows = conn.execute( "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'" ).fetchall() diff --git a/backend/app/main.py b/backend/app/main.py index e954de6..264a2be 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -123,7 +123,7 @@ class ArticleUpsertRequest(BaseModel): publish_last_error: str | None = None published_to_wp_at: str | None = None word_count: int = 0 - status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$") + status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") meta_json: str | None = None @@ -132,7 +132,7 @@ class IngestionRunRequest(BaseModel): class ArticleTransitionRequest(BaseModel): - target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$") + target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") note: str | None = None diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index ef0597d..100c70c 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -45,6 +45,7 @@ class PipelineStats: rejected: int = 0 warnings: int = 0 errors: int = 0 + no_image: int = 0 rejected_articles: list[dict[str, Any]] = field(default_factory=list) @@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: "processed": stats.processed, "drafts_created": stats.drafts_created, "rejected": stats.rejected, + "no_image": stats.no_image, "warnings": stats.warnings, "errors": stats.errors, } @@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An # Auto-select image _auto_select_image(article) + # Reload to get updated image_review + article = get_article_by_id(article_id) or article + + # Exclude articles without a usable image + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + meta = {} + has_image = bool((meta.get("image_review") or {}).get("selected_url")) + if not has_image: + update_article_status( + article_id, + "no_image", + actor="pipeline", + note="Kein Bild vorhanden – Artikel ausgeschlossen", + ) + stats.no_image += 1 + logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id) + try: + tg.send_message( + f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n" + f"📰 {(article.get('title') or '')[:80]}" + ) + except Exception: + pass + return + # Score relevance try: relevance = score_article_relevance(article) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py index 925fcf6..d3cbed8 100644 --- a/backend/app/source_extraction.py +++ b/backend/app/source_extraction.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from html import unescape import re from typing import Any @@ -21,6 +21,7 @@ class ExtractedArticle: images: list[str] press_contact: str | None extraction_error: str | None = None + image_metadata: dict[str, dict] = field(default_factory=dict) def _clean_text(raw: str | None) -> str | None: @@ -197,6 +198,187 @@ def _extract_press_contact(content_text: str | None) -> str | None: return None +# CSS class keywords that indicate a copyright/credit element inside a figcaption +_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + +# Inline text patterns that signal a credit/copyright notice +_CREDIT_TEXT_RE = re.compile( + r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})", + re.IGNORECASE, +) + +# data-* attribute names that carry credit/caption information directly on +_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright") +_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description") + +# Class keywords for adjacent sibling credit spans/divs after an +_ADJ_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + + +def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]: + """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}. + + Uses three progressive strategies: + 1.
with +
+ 2. data-* attributes on tags not already covered + 3. tags whose immediately following HTML contains a credit element + """ + result: dict[str, dict] = {} + + try: + # ------------------------------------------------------------------ + # Strategy 1:
blocks containing and
+ # ------------------------------------------------------------------ + for fig_match in re.finditer(r"]*>([\s\S]*?)
", html, re.IGNORECASE): + fig_html = fig_match.group(1) + + # Locate image src (src or lazy-loaded data-src) + img_match = re.search( + r"]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + fig_html, + re.IGNORECASE, + ) + if not img_match: + continue + img_src = urljoin(page_url, img_match.group(1).strip()) + + # Locate figcaption + figcap_match = re.search( + r"]*>([\s\S]*?)
", + fig_html, + re.IGNORECASE, + ) + if not figcap_match: + continue + figcap_html = figcap_match.group(1) + + # --- Extract credit --- + credit: str | None = None + + # Try credit via class attribute on an inner element + credit_elem_match = re.search( + r"<(?:span|p|div)[^>]*" + + _CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + figcap_html, + re.IGNORECASE, + ) + if credit_elem_match: + credit = _clean_text(credit_elem_match.group(1)) + + # Fallback: scan plain text of figcaption for credit patterns + if not credit: + figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html)) + cred_text_match = _CREDIT_TEXT_RE.search(figcap_text) + if cred_text_match: + credit = _clean_text(cred_text_match.group(1)) + + # --- Extract caption (full figcaption text) --- + caption = _clean_text(figcap_html) + + # Only store entries that carry at least one piece of metadata + if caption or credit: + entry: dict[str, str] = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 2: data-* attributes on tags + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + # Resolve image URL (prefer src over data-src) + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by Strategy 1 + if img_src in result: + continue + + credit: str | None = None + caption: str | None = None + + for attr in _IMG_DATA_CREDIT_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + credit = _clean_text(attr_match.group(1)) + break + + for attr in _IMG_DATA_CAPTION_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + caption = _clean_text(attr_match.group(1)) + break + + if caption or credit: + entry = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 3: followed within 200 chars by a credit element + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by earlier strategies + if img_src in result: + continue + + # Look at the 200 characters of HTML immediately after the img tag + after_start = img_match.end() + after_html = html[after_start : after_start + 200] + + adj_match = re.search( + r"<(?:span|p|div)[^>]*" + + _ADJ_CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + after_html, + re.IGNORECASE, + ) + if adj_match: + credit = _clean_text(adj_match.group(1)) + if credit: + result[img_src] = {"credit": credit} + + except Exception: + return {} + + return result + + def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: try: req = Request( @@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> summary = _clean_text(content_text[:320]) images = _extract_images(html, url) press_contact = _extract_press_contact(content_text) + image_metadata = _extract_image_metadata(html, url) return ExtractedArticle( title=title, @@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> images=images, press_contact=press_contact, extraction_error=None, + image_metadata=image_metadata, ) @@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: "images": article.images, "press_contact": article.press_contact, "extraction_error": article.extraction_error, + "image_metadata": article.image_metadata, } diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index 46ddd28..c92b009 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: processed = stats.get("processed", 0) drafts = stats.get("drafts_created", 0) rejected = stats.get("rejected", 0) + no_image = stats.get("no_image", 0) warnings = stats.get("warnings", 0) errors = stats.get("errors", 0) @@ -300,6 +301,8 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: ] if rejected: lines.append(f"🚫 Abgelehnt: {rejected}") + if no_image: + lines.append(f"🖼️ Kein Bild: {no_image}") if warnings: lines.append(f"⚠️ Warnungen: {warnings}") if errors: diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index a4f7f3a..a1ef8f5 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -161,6 +161,32 @@ def _guess_filename(image_url: str, content_type: str) -> str: return stem +def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict: + """Return the caption/credit dict for a specific image URL from extraction metadata.""" + if not meta_json or not image_url: + return {} + try: + meta = json.loads(meta_json) + image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {} + return image_metadata.get(image_url) or {} + except Exception: + return {} + + +def _build_image_caption(image_meta: dict, source_url: str) -> str: + """Build a WP caption string from image metadata and source URL.""" + caption = (image_meta.get("caption") or "").strip() + credit = (image_meta.get("credit") or "").strip() + parts = [] + if caption: + parts.append(caption) + if credit: + parts.append(credit) + if not parts: + parts.append(f"Quelle: {source_url}") + return " | ".join(parts) + + def _upload_featured_media( *, base_url: str, @@ -168,6 +194,7 @@ def _upload_featured_media( image_url: str, article_title: str, source_url: str, + image_caption: str = "", ) -> int: image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None) filename = _guess_filename(image_url, content_type) @@ -192,7 +219,6 @@ def _upload_featured_media( if media_id <= 0: raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}") - # Optional metadata update for traceability. _wp_request( base_url=base_url, auth_header=auth_header, @@ -200,7 +226,7 @@ def _upload_featured_media( endpoint=f"media/{media_id}", payload={ "title": f"{article_title[:120]} - Bild", - "caption": f"Quelle: {source_url}", + "caption": image_caption or f"Quelle: {source_url}", "alt_text": article_title[:200], }, ) @@ -289,6 +315,45 @@ def _sanitize_publish_text(text: str) -> str: return merged +def _build_attribution_block(article: dict[str, Any]) -> str: + """Build a WP Gutenberg attribution block for the bottom of the article.""" + source_url = (article.get("canonical_url") or article.get("source_url") or "").strip() + source_name = (article.get("source_name_snapshot") or "").strip() + author = (article.get("author") or "").strip() + + # Get image credit from extraction metadata + credit = "" + try: + meta = json.loads(article.get("meta_json") or "{}") + selected_url = (meta.get("image_review") or {}).get("selected_url") or "" + if selected_url: + img_meta = (meta.get("extraction") or {}).get("image_metadata") or {} + credit = (img_meta.get(selected_url) or {}).get("credit") or "" + except Exception: + pass + + parts: list[str] = [] + if source_url: + label = source_name or source_url + parts.append(f'Originalartikel: {escape(label)}') + if author: + parts.append(f"Autor: {escape(author)}") + if credit: + parts.append(f"Bildnachweis: {escape(credit)}") + + if not parts: + return "" + + inner = "  |  ".join(parts) + return ( + "\n" + "
\n" + f'' + f'

{inner}

' + "" + ) + + def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: summary = (article.get("summary") or "").strip() body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() @@ -300,7 +365,9 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) if not body_html: body_html = "

Kein Inhalt verfügbar.

" - content = body_html.strip() + + attribution = _build_attribution_block(article) + content = (body_html + attribution).strip() return content, None @@ -318,6 +385,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) if selected_image_url: + image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url) + image_caption = _build_image_caption(image_meta, source_url) try: featured_media_id = _upload_featured_media( base_url=settings.wordpress_base_url, @@ -325,6 +394,7 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: image_url=selected_image_url, article_title=title, source_url=source_url, + image_caption=image_caption, ) except Exception as img_exc: import logging diff --git a/backend/app/workflow.py b/backend/app/workflow.py index b6cd4bb..83e9b63 100644 --- a/backend/app/workflow.py +++ b/backend/app/workflow.py @@ -1,6 +1,6 @@ from __future__ import annotations -UI_STATUSES = ("new", "rewrite", "publish", "published", "close") +UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image") def internal_to_ui_status(status: str | None) -> str: @@ -11,7 +11,7 @@ def internal_to_ui_status(status: str | None) -> str: return "close" if value == "review": return "rewrite" - if value in {"new", "rewrite", "published"}: + if value in {"new", "rewrite", "published", "no_image"}: return value return value or "new" @@ -22,7 +22,7 @@ def ui_to_internal_status(status: str | None) -> str: return "approved" if value == "close": return "error" - if value in {"new", "rewrite", "published"}: + if value in {"new", "rewrite", "published", "no_image"}: return value if value in {"approved", "error", "review"}: return value @@ -35,4 +35,5 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { "publish": {"published", "close"}, "published": {"rewrite", "close"}, "close": {"rewrite"}, + "no_image": {"rewrite", "close"}, }