diff --git a/backend/app/db.py b/backend/app/db.py
index 1b394c3..b6ef898 100644
--- a/backend/app/db.py
+++ b/backend/app/db.py
@@ -110,7 +110,7 @@ def init_db() -> None:
publish_last_error TEXT,
published_to_wp_at TEXT,
word_count INTEGER DEFAULT 0,
- status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
+ status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
meta_json TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
@@ -181,6 +181,89 @@ def init_db() -> None:
if column not in existing_columns:
conn.execute(ddl)
+ # Migration: add 'no_image' to the status CHECK constraint if not present.
+ # SQLite cannot modify CHECK constraints in-place, so we recreate the table.
+ table_sql_row = conn.execute(
+ "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
+ ).fetchone()
+ if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
+ conn.executescript(
+ """
+ PRAGMA foreign_keys=OFF;
+
+ CREATE TABLE articles_v2 (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ feed_id INTEGER,
+ source_article_id TEXT,
+ source_hash TEXT,
+ title TEXT NOT NULL,
+ source_url TEXT NOT NULL,
+ canonical_url TEXT,
+ published_at TEXT,
+ author TEXT,
+ summary TEXT,
+ content_raw TEXT,
+ content_rewritten TEXT,
+ image_urls_json TEXT,
+ press_contact TEXT,
+ source_name_snapshot TEXT,
+ source_terms_url_snapshot TEXT,
+ source_license_name_snapshot TEXT,
+ legal_checked INTEGER NOT NULL DEFAULT 0,
+ legal_checked_at TEXT,
+ legal_note TEXT,
+ wp_post_id INTEGER,
+ wp_post_url TEXT,
+ publish_attempts INTEGER NOT NULL DEFAULT 0,
+ publish_last_error TEXT,
+ published_to_wp_at TEXT,
+ word_count INTEGER DEFAULT 0,
+ status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
+ meta_json TEXT,
+ relevance_score INTEGER,
+ scheduled_publish_at TEXT,
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
+ updated_at TEXT NOT NULL DEFAULT (datetime('now')),
+ FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
+ UNIQUE(source_url)
+ );
+
+ INSERT INTO articles_v2 SELECT
+ id, feed_id, source_article_id, source_hash, title, source_url,
+ canonical_url, published_at, author, summary, content_raw,
+ content_rewritten, image_urls_json, press_contact,
+ source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
+ legal_checked, legal_checked_at, legal_note,
+ wp_post_id, wp_post_url, publish_attempts, publish_last_error,
+ published_to_wp_at, word_count, status, meta_json,
+ relevance_score, scheduled_publish_at, created_at, updated_at
+ FROM articles;
+
+ DROP TABLE articles;
+ ALTER TABLE articles_v2 RENAME TO articles;
+
+ CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
+ CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
+ CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
+ ON articles(feed_id, source_article_id)
+ WHERE source_article_id IS NOT NULL;
+ CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
+ ON articles(source_hash)
+ WHERE source_hash IS NOT NULL;
+ CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
+ CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
+
+ CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
+ AFTER UPDATE ON articles
+ FOR EACH ROW
+ BEGIN
+ UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
+ END;
+
+ PRAGMA foreign_keys=ON;
+ """
+ )
+
table_rows = conn.execute(
"SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'"
).fetchall()
diff --git a/backend/app/main.py b/backend/app/main.py
index e954de6..264a2be 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -123,7 +123,7 @@ class ArticleUpsertRequest(BaseModel):
publish_last_error: str | None = None
published_to_wp_at: str | None = None
word_count: int = 0
- status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
+ status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
meta_json: str | None = None
@@ -132,7 +132,7 @@ class IngestionRunRequest(BaseModel):
class ArticleTransitionRequest(BaseModel):
- target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
+ target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
note: str | None = None
diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py
index ef0597d..100c70c 100644
--- a/backend/app/pipeline.py
+++ b/backend/app/pipeline.py
@@ -45,6 +45,7 @@ class PipelineStats:
rejected: int = 0
warnings: int = 0
errors: int = 0
+ no_image: int = 0
rejected_articles: list[dict[str, Any]] = field(default_factory=list)
@@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
"processed": stats.processed,
"drafts_created": stats.drafts_created,
"rejected": stats.rejected,
+ "no_image": stats.no_image,
"warnings": stats.warnings,
"errors": stats.errors,
}
@@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
# Auto-select image
_auto_select_image(article)
+ # Reload to get updated image_review
+ article = get_article_by_id(article_id) or article
+
+ # Exclude articles without a usable image
+ try:
+ meta = json.loads(article.get("meta_json") or "{}")
+ except Exception:
+ meta = {}
+ has_image = bool((meta.get("image_review") or {}).get("selected_url"))
+ if not has_image:
+ update_article_status(
+ article_id,
+ "no_image",
+ actor="pipeline",
+ note="Kein Bild vorhanden – Artikel ausgeschlossen",
+ )
+ stats.no_image += 1
+ logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
+ try:
+ tg.send_message(
+ f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n"
+ f"📰 {(article.get('title') or '')[:80]}"
+ )
+ except Exception:
+ pass
+ return
+
# Score relevance
try:
relevance = score_article_relevance(article)
diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py
index 925fcf6..d3cbed8 100644
--- a/backend/app/source_extraction.py
+++ b/backend/app/source_extraction.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from html import unescape
import re
from typing import Any
@@ -21,6 +21,7 @@ class ExtractedArticle:
images: list[str]
press_contact: str | None
extraction_error: str | None = None
+ image_metadata: dict[str, dict] = field(default_factory=dict)
def _clean_text(raw: str | None) -> str | None:
@@ -197,6 +198,187 @@ def _extract_press_contact(content_text: str | None) -> str | None:
return None
+# CSS class keywords that indicate a copyright/credit element inside a figcaption
+_CREDIT_CLASS_RE = re.compile(
+    r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
+    re.IGNORECASE,
+)
+
+# Inline text patterns that signal a credit/copyright notice
+_CREDIT_TEXT_RE = re.compile(
+    r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
+    re.IGNORECASE,
+)
+
+# data-* attribute names that carry credit/caption information directly on <img>
+_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
+_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")
+
+# Class keywords for adjacent sibling credit spans/divs after an <img>.
+# The keyword list is the same as for figcaption children, so this is an alias
+# of the pattern above instead of a second copy that could drift out of sync.
+_ADJ_CREDIT_CLASS_RE = _CREDIT_CLASS_RE
+
+
+def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
+    """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.
+
+    Uses three progressive strategies; an image matched by an earlier strategy
+    is not revisited by a later one:
+      1. <figure> blocks containing an <img> and a <figcaption>
+      2. data-* attributes on <img> tags not already covered
+      3. <img> tags whose immediately following HTML contains a credit element
+
+    Args:
+        html: Raw HTML of the article page.
+        page_url: URL of the page; relative image URLs are resolved against it.
+
+    Returns:
+        Dict keyed by absolute image URL. Each value holds "caption" and/or
+        "credit" (only the keys that were found). Returns {} if anything
+        raises while scanning, discarding results gathered up to that point.
+    """
+    result: dict[str, dict] = {}
+
+    try:
+        # ------------------------------------------------------------------
+        # Strategy 1: <figure> blocks containing <img> and <figcaption>
+        # ------------------------------------------------------------------
+        for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
+            fig_html = fig_match.group(1)
+
+            # Locate image src (src or lazy-loaded data-src)
+            img_match = re.search(
+                r"<img[^>]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
+                fig_html,
+                re.IGNORECASE,
+            )
+            if not img_match:
+                continue
+            img_src = urljoin(page_url, img_match.group(1).strip())
+
+            # Locate figcaption
+            figcap_match = re.search(
+                r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
+                fig_html,
+                re.IGNORECASE,
+            )
+            if not figcap_match:
+                continue
+            figcap_html = figcap_match.group(1)
+
+            # --- Extract credit ---
+            credit: str | None = None
+
+            # Try credit via class attribute on an inner element
+            credit_elem_match = re.search(
+                r"<(?:span|p|div)[^>]*"
+                + _CREDIT_CLASS_RE.pattern
+                + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
+                figcap_html,
+                re.IGNORECASE,
+            )
+            if credit_elem_match:
+                credit = _clean_text(credit_elem_match.group(1))
+
+            # Fallback: scan plain text of figcaption for credit patterns
+            if not credit:
+                figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
+                cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
+                if cred_text_match:
+                    credit = _clean_text(cred_text_match.group(1))
+
+            # --- Extract caption (full figcaption text) ---
+            caption = _clean_text(figcap_html)
+
+            # Only store entries that carry at least one piece of metadata
+            if caption or credit:
+                entry: dict[str, str] = {}
+                if caption:
+                    entry["caption"] = caption
+                if credit:
+                    entry["credit"] = credit
+                result[img_src] = entry
+
+        # ------------------------------------------------------------------
+        # Strategy 2: data-* attributes on <img> tags
+        # ------------------------------------------------------------------
+        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
+            img_attrs = img_match.group(1)
+
+            # Resolve image URL (prefer src over data-src)
+            src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
+            if not src_match:
+                src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
+            if not src_match:
+                continue
+            img_src = urljoin(page_url, src_match.group(1).strip())
+
+            # Skip images already handled by Strategy 1
+            if img_src in result:
+                continue
+
+            credit = None
+            caption = None
+
+            # First matching attribute wins, in the order the tuples list them.
+            for attr in _IMG_DATA_CREDIT_ATTRS:
+                attr_match = re.search(
+                    rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
+                    img_attrs,
+                    re.IGNORECASE,
+                )
+                if attr_match:
+                    credit = _clean_text(attr_match.group(1))
+                    break
+
+            for attr in _IMG_DATA_CAPTION_ATTRS:
+                attr_match = re.search(
+                    rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
+                    img_attrs,
+                    re.IGNORECASE,
+                )
+                if attr_match:
+                    caption = _clean_text(attr_match.group(1))
+                    break
+
+            if caption or credit:
+                entry = {}
+                if caption:
+                    entry["caption"] = caption
+                if credit:
+                    entry["credit"] = credit
+                result[img_src] = entry
+
+        # ------------------------------------------------------------------
+        # Strategy 3: <img> followed within 200 chars by a credit element
+        # ------------------------------------------------------------------
+        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
+            img_attrs = img_match.group(1)
+
+            src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
+            if not src_match:
+                src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
+            if not src_match:
+                continue
+            img_src = urljoin(page_url, src_match.group(1).strip())
+
+            # Skip images already handled by earlier strategies
+            if img_src in result:
+                continue
+
+            # Look at the 200 characters of HTML immediately after the img tag
+            after_start = img_match.end()
+            after_html = html[after_start : after_start + 200]
+
+            adj_match = re.search(
+                r"<(?:span|p|div)[^>]*"
+                + _ADJ_CREDIT_CLASS_RE.pattern
+                + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
+                after_html,
+                re.IGNORECASE,
+            )
+            if adj_match:
+                credit = _clean_text(adj_match.group(1))
+                if credit:
+                    result[img_src] = {"credit": credit}
+
+    except Exception:
+        # Best effort: on any failure report no metadata rather than raising.
+        return {}
+
+    return result
+
+
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
try:
req = Request(
@@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
summary = _clean_text(content_text[:320])
images = _extract_images(html, url)
press_contact = _extract_press_contact(content_text)
+ image_metadata = _extract_image_metadata(html, url)
return ExtractedArticle(
title=title,
@@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
images=images,
press_contact=press_contact,
extraction_error=None,
+ image_metadata=image_metadata,
)
@@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
"images": article.images,
"press_contact": article.press_contact,
"extraction_error": article.extraction_error,
+ "image_metadata": article.image_metadata,
}
diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py
index 46ddd28..c92b009 100644
--- a/backend/app/telegram_bot.py
+++ b/backend/app/telegram_bot.py
@@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
processed = stats.get("processed", 0)
drafts = stats.get("drafts_created", 0)
rejected = stats.get("rejected", 0)
+ no_image = stats.get("no_image", 0)
warnings = stats.get("warnings", 0)
errors = stats.get("errors", 0)
@@ -300,6 +301,8 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
]
if rejected:
lines.append(f"🚫 Abgelehnt: {rejected}")
+ if no_image:
+ lines.append(f"🖼️ Kein Bild: {no_image}")
if warnings:
lines.append(f"⚠️ Warnungen: {warnings}")
if errors:
diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py
index a4f7f3a..a1ef8f5 100644
--- a/backend/app/wordpress.py
+++ b/backend/app/wordpress.py
@@ -161,6 +161,32 @@ def _guess_filename(image_url: str, content_type: str) -> str:
return stem
+def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict:
+    """Return the caption/credit dict for a specific image URL from extraction metadata.
+
+    Looks up ``meta["extraction"]["image_metadata"][image_url]`` in the decoded
+    ``meta_json``.
+
+    Args:
+        meta_json: JSON-encoded article metadata, or ``None``.
+        image_url: Image URL used as the lookup key.
+
+    Returns:
+        The stored entry when it is a dict; otherwise ``{}`` (missing input,
+        undecodable JSON, unexpected structure, or no entry for the URL).
+    """
+    if not meta_json or not image_url:
+        return {}
+    try:
+        meta = json.loads(meta_json)
+    except (TypeError, ValueError, RecursionError):
+        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
+        return {}
+    if not isinstance(meta, dict):
+        return {}
+    extraction = meta.get("extraction")
+    image_metadata = extraction.get("image_metadata") if isinstance(extraction, dict) else None
+    entry = image_metadata.get(image_url) if isinstance(image_metadata, dict) else None
+    # The result is annotated as a dict, so never hand back another type.
+    return entry if isinstance(entry, dict) else {}
+
+
+def _build_image_caption(image_meta: dict, source_url: str) -> str:
+    """Build a WP caption string from image metadata and source URL.
+
+    Args:
+        image_meta: Dict that may carry "caption" and/or "credit" strings.
+        source_url: URL used for the "Quelle: ..." fallback.
+
+    Returns:
+        Caption and credit joined by " | ". The credit is left out when the
+        caption already contains it. Falls back to "Quelle: <source_url>" when
+        neither value is present.
+    """
+    caption = (image_meta.get("caption") or "").strip()
+    credit = (image_meta.get("credit") or "").strip()
+    parts = []
+    if caption:
+        parts.append(caption)
+    # A caption taken from a whole figcaption can already include the credit
+    # text; appending it again would print the credit twice.
+    if credit and credit.casefold() not in caption.casefold():
+        parts.append(credit)
+    if not parts:
+        parts.append(f"Quelle: {source_url}")
+    return " | ".join(parts)
+
+
def _upload_featured_media(
*,
base_url: str,
@@ -168,6 +194,7 @@ def _upload_featured_media(
image_url: str,
article_title: str,
source_url: str,
+ image_caption: str = "",
) -> int:
image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None)
filename = _guess_filename(image_url, content_type)
@@ -192,7 +219,6 @@ def _upload_featured_media(
if media_id <= 0:
raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}")
- # Optional metadata update for traceability.
_wp_request(
base_url=base_url,
auth_header=auth_header,
@@ -200,7 +226,7 @@ def _upload_featured_media(
endpoint=f"media/{media_id}",
payload={
"title": f"{article_title[:120]} - Bild",
- "caption": f"Quelle: {source_url}",
+ "caption": image_caption or f"Quelle: {source_url}",
"alt_text": article_title[:200],
},
)
@@ -289,6 +315,45 @@ def _sanitize_publish_text(text: str) -> str:
return merged
+def _build_attribution_block(article: dict[str, Any]) -> str:
+ """Build a WP Gutenberg attribution block for the bottom of the article.
+
+ Collects up to three labelled parts -- source ("Originalartikel"), author
+ ("Autor") and image credit ("Bildnachweis") -- and joins them with " | ".
+
+ Args:
+ article: Article row as a dict; reads ``canonical_url``, ``source_url``,
+ ``source_name_snapshot``, ``author`` and ``meta_json``.
+
+ Returns:
+ The attribution markup, or "" when none of the parts is available.
+ """
+ # canonical_url takes precedence over source_url when both are set.
+ source_url = (article.get("canonical_url") or article.get("source_url") or "").strip()
+ source_name = (article.get("source_name_snapshot") or "").strip()
+ author = (article.get("author") or "").strip()
+
+ # Get image credit from extraction metadata
+ credit = ""
+ try:
+ meta = json.loads(article.get("meta_json") or "{}")
+ # Only the image chosen in image_review is credited.
+ selected_url = (meta.get("image_review") or {}).get("selected_url") or ""
+ if selected_url:
+ img_meta = (meta.get("extraction") or {}).get("image_metadata") or {}
+ credit = (img_meta.get(selected_url) or {}).get("credit") or ""
+ except Exception:
+ # Best effort: unreadable meta_json just means no image credit part.
+ pass
+
+ parts: list[str] = []
+ if source_url:
+ # Use the URL itself as the label when no source name was stored.
+ label = source_name or source_url
+ # NOTE(review): source_url is not referenced in this literal as shown; markup around the label looks stripped from this copy of the patch -- confirm against the repository.
+ parts.append(f'Originalartikel: {escape(label)}')
+ if author:
+ parts.append(f"Autor: {escape(author)}")
+ if credit:
+ parts.append(f"Bildnachweis: {escape(credit)}")
+
+ if not parts:
+ return ""
+
+ inner = " | ".join(parts)
+ # NOTE(review): the string literals below are incomplete in this copy of the patch (block markup appears to have been stripped during extraction); restore them from the repository before applying.
+ return (
+ "\n"
+ "
\n"
+ f''
+ f'{inner}
'
+ ""
+ )
+
+
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
summary = (article.get("summary") or "").strip()
body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
@@ -300,7 +365,9 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text)
if not body_html:
body_html = "Kein Inhalt verfügbar.
"
- content = body_html.strip()
+
+ attribution = _build_attribution_block(article)
+ content = (body_html + attribution).strip()
return content, None
@@ -318,6 +385,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
featured_media_id = None
selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
if selected_image_url:
+ image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url)
+ image_caption = _build_image_caption(image_meta, source_url)
try:
featured_media_id = _upload_featured_media(
base_url=settings.wordpress_base_url,
@@ -325,6 +394,7 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
image_url=selected_image_url,
article_title=title,
source_url=source_url,
+ image_caption=image_caption,
)
except Exception as img_exc:
import logging
diff --git a/backend/app/workflow.py b/backend/app/workflow.py
index b6cd4bb..83e9b63 100644
--- a/backend/app/workflow.py
+++ b/backend/app/workflow.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-UI_STATUSES = ("new", "rewrite", "publish", "published", "close")
+UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image")
def internal_to_ui_status(status: str | None) -> str:
@@ -11,7 +11,7 @@ def internal_to_ui_status(status: str | None) -> str:
return "close"
if value == "review":
return "rewrite"
- if value in {"new", "rewrite", "published"}:
+ if value in {"new", "rewrite", "published", "no_image"}:
return value
return value or "new"
@@ -22,7 +22,7 @@ def ui_to_internal_status(status: str | None) -> str:
return "approved"
if value == "close":
return "error"
- if value in {"new", "rewrite", "published"}:
+ if value in {"new", "rewrite", "published", "no_image"}:
return value
if value in {"approved", "error", "review"}:
return value
@@ -35,4 +35,5 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = {
"publish": {"published", "close"},
"published": {"rewrite", "close"},
"close": {"rewrite"},
+ "no_image": {"rewrite", "close"},
}