feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution

source_extraction.py:
- New _extract_image_metadata(): extracts figcaption text + copyright/credit
  per image URL using 3 strategies (figure+figcaption, data-* attributes,
  adjacent credit spans)
- ExtractedArticle gets new image_metadata field
- extracted_article_to_meta() includes image_metadata in stored JSON

pipeline.py:
- After auto image selection, check if selected_url is set
- Articles without usable image → status "no_image" (excluded with Telegram notice)
- PipelineStats and summary report include no_image counter

db.py:
- Add "no_image" to articles status CHECK constraint
- Migration: recreates articles table with updated constraint on existing DBs

workflow.py / main.py:
- Map no_image as own UI status with rewrite/close transitions

wordpress.py:
- _upload_featured_media() accepts image_caption param, sends to WP media
- _get_image_meta_for_url() / _build_image_caption() helpers
- _build_attribution_block(): separator + attribution paragraph at article end
  (original link, author, Bildnachweis/credit)
- _build_post_content() appends attribution block

telegram_bot.py:
- notify_pipeline_done() shows 🖼️ no-image count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-03-27 07:08:48 +00:00
parent 1963e32ab4
commit aaac5def27
7 changed files with 381 additions and 10 deletions

View file

@ -110,7 +110,7 @@ def init_db() -> None:
publish_last_error TEXT,
published_to_wp_at TEXT,
word_count INTEGER DEFAULT 0,
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
meta_json TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
@ -181,6 +181,89 @@ def init_db() -> None:
if column not in existing_columns:
conn.execute(ddl)
# Migration: add 'no_image' to the status CHECK constraint if not present.
# SQLite cannot modify CHECK constraints in-place, so we recreate the table.
table_sql_row = conn.execute(
"SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
).fetchone()
if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
conn.executescript(
"""
PRAGMA foreign_keys=OFF;
CREATE TABLE articles_v2 (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feed_id INTEGER,
source_article_id TEXT,
source_hash TEXT,
title TEXT NOT NULL,
source_url TEXT NOT NULL,
canonical_url TEXT,
published_at TEXT,
author TEXT,
summary TEXT,
content_raw TEXT,
content_rewritten TEXT,
image_urls_json TEXT,
press_contact TEXT,
source_name_snapshot TEXT,
source_terms_url_snapshot TEXT,
source_license_name_snapshot TEXT,
legal_checked INTEGER NOT NULL DEFAULT 0,
legal_checked_at TEXT,
legal_note TEXT,
wp_post_id INTEGER,
wp_post_url TEXT,
publish_attempts INTEGER NOT NULL DEFAULT 0,
publish_last_error TEXT,
published_to_wp_at TEXT,
word_count INTEGER DEFAULT 0,
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
meta_json TEXT,
relevance_score INTEGER,
scheduled_publish_at TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
UNIQUE(source_url)
);
INSERT INTO articles_v2 SELECT
id, feed_id, source_article_id, source_hash, title, source_url,
canonical_url, published_at, author, summary, content_raw,
content_rewritten, image_urls_json, press_contact,
source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
legal_checked, legal_checked_at, legal_note,
wp_post_id, wp_post_url, publish_attempts, publish_last_error,
published_to_wp_at, word_count, status, meta_json,
relevance_score, scheduled_publish_at, created_at, updated_at
FROM articles;
DROP TABLE articles;
ALTER TABLE articles_v2 RENAME TO articles;
CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
ON articles(feed_id, source_article_id)
WHERE source_article_id IS NOT NULL;
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
ON articles(source_hash)
WHERE source_hash IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
AFTER UPDATE ON articles
FOR EACH ROW
BEGIN
UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
END;
PRAGMA foreign_keys=ON;
"""
)
table_rows = conn.execute(
"SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'"
).fetchall()

View file

@ -123,7 +123,7 @@ class ArticleUpsertRequest(BaseModel):
publish_last_error: str | None = None
published_to_wp_at: str | None = None
word_count: int = 0
status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
meta_json: str | None = None
@ -132,7 +132,7 @@ class IngestionRunRequest(BaseModel):
class ArticleTransitionRequest(BaseModel):
target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$")
target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
note: str | None = None

View file

@ -45,6 +45,7 @@ class PipelineStats:
rejected: int = 0
warnings: int = 0
errors: int = 0
no_image: int = 0
rejected_articles: list[dict[str, Any]] = field(default_factory=list)
@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
"processed": stats.processed,
"drafts_created": stats.drafts_created,
"rejected": stats.rejected,
"no_image": stats.no_image,
"warnings": stats.warnings,
"errors": stats.errors,
}
@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
# Auto-select image
_auto_select_image(article)
# Reload to get updated image_review
article = get_article_by_id(article_id) or article
# Exclude articles without a usable image
try:
meta = json.loads(article.get("meta_json") or "{}")
except Exception:
meta = {}
has_image = bool((meta.get("image_review") or {}).get("selected_url"))
if not has_image:
update_article_status(
article_id,
"no_image",
actor="pipeline",
note="Kein Bild vorhanden Artikel ausgeschlossen",
)
stats.no_image += 1
logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
try:
tg.send_message(
f"🖼️ <b>Kein Bild</b> Artikel #{article_id} ausgeschlossen\n"
f"📰 {(article.get('title') or '')[:80]}"
)
except Exception:
pass
return
# Score relevance
try:
relevance = score_article_relevance(article)

View file

@ -1,6 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, field
from html import unescape
import re
from typing import Any
@ -21,6 +21,7 @@ class ExtractedArticle:
images: list[str]
press_contact: str | None
extraction_error: str | None = None
image_metadata: dict[str, dict] = field(default_factory=dict)
def _clean_text(raw: str | None) -> str | None:
@ -197,6 +198,187 @@ def _extract_press_contact(content_text: str | None) -> str | None:
return None
# CSS class keywords that indicate a copyright/credit element.
# The pattern matches a whole ``class="..."`` attribute whose value contains one
# of the keywords as a substring, so e.g. ``photo-credit`` and ``fotocredit``
# are already covered by ``credit``; they are kept for readability only.
_CREDIT_CLASS_RE = re.compile(
    r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
    re.IGNORECASE,
)

# Inline text patterns that signal a credit/copyright notice: either a "©"
# followed by up to 100 characters, or a German/English label such as
# "Foto:" / "Credit:" followed by up to 100 characters on the same line.
_CREDIT_TEXT_RE = re.compile(
    r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
    re.IGNORECASE,
)

# data-* attribute names that carry credit/caption information directly on <img>
_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")

# Class keywords for adjacent sibling credit spans/divs after an <img>.
# This used to be a byte-for-byte copy of _CREDIT_CLASS_RE; it is now an alias
# so the two keyword lists cannot drift apart. Replace the alias with its own
# compiled pattern only if adjacent elements ever need different keywords.
_ADJ_CREDIT_CLASS_RE = _CREDIT_CLASS_RE
def _resolve_img_url(raw_src: str, page_url: str) -> str | None:
    """Resolve *raw_src* against *page_url*; return ``None`` if it cannot be parsed."""
    try:
        return urljoin(page_url, raw_src.strip())
    except ValueError:
        # urljoin raises ValueError for malformed URLs (e.g. a broken IPv6
        # host like "http://[foo"). Only this one image should be skipped.
        return None


def _img_src_from_attrs(img_attrs: str, page_url: str) -> str | None:
    """Return the absolute image URL from an ``<img>`` attribute string.

    Prefers ``src`` and falls back to ``data-src``; returns ``None`` when
    neither attribute is present or the URL cannot be resolved.
    """
    src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        return None
    return _resolve_img_url(src_match.group(1), page_url)


def _first_img_data_attr(img_attrs: str, attr_names: tuple[str, ...]) -> str | None:
    """Return the cleaned value of the first attribute in *attr_names* found in *img_attrs*."""
    for attr in attr_names:
        attr_match = re.search(
            rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
            img_attrs,
            re.IGNORECASE,
        )
        if attr_match:
            # The first attribute present wins, even if its cleaned value is empty.
            return _clean_text(attr_match.group(1))
    return None


def _metadata_entry(caption: str | None, credit: str | None) -> dict[str, str]:
    """Build a metadata dict containing only the non-empty fields."""
    entry: dict[str, str] = {}
    if caption:
        entry["caption"] = caption
    if credit:
        entry["credit"] = credit
    return entry


def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
    """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.

    Uses three progressive strategies; an image handled by an earlier strategy
    is never overwritten by a later one:

    1. <figure> with <img> + <figcaption>
    2. data-* attributes on <img> tags not already covered
    3. <img> tags whose immediately following HTML (200 chars) contains a
       credit element

    Images whose URL cannot be resolved against *page_url* are skipped
    individually. Any other unexpected error makes the whole extraction
    return an empty dict (best effort; extraction must never fail the caller).
    """
    result: dict[str, dict] = {}

    try:
        # Composite patterns: a span/p/div whose class attribute marks it as a
        # credit element. Built once instead of on every loop iteration.
        credit_elem_re = re.compile(
            r"<(?:span|p|div)[^>]*"
            + _CREDIT_CLASS_RE.pattern
            + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
            re.IGNORECASE,
        )
        adj_credit_re = re.compile(
            r"<(?:span|p|div)[^>]*"
            + _ADJ_CREDIT_CLASS_RE.pattern
            + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
            re.IGNORECASE,
        )

        # ------------------------------------------------------------------
        # Strategy 1: <figure> blocks containing <img> and <figcaption>
        # ------------------------------------------------------------------
        for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
            fig_html = fig_match.group(1)

            # Locate image src (src or lazy-loaded data-src).
            # NOTE(review): because "[^>]+" is greedy, a tag carrying several
            # matching attributes yields the LAST one here, whereas strategies
            # 2/3 prefer "src". Kept as-is; confirm which URL the image list
            # produced elsewhere uses before unifying.
            img_match = re.search(
                r"<img[^>]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
                fig_html,
                re.IGNORECASE,
            )
            if not img_match:
                continue
            img_src = _resolve_img_url(img_match.group(1), page_url)
            if img_src is None:
                continue

            figcap_match = re.search(
                r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
                fig_html,
                re.IGNORECASE,
            )
            if not figcap_match:
                continue
            figcap_html = figcap_match.group(1)

            # Credit: first try a dedicated element marked by its class ...
            credit = None
            credit_elem_match = credit_elem_re.search(figcap_html)
            if credit_elem_match:
                credit = _clean_text(credit_elem_match.group(1))

            # ... then fall back to scanning the plain figcaption text.
            if not credit:
                figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
                cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
                if cred_text_match:
                    credit = _clean_text(cred_text_match.group(1))

            # Caption is the full figcaption text (it may include the credit).
            caption = _clean_text(figcap_html)

            # Only store entries that carry at least one piece of metadata.
            entry = _metadata_entry(caption, credit)
            if entry:
                result[img_src] = entry

        # Parse every <img> tag once; strategies 2 and 3 both iterate this list.
        img_tags: list[tuple[str, str, int]] = []
        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
            img_attrs = img_match.group(1)
            img_src = _img_src_from_attrs(img_attrs, page_url)
            if img_src is not None:
                img_tags.append((img_src, img_attrs, img_match.end()))

        # ------------------------------------------------------------------
        # Strategy 2: data-* attributes on <img> tags
        # ------------------------------------------------------------------
        for img_src, img_attrs, _tag_end in img_tags:
            # Skip images already handled (by Strategy 1 or an earlier tag).
            if img_src in result:
                continue
            entry = _metadata_entry(
                _first_img_data_attr(img_attrs, _IMG_DATA_CAPTION_ATTRS),
                _first_img_data_attr(img_attrs, _IMG_DATA_CREDIT_ATTRS),
            )
            if entry:
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 3: <img> followed within 200 chars by a credit element
        # ------------------------------------------------------------------
        for img_src, _img_attrs, tag_end in img_tags:
            # Skip images already handled by earlier strategies.
            if img_src in result:
                continue
            # Look at the 200 characters of HTML immediately after the img tag.
            after_html = html[tag_end : tag_end + 200]
            adj_match = adj_credit_re.search(after_html)
            if adj_match:
                credit = _clean_text(adj_match.group(1))
                if credit:
                    result[img_src] = {"credit": credit}

    except Exception:
        # Deliberate best-effort guard: metadata is optional, so any unexpected
        # failure (e.g. non-string input) yields "no metadata" instead of
        # aborting the article extraction.
        return {}

    return result
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
try:
req = Request(
@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
summary = _clean_text(content_text[:320])
images = _extract_images(html, url)
press_contact = _extract_press_contact(content_text)
image_metadata = _extract_image_metadata(html, url)
return ExtractedArticle(
title=title,
@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
images=images,
press_contact=press_contact,
extraction_error=None,
image_metadata=image_metadata,
)
@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
"images": article.images,
"press_contact": article.press_contact,
"extraction_error": article.extraction_error,
"image_metadata": article.image_metadata,
}

View file

@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
processed = stats.get("processed", 0)
drafts = stats.get("drafts_created", 0)
rejected = stats.get("rejected", 0)
no_image = stats.get("no_image", 0)
warnings = stats.get("warnings", 0)
errors = stats.get("errors", 0)
@ -300,6 +301,8 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None:
]
if rejected:
lines.append(f"🚫 Abgelehnt: {rejected}")
if no_image:
lines.append(f"🖼️ Kein Bild: {no_image}")
if warnings:
lines.append(f"⚠️ Warnungen: {warnings}")
if errors:

View file

@ -161,6 +161,32 @@ def _guess_filename(image_url: str, content_type: str) -> str:
return stem
def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict:
"""Return the caption/credit dict for a specific image URL from extraction metadata."""
if not meta_json or not image_url:
return {}
try:
meta = json.loads(meta_json)
image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {}
return image_metadata.get(image_url) or {}
except Exception:
return {}
def _build_image_caption(image_meta: dict, source_url: str) -> str:
    """Build a WP caption string from image metadata and source URL.

    Joins caption and credit with ``" | "``. The credit is omitted when the
    caption already contains it (compared case-insensitively with whitespace
    collapsed), because a caption taken from a full figcaption usually includes
    the credit line and would otherwise repeat it. When neither value is
    available the caption falls back to ``"Quelle: <source_url>"``.
    """
    caption = (image_meta.get("caption") or "").strip()
    credit = (image_meta.get("credit") or "").strip()

    def _normalized(text: str) -> str:
        # Collapse whitespace and ignore case for the containment check only;
        # the emitted text keeps its original form.
        return " ".join(text.split()).casefold()

    parts = []
    if caption:
        parts.append(caption)
    if credit and _normalized(credit) not in _normalized(caption):
        parts.append(credit)
    if not parts:
        parts.append(f"Quelle: {source_url}")
    return " | ".join(parts)
def _upload_featured_media(
*,
base_url: str,
@ -168,6 +194,7 @@ def _upload_featured_media(
image_url: str,
article_title: str,
source_url: str,
image_caption: str = "",
) -> int:
image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None)
filename = _guess_filename(image_url, content_type)
@ -192,7 +219,6 @@ def _upload_featured_media(
if media_id <= 0:
raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}")
# Optional metadata update for traceability.
_wp_request(
base_url=base_url,
auth_header=auth_header,
@ -200,7 +226,7 @@ def _upload_featured_media(
endpoint=f"media/{media_id}",
payload={
"title": f"{article_title[:120]} - Bild",
"caption": f"Quelle: {source_url}",
"caption": image_caption or f"Quelle: {source_url}",
"alt_text": article_title[:200],
},
)
@ -289,6 +315,45 @@ def _sanitize_publish_text(text: str) -> str:
return merged
def _build_attribution_block(article: dict[str, Any]) -> str:
    """Build a WP Gutenberg attribution block for the bottom of the article.

    The block is a wide separator followed by one italic paragraph listing,
    in this order and only when available: a link to the original article
    (``canonical_url`` preferred over ``source_url``, labelled with the source
    name or the URL itself), the author, and the credit of the selected image
    taken from the extraction metadata in ``meta_json``.

    Returns an empty string when none of the three pieces is available.
    All values originate from scraped pages, so both the link target and the
    visible texts are HTML-escaped.
    """
    # Local import: guarantees quote escaping for the href attribute value
    # regardless of which ``escape`` the module imports at file level.
    from html import escape as _escape_attr

    source_url = (article.get("canonical_url") or article.get("source_url") or "").strip()
    source_name = (article.get("source_name_snapshot") or "").strip()
    author = (article.get("author") or "").strip()

    # Get image credit from extraction metadata (optional, best effort).
    credit = ""
    try:
        meta = json.loads(article.get("meta_json") or "{}")
        selected_url = (meta.get("image_review") or {}).get("selected_url") or ""
        if selected_url:
            img_meta = (meta.get("extraction") or {}).get("image_metadata") or {}
            credit = (img_meta.get(selected_url) or {}).get("credit") or ""
    except (TypeError, ValueError, AttributeError):
        # Unparseable JSON or an unexpected structure: publish without credit.
        credit = ""
    # A malformed (non-string) credit must not crash escaping below.
    credit = credit.strip() if isinstance(credit, str) else ""

    parts: list[str] = []
    if source_url:
        label = source_name or source_url
        # Escape the URL inside the attribute so quotes or ampersands in a
        # scraped URL cannot break out of href="...".
        href = _escape_attr(source_url, quote=True)
        parts.append(f'Originalartikel: <a href="{href}">{escape(label)}</a>')
    if author:
        parts.append(f"Autor: {escape(author)}")
    if credit:
        parts.append(f"Bildnachweis: {escape(credit)}")

    if not parts:
        return ""

    inner = " &nbsp;|&nbsp; ".join(parts)
    return (
        "\n<!-- wp:separator {\"className\":\"is-style-wide\"} -->"
        "<hr class=\"wp-block-separator is-style-wide\"/><!-- /wp:separator -->\n"
        f'<!-- wp:paragraph {{\"className\":\"article-attribution\"}} -->'
        f'<p class="article-attribution"><em>{inner}</em></p>'
        "<!-- /wp:paragraph -->"
    )
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
summary = (article.get("summary") or "").strip()
body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
@ -300,7 +365,9 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text)
if not body_html:
body_html = "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"
content = body_html.strip()
attribution = _build_attribution_block(article)
content = (body_html + attribution).strip()
return content, None
@ -318,6 +385,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
featured_media_id = None
selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
if selected_image_url:
image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url)
image_caption = _build_image_caption(image_meta, source_url)
try:
featured_media_id = _upload_featured_media(
base_url=settings.wordpress_base_url,
@ -325,6 +394,7 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
image_url=selected_image_url,
article_title=title,
source_url=source_url,
image_caption=image_caption,
)
except Exception as img_exc:
import logging

View file

@ -1,6 +1,6 @@
from __future__ import annotations
UI_STATUSES = ("new", "rewrite", "publish", "published", "close")
UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image")
def internal_to_ui_status(status: str | None) -> str:
@ -11,7 +11,7 @@ def internal_to_ui_status(status: str | None) -> str:
return "close"
if value == "review":
return "rewrite"
if value in {"new", "rewrite", "published"}:
if value in {"new", "rewrite", "published", "no_image"}:
return value
return value or "new"
@ -22,7 +22,7 @@ def ui_to_internal_status(status: str | None) -> str:
return "approved"
if value == "close":
return "error"
if value in {"new", "rewrite", "published"}:
if value in {"new", "rewrite", "published", "no_image"}:
return value
if value in {"approved", "error", "review"}:
return value
@ -35,4 +35,5 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = {
"publish": {"published", "close"},
"published": {"rewrite", "close"},
"close": {"rewrite"},
"no_image": {"rewrite", "close"},
}