feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution

source_extraction.py:
- New _extract_image_metadata(): extracts figcaption text + copyright/credit
  per image URL using 3 strategies (figure+figcaption, data-* attributes,
  adjacent credit spans)
- ExtractedArticle gets new image_metadata field
- extracted_article_to_meta() includes image_metadata in stored JSON

pipeline.py:
- After auto image selection, check if selected_url is set
- Articles without usable image → status "no_image" (excluded with Telegram notice)
- PipelineStats and summary report include no_image counter

db.py:
- Add "no_image" to articles status CHECK constraint
- Migration: recreates articles table with updated constraint on existing DBs

workflow.py / main.py:
- Map no_image as own UI status with rewrite/close transitions

wordpress.py:
- _upload_featured_media() accepts image_caption param, sends to WP media
- _get_image_meta_for_url() / _build_image_caption() helpers
- _build_attribution_block(): separator + attribution paragraph at article end
  (original link, author, Bildnachweis/credit)
- _build_post_content() appends attribution block

telegram_bot.py:
- notify_pipeline_done() shows 🖼️ no-image count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-03-27 07:08:48 +00:00
parent 1963e32ab4
commit aaac5def27
7 changed files with 381 additions and 10 deletions

View file

@ -1,6 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, field
from html import unescape
import re
from typing import Any
@ -21,6 +21,7 @@ class ExtractedArticle:
images: list[str]
press_contact: str | None
extraction_error: str | None = None
image_metadata: dict[str, dict] = field(default_factory=dict)
def _clean_text(raw: str | None) -> str | None:
@ -197,6 +198,187 @@ def _extract_press_contact(content_text: str | None) -> str | None:
return None
# CSS class keywords that indicate a copyright/credit element inside a figcaption
_CREDIT_CLASS_RE = re.compile(
r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
re.IGNORECASE,
)
# Inline text patterns that signal a credit/copyright notice
_CREDIT_TEXT_RE = re.compile(
r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
re.IGNORECASE,
)
# data-* attribute names that carry credit/caption information directly on <img>
_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")
# Class keywords for adjacent sibling credit spans/divs after an <img>
_ADJ_CREDIT_CLASS_RE = re.compile(
r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
re.IGNORECASE,
)
def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
"""Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.
Uses three progressive strategies:
1. <figure> with <img> + <figcaption>
2. data-* attributes on <img> tags not already covered
3. <img> tags whose immediately following HTML contains a credit element
"""
result: dict[str, dict] = {}
try:
# ------------------------------------------------------------------
# Strategy 1: <figure> blocks containing <img> and <figcaption>
# ------------------------------------------------------------------
for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
fig_html = fig_match.group(1)
# Locate image src (src or lazy-loaded data-src)
img_match = re.search(
r"<img[^>]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
fig_html,
re.IGNORECASE,
)
if not img_match:
continue
img_src = urljoin(page_url, img_match.group(1).strip())
# Locate figcaption
figcap_match = re.search(
r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
fig_html,
re.IGNORECASE,
)
if not figcap_match:
continue
figcap_html = figcap_match.group(1)
# --- Extract credit ---
credit: str | None = None
# Try credit via class attribute on an inner element
credit_elem_match = re.search(
r"<(?:span|p|div)[^>]*"
+ _CREDIT_CLASS_RE.pattern
+ r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
figcap_html,
re.IGNORECASE,
)
if credit_elem_match:
credit = _clean_text(credit_elem_match.group(1))
# Fallback: scan plain text of figcaption for credit patterns
if not credit:
figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
if cred_text_match:
credit = _clean_text(cred_text_match.group(1))
# --- Extract caption (full figcaption text) ---
caption = _clean_text(figcap_html)
# Only store entries that carry at least one piece of metadata
if caption or credit:
entry: dict[str, str] = {}
if caption:
entry["caption"] = caption
if credit:
entry["credit"] = credit
result[img_src] = entry
# ------------------------------------------------------------------
# Strategy 2: data-* attributes on <img> tags
# ------------------------------------------------------------------
for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
img_attrs = img_match.group(1)
# Resolve image URL (prefer src over data-src)
src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
if not src_match:
src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
if not src_match:
continue
img_src = urljoin(page_url, src_match.group(1).strip())
# Skip images already handled by Strategy 1
if img_src in result:
continue
credit: str | None = None
caption: str | None = None
for attr in _IMG_DATA_CREDIT_ATTRS:
attr_match = re.search(
rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
img_attrs,
re.IGNORECASE,
)
if attr_match:
credit = _clean_text(attr_match.group(1))
break
for attr in _IMG_DATA_CAPTION_ATTRS:
attr_match = re.search(
rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
img_attrs,
re.IGNORECASE,
)
if attr_match:
caption = _clean_text(attr_match.group(1))
break
if caption or credit:
entry = {}
if caption:
entry["caption"] = caption
if credit:
entry["credit"] = credit
result[img_src] = entry
# ------------------------------------------------------------------
# Strategy 3: <img> followed within 200 chars by a credit element
# ------------------------------------------------------------------
for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
img_attrs = img_match.group(1)
src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
if not src_match:
src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
if not src_match:
continue
img_src = urljoin(page_url, src_match.group(1).strip())
# Skip images already handled by earlier strategies
if img_src in result:
continue
# Look at the 200 characters of HTML immediately after the img tag
after_start = img_match.end()
after_html = html[after_start : after_start + 200]
adj_match = re.search(
r"<(?:span|p|div)[^>]*"
+ _ADJ_CREDIT_CLASS_RE.pattern
+ r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
after_html,
re.IGNORECASE,
)
if adj_match:
credit = _clean_text(adj_match.group(1))
if credit:
result[img_src] = {"credit": credit}
except Exception:
return {}
return result
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
try:
req = Request(
@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
summary = _clean_text(content_text[:320])
images = _extract_images(html, url)
press_contact = _extract_press_contact(content_text)
image_metadata = _extract_image_metadata(html, url)
return ExtractedArticle(
title=title,
@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) ->
images=images,
press_contact=press_contact,
extraction_error=None,
image_metadata=image_metadata,
)
@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
"images": article.images,
"press_contact": article.press_contact,
"extraction_error": article.extraction_error,
"image_metadata": article.image_metadata,
}