feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution
source_extraction.py:
- New _extract_image_metadata(): extracts figcaption text + copyright/credit
per image URL using 3 strategies (figure+figcaption, data-* attributes,
adjacent credit spans)
- ExtractedArticle gets new image_metadata field
- extracted_article_to_meta() includes image_metadata in stored JSON
pipeline.py:
- After auto image selection, reload the article and check whether image_review.selected_url is set in its meta_json
- Articles without a usable image → status "no_image" (excluded, with a Telegram notice)
- PipelineStats and summary report include no_image counter
db.py:
- Add "no_image" to articles status CHECK constraint
- Migration: recreates articles table with updated constraint on existing DBs
workflow.py / main.py:
- Map no_image to its own UI status with rewrite/close transitions
wordpress.py:
- _upload_featured_media() accepts image_caption param, sends to WP media
- _get_image_meta_for_url() / _build_image_caption() helpers
- _build_attribution_block(): separator + attribution paragraph at article end
(original link, author, Bildnachweis/credit)
- _build_post_content() appends attribution block
telegram_bot.py:
- notify_pipeline_done() shows 🖼️ no-image count
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1963e32ab4
commit
aaac5def27
7 changed files with 381 additions and 10 deletions
|
|
@ -45,6 +45,7 @@ class PipelineStats:
|
|||
rejected: int = 0
|
||||
warnings: int = 0
|
||||
errors: int = 0
|
||||
no_image: int = 0
|
||||
rejected_articles: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
|
|
@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
|
|||
"processed": stats.processed,
|
||||
"drafts_created": stats.drafts_created,
|
||||
"rejected": stats.rejected,
|
||||
"no_image": stats.no_image,
|
||||
"warnings": stats.warnings,
|
||||
"errors": stats.errors,
|
||||
}
|
||||
|
|
@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
|||
# Auto-select image
|
||||
_auto_select_image(article)
|
||||
|
||||
# Reload to get updated image_review
|
||||
article = get_article_by_id(article_id) or article
|
||||
|
||||
# Exclude articles without a usable image
|
||||
try:
|
||||
meta = json.loads(article.get("meta_json") or "{}")
|
||||
except Exception:
|
||||
meta = {}
|
||||
has_image = bool((meta.get("image_review") or {}).get("selected_url"))
|
||||
if not has_image:
|
||||
update_article_status(
|
||||
article_id,
|
||||
"no_image",
|
||||
actor="pipeline",
|
||||
note="Kein Bild vorhanden – Artikel ausgeschlossen",
|
||||
)
|
||||
stats.no_image += 1
|
||||
logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
|
||||
try:
|
||||
tg.send_message(
|
||||
f"🖼️ <b>Kein Bild</b> – Artikel #{article_id} ausgeschlossen\n"
|
||||
f"📰 {(article.get('title') or '')[:80]}"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
return
|
||||
|
||||
# Score relevance
|
||||
try:
|
||||
relevance = score_article_relevance(article)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue