feat(images): auto-select relevant article images and tidy detail header

This commit is contained in:
Oliver 2026-02-18 10:40:39 +01:00
parent fb3465fb10
commit 26e3d26b93
4 changed files with 115 additions and 13 deletions

View file

@ -4,8 +4,10 @@ from dataclasses import dataclass
from datetime import datetime, timezone from datetime import datetime, timezone
import hashlib import hashlib
import json import json
import re
import time import time
from typing import Any from typing import Any
from urllib.parse import unquote, urlparse
import feedparser import feedparser
@ -67,6 +69,72 @@ def _parsed_get(parsed: object, key: str, default: object = None) -> object:
return getattr(parsed, key, default) return getattr(parsed, key, default)
def _normalize_tokens(text: str) -> set[str]:
normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
return {token for token in normalized.split() if len(token) >= 4}
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
    """Score candidate image URLs for an article and return them best-first.

    Each candidate starts at 0 and is adjusted by these heuristics:

    * -150 if host+path contains a blocked pattern (logos, icons, trackers, ...).
    * For presseportal.de sources, by thumbnail directory in the path:
      ``story_big`` +120, ``highlight`` +45, ``liste`` -40.
    * -10 if the query string contains ``crop=``.
    * +6 per token shared between the title and the URL path, capped at +30.

    Args:
        source_url: URL of the article the images were extracted from.
        title: Article title, used for the title/path token overlap.
        images: Candidate image URLs.

    Returns:
        One dict per parseable candidate with keys ``url``, ``score`` and
        ``reasons`` (labels of the rules that fired), sorted by descending
        score. Candidates with equal scores keep their incoming order.
        Candidates whose URL cannot be parsed are omitted.
    """
    try:
        source_host = (urlparse(source_url).hostname or "").lower()
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
        # Fall back to source-agnostic scoring rather than failing the article.
        source_host = ""
    # Accept the host itself or a real subdomain; a plain substring test would
    # also match unrelated hosts such as "presseportal.de.example.com".
    is_presseportal = source_host == "presseportal.de" or source_host.endswith(".presseportal.de")
    title_tokens = _normalize_tokens(title)
    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel")

    ranked: list[dict[str, Any]] = []
    for url in images:
        try:
            parsed = urlparse(url)
        except ValueError:
            # A single malformed scraped URL must not abort the whole ranking;
            # an unparseable URL is not a usable image, so leave it out.
            continue
        path = unquote(parsed.path.lower())
        full = f"{parsed.netloc.lower()}{path}"
        score = 0
        reasons: list[str] = []

        if any(token in full for token in blocked_patterns):
            score -= 150
            reasons.append("blocked-pattern")

        # At most one thumbnail-directory rule applies per candidate.
        if is_presseportal and "/thumbnail/story_big/" in path:
            score += 120
            reasons.append("presseportal-story-big")
        elif is_presseportal and "/thumbnail/highlight/" in path:
            score += 45
            reasons.append("presseportal-highlight")
        elif is_presseportal and "/thumbnail/liste/" in path:
            score -= 40
            reasons.append("presseportal-list")

        if "crop=" in parsed.query.lower():
            score -= 10
            reasons.append("cropped-preview")

        # _normalize_tokens already treats "-" (and every other non-alphanumeric
        # character) as a separator, so the path needs no pre-processing.
        overlap = len(title_tokens & _normalize_tokens(path))
        if overlap > 0:
            score += min(30, overlap * 6)
            reasons.append(f"title-match:{overlap}")

        ranked.append({"url": url, "score": score, "reasons": reasons})

    # list.sort is stable, also with reverse=True: ties keep incoming order.
    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked
def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
    """Pick the most relevant images for an article.

    Empty entries and duplicates are dropped (first occurrence wins), the
    remainder is ranked with ``_rank_image_candidates``, and the first
    ``max_keep`` URLs with a positive score are kept. If that leaves nothing
    but candidates exist, the single top-ranked URL is kept as a fallback.

    Returns:
        A tuple ``(kept_urls, primary_url, ranked)`` where ``primary_url`` is
        the first kept URL (``None`` when nothing was kept) and ``ranked`` is
        the full ranking as returned by ``_rank_image_candidates``.
    """
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    unique_urls = list(dict.fromkeys(candidate for candidate in images if candidate))
    scored = _rank_image_candidates(source_url, title, unique_urls)

    positive_urls = [entry["url"] for entry in scored if entry["score"] > 0]
    selection = positive_urls[:max_keep]
    if not selection and scored:
        # Nothing scored above zero: fall back to the best candidate available.
        selection = [scored[0]["url"]]

    lead = selection[0] if selection else None
    return selection, lead, scored
def run_ingestion(feed_id: int | None = None) -> IngestionStats: def run_ingestion(feed_id: int | None = None) -> IngestionStats:
run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
feeds_processed = 0 feeds_processed = 0
@ -167,6 +235,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
final_summary = extracted.summary or (summary[:1000] if summary else None) final_summary = extracted.summary or (summary[:1000] if summary else None)
final_content_raw = extracted.content_text or content_raw final_content_raw = extracted.content_text or content_raw
final_canonical = extracted.canonical_url or entry.get("link") final_canonical = extracted.canonical_url or entry.get("link")
selected_images, primary_image, ranked_images = _select_relevant_images(
link,
final_title,
extracted.images,
max_keep=3,
)
source_hash = _entry_hash( source_hash = _entry_hash(
entry, entry,
@ -188,6 +262,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
} }
extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted) extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
extraction_meta["fetched_from"] = link extraction_meta["fetched_from"] = link
extraction_meta["image_selection"] = {
"primary": primary_image,
"selected_count": len(selected_images),
"total_candidates": len(extracted.images),
"ranked": ranked_images,
}
article_id = upsert_article( article_id = upsert_article(
ArticleUpsert( ArticleUpsert(
feed_id=int(feed["id"]), feed_id=int(feed["id"]),
@ -201,7 +281,7 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
summary=final_summary, summary=final_summary,
content_raw=final_content_raw, content_raw=final_content_raw,
content_rewritten=None, content_rewritten=None,
image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None, image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
press_contact=extracted.press_contact, press_contact=extracted.press_contact,
source_name_snapshot=feed.get("source_name"), source_name_snapshot=feed.get("source_name"),
source_terms_url_snapshot=feed.get("source_terms_url"), source_terms_url_snapshot=feed.get("source_terms_url"),

View file

@ -179,6 +179,27 @@ button.secondary {
background: #f8fafc; background: #f8fafc;
} }
/* Responsive grid container: fits as many columns of at least 220px as the
   width allows and lets them share the remaining space equally. */
.detail-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
gap: 8px 12px; /* 8px between rows, 12px between columns */
margin-bottom: 10px;
}
/* One bordered, rounded tile; its children are stacked as grid rows with a
   4px gap between them. */
.detail-item {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 8px;
padding: 8px;
display: grid;
gap: 4px;
}
/* Elements with class "k" inside a tile: smaller, muted grey text. */
.detail-item .k {
font-size: 12px;
color: #64748b;
}
.thumb { .thumb {
width: 72px; width: 72px;
height: 72px; height: 72px;

View file

@ -23,19 +23,19 @@
<main class="container"> <main class="container">
<section class="card"> <section class="card">
<h2>{{ article.title }}</h2> <h2>{{ article.title }}</h2>
<p><strong>Status:</strong> <span class="badge">{{ article.status }}</span></p> <div class="detail-grid">
<p><strong>Artikel-Datum:</strong> {{ article.published_at or "-" }}</p> <div class="detail-item"><span class="k">Status</span><span><span class="badge">{{ article.status }}</span></span></div>
<p><strong>Alter:</strong> {{ article.days_old if article.days_old is not none else "-" }} Tage</p> <div class="detail-item"><span class="k">Artikel-Datum</span><span>{{ article.published_at or "-" }}</span></div>
<p><strong>Relevanz:</strong> {{ article.relevance }}</p> <div class="detail-item"><span class="k">Alter</span><span>{{ article.days_old if article.days_old is not none else "-" }} Tage</span></div>
<p><strong>Autor:</strong> {{ article.author or "-" }}</p> <div class="detail-item"><span class="k">Relevanz</span><span>{{ article.relevance }}</span></div>
<p><strong>Feed:</strong> {{ feed.name if feed else "-" }}</p> <div class="detail-item"><span class="k">Autor</span><span>{{ article.author or "-" }}</span></div>
<p><strong>Quelle Snapshot:</strong> {{ article.source_name_snapshot or "-" }}</p> <div class="detail-item"><span class="k">Feed</span><span>{{ feed.name if feed else "-" }}</span></div>
<p><strong>Lizenz Snapshot:</strong> {{ article.source_license_name_snapshot or "-" }}</p> <div class="detail-item"><span class="k">Quelle Snapshot</span><span>{{ article.source_name_snapshot or "-" }}</span></div>
<p><strong>Terms Snapshot:</strong> {{ article.source_terms_url_snapshot or "-" }}</p> <div class="detail-item"><span class="k">Lizenz Snapshot</span><span>{{ article.source_license_name_snapshot or "-" }}</span></div>
<div class="detail-item"><span class="k">Terms Snapshot</span><span>{{ article.source_terms_url_snapshot or "-" }}</span></div>
</div>
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p> <p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
{% if article.canonical_url %} {% if article.canonical_url %}<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>{% endif %}
<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>
{% endif %}
{% if article.summary %} {% if article.summary %}
<p><strong>Summary:</strong> {{ article.summary }}</p> <p><strong>Summary:</strong> {{ article.summary }}</p>
{% endif %} {% endif %}

View file

@ -85,6 +85,7 @@ class TestIngestion(unittest.TestCase):
self.assertEqual(article["author"], "Autorin A") self.assertEqual(article["author"], "Autorin A")
self.assertIn("Original Volltext", article["content_raw"] or "") self.assertIn("Original Volltext", article["content_raw"] or "")
self.assertIn("Pressekontakt", article["meta_json"] or "") self.assertIn("Pressekontakt", article["meta_json"] or "")
self.assertIsNotNone(article["image_urls_json"])
@patch("backend.app.ingestion.extract_article") @patch("backend.app.ingestion.extract_article")
@patch("backend.app.ingestion.feedparser.parse") @patch("backend.app.ingestion.feedparser.parse")