feat(images): auto-select relevant article images and tidy detail header
This commit is contained in:
parent
fb3465fb10
commit
26e3d26b93
4 changed files with 115 additions and 13 deletions
|
|
@ -4,8 +4,10 @@ from dataclasses import dataclass
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from urllib.parse import unquote, urlparse
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
||||||
|
|
@ -67,6 +69,72 @@ def _parsed_get(parsed: object, key: str, default: object = None) -> object:
|
||||||
return getattr(parsed, key, default)
|
return getattr(parsed, key, default)
|
||||||
|
|
||||||
|
|
||||||
|
# German umlauts and sharp s mapped to ASCII digraphs, so that a word such as
# "Müller" becomes "mueller" instead of being split into "m" and "ller".
_GERMAN_TRANSLITERATION = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})


def _normalize_tokens(text: str) -> set[str]:
    """Return the set of significant lowercase tokens found in *text*.

    The text is lowercased, German umlauts and sharp s are transliterated to
    ASCII digraphs, and every run of characters outside ``a-z0-9`` is treated
    as a separator. Tokens shorter than four characters are dropped.

    Args:
        text: Arbitrary text, e.g. an article title or a URL path.

    Returns:
        The distinct tokens with a length of at least four characters.
    """
    # Transliterate before stripping; otherwise the regex below would cut
    # words apart at every umlaut.
    lowered = text.lower().translate(_GERMAN_TRANSLITERATION)
    normalized = re.sub(r"[^a-z0-9]+", " ", lowered)
    return {token for token in normalized.split() if len(token) >= 4}
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
    """Score candidate image URLs for an article and return them best-first.

    Scoring rules, applied per URL:

    * -150 if host + path contains one of the blocked substrings (logos,
      store badges, icons, tracking pixels, ...).
    * Only when the article source is presseportal.de or a subdomain of it:
      +120 for a ``/thumbnail/story_big/`` path, else +45 for
      ``/thumbnail/highlight/``, else -40 for ``/thumbnail/liste/``.
    * -10 if the query string contains ``crop=``.
    * +6 per token shared between the title and the URL path, capped at +30.

    A URL that ``urlparse`` rejects is kept in the result with a score of
    -1000 and the reason ``unparsable-url`` so that it sorts last.

    Args:
        source_url: URL of the article the images belong to.
        title: Article title used for the token-overlap bonus.
        images: Candidate image URLs.

    Returns:
        One dict per candidate with the keys ``url``, ``score`` and
        ``reasons``, sorted by descending score. Candidates with equal scores
        keep their input order.
    """
    try:
        source_host = (urlparse(source_url).hostname or "").lower()
    except ValueError:
        # urlparse raises for some malformed URLs (e.g. unbalanced IPv6
        # brackets); fall back to "unknown host" instead of failing.
        source_host = ""
    # Exact domain or a subdomain only; a plain substring test would also
    # accept hosts such as "presseportal.de.example.com".
    is_presseportal = source_host == "presseportal.de" or source_host.endswith(".presseportal.de")
    title_tokens = _normalize_tokens(title)
    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel")

    ranked: list[dict[str, Any]] = []
    for url in images:
        try:
            parsed = urlparse(url)
        except ValueError:
            # Keep the candidate visible in the ranking, but push it to the end.
            ranked.append({"url": url, "score": -1000, "reasons": ["unparsable-url"]})
            continue

        path = unquote(parsed.path.lower())
        full = f"{parsed.netloc.lower()}{path}"
        score = 0
        reasons: list[str] = []

        if any(token in full for token in blocked_patterns):
            score -= 150
            reasons.append("blocked-pattern")

        if is_presseportal and "/thumbnail/story_big/" in path:
            score += 120
            reasons.append("presseportal-story-big")
        elif is_presseportal and "/thumbnail/highlight/" in path:
            score += 45
            reasons.append("presseportal-highlight")
        elif is_presseportal and "/thumbnail/liste/" in path:
            score -= 40
            reasons.append("presseportal-list")

        if "crop=" in (parsed.query or "").lower():
            score -= 10
            reasons.append("cropped-preview")

        # The tokenizer already splits on every non-alphanumeric character,
        # hyphens included, so the path can be passed as is.
        overlap = len(title_tokens.intersection(_normalize_tokens(path)))
        if overlap > 0:
            score += min(30, overlap * 6)
            reasons.append(f"title-match:{overlap}")

        ranked.append({"url": url, "score": score, "reasons": reasons})

    # list.sort is stable, also with reverse=True, so ties keep input order.
    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked
|
||||||
|
|
||||||
|
|
||||||
|
def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
    """Pick the most relevant images for an article.

    Empty entries and repeated URLs are removed (first occurrence wins), the
    remainder is ranked with ``_rank_image_candidates``, and up to *max_keep*
    URLs with a positive score are kept. If no candidate scores above zero,
    the top-ranked one is kept as a fallback.

    Args:
        source_url: URL of the article the images belong to.
        title: Article title, forwarded to the ranking step.
        images: Candidate image URLs in their original order.
        max_keep: Upper bound on the number of positively scored URLs kept.

    Returns:
        A tuple ``(kept, primary, ranked)``: the kept URLs, the first kept URL
        (``None`` when nothing was kept), and the full ranking.
    """
    # dict.fromkeys preserves insertion order, so each URL stays at the
    # position of its first occurrence.
    unique_images = list(dict.fromkeys(candidate for candidate in images if candidate))
    ranked = _rank_image_candidates(source_url, title, unique_images)

    positive_urls = [entry["url"] for entry in ranked if entry["score"] > 0]
    kept = positive_urls[:max_keep]
    if ranked and not kept:
        # Nothing scored above zero: fall back to the best available candidate.
        kept = [ranked[0]["url"]]

    primary = kept[0] if kept else None
    return kept, primary, ranked
|
||||||
|
|
||||||
|
|
||||||
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
|
run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
|
||||||
feeds_processed = 0
|
feeds_processed = 0
|
||||||
|
|
@ -167,6 +235,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
final_summary = extracted.summary or (summary[:1000] if summary else None)
|
final_summary = extracted.summary or (summary[:1000] if summary else None)
|
||||||
final_content_raw = extracted.content_text or content_raw
|
final_content_raw = extracted.content_text or content_raw
|
||||||
final_canonical = extracted.canonical_url or entry.get("link")
|
final_canonical = extracted.canonical_url or entry.get("link")
|
||||||
|
selected_images, primary_image, ranked_images = _select_relevant_images(
|
||||||
|
link,
|
||||||
|
final_title,
|
||||||
|
extracted.images,
|
||||||
|
max_keep=3,
|
||||||
|
)
|
||||||
|
|
||||||
source_hash = _entry_hash(
|
source_hash = _entry_hash(
|
||||||
entry,
|
entry,
|
||||||
|
|
@ -188,6 +262,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
}
|
}
|
||||||
extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
|
extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
|
||||||
extraction_meta["fetched_from"] = link
|
extraction_meta["fetched_from"] = link
|
||||||
|
extraction_meta["image_selection"] = {
|
||||||
|
"primary": primary_image,
|
||||||
|
"selected_count": len(selected_images),
|
||||||
|
"total_candidates": len(extracted.images),
|
||||||
|
"ranked": ranked_images,
|
||||||
|
}
|
||||||
article_id = upsert_article(
|
article_id = upsert_article(
|
||||||
ArticleUpsert(
|
ArticleUpsert(
|
||||||
feed_id=int(feed["id"]),
|
feed_id=int(feed["id"]),
|
||||||
|
|
@ -201,7 +281,7 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
summary=final_summary,
|
summary=final_summary,
|
||||||
content_raw=final_content_raw,
|
content_raw=final_content_raw,
|
||||||
content_rewritten=None,
|
content_rewritten=None,
|
||||||
image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None,
|
image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
|
||||||
press_contact=extracted.press_contact,
|
press_contact=extracted.press_contact,
|
||||||
source_name_snapshot=feed.get("source_name"),
|
source_name_snapshot=feed.get("source_name"),
|
||||||
source_terms_url_snapshot=feed.get("source_terms_url"),
|
source_terms_url_snapshot=feed.get("source_terms_url"),
|
||||||
|
|
|
||||||
|
|
@ -179,6 +179,27 @@ button.secondary {
|
||||||
background: #f8fafc;
|
background: #f8fafc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.detail-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
||||||
|
gap: 8px 12px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.detail-item {
|
||||||
|
background: #f8fafc;
|
||||||
|
border: 1px solid #e2e8f0;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 8px;
|
||||||
|
display: grid;
|
||||||
|
gap: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.detail-item .k {
|
||||||
|
font-size: 12px;
|
||||||
|
color: #64748b;
|
||||||
|
}
|
||||||
|
|
||||||
.thumb {
|
.thumb {
|
||||||
width: 72px;
|
width: 72px;
|
||||||
height: 72px;
|
height: 72px;
|
||||||
|
|
|
||||||
|
|
@ -23,19 +23,19 @@
|
||||||
<main class="container">
|
<main class="container">
|
||||||
<section class="card">
|
<section class="card">
|
||||||
<h2>{{ article.title }}</h2>
|
<h2>{{ article.title }}</h2>
|
||||||
<p><strong>Status:</strong> <span class="badge">{{ article.status }}</span></p>
|
<div class="detail-grid">
|
||||||
<p><strong>Artikel-Datum:</strong> {{ article.published_at or "-" }}</p>
|
<div class="detail-item"><span class="k">Status</span><span><span class="badge">{{ article.status }}</span></span></div>
|
||||||
<p><strong>Alter:</strong> {{ article.days_old if article.days_old is not none else "-" }} Tage</p>
|
<div class="detail-item"><span class="k">Artikel-Datum</span><span>{{ article.published_at or "-" }}</span></div>
|
||||||
<p><strong>Relevanz:</strong> {{ article.relevance }}</p>
|
<div class="detail-item"><span class="k">Alter</span><span>{{ article.days_old if article.days_old is not none else "-" }} Tage</span></div>
|
||||||
<p><strong>Autor:</strong> {{ article.author or "-" }}</p>
|
<div class="detail-item"><span class="k">Relevanz</span><span>{{ article.relevance }}</span></div>
|
||||||
<p><strong>Feed:</strong> {{ feed.name if feed else "-" }}</p>
|
<div class="detail-item"><span class="k">Autor</span><span>{{ article.author or "-" }}</span></div>
|
||||||
<p><strong>Quelle Snapshot:</strong> {{ article.source_name_snapshot or "-" }}</p>
|
<div class="detail-item"><span class="k">Feed</span><span>{{ feed.name if feed else "-" }}</span></div>
|
||||||
<p><strong>Lizenz Snapshot:</strong> {{ article.source_license_name_snapshot or "-" }}</p>
|
<div class="detail-item"><span class="k">Quelle Snapshot</span><span>{{ article.source_name_snapshot or "-" }}</span></div>
|
||||||
<p><strong>Terms Snapshot:</strong> {{ article.source_terms_url_snapshot or "-" }}</p>
|
<div class="detail-item"><span class="k">Lizenz Snapshot</span><span>{{ article.source_license_name_snapshot or "-" }}</span></div>
|
||||||
|
<div class="detail-item"><span class="k">Terms Snapshot</span><span>{{ article.source_terms_url_snapshot or "-" }}</span></div>
|
||||||
|
</div>
|
||||||
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
|
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
|
||||||
{% if article.canonical_url %}
|
{% if article.canonical_url %}<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>{% endif %}
|
||||||
<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>
|
|
||||||
{% endif %}
|
|
||||||
{% if article.summary %}
|
{% if article.summary %}
|
||||||
<p><strong>Summary:</strong> {{ article.summary }}</p>
|
<p><strong>Summary:</strong> {{ article.summary }}</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,7 @@ class TestIngestion(unittest.TestCase):
|
||||||
self.assertEqual(article["author"], "Autorin A")
|
self.assertEqual(article["author"], "Autorin A")
|
||||||
self.assertIn("Original Volltext", article["content_raw"] or "")
|
self.assertIn("Original Volltext", article["content_raw"] or "")
|
||||||
self.assertIn("Pressekontakt", article["meta_json"] or "")
|
self.assertIn("Pressekontakt", article["meta_json"] or "")
|
||||||
|
self.assertIsNotNone(article["image_urls_json"])
|
||||||
|
|
||||||
@patch("backend.app.ingestion.extract_article")
|
@patch("backend.app.ingestion.extract_article")
|
||||||
@patch("backend.app.ingestion.feedparser.parse")
|
@patch("backend.app.ingestion.feedparser.parse")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue