From 26e3d26b93b440b5b3460f6023b39ba8b99dddaf Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:40:39 +0100 Subject: [PATCH] feat(images): auto-select relevant article images and tidy detail header --- backend/app/ingestion.py | 82 ++++++++++++++++++++- backend/static/admin.css | 21 ++++++ backend/templates/admin_article_detail.html | 24 +++--- backend/tests/test_ingestion.py | 1 + 4 files changed, 115 insertions(+), 13 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 37703de..8a7696a 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -4,8 +4,10 @@ from dataclasses import dataclass from datetime import datetime, timezone import hashlib import json +import re import time from typing import Any +from urllib.parse import unquote, urlparse import feedparser @@ -67,6 +69,72 @@ def _parsed_get(parsed: object, key: str, default: object = None) -> object: return getattr(parsed, key, default) +def _normalize_tokens(text: str) -> set[str]: + normalized = re.sub(r"[^a-z0-9]+", " ", text.lower()) + return {token for token in normalized.split() if len(token) >= 4} + + +def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: + source_host = (urlparse(source_url).hostname or "").lower() + is_presseportal = "presseportal.de" in source_host + title_tokens = _normalize_tokens(title) + blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel") + + ranked: list[dict[str, Any]] = [] + for url in images: + parsed = urlparse(url) + path = unquote(parsed.path.lower()) + full = f"{parsed.netloc.lower()}{path}" + score = 0 + reasons: list[str] = [] + + if any(token in full for token in blocked_patterns): + score -= 150 + reasons.append("blocked-pattern") + + if is_presseportal and "/thumbnail/story_big/" in path: + score += 120 + reasons.append("presseportal-story-big") + elif is_presseportal and "/thumbnail/highlight/" in path: + score += 45 + reasons.append("presseportal-highlight") + elif is_presseportal and "/thumbnail/liste/" in path: + score -= 40 + reasons.append("presseportal-list") + + if "crop=" in (parsed.query or "").lower(): + score -= 10 + reasons.append("cropped-preview") + + path_tokens = _normalize_tokens(path.replace("-", " ")) + overlap = len(title_tokens.intersection(path_tokens)) + if overlap > 0: + score += min(30, overlap * 6) + reasons.append(f"title-match:{overlap}") + + ranked.append({"url": url, "score": score, "reasons": reasons}) + + ranked.sort(key=lambda item: item["score"], reverse=True) + return ranked + + +def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]: + # dedupe incoming order first + deduped: list[str] = [] + seen: set[str] = set() + for image in images: + if image and image not in seen: + seen.add(image) + deduped.append(image) + + ranked = _rank_image_candidates(source_url, title, deduped) + kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep] + if not kept and ranked: + kept = [ranked[0]["url"]] + primary = kept[0] if kept else None + return kept, primary, ranked + + def run_ingestion(feed_id: int | None = None) -> IngestionStats: run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) feeds_processed = 0 @@ -167,6 +235,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: final_summary = extracted.summary or (summary[:1000] if summary else None) final_content_raw = extracted.content_text or content_raw final_canonical = extracted.canonical_url or entry.get("link") + selected_images, primary_image, ranked_images = _select_relevant_images( + link, + final_title, + extracted.images, + max_keep=3, + ) source_hash = _entry_hash( entry, @@ -188,6 +262,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: } extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted) extraction_meta["fetched_from"] = link + extraction_meta["image_selection"] = { + "primary": primary_image, + "selected_count": len(selected_images), + "total_candidates": len(extracted.images), + "ranked": ranked_images, + } article_id = upsert_article( ArticleUpsert( feed_id=int(feed["id"]), @@ -201,7 +281,7 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: summary=final_summary, content_raw=final_content_raw, content_rewritten=None, - image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None, + image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, press_contact=extracted.press_contact, source_name_snapshot=feed.get("source_name"), source_terms_url_snapshot=feed.get("source_terms_url"), diff --git a/backend/static/admin.css b/backend/static/admin.css index 705aeda..16d55be 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -179,6 +179,27 @@ button.secondary { background: #f8fafc; } +.detail-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 8px 12px; + margin-bottom: 10px; +} + +.detail-item { + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + display: grid; + gap: 4px; +} + +.detail-item .k { + font-size: 12px; + color: #64748b; +} + .thumb { width: 72px; height: 72px; diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index a38937b..29c054b 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -23,19 +23,19 @@

{{ article.title }}

-

Status: {{ article.status }}

-

Artikel-Datum: {{ article.published_at or "-" }}

-

Alter: {{ article.days_old if article.days_old is not none else "-" }} Tage

-

Relevanz: {{ article.relevance }}

-

Autor: {{ article.author or "-" }}

-

Feed: {{ feed.name if feed else "-" }}

-

Quelle Snapshot: {{ article.source_name_snapshot or "-" }}

-

Lizenz Snapshot: {{ article.source_license_name_snapshot or "-" }}

-

Terms Snapshot: {{ article.source_terms_url_snapshot or "-" }}

+
+
Status{{ article.status }}
+
Artikel-Datum{{ article.published_at or "-" }}
+
Alter{{ article.days_old if article.days_old is not none else "-" }} Tage
+
Relevanz{{ article.relevance }}
+
Autor{{ article.author or "-" }}
+
Feed{{ feed.name if feed else "-" }}
+
Quelle Snapshot{{ article.source_name_snapshot or "-" }}
+
Lizenz Snapshot{{ article.source_license_name_snapshot or "-" }}
+
Terms Snapshot{{ article.source_terms_url_snapshot or "-" }}
+

Quelle: {{ article.source_url }}

- {% if article.canonical_url %} -

Canonical: {{ article.canonical_url }}

- {% endif %} + {% if article.canonical_url %}

Canonical: {{ article.canonical_url }}

{% endif %} {% if article.summary %}

Summary: {{ article.summary }}

{% endif %} diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py index 05b2c2b..342c216 100644 --- a/backend/tests/test_ingestion.py +++ b/backend/tests/test_ingestion.py @@ -85,6 +85,7 @@ class TestIngestion(unittest.TestCase): self.assertEqual(article["author"], "Autorin A") self.assertIn("Original Volltext", article["content_raw"] or "") self.assertIn("Pressekontakt", article["meta_json"] or "") + self.assertIsNotNone(article["image_urls_json"]) @patch("backend.app.ingestion.extract_article") @patch("backend.app.ingestion.feedparser.parse")