From 764e7bff6ab4907cedaf91680334d78b48039f32 Mon Sep 17 00:00:00 2001
From: OliverGiertz <oliver@vanityontour.de>
Date: Tue, 7 Apr 2026 09:09:44 +0000
Subject: [PATCH] fix(ingestion): skip data: URIs and known placeholder images

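- ingestion.py: filter out data:image/... inline URIs before ranking
- ingestion.py: penalise (-300) known placeholder paths (some-default.jpg etc.)
- ingestion.py: treat .svg/.ico/.gif image URLs as blocked patterns
- ingestion.py: strip AMP query params (outputType, amp, outputFormat) from article links
- wordpress.py: _is_usable_image_url rejects data: URIs, placeholder paths and .svg/.gif/.ico/.webp files
- wordpress.py: decode HTML entities in image URLs and make guessed filenames ASCII-safe
- wordpress.py: fall back to further image_urls_json candidates when an image upload fails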

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/app/ingestion.py | 35 ++++++++++++++++++++-
 backend/app/wordpress.py | 68 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 94 insertions(+), 9 deletions(-)
diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py
index d76f4c4..30140ca 100644
--- a/backend/app/ingestion.py
+++ b/backend/app/ingestion.py
@@ -38,6 +38,26 @@ class IngestionStats:
 MAX_FEED_FETCH_RETRIES = 3
 
 
+def _normalize_article_url(url: str) -> str:
+    """Strip AMP query parameters from article URLs.
+
+    Removes outputType, amp and outputFormat (matched case-insensitively) so
+    that AMP and non-AMP versions of the same article are deduplicated.
+    """
+    _AMP_PARAMS = {"outputtype", "amp", "outputformat"}  # compared against lower-cased keys
+    try:
+        from urllib.parse import parse_qs, urlencode
+        parsed = urlparse(url)
+        if not parsed.query:
+            return url  # no query string, nothing to strip
+        params = parse_qs(parsed.query, keep_blank_values=True)
+        filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
+        new_query = urlencode(filtered, doseq=True)  # NOTE: re-encodes the remaining params too
+        return parsed._replace(query=new_query).geturl()
+    except Exception:
+        return url  # best effort: on any failure the URL is returned unchanged
+
+
 def _resolve_google_redirect(url: str) -> str:
     """Extract the real article URL from Google redirect URLs.
 
@@ -103,16 +123,27 @@ def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> li
     source_host = (urlparse(source_url).hostname or "").lower()
     is_presseportal = "presseportal.de" in source_host
     title_tokens = _normalize_tokens(title)
-    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel")
+    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
+    # Known placeholder/default images that should never be used as featured image
+    placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
+
 
     ranked: list[dict[str, Any]] = []
     for url in images:
+        # Skip inline data: URIs (e.g. base64-encoded SVG placeholders)
+        if url.startswith("data:"):
+            continue
+
         parsed = urlparse(url)
         path = unquote(parsed.path.lower())
         full = f"{parsed.netloc.lower()}{path}"
         score = 0
         reasons: list[str] = []
 
+        if any(token in full for token in placeholder_patterns):
+            score -= 300
+            reasons.append("placeholder-image")
+
         if any(token in full for token in blocked_patterns):
             score -= 150
             reasons.append("blocked-pattern")
@@ -242,6 +273,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
                     continue
                 # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
                 link = _resolve_google_redirect(link)
+                # Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
+                link = _normalize_article_url(link)
 
                 summary, content_raw = _entry_text(entry)
                 # Strip HTML tags from title (Google Alerts wraps matched keywords in <b>)
diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py
index 8b6d5a0..cced743 100644
--- a/backend/app/wordpress.py
+++ b/backend/app/wordpress.py
@@ -2,11 +2,13 @@ from __future__ import annotations
 
 import base64
 from html import escape
+import logging
 import json
 import mimetypes
 from pathlib import Path
 import re
 from typing import Any
+from html import unescape as _html_unescape
 from urllib.parse import quote_plus, urlparse
 from urllib.request import Request, urlopen
 
@@ -135,9 +137,37 @@ def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) ->
     return ids
 
 
+_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"}
+_logger = logging.getLogger(__name__)
+
+
+def _sanitize_image_url(url: str) -> str:
+    """Return *url* with HTML entities decoded (e.g. &amp; → &)."""
+    return _html_unescape(url)
+
+
+_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
+
+def _is_usable_image_url(url: str) -> bool:
+    """Return False for empty URLs, data: URIs, blocked extensions and placeholder paths."""
+    if not url or url.startswith("data:"):
+        return False
+    try:
+        path = urlparse(url).path.lower()
+        _, ext = path.rsplit(".", 1) if "." in path else ("", "")  # ext = text after the last dot
+        if f".{ext}" in _BLOCKED_IMAGE_EXTS:
+            return False
+        if any(p in path for p in _PLACEHOLDER_PATTERNS):
+            return False
+    except Exception:
+        pass  # best effort: a URL that cannot be inspected is treated as usable
+    return True
+
+
 def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
+    url = _sanitize_image_url(url)
     headers = {
-        "User-Agent": "rss-news-publisher/1.0",
+        "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)",
         "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
     }
     if referer:
@@ -153,11 +183,14 @@ def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes,
 
 
 def _guess_filename(image_url: str, content_type: str) -> str:
-    parsed = urlparse(image_url)
+    parsed = urlparse(_sanitize_image_url(image_url))
     stem = Path(parsed.path).name or "article-image"
     if "." not in stem:
         ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
         stem = f"{stem}{ext}"
+    # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header
+    stem = stem.encode("ascii", errors="ignore").decode("ascii")
+    stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg"
     return stem
 
 
@@ -416,24 +449,43 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
 
     featured_media_id = None
     selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
-    if selected_image_url:
-        image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url)
+
+    # Build candidate list: primary selected URL + fallbacks from image_urls_json
+    image_candidates: list[str] = []
+    if selected_image_url and _is_usable_image_url(selected_image_url):
+        image_candidates.append(selected_image_url)
+    try:
+        extra_urls = json.loads(article.get("image_urls_json") or "[]")
+        for u in extra_urls:
+            if u and u not in image_candidates and _is_usable_image_url(u):
+                image_candidates.append(u)
+    except Exception:
+        pass
+
+    for candidate_url in image_candidates:
+        image_meta = _get_image_meta_for_url(article.get("meta_json"), candidate_url)
         image_caption = _build_image_caption(image_meta, source_url)
         try:
             featured_media_id = _upload_featured_media(
                 base_url=settings.wordpress_base_url,
                 auth_header=auth,
-                image_url=selected_image_url,
+                image_url=candidate_url,
                 article_title=title,
                 source_url=source_url,
                 image_caption=image_caption,
             )
+            break  # success — stop trying further candidates
         except Exception as img_exc:
-            import logging
-            logging.getLogger(__name__).warning(
-                "Bild-Upload fehlgeschlagen (wird übersprungen): %s — %s", selected_image_url, img_exc
+            _logger.warning(
+                "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc
             )
 
+    if not featured_media_id and image_candidates:
+        _logger.warning(
+            "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)",
+            len(image_candidates), article.get("id"), title[:60],
+        )
+
     payload = {
         "title": title,
         "content": content,