From 764e7bff6ab4907cedaf91680334d78b48039f32 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Tue, 7 Apr 2026 09:09:44 +0000 Subject: [PATCH] fix(ingestion): skip data: URIs and known placeholder images - ingestion.py: filter out data:image/... inline URIs before ranking - ingestion.py: penalise (-300) known placeholder paths (some-default.jpg etc.) - wordpress.py: _is_usable_image_url rejects data: URIs and placeholder paths Co-Authored-By: Claude Sonnet 4.6 --- backend/app/ingestion.py | 35 ++++++++++++++++++++- backend/app/wordpress.py | 68 +++++++++++++++++++++++++++++++++++----- 2 files changed, 94 insertions(+), 9 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index d76f4c4..30140ca 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -38,6 +38,26 @@ class IngestionStats: MAX_FEED_FETCH_RETRIES = 3 +def _normalize_article_url(url: str) -> str: + """Strip AMP and tracking query parameters from article URLs. + + Removes ?outputType=valid_amp and other AMP/tracking params so that + AMP and non-AMP versions of the same article are deduplicated. + """ + _AMP_PARAMS = {"outputtype", "amp", "outputformat"} + try: + from urllib.parse import parse_qs, urlencode + parsed = urlparse(url) + if not parsed.query: + return url + params = parse_qs(parsed.query, keep_blank_values=True) + filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS} + new_query = urlencode(filtered, doseq=True) + return parsed._replace(query=new_query).geturl() + except Exception: + return url + + def _resolve_google_redirect(url: str) -> str: """Extract the real article URL from Google redirect URLs. 
@@ -103,16 +123,27 @@ def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> li source_host = (urlparse(source_url).hostname or "").lower() is_presseportal = "presseportal.de" in source_host title_tokens = _normalize_tokens(title) - blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel") + blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif") + # Known placeholder/default images that should never be used as featured image + placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + ranked: list[dict[str, Any]] = [] for url in images: + # Skip inline data: URIs (e.g. base64-encoded SVG placeholders) + if url.startswith("data:"): + continue + parsed = urlparse(url) path = unquote(parsed.path.lower()) full = f"{parsed.netloc.lower()}{path}" score = 0 reasons: list[str] = [] + if any(token in full for token in placeholder_patterns): + score -= 300 + reasons.append("placeholder-image") + if any(token in full for token in blocked_patterns): score -= 150 reasons.append("blocked-pattern") @@ -242,6 +273,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: continue # Resolve Google redirect URLs (google.com/url?...&url=&...) link = _resolve_google_redirect(link) + # Normalize AMP/tracking params (e.g. 
?outputType=valid_amp) + link = _normalize_article_url(link) summary, content_raw = _entry_text(entry) # Strip HTML tags from title (Google Alerts wraps matched keywords in <b>) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 8b6d5a0..cced743 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -2,11 +2,13 @@ from __future__ import annotations import base64 from html import escape +import logging import json import mimetypes from pathlib import Path import re from typing import Any +from html import unescape as _html_unescape from urllib.parse import quote_plus, urlparse from urllib.request import Request, urlopen @@ -135,9 +137,37 @@ def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> return ids +_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"} +_logger = logging.getLogger(__name__) + + +def _sanitize_image_url(url: str) -> str: + """Decode HTML entities (e.g. &amp; → &) in image URLs from RSS feeds.""" + return _html_unescape(url) + + +_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + +def _is_usable_image_url(url: str) -> bool: + """Return False for URLs that are unlikely to work as WP featured images.""" + if not url or url.startswith("data:"): + return False + try: + path = urlparse(url).path.lower() + _, ext = path.rsplit(".", 1) if "." 
in path else ("", "") + if f".{ext}" in _BLOCKED_IMAGE_EXTS: + return False + if any(p in path for p in _PLACEHOLDER_PATTERNS): + return False + except Exception: + pass + return True + + def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: + url = _sanitize_image_url(url) headers = { - "User-Agent": "rss-news-publisher/1.0", + "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)", "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", } if referer: @@ -153,11 +183,14 @@ def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, def _guess_filename(image_url: str, content_type: str) -> str: - parsed = urlparse(image_url) + parsed = urlparse(_sanitize_image_url(image_url)) stem = Path(parsed.path).name or "article-image" if "." not in stem: ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg" stem = f"{stem}{ext}" + # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header + stem = stem.encode("ascii", errors="ignore").decode("ascii") + stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg" return stem @@ -416,24 +449,43 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) - if selected_image_url: - image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url) + + # Build candidate list: primary selected URL + fallbacks from image_urls_json + image_candidates: list[str] = [] + if selected_image_url and _is_usable_image_url(selected_image_url): + image_candidates.append(selected_image_url) + try: + extra_urls = json.loads(article.get("image_urls_json") or "[]") + for u in extra_urls: + if u and u not in image_candidates and _is_usable_image_url(u): + image_candidates.append(u) + except Exception: + pass + + for candidate_url in image_candidates: + image_meta = 
_get_image_meta_for_url(article.get("meta_json"), candidate_url) image_caption = _build_image_caption(image_meta, source_url) try: featured_media_id = _upload_featured_media( base_url=settings.wordpress_base_url, auth_header=auth, - image_url=selected_image_url, + image_url=candidate_url, article_title=title, source_url=source_url, image_caption=image_caption, ) + break # success — stop trying further candidates except Exception as img_exc: - import logging - logging.getLogger(__name__).warning( - "Bild-Upload fehlgeschlagen (wird übersprungen): %s — %s", selected_image_url, img_exc + _logger.warning( + "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc ) + if not featured_media_id and image_candidates: + _logger.warning( + "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)", + len(image_candidates), article.get("id"), title[:60], + ) + payload = { "title": title, "content": content,