fix(ingestion): skip data: URIs and known placeholder images

- ingestion.py: filter out data:image/... inline URIs before ranking
- ingestion.py: penalise (-300) known placeholder paths (some-default.jpg etc.)
- wordpress.py: _is_usable_image_url rejects data: URIs and placeholder paths

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-04-07 09:09:44 +00:00
parent 426a799371
commit 764e7bff6a
2 changed files with 94 additions and 9 deletions

View file

@ -38,6 +38,26 @@ class IngestionStats:
MAX_FEED_FETCH_RETRIES = 3 MAX_FEED_FETCH_RETRIES = 3
def _normalize_article_url(url: str) -> str:
"""Strip AMP and tracking query parameters from article URLs.
Removes ?outputType=valid_amp and other AMP/tracking params so that
AMP and non-AMP versions of the same article are deduplicated.
"""
_AMP_PARAMS = {"outputtype", "amp", "outputformat"}
try:
from urllib.parse import parse_qs, urlencode
parsed = urlparse(url)
if not parsed.query:
return url
params = parse_qs(parsed.query, keep_blank_values=True)
filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
new_query = urlencode(filtered, doseq=True)
return parsed._replace(query=new_query).geturl()
except Exception:
return url
def _resolve_google_redirect(url: str) -> str: def _resolve_google_redirect(url: str) -> str:
"""Extract the real article URL from Google redirect URLs. """Extract the real article URL from Google redirect URLs.
@ -103,16 +123,27 @@ def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> li
source_host = (urlparse(source_url).hostname or "").lower() source_host = (urlparse(source_url).hostname or "").lower()
is_presseportal = "presseportal.de" in source_host is_presseportal = "presseportal.de" in source_host
title_tokens = _normalize_tokens(title) title_tokens = _normalize_tokens(title)
blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel") blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
# Known placeholder/default images that should never be used as featured image
placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
ranked: list[dict[str, Any]] = [] ranked: list[dict[str, Any]] = []
for url in images: for url in images:
# Skip inline data: URIs (e.g. base64-encoded SVG placeholders)
if url.startswith("data:"):
continue
parsed = urlparse(url) parsed = urlparse(url)
path = unquote(parsed.path.lower()) path = unquote(parsed.path.lower())
full = f"{parsed.netloc.lower()}{path}" full = f"{parsed.netloc.lower()}{path}"
score = 0 score = 0
reasons: list[str] = [] reasons: list[str] = []
if any(token in full for token in placeholder_patterns):
score -= 300
reasons.append("placeholder-image")
if any(token in full for token in blocked_patterns): if any(token in full for token in blocked_patterns):
score -= 150 score -= 150
reasons.append("blocked-pattern") reasons.append("blocked-pattern")
@ -242,6 +273,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
continue continue
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...) # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
link = _resolve_google_redirect(link) link = _resolve_google_redirect(link)
# Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
link = _normalize_article_url(link)
summary, content_raw = _entry_text(entry) summary, content_raw = _entry_text(entry)
# Strip HTML tags from title (Google Alerts wraps matched keywords in <b>) # Strip HTML tags from title (Google Alerts wraps matched keywords in <b>)

View file

@ -2,11 +2,13 @@ from __future__ import annotations
import base64 import base64
from html import escape from html import escape
import logging
import json import json
import mimetypes import mimetypes
from pathlib import Path from pathlib import Path
import re import re
from typing import Any from typing import Any
from html import unescape as _html_unescape
from urllib.parse import quote_plus, urlparse from urllib.parse import quote_plus, urlparse
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
@ -135,9 +137,37 @@ def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) ->
return ids return ids
_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"}
_logger = logging.getLogger(__name__)
def _sanitize_image_url(url: str) -> str:
"""Decode HTML entities (e.g. &amp; → &) in image URLs from RSS feeds."""
return _html_unescape(url)
_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
def _is_usable_image_url(url: str) -> bool:
"""Return False for URLs that are unlikely to work as WP featured images."""
if not url or url.startswith("data:"):
return False
try:
path = urlparse(url).path.lower()
_, ext = path.rsplit(".", 1) if "." in path else ("", "")
if f".{ext}" in _BLOCKED_IMAGE_EXTS:
return False
if any(p in path for p in _PLACEHOLDER_PATTERNS):
return False
except Exception:
pass
return True
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
url = _sanitize_image_url(url)
headers = { headers = {
"User-Agent": "rss-news-publisher/1.0", "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
} }
if referer: if referer:
@ -153,11 +183,14 @@ def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes,
def _guess_filename(image_url: str, content_type: str) -> str:
    """Derive an ASCII-safe upload filename for an image.

    The name is taken from the last path segment of ``image_url`` (after
    decoding HTML entities). When that segment has no dot, an extension is
    guessed from ``content_type``, falling back to ``.jpg``.

    Args:
        image_url: URL the image was downloaded from.
        content_type: Content type of the download; anything after the first
            ``;`` is ignored.

    Returns:
        A filename made only of ASCII word characters, ``.`` and ``-``, with
        a non-empty base name.
    """
    parsed = urlparse(_sanitize_image_url(image_url))
    stem = Path(parsed.path).name or "article-image"
    if "." not in stem:
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
        stem = f"{stem}{ext}"
    # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header
    stem = stem.encode("ascii", errors="ignore").decode("ascii")
    stem = re.sub(r"[^\w.\-]", "_", stem)
    if not stem.strip("._-"):
        # Nothing meaningful survived sanitizing.
        return "article-image.jpg"
    if stem.startswith("."):
        # A basename made only of non-ASCII characters leaves just the
        # extension (e.g. ".jpg"); give it a base name again.
        stem = f"article-image.{stem.lstrip('.')}"
    return stem
@ -416,22 +449,41 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
featured_media_id = None featured_media_id = None
selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
if selected_image_url:
image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url) # Build candidate list: primary selected URL + fallbacks from image_urls_json
image_candidates: list[str] = []
if selected_image_url and _is_usable_image_url(selected_image_url):
image_candidates.append(selected_image_url)
try:
extra_urls = json.loads(article.get("image_urls_json") or "[]")
for u in extra_urls:
if u and u not in image_candidates and _is_usable_image_url(u):
image_candidates.append(u)
except Exception:
pass
for candidate_url in image_candidates:
image_meta = _get_image_meta_for_url(article.get("meta_json"), candidate_url)
image_caption = _build_image_caption(image_meta, source_url) image_caption = _build_image_caption(image_meta, source_url)
try: try:
featured_media_id = _upload_featured_media( featured_media_id = _upload_featured_media(
base_url=settings.wordpress_base_url, base_url=settings.wordpress_base_url,
auth_header=auth, auth_header=auth,
image_url=selected_image_url, image_url=candidate_url,
article_title=title, article_title=title,
source_url=source_url, source_url=source_url,
image_caption=image_caption, image_caption=image_caption,
) )
break # success — stop trying further candidates
except Exception as img_exc: except Exception as img_exc:
import logging _logger.warning(
logging.getLogger(__name__).warning( "Bild-Upload fehlgeschlagen, versuche nächste URL: %s%s", candidate_url, img_exc
"Bild-Upload fehlgeschlagen (wird übersprungen): %s%s", selected_image_url, img_exc )
if not featured_media_id and image_candidates:
_logger.warning(
"Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)",
len(image_candidates), article.get("id"), title[:60],
) )
payload = { payload = {