fix(ingestion): resolve Google Alerts redirect URLs before article fetch

Google Alerts feed entries use google.com/url?...&url=<encoded_real_url>&... tracking links. The extractor was fetching the Google redirect page instead of the actual article, resulting in empty content and no images.

_resolve_google_redirect() extracts the real URL from the 'url' query parameter before the link is passed to extract_article(). Non-Google URLs are returned unchanged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 07:10:30 +00:00 · 2026-03-27 07:10:30 +00:00 · 0d07a9804d
commit 0d07a9804d
parent aaac5def27
1 changed files with 28 additions and 1 deletions
--- a/backend/app/ingestion.py
+++ b/backend/app/ingestion.py
@ -7,7 +7,7 @@ import json
 import re
 import time
 from typing import Any
-from urllib.parse import unquote, urlparse
+from urllib.parse import unquote, urlencode, urlparse, parse_qs

 import feedparser

@ -38,6 +38,31 @@ class IngestionStats:
 MAX_FEED_FETCH_RETRIES = 3


+def _resolve_google_redirect(url: str) -> str:
+    """Return the real article URL behind a Google redirect link.
+
+    Google Alerts feed entries use tracking links like:
+      https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
+
+    The target is taken from the ``url`` query parameter. ``parse_qs`` already
+    percent-decodes it, so it is not unquoted a second time; escapes that
+    belong to the target itself (e.g. ``%20``) must survive. Any other URL,
+    or one that cannot be parsed, is returned unchanged.
+    """
+    try:
+        parsed = urlparse(url)
+        host = (parsed.hostname or "").lower()
+        if host not in ("www.google.com", "google.com"):
+            return url
+        if parsed.path not in ("/url", "/url/"):
+            return url
+        targets = parse_qs(parsed.query).get("url")
+    except (ValueError, TypeError, AttributeError):
+        # Best effort: a link that cannot be parsed is passed through as-is.
+        return url
+    return targets[0] if targets else url
+
+
 def _entry_published_iso(entry: dict) -> str | None:
    published = entry.get("published_parsed") or entry.get("updated_parsed")
    if not published:
@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
                link = entry.get("link")
                if not link:
                    continue
+                # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
+                link = _resolve_google_redirect(link)

                summary, content_raw = _entry_text(entry)
                title = entry.get("title") or "Ohne Titel"