From 0d07a9804dd42d963347a304d8e3ff5d83651284 Mon Sep 17 00:00:00 2001
From: OliverGiertz <oliver@vanityontour.de>
Date: Fri, 27 Mar 2026 07:10:30 +0000
Subject: [PATCH] fix(ingestion): resolve Google Alerts redirect URLs before
 article fetch

Google Alerts feed entries use google.com/url?...&url=<encoded_real_url>&...
tracking links. The extractor was fetching the Google redirect page instead
of the actual article, resulting in empty content and no images.

_resolve_google_redirect() extracts the real URL from the 'url' query
parameter before passing it to extract_article(). Non-Google URLs are
returned unchanged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/app/ingestion.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py
index 510fd10..3710276 100644
--- a/backend/app/ingestion.py
+++ b/backend/app/ingestion.py
@@ -7,7 +7,7 @@ import json
 import re
 import time
 from typing import Any
-from urllib.parse import unquote, urlparse
+from urllib.parse import unquote, urlencode, urlparse, parse_qs
 
 import feedparser
 
@@ -38,6 +38,31 @@ class IngestionStats:
 MAX_FEED_FETCH_RETRIES = 3
 
 
+def _resolve_google_redirect(url: str) -> str:
+    """Extract the real article URL from Google redirect URLs.
+
+    Google Alerts feed entries use tracking links like:
+      https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
+
+    Returns the target from the ``url`` query parameter, decoded exactly once
+    by ``parse_qs``; any other (or unparsable) input is returned unchanged.
+    """
+    try:
+        parsed = urlparse(url)
+        host = (parsed.hostname or "").lower()
+        if host not in ("www.google.com", "google.com"):
+            return url
+        if parsed.path not in ("/url", "/url/"):
+            return url
+        params = parse_qs(parsed.query, keep_blank_values=False)
+        real_urls = params.get("url")
+        if real_urls:
+            return real_urls[0]  # already decoded; unquote() would decode twice
+    except (ValueError, TypeError, AttributeError):
+        pass  # best effort: malformed input falls back to the original link
+    return url
+
+
 def _entry_published_iso(entry: dict) -> str | None:
     published = entry.get("published_parsed") or entry.get("updated_parsed")
     if not published:
@@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
                 link = entry.get("link")
                 if not link:
                     continue
+                # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
+                link = _resolve_google_redirect(link)
 
                 summary, content_raw = _entry_text(entry)
                 title = entry.get("title") or "Ohne Titel"