From 0d07a9804dd42d963347a304d8e3ff5d83651284 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 07:10:30 +0000 Subject: [PATCH] fix(ingestion): resolve Google Alerts redirect URLs before article fetch Google Alerts feed entries use google.com/url?...&url=&... tracking links. The extractor was fetching the Google redirect page instead of the actual article, resulting in empty content and no images. _resolve_google_redirect() extracts the real URL from the 'url' query parameter before passing it to extract_article(). Non-Google URLs are returned unchanged. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/ingestion.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 510fd10..3710276 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -7,7 +7,7 @@ import json import re import time from typing import Any -from urllib.parse import unquote, urlparse +from urllib.parse import unquote, urlencode, urlparse, parse_qs import feedparser @@ -38,6 +38,31 @@ class IngestionStats: MAX_FEED_FETCH_RETRIES = 3 +def _resolve_google_redirect(url: str) -> str: + """Extract the real article URL from Google redirect URLs. + + Google Alerts feed entries use tracking links like: + https://www.google.com/url?rct=j&sa=t&url=&ct=ga&... + + This function returns the decoded real URL if detected, otherwise the + original URL unchanged. + """ + try: + parsed = urlparse(url) + host = (parsed.hostname or "").lower() + if host not in ("www.google.com", "google.com"): + return url + if parsed.path not in ("/url", "/url/"): + return url + params = parse_qs(parsed.query, keep_blank_values=False) + real_urls = params.get("url") + if real_urls: + return unquote(real_urls[0]) + except Exception: + pass + return url + + def _entry_published_iso(entry: dict) -> str | None: published = entry.get("published_parsed") or entry.get("updated_parsed") if not published: @@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: link = entry.get("link") if not link: continue + # Resolve Google redirect URLs (google.com/url?...&url=&...) + link = _resolve_google_redirect(link) summary, content_raw = _entry_text(entry) title = entry.get("title") or "Ohne Titel"