fix(ingestion): resolve Google Alerts redirect URLs before article fetch

Google Alerts feed entries use google.com/url?...&url=<encoded_real_url>&...
tracking links. The extractor was fetching the Google redirect page instead
of the actual article, resulting in empty content and no images.

_resolve_google_redirect() extracts the real URL from the 'url' query
parameter; run_ingestion() now applies it to each entry link before the
link is passed to extract_article(). Non-Google URLs are returned
unchanged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-03-27 07:10:30 +00:00
parent aaac5def27
commit 0d07a9804d

View file

@ -7,7 +7,7 @@ import json
import re
import time
from typing import Any
from urllib.parse import unquote, urlparse
from urllib.parse import unquote, urlencode, urlparse, parse_qs
import feedparser
@ -38,6 +38,31 @@ class IngestionStats:
MAX_FEED_FETCH_RETRIES = 3
def _resolve_google_redirect(url: str) -> str:
"""Extract the real article URL from Google redirect URLs.
Google Alerts feed entries use tracking links like:
https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
This function returns the decoded real URL if detected, otherwise the
original URL unchanged.
"""
try:
parsed = urlparse(url)
host = (parsed.hostname or "").lower()
if host not in ("www.google.com", "google.com"):
return url
if parsed.path not in ("/url", "/url/"):
return url
params = parse_qs(parsed.query, keep_blank_values=False)
real_urls = params.get("url")
if real_urls:
return unquote(real_urls[0])
except Exception:
pass
return url
def _entry_published_iso(entry: dict) -> str | None:
published = entry.get("published_parsed") or entry.get("updated_parsed")
if not published:
@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
link = entry.get("link")
if not link:
continue
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
link = _resolve_google_redirect(link)
summary, content_raw = _entry_text(entry)
title = entry.get("title") or "Ohne Titel"