fix(ingestion): resolve Google Alerts redirect URLs before article fetch
Google Alerts feed entries use google.com/url?...&url=<encoded_real_url>&... tracking links. The extractor was fetching the Google redirect page instead of the actual article, resulting in empty content and no images. _resolve_google_redirect() extracts the real URL from the 'url' query parameter before passing it to extract_article(). Non-Google URLs are returned unchanged. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
aaac5def27
commit
0d07a9804d
1 changed files with 28 additions and 1 deletions
|
|
@ -7,7 +7,7 @@ import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import unquote, urlparse
|
from urllib.parse import unquote, urlencode, urlparse, parse_qs
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
||||||
|
|
@ -38,6 +38,31 @@ class IngestionStats:
|
||||||
MAX_FEED_FETCH_RETRIES = 3
|
MAX_FEED_FETCH_RETRIES = 3
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_google_redirect(url: str) -> str:
|
||||||
|
"""Extract the real article URL from Google redirect URLs.
|
||||||
|
|
||||||
|
Google Alerts feed entries use tracking links like:
|
||||||
|
https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
|
||||||
|
|
||||||
|
This function returns the decoded real URL if detected, otherwise the
|
||||||
|
original URL unchanged.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
host = (parsed.hostname or "").lower()
|
||||||
|
if host not in ("www.google.com", "google.com"):
|
||||||
|
return url
|
||||||
|
if parsed.path not in ("/url", "/url/"):
|
||||||
|
return url
|
||||||
|
params = parse_qs(parsed.query, keep_blank_values=False)
|
||||||
|
real_urls = params.get("url")
|
||||||
|
if real_urls:
|
||||||
|
return unquote(real_urls[0])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
def _entry_published_iso(entry: dict) -> str | None:
|
def _entry_published_iso(entry: dict) -> str | None:
|
||||||
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||||
if not published:
|
if not published:
|
||||||
|
|
@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
link = entry.get("link")
|
link = entry.get("link")
|
||||||
if not link:
|
if not link:
|
||||||
continue
|
continue
|
||||||
|
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
|
||||||
|
link = _resolve_google_redirect(link)
|
||||||
|
|
||||||
summary, content_raw = _entry_text(entry)
|
summary, content_raw = _entry_text(entry)
|
||||||
title = entry.get("title") or "Ohne Titel"
|
title = entry.get("title") or "Ohne Titel"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue