rss-news/backend/app/rewrite.py
OliverGiertz 12932bca90 fix(rewrite): attribute claims to source instead of using first-person 'wir'
Rewrites must not use 'wir haben erforscht/berechnet' since the content
comes from a third-party source. The prompt now passes the source name
and instructs GPT to attribute all claims to the original publisher
(e.g. 'laut PiNCAMP', 'die Auswertung zeigt').

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 07:36:09 +00:00

204 lines
7.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import json
import re
from typing import Any
from urllib.request import Request, urlopen
from .config import get_settings
def _sanitize_source_text(text: str) -> str:
raw = (text or "").strip()
if not raw:
return ""
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
if len(lines) > 3:
lines = lines[3:]
joined = "\n".join(lines)
# Remove press contact block at end from "Pressekontakt" onward.
joined = re.sub(
r"\n?\s*Pressekontakt[\s\S]*$",
"",
joined,
flags=re.IGNORECASE,
).strip()
return joined
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
out: list[str] = []
seen: set[str] = set()
for raw in tags:
value = re.sub(r"\s+", " ", str(raw or "").strip())
value = re.sub(r"^[#\-•\s]+", "", value)
value = re.sub(r"[;,.:\s]+$", "", value)
if not value:
continue
if len(value) < 2 or len(value) > 40:
continue
key = value.casefold()
if key in seen:
continue
seen.add(key)
out.append(value)
if len(out) >= max_tags:
break
return out
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
    """Send one system/user message pair to the OpenAI chat-completions API.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        temperature: Sampling temperature forwarded in the request payload.

    Returns:
        The stripped text content of the first returned choice.

    Raises:
        RuntimeError: If no API key is configured, or if the decoded response
            does not contain a non-empty string at
            ``choices[0].message.content``.

    Exceptions raised by ``urlopen`` (network/HTTP failures, timeout) and by
    ``json.loads`` (non-JSON body) are not caught here and propagate to the
    caller unchanged.
    """
    settings = get_settings()
    api_key = settings.openai_api_key
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY fehlt")

    payload = {
        "model": settings.openai_model,
        "temperature": temperature,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    }
    req = Request(
        url="https://api.openai.com/v1/chat/completions",
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urlopen(req, timeout=60) as resp:
        raw = resp.read().decode("utf-8", errors="replace")
    data = json.loads(raw)

    # Validate every level of the response before dereferencing it, so that a
    # well-formed but unexpected JSON document surfaces as RuntimeError
    # instead of an AttributeError from calling .get() on a non-dict.
    choices = data.get("choices") if isinstance(data, dict) else None
    if not isinstance(choices, list) or not choices:
        raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")

    first_choice = choices[0]
    message = first_choice.get("message") if isinstance(first_choice, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("OpenAI lieferte keinen Inhalt")
    return content.strip()
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Rewrite an article in German "Du" form via the OpenAI chat helper.

    The source text is the sanitized ``content_raw`` field, falling back to
    the stripped ``summary`` field. The publisher name used for attribution is
    taken from ``source_name_snapshot``, then ``author``, then the generic
    phrase "die Quelle".

    Args:
        article: Mapping read via the keys ``content_raw``, ``summary``,
            ``title``, ``source_name_snapshot`` and ``author``.

    Returns:
        The model's rewritten text as returned by ``_openai_chat``.

    Raises:
        RuntimeError: If neither ``content_raw`` nor ``summary`` yields any
            text. Errors raised by ``_openai_chat`` propagate unchanged.
    """
    source_text = _sanitize_source_text(article.get("content_raw") or "")
    if not source_text:
        source_text = (article.get("summary") or "").strip()
    if not source_text:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    title = (article.get("title") or "").strip()
    # Strip each candidate *before* falling back: a whitespace-only snapshot
    # is truthy and would otherwise end up as an empty name in the prompt.
    source_name = (
        (article.get("source_name_snapshot") or "").strip()
        or (article.get("author") or "").strip()
        or "die Quelle"
    )

    prompt = (
        "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
        "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
        "ohne Pressekontakt, ohne Quellenblock. "
        "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
        "Inhaltlich korrekt bleiben, nichts erfinden. "
        f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. "
        # The dash separates the prohibition from the instruction that
        # follows; without it the two clauses run together.
        "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, "
        f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n"
        f"Titel: {title}\n\n"
        f"Originaltext:\n{source_text}"
    )
    return _openai_chat(
        "Du bist ein deutscher News-Redakteur.",
        prompt,
        temperature=0.4,
    )
def _tags_from_json(candidate: str, max_tags: int) -> list[str] | None:
    """Decode *candidate* as a JSON array of tags; return None if it is not one."""
    try:
        parsed = json.loads(candidate)
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        return None
    if not isinstance(parsed, list):
        return None
    # Keep only scalar text/number items: stringifying null or nested values
    # would produce bogus tags such as "None" or "['a', 'b']".
    items = [
        str(item)
        for item in parsed
        if isinstance(item, (str, int, float)) and not isinstance(item, bool)
    ]
    return _normalize_tags(items, max_tags=max_tags)


def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Ask the model for tags describing an article and return them normalized.

    The text sent to the model is the first non-blank value of
    ``rewritten_text``, the sanitized ``content_raw`` field and the
    ``summary`` field, truncated to 3500 characters.

    Args:
        article: Mapping read via the keys ``content_raw``, ``summary`` and
            ``title``.
        rewritten_text: Optional already-rewritten text to tag instead of the
            source text.
        max_tags: Maximum number of tags requested and returned.

    Returns:
        The normalized tags, or an empty list when no text is available or the
        model output contains no parseable JSON array. Errors raised by
        ``_openai_chat`` propagate unchanged.
    """
    # Strip each candidate before moving on, so a whitespace-only
    # ``rewritten_text`` does not block the fallback to the source text.
    source_text = str(rewritten_text or "").strip()
    if not source_text:
        source_text = _sanitize_source_text(article.get("content_raw") or "")
    if not source_text:
        source_text = str(article.get("summary") or "").strip()
    if not source_text:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{source_text[:3500]}"
    )
    raw = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    tags = _tags_from_json(raw, max_tags)
    if tags is not None:
        return tags

    # fallback: extract first JSON-like array if model wrapped output
    match = re.search(r"\[[\s\S]*\]", raw)
    if match:
        return _tags_from_json(match.group(0), max_tags) or []
    return []
def score_article_relevance(article: dict[str, Any]) -> dict[str, Any]:
    """Score article relevance for a VanLife/Camping/Outdoor blog (0-100).

    The model is given the title and the first 2000 characters of the
    sanitized ``content_raw`` field (or of ``summary`` as fallback) and is
    asked to answer with a JSON object.

    Args:
        article: Mapping read via the keys ``title``, ``content_raw`` and
            ``summary``.

    Returns:
        ``{"score": int, "reason": str, "topics": list[str]}`` with the score
        clamped to 0..100. If the model output cannot be parsed, the score is
        0, ``topics`` is empty and ``reason`` states the parsing failure.

    Raises:
        RuntimeError: Raised by ``_openai_chat``; any error from that call
            propagates unchanged.
    """
    title = (article.get("title") or "").strip()
    text = _sanitize_source_text(article.get("content_raw") or "")
    if not text:
        text = (article.get("summary") or "").strip()

    prompt = (
        "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
        "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
        "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
        "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
        "Antworte NUR mit einem JSON-Objekt:\n"
        '{"score": <0-100>, "reason": "<kurze Begründung auf Deutsch>", "topics": ["<Thema1>", "<Thema2>"]}\n\n'
        f"Titel: {title}\n\n"
        f"Text (Auszug):\n{text[:2000]}"
    )
    raw = _openai_chat(
        "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
        prompt,
        temperature=0.1,
    )

    fallback: dict[str, Any] = {
        "score": 0,
        "reason": "Parsing-Fehler bei Relevanz-Score",
        "topics": [],
    }

    # The model may wrap the object in prose or a code fence; take the
    # outermost {...} span.
    match = re.search(r"\{[\s\S]*\}", raw)
    if not match:
        return fallback
    try:
        parsed = json.loads(match.group(0))
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        return fallback
    if not isinstance(parsed, dict):
        return fallback

    raw_score = parsed.get("score", 0)
    try:
        if isinstance(raw_score, str):
            # Accept numeric strings, including decimals such as "85.5".
            raw_score = float(raw_score)
        score = max(0, min(100, int(raw_score)))
    except (TypeError, ValueError, OverflowError):
        return fallback

    raw_topics = parsed.get("topics")
    if isinstance(raw_topics, str):
        # A bare string must stay one topic, not be iterated per character.
        raw_topics = [raw_topics]
    elif not isinstance(raw_topics, list):
        raw_topics = []

    return {
        "score": score,
        # ``or ""`` keeps a JSON null from turning into the text "None".
        "reason": str(parsed.get("reason") or ""),
        "topics": [str(topic) for topic in raw_topics if topic is not None],
    }
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Store normalized tags under ``generated_tags`` in a JSON metadata blob.

    Args:
        meta_json: Existing metadata as a JSON string, or ``None``/empty.
            Anything that fails to decode, or decodes to something other than
            a JSON object, is treated as empty metadata.
        tags: Tags to normalize (via ``_normalize_tags``) and store.

    Returns:
        The metadata serialized as JSON with non-ASCII characters kept as-is.
    """
    merged: dict[str, Any] = {}
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            # Best effort: unreadable metadata is replaced, not propagated.
            decoded = None
        if isinstance(decoded, dict):
            merged = decoded

    merged["generated_tags"] = _normalize_tags(tags)
    return json.dumps(merged, ensure_ascii=False)