Rewrites must not use 'wir haben erforscht/berechnet' since the content comes from a third-party source. The prompt now passes the source name and instructs GPT to attribute all claims to the original publisher (e.g. 'laut PiNCAMP', 'die Auswertung zeigt'). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
204 lines
7.3 KiB
Python
204 lines
7.3 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from typing import Any
|
||
from urllib.request import Request, urlopen
|
||
|
||
from .config import get_settings
|
||
|
||
|
||
def _sanitize_source_text(text: str) -> str:
|
||
raw = (text or "").strip()
|
||
if not raw:
|
||
return ""
|
||
|
||
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
|
||
if len(lines) > 3:
|
||
lines = lines[3:]
|
||
|
||
joined = "\n".join(lines)
|
||
# Remove press contact block at end from "Pressekontakt" onward.
|
||
joined = re.sub(
|
||
r"\n?\s*Pressekontakt[\s\S]*$",
|
||
"",
|
||
joined,
|
||
flags=re.IGNORECASE,
|
||
).strip()
|
||
return joined
|
||
|
||
|
||
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
|
||
out: list[str] = []
|
||
seen: set[str] = set()
|
||
for raw in tags:
|
||
value = re.sub(r"\s+", " ", str(raw or "").strip())
|
||
value = re.sub(r"^[#\-•\s]+", "", value)
|
||
value = re.sub(r"[;,.:\s]+$", "", value)
|
||
if not value:
|
||
continue
|
||
if len(value) < 2 or len(value) > 40:
|
||
continue
|
||
key = value.casefold()
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
out.append(value)
|
||
if len(out) >= max_tags:
|
||
break
|
||
return out
|
||
|
||
|
||
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
|
||
settings = get_settings()
|
||
api_key = settings.openai_api_key
|
||
if not api_key:
|
||
raise RuntimeError("OPENAI_API_KEY fehlt")
|
||
|
||
payload = {
|
||
"model": settings.openai_model,
|
||
"temperature": temperature,
|
||
"messages": [
|
||
{"role": "system", "content": system},
|
||
{"role": "user", "content": user},
|
||
],
|
||
}
|
||
req = Request(
|
||
url="https://api.openai.com/v1/chat/completions",
|
||
method="POST",
|
||
data=json.dumps(payload).encode("utf-8"),
|
||
headers={
|
||
"Authorization": f"Bearer {api_key}",
|
||
"Content-Type": "application/json",
|
||
"Accept": "application/json",
|
||
},
|
||
)
|
||
with urlopen(req, timeout=60) as resp:
|
||
raw = resp.read().decode("utf-8", errors="replace")
|
||
data = json.loads(raw)
|
||
choices = data.get("choices")
|
||
if not isinstance(choices, list) or not choices:
|
||
raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")
|
||
message = choices[0].get("message", {})
|
||
content = message.get("content")
|
||
if not isinstance(content, str) or not content.strip():
|
||
raise RuntimeError("OpenAI lieferte keinen Inhalt")
|
||
return content.strip()
|
||
|
||
|
||
def rewrite_article_text(article: dict[str, Any]) -> str:
|
||
source_text = _sanitize_source_text(article.get("content_raw") or "")
|
||
if not source_text:
|
||
source_text = (article.get("summary") or "").strip()
|
||
if not source_text:
|
||
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
|
||
|
||
title = (article.get("title") or "").strip()
|
||
source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip()
|
||
prompt = (
|
||
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
|
||
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
|
||
"ohne Pressekontakt, ohne Quellenblock. "
|
||
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
|
||
"Inhaltlich korrekt bleiben, nichts erfinden. "
|
||
f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. "
|
||
"Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, "
|
||
f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n"
|
||
f"Titel: {title}\n\n"
|
||
f"Originaltext:\n{source_text}"
|
||
)
|
||
return _openai_chat(
|
||
"Du bist ein deutscher News-Redakteur.",
|
||
prompt,
|
||
temperature=0.4,
|
||
)
|
||
|
||
|
||
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
|
||
source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
|
||
source_text = str(source_text).strip()
|
||
if not source_text:
|
||
return []
|
||
title = (article.get("title") or "").strip()
|
||
prompt = (
|
||
"Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
|
||
f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
|
||
"Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
|
||
f"Titel: {title}\n\n"
|
||
f"Text:\n{source_text[:3500]}"
|
||
)
|
||
raw = _openai_chat(
|
||
"Du extrahierst präzise, kurze News-Tags auf Deutsch.",
|
||
prompt,
|
||
temperature=0.2,
|
||
)
|
||
try:
|
||
parsed = json.loads(raw)
|
||
if isinstance(parsed, list):
|
||
return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
|
||
except Exception:
|
||
pass
|
||
# fallback: extract first JSON-like array if model wrapped output
|
||
match = re.search(r"\[[\s\S]*\]", raw)
|
||
if match:
|
||
try:
|
||
parsed = json.loads(match.group(0))
|
||
if isinstance(parsed, list):
|
||
return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
|
||
except Exception:
|
||
return []
|
||
return []
|
||
|
||
|
||
def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]:
|
||
"""Score article relevance for VanLife/Camping/Outdoor blog (0-100).
|
||
|
||
Returns {"score": int, "reason": str, "topics": list[str]}.
|
||
Raises RuntimeError on OpenAI failure.
|
||
"""
|
||
title = (article.get("title") or "").strip()
|
||
text = _sanitize_source_text(article.get("content_raw") or "")
|
||
if not text:
|
||
text = (article.get("summary") or "").strip()
|
||
|
||
prompt = (
|
||
"Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
|
||
"Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
|
||
"Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
|
||
"Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
|
||
"Antworte NUR mit einem JSON-Objekt:\n"
|
||
'{"score": <0-100>, "reason": "<kurze Begründung auf Deutsch>", "topics": ["<Thema1>", "<Thema2>"]}\n\n'
|
||
f"Titel: {title}\n\n"
|
||
f"Text (Auszug):\n{text[:2000]}"
|
||
)
|
||
raw = _openai_chat(
|
||
"Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
|
||
prompt,
|
||
temperature=0.1,
|
||
)
|
||
try:
|
||
match = re.search(r"\{[\s\S]*\}", raw)
|
||
if match:
|
||
parsed = json.loads(match.group(0))
|
||
score = max(0, min(100, int(parsed.get("score", 0))))
|
||
return {
|
||
"score": score,
|
||
"reason": str(parsed.get("reason", "")),
|
||
"topics": [str(t) for t in (parsed.get("topics") or [])],
|
||
}
|
||
except Exception:
|
||
pass
|
||
return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []}
|
||
|
||
|
||
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
|
||
meta: dict[str, Any] = {}
|
||
if meta_json:
|
||
try:
|
||
parsed = json.loads(meta_json)
|
||
if isinstance(parsed, dict):
|
||
meta = parsed
|
||
except Exception:
|
||
meta = {}
|
||
meta["generated_tags"] = _normalize_tags(tags)
|
||
return json.dumps(meta, ensure_ascii=False)
|