feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection

This commit is contained in:
Oliver 2026-02-21 14:39:47 +01:00
parent da269d08f1
commit b0f995d5c9
No known key found for this signature in database
10 changed files with 374 additions and 36 deletions

View file

@ -20,7 +20,7 @@ from .ingestion import run_ingestion
from .policy import evaluate_source_policy
from .publisher import enqueue_publish, run_publisher
from .relevance import article_age_days, article_relevance
from .rewrite import rewrite_article_text
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
from .repositories import (
FeedCreate,
FeedUpdate,
@ -373,6 +373,7 @@ def _upsert_article_from_existing(
publish_attempts: int | object = _UNSET,
publish_last_error: str | None | object = _UNSET,
published_to_wp_at: str | None | object = _UNSET,
meta_json: str | None | object = _UNSET,
) -> None:
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
upsert_article(
@ -403,7 +404,7 @@ def _upsert_article_from_existing(
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
word_count=len(str(rewritten or "").split()),
status=article.get("status") if status is None else status,
meta_json=article.get("meta_json"),
meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
)
)
@ -493,6 +494,8 @@ def admin_dashboard(request: Request):
article["days_old"] = article_age_days(article.get("published_at"))
article["relevance"] = article_relevance(article.get("published_at"))
article["status_ui"] = internal_to_ui_status(article.get("status"))
tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
article["generated_tags"] = [str(t) for t in tags if t]
return templates.TemplateResponse(
request,
@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int):
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
try:
rewritten = rewrite_article_text(article)
tags = generate_article_tags(article, rewritten_text=rewritten)
except Exception as exc:
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved")
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
@router.post("/admin/rewrite/run")
def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
    """Rewrite a batch of articles and move each successful one to ``approved``.

    Args:
        request: Incoming admin request; used to resolve the logged-in user.
        max_jobs: Form field with the maximum number of articles to process.
            It is parsed as an integer and clamped to 1..100; anything that
            cannot be parsed falls back to 10.

    Returns:
        A redirect to the login page when no admin user is resolved, otherwise
        a dashboard redirect whose message reports the processed / success /
        failed counts.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import logging

    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        limit = max(1, min(int(max_jobs), 100))
    except (TypeError, ValueError):
        limit = 10

    log = logging.getLogger(__name__)
    # Only articles requested with status_filter="rewrite" are picked up here.
    planned = list_articles(limit=limit, status_filter="rewrite")
    success = 0
    failed = 0
    for article in planned:
        try:
            rewritten = rewrite_article_text(article)
            tags = generate_article_tags(article, rewritten_text=rewritten)
            merged_meta = merge_generated_tags(article.get("meta_json"), tags)
            _upsert_article_from_existing(
                article,
                content_rewritten=rewritten,
                status="approved",
                meta_json=merged_meta,
            )
        except Exception:
            # One failing article must not abort the batch, but the reason has
            # to be visible somewhere: the redirect message only carries counts.
            failed += 1
            article_ref = article.get("id") if isinstance(article, dict) else None
            log.exception("Batch rewrite failed for article id=%s", article_ref)
        else:
            success += 1

    processed = success + failed
    return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}")
@router.post("/admin/articles/{article_id}/rewrite-save")
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
user = _admin_user(request)

View file

@ -18,7 +18,7 @@ from .ingestion import run_ingestion
from .policy import evaluate_source_policy, is_source_allowed
from .publisher import enqueue_publish, run_publisher
from .relevance import article_age_days, article_relevance
from .rewrite import rewrite_article_text
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
from .repositories import (
ArticleUpsert,
FeedCreate,
@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
rewritten = rewrite_article_text(article)
tags: list[str] = []
try:
tags = generate_article_tags(article, rewritten_text=rewritten)
except Exception:
tags = []
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
# upsert via status update + existing fields by lightweight path:
repo_upsert_article(
ArticleUpsert(
@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
published_to_wp_at=article.get("published_to_wp_at"),
word_count=len(rewritten.split()),
status="approved",
meta_json=article.get("meta_json"),
meta_json=merged_meta,
)
)
return {"ok": True, "id": article_id, "status": "publish"}
return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
@app.post("/api/articles/{article_id}/legal-review")

View file

@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str:
return joined
def rewrite_article_text(article: dict[str, Any]) -> str:
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
out: list[str] = []
seen: set[str] = set()
for raw in tags:
value = re.sub(r"\s+", " ", str(raw or "").strip())
value = re.sub(r"^[#\-•\s]+", "", value)
value = re.sub(r"[;,.:\s]+$", "", value)
if not value:
continue
if len(value) < 2 or len(value) > 40:
continue
key = value.casefold()
if key in seen:
continue
seen.add(key)
out.append(value)
if len(out) >= max_tags:
break
return out
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
settings = get_settings()
api_key = settings.openai_api_key
if not api_key:
raise RuntimeError("OPENAI_API_KEY fehlt")
source_text = _sanitize_source_text(article.get("content_raw") or "")
if not source_text:
source_text = (article.get("summary") or "").strip()
if not source_text:
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
title = (article.get("title") or "").strip()
prompt = (
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
"ohne Pressekontakt, ohne Quellenblock. "
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
"Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
f"Titel: {title}\n\n"
f"Originaltext:\n{source_text}"
)
payload = {
"model": settings.openai_model,
"temperature": 0.4,
"temperature": temperature,
"messages": [
{"role": "system", "content": "Du bist ein deutscher News-Redakteur."},
{"role": "user", "content": prompt},
{"role": "system", "content": system},
{"role": "user", "content": user},
],
}
req = Request(
@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str:
message = choices[0].get("message", {})
content = message.get("content")
if not isinstance(content, str) or not content.strip():
raise RuntimeError("OpenAI lieferte keinen Rewrite-Text")
raise RuntimeError("OpenAI lieferte keinen Inhalt")
return content.strip()
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the chat model for a German rewrite of an article.

    The sanitized ``content_raw`` is used as source text; when that is empty
    the stripped ``summary`` is used instead.

    Args:
        article: Article record; the keys ``content_raw``, ``summary`` and
            ``title`` are read.

    Returns:
        Whatever ``_openai_chat`` returns for the rewrite prompt.

    Raises:
        RuntimeError: If neither ``content_raw`` nor ``summary`` yields text.
    """
    body = _sanitize_source_text(article.get("content_raw") or "")
    if not body:
        body = (article.get("summary") or "").strip()
        if not body:
            raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()

    # Fixed editorial instructions; the article-specific parts follow below.
    instructions = "".join(
        [
            "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. ",
            "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, ",
            "ohne Pressekontakt, ohne Quellenblock. ",
            "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). ",
            "Inhaltlich korrekt bleiben, nichts erfinden.\n\n",
        ]
    )
    user_message = instructions + f"Titel: {headline}\n\n" + f"Originaltext:\n{body}"

    return _openai_chat(
        "Du bist ein deutscher News-Redakteur.",
        user_message,
        temperature=0.4,
    )
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Ask the chat model for tags describing an article.

    The text sent to the model is, in order of preference: ``rewritten_text``,
    the sanitized ``content_raw``, or ``summary``. Only the first 3500
    characters are included in the prompt.

    Args:
        article: Article record; ``content_raw``, ``summary`` and ``title``
            are read.
        rewritten_text: Already rewritten article text, if available.
        max_tags: Maximum number of tags to request and return.

    Returns:
        Normalized tags, or an empty list when there is no source text,
        ``max_tags`` is not positive, or the reply contains no JSON array.
    """
    # Nothing can be returned for a non-positive cap, so skip the API call.
    if max_tags <= 0:
        return []

    source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
    source_text = str(source_text).strip()
    if not source_text:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{source_text[:3500]}"
    )
    raw = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    # Try the whole reply first; if the model wrapped the array in other text
    # (code fence, explanation, JSON object), fall back to the outermost
    # "[...]" span.
    candidates = [raw]
    wrapped = re.search(r"\[[\s\S]*\]", raw)
    if wrapped:
        candidates.append(wrapped.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if not isinstance(parsed, list):
            continue
        # Keep scalar text/number entries only. Stringifying everything would
        # turn JSON null into the literal tag "None" and nested objects into
        # their repr.
        items = [
            str(item)
            for item in parsed
            if isinstance(item, (str, int, float)) and not isinstance(item, bool)
        ]
        return _normalize_tags(items, max_tags=max_tags)
    return []
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Return ``meta_json`` with its ``generated_tags`` entry replaced.

    Args:
        meta_json: Existing metadata as a JSON string. Empty, unparsable or
            non-object input is treated as an empty object.
        tags: Tags to store; they are passed through ``_normalize_tags``.

    Returns:
        The updated metadata serialized as JSON with non-ASCII characters
        kept as-is.
    """
    decoded: Any = None
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None

    # Anything other than a JSON object is discarded and replaced by {}.
    meta: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    meta["generated_tags"] = _normalize_tags(tags)
    return json.dumps(meta, ensure_ascii=False)

View file

@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None:
paragraphs = []
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
text = _clean_text(match.group(1))
if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE):
if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE):
paragraphs.append(text)
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None:
return None
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
for idx, line in enumerate(lines):
if marker_re.search(line):
chunk = [line]
for nxt in lines[idx + 1 : idx + 6]:
if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE):
if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE):
break
chunk.append(nxt)
return _clean_text("\n".join(chunk))
match = re.search(
r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
content_text,
re.IGNORECASE,
)

View file

@ -7,7 +7,7 @@ import mimetypes
from pathlib import Path
import re
from typing import Any
from urllib.parse import urlparse
from urllib.parse import quote_plus, urlparse
from urllib.request import Request, urlopen
from .config import get_settings
@ -25,7 +25,7 @@ def _wp_request(
method: str,
endpoint: str,
payload: dict[str, Any] | None = None,
) -> dict[str, Any]:
) -> Any:
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
data = json.dumps(payload).encode("utf-8") if payload is not None else None
req = Request(
@ -41,8 +41,7 @@ def _wp_request(
)
with urlopen(req, timeout=20) as resp:
raw = resp.read().decode("utf-8", errors="replace")
parsed = json.loads(raw) if raw else {}
return parsed if isinstance(parsed, dict) else {}
return json.loads(raw) if raw else {}
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
return selected if isinstance(selected, str) and selected.strip() else None
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
if not meta_json:
return []
try:
meta = json.loads(meta_json)
except Exception:
return []
if not isinstance(meta, dict):
return []
raw_tags = meta.get("generated_tags")
if not isinstance(raw_tags, list):
return []
tags: list[str] = []
seen: set[str] = set()
for item in raw_tags:
value = str(item or "").strip()
if not value:
continue
key = value.casefold()
if key in seen:
continue
seen.add(key)
tags.append(value)
if len(tags) >= 12:
break
return tags
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
ids: list[int] = []
seen: set[int] = set()
for tag in tags:
name = tag.strip()
if not name:
continue
try:
endpoint = f"tags?search={quote_plus(name)}&per_page=20"
result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint)
tag_id: int | None = None
if isinstance(result, list):
for row in result:
if not isinstance(row, dict):
continue
row_name = str(row.get("name") or "")
rid = int(row.get("id", 0) or 0)
if rid <= 0:
continue
if row_name.casefold() == name.casefold():
tag_id = rid
break
if tag_id is None:
for row in result:
if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0:
tag_id = int(row.get("id", 0))
break
if tag_id is None:
created = _wp_request(
base_url=base_url,
auth_header=auth_header,
method="POST",
endpoint="tags",
payload={"name": name},
)
if isinstance(created, dict):
rid = int(created.get("id", 0) or 0)
if rid > 0:
tag_id = rid
if tag_id is not None and tag_id > 0 and tag_id not in seen:
seen.add(tag_id)
ids.append(tag_id)
except Exception:
continue
return ids
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
headers = {
"User-Agent": "rss-news-publisher/1.0",
@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
payload["featured_media"] = featured_media_id
wp_post_id = article.get("wp_post_id")
tag_ids = _resolve_wp_tag_ids(
base_url=settings.wordpress_base_url,
auth_header=auth,
tags=_selected_tags_from_meta(article.get("meta_json")),
)
if tag_ids:
payload["tags"] = tag_ids
if wp_post_id:
result = _wp_request(
base_url=settings.wordpress_base_url,
@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
payload=payload,
)
if not isinstance(result, dict):
raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
post_id = int(result.get("id", 0))
if post_id <= 0:
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")