feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection

This commit is contained in:
Oliver 2026-02-21 14:39:47 +01:00
parent da269d08f1
commit b0f995d5c9
No known key found for this signature in database
10 changed files with 374 additions and 36 deletions

View file

@ -20,7 +20,7 @@ from .ingestion import run_ingestion
from .policy import evaluate_source_policy
from .publisher import enqueue_publish, run_publisher
from .relevance import article_age_days, article_relevance
from .rewrite import rewrite_article_text
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
from .repositories import (
FeedCreate,
FeedUpdate,
@ -373,6 +373,7 @@ def _upsert_article_from_existing(
publish_attempts: int | object = _UNSET,
publish_last_error: str | None | object = _UNSET,
published_to_wp_at: str | None | object = _UNSET,
meta_json: str | None | object = _UNSET,
) -> None:
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
upsert_article(
@ -403,7 +404,7 @@ def _upsert_article_from_existing(
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
word_count=len(str(rewritten or "").split()),
status=article.get("status") if status is None else status,
meta_json=article.get("meta_json"),
meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
)
)
@ -493,6 +494,8 @@ def admin_dashboard(request: Request):
article["days_old"] = article_age_days(article.get("published_at"))
article["relevance"] = article_relevance(article.get("published_at"))
article["status_ui"] = internal_to_ui_status(article.get("status"))
tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
article["generated_tags"] = [str(t) for t in tags if t]
return templates.TemplateResponse(
request,
@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int):
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
try:
rewritten = rewrite_article_text(article)
tags = generate_article_tags(article, rewritten_text=rewritten)
except Exception as exc:
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved")
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
@router.post("/admin/rewrite/run")
def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
    """Rewrite a batch of articles that are queued in status ``rewrite``.

    For each planned article the text is rewritten, tags are generated and
    merged into the article's ``meta_json``, and the article is stored with
    status ``approved``. A failure on one article does not abort the batch;
    it is counted and logged so the cause can be diagnosed later.

    Args:
        request: Incoming request, used to resolve the admin session.
        max_jobs: Form value with the maximum number of articles to process.
            Clamped to the range 1..100; non-numeric input falls back to 10.

    Returns:
        A redirect to the login page when no admin user is present, otherwise
        a dashboard redirect carrying the processed/success/failed counters.
    """
    # Function-scope import keeps this block independent of the module header.
    import logging

    log = logging.getLogger(__name__)

    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        limit = max(1, min(int(max_jobs), 100))
    except (TypeError, ValueError):
        # Only conversion errors are expected here; anything else should surface.
        limit = 10

    planned = list_articles(limit=limit, status_filter="rewrite")
    processed = 0
    success = 0
    failed = 0
    for article in planned:
        processed += 1
        try:
            rewritten = rewrite_article_text(article)
            tags = generate_article_tags(article, rewritten_text=rewritten)
            merged_meta = merge_generated_tags(article.get("meta_json"), tags)
            _upsert_article_from_existing(
                article,
                content_rewritten=rewritten,
                status="approved",
                meta_json=merged_meta,
            )
            success += 1
        except Exception:
            # Best-effort batch: keep going, but leave a trace instead of
            # silently dropping the error.
            failed += 1
            article_ref = article.get("id") if isinstance(article, dict) else None
            log.exception("Batch rewrite failed for article id=%s", article_ref)
    return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}")
@router.post("/admin/articles/{article_id}/rewrite-save")
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
user = _admin_user(request)

View file

@ -18,7 +18,7 @@ from .ingestion import run_ingestion
from .policy import evaluate_source_policy, is_source_allowed
from .publisher import enqueue_publish, run_publisher
from .relevance import article_age_days, article_relevance
from .rewrite import rewrite_article_text
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
from .repositories import (
ArticleUpsert,
FeedCreate,
@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
rewritten = rewrite_article_text(article)
tags: list[str] = []
try:
tags = generate_article_tags(article, rewritten_text=rewritten)
except Exception:
tags = []
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
# upsert via status update + existing fields by lightweight path:
repo_upsert_article(
ArticleUpsert(
@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
published_to_wp_at=article.get("published_to_wp_at"),
word_count=len(rewritten.split()),
status="approved",
meta_json=article.get("meta_json"),
meta_json=merged_meta,
)
)
return {"ok": True, "id": article_id, "status": "publish"}
return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
@app.post("/api/articles/{article_id}/legal-review")

View file

@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str:
return joined
def rewrite_article_text(article: dict[str, Any]) -> str:
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
out: list[str] = []
seen: set[str] = set()
for raw in tags:
value = re.sub(r"\s+", " ", str(raw or "").strip())
value = re.sub(r"^[#\-•\s]+", "", value)
value = re.sub(r"[;,.:\s]+$", "", value)
if not value:
continue
if len(value) < 2 or len(value) > 40:
continue
key = value.casefold()
if key in seen:
continue
seen.add(key)
out.append(value)
if len(out) >= max_tags:
break
return out
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
settings = get_settings()
api_key = settings.openai_api_key
if not api_key:
raise RuntimeError("OPENAI_API_KEY fehlt")
source_text = _sanitize_source_text(article.get("content_raw") or "")
if not source_text:
source_text = (article.get("summary") or "").strip()
if not source_text:
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
title = (article.get("title") or "").strip()
prompt = (
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
"ohne Pressekontakt, ohne Quellenblock. "
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
"Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
f"Titel: {title}\n\n"
f"Originaltext:\n{source_text}"
)
payload = {
"model": settings.openai_model,
"temperature": 0.4,
"temperature": temperature,
"messages": [
{"role": "system", "content": "Du bist ein deutscher News-Redakteur."},
{"role": "user", "content": prompt},
{"role": "system", "content": system},
{"role": "user", "content": user},
],
}
req = Request(
@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str:
message = choices[0].get("message", {})
content = message.get("content")
if not isinstance(content, str) or not content.strip():
raise RuntimeError("OpenAI lieferte keinen Rewrite-Text")
raise RuntimeError("OpenAI lieferte keinen Inhalt")
return content.strip()
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the chat model for a German rewrite of the article.

    The sanitized ``content_raw`` is used as source text; when that is empty
    the stripped ``summary`` is used instead.

    Raises:
        RuntimeError: If neither field yields any text.
    """
    original = _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "").strip()
    if not original:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()
    instructions = "".join(
        [
            "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. ",
            "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, ",
            "ohne Pressekontakt, ohne Quellenblock. ",
            "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). ",
            "Inhaltlich korrekt bleiben, nichts erfinden.\n\n",
        ]
    )
    user_message = f"{instructions}Titel: {headline}\n\nOriginaltext:\n{original}"
    return _openai_chat("Du bist ein deutscher News-Redakteur.", user_message, temperature=0.4)
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Ask the chat model for tags and return them normalized.

    The text sent to the model is the first non-empty of: ``rewritten_text``,
    the sanitized ``content_raw``, the ``summary``. Only the first 3500
    characters are included in the prompt. The reply is expected to be a JSON
    array of strings; if the reply as a whole is not such an array, the span
    from the first ``[`` to the last ``]`` is tried as a fallback.

    Returns:
        The normalized tags, or an empty list when there is no source text or
        no JSON array can be parsed from the reply.
    """

    def _tags_from(candidate: str) -> list[str] | None:
        # None means "not a usable JSON array"; an empty list is a valid result.
        try:
            decoded = json.loads(candidate)
            if isinstance(decoded, list):
                return _normalize_tags([str(item) for item in decoded], max_tags=max_tags)
        except Exception:
            return None
        return None

    text = rewritten_text
    if not text:
        text = _sanitize_source_text(article.get("content_raw") or "")
    if not text:
        text = article.get("summary") or ""
    text = str(text).strip()
    if not text:
        return []

    headline = (article.get("title") or "").strip()
    user_message = "".join(
        [
            "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. ",
            f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. ",
            "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n",
            f"Titel: {headline}\n\n",
            f"Text:\n{text[:3500]}",
        ]
    )
    reply = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        user_message,
        temperature=0.2,
    )

    direct = _tags_from(reply)
    if direct is not None:
        return direct

    # Fallback for replies that wrap the array in extra text or code fences.
    bracketed = re.search(r"\[[\s\S]*\]", reply)
    if bracketed is None:
        return []
    wrapped = _tags_from(bracketed.group(0))
    return wrapped if wrapped is not None else []
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Return ``meta_json`` as a JSON string with ``generated_tags`` replaced.

    An empty, unparsable or non-object ``meta_json`` is treated as an empty
    object. The tags are normalized before being stored, and the result is
    serialized without ASCII escaping.
    """
    decoded: Any = None
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None
    merged: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    merged["generated_tags"] = _normalize_tags(tags)
    return json.dumps(merged, ensure_ascii=False)

View file

@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None:
paragraphs = []
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
text = _clean_text(match.group(1))
if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE):
if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE):
paragraphs.append(text)
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None:
return None
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
for idx, line in enumerate(lines):
if marker_re.search(line):
chunk = [line]
for nxt in lines[idx + 1 : idx + 6]:
if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE):
if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE):
break
chunk.append(nxt)
return _clean_text("\n".join(chunk))
match = re.search(
r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
content_text,
re.IGNORECASE,
)

View file

@ -7,7 +7,7 @@ import mimetypes
from pathlib import Path
import re
from typing import Any
from urllib.parse import urlparse
from urllib.parse import quote_plus, urlparse
from urllib.request import Request, urlopen
from .config import get_settings
@ -25,7 +25,7 @@ def _wp_request(
method: str,
endpoint: str,
payload: dict[str, Any] | None = None,
) -> dict[str, Any]:
) -> Any:
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
data = json.dumps(payload).encode("utf-8") if payload is not None else None
req = Request(
@ -41,8 +41,7 @@ def _wp_request(
)
with urlopen(req, timeout=20) as resp:
raw = resp.read().decode("utf-8", errors="replace")
parsed = json.loads(raw) if raw else {}
return parsed if isinstance(parsed, dict) else {}
return json.loads(raw) if raw else {}
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
return selected if isinstance(selected, str) and selected.strip() else None
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
if not meta_json:
return []
try:
meta = json.loads(meta_json)
except Exception:
return []
if not isinstance(meta, dict):
return []
raw_tags = meta.get("generated_tags")
if not isinstance(raw_tags, list):
return []
tags: list[str] = []
seen: set[str] = set()
for item in raw_tags:
value = str(item or "").strip()
if not value:
continue
key = value.casefold()
if key in seen:
continue
seen.add(key)
tags.append(value)
if len(tags) >= 12:
break
return tags
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
    """Map tag names to WordPress tag ids, creating tags that do not exist.

    For every non-empty name the ``tags`` endpoint is searched. A result whose
    name matches case-insensitively wins; otherwise the first result with a
    positive id is used; otherwise the tag is created via POST. Any error for
    a single tag skips that tag without affecting the others.

    Returns:
        Unique positive tag ids in the order they were resolved.
    """
    # NOTE(review): the "first search hit" fallback can bind a name to a
    # merely similar tag - confirm this is intended.
    resolved: list[int] = []
    known: set[int] = set()
    for raw_tag in tags:
        name = raw_tag.strip()
        if not name:
            continue
        wanted = name.casefold()
        try:
            found = _wp_request(
                base_url=base_url,
                auth_header=auth_header,
                method="GET",
                endpoint=f"tags?search={quote_plus(name)}&per_page=20",
            )
            exact_id: int | None = None
            first_id: int | None = None
            if isinstance(found, list):
                for entry in found:
                    if not isinstance(entry, dict):
                        continue
                    entry_name = str(entry.get("name") or "")
                    entry_id = int(entry.get("id", 0) or 0)
                    if entry_id <= 0:
                        continue
                    if first_id is None:
                        # Remember the first usable hit as the fallback.
                        first_id = entry_id
                    if entry_name.casefold() == wanted:
                        exact_id = entry_id
                        break
            tag_id = exact_id if exact_id is not None else first_id

            if tag_id is None:
                created = _wp_request(
                    base_url=base_url,
                    auth_header=auth_header,
                    method="POST",
                    endpoint="tags",
                    payload={"name": name},
                )
                if isinstance(created, dict):
                    new_id = int(created.get("id", 0) or 0)
                    if new_id > 0:
                        tag_id = new_id

            if tag_id is not None and tag_id > 0 and tag_id not in known:
                known.add(tag_id)
                resolved.append(tag_id)
        except Exception:
            continue
    return resolved
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
headers = {
"User-Agent": "rss-news-publisher/1.0",
@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
payload["featured_media"] = featured_media_id
wp_post_id = article.get("wp_post_id")
tag_ids = _resolve_wp_tag_ids(
base_url=settings.wordpress_base_url,
auth_header=auth,
tags=_selected_tags_from_meta(article.get("meta_json")),
)
if tag_ids:
payload["tags"] = tag_ids
if wp_post_id:
result = _wp_request(
base_url=settings.wordpress_base_url,
@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
payload=payload,
)
if not isinstance(result, dict):
raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
post_id = int(result.get("id", 0))
if post_id <= 0:
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")

View file

@ -170,6 +170,9 @@
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
<button type="submit">Rewrite-Text speichern</button>
</form>
{% if article.meta.generated_tags %}
<p><strong>Generierte Tags:</strong> {{ article.meta.generated_tags|join("; ") }}</p>
{% endif %}
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
</section>

View file

@ -102,6 +102,15 @@
</form>
</section>
<section class="card">
<h2>Rewrite Run (geplante Artikel)</h2>
<p class="subtle">Verarbeitet alle Artikel im Status <code>rewrite</code> und setzt sie auf <code>publish</code>.</p>
<form method="post" action="/admin/rewrite/run" class="row">
<input name="max_jobs" value="10" />
<button type="submit">Rewrite Run starten</button>
</form>
</section>
<section class="card">
<h2>Quellen + Policy</h2>
<table>
@ -269,6 +278,9 @@
{% if a.summary %}
<div><strong>Summary:</strong> {{ a.summary }}</div>
{% endif %}
{% if a.generated_tags %}
<div><strong>Tags:</strong> {{ a.generated_tags|join("; ") }}</div>
{% endif %}
{% if a.content_raw %}
<details>
<summary>Volltext anzeigen</summary>

View file

@ -271,6 +271,71 @@ class TestAdminUi(unittest.TestCase):
self.assertIn("Neu", article.get("content_rewritten") or "")
self.assertIsNone(article.get("wp_post_id"))
@patch("backend.app.admin_ui.generate_article_tags")
@patch("backend.app.admin_ui.rewrite_article_text")
def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None:
    """The batch endpoint rewrites a queued article and stores generated tags."""
    mock_rewrite_text.return_value = "<h2>Neu</h2><p>Text</p>"
    mock_tags.return_value = ["Rheingas", "Monheim"]

    # Fixture chain: source -> feed -> one article waiting in status "rewrite".
    source = SourceCreate(
        name="Batch Source",
        base_url="https://example.org",
        terms_url="https://example.org/terms",
        license_name="cc-by",
        risk_level="green",
        is_enabled=True,
        notes=None,
        last_reviewed_at=None,
    )
    source_id = create_source(source)
    feed = FeedCreate(
        name="Batch Feed",
        url="https://example.org/feed.xml",
        source_id=source_id,
        is_enabled=True,
    )
    feed_id = create_feed(feed)
    queued = ArticleUpsert(
        feed_id=feed_id,
        source_article_id="batch-1",
        source_hash="batch-hash-1",
        title="Batch Titel",
        source_url="https://example.org/batch",
        canonical_url="https://example.org/batch",
        published_at=None,
        author="Autor",
        summary="Summary",
        content_raw="Raw",
        content_rewritten=None,
        image_urls_json=None,
        press_contact=None,
        source_name_snapshot="Batch Source",
        source_terms_url_snapshot="https://example.org/terms",
        source_license_name_snapshot="cc-by",
        legal_checked=False,
        legal_checked_at=None,
        legal_note=None,
        wp_post_id=None,
        wp_post_url=None,
        publish_attempts=0,
        publish_last_error=None,
        published_to_wp_at=None,
        word_count=1,
        status="rewrite",
        meta_json="{}",
    )
    article_id = upsert_article(queued)

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)
    response = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False)
    self.assertEqual(response.status_code, 303)

    stored = get_article_by_id(article_id)
    self.assertIsNotNone(stored)
    self.assertEqual(stored.get("status"), "approved")
    self.assertIn("generated_tags", stored.get("meta_json", ""))
@patch("backend.app.admin_ui.urlopen")
def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
class _FakeHeaders:

View file

@ -26,6 +26,25 @@ SAMPLE_HTML = """
</html>
"""
SAMPLE_HTML_AGENTUR = """
<!doctype html>
<html lang="de">
<head>
<meta charset="utf-8" />
<meta property="og:title" content="Demo Meldung Agentur" />
</head>
<body>
<article>
<p>Inhalt der Meldung.</p>
<h3>Agentur</h3>
<p>Agenturname GmbH</p>
<p>presse@agentur.example</p>
<p>Original-Content von Beispiel</p>
</article>
</body>
</html>
"""
class _FakeHeaders:
@staticmethod
@ -64,6 +83,14 @@ class TestSourceExtraction(unittest.TestCase):
self.assertIn("Pressekontakt", extracted.press_contact or "")
self.assertIsNone(extracted.extraction_error)
@patch("backend.app.source_extraction.urlopen")
def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None:
    """An "Agentur" heading and the lines after it are captured as press contact."""
    mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR)
    result = extract_article("https://www.presseportal.de/pm/155103/6210401")
    contact = result.press_contact or ""
    for expected in ("Agentur", "Agenturname", "presse@agentur.example"):
        self.assertIn(expected, contact)
if __name__ == "__main__":
unittest.main()

View file

@ -80,6 +80,40 @@ class TestWordpressPublish(unittest.TestCase):
self.assertNotIn("Pressekontakt", content)
self.assertIn("eigentliche Text", content)
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None:
    """Publishing looks up an existing tag, creates a missing one, and sends both ids."""

    def _fake_wp_request(**kwargs):
        # Minimal stand-in for the WordPress REST API used by the publisher.
        path = kwargs.get("endpoint", "")
        verb = kwargs.get("method", "")
        if verb == "GET" and path.startswith("tags?search="):
            return [{"id": 11, "name": "Rheingas"}] if "Rheingas" in path else []
        if verb == "POST" and path == "tags":
            requested = (kwargs.get("payload") or {}).get("name")
            if requested == "Gasflasche":
                return {"id": 12, "name": "Gasflasche"}
            return {"id": 13, "name": str(requested)}
        if verb == "POST" and path == "posts":
            return {"id": 900, "link": "https://example.org/?p=900"}
        return {}

    mock_wp_request.side_effect = _fake_wp_request
    draft = {
        "title": "Tag Test",
        "content_raw": "Inhalt",
        "source_url": "https://example.com/source",
        "canonical_url": "https://example.com/source",
        "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}',
    }
    post_id, _ = publish_article_draft(draft)
    self.assertEqual(post_id, 900)

    post_calls = []
    for recorded in mock_wp_request.call_args_list:
        if recorded.kwargs.get("endpoint") == "posts":
            post_calls.append(recorded)
    self.assertEqual(len(post_calls), 1)
    sent_payload = post_calls[0].kwargs.get("payload", {})
    self.assertEqual(sent_payload.get("tags"), [11, 12])
if __name__ == "__main__":
unittest.main()