feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection
This commit is contained in:
parent
da269d08f1
commit
b0f995d5c9
10 changed files with 374 additions and 36 deletions
|
|
@ -20,7 +20,7 @@ from .ingestion import run_ingestion
|
|||
from .policy import evaluate_source_policy
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import rewrite_article_text
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .repositories import (
|
||||
FeedCreate,
|
||||
FeedUpdate,
|
||||
|
|
@ -373,6 +373,7 @@ def _upsert_article_from_existing(
|
|||
publish_attempts: int | object = _UNSET,
|
||||
publish_last_error: str | None | object = _UNSET,
|
||||
published_to_wp_at: str | None | object = _UNSET,
|
||||
meta_json: str | None | object = _UNSET,
|
||||
) -> None:
|
||||
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
|
||||
upsert_article(
|
||||
|
|
@ -403,7 +404,7 @@ def _upsert_article_from_existing(
|
|||
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
|
||||
word_count=len(str(rewritten or "").split()),
|
||||
status=article.get("status") if status is None else status,
|
||||
meta_json=article.get("meta_json"),
|
||||
meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -493,6 +494,8 @@ def admin_dashboard(request: Request):
|
|||
article["days_old"] = article_age_days(article.get("published_at"))
|
||||
article["relevance"] = article_relevance(article.get("published_at"))
|
||||
article["status_ui"] = internal_to_ui_status(article.get("status"))
|
||||
tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
|
||||
article["generated_tags"] = [str(t) for t in tags if t]
|
||||
|
||||
return templates.TemplateResponse(
|
||||
request,
|
||||
|
|
@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int):
|
|||
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
|
||||
try:
|
||||
rewritten = rewrite_article_text(article)
|
||||
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||
except Exception as exc:
|
||||
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
|
||||
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved")
|
||||
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
|
||||
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
|
||||
|
||||
|
||||
@router.post("/admin/rewrite/run")
def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
    """Rewrite a batch of articles that are waiting in status ``rewrite``.

    For every selected article the text is rewritten, tags are generated and
    merged into ``meta_json``, and the article is stored with status
    ``approved``. A failure on one article does not stop the batch.

    Args:
        request: Incoming request; used to resolve the admin user.
        max_jobs: Form field with the maximum number of articles to process.
            Clamped to 1..100; unparsable values fall back to 10.

    Returns:
        A redirect to the login page when no admin user is resolved, otherwise
        a dashboard redirect carrying the processed/success/failed counters.
    """
    # Function-scope import so this block does not depend on the module's import list.
    import logging

    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        limit = max(1, min(int(max_jobs), 100))
    except (TypeError, ValueError):
        # Only a malformed number should trigger the default.
        limit = 10

    planned = list_articles(limit=limit, status_filter="rewrite")
    processed = 0
    success = 0
    failed = 0
    for article in planned:
        processed += 1
        try:
            rewritten = rewrite_article_text(article)
            tags = generate_article_tags(article, rewritten_text=rewritten)
            merged_meta = merge_generated_tags(article.get("meta_json"), tags)
            _upsert_article_from_existing(
                article,
                content_rewritten=rewritten,
                status="approved",
                meta_json=merged_meta,
            )
        except Exception:
            # Best-effort batch: keep going, but leave a trace so a failed=N
            # counter can be diagnosed afterwards.
            failed += 1
            logging.getLogger(__name__).exception(
                "Batch rewrite failed for article id=%s", article.get("id")
            )
        else:
            success += 1

    return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/rewrite-save")
|
||||
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
|
||||
user = _admin_user(request)
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ from .ingestion import run_ingestion
|
|||
from .policy import evaluate_source_policy, is_source_allowed
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import rewrite_article_text
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
|
|
@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
|||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
|
||||
|
||||
rewritten = rewrite_article_text(article)
|
||||
tags: list[str] = []
|
||||
try:
|
||||
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||
except Exception:
|
||||
tags = []
|
||||
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||
# upsert via status update + existing fields by lightweight path:
|
||||
repo_upsert_article(
|
||||
ArticleUpsert(
|
||||
|
|
@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
|||
published_to_wp_at=article.get("published_to_wp_at"),
|
||||
word_count=len(rewritten.split()),
|
||||
status="approved",
|
||||
meta_json=article.get("meta_json"),
|
||||
meta_json=merged_meta,
|
||||
)
|
||||
)
|
||||
return {"ok": True, "id": article_id, "status": "publish"}
|
||||
return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/legal-review")
|
||||
|
|
|
|||
|
|
@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str:
|
|||
return joined
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
|
||||
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
    """Clean, de-duplicate and cap a list of raw tag strings.

    Each entry has its whitespace collapsed, leading ``#``/``-``/``•`` markers
    and trailing ``; , . :`` punctuation removed. Entries shorter than 2 or
    longer than 40 characters are dropped. Duplicates are detected
    case-insensitively; the first spelling wins and input order is kept.

    Args:
        tags: Raw tag candidates; falsy entries are ignored.
        max_tags: Maximum number of tags to return. Values <= 0 yield ``[]``.

    Returns:
        The cleaned tags, at most ``max_tags`` long.
    """
    # The limit is otherwise only checked after an append, so a non-positive
    # limit would still let one tag through.
    if max_tags <= 0 or not tags:
        return []

    out: list[str] = []
    seen: set[str] = set()
    for raw in tags:
        value = re.sub(r"\s+", " ", str(raw or "").strip())
        value = re.sub(r"^[#\-•\s]+", "", value)
        value = re.sub(r"[;,.:\s]+$", "", value)
        if not 2 <= len(value) <= 40:
            continue
        key = value.casefold()
        if key in seen:
            continue
        seen.add(key)
        out.append(value)
        if len(out) >= max_tags:
            break
    return out
|
||||
|
||||
|
||||
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
|
||||
settings = get_settings()
|
||||
api_key = settings.openai_api_key
|
||||
if not api_key:
|
||||
raise RuntimeError("OPENAI_API_KEY fehlt")
|
||||
|
||||
source_text = _sanitize_source_text(article.get("content_raw") or "")
|
||||
if not source_text:
|
||||
source_text = (article.get("summary") or "").strip()
|
||||
if not source_text:
|
||||
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
|
||||
|
||||
title = (article.get("title") or "").strip()
|
||||
prompt = (
|
||||
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
|
||||
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
|
||||
"ohne Pressekontakt, ohne Quellenblock. "
|
||||
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
|
||||
"Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
|
||||
f"Titel: {title}\n\n"
|
||||
f"Originaltext:\n{source_text}"
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": settings.openai_model,
|
||||
"temperature": 0.4,
|
||||
"temperature": temperature,
|
||||
"messages": [
|
||||
{"role": "system", "content": "Du bist ein deutscher News-Redakteur."},
|
||||
{"role": "user", "content": prompt},
|
||||
{"role": "system", "content": system},
|
||||
{"role": "user", "content": user},
|
||||
],
|
||||
}
|
||||
req = Request(
|
||||
|
|
@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str:
|
|||
message = choices[0].get("message", {})
|
||||
content = message.get("content")
|
||||
if not isinstance(content, str) or not content.strip():
|
||||
raise RuntimeError("OpenAI lieferte keinen Rewrite-Text")
|
||||
raise RuntimeError("OpenAI lieferte keinen Inhalt")
|
||||
return content.strip()
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Send an article's source text to the chat model for a German rewrite.

    The sanitized ``content_raw`` is used as source text; when that is empty
    the stripped ``summary`` is used instead. The prompt asks for a rewrite in
    informal German with HTML headings and paragraphs.

    Args:
        article: Article record; ``content_raw``, ``summary`` and ``title``
            are read.

    Returns:
        Whatever ``_openai_chat`` returns for the rewrite prompt.

    Raises:
        RuntimeError: If neither ``content_raw`` nor ``summary`` yields text.
            Errors raised by ``_openai_chat`` propagate unchanged.
    """
    body = _sanitize_source_text(article.get("content_raw") or "")
    if not body:
        body = (article.get("summary") or "").strip()
        if not body:
            raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()
    prompt_parts = [
        "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. ",
        "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, ",
        "ohne Pressekontakt, ohne Quellenblock. ",
        "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). ",
        "Inhaltlich korrekt bleiben, nichts erfinden.\n\n",
        f"Titel: {headline}\n\n",
        f"Originaltext:\n{body}",
    ]
    return _openai_chat(
        system="Du bist ein deutscher News-Redakteur.",
        user="".join(prompt_parts),
        temperature=0.4,
    )
|
||||
|
||||
|
||||
def _coerce_tag_items(parsed: Any) -> list[str] | None:
    """Return the usable items of a decoded JSON array, or None if it is not a list."""
    if not isinstance(parsed, list):
        return None
    # Keep only strings and plain numbers: str(None), str({...}) or str([...])
    # would otherwise end up as junk tags such as "None".
    return [
        str(item)
        for item in parsed
        if isinstance(item, (str, int, float)) and not isinstance(item, bool)
    ]


def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Ask the chat model for keyword tags describing an article.

    The text basis is ``rewritten_text`` if given, otherwise the sanitized
    ``content_raw``, otherwise ``summary``. Only the first 3500 characters of
    that text are sent. The reply is expected to be a JSON array of strings;
    if it is wrapped in other text, the span from the first ``[`` to the last
    ``]`` is tried as a fallback.

    Args:
        article: Article record; ``content_raw``, ``summary`` and ``title``
            are read.
        rewritten_text: Optional already-rewritten text to tag instead of the
            raw source.
        max_tags: Upper bound requested from the model and enforced on the
            result. Values <= 0 return ``[]`` without calling the model.

    Returns:
        Normalized tags, or ``[]`` when there is no text or the reply cannot
        be parsed as a JSON array.

    Raises:
        Errors raised by ``_openai_chat`` propagate to the caller.
    """
    if max_tags <= 0:
        return []

    source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
    source_text = str(source_text).strip()
    if not source_text:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{source_text[:3500]}"
    )
    raw = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    # First try the whole reply, then the first "[ ... ]" span in case the
    # model wrapped the array in prose or a code fence.
    candidates = [raw]
    match = re.search(r"\[[\s\S]*\]", raw)
    if match and match.group(0) != raw:
        candidates.append(match.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        items = _coerce_tag_items(parsed)
        if items is not None:
            return _normalize_tags(items, max_tags=max_tags)
    return []
|
||||
|
||||
|
||||
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Store normalized tags under ``generated_tags`` in an article's meta JSON.

    Args:
        meta_json: Existing meta document as a JSON string, or None/empty.
            Anything that does not decode to a JSON object is treated as an
            empty document.
        tags: Tag candidates; they are passed through ``_normalize_tags``.

    Returns:
        The updated meta document serialized as JSON (non-ASCII characters
        are kept as-is). Other keys of the existing document are preserved.
    """
    decoded: Any = None
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None

    document: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    document["generated_tags"] = _normalize_tags(tags)
    return json.dumps(document, ensure_ascii=False)
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None:
|
|||
paragraphs = []
|
||||
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
|
||||
text = _clean_text(match.group(1))
|
||||
if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE):
|
||||
if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE):
|
||||
paragraphs.append(text)
|
||||
|
||||
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
|
||||
|
|
@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None:
|
|||
return None
|
||||
|
||||
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
|
||||
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
|
||||
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
|
||||
for idx, line in enumerate(lines):
|
||||
if marker_re.search(line):
|
||||
chunk = [line]
|
||||
for nxt in lines[idx + 1 : idx + 6]:
|
||||
if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE):
|
||||
if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE):
|
||||
break
|
||||
chunk.append(nxt)
|
||||
return _clean_text("\n".join(chunk))
|
||||
|
||||
match = re.search(
|
||||
r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
|
||||
r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
|
||||
content_text,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import mimetypes
|
|||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
|
@ -25,7 +25,7 @@ def _wp_request(
|
|||
method: str,
|
||||
endpoint: str,
|
||||
payload: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
) -> Any:
|
||||
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
|
||||
data = json.dumps(payload).encode("utf-8") if payload is not None else None
|
||||
req = Request(
|
||||
|
|
@ -41,8 +41,7 @@ def _wp_request(
|
|||
)
|
||||
with urlopen(req, timeout=20) as resp:
|
||||
raw = resp.read().decode("utf-8", errors="replace")
|
||||
parsed = json.loads(raw) if raw else {}
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
return json.loads(raw) if raw else {}
|
||||
|
||||
|
||||
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||
|
|
@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
|||
return selected if isinstance(selected, str) and selected.strip() else None
|
||||
|
||||
|
||||
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
|
||||
if not meta_json:
|
||||
return []
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return []
|
||||
if not isinstance(meta, dict):
|
||||
return []
|
||||
raw_tags = meta.get("generated_tags")
|
||||
if not isinstance(raw_tags, list):
|
||||
return []
|
||||
tags: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in raw_tags:
|
||||
value = str(item or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
tags.append(value)
|
||||
if len(tags) >= 12:
|
||||
break
|
||||
return tags
|
||||
|
||||
|
||||
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
|
||||
ids: list[int] = []
|
||||
seen: set[int] = set()
|
||||
for tag in tags:
|
||||
name = tag.strip()
|
||||
if not name:
|
||||
continue
|
||||
try:
|
||||
endpoint = f"tags?search={quote_plus(name)}&per_page=20"
|
||||
result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint)
|
||||
tag_id: int | None = None
|
||||
if isinstance(result, list):
|
||||
for row in result:
|
||||
if not isinstance(row, dict):
|
||||
continue
|
||||
row_name = str(row.get("name") or "")
|
||||
rid = int(row.get("id", 0) or 0)
|
||||
if rid <= 0:
|
||||
continue
|
||||
if row_name.casefold() == name.casefold():
|
||||
tag_id = rid
|
||||
break
|
||||
if tag_id is None:
|
||||
for row in result:
|
||||
if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0:
|
||||
tag_id = int(row.get("id", 0))
|
||||
break
|
||||
if tag_id is None:
|
||||
created = _wp_request(
|
||||
base_url=base_url,
|
||||
auth_header=auth_header,
|
||||
method="POST",
|
||||
endpoint="tags",
|
||||
payload={"name": name},
|
||||
)
|
||||
if isinstance(created, dict):
|
||||
rid = int(created.get("id", 0) or 0)
|
||||
if rid > 0:
|
||||
tag_id = rid
|
||||
if tag_id is not None and tag_id > 0 and tag_id not in seen:
|
||||
seen.add(tag_id)
|
||||
ids.append(tag_id)
|
||||
except Exception:
|
||||
continue
|
||||
return ids
|
||||
|
||||
|
||||
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
|
||||
headers = {
|
||||
"User-Agent": "rss-news-publisher/1.0",
|
||||
|
|
@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
payload["featured_media"] = featured_media_id
|
||||
|
||||
wp_post_id = article.get("wp_post_id")
|
||||
tag_ids = _resolve_wp_tag_ids(
|
||||
base_url=settings.wordpress_base_url,
|
||||
auth_header=auth,
|
||||
tags=_selected_tags_from_meta(article.get("meta_json")),
|
||||
)
|
||||
if tag_ids:
|
||||
payload["tags"] = tag_ids
|
||||
|
||||
if wp_post_id:
|
||||
result = _wp_request(
|
||||
base_url=settings.wordpress_base_url,
|
||||
|
|
@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
payload=payload,
|
||||
)
|
||||
|
||||
if not isinstance(result, dict):
|
||||
raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
|
||||
post_id = int(result.get("id", 0))
|
||||
if post_id <= 0:
|
||||
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue