feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection
This commit is contained in:
parent
da269d08f1
commit
b0f995d5c9
10 changed files with 374 additions and 36 deletions
|
|
@ -20,7 +20,7 @@ from .ingestion import run_ingestion
|
||||||
from .policy import evaluate_source_policy
|
from .policy import evaluate_source_policy
|
||||||
from .publisher import enqueue_publish, run_publisher
|
from .publisher import enqueue_publish, run_publisher
|
||||||
from .relevance import article_age_days, article_relevance
|
from .relevance import article_age_days, article_relevance
|
||||||
from .rewrite import rewrite_article_text
|
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||||
from .repositories import (
|
from .repositories import (
|
||||||
FeedCreate,
|
FeedCreate,
|
||||||
FeedUpdate,
|
FeedUpdate,
|
||||||
|
|
@ -373,6 +373,7 @@ def _upsert_article_from_existing(
|
||||||
publish_attempts: int | object = _UNSET,
|
publish_attempts: int | object = _UNSET,
|
||||||
publish_last_error: str | None | object = _UNSET,
|
publish_last_error: str | None | object = _UNSET,
|
||||||
published_to_wp_at: str | None | object = _UNSET,
|
published_to_wp_at: str | None | object = _UNSET,
|
||||||
|
meta_json: str | None | object = _UNSET,
|
||||||
) -> None:
|
) -> None:
|
||||||
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
|
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
|
||||||
upsert_article(
|
upsert_article(
|
||||||
|
|
@ -403,7 +404,7 @@ def _upsert_article_from_existing(
|
||||||
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
|
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
|
||||||
word_count=len(str(rewritten or "").split()),
|
word_count=len(str(rewritten or "").split()),
|
||||||
status=article.get("status") if status is None else status,
|
status=article.get("status") if status is None else status,
|
||||||
meta_json=article.get("meta_json"),
|
meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -493,6 +494,8 @@ def admin_dashboard(request: Request):
|
||||||
article["days_old"] = article_age_days(article.get("published_at"))
|
article["days_old"] = article_age_days(article.get("published_at"))
|
||||||
article["relevance"] = article_relevance(article.get("published_at"))
|
article["relevance"] = article_relevance(article.get("published_at"))
|
||||||
article["status_ui"] = internal_to_ui_status(article.get("status"))
|
article["status_ui"] = internal_to_ui_status(article.get("status"))
|
||||||
|
tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
|
||||||
|
article["generated_tags"] = [str(t) for t in tags if t]
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
request,
|
request,
|
||||||
|
|
@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int):
|
||||||
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
|
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
|
||||||
try:
|
try:
|
||||||
rewritten = rewrite_article_text(article)
|
rewritten = rewrite_article_text(article)
|
||||||
|
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
|
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
|
||||||
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved")
|
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||||
|
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
|
||||||
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
|
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/admin/rewrite/run")
def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
    """Rewrite a batch of articles that are waiting in status ``rewrite``.

    Args:
        request: Incoming request; used to resolve the logged-in admin user.
        max_jobs: Form field with the maximum number of articles to process.
            It is clamped to the range 1..100; an unparsable value falls back
            to 10.

    Returns:
        A 303 redirect to the login page when no admin user is present,
        otherwise the dashboard redirect carrying a summary message. The
        message is flagged with ``msg_type="error"`` when at least one
        article failed.
    """
    import logging  # local import: the handler is the only user in this block

    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        limit = max(1, min(int(max_jobs), 100))
    except (TypeError, ValueError):
        limit = 10

    log = logging.getLogger(__name__)
    planned = list_articles(limit=limit, status_filter="rewrite")
    processed = 0
    success = 0
    failed = 0
    for article in planned:
        processed += 1
        try:
            rewritten = rewrite_article_text(article)
            tags = generate_article_tags(article, rewritten_text=rewritten)
            merged_meta = merge_generated_tags(article.get("meta_json"), tags)
            _upsert_article_from_existing(
                article,
                content_rewritten=rewritten,
                status="approved",
                meta_json=merged_meta,
            )
        except Exception:
            # Best effort per article: one failure must not abort the run,
            # but it has to leave a trace instead of vanishing silently.
            failed += 1
            log.exception("Batch rewrite failed for article %s", article.get("id"))
        else:
            success += 1

    msg = f"Rewrite-Run: processed={processed}, success={success}, failed={failed}"
    if failed:
        return _dashboard_redirect(msg=msg, msg_type="error")
    return _dashboard_redirect(msg=msg)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/admin/articles/{article_id}/rewrite-save")
|
@router.post("/admin/articles/{article_id}/rewrite-save")
|
||||||
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
|
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
|
||||||
user = _admin_user(request)
|
user = _admin_user(request)
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ from .ingestion import run_ingestion
|
||||||
from .policy import evaluate_source_policy, is_source_allowed
|
from .policy import evaluate_source_policy, is_source_allowed
|
||||||
from .publisher import enqueue_publish, run_publisher
|
from .publisher import enqueue_publish, run_publisher
|
||||||
from .relevance import article_age_days, article_relevance
|
from .relevance import article_age_days, article_relevance
|
||||||
from .rewrite import rewrite_article_text
|
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||||
from .repositories import (
|
from .repositories import (
|
||||||
ArticleUpsert,
|
ArticleUpsert,
|
||||||
FeedCreate,
|
FeedCreate,
|
||||||
|
|
@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
||||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
|
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
|
||||||
|
|
||||||
rewritten = rewrite_article_text(article)
|
rewritten = rewrite_article_text(article)
|
||||||
|
tags: list[str] = []
|
||||||
|
try:
|
||||||
|
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||||
|
except Exception:
|
||||||
|
tags = []
|
||||||
|
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||||
# upsert via status update + existing fields by lightweight path:
|
# upsert via status update + existing fields by lightweight path:
|
||||||
repo_upsert_article(
|
repo_upsert_article(
|
||||||
ArticleUpsert(
|
ArticleUpsert(
|
||||||
|
|
@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
||||||
published_to_wp_at=article.get("published_to_wp_at"),
|
published_to_wp_at=article.get("published_to_wp_at"),
|
||||||
word_count=len(rewritten.split()),
|
word_count=len(rewritten.split()),
|
||||||
status="approved",
|
status="approved",
|
||||||
meta_json=article.get("meta_json"),
|
meta_json=merged_meta,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return {"ok": True, "id": article_id, "status": "publish"}
|
return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/articles/{article_id}/legal-review")
|
@app.post("/api/articles/{article_id}/legal-review")
|
||||||
|
|
|
||||||
|
|
@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str:
|
||||||
return joined
|
return joined
|
||||||
|
|
||||||
|
|
||||||
def rewrite_article_text(article: dict[str, Any]) -> str:
|
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
|
||||||
|
out: list[str] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for raw in tags:
|
||||||
|
value = re.sub(r"\s+", " ", str(raw or "").strip())
|
||||||
|
value = re.sub(r"^[#\-•\s]+", "", value)
|
||||||
|
value = re.sub(r"[;,.:\s]+$", "", value)
|
||||||
|
if not value:
|
||||||
|
continue
|
||||||
|
if len(value) < 2 or len(value) > 40:
|
||||||
|
continue
|
||||||
|
key = value.casefold()
|
||||||
|
if key in seen:
|
||||||
|
continue
|
||||||
|
seen.add(key)
|
||||||
|
out.append(value)
|
||||||
|
if len(out) >= max_tags:
|
||||||
|
break
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
api_key = settings.openai_api_key
|
api_key = settings.openai_api_key
|
||||||
if not api_key:
|
if not api_key:
|
||||||
raise RuntimeError("OPENAI_API_KEY fehlt")
|
raise RuntimeError("OPENAI_API_KEY fehlt")
|
||||||
|
|
||||||
source_text = _sanitize_source_text(article.get("content_raw") or "")
|
|
||||||
if not source_text:
|
|
||||||
source_text = (article.get("summary") or "").strip()
|
|
||||||
if not source_text:
|
|
||||||
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
|
|
||||||
|
|
||||||
title = (article.get("title") or "").strip()
|
|
||||||
prompt = (
|
|
||||||
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
|
|
||||||
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
|
|
||||||
"ohne Pressekontakt, ohne Quellenblock. "
|
|
||||||
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
|
|
||||||
"Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
|
|
||||||
f"Titel: {title}\n\n"
|
|
||||||
f"Originaltext:\n{source_text}"
|
|
||||||
)
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": settings.openai_model,
|
"model": settings.openai_model,
|
||||||
"temperature": 0.4,
|
"temperature": temperature,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": "Du bist ein deutscher News-Redakteur."},
|
{"role": "system", "content": system},
|
||||||
{"role": "user", "content": prompt},
|
{"role": "user", "content": user},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
req = Request(
|
req = Request(
|
||||||
|
|
@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str:
|
||||||
message = choices[0].get("message", {})
|
message = choices[0].get("message", {})
|
||||||
content = message.get("content")
|
content = message.get("content")
|
||||||
if not isinstance(content, str) or not content.strip():
|
if not isinstance(content, str) or not content.strip():
|
||||||
raise RuntimeError("OpenAI lieferte keinen Rewrite-Text")
|
raise RuntimeError("OpenAI lieferte keinen Inhalt")
|
||||||
return content.strip()
|
return content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the chat model for a German rewrite of an article.

    The sanitized ``content_raw`` is used as source text; when that is empty
    the stripped ``summary`` is used instead.

    Args:
        article: Article mapping; reads ``content_raw``, ``summary`` and ``title``.

    Returns:
        Whatever ``_openai_chat`` returns for the rewrite prompt.

    Raises:
        RuntimeError: If neither ``content_raw`` nor ``summary`` yields text.
    """
    body = _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "").strip()
    if not body:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()
    instructions = (
        "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
        "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
        "ohne Pressekontakt, ohne Quellenblock. "
        "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
        "Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
    )
    user_message = f"{instructions}Titel: {headline}\n\nOriginaltext:\n{body}"
    return _openai_chat("Du bist ein deutscher News-Redakteur.", user_message, temperature=0.4)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tag_array(text: str) -> list[str] | None:
    """Parse ``text`` as a JSON array and return its scalar entries as strings.

    Returns ``None`` when ``text`` is not valid JSON or is not an array.
    ``null``, booleans, objects and nested arrays are skipped so they cannot
    turn into junk tags such as ``"None"`` or ``"{'a': 1}"``.
    """
    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        return None
    if not isinstance(parsed, list):
        return None
    return [
        str(item)
        for item in parsed
        if isinstance(item, (str, int, float)) and not isinstance(item, bool)
    ]


def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Generate tags for an article via the chat model.

    The text sent to the model is, in order of preference, ``rewritten_text``,
    the sanitized ``content_raw``, or ``summary``; only its first 3500
    characters are included in the prompt.

    Args:
        article: Article mapping; reads ``content_raw``, ``summary`` and ``title``.
        rewritten_text: Already rewritten article text, if available.
        max_tags: Upper bound mentioned in the prompt and enforced on the result.

    Returns:
        Normalized tags, or ``[]`` when there is no source text or the model
        answer contains no parsable JSON array. Errors raised by
        ``_openai_chat`` are not caught here.
    """
    source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
    source_text = str(source_text).strip()
    if not source_text:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{source_text[:3500]}"
    )
    raw = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    candidates = _parse_tag_array(raw)
    if candidates is None:
        # Fallback: the model wrapped the array in prose or a code fence, so
        # try the outermost bracketed span of the answer.
        match = re.search(r"\[[\s\S]*\]", raw)
        if match:
            candidates = _parse_tag_array(match.group(0))
    if candidates is None:
        return []
    return _normalize_tags(candidates, max_tags=max_tags)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Store normalized tags under ``generated_tags`` in an article's meta JSON.

    Args:
        meta_json: Existing meta document as a JSON string. Empty, invalid or
            non-object JSON is treated as an empty object.
        tags: Tag candidates; they are passed through ``_normalize_tags``.
            ``None`` is treated as an empty list.

    Returns:
        The meta document serialized as JSON (non-ASCII characters kept
        as-is), with ``generated_tags`` replaced by the normalized tags and
        all other keys untouched.
    """
    meta: dict[str, Any] = {}
    if meta_json:
        try:
            parsed = json.loads(meta_json)
        except (TypeError, ValueError, RecursionError):
            # Unreadable meta is deliberately discarded rather than blocking
            # the rewrite that triggered this merge.
            parsed = None
        if isinstance(parsed, dict):
            meta = parsed
    meta["generated_tags"] = _normalize_tags(tags or [])
    return json.dumps(meta, ensure_ascii=False)
|
||||||
|
|
|
||||||
|
|
@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None:
|
||||||
paragraphs = []
|
paragraphs = []
|
||||||
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
|
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
|
||||||
text = _clean_text(match.group(1))
|
text = _clean_text(match.group(1))
|
||||||
if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE):
|
if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE):
|
||||||
paragraphs.append(text)
|
paragraphs.append(text)
|
||||||
|
|
||||||
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
|
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
|
||||||
|
|
@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
|
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
|
||||||
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
|
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
|
||||||
for idx, line in enumerate(lines):
|
for idx, line in enumerate(lines):
|
||||||
if marker_re.search(line):
|
if marker_re.search(line):
|
||||||
chunk = [line]
|
chunk = [line]
|
||||||
for nxt in lines[idx + 1 : idx + 6]:
|
for nxt in lines[idx + 1 : idx + 6]:
|
||||||
if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE):
|
if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE):
|
||||||
break
|
break
|
||||||
chunk.append(nxt)
|
chunk.append(nxt)
|
||||||
return _clean_text("\n".join(chunk))
|
return _clean_text("\n".join(chunk))
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
|
r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
|
||||||
content_text,
|
content_text,
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import mimetypes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import quote_plus, urlparse
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
from .config import get_settings
|
from .config import get_settings
|
||||||
|
|
@ -25,7 +25,7 @@ def _wp_request(
|
||||||
method: str,
|
method: str,
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
payload: dict[str, Any] | None = None,
|
payload: dict[str, Any] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> Any:
|
||||||
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
|
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
|
||||||
data = json.dumps(payload).encode("utf-8") if payload is not None else None
|
data = json.dumps(payload).encode("utf-8") if payload is not None else None
|
||||||
req = Request(
|
req = Request(
|
||||||
|
|
@ -41,8 +41,7 @@ def _wp_request(
|
||||||
)
|
)
|
||||||
with urlopen(req, timeout=20) as resp:
|
with urlopen(req, timeout=20) as resp:
|
||||||
raw = resp.read().decode("utf-8", errors="replace")
|
raw = resp.read().decode("utf-8", errors="replace")
|
||||||
parsed = json.loads(raw) if raw else {}
|
return json.loads(raw) if raw else {}
|
||||||
return parsed if isinstance(parsed, dict) else {}
|
|
||||||
|
|
||||||
|
|
||||||
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||||
|
|
@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||||
return selected if isinstance(selected, str) and selected.strip() else None
|
return selected if isinstance(selected, str) and selected.strip() else None
|
||||||
|
|
||||||
|
|
||||||
|
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
|
||||||
|
if not meta_json:
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
meta = json.loads(meta_json)
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
if not isinstance(meta, dict):
|
||||||
|
return []
|
||||||
|
raw_tags = meta.get("generated_tags")
|
||||||
|
if not isinstance(raw_tags, list):
|
||||||
|
return []
|
||||||
|
tags: list[str] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for item in raw_tags:
|
||||||
|
value = str(item or "").strip()
|
||||||
|
if not value:
|
||||||
|
continue
|
||||||
|
key = value.casefold()
|
||||||
|
if key in seen:
|
||||||
|
continue
|
||||||
|
seen.add(key)
|
||||||
|
tags.append(value)
|
||||||
|
if len(tags) >= 12:
|
||||||
|
break
|
||||||
|
return tags
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
|
||||||
|
ids: list[int] = []
|
||||||
|
seen: set[int] = set()
|
||||||
|
for tag in tags:
|
||||||
|
name = tag.strip()
|
||||||
|
if not name:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
endpoint = f"tags?search={quote_plus(name)}&per_page=20"
|
||||||
|
result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint)
|
||||||
|
tag_id: int | None = None
|
||||||
|
if isinstance(result, list):
|
||||||
|
for row in result:
|
||||||
|
if not isinstance(row, dict):
|
||||||
|
continue
|
||||||
|
row_name = str(row.get("name") or "")
|
||||||
|
rid = int(row.get("id", 0) or 0)
|
||||||
|
if rid <= 0:
|
||||||
|
continue
|
||||||
|
if row_name.casefold() == name.casefold():
|
||||||
|
tag_id = rid
|
||||||
|
break
|
||||||
|
if tag_id is None:
|
||||||
|
for row in result:
|
||||||
|
if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0:
|
||||||
|
tag_id = int(row.get("id", 0))
|
||||||
|
break
|
||||||
|
if tag_id is None:
|
||||||
|
created = _wp_request(
|
||||||
|
base_url=base_url,
|
||||||
|
auth_header=auth_header,
|
||||||
|
method="POST",
|
||||||
|
endpoint="tags",
|
||||||
|
payload={"name": name},
|
||||||
|
)
|
||||||
|
if isinstance(created, dict):
|
||||||
|
rid = int(created.get("id", 0) or 0)
|
||||||
|
if rid > 0:
|
||||||
|
tag_id = rid
|
||||||
|
if tag_id is not None and tag_id > 0 and tag_id not in seen:
|
||||||
|
seen.add(tag_id)
|
||||||
|
ids.append(tag_id)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
return ids
|
||||||
|
|
||||||
|
|
||||||
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
|
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "rss-news-publisher/1.0",
|
"User-Agent": "rss-news-publisher/1.0",
|
||||||
|
|
@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||||
payload["featured_media"] = featured_media_id
|
payload["featured_media"] = featured_media_id
|
||||||
|
|
||||||
wp_post_id = article.get("wp_post_id")
|
wp_post_id = article.get("wp_post_id")
|
||||||
|
tag_ids = _resolve_wp_tag_ids(
|
||||||
|
base_url=settings.wordpress_base_url,
|
||||||
|
auth_header=auth,
|
||||||
|
tags=_selected_tags_from_meta(article.get("meta_json")),
|
||||||
|
)
|
||||||
|
if tag_ids:
|
||||||
|
payload["tags"] = tag_ids
|
||||||
|
|
||||||
if wp_post_id:
|
if wp_post_id:
|
||||||
result = _wp_request(
|
result = _wp_request(
|
||||||
base_url=settings.wordpress_base_url,
|
base_url=settings.wordpress_base_url,
|
||||||
|
|
@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||||
payload=payload,
|
payload=payload,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not isinstance(result, dict):
|
||||||
|
raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
|
||||||
post_id = int(result.get("id", 0))
|
post_id = int(result.get("id", 0))
|
||||||
if post_id <= 0:
|
if post_id <= 0:
|
||||||
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
|
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
|
||||||
|
|
|
||||||
|
|
@ -170,6 +170,9 @@
|
||||||
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
|
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
|
||||||
<button type="submit">Rewrite-Text speichern</button>
|
<button type="submit">Rewrite-Text speichern</button>
|
||||||
</form>
|
</form>
|
||||||
|
{% if article.meta.generated_tags %}
|
||||||
|
<p><strong>Generierte Tags:</strong> {{ article.meta.generated_tags|join("; ") }}</p>
|
||||||
|
{% endif %}
|
||||||
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
|
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,15 @@
|
||||||
</form>
|
</form>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<section class="card">
|
||||||
|
<h2>Rewrite Run (geplante Artikel)</h2>
|
||||||
|
<p class="subtle">Verarbeitet alle Artikel im Status <code>rewrite</code> und setzt sie auf <code>publish</code>.</p>
|
||||||
|
<form method="post" action="/admin/rewrite/run" class="row">
|
||||||
|
<input name="max_jobs" value="10" />
|
||||||
|
<button type="submit">Rewrite Run starten</button>
|
||||||
|
</form>
|
||||||
|
</section>
|
||||||
|
|
||||||
<section class="card">
|
<section class="card">
|
||||||
<h2>Quellen + Policy</h2>
|
<h2>Quellen + Policy</h2>
|
||||||
<table>
|
<table>
|
||||||
|
|
@ -269,6 +278,9 @@
|
||||||
{% if a.summary %}
|
{% if a.summary %}
|
||||||
<div><strong>Summary:</strong> {{ a.summary }}</div>
|
<div><strong>Summary:</strong> {{ a.summary }}</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if a.generated_tags %}
|
||||||
|
<div><strong>Tags:</strong> {{ a.generated_tags|join("; ") }}</div>
|
||||||
|
{% endif %}
|
||||||
{% if a.content_raw %}
|
{% if a.content_raw %}
|
||||||
<details>
|
<details>
|
||||||
<summary>Volltext anzeigen</summary>
|
<summary>Volltext anzeigen</summary>
|
||||||
|
|
|
||||||
|
|
@ -271,6 +271,71 @@ class TestAdminUi(unittest.TestCase):
|
||||||
self.assertIn("Neu", article.get("content_rewritten") or "")
|
self.assertIn("Neu", article.get("content_rewritten") or "")
|
||||||
self.assertIsNone(article.get("wp_post_id"))
|
self.assertIsNone(article.get("wp_post_id"))
|
||||||
|
|
||||||
|
@patch("backend.app.admin_ui.generate_article_tags")
@patch("backend.app.admin_ui.rewrite_article_text")
def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None:
    """The batch endpoint approves a ``rewrite`` article and stores generated tags."""
    mock_tags.return_value = ["Rheingas", "Monheim"]
    mock_rewrite_text.return_value = "<h2>Neu</h2><p>Text</p>"

    # Fixture chain: source -> feed -> one article waiting in status "rewrite".
    source = SourceCreate(
        name="Batch Source",
        base_url="https://example.org",
        terms_url="https://example.org/terms",
        license_name="cc-by",
        risk_level="green",
        is_enabled=True,
        notes=None,
        last_reviewed_at=None,
    )
    source_id = create_source(source)

    feed = FeedCreate(
        name="Batch Feed",
        url="https://example.org/feed.xml",
        source_id=source_id,
        is_enabled=True,
    )
    feed_id = create_feed(feed)

    article_fields = {
        "feed_id": feed_id,
        "source_article_id": "batch-1",
        "source_hash": "batch-hash-1",
        "title": "Batch Titel",
        "source_url": "https://example.org/batch",
        "canonical_url": "https://example.org/batch",
        "published_at": None,
        "author": "Autor",
        "summary": "Summary",
        "content_raw": "Raw",
        "content_rewritten": None,
        "image_urls_json": None,
        "press_contact": None,
        "source_name_snapshot": "Batch Source",
        "source_terms_url_snapshot": "https://example.org/terms",
        "source_license_name_snapshot": "cc-by",
        "legal_checked": False,
        "legal_checked_at": None,
        "legal_note": None,
        "wp_post_id": None,
        "wp_post_url": None,
        "publish_attempts": 0,
        "publish_last_error": None,
        "published_to_wp_at": None,
        "word_count": 1,
        "status": "rewrite",
        "meta_json": "{}",
    }
    article_id = upsert_article(ArticleUpsert(**article_fields))

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)
    res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False)
    self.assertEqual(res.status_code, 303)

    article = get_article_by_id(article_id)
    self.assertIsNotNone(article)
    self.assertEqual(article.get("status"), "approved")
    self.assertIn("generated_tags", article.get("meta_json", ""))
|
||||||
|
|
||||||
@patch("backend.app.admin_ui.urlopen")
|
@patch("backend.app.admin_ui.urlopen")
|
||||||
def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
|
def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
|
||||||
class _FakeHeaders:
|
class _FakeHeaders:
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,25 @@ SAMPLE_HTML = """
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
SAMPLE_HTML_AGENTUR = """
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="de">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta property="og:title" content="Demo Meldung Agentur" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<p>Inhalt der Meldung.</p>
|
||||||
|
<h3>Agentur</h3>
|
||||||
|
<p>Agenturname GmbH</p>
|
||||||
|
<p>presse@agentur.example</p>
|
||||||
|
<p>Original-Content von Beispiel</p>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class _FakeHeaders:
|
class _FakeHeaders:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -64,6 +83,14 @@ class TestSourceExtraction(unittest.TestCase):
|
||||||
self.assertIn("Pressekontakt", extracted.press_contact or "")
|
self.assertIn("Pressekontakt", extracted.press_contact or "")
|
||||||
self.assertIsNone(extracted.extraction_error)
|
self.assertIsNone(extracted.extraction_error)
|
||||||
|
|
||||||
|
@patch("backend.app.source_extraction.urlopen")
def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None:
    """An ``Agentur`` heading and the lines after it end up in ``press_contact``."""
    mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR)

    extracted = extract_article("https://www.presseportal.de/pm/155103/6210401")

    for expected in ("Agentur", "Agenturname", "presse@agentur.example"):
        self.assertIn(expected, extracted.press_contact or "")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,40 @@ class TestWordpressPublish(unittest.TestCase):
|
||||||
self.assertNotIn("Pressekontakt", content)
|
self.assertNotIn("Pressekontakt", content)
|
||||||
self.assertIn("eigentliche Text", content)
|
self.assertIn("eigentliche Text", content)
|
||||||
|
|
||||||
|
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None:
    """Publishing looks up or creates each generated tag and sends the IDs with the post."""

    def _fake_wp_request(**kwargs):
        # Minimal WordPress stand-in: "Rheingas" already exists (id 11),
        # every other tag has to be created.
        method = kwargs.get("method", "")
        endpoint = kwargs.get("endpoint", "")
        if method == "GET" and endpoint.startswith("tags?search="):
            return [{"id": 11, "name": "Rheingas"}] if "Rheingas" in endpoint else []
        if method != "POST":
            return {}
        if endpoint == "tags":
            name = (kwargs.get("payload") or {}).get("name")
            return {"id": 12 if name == "Gasflasche" else 13, "name": str(name)}
        if endpoint == "posts":
            return {"id": 900, "link": "https://example.org/?p=900"}
        return {}

    mock_wp_request.side_effect = _fake_wp_request
    article = {
        "title": "Tag Test",
        "content_raw": "Inhalt",
        "source_url": "https://example.com/source",
        "canonical_url": "https://example.com/source",
        "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}',
    }

    post_id, _ = publish_article_draft(article)
    self.assertEqual(post_id, 900)

    post_calls = []
    for call in mock_wp_request.call_args_list:
        if call.kwargs.get("endpoint") == "posts":
            post_calls.append(call)
    self.assertEqual(len(post_calls), 1)

    payload = post_calls[0].kwargs.get("payload", {})
    self.assertEqual(payload.get("tags"), [11, 12])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue