feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection
This commit is contained in:
parent
da269d08f1
commit
b0f995d5c9
10 changed files with 374 additions and 36 deletions
|
|
@ -20,7 +20,7 @@ from .ingestion import run_ingestion
|
|||
from .policy import evaluate_source_policy
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import rewrite_article_text
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .repositories import (
|
||||
FeedCreate,
|
||||
FeedUpdate,
|
||||
|
|
@ -373,6 +373,7 @@ def _upsert_article_from_existing(
|
|||
publish_attempts: int | object = _UNSET,
|
||||
publish_last_error: str | None | object = _UNSET,
|
||||
published_to_wp_at: str | None | object = _UNSET,
|
||||
meta_json: str | None | object = _UNSET,
|
||||
) -> None:
|
||||
rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
|
||||
upsert_article(
|
||||
|
|
@ -403,7 +404,7 @@ def _upsert_article_from_existing(
|
|||
published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
|
||||
word_count=len(str(rewritten or "").split()),
|
||||
status=article.get("status") if status is None else status,
|
||||
meta_json=article.get("meta_json"),
|
||||
meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -493,6 +494,8 @@ def admin_dashboard(request: Request):
|
|||
article["days_old"] = article_age_days(article.get("published_at"))
|
||||
article["relevance"] = article_relevance(article.get("published_at"))
|
||||
article["status_ui"] = internal_to_ui_status(article.get("status"))
|
||||
tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
|
||||
article["generated_tags"] = [str(t) for t in tags if t]
|
||||
|
||||
return templates.TemplateResponse(
|
||||
request,
|
||||
|
|
@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int):
|
|||
return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
|
||||
try:
|
||||
rewritten = rewrite_article_text(article)
|
||||
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||
except Exception as exc:
|
||||
return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
|
||||
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved")
|
||||
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||
_upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
|
||||
return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
|
||||
|
||||
|
||||
@router.post("/admin/rewrite/run")
def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
    """Rewrite a batch of articles that are waiting in status ``rewrite``.

    For every selected article the text is rewritten, tags are generated from
    the rewritten text, the tags are merged into the article's ``meta_json``
    and the article is stored with status ``approved``.

    Args:
        request: Incoming request; used to resolve the logged-in admin user.
        max_jobs: Form field with the maximum number of articles to process.
            Parsed as an integer and clamped to the range 1..100; falls back
            to 10 when it cannot be parsed.

    Returns:
        A redirect to ``/admin/login`` when no admin user is logged in,
        otherwise a dashboard redirect whose message carries the
        processed / success / failed counters.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import logging

    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        limit = max(1, min(int(max_jobs), 100))
    except (TypeError, ValueError):
        limit = 10

    planned = list_articles(limit=limit, status_filter="rewrite")
    success = 0
    failed = 0
    for article in planned:
        try:
            rewritten = rewrite_article_text(article)
            tags = generate_article_tags(article, rewritten_text=rewritten)
            merged_meta = merge_generated_tags(article.get("meta_json"), tags)
            _upsert_article_from_existing(
                article,
                content_rewritten=rewritten,
                status="approved",
                meta_json=merged_meta,
            )
        except Exception:
            # Best effort: one failing article must not abort the whole batch,
            # but the reason is logged instead of being silently dropped.
            failed += 1
            article_id = article.get("id") if hasattr(article, "get") else None
            logging.getLogger(__name__).exception(
                "Batch rewrite failed for article %s", article_id
            )
        else:
            success += 1

    # Every planned article ends up in exactly one of the two counters.
    processed = success + failed
    return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/rewrite-save")
|
||||
def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
|
||||
user = _admin_user(request)
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ from .ingestion import run_ingestion
|
|||
from .policy import evaluate_source_policy, is_source_allowed
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import rewrite_article_text
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
|
|
@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
|||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
|
||||
|
||||
rewritten = rewrite_article_text(article)
|
||||
tags: list[str] = []
|
||||
try:
|
||||
tags = generate_article_tags(article, rewritten_text=rewritten)
|
||||
except Exception:
|
||||
tags = []
|
||||
merged_meta = merge_generated_tags(article.get("meta_json"), tags)
|
||||
# upsert via status update + existing fields by lightweight path:
|
||||
repo_upsert_article(
|
||||
ArticleUpsert(
|
||||
|
|
@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut
|
|||
published_to_wp_at=article.get("published_to_wp_at"),
|
||||
word_count=len(rewritten.split()),
|
||||
status="approved",
|
||||
meta_json=article.get("meta_json"),
|
||||
meta_json=merged_meta,
|
||||
)
|
||||
)
|
||||
return {"ok": True, "id": article_id, "status": "publish"}
|
||||
return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/legal-review")
|
||||
|
|
|
|||
|
|
@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str:
|
|||
return joined
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
|
||||
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
|
||||
out: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for raw in tags:
|
||||
value = re.sub(r"\s+", " ", str(raw or "").strip())
|
||||
value = re.sub(r"^[#\-•\s]+", "", value)
|
||||
value = re.sub(r"[;,.:\s]+$", "", value)
|
||||
if not value:
|
||||
continue
|
||||
if len(value) < 2 or len(value) > 40:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(value)
|
||||
if len(out) >= max_tags:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
|
||||
settings = get_settings()
|
||||
api_key = settings.openai_api_key
|
||||
if not api_key:
|
||||
raise RuntimeError("OPENAI_API_KEY fehlt")
|
||||
|
||||
source_text = _sanitize_source_text(article.get("content_raw") or "")
|
||||
if not source_text:
|
||||
source_text = (article.get("summary") or "").strip()
|
||||
if not source_text:
|
||||
raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
|
||||
|
||||
title = (article.get("title") or "").strip()
|
||||
prompt = (
|
||||
"Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
|
||||
"Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
|
||||
"ohne Pressekontakt, ohne Quellenblock. "
|
||||
"Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
|
||||
"Inhaltlich korrekt bleiben, nichts erfinden.\n\n"
|
||||
f"Titel: {title}\n\n"
|
||||
f"Originaltext:\n{source_text}"
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": settings.openai_model,
|
||||
"temperature": 0.4,
|
||||
"temperature": temperature,
|
||||
"messages": [
|
||||
{"role": "system", "content": "Du bist ein deutscher News-Redakteur."},
|
||||
{"role": "user", "content": prompt},
|
||||
{"role": "system", "content": system},
|
||||
{"role": "user", "content": user},
|
||||
],
|
||||
}
|
||||
req = Request(
|
||||
|
|
@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str:
|
|||
message = choices[0].get("message", {})
|
||||
content = message.get("content")
|
||||
if not isinstance(content, str) or not content.strip():
|
||||
raise RuntimeError("OpenAI lieferte keinen Rewrite-Text")
|
||||
raise RuntimeError("OpenAI lieferte keinen Inhalt")
|
||||
return content.strip()
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the chat model for a German rewrite of the article as HTML.

    The sanitized ``content_raw`` is used as source text; when that is empty
    the stripped ``summary`` is used instead.

    Args:
        article: Article mapping; the keys ``content_raw``, ``summary`` and
            ``title`` are read.

    Returns:
        Whatever ``_openai_chat`` returns for the rewrite prompt.

    Raises:
        RuntimeError: If neither ``content_raw`` nor ``summary`` yields text.
    """
    original = _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "").strip()
    if not original:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()
    instructions = "".join(
        [
            "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. ",
            "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, ",
            "ohne Pressekontakt, ohne Quellenblock. ",
            "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). ",
            "Inhaltlich korrekt bleiben, nichts erfinden.\n\n",
        ]
    )
    user_message = f"{instructions}Titel: {headline}\n\nOriginaltext:\n{original}"
    return _openai_chat("Du bist ein deutscher News-Redakteur.", user_message, temperature=0.4)
|
||||
|
||||
|
||||
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Ask the chat model for tags describing the article.

    The text sent to the model is, in order of preference: ``rewritten_text``,
    the sanitized ``content_raw``, or ``summary``. Only the first 3500
    characters are included in the prompt.

    Args:
        article: Article mapping; ``content_raw``, ``summary`` and ``title``
            are read.
        rewritten_text: Optional rewritten article text to tag instead of the
            raw content.
        max_tags: Upper bound mentioned in the prompt and enforced on the
            result.

    Returns:
        Normalized tags, or ``[]`` when there is no source text or the model
        answer contains no parseable JSON array.

    Raises:
        Whatever ``_openai_chat`` raises; parse problems never raise.
    """
    source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
    source_text = str(source_text).strip()
    if not source_text:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{source_text[:3500]}"
    )
    raw = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    # First try the whole answer; if that is not a JSON array, fall back to
    # the outermost "[...]" span in case the model wrapped its output.
    candidates = [raw]
    wrapped = re.search(r"\[[\s\S]*\]", raw)
    if wrapped:
        candidates.append(wrapped.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if isinstance(parsed, list):
            # Keep scalar entries only: stringifying null or nested
            # objects would create bogus tags such as "None".
            labels = [
                str(item)
                for item in parsed
                if isinstance(item, (str, int, float)) and not isinstance(item, bool)
            ]
            return _normalize_tags(labels, max_tags=max_tags)
    return []
|
||||
|
||||
|
||||
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Store normalized tags under ``generated_tags`` in an article's meta JSON.

    Args:
        meta_json: Existing meta document as a JSON string. Anything that is
            empty, unparseable or not a JSON object is treated as ``{}``.
        tags: Tags to store; they are passed through ``_normalize_tags``.

    Returns:
        The updated meta document serialized with ``ensure_ascii=False``.
    """
    decoded: Any = None
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None

    # Only a JSON object can carry the extra key; everything else starts fresh.
    document: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    document["generated_tags"] = _normalize_tags(tags)
    return json.dumps(document, ensure_ascii=False)
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None:
|
|||
paragraphs = []
|
||||
for match in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", section, re.IGNORECASE):
|
||||
text = _clean_text(match.group(1))
|
||||
if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE):
|
||||
if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE):
|
||||
paragraphs.append(text)
|
||||
|
||||
for match in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", section, re.IGNORECASE):
|
||||
|
|
@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None:
|
|||
return None
|
||||
|
||||
lines = [line.strip() for line in content_text.split("\n") if line.strip()]
|
||||
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
|
||||
marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
|
||||
for idx, line in enumerate(lines):
|
||||
if marker_re.search(line):
|
||||
chunk = [line]
|
||||
for nxt in lines[idx + 1 : idx + 6]:
|
||||
if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE):
|
||||
if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE):
|
||||
break
|
||||
chunk.append(nxt)
|
||||
return _clean_text("\n".join(chunk))
|
||||
|
||||
match = re.search(
|
||||
r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
|
||||
r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
|
||||
content_text,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import mimetypes
|
|||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
|
@ -25,7 +25,7 @@ def _wp_request(
|
|||
method: str,
|
||||
endpoint: str,
|
||||
payload: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
) -> Any:
|
||||
url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}"
|
||||
data = json.dumps(payload).encode("utf-8") if payload is not None else None
|
||||
req = Request(
|
||||
|
|
@ -41,8 +41,7 @@ def _wp_request(
|
|||
)
|
||||
with urlopen(req, timeout=20) as resp:
|
||||
raw = resp.read().decode("utf-8", errors="replace")
|
||||
parsed = json.loads(raw) if raw else {}
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
return json.loads(raw) if raw else {}
|
||||
|
||||
|
||||
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||
|
|
@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
|||
return selected if isinstance(selected, str) and selected.strip() else None
|
||||
|
||||
|
||||
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
|
||||
if not meta_json:
|
||||
return []
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return []
|
||||
if not isinstance(meta, dict):
|
||||
return []
|
||||
raw_tags = meta.get("generated_tags")
|
||||
if not isinstance(raw_tags, list):
|
||||
return []
|
||||
tags: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in raw_tags:
|
||||
value = str(item or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
tags.append(value)
|
||||
if len(tags) >= 12:
|
||||
break
|
||||
return tags
|
||||
|
||||
|
||||
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
|
||||
ids: list[int] = []
|
||||
seen: set[int] = set()
|
||||
for tag in tags:
|
||||
name = tag.strip()
|
||||
if not name:
|
||||
continue
|
||||
try:
|
||||
endpoint = f"tags?search={quote_plus(name)}&per_page=20"
|
||||
result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint)
|
||||
tag_id: int | None = None
|
||||
if isinstance(result, list):
|
||||
for row in result:
|
||||
if not isinstance(row, dict):
|
||||
continue
|
||||
row_name = str(row.get("name") or "")
|
||||
rid = int(row.get("id", 0) or 0)
|
||||
if rid <= 0:
|
||||
continue
|
||||
if row_name.casefold() == name.casefold():
|
||||
tag_id = rid
|
||||
break
|
||||
if tag_id is None:
|
||||
for row in result:
|
||||
if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0:
|
||||
tag_id = int(row.get("id", 0))
|
||||
break
|
||||
if tag_id is None:
|
||||
created = _wp_request(
|
||||
base_url=base_url,
|
||||
auth_header=auth_header,
|
||||
method="POST",
|
||||
endpoint="tags",
|
||||
payload={"name": name},
|
||||
)
|
||||
if isinstance(created, dict):
|
||||
rid = int(created.get("id", 0) or 0)
|
||||
if rid > 0:
|
||||
tag_id = rid
|
||||
if tag_id is not None and tag_id > 0 and tag_id not in seen:
|
||||
seen.add(tag_id)
|
||||
ids.append(tag_id)
|
||||
except Exception:
|
||||
continue
|
||||
return ids
|
||||
|
||||
|
||||
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
|
||||
headers = {
|
||||
"User-Agent": "rss-news-publisher/1.0",
|
||||
|
|
@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
payload["featured_media"] = featured_media_id
|
||||
|
||||
wp_post_id = article.get("wp_post_id")
|
||||
tag_ids = _resolve_wp_tag_ids(
|
||||
base_url=settings.wordpress_base_url,
|
||||
auth_header=auth,
|
||||
tags=_selected_tags_from_meta(article.get("meta_json")),
|
||||
)
|
||||
if tag_ids:
|
||||
payload["tags"] = tag_ids
|
||||
|
||||
if wp_post_id:
|
||||
result = _wp_request(
|
||||
base_url=settings.wordpress_base_url,
|
||||
|
|
@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
payload=payload,
|
||||
)
|
||||
|
||||
if not isinstance(result, dict):
|
||||
raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
|
||||
post_id = int(result.get("id", 0))
|
||||
if post_id <= 0:
|
||||
raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
|
||||
|
|
|
|||
|
|
@ -170,6 +170,9 @@
|
|||
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
|
||||
<button type="submit">Rewrite-Text speichern</button>
|
||||
</form>
|
||||
{% if article.meta.generated_tags %}
|
||||
<p><strong>Generierte Tags:</strong> {{ article.meta.generated_tags|join("; ") }}</p>
|
||||
{% endif %}
|
||||
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
|
||||
</section>
|
||||
|
||||
|
|
|
|||
|
|
@ -102,6 +102,15 @@
|
|||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Rewrite Run (geplante Artikel)</h2>
|
||||
<p class="subtle">Verarbeitet alle Artikel im Status <code>rewrite</code> und setzt sie auf <code>publish</code>.</p>
|
||||
<form method="post" action="/admin/rewrite/run" class="row">
|
||||
<input name="max_jobs" value="10" />
|
||||
<button type="submit">Rewrite Run starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Quellen + Policy</h2>
|
||||
<table>
|
||||
|
|
@ -269,6 +278,9 @@
|
|||
{% if a.summary %}
|
||||
<div><strong>Summary:</strong> {{ a.summary }}</div>
|
||||
{% endif %}
|
||||
{% if a.generated_tags %}
|
||||
<div><strong>Tags:</strong> {{ a.generated_tags|join("; ") }}</div>
|
||||
{% endif %}
|
||||
{% if a.content_raw %}
|
||||
<details>
|
||||
<summary>Volltext anzeigen</summary>
|
||||
|
|
|
|||
|
|
@ -271,6 +271,71 @@ class TestAdminUi(unittest.TestCase):
|
|||
self.assertIn("Neu", article.get("content_rewritten") or "")
|
||||
self.assertIsNone(article.get("wp_post_id"))
|
||||
|
||||
@patch("backend.app.admin_ui.generate_article_tags")
@patch("backend.app.admin_ui.rewrite_article_text")
def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None:
    """The batch endpoint approves a ``rewrite`` article and stores its tags."""
    # Both model-backed helpers are patched, so no external call is made.
    mock_rewrite_text.return_value = "<h2>Neu</h2><p>Text</p>"
    mock_tags.return_value = ["Rheingas", "Monheim"]

    # Fixture chain: source -> feed -> one article in status "rewrite".
    source_id = create_source(
        SourceCreate(
            name="Batch Source",
            base_url="https://example.org",
            terms_url="https://example.org/terms",
            license_name="cc-by",
            risk_level="green",
            is_enabled=True,
            notes=None,
            last_reviewed_at=None,
        )
    )
    feed_id = create_feed(
        FeedCreate(
            name="Batch Feed",
            url="https://example.org/feed.xml",
            source_id=source_id,
            is_enabled=True,
        )
    )
    article_id = upsert_article(
        ArticleUpsert(
            feed_id=feed_id,
            source_article_id="batch-1",
            source_hash="batch-hash-1",
            title="Batch Titel",
            source_url="https://example.org/batch",
            canonical_url="https://example.org/batch",
            published_at=None,
            author="Autor",
            summary="Summary",
            content_raw="Raw",
            content_rewritten=None,
            image_urls_json=None,
            press_contact=None,
            source_name_snapshot="Batch Source",
            source_terms_url_snapshot="https://example.org/terms",
            source_license_name_snapshot="cc-by",
            legal_checked=False,
            legal_checked_at=None,
            legal_note=None,
            wp_post_id=None,
            wp_post_url=None,
            publish_attempts=0,
            publish_last_error=None,
            published_to_wp_at=None,
            word_count=1,
            status="rewrite",
            meta_json="{}",
        )
    )
    # Log in first; the batch request is then sent without following the redirect.
    self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True)
    res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False)
    # The handler answers with a redirect.
    self.assertEqual(res.status_code, 303)
    article = get_article_by_id(article_id)
    self.assertIsNotNone(article)
    # A successful rewrite stores the article with status "approved" ...
    self.assertEqual(article.get("status"), "approved")
    # ... and writes the generated tags into the meta JSON.
    self.assertIn("generated_tags", article.get("meta_json", ""))
|
||||
|
||||
@patch("backend.app.admin_ui.urlopen")
|
||||
def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
|
||||
class _FakeHeaders:
|
||||
|
|
|
|||
|
|
@ -26,6 +26,25 @@ SAMPLE_HTML = """
|
|||
</html>
|
||||
"""
|
||||
|
||||
SAMPLE_HTML_AGENTUR = """
|
||||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="Demo Meldung Agentur" />
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Inhalt der Meldung.</p>
|
||||
<h3>Agentur</h3>
|
||||
<p>Agenturname GmbH</p>
|
||||
<p>presse@agentur.example</p>
|
||||
<p>Original-Content von Beispiel</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class _FakeHeaders:
|
||||
@staticmethod
|
||||
|
|
@ -64,6 +83,14 @@ class TestSourceExtraction(unittest.TestCase):
|
|||
self.assertIn("Pressekontakt", extracted.press_contact or "")
|
||||
self.assertIsNone(extracted.extraction_error)
|
||||
|
||||
@patch("backend.app.source_extraction.urlopen")
def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None:
    """An "Agentur" heading and the paragraphs after it end up in press_contact."""
    mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR)
    extracted = extract_article("https://www.presseportal.de/pm/155103/6210401")
    # Heading, agency name and e-mail address must all be part of the block.
    for expected in ("Agentur", "Agenturname", "presse@agentur.example"):
        self.assertIn(expected, extracted.press_contact or "")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
|
|
@ -80,6 +80,40 @@ class TestWordpressPublish(unittest.TestCase):
|
|||
self.assertNotIn("Pressekontakt", content)
|
||||
self.assertIn("eigentliche Text", content)
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None:
    """Publishing reuses an existing tag, creates a missing one and sends both IDs."""
    created_tag_ids = {"Gasflasche": 12}

    def _fake_wp_request(**kwargs):
        # Minimal WordPress stand-in: tag search, tag creation, post creation.
        endpoint = kwargs.get("endpoint", "")
        method = kwargs.get("method", "")
        if method == "GET" and endpoint.startswith("tags?search="):
            return [{"id": 11, "name": "Rheingas"}] if "Rheingas" in endpoint else []
        if method == "POST" and endpoint == "tags":
            name = (kwargs.get("payload") or {}).get("name")
            return {"id": created_tag_ids.get(name, 13), "name": str(name)}
        if method == "POST" and endpoint == "posts":
            return {"id": 900, "link": "https://example.org/?p=900"}
        return {}

    mock_wp_request.side_effect = _fake_wp_request
    article = {
        "title": "Tag Test",
        "content_raw": "Inhalt",
        "source_url": "https://example.com/source",
        "canonical_url": "https://example.com/source",
        "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}',
    }

    post_id, _ = publish_article_draft(article)

    self.assertEqual(post_id, 900)
    # Exactly one post request, carrying the found (11) and the created (12) tag ID.
    post_calls = [
        recorded
        for recorded in mock_wp_request.call_args_list
        if recorded.kwargs.get("endpoint") == "posts"
    ]
    self.assertEqual(len(post_calls), 1)
    sent_payload = post_calls[0].kwargs.get("payload", {})
    self.assertEqual(sent_payload.get("tags"), [11, 12])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue