diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index d3ca53e..689efce 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -20,7 +20,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance -from .rewrite import rewrite_article_text +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text from .repositories import ( FeedCreate, FeedUpdate, @@ -373,6 +373,7 @@ def _upsert_article_from_existing( publish_attempts: int | object = _UNSET, publish_last_error: str | None | object = _UNSET, published_to_wp_at: str | None | object = _UNSET, + meta_json: str | None | object = _UNSET, ) -> None: rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten upsert_article( @@ -403,7 +404,7 @@ def _upsert_article_from_existing( published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at, word_count=len(str(rewritten or "").split()), status=article.get("status") if status is None else status, - meta_json=article.get("meta_json"), + meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json, ) ) @@ -493,6 +494,8 @@ def admin_dashboard(request: Request): article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) article["status_ui"] = internal_to_ui_status(article.get("status")) + tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else [] + article["generated_tags"] = [str(t) for t in tags if t] return templates.TemplateResponse( request, @@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int): return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error") try: rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) except Exception as exc: return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") - _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved") + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") +@router.post("/admin/rewrite/run") +def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + limit = max(1, min(int(max_jobs), 100)) + except Exception: + limit = 10 + planned = list_articles(limit=limit, status_filter="rewrite") + processed = 0 + success = 0 + failed = 0 + for article in planned: + processed += 1 + try: + rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) + success += 1 + except Exception: + failed += 1 + return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}") + + @router.post("/admin/articles/{article_id}/rewrite-save") def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)): user = _admin_user(request) diff --git a/backend/app/main.py b/backend/app/main.py index 4dcee28..b0bcf2a 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -18,7 +18,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy, is_source_allowed from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance -from .rewrite import rewrite_article_text +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text from .repositories import ( ArticleUpsert, FeedCreate, @@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'") rewritten = rewrite_article_text(article) + tags: list[str] = [] + try: + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception: + tags = [] + merged_meta = merge_generated_tags(article.get("meta_json"), tags) # upsert via status update + existing fields by lightweight path: repo_upsert_article( ArticleUpsert( @@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut published_to_wp_at=article.get("published_to_wp_at"), word_count=len(rewritten.split()), status="approved", - meta_json=article.get("meta_json"), + meta_json=merged_meta, ) ) - return {"ok": True, "id": article_id, "status": "publish"} + return {"ok": True, "id": article_id, "status": "publish", "tags": tags} @app.post("/api/articles/{article_id}/legal-review") diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py index 8c313ad..759fac9 100644 --- a/backend/app/rewrite.py +++ b/backend/app/rewrite.py @@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str: return joined -def rewrite_article_text(article: dict[str, Any]) -> str: +def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for raw in tags: + value = re.sub(r"\s+", " ", str(raw or "").strip()) + value = re.sub(r"^[#\-•\s]+", "", value) + value = re.sub(r"[;,.:\s]+$", "", value) + if not value: + continue + if len(value) < 2 or len(value) > 40: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + out.append(value) + if len(out) >= max_tags: + break + return out + + +def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str: settings = get_settings() api_key = settings.openai_api_key if not api_key: raise RuntimeError("OPENAI_API_KEY fehlt") - source_text = _sanitize_source_text(article.get("content_raw") or "") - if not source_text: - source_text = (article.get("summary") or "").strip() - if not source_text: - raise RuntimeError("Kein Quelltext für Rewrite verfügbar") - - title = (article.get("title") or "").strip() - prompt = ( - "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " - "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " - "ohne Pressekontakt, ohne Quellenblock. " - "Nutze klare Absätze und Zwischenüberschriften in HTML (
,
,
]*>([\s\S]*?)
", section, re.IGNORECASE): @@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None: return None lines = [line.strip() for line in content_text.split("\n") if line.strip()] - marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE) + marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE) for idx, line in enumerate(lines): if marker_re.search(line): chunk = [line] for nxt in lines[idx + 1 : idx + 6]: - if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE): + if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE): break chunk.append(nxt) return _clean_text("\n".join(chunk)) match = re.search( - r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)", + r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)", content_text, re.IGNORECASE, ) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index c257747..8da5fc5 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -7,7 +7,7 @@ import mimetypes from pathlib import Path import re from typing import Any -from urllib.parse import urlparse +from urllib.parse import quote_plus, urlparse from urllib.request import Request, urlopen from .config import get_settings @@ -25,7 +25,7 @@ def _wp_request( method: str, endpoint: str, payload: dict[str, Any] | None = None, -) -> dict[str, Any]: +) -> Any: url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}" data = json.dumps(payload).encode("utf-8") if payload is not None else None req = Request( @@ -41,8 +41,7 @@ def _wp_request( ) with urlopen(req, timeout=20) as resp: raw = resp.read().decode("utf-8", errors="replace") - parsed = json.loads(raw) if raw else {} - return parsed if isinstance(parsed, dict) else {} + return json.loads(raw) if raw else {} def _selected_image_url_from_meta(meta_json: str | None) -> str | None: @@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None: return selected if isinstance(selected, str) and selected.strip() else None +def _selected_tags_from_meta(meta_json: str | None) -> list[str]: + if not meta_json: + return [] + try: + meta = json.loads(meta_json) + except Exception: + return [] + if not isinstance(meta, dict): + return [] + raw_tags = meta.get("generated_tags") + if not isinstance(raw_tags, list): + return [] + tags: list[str] = [] + seen: set[str] = set() + for item in raw_tags: + value = str(item or "").strip() + if not value: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + tags.append(value) + if len(tags) >= 12: + break + return tags + + +def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]: + ids: list[int] = [] + seen: set[int] = set() + for tag in tags: + name = tag.strip() + if not name: + continue + try: + endpoint = f"tags?search={quote_plus(name)}&per_page=20" + result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint) + tag_id: int | None = None + if isinstance(result, list): + for row in result: + if not isinstance(row, dict): + continue + row_name = str(row.get("name") or "") + rid = int(row.get("id", 0) or 0) + if rid <= 0: + continue + if row_name.casefold() == name.casefold(): + tag_id = rid + break + if tag_id is None: + for row in result: + if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0: + tag_id = int(row.get("id", 0)) + break + if tag_id is None: + created = _wp_request( + base_url=base_url, + auth_header=auth_header, + method="POST", + endpoint="tags", + payload={"name": name}, + ) + if isinstance(created, dict): + rid = int(created.get("id", 0) or 0) + if rid > 0: + tag_id = rid + if tag_id is not None and tag_id > 0 and tag_id not in seen: + seen.add(tag_id) + ids.append(tag_id) + except Exception: + continue + return ids + + def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: headers = { "User-Agent": "rss-news-publisher/1.0", @@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: payload["featured_media"] = featured_media_id wp_post_id = article.get("wp_post_id") + tag_ids = _resolve_wp_tag_ids( + base_url=settings.wordpress_base_url, + auth_header=auth, + tags=_selected_tags_from_meta(article.get("meta_json")), + ) + if tag_ids: + payload["tags"] = tag_ids + if wp_post_id: result = _wp_request( base_url=settings.wordpress_base_url, @@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: payload=payload, ) + if not isinstance(result, dict): + raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}") post_id = int(result.get("id", 0)) if post_id <= 0: raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}") diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 992fe20..1c16658 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -170,6 +170,9 @@ + {% if article.meta.generated_tags %} +Generierte Tags: {{ article.meta.generated_tags|join("; ") }}
+ {% endif %}Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.
diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index f24b76e..15f3daf 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -102,6 +102,15 @@ +Verarbeitet alle Artikel im Status rewrite und setzt sie auf publish.