From 2c331d683b6e5a241c6b4d6d6e54f68846fd6caf Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 09:46:44 +0100 Subject: [PATCH 01/54] feat: rebuild rss-news backend, admin ui, and legal extraction pipeline --- .github/workflows/deploy.yml | 6 + .github/workflows/test.yml | 39 +++ CHANGELOG.md | 38 ++- README.md | 114 +++---- backend/.env.example | 10 + backend/README.md | 82 +++++ backend/__init__.py | 1 + backend/app/__init__.py | 1 + backend/app/admin_ui.py | 265 +++++++++++++++ backend/app/auth.py | 31 ++ backend/app/config.py | 29 ++ backend/app/db.py | 138 ++++++++ backend/app/ingestion.py | 253 ++++++++++++++ backend/app/main.py | 404 +++++++++++++++++++++++ backend/app/policy.py | 35 ++ backend/app/repositories.py | 416 ++++++++++++++++++++++++ backend/app/source_extraction.py | 257 +++++++++++++++ backend/data/rss_news.db | Bin 0 -> 94208 bytes backend/requirements-test.txt | 3 + backend/requirements.txt | 8 + backend/static/admin.css | 189 +++++++++++ backend/templates/admin_dashboard.html | 235 +++++++++++++ backend/templates/admin_login.html | 27 ++ backend/tests/__init__.py | 1 + backend/tests/test_admin_ui.py | 65 ++++ backend/tests/test_api_auth.py | 77 +++++ backend/tests/test_article_workflow.py | 95 ++++++ backend/tests/test_db_repositories.py | 119 +++++++ backend/tests/test_ingestion.py | 122 +++++++ backend/tests/test_source_extraction.py | 69 ++++ docs/PROJECT_PLAN.md | 67 ++++ docs/SOURCE_POLICY.md | 81 +++++ docs/TODO.md | 33 ++ docs/wiki/Architektur.md | 29 ++ docs/wiki/Deployment.md | 20 ++ docs/wiki/Home.md | 19 ++ docs/wiki/Operations-Runbook.md | 23 ++ docs/wiki/Project-Board.md | 28 ++ docs/wiki/Recht-Quellen.md | 35 ++ docs/wiki/Roadmap.md | 19 ++ docs/wiki/Security-Auth.md | 16 + pytest.ini | 4 + scripts/smoke_backend.sh | 33 ++ 43 files changed, 3463 insertions(+), 73 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 backend/.env.example create mode 100644 backend/README.md create mode 100644 
backend/__init__.py create mode 100644 backend/app/__init__.py create mode 100644 backend/app/admin_ui.py create mode 100644 backend/app/auth.py create mode 100644 backend/app/config.py create mode 100644 backend/app/db.py create mode 100644 backend/app/ingestion.py create mode 100644 backend/app/main.py create mode 100644 backend/app/policy.py create mode 100644 backend/app/repositories.py create mode 100644 backend/app/source_extraction.py create mode 100644 backend/data/rss_news.db create mode 100644 backend/requirements-test.txt create mode 100644 backend/requirements.txt create mode 100644 backend/static/admin.css create mode 100644 backend/templates/admin_dashboard.html create mode 100644 backend/templates/admin_login.html create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/test_admin_ui.py create mode 100644 backend/tests/test_api_auth.py create mode 100644 backend/tests/test_article_workflow.py create mode 100644 backend/tests/test_db_repositories.py create mode 100644 backend/tests/test_ingestion.py create mode 100644 backend/tests/test_source_extraction.py create mode 100644 docs/PROJECT_PLAN.md create mode 100644 docs/SOURCE_POLICY.md create mode 100644 docs/TODO.md create mode 100644 docs/wiki/Architektur.md create mode 100644 docs/wiki/Deployment.md create mode 100644 docs/wiki/Home.md create mode 100644 docs/wiki/Operations-Runbook.md create mode 100644 docs/wiki/Project-Board.md create mode 100644 docs/wiki/Recht-Quellen.md create mode 100644 docs/wiki/Roadmap.md create mode 100644 docs/wiki/Security-Auth.md create mode 100644 pytest.ini create mode 100755 scripts/smoke_backend.sh diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 5d55808..4ac1c42 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,9 +19,15 @@ jobs: username: oliver key: ${{ secrets.HETZNER_SSH_KEY }} port: 22 + envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD script: | cd rss-news git pull origin main 
source .venv/bin/activate pip install -r requirements.txt + pip install -r backend/requirements.txt || true sudo systemctl restart rss-app + BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh + env: + APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }} + APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..1d627db --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,39 @@ +name: Backend Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + backend-tests: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r backend/requirements.txt + pip install -r backend/requirements-test.txt + + - name: Run tests with coverage + env: + APP_DB_PATH: /tmp/rss_news_test.db + run: | + pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + with: + name: coverage-xml + path: coverage.xml diff --git a/CHANGELOG.md b/CHANGELOG.md index fa80967..66b7237 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,42 @@ -## [1.7.1] - 2025-08-28 +## [1.7.1] - 2025-08-24 -- Beschreibung... +### ✨ Security angepasst + - alle Credentials in die .env Datei verschoben + - beim Start der App werden die Credentials geprüft und beim fehlen entsprechende Meldungen ausgegeben + +--- ## [1.7.0] - 2025-08-24 -- Beschreibung... 
+### Multi-Select & Massenoperationen: + - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich + - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons + - ✅ Massenoperationen für ausgewählte Artikel: + - Bulk Status-Änderung für mehrere Artikel gleichzeitig + - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung + - Bulk WordPress-Upload nur für "Process"-Artikel + - Bulk Papierkorb-Funktion + +### Schnellaktionen Integration: + - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar + - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert + - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln) + +### 🔧 Verbesserungen + + - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration + - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig + - Feedback: Detaillierte Statusmeldungen bei Massenoperationen + - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl + +### 🏗️ Technische Änderungen + + - Session State Erweiterung um selected_articles Set + - Neue Bulk-Operation-Funktionen in app.py:326-467 + - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design + - Integration bestehender WordPress-Upload und Rewrite-Funktionen + +--- ## [1.6.3] - 2025-08-18 diff --git a/README.md b/README.md index 6846a41..b3c2b4a 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,63 @@ -# 📰 RSS News Bot +# rss-news (Rebuild) -Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung. +`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut. 
-![Version](https://img.shields.io/badge/version-1.5.2-blue) -![License](https://img.shields.io/badge/license-MIT-green) -![Python](https://img.shields.io/badge/python-3.10+-yellow) -![Streamlit](https://img.shields.io/badge/built%20with-Streamlit-ff4b4b) +Aktueller Stand: +- Alte Streamlit-App wird nicht produktiv genutzt. +- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet. +- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt. ---- +## Ziele +- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln +- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen) +- Zuverlaessige Automatisierung auf Hetzner +- Publikation nach WordPress (IONOS aktuell, spaeter offen) +- Zugriff nur nach Login (zunaechst User/Password) -## 🚀 Features +## Architektur-Richtung (MVP) +- Backend: `Python + FastAPI` +- Jobs: Queue-Worker (z. B. Redis + RQ/Celery) +- Daten: SQLite fuer MVP, spaeter optional PostgreSQL +- Auth: Session-Login mit einem Admin-User +- Publishing: WordPress REST API (Status zunaechst `pending`) -- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren) -- ✍️ **Artikel automatisch umschreiben** mit GPT-4 -- 🏷️ **Tags automatisch generieren** -- 🖼️ **Bilder aus Originalartikeln extrahieren** -- 🪄 **Optionales DALL·E-Bild generieren** -- 🔧 **Bearbeiten von Bildmetadaten** -- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)** -- 📜 **Log-Viewer-Seite integriert** -- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet** -- 📋 Artikeltabelle mit Status-Filter -- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern -- 🪄 Button für KI-Bildgenerierung +Details: `docs/PROJECT_PLAN.md` +## Projektsteuerung +- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1` +- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen. +- Wiki-Struktur liegt unter `docs/wiki/`. 
---- +## Dokumentation +- Projektplan: `docs/PROJECT_PLAN.md` +- ToDo-Liste: `docs/TODO.md` +- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md` +- Wiki Home: `docs/wiki/Home.md` -## 🧱 Projektstruktur - -ss-news/ -├── app.py # Haupt-UI mit Streamlit -├── main.py # Logik für Feed-Import und Verarbeitung -├── utils/ -│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren -│ └── dalle_generator.py # DALL·E-Integration (KI-Bild) -├── pages/ -│ └── log_viewer.py # UI zur Anzeige der Logs -├── data/ -│ └── articles.json # Gespeicherte Artikel -│ └── feeds.json # Gespeicherte Feed-URLs -├── logs/ -│ └── rss_tool.log # Logging der Verarbeitung -├── versioning.py # CLI-Tool zur Versionierung & Release -├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases -├── version.py # Aktuelle Version -└── CHANGELOG.md # Änderungsprotokoll - - ---- - -## ⚙️ Installation +## Lokale Entwicklung (Legacy-Code) +Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden: ```bash -git clone https://github.com/OliverGiertz/rss-news.git -cd rss-news python -m venv .venv source .venv/bin/activate pip install -r requirements.txt -``` - ---- - -## Update -Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca - -```bash -bash update.sh -``` - - -## ▶️ Starten der App - streamlit run app.py +``` ---- +Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt. -## 🔐 Konfiguration (.env) +## Deployment-Zielbild +- Betrieb auf Hetzner +- Reverse Proxy via CloudPanel/Nginx +- Produktive Domain: `news.vanityontour.de` +- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de` -Lege eine `.env` im Projekt an (siehe `.env.example`). Erforderliche Variablen: +## Sicherheit +- Keine Secrets im Repository +- `.env` lokal/auf Server, nie committen +- Auth-Pflicht fuer die neue WebApp +- spaeter optional: Passkeys/WebAuthn -- `WP_BASE_URL`: Basis-URL deiner WordPress-Seite (z. B. 
https://example.com) -- Authentifizierung (eine Option wählen): - - `WP_AUTH_BASE64`: Bevorzugt. Base64 von `username:application_password` - - oder `WP_USERNAME` und `WP_PASSWORD`: Benutzer + Anwendungspasswort -- Optional: `OPENAI_API_KEY` für das Umschreiben von Artikeln +## Rechtlicher Hinweis +Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig. -Hinweis: Der Code liest ausschließlich aus `.env`. Es gibt keine hartkodierten Standard-Credentials. diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..74e9c4b --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,10 @@ +APP_ENV=development +APP_NAME=rss-news-backend +APP_SECRET_KEY=replace-with-a-long-random-secret +APP_DB_PATH=backend/data/rss_news.db + +APP_ADMIN_USERNAME=admin +APP_ADMIN_PASSWORD=change-me + +SESSION_COOKIE_NAME=rss_news_session +SESSION_MAX_AGE_SECONDS=28800 diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..7d64a65 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,82 @@ +# Backend Skeleton (FastAPI) + +Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`. + +## Start (lokal) + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r backend/requirements.txt +uvicorn backend.app.main:app --reload --port 8501 +``` + +## Admin UI +- Login: `http://127.0.0.1:8501/admin/login` +- Dashboard: `http://127.0.0.1:8501/admin/dashboard` + +## Environment +- Datei: `backend/.env` +- Vorlage: `backend/.env.example` + +## Endpoints +- `GET /health` - Healthcheck +- `POST /auth/login` - Login mit Admin-User +- `POST /auth/logout` - Logout +- `GET /auth/me` - Aktiver User +- `GET /api/protected` - Geschuetzter Test-Endpoint +- `GET /api/pipeline/status` - Basisstatus inkl. 
Datensatzzaehler +- `GET /api/sources` - Quellenliste +- `POST /api/sources` - Quelle anlegen +- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle +- `GET /api/feeds` - Feedliste +- `POST /api/feeds` - Feed anlegen +- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed +- `GET /api/runs` - Import-/Job-Runs anzeigen +- `GET /api/runs/{run_id}` - Detailansicht eines Runs +- `POST /api/runs` - Run starten +- `POST /api/runs/{run_id}/finish` - Run abschliessen +- `GET /api/articles` - Artikel anzeigen +- `GET /api/articles/{article_id}` - Artikeldetail +- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren +- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln +- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject) +- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed) + +## Datenbank +- SQLite-Datei unter `backend/data/rss_news.db` +- Tabellen werden beim App-Start initialisiert. +- Tabellen: `sources`, `feeds`, `runs`, `articles` +- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash` + +## Policy-Enforcement +- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist. +- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`. +- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert. + +## Review-Workflow +- Statuskette: `new -> review -> approved -> published` +- Ablehnung im Review setzt auf `rewrite` +- Ungueltige Statuswechsel werden per API blockiert + +## Verifikation +```bash +python -m unittest backend.tests.test_db_repositories +python -m unittest backend.tests.test_ingestion +python -m unittest backend.tests.test_api_auth +``` + +## CI / Online-Auswertung +- GitHub Actions Workflow: `.github/workflows/test.yml` +- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus. 
+ +## Hetzner Smoketest +```bash +BASE_URL="https://news.vanityontour.de" \ +APP_ADMIN_USERNAME="admin" \ +APP_ADMIN_PASSWORD="..." \ +bash scripts/smoke_backend.sh +``` + +## Hinweis +Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen. diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..3623851 --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1 @@ +"""Backend package for rss-news rebuild.""" diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..18b665e --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1 @@ +"""Application package.""" diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py new file mode 100644 index 0000000..9587664 --- /dev/null +++ b/backend/app/admin_ui.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import json +from pathlib import Path +from urllib.parse import urlencode + +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.templating import Jinja2Templates + +from .auth import create_session_token, verify_credentials, verify_session_token +from .config import get_settings +from .ingestion import run_ingestion +from .policy import evaluate_source_policy +from .repositories import ( + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + list_articles, + list_feeds, + list_runs, + list_sources, + update_article_status, +) + +settings = get_settings() +router = APIRouter(tags=["admin-ui"]) +templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates")) +ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { + "new": ("review", "rewrite", "error"), + "rewrite": ("review", "error"), + "review": ("approved", "rewrite", "error"), + "approved": ("published", "error"), + "published": ("error",), + "error": ("review", "rewrite"), +} + + +def _admin_user(request: Request) -> str | None: + token 
= request.cookies.get(settings.session_cookie_name) + if not token: + return None + return verify_session_token(token) + + +def _to_optional_int(raw: str | None) -> int | None: + if raw is None: + return None + value = raw.strip() + if value == "": + return None + return int(value) + + +def _dashboard_redirect( + *, + msg: str | None = None, + msg_type: str = "success", + status_filter: str | None = None, +) -> RedirectResponse: + query: dict[str, str] = {} + if msg: + query["msg"] = msg + query["type"] = msg_type + if status_filter: + query["status_filter"] = status_filter + suffix = f"?{urlencode(query)}" if query else "" + return RedirectResponse(url=f"/admin/dashboard{suffix}", status_code=303) + + +def _parse_meta_json(raw: str | None) -> dict: + if not raw: + return {} + try: + parsed = json.loads(raw) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + + +@router.get("/admin", response_class=HTMLResponse) +def admin_index(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + return RedirectResponse(url="/admin/dashboard", status_code=303) + + +@router.get("/admin/login", response_class=HTMLResponse) +def admin_login_page(request: Request): + return templates.TemplateResponse( + request, + "admin_login.html", + {"request": request, "title": "Admin Login", "error": request.query_params.get("error")}, + ) + + +@router.post("/admin/login") +def admin_login(request: Request, username: str = Form(...), password: str = Form(...)): + if not verify_credentials(username, password): + return RedirectResponse(url="/admin/login?error=1", status_code=303) + + token = create_session_token(username) + response = RedirectResponse(url="/admin/dashboard", status_code=303) + response.set_cookie( + key=settings.session_cookie_name, + value=token, + max_age=settings.session_max_age_seconds, + httponly=True, + secure=False, + samesite="lax", + ) + return response + + 
+@router.post("/admin/logout") +def admin_logout(): + response = RedirectResponse(url="/admin/login", status_code=303) + response.delete_cookie(settings.session_cookie_name) + return response + + +@router.get("/admin/dashboard", response_class=HTMLResponse) +def admin_dashboard(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + sources = list_sources() + source_policy = {s["id"]: evaluate_source_policy(s) for s in sources} + feeds = list_feeds() + runs = list_runs(limit=30) + status_filter = request.query_params.get("status_filter") + if status_filter in {"new", "rewrite", "review", "approved", "published", "error"}: + articles = list_articles(limit=100, status_filter=status_filter) + else: + status_filter = "" + articles = list_articles(limit=100) + for article in articles: + meta = _parse_meta_json(article.get("meta_json")) + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + article["meta"] = meta + article["extracted_images"] = extraction.get("images") if isinstance(extraction.get("images"), list) else [] + article["press_contact"] = extraction.get("press_contact") if isinstance(extraction.get("press_contact"), str) else None + article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None + + return templates.TemplateResponse( + request, + "admin_dashboard.html", + { + "request": request, + "title": "Admin Dashboard", + "user": user, + "sources": sources, + "source_policy": source_policy, + "feeds": feeds, + "runs": runs, + "articles": articles, + "status_options": ["new", "rewrite", "review", "approved", "published", "error"], + "allowed_transitions": ALLOWED_TRANSITIONS, + "status_filter": status_filter, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.post("/admin/sources/create") +def 
admin_create_source( + request: Request, + name: str = Form(...), + base_url: str = Form(""), + terms_url: str = Form(""), + license_name: str = Form(""), + risk_level: str = Form("yellow"), + last_reviewed_at: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_source( + SourceCreate( + name=name, + base_url=base_url or None, + terms_url=terms_url or None, + license_name=license_name or None, + risk_level=risk_level, + is_enabled=True, + notes=None, + last_reviewed_at=last_reviewed_at or None, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Quelle gespeichert") + + +@router.post("/admin/feeds/create") +def admin_create_feed( + request: Request, + name: str = Form(...), + url: str = Form(...), + source_id: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_feed( + FeedCreate( + name=name, + url=url, + source_id=_to_optional_int(source_id), + is_enabled=True, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Feed gespeichert") + + +@router.post("/admin/ingestion/run") +def admin_run_ingestion(request: Request, feed_id: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + stats = run_ingestion(feed_id=_to_optional_int(feed_id)) + except Exception as exc: + return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error") + return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}") + + +@router.post("/admin/articles/{article_id}/review") +def admin_review_article(request: Request, article_id: int, decision: str = 
Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if article and article.get("status") == "review" and decision in {"approve", "reject"}: + target = "approved" if decision == "approve" else "rewrite" + update_article_status(article_id, target, actor=user, note=note or None, decision=decision) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {decision}") + return _dashboard_redirect(msg=f"Review-Aktion ungueltig fuer Artikel #{article_id}", msg_type="error") + + +@router.post("/admin/articles/{article_id}/transition") +def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if article: + current = article.get("status") + if target_status in ALLOWED_TRANSITIONS.get(current, ()): + update_article_status(article_id, target_status, actor=user, note=note or None) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}") + return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") diff --git a/backend/app/auth.py b/backend/app/auth.py new file mode 100644 index 0000000..188397f --- /dev/null +++ b/backend/app/auth.py @@ -0,0 +1,31 @@ +import hmac +from typing import Optional + +from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired + +from .config import get_settings + + +def _serializer() -> URLSafeTimedSerializer: + settings = get_settings() + return URLSafeTimedSerializer(settings.app_secret_key, salt="rss-news-session") + + +def verify_credentials(username: str, password: str) -> bool: + settings = get_settings() + user_ok = hmac.compare_digest(username, settings.app_admin_username) + pw_ok = 
hmac.compare_digest(password, settings.app_admin_password) + return user_ok and pw_ok + + +def create_session_token(username: str) -> str: + return _serializer().dumps({"username": username}) + + +def verify_session_token(token: str) -> Optional[str]: + settings = get_settings() + try: + payload = _serializer().loads(token, max_age=settings.session_max_age_seconds) + except (BadSignature, SignatureExpired): + return None + return payload.get("username") diff --git a/backend/app/config.py b/backend/app/config.py new file mode 100644 index 0000000..f32b8c4 --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,29 @@ +from functools import lru_cache + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + # Prefer backend-specific env file to avoid collisions with legacy root .env + model_config = SettingsConfigDict( + env_file=("backend/.env", ".env"), + env_file_encoding="utf-8", + extra="ignore", + ) + + app_env: str = "development" + app_name: str = "rss-news-backend" + app_secret_key: str = "replace-with-a-long-random-secret" + + app_admin_username: str = "admin" + app_admin_password: str = "change-me" + + session_cookie_name: str = "rss_news_session" + session_max_age_seconds: int = 28800 + + app_db_path: str = "backend/data/rss_news.db" + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + return Settings() diff --git a/backend/app/db.py b/backend/app/db.py new file mode 100644 index 0000000..c914044 --- /dev/null +++ b/backend/app/db.py @@ -0,0 +1,138 @@ +import sqlite3 +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Iterator + +from .config import get_settings + + +def _db_path() -> Path: + settings = get_settings() + path = Path(settings.app_db_path) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + +@contextmanager +def get_conn() -> Iterator[sqlite3.Connection]: + conn = sqlite3.connect(_db_path()) + conn.row_factory = sqlite3.Row + 
conn.execute("PRAGMA foreign_keys=ON;") + try: + yield conn + conn.commit() + finally: + conn.close() + + +def init_db() -> None: + with get_conn() as conn: + conn.executescript( + """ + PRAGMA journal_mode=WAL; + + CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + base_url TEXT, + terms_url TEXT, + license_name TEXT, + risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')), + is_enabled INTEGER NOT NULL DEFAULT 0, + notes TEXT, + last_reviewed_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS feeds ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER, + name TEXT NOT NULL, + url TEXT NOT NULL UNIQUE, + is_enabled INTEGER NOT NULL DEFAULT 1, + etag TEXT, + last_modified TEXT, + last_checked_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL + ); + + CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_type TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + started_at TEXT NOT NULL DEFAULT (datetime('now')), + finished_at TEXT, + details TEXT + ); + + CREATE TABLE IF NOT EXISTS articles ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), + meta_json TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES 
feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id); + CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + + CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at + AFTER UPDATE ON sources + FOR EACH ROW + BEGIN + UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at + AFTER UPDATE ON feeds + FOR EACH ROW + BEGIN + UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + """ + ) + + # Lightweight migration for existing DBs created before source_hash was introduced. 
+ existing_columns = { + row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() + } + if "source_hash" not in existing_columns: + conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT") + + +def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]: + return [dict(r) for r in rows] diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py new file mode 100644 index 0000000..87e44c2 --- /dev/null +++ b/backend/app/ingestion.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timezone +import hashlib +import json +import time +from typing import Any + +import feedparser + +from .policy import evaluate_source_policy +from .repositories import ( + ArticleUpsert, + RunCreate, + create_run, + finish_run, + get_feed_by_id, + list_enabled_feeds, + update_feed_fetch_state, + upsert_article, +) +from .source_extraction import extract_article, extracted_article_to_meta + + +@dataclass(frozen=True) +class IngestionStats: + run_id: int + feeds_processed: int + entries_seen: int + articles_upserted: int + status: str + message: str + + +MAX_FEED_FETCH_RETRIES = 3 + + +def _entry_published_iso(entry: dict) -> str | None: + published = entry.get("published_parsed") or entry.get("updated_parsed") + if not published: + return None + return datetime(*published[:6], tzinfo=timezone.utc).isoformat() + + +def _entry_text(entry: dict) -> tuple[str, str]: + summary = entry.get("summary", "") or "" + content = "" + if entry.get("content") and isinstance(entry.get("content"), list): + first = entry["content"][0] + content = first.get("value", "") if isinstance(first, dict) else "" + if not content: + content = summary + return summary, content + + +def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str: + source_id = entry.get("id") or entry.get("guid") or "" + published = _entry_published_iso(entry) or "" + fingerprint = 
def _fetch_feed_with_retries(feed: dict[str, Any]) -> tuple[object | None, str | None]:
    """Fetch and parse one feed, retrying up to ``MAX_FEED_FETCH_RETRIES`` times.

    Returns:
        ``(parsed, None)`` on success, or ``(None, error_message)`` once every
        attempt has failed.
    """
    last_error: str | None = None
    for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
        try:
            parsed = feedparser.parse(
                feed["url"],
                etag=feed.get("etag"),
                modified=feed.get("last_modified"),
            )
        except Exception as exc:  # boundary: any crash counts as one failed attempt
            last_error = str(exc)
        else:
            # feedparser usually reports network errors through the ``bozo`` flag
            # instead of raising. A result with bozo set, no HTTP status and no
            # entries is treated as a failed fetch so that it is retried and not
            # recorded as a successful, empty feed.
            # NOTE(review): relies on feedparser omitting ``status`` when no HTTP
            # response was received - confirm against the pinned feedparser version.
            fetch_failed = (
                bool(_parsed_get(parsed, "bozo"))
                and _parsed_get(parsed, "status") is None
                and not _parsed_get(parsed, "entries", [])
            )
            if not fetch_failed:
                return parsed, None
            bozo_exception = _parsed_get(parsed, "bozo_exception")
            last_error = str(bozo_exception) if bozo_exception is not None else "feed could not be fetched"

        if attempt < MAX_FEED_FETCH_RETRIES:
            time.sleep(0.5 * attempt)  # linear back-off between attempts

    return None, last_error


def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Ingest entries from one feed or from all enabled feeds.

    A run record is created first and finished with either ``success`` or
    ``failed``. For every feed the source policy is evaluated; feeds with policy
    issues are recorded as ``blocked`` and skipped. Remaining feeds are fetched
    (with retries), their HTTP validators are persisted, and every entry that
    has a link is extracted and upserted as an article with status ``new``.

    Args:
        feed_id: Restrict the run to this feed (only if it is enabled). ``None``
            processes all enabled feeds.

    Returns:
        Aggregated counters plus run id, final status and message. Exceptions
        are not propagated; they are reported through ``status="failed"``.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    feed_results: list[dict[str, object]] = []

    try:
        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1

            source_snapshot = {
                "id": feed.get("source_id"),
                "name": feed.get("source_name"),
                "base_url": feed.get("source_base_url"),
                "terms_url": feed.get("source_terms_url"),
                "license_name": feed.get("source_license_name"),
                "risk_level": feed.get("source_risk_level"),
                "last_reviewed_at": feed.get("source_last_reviewed_at"),
                "is_enabled": feed.get("source_is_enabled"),
            }
            policy_issues = evaluate_source_policy(source_snapshot)
            if policy_issues:
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "blocked",
                        "policy_issues": policy_issues,
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            parsed, feed_error = _fetch_feed_with_retries(feed)
            if parsed is None:
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            new_etag = parsed_etag if isinstance(parsed_etag, str) else None
            new_modified = parsed_modified if isinstance(parsed_modified, str) else None
            if _parsed_get(parsed, "status") == 304:
                # "Not Modified" confirms the stored validators are still current;
                # keep them when the response does not repeat them.
                new_etag = new_etag or feed.get("etag")
                new_modified = new_modified or feed.get("last_modified")
            update_feed_fetch_state(
                feed_id=int(feed["id"]),
                etag=new_etag,
                last_modified=new_modified,
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    continue

                summary, content_raw = _entry_text(entry)
                title = entry.get("title") or "Ohne Titel"
                extracted = extract_article(link)

                # Values extracted from the article page win over feed values.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                final_canonical = extracted.canonical_url or entry.get("link")

                source_hash = _entry_hash(
                    entry,
                    int(feed["id"]),
                    link,
                    final_title,
                    final_summary or "",
                )
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": int(feed["id"]),
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                article_id = upsert_article(
                    ArticleUpsert(
                        feed_id=int(feed["id"]),
                        source_article_id=entry.get("id") or entry.get("guid"),
                        source_hash=source_hash,
                        title=final_title,
                        source_url=link,
                        canonical_url=final_canonical,
                        published_at=_entry_published_iso(entry),
                        author=final_author,
                        summary=final_summary,
                        content_raw=final_content_raw,
                        content_rewritten=None,
                        word_count=len((final_content_raw or "").split()),
                        status="new",
                        meta_json=json.dumps(
                            {"attribution": attribution, "extraction": extraction_meta},
                            ensure_ascii=False,
                        ),
                    )
                )
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": int(feed["id"]),
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Top-level boundary: record the failure on the run instead of raising.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
evaluate_source_policy, is_source_allowed +from .repositories import ( + ArticleUpsert, + FeedCreate, + RunCreate, + SourceCreate, + create_feed as repo_create_feed, + create_run, + create_source as repo_create_source, + finish_run, + get_article_by_id, + get_feed_by_id, + get_run_by_id, + get_source_by_id, + list_articles as repo_list_articles, + list_feeds as repo_list_feeds, + list_runs, + list_sources as repo_list_sources, + update_article_status, + upsert_article as repo_upsert_article, +) + +settings = get_settings() + + +@asynccontextmanager +async def app_lifespan(_: FastAPI): + init_db() + yield + + +app = FastAPI(title=settings.app_name, lifespan=app_lifespan) +app.include_router(admin_router) +app.mount( + "/admin/static", + StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")), + name="admin-static", +) + + +class LoginRequest(BaseModel): + username: str + password: str + + +class SourceCreateRequest(BaseModel): + name: str = Field(min_length=1, max_length=200) + base_url: str | None = None + terms_url: str | None = None + license_name: str | None = None + risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$") + is_enabled: bool = False + notes: str | None = None + last_reviewed_at: str | None = None + + +class FeedCreateRequest(BaseModel): + name: str = Field(min_length=1, max_length=200) + url: str = Field(min_length=5, max_length=1000) + source_id: int | None = None + is_enabled: bool = True + + +class RunCreateRequest(BaseModel): + run_type: str = Field(min_length=2, max_length=100) + status: str = Field(default="queued", pattern="^(queued|running|success|failed)$") + details: str | None = None + + +class RunFinishRequest(BaseModel): + status: str = Field(pattern="^(success|failed)$") + details: str | None = None + + +class ArticleUpsertRequest(BaseModel): + feed_id: int | None = None + source_article_id: str | None = None + source_hash: str | None = None + title: str = Field(min_length=1, max_length=500) + 
def require_auth(request: Request) -> str:
    """FastAPI dependency that resolves the logged-in user from the session cookie.

    Returns:
        The username returned by ``verify_session_token``.

    Raises:
        HTTPException: 401 when the cookie is missing or the token does not
            verify.
    """
    session_cookie = request.cookies.get(settings.session_cookie_name)
    if session_cookie:
        user = verify_session_token(session_cookie)
        if user:
            return user
        failure = "Session ungueltig oder abgelaufen"
    else:
        failure = "Nicht angemeldet"
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail=failure)
max_age=settings.session_max_age_seconds, + httponly=True, + secure=False, + samesite="lax", + ) + return {"ok": True, "username": payload.username} + + +@app.post("/auth/logout") +def logout(response: Response) -> dict: + response.delete_cookie(settings.session_cookie_name) + return {"ok": True} + + +@app.get("/auth/me") +def me(username: str = Depends(require_auth)) -> dict: + return {"authenticated": True, "username": username} + + +@app.get("/api/protected") +def protected(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "message": "Protected endpoint", "username": username} + + +@app.get("/api/pipeline/status") +def pipeline_status(username: str = Depends(require_auth)) -> dict: + feeds_total = len(repo_list_feeds()) + sources_total = len(repo_list_sources()) + articles_total = len(repo_list_articles(limit=500)) + return { + "ok": True, + "stage": "skeleton+db", + "requested_by": username, + "counts": { + "sources": sources_total, + "feeds": feeds_total, + "articles": articles_total, + }, + } + + +@app.get("/api/sources") +def list_sources(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": repo_list_sources(), "requested_by": username} + + +@app.get("/api/sources/{source_id}/policy-check") +def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict: + source = get_source_by_id(source_id) + if not source: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden") + issues = evaluate_source_policy(source) + return { + "ok": True, + "source_id": source_id, + "allowed": is_source_allowed(source), + "issues": issues, + "requested_by": username, + } + + +@app.post("/api/sources") +def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict: + source_id = repo_create_source( + SourceCreate( + name=payload.name, + base_url=payload.base_url, + terms_url=payload.terms_url, + license_name=payload.license_name, + 
risk_level=payload.risk_level, + is_enabled=payload.is_enabled, + notes=payload.notes, + last_reviewed_at=payload.last_reviewed_at, + ) + ) + return {"ok": True, "id": source_id, "requested_by": username} + + +@app.get("/api/feeds") +def list_feeds(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": repo_list_feeds(), "requested_by": username} + + +@app.get("/api/feeds/{feed_id}/policy-check") +def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict: + feed = get_feed_by_id(feed_id) + if not feed: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden") + + source_snapshot = { + "id": feed.get("source_id"), + "name": feed.get("source_name"), + "base_url": feed.get("source_base_url"), + "terms_url": feed.get("source_terms_url"), + "license_name": feed.get("source_license_name"), + "risk_level": feed.get("source_risk_level"), + "last_reviewed_at": feed.get("source_last_reviewed_at"), + "is_enabled": feed.get("source_is_enabled"), + } + issues = evaluate_source_policy(source_snapshot) + return { + "ok": True, + "feed_id": feed_id, + "allowed": len(issues) == 0, + "issues": issues, + "requested_by": username, + } + + +@app.post("/api/feeds") +def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict: + try: + feed_id = repo_create_feed( + FeedCreate( + name=payload.name, + url=payload.url, + source_id=payload.source_id, + is_enabled=payload.is_enabled, + ) + ) + except Exception as exc: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc + + return {"ok": True, "id": feed_id, "requested_by": username} + + +@app.get("/api/runs") +def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": list_runs(limit=limit), "requested_by": username} + + +@app.get("/api/runs/{run_id}") +def api_get_run(run_id: int, username: str = 
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given final status and details.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists. Without this
            check the underlying UPDATE would match no row and the endpoint
            would still answer ``ok: True``.
    """
    if not get_run_by_id(run_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    return {"ok": True, "id": run_id, "requested_by": username}
content_rewritten=payload.content_rewritten, + word_count=payload.word_count, + status=payload.status, + meta_json=payload.meta_json, + ) + ) + return {"ok": True, "id": article_id, "requested_by": username} + + +@app.post("/api/articles/{article_id}/transition") +def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + + current_status = article.get("status") + allowed_targets = ALLOWED_ARTICLE_TRANSITIONS.get(current_status, set()) + if payload.target_status not in allowed_targets: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Ungueltiger Statuswechsel: {current_status} -> {payload.target_status}", + ) + + updated = update_article_status(article_id, payload.target_status, actor=username, note=payload.note) + if not updated: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + return {"ok": True, "id": article_id, "from_status": current_status, "to_status": payload.target_status} + + +@app.post("/api/articles/{article_id}/review") +def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + if article.get("status") != "review": + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Review nur fuer Status 'review' erlaubt (aktuell: {article.get('status')})", + ) + + target_status = "approved" if payload.decision == "approve" else "rewrite" + updated = update_article_status( + article_id, + target_status, + actor=username, + note=payload.note, + decision=payload.decision, + ) + if not updated: + raise 
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
    """List the policy violations that block a source from being ingested.

    A source passes (empty list) only when its ``risk_level`` is ``green``
    (case-insensitive, surrounding whitespace ignored), ``terms_url``,
    ``license_name`` and ``last_reviewed_at`` are non-blank, and ``is_enabled``
    is 1.

    Args:
        source: Source row as a mapping; ``None`` or an empty mapping means no
            source is assigned.

    Returns:
        Human-readable issue messages in a fixed check order.
    """
    if not source:
        return ["Keine Quelle zugeordnet"]

    def _text(key: str) -> str:
        # Missing / NULL values count as blank.
        return (source.get(key) or "").strip()

    problems: list[str] = []

    level = _text("risk_level").lower()
    if level != "green":
        problems.append(f"Quelle nicht freigegeben (risk_level={level or 'unset'})")

    required_fields = (
        ("terms_url", "terms_url fehlt"),
        ("license_name", "license_name fehlt"),
        ("last_reviewed_at", "last_reviewed_at fehlt"),
    )
    for field_name, message in required_fields:
        if not _text(field_name):
            problems.append(message)

    enabled_flag = int(source.get("is_enabled", 0) or 0)
    if enabled_flag != 1:
        problems.append("Quelle ist deaktiviert")

    return problems
+++ b/backend/app/repositories.py @@ -0,0 +1,416 @@ +from __future__ import annotations + +from dataclasses import dataclass +import json +from datetime import datetime, timezone +from typing import Any + +from .db import get_conn, rows_to_dicts + + +@dataclass(frozen=True) +class SourceCreate: + name: str + base_url: str | None + terms_url: str | None + license_name: str | None + risk_level: str + is_enabled: bool + notes: str | None + last_reviewed_at: str | None + + +@dataclass(frozen=True) +class FeedCreate: + name: str + url: str + source_id: int | None + is_enabled: bool + + +@dataclass(frozen=True) +class RunCreate: + run_type: str + status: str + details: str | None = None + + +@dataclass(frozen=True) +class ArticleUpsert: + feed_id: int | None + source_article_id: str | None + source_hash: str | None + title: str + source_url: str + canonical_url: str | None + published_at: str | None + author: str | None + summary: str | None + content_raw: str | None + content_rewritten: str | None + word_count: int + status: str + meta_json: str | None + + +def create_source(payload: SourceCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + """ + INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + payload.name.strip(), + payload.base_url, + payload.terms_url, + payload.license_name, + payload.risk_level, + 1 if payload.is_enabled else 0, + payload.notes, + payload.last_reviewed_at, + ), + ) + return int(cur.lastrowid) + + +def list_sources() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at + FROM sources + ORDER BY id DESC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def get_source_by_id(source_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at + FROM sources + WHERE id = ? + """, + (source_id,), + ).fetchone() + return dict(row) if row else None + + +def create_feed(payload: FeedCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)", + (payload.name.strip(), payload.url.strip(), payload.source_id, 1 if payload.is_enabled else 0), + ) + return int(cur.lastrowid) + + +def list_feeds() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, + f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name, + s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + ORDER BY f.id DESC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def list_enabled_feeds() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, 
f.last_modified, f.last_checked_at, + s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, + s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + WHERE f.is_enabled = 1 + ORDER BY f.id ASC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def get_feed_by_id(feed_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, + s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, + s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + WHERE f.id = ? + """, + (feed_id,), + ).fetchone() + return dict(row) if row else None + + +def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE feeds + SET etag = ?, last_modified = ?, last_checked_at = datetime('now') + WHERE id = ? + """, + (etag, last_modified, feed_id), + ) + + +def create_run(payload: RunCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)", + (payload.run_type, payload.status, payload.details), + ) + return int(cur.lastrowid) + + +def finish_run(run_id: int, status: str, details: str | None = None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE runs + SET status = ?, details = ?, finished_at = datetime('now') + WHERE id = ? 
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs, newest id first.

    Args:
        limit: Requested number of rows; clamped to the range 1..500.
    """
    if limit < 1:
        bounded = 1
    elif limit > 500:
        bounded = 500
    else:
        bounded = limit

    query = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        ORDER BY id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        found = conn.execute(query, (bounded,)).fetchall()
    return rows_to_dicts(found)
+ """, + (article_id,), + ).fetchone() + return dict(row) if row else None + + +def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if meta_json: + try: + meta = json.loads(meta_json) + if not isinstance(meta, dict): + meta = {} + except Exception: + meta = {} + + events = meta.get("review_events") + if not isinstance(events, list): + events = [] + events.append(event) + meta["review_events"] = events + return json.dumps(meta, ensure_ascii=False) + + +def update_article_status( + article_id: int, + new_status: str, + *, + actor: str | None = None, + note: str | None = None, + decision: str | None = None, +) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "from_status": article.get("status"), + "to_status": new_status, + "actor": actor or "system", + "note": note, + "decision": decision, + } + merged_meta = _merge_review_event(article.get("meta_json"), event) + + with get_conn() as conn: + conn.execute( + "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?", + (new_status, merged_meta, article_id), + ) + return True + + +def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: + with get_conn() as conn: + # 1) strongest key: source_url + row = conn.execute( + "SELECT id FROM articles WHERE source_url = ?", + (payload.source_url.strip(),), + ).fetchone() + if row: + return int(row["id"]) + + # 2) stable feed+guid combo + if payload.feed_id is not None and payload.source_article_id: + row = conn.execute( + "SELECT id FROM articles WHERE feed_id = ? 
def upsert_article(payload: ArticleUpsert) -> int:
    """Insert a new article or overwrite the matching existing one.

    The existing row is located by ``_resolve_existing_article_id``. On a match
    every column listed below is overwritten with the payload value.

    NOTE(review): the UPDATE also replaces ``status``, ``content_rewritten`` and
    ``meta_json``. Callers that always send ``status="new"`` and
    ``content_rewritten=None`` will therefore reset those fields on an existing
    article - confirm this is intended.

    Returns:
        The id of the row whose ``source_url`` equals the payload's (stripped)
        URL; falls back to the resolved existing id, or 0 if neither is found.
    """
    existing_id = _resolve_existing_article_id(payload)
    source_url = payload.source_url.strip()
    # Same column order is shared by the INSERT and the UPDATE statement.
    column_values = (
        payload.feed_id,
        payload.source_article_id,
        payload.source_hash,
        payload.title.strip(),
        source_url,
        payload.canonical_url,
        payload.published_at,
        payload.author,
        payload.summary,
        payload.content_raw,
        payload.content_rewritten,
        payload.word_count,
        payload.status,
        payload.meta_json,
    )

    with get_conn() as conn:
        if existing_id is None:
            conn.execute(
                """
                INSERT INTO articles (
                    feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
                    summary, content_raw, content_rewritten, word_count, status, meta_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                column_values,
            )
        else:
            conn.execute(
                """
                UPDATE articles
                SET
                    feed_id = ?,
                    source_article_id = ?,
                    source_hash = ?,
                    title = ?,
                    source_url = ?,
                    canonical_url = ?,
                    published_at = ?,
                    author = ?,
                    summary = ?,
                    content_raw = ?,
                    content_rewritten = ?,
                    word_count = ?,
                    status = ?,
                    meta_json = ?
                WHERE id = ?
                """,
                (*column_values, existing_id),
            )
        stored = conn.execute("SELECT id FROM articles WHERE source_url = ?", (source_url,)).fetchone()
        if stored:
            return int(stored["id"])
    return int(existing_id) if existing_id else 0
+ """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py new file mode 100644 index 0000000..7fd65ce --- /dev/null +++ b/backend/app/source_extraction.py @@ -0,0 +1,257 @@ +from __future__ import annotations + +from dataclasses import dataclass +from html import unescape +import re +from typing import Any +from urllib.parse import urljoin +from urllib.request import Request, urlopen + +DEFAULT_TIMEOUT_SECONDS = 10 +DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)" + + +@dataclass(frozen=True) +class ExtractedArticle: + title: str | None + author: str | None + canonical_url: str | None + summary: str | None + content_text: str | None + images: list[str] + press_contact: str | None + extraction_error: str | None = None + + +def _clean_text(raw: str | None) -> str | None: + if not raw: + return None + text = unescape(raw) + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text or None + + +def _strip_noise(html: str) -> str: + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + return html + + +def _meta_content(html: str, attr: str, value: str) -> str | None: + pattern = re.compile( + rf"]+{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern.search(html) + if match: + return _clean_text(match.group(1)) + + # handle reversed attribute order + pattern_rev = re.compile( + rf"]+content\s*=\s*[\"']([^\"']+)[\"'][^>]*{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern_rev.search(html) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_title(html: str) -> str | None: + title = _meta_content(html, "property", "og:title") + if title: + return title + + match = 
re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) + if match: + cleaned = _clean_text(match.group(1)) + if cleaned: + return cleaned + + match = re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_canonical(html: str) -> str | None: + match = re.search( + r"]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + + match = re.search( + r"]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_author(html: str) -> str | None: + for attr, value in (("name", "author"), ("property", "article:author"), ("property", "og:article:author")): + author = _meta_content(html, attr, value) + if author: + return author + + for pattern in ( + r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})", + r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + author = _clean_text(match.group(1)) + if author: + return author + return None + + +def _extract_images(html: str, page_url: str) -> list[str]: + images: list[str] = [] + seen: set[str] = set() + + for prop in ("og:image", "twitter:image"): + pattern = re.compile( + rf"]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + for match in pattern.finditer(html): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + for match in re.finditer(r"]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + return images + + +def _extract_content_text(html: str) -> str | 
None: + section = None + for pattern in ( + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + section = match.group(1) + break + + if not section: + section = html + + paragraphs = [] + for match in re.finditer(r"]*>([\s\S]*?)", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE): + paragraphs.append(text) + + for match in re.finditer(r"]*>([\s\S]*?)

", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and len(text) > 2: + paragraphs.append(text) + + if paragraphs: + return "\n".join(paragraphs) + + stripped = _clean_text(section) + return stripped + + +def _extract_press_contact(content_text: str | None) -> str | None: + if not content_text: + return None + + lines = [line.strip() for line in content_text.split("\n") if line.strip()] + marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE) + for idx, line in enumerate(lines): + if marker_re.search(line): + chunk = [line] + for nxt in lines[idx + 1 : idx + 6]: + if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE): + break + chunk.append(nxt) + return _clean_text("\n".join(chunk)) + + match = re.search( + r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)", + content_text, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: + try: + req = Request( + url=url, + headers={ + "User-Agent": DEFAULT_USER_AGENT, + "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", + }, + ) + with urlopen(req, timeout=timeout_seconds) as resp: + raw = resp.read() + charset = resp.headers.get_content_charset() or "utf-8" + html = raw.decode(charset, errors="replace") + except Exception as exc: + return ExtractedArticle( + title=None, + author=None, + canonical_url=None, + summary=None, + content_text=None, + images=[], + press_contact=None, + extraction_error=str(exc), + ) + + html = _strip_noise(html) + title = _extract_title(html) + author = _extract_author(html) + canonical_url = _extract_canonical(html) + summary = _meta_content(html, "name", "description") + content_text = _extract_content_text(html) + if not summary and content_text: + summary = _clean_text(content_text[:320]) + images = _extract_images(html, url) + press_contact = 
_extract_press_contact(content_text) + + return ExtractedArticle( + title=title, + author=author, + canonical_url=canonical_url, + summary=summary, + content_text=content_text, + images=images, + press_contact=press_contact, + extraction_error=None, + ) + + +def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: + return { + "title": article.title, + "author": article.author, + "canonical_url": article.canonical_url, + "summary": article.summary, + "images": article.images, + "press_contact": article.press_contact, + "extraction_error": article.extraction_error, + } diff --git a/backend/data/rss_news.db b/backend/data/rss_news.db new file mode 100644 index 0000000000000000000000000000000000000000..1b1c3900225265e8b460abed889fc85eda838c00 GIT binary patch literal 94208 zcmeHwTWlOxnjV{^NKrD`p7DB!fuiggVrL|d+3dP^RqNuRsg}$UMadFn*=u-+x}55+ zVRsc>RV`X_yjgiXJL}2X39`rl$wT%b1I$|2!1$OsofaD>_ zVu4_SO}6r^BF3p^p{G-X2 zCLWCc>3A^yY~gWX{h%T~KjD6Hdj9#dg-5fV@6b0r=gloMj6J(aqph8`)%2nUb+*hn z4}5y{c5P*|M%Fg2)LtP6CMP#HNS@_Y zh!xL*&hO=0d}S5|&vzBFy9-GM&soZwx>x6+J=#mw-LLJL>G?~T1nC&`P<0m)d1p=?GR`Tr zu_5an?7i^Cw@*&buc5MUbf!RF@WJ@a99d;o$GTz4n!gvj_@nPmP0yb_TlkldPZ-wUWQ7Pzd=*38UK)T%^bhhlE?Z^2*kSn#T zD|gp7i7?o!bfrp)>LA%kf6k}7=g8`dwbko{Eyq|7Q6@Q`gfEeEVbTyJ%Mbm=aed}oN zur!R_ee^P}-n?B~ySBmR!ugEY7s>6~RjjHTtF=2BbMc%-wmw{`t=GUz@6^)eb+Fgz zI-m0*5MJ-%wpgB?nqNIz7)zFt=#3`CqAgHZkUZ~k5;33MC*OH^VtW4BXA3{Pm&|^~ zMA`qTo>|XAj%~)Hu&wyP&e1b5r5!fg_o!?ix_YBc+bQLsWZ(1ajBcW~ZPO@XKV8#n zB7Q#_0(oXq#z!At*Yk6kC@Exc%p6dv*Nl?bp7n3>-MbUh^YinC_hu6C8F@s>pUEEh zlb~b6KU*mGmg~V5`)2FN;mH@hy|GDYbFefo+M+&VEqe?immfN-hB%lFsMQZZv}s21 zRx5Bk*F&NE^MfRf-JtgN-UJ@3U+bOxwrok&BY^ZTz9y>zuWIW>Q2{-9MLA*U@d z`C_}8lWcQxYkX?{x#tS6CWM$R91+Fb@-dInbtHeB=poL&$n%VLmXLJu-9}-0{>+)e zd*{+J6Nx9vx-s5cMjCf)Bt8uNT4qEK;oLC|ThSr$P0yx2O6XKINL!&7ZErQ{ecC5V zbcp`k1KMnY_2k^LD?`FD>mj8+Tf(w*_!Sl#=e<0WGrt;TJ2#~9u{{MiiW&7eKjgop}!F< z=+WuI?3nuKG>q#rV|}e4t7dBldf3Bi45O%w|I*g|I9hJRZ$AIVB|FXat)}VMJ?D~` 
zfYaL=(WuJ8K0&j^zZ{z^{_%M6&x*e+{!Q_ZKk^{+j2r=u07rl$z!BgGa0EC490861 zM}Q;15#R`X!U#MyIdQg-JEomFU6?pi$h<`+Q1i(dxOdHrPs|q*$CdGu6BExCm}A-m zjhOv^eC}_?ia)|1{^1C41ULd50geDifFr;W;0SO8I0762j=&cPfk#in4WRJrkBW)? zU7olXH2hXzdCd{c?z?%uj^Vx6z%FT3Q5^^pq=6t&-7P#dHuLpIr_rL0OTdV&>30I? zt3{{^@;*VQP~mv7Jyv{-Km5ZH;0SO8I0762jsQo1Bft^h2yg^A0vrL3!0{n)b@EZM zFyi)pP}wlPIthirh{g1436k^wr_YSd{qx!1z!(2;1ULd50geDifFr;W;0SO8zN`qm zJ2i1~?AOo8@{i8qNG^?HFYvR=5&H~j``YVYT3QCK|79^q$0FF>YIt1un4u>0n-H0E_==<9WFr-iqdNOxBm zOdq4GgPWz^&St~N0{faZFc@#Y{`NcD<0&D$TR(yjl&ais-DN@h5ds^`zPkG$$skBZ zpc|VnG#N0o3qztU44(35K^*O&C*A9Bvs)}Di~obI@$<>!Sbp>-YOF zjn9u2>W^k0U9Z=lfA!t@AHOwLc;<(mERP??Aa+`1QB_4jE^Cq~D1s;(rebPR#j#~W z)|85F>5eK<(RLKs)kRA&WW}H=mDGyvD7LDZvg&AtVab{+Shjx447VdN3WDoDpL|#c ze`v&HH4LJtw92l>4j+_Q66#k;&GX51+`t_W_sQdsT%qkavKzR)0(a0q`3Wl|q{vR% zzC-TV4P2s0{gL`IQsSJM3(BS_R#gethdd|1T}JSHH~3E9-6Z6EU%+Cg-`85!i&a88 z&z}qq>*b>j;Jm%M{3SBbjwCL}5R$^KthxI9$#ewfVzr1x5NV&Oh_ z8eUou7DyD@drh}ZfS%q|)54w95CoQe8kgfnyJh(ZC`WM+KG=#{X0utAB%QjlrIl<` zG)szMxFyRm^^&9-RH-PUD^shS1GfDAo%#aV^_;kIX+acGF}jPMUo4}~mpdc+3C*j| zCOwg%#%>?dS*MIH?|76nJP`UW#Wn6BBTxn?C<$+4F)_LrBnGpJ8~5K@Fk{H$R-4K9 zkapi%$ZnBl_Y*V8IK8hZ5g^l7PvrL>v&h3%nyxHpM0zLk2zk0CY?dv%`KYVZy<3l2 zd~)j#iw#1|Y#(n|0F`bZRpB6pncSJ&&&+an+#^}piBO*GMwp21wb<=R$sNZmWxkS% z`X;-DiEWFP4ONyZ*@C+8_S<_&l8c%S*g1VrFMy{}#?vSTPm|w&hViue(L7BNRXt&8 zx@2gQEsC;Y8Ja0Mj;4Z3RZOiSS0q8#OzLQkZa5W7v=zrysBJ5<>f)bL(Ii8svMOC` zK6z|!M|FzzvVj+1JZ!hf%UCh}*z>V-s#nRaAd2n4p=3S4J0h^FBUo-@>NR8X5VS{b zc+Dor3M5%;hXK}PvO(KX98$bq;NfAkgJJ3E$|z|&7*{q0qpAR7WfX>%v12;exhlEx z_G@iHg5V#`AwbfKKZcWGr@p>h5ROC@r;Kc%eqG` zxIzEgF71PEJ>G zG|e@sQ4%y)DJin1mMlZnN)<~M@E!@NVhDEmM&O$yr7z9U+h`AlN zaY6sZa$4}k^3LTKIIHbn(>we$moz>urxjI2N>}=QIW6|KUt@2qc$U4{@7LI?{GJ(S z{{N@uzB*QH7OQh#E&l7dD|0`ayI=fo#hJNZAc}uD0vrL307rl$z!BgGa0EC490861 zN8k&KfH+Y<)Hj_ejB&eu1zJ|BYjg9n|Ktcl;n5rcjsQo1Bft^h2yg^A z0vrL307u}Z+xkN=k7ZL#s%ZY&$l7872-TX%Q(D7Dz!LGQPqso?S zIg;tPQl`bXBtq(Qp+MDe9siSh`_2ieRdiig3fQao7n*o($Ql 
zsG6jjrXZN|$+Rt+4ePsQ3yLAhTE)_J6ogt8#Y7VvO*S3Fb)<@I$f7NY4mBK0HZAl{ zumr;vr;}17oWBxWTQ^+Mv24jwsbHYvf>}}2iX*C)u2d}3l?`B_;XlBlGDQ`lW>hqR zP9>$-x@j7YNOcjNHcSIJtD2@;hU8L3s#vN_bw`&hOGe?UWGbqNl3=eFsqL7SbRxyY zz;M_KRaHSmTXY4by3~*>IH6fF6dU=_VOLVb3VLM7rY!*y#lg`abl(&w(;kVM>53IY zq?W2Wf@vU&gTrNxg@e?#2C!YVBAdWgG$b2`rl~6GDA{og#Z(=2A}QtLp0^&q0;-$& zxJTR)`~U3UjTPOZH2ZhOe>Eq~{mVJG_+RG!-`s!x@=avkV~zkvfFr;W;0SO8I0762 zjsQo1Bk)BEw&zcg8(N7mEp@jqfUiO;_{S+I|+rpN1lsD}0W z{SVi$UeEs_^Yr-s50|Ic`yT+%nWM(D=eCG+|Ih9JU(^}S`^6F92yg^A0vrL307rl$ zz!BgG95VumJ(t`6x&8lhYX2|(+p*&R!Oj1_EdFKj7sa0!|D^aIihqbe{^1C41ULd5 z0geDifFr;W;0SO8I0762jsQpC?*V~R6JIZ+Z`V9EJ@HHduT+@HzE5V~)7ke__I)Dz zp3J@{vhVS!iDzbzpV|M%i+?*-{J;3aKO6y$07rl$z!BgGa0EC490861M}Q;15#R`X zfe<(`Q7BA|PfX0ru>Jo;@ejvf`~Nw<_=h9F5#R`L1ULd50geDifFr;W;0SO8I077j z&k_Rj6LLWkB&{S!B~d4WUX`?}s7~w?u2f}d{A&~POkaATD(cDk|G7UOoBQ**|6Tk_ zv0S`S+%CRX{8{lo761CP)Lq^djsQo1Bft^h2yg^A0vrL307rl$z!BgG3?OiJ;(4|= zR!gE-5|vG%QWb=%q&$bef^e!QR^PD;tXHNd$LL7SaIt}Bnz7+-S(5BH=d&eTH>D7P5M~wz63xo441lU8~(D`=>ocx;`scH<9q}tt%|w%?*+j)RX4w&D*55vic&q zee+&V#0#}+Ya2ZwtmLe)tR!-$wn_FHN-h!B=Gbe|^XL3v_uL}6_hRjKjd%{CZmwTh z_M8hnMb?R=CXkfsd@R?Lc3F4*!H8XC`Rg}BKBAusH86uE1m_N-^;i7$}9?=?<&}( z)IrI~cy4)4@;En8a$hP|auRgDlX3C2r%>|Qu9EYfv1Z5ZD1V$CC_InJ684~y=3RXB z?bFlq7tR*mnMn|7zp;mZ)`Er^HS&U<%Br}#vG&s48c9dtfIK}TO?vB2L185?_9aUx zrmJeQc86@-+$0-!*ViB2JvBZ55*oWU&{zgq4tCaBfS$Me^il2JKYL%>x``^u;5kd# ztjbYCorm^luW8-=+Mb!7zl2GUjzJGqcOj8?=F}nMya()ofWLimdVUR+eWNo4@`4Y> zZ|2Aw*u@`xcWQe6?AgKx_1LtUG_O1Vp3Z98TzO%=)|tcPeBV$Iwy3Oa zY}VLXc$~Hf(v;!2|;hCY!Za_9>`qTnPHs)J-F{W+Y+J4aSu ztgT)rY&phyh%yNn5W<(pxiD!8O@H3^(m-=(Ck*aWhlO@9V-GY8gYewq;$S0MG&Z-s z9|Z?>$qs48uMcSG`99vbPaR#Xlo8!vXNM>Q4868?ZG+8)^BJ)(lH0Yb;KLiMwL2Me z!79bphby)98e{{;#@X^Z*z0ti&v_9DuXk}Y4Q{>NE4Q`+eTejmDeqfOf> z<)CEW^XrUmqP7j8En+`i(`zDrKNUsSrxDllbD8LX$>o4jy=Iie_N;%4@7|r5 zo}ZsDyf>48&&VT6{!I43p9CEn{@Fsgw_FcWnQyj^9G-m9+nW?AZ4Q>^MO)NotYwcu z0k!%8h&IhA-f9Jo=Xxk~e}0gpu^ZIhPQ=IqQhgRziOvA^tP^QybbkMp zqL;1~C#U8w%^$Q1B;>RuCSND813JleLAS=I=AV16@M=Pc*}@S~%q<`DC|yVL$B7=| 
z?2A0lXlDsY7vF6ZrsvO`DZFImW!)I>EhCLPHWD9(el0Vihj8wghAnLRGBIF~ zI-8zNeU#9tYA{LYMcZ3VdY|@*5*?yH_kcE=*p}tovMWQvG3z0vK3l@Fbodn>u#yck z$M)gmeh|}iVH#|HGJ(=_`k%cS)JyNPAmFD?{@GaZzZP%J{q@}G+3(F3p8DBS8>j!` z)GucK0ulVf5#R`L1ULd50geDi;B$b$JJVmCK6Byh$seqA86)y-dzUpL56T7VE^|a4 z)s?zOZtAy3uux$O78E!|= z{Sm1Do_tt`mah?$)i8*n(ke5Lg%3(B3H7U_=K170EN}sFpF9r9721v?yU{d#*r1>M zgcTA}WG8LkA$RP?E_JBih~u58S}yPI?k?|yFi_B)AdJoCvO~)|t#WT)MoOHYiW%^n z@q9Pl8gZpOe-N%sJq2YaKT}|KeC>x>#cE_uG01#TtbX=B55*w2hSiM}c5z?@VLSXr9 z(ME^?Yq>z2Sk!Bg+tiB?;lRPCL6(=7FIty7{bo2^EL)e!YZpy|Qrt@mBaO$x_FNiXS`ZdU6xw@Dw@rYaQeF!-DHzL^eHxeJM!RMC2q;H!5I)$7 zT4u9ZmL#3JvZa-5Q#4D8VYnsBG4+z98dRw$qAOFYoCCJ}{hj&(+4Y>bacMymQ8BuU zo?k4Z&zCzR`U%ae&n7*Qp~h}t-Y;BUr;IM|c$73e5c)2K88>7E$^Zo=fyJ8`T?`U~ zS%qc&tpzj2?#OD#Uf?fO$y*DVyvXbmcwV@btTqcvB#+4LA1v}P%I?a7Mx-{85%P4~ zNm#b4rWf9gXjiFwRu2}R*gRNl02e&Zhj#_2bo;0Z2QfqmOh``bGc5PLpy9Uy%WJZ- zlNirIxP&LK#TMbj#=%nNE2)@xRVL>Vmkm{xDw!+d!rO1}CFvYl+$ZRRdI3C*GM+{$ zc$)nFGmNLzkLGDTFE^HKQIr+S&`im3G!L%JtvIeiZCjC5 z7yp!sCK)=FRq0yu$zyvvs#C0&4LHC&Y`4hESTX(B^I-t5SIMm)itWImWIce7NC>jS zavM{x8Iy;gJ#xcqHbGV($yz%MuqKlY+K%Fo!oA?(VYGu`>FLTSX*+=SJ9`et&@y&R z2Rm0KSDriu=WRMc=$5;BLe2+qv`9)|F$#c8$J^Z2w2E(G8WHx z2(qku)S^qo2@sW}eP}kCi3|W^I?(iKqea7Sr`(8T7{uKQu|UTZt0Y6&@vsiG$alfT z$bI+-VZP&$V%Vy1?2Jg=^dFLX7(Dp{O43nYCXz09k4~q)l)W8{%4och?#7=m5C5%}=^#rR_YzbOg%;NfRJZ4T#w zUs9?YnhEyN)Mz~L;?CuZiPUoWX2v%luP5Gdm^l!v4kmmE>yIrkHs&6O1uqo_!BHb( zZpUpH=r5Mjf-jbLF2BH8ZU36y;itKz@o_n=s47x=Bw%Pxo9_R=H+J$L7rys7-28sV zoBY8)ocvm{H-9zRnlJv|?yy^PQN#I_vL@myTUH7b* z?1Wf`JSY|*LwGwo(Gr2LuGx0e9c+q;ZMJr>m4jqQ67j(&9zZIwm5;1NK@)q3Hlmp# zVw2hp-`mC}IbzB0FzF?1V1JRs-h%dmrR*&>+fYO9pr{odw~$>Zxf+BmKp$nh=xCXB z2krFPW`(Rn4T1;Lh0pO4*>0N1vEFvmYEoI`U_eHbMS6$yQx`_k>W@^5GK5mygVb7C!ok=y;`% zgGqfP>7&M_54O?oS;CK9{LoZ&49= z*lcz zmzS&L1sXzo)M2b8a)&wmXItwG9G59PutT^THcjwkCQuvz1!E2KMZG1Hf|E-e`L}E$C&IQQT^X zvv{RD@G%a;C1yWB>+A;fRx-8jXxT;_)imX&*TJDjAwywN<_1e~o6Ik?4 z9N623bK5T)@j(c^&>_GP7;i6G9WUcOdXeA^cyvvB&zgVi3|Iv({3!eX@&7(HGl@U^ z!x8ubA@F8Q>P`+ro1S=5iM}lkXhATQyzE~<1 
z49&3xv!bXKM^r6csaU2f8#WT+zd{8nQ^nLp&4B8L_VB?fxz>)EMFM(Lw%hMWRNH0~ zVhyub(C8skDwN5}m6g?0bCD}9pfZ5<9j4-3oddaO)eO!1CJvdJ{&vu6#YYH|-8~y2 zX{y12#MytV$#4=U>q@m_Y3@jstC{u%)*y&!KBWW-mpU+MknNlmdTSa+QBU3XJ9g;6os1v z^_Q7<3IxzZluhJ#65$9i>Up@ED3g3#lTe4tMYKcFl4dh=ra+O%7@6_IWQ=Zjemkm% zCaW=1+5m&m!i>hoHreT_JlGAj{kX))r z6-$+=?&y+b$>1LDAZ{?M2;2sg_T}-@>B4! zCey*gJK_j#(gW2fX>;Sjz?E6Uv7UR(XYE_$1+VEabw|^zV_9zR;1Y!rxke*^!*uu& zKy?yD15_U-V1>HC-&21ukl08> zh+toq_T>bBXl~XA;PJAvQK!mCtZ#+0KCr&t@jGtbS26NS4o9>9kNwHm$=|`B&-3S> zR!^o({LwcPCSLl04#~t7p&~O=yCzC1Y=Jga9Z8~=u4MWYL#CT0Glqz)PG+FKwMKpLYyE!LH7%l9eNA zyS(`_xktT*pBbTD3Kn#f0imbeM{v}hwE1QZZkoN*9wv{)4kmQB%`s#p3&KtEq;>j?@8^)Y97#Vh@=9`031|g7zK2-wa}FREvn!AwLJ@FsVeq&|*9494mv<6h`f_@&S2>9*v3XsBfDF-` zh9wvk{axB9d9CDUgr80Hm&k0_n#n8b-RpPT}y z?wzJs=TZpsLhR;v$8CiGBOTca@xktN<_7Tf#0{YK{`Fzq05naJl!SZ3_t@2JaB<5P z6ho3V_`d02;Mi;{P{&GwqsgXYxQ$U{L7H>l<*aXLE#x1su zhJ70<#dediJ6O2kvX4U!H=7UOGlsJv4o>|vnb$gu=FDr1u?^-1cIcT9=7rJIMt$4A%mb&usraAOb0$?Y+%bTw++;?VUtt%7U)|Sf9XEJmKrh@IJPxx!5b%t~ z*Lw%=Q)ldH^~cHCE7Yh6vLuvL3m1_qx+0cLoDwZrl?u*iQdd)Cx%_=LVM=j2VYYU= zCljOJ!^?u?Sk*go!^*_X17-{=|jWgbXrT^GD zuYiLfAI1JZw#D}UTc5!8|DXNFsf@9=62@MBACF)EUHNDetjLt=G8nt6i*m)O2#O*> zqb}=?;0lhcSryk+;TDgRdxE7X8r3a9pthldnH%s26k(&qNhH}db=A^tV`u0y@3yND zWZ9CB-|TYV7I+ib1`a480S<_TZqURzN>~HIgV;`;nP|(Mv(HSs3PuUXvO2^|c+8=w z)CU~uVu#>(AXDt#ZG=a0^`6d+l(-`T^uO0)I1%7h80^66*};SD-&5~5a{U^Zf5=Wc zcI=5*jhS+s8Dtp;XUELdgr9fvn=swR;IvH3o}-AcB%XNK4q&s*^xXbF$RiNgzd&*N zmRi8uhy9uvU~gd8o*dv~NBQzY>^@n4@*&jQuxzk1>_~ypyqkzJ>~LUbx1QQ{nPr$M z?4gp!z~GA=0=$VfZsYX4%j_deVW08+{V~a%vVhBoT)$^Vj+e*pPxs6b5i~y% zcfX!qcFT{__j0XcXYe@YmAw&Y;cXZXQq;F_PqH6JIDr4!TMO)sHQ-yVoqf;aE9x?i zh~dynCM7TfL>RQPcdD=~5Yt$zNv>mP6$uJ^$Hp(i#F2i@qHsv2wx;N^uGxYPMSx&e zm{vfxWXq9oF31#B!BBPGPPW3yp7c}M|Fb3^#r!+U0+zVL{QF;9H5~IFLa68a|3lRy zBT522gZJr60!j79DG8_&CXTCHB?Ah!l7d@8;P_v0N|GknuIuPBd|mkd|6}Q)hL;Wc li0gP|gCUlRkt77Z|4(iy>m4(2%;UUiN0W42A94Tx{{iTtj_Uvb literal 0 HcmV?d00001 diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt new file mode 
100644 index 0000000..cf39f84 --- /dev/null +++ b/backend/requirements-test.txt @@ -0,0 +1,3 @@ +pytest==8.3.5 +pytest-cov==6.0.0 +httpx==0.28.1 diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..f4ffe61 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,8 @@ +fastapi==0.116.1 +uvicorn[standard]==0.35.0 +itsdangerous==2.2.0 +pydantic-settings==2.10.1 +python-dotenv==1.1.1 +feedparser==6.0.11 +jinja2==3.1.4 +python-multipart==0.0.20 diff --git a/backend/static/admin.css b/backend/static/admin.css new file mode 100644 index 0000000..348264f --- /dev/null +++ b/backend/static/admin.css @@ -0,0 +1,189 @@ +body { + margin: 0; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + background: #f4f6f8; + color: #1f2937; +} + +.topbar { + display: flex; + justify-content: space-between; + align-items: center; + padding: 20px 28px; + background: #0f172a; + color: #f8fafc; +} + +.container { + padding: 20px 28px 28px 28px; +} + +.login { + max-width: 520px; + margin: 60px auto; +} + +.card { + background: #ffffff; + border-radius: 10px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); + padding: 16px; + margin-bottom: 16px; +} + +.stats { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; + margin-bottom: 16px; +} + +.stat { + background: #ffffff; + border-radius: 10px; + padding: 12px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); +} + +.stat .label { + font-size: 12px; + color: #64748b; +} + +.stat .value { + font-size: 24px; + font-weight: 700; +} + +.grid.two { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.stack { + display: grid; + gap: 10px; +} + +.row { + display: flex; + gap: 8px; + align-items: center; +} + +.filter-row { + margin-bottom: 10px; +} + +.inline { + display: flex; + gap: 6px; + align-items: center; +} + +table { + width: 100%; + border-collapse: collapse; +} + +th, td { + text-align: left; + padding: 8px; + border-bottom: 1px solid 
#e5e7eb; + vertical-align: top; +} + +input, select, button { + padding: 8px; + border-radius: 6px; + border: 1px solid #cbd5e1; + font: inherit; +} + +button { + background: #0ea5e9; + border-color: #0ea5e9; + color: white; + cursor: pointer; +} + +button.secondary { + background: #64748b; + border-color: #64748b; +} + +.badge { + display: inline-block; + padding: 2px 8px; + border-radius: 999px; + background: #e2e8f0; + font-size: 12px; +} + +.badge.ok { + background: #dcfce7; + color: #166534; +} + +.badge.bad { + background: #fee2e2; + color: #991b1b; +} + +.alert { + margin-bottom: 12px; + padding: 10px; + border-radius: 8px; + background: #fee2e2; + color: #991b1b; +} + +.flash { + font-weight: 600; +} + +.flash-success { + border-left: 4px solid #10b981; +} + +.flash-error { + border-left: 4px solid #ef4444; +} + +.subtle { + color: #64748b; + font-size: 12px; + margin-top: 4px; +} + +.pre { + white-space: pre-wrap; + line-height: 1.35; + max-height: 220px; + overflow: auto; + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + margin-top: 6px; +} + +.linkbtn { + display: inline-block; + padding: 8px 10px; + border-radius: 6px; + text-decoration: none; + border: 1px solid #cbd5e1; + color: #334155; + background: #f8fafc; +} + +@media (max-width: 920px) { + .stats { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + .grid.two { + grid-template-columns: 1fr; + } +} diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html new file mode 100644 index 0000000..36b30a7 --- /dev/null +++ b/backend/templates/admin_dashboard.html @@ -0,0 +1,235 @@ + + + + + + {{ title }} + + + +
+
+

rss-news Admin Dashboard

+

Angemeldet als {{ user }}

+
+
+ +
+
+ +
+ {% if flash_msg %} +
+ {{ flash_msg }} +
+ {% endif %} + +
+
+
Quellen
+
{{ sources|length }}
+
+
+
Feeds
+
{{ feeds|length }}
+
+
+
Artikel
+
{{ articles|length }}
+
+
+
Runs
+
{{ runs|length }}
+
+
+ +
+
+

Quelle anlegen

+
+ + + + + + + +
+
+ +
+

Feed anlegen

+
+ + + + + +
+
+
+ +
+

Ingestion starten

+
+ + +
+
+ +
+

Quellen + Policy

+ + + + + + {% for s in sources %} + + + + + + + + + {% endfor %} + +
IDNameRiskLizenzTermsPolicy
{{ s.id }}{{ s.name }}{{ s.risk_level }}{{ s.license_name or "-" }}{{ s.terms_url or "-" }} + {% if source_policy[s.id] %} + BLOCKED ({{ source_policy[s.id]|length }}) +
{{ source_policy[s.id]|join(", ") }}
+ {% else %} + OK + {% endif %} +
+
+ +
+

Artikel (Review)

+
+ + + + Reset +
+ + + + + + {% for a in articles %} + + + + + + + + + {% endfor %} + +
IDArtikelStatusDetailsReviewTransition
{{ a.id }} + {{ a.title }}
+ Autor: {{ a.author or "-" }}
+ Original öffnen + {% if a.canonical_url and a.canonical_url != a.source_url %} +
Canonical öffnen + {% endif %} +
{{ a.status }} + {% if a.summary %} +
Summary: {{ a.summary }}
+ {% endif %} + {% if a.content_raw %} +
+ Volltext anzeigen +
{{ a.content_raw }}
+
+ {% endif %} +
Bilder: {{ a.extracted_images|length }}
+ {% if a.extracted_images %} +
+ Bild-URLs +
    + {% for img in a.extracted_images %} +
  • {{ img }}
  • + {% endfor %} +
+
+ {% endif %} + {% if a.press_contact %} +
+ Pressekontakt +
{{ a.press_contact }}
+
+ {% endif %} + {% if a.extraction_error %} +
Extraktionsfehler: {{ a.extraction_error }}
+ {% endif %} +
+ {% if a.status == "review" %} +
+ + + +
+ {% else %} + - + {% endif %} +
+
+ + {% if allowed_transitions.get(a.status, []) %} + + {% else %} + keine Aktion + {% endif %} +
+
+
+ +
+

Runs

+ + + + + + {% for r in runs %} + + + + + + + + {% endfor %} + +
IDTypStatusStartEnde
{{ r.id }}{{ r.run_type }}{{ r.status }}{{ r.started_at }}{{ r.finished_at or "-" }}
+
+
+ + diff --git a/backend/templates/admin_login.html b/backend/templates/admin_login.html new file mode 100644 index 0000000..10e55e7 --- /dev/null +++ b/backend/templates/admin_login.html @@ -0,0 +1,27 @@ + + + + + + {{ title }} + + + +
+

rss-news Admin

+

Bitte anmelden, um das Tool zu verwalten.

+ {% if error %} +
Login fehlgeschlagen. Bitte pruefen.
+ {% endif %} +
+ + + +
+
+ + diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..46816dd --- /dev/null +++ b/backend/tests/__init__.py @@ -0,0 +1 @@ +"""Tests package.""" diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py new file mode 100644 index 0000000..c6b2188 --- /dev/null +++ b/backend/tests/test_admin_ui.py @@ -0,0 +1,65 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestAdminUi(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "admin_ui.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_admin_login_and_dashboard(self) -> None: + login_page = self.client.get("/admin/login") + self.assertEqual(login_page.status_code, 200) + self.assertIn("rss-news Admin", login_page.text) + + login = self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + self.assertEqual(login.status_code, 200) + self.assertIn("Admin Dashboard", login.text) + + def test_dashboard_redirects_if_not_logged_in(self) -> None: + res = self.client.get("/admin/dashboard", follow_redirects=False) + self.assertEqual(res.status_code, 303) + self.assertEqual(res.headers.get("location"), "/admin/login") + + def test_create_feed_with_empty_source_id_does_not_error(self) -> None: + self.client.post( + "/admin/login", + 
data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + # empty source_id used to cause validation issues in form parsing + res = self.client.post( + "/admin/feeds/create", + data={"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""}, + follow_redirects=False, + ) + self.assertEqual(res.status_code, 303) + self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard")) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_api_auth.py b/backend/tests/test_api_auth.py new file mode 100644 index 0000000..aa86821 --- /dev/null +++ b/backend/tests/test_api_auth.py @@ -0,0 +1,77 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestApiAuth(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "api.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_login_and_protected_endpoint(self) -> None: + r = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(r.status_code, 200) + + p = self.client.get("/api/protected") + self.assertEqual(p.status_code, 200) + self.assertTrue(p.json().get("ok")) + + def test_protected_requires_auth(self) -> None: + r = self.client.get("/api/protected") + self.assertEqual(r.status_code, 401) + + def test_run_detail_endpoint(self) -> None: + login = 
self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post("/api/runs", json={"run_type": "ingestion", "status": "running"}) + self.assertEqual(created.status_code, 200) + run_id = created.json()["id"] + + detail = self.client.get(f"/api/runs/{run_id}") + self.assertEqual(detail.status_code, 200) + self.assertEqual(detail.json()["item"]["id"], run_id) + + def test_source_policy_check_endpoint(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post( + "/api/sources", + json={ + "name": "Policy Source", + "risk_level": "yellow", + "is_enabled": True, + }, + ) + self.assertEqual(created.status_code, 200) + source_id = created.json()["id"] + + check = self.client.get(f"/api/sources/{source_id}/policy-check") + self.assertEqual(check.status_code, 200) + body = check.json() + self.assertFalse(body["allowed"]) + self.assertGreaterEqual(len(body["issues"]), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py new file mode 100644 index 0000000..28bb1eb --- /dev/null +++ b/backend/tests/test_article_workflow.py @@ -0,0 +1,95 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestArticleWorkflow(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "workflow.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + self.client.post("/auth/login", 
json={"username": "admin", "password": "secret"}) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def _create_article(self) -> int: + source = self.client.post( + "/api/sources", + json={ + "name": "Workflow Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + source_id = source.json()["id"] + + feed = self.client.post( + "/api/feeds", + json={"name": "Workflow Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "wf-1", + "source_url": "https://example.org/a1", + "title": "Workflow Artikel", + "summary": "s", + "content_raw": "c", + "status": "new", + }, + ) + return article.json()["id"] + + def test_valid_transition_chain(self) -> None: + article_id = self._create_article() + + t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "review"}) + self.assertEqual(t1.status_code, 200) + + r1 = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve", "note": "ok"}) + self.assertEqual(r1.status_code, 200) + self.assertEqual(r1.json()["to_status"], "approved") + + t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(t2.status_code, 200) + + final = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(final.status_code, 200) + self.assertEqual(final.json()["item"]["status"], "published") + + def test_invalid_transition_rejected(self) -> None: + article_id = self._create_article() + bad = 
self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(bad.status_code, 400) + + def test_review_only_allowed_in_review_status(self) -> None: + article_id = self._create_article() + bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"}) + self.assertEqual(bad.status_code, 400) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_db_repositories.py b/backend/tests/test_db_repositories.py new file mode 100644 index 0000000..825ae8d --- /dev/null +++ b/backend/tests/test_db_repositories.py @@ -0,0 +1,119 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + RunCreate, + SourceCreate, + create_feed, + create_run, + create_source, + finish_run, + list_articles, + list_feeds, + list_runs, + list_sources, + upsert_article, +) + + +class TestSQLiteRepositories(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + self.db_path = str(Path(self.tmp_dir.name) / "test.db") + os.environ["APP_DB_PATH"] = self.db_path + config_module.get_settings.cache_clear() + init_db() + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + def test_end_to_end_basic_crud(self) -> None: + source_id = create_source( + SourceCreate( + name="GovData", + base_url="https://data.gov.de", + terms_url="https://www.govdata.de/dl-de/by-2-0", + license_name="dl-de/by-2-0", + risk_level="green", + is_enabled=True, + notes="test source", + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.assertGreater(source_id, 0) + + feed_id = create_feed( + FeedCreate( + name="GovData RSS", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + 
self.assertGreater(feed_id, 0) + + run_id = create_run(RunCreate(run_type="ingest", status="running", details="start")) + self.assertGreater(run_id, 0) + finish_run(run_id=run_id, status="success", details="ok") + + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung", + content_raw="Originaltext", + content_rewritten="Umschreibung", + word_count=120, + status="review", + meta_json='{"lang":"de"}', + ) + ) + self.assertGreater(article_id, 0) + + # Upsert with same source_url updates same row + article_id_2 = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel aktualisiert", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung 2", + content_raw="Originaltext 2", + content_rewritten="Umschreibung 2", + word_count=140, + status="approved", + meta_json='{"lang":"de","v":2}', + ) + ) + self.assertEqual(article_id, article_id_2) + + self.assertEqual(len(list_sources()), 1) + self.assertEqual(len(list_feeds()), 1) + self.assertEqual(len(list_runs()), 1) + + articles = list_articles() + self.assertEqual(len(articles), 1) + self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert") + self.assertEqual(articles[0]["status"], "approved") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py new file mode 100644 index 0000000..05b2c2b --- /dev/null +++ b/backend/tests/test_ingestion.py @@ -0,0 +1,122 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import 
patch + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.ingestion import run_ingestion +from backend.app.repositories import FeedCreate, SourceCreate, create_feed, create_source, list_articles +from backend.app.source_extraction import ExtractedArticle + + +class TestIngestion(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "ingestion.db") + config_module.get_settings.cache_clear() + init_db() + + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None: + mock_extract_article.return_value = ExtractedArticle( + title="Artikel 1 original", + author="Autorin A", + canonical_url="https://example.org/article/1", + summary="Original Summary", + content_text="Original Volltext", + images=["https://example.org/a.jpg"], + press_contact="Pressekontakt: Team A", + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-1", + "modified": "Tue, 18 Feb 2026 10:00:00 GMT", + "entries": [ + { + "id": "item-1", + "title": "Artikel 1", + "link": "https://example.org/article/1", + "summary": "A", + }, + { + "id": "item-1", + "title": "Artikel 1 aktualisiert", + "link": "https://example.org/article/1-neu", + 
"summary": "B", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + self.assertEqual(stats.entries_seen, 2) + self.assertEqual(len(list_articles()), 1) + article = list_articles()[0] + self.assertEqual(article["title"], "Artikel 1 original") + self.assertEqual(article["author"], "Autorin A") + self.assertIn("Original Volltext", article["content_raw"] or "") + self.assertIn("Pressekontakt", article["meta_json"] or "") + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_blocks_non_green_source(self, mock_parse, mock_extract_article) -> None: + # Re-create source/feed with yellow risk to verify enforcement + source_id = create_source( + SourceCreate( + name="Blocked Source", + base_url="https://example.net", + terms_url="https://example.net/terms", + license_name="custom", + risk_level="yellow", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + blocked_feed_id = create_feed( + FeedCreate( + name="Blocked Feed", + url="https://example.net/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + stats = run_ingestion(feed_id=blocked_feed_id) + self.assertEqual(stats.status, "success") + self.assertEqual(stats.articles_upserted, 0) + mock_parse.assert_not_called() + mock_extract_article.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_source_extraction.py b/backend/tests/test_source_extraction.py new file mode 100644 index 0000000..f6787ff --- /dev/null +++ b/backend/tests/test_source_extraction.py @@ -0,0 +1,69 @@ +import unittest +from unittest.mock import patch + +from backend.app.source_extraction import extract_article + + +SAMPLE_HTML = """ + + + + + + + + + + + +
+

Dies ist der vollstaendige Inhalt des Artikels.

+

Weitere relevante Informationen fuer die Meldung.

+

Pressekontakt

+

Musterfirma GmbH, Kontakt: presse@example.org

+
+ + +""" + + +class _FakeHeaders: + @staticmethod + def get_content_charset(): + return "utf-8" + + +class _FakeResponse: + headers = _FakeHeaders() + + def __init__(self, body: str): + self._body = body.encode("utf-8") + + def read(self): + return self._body + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +class TestSourceExtraction(unittest.TestCase): + @patch("backend.app.source_extraction.urlopen") + def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None: + mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML) + + extracted = extract_article("https://www.presseportal.de/pm/118273/6158137") + self.assertEqual(extracted.title, "Demo Meldung von Presseportal") + self.assertEqual(extracted.author, "Max Mustermann") + self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137") + self.assertIn("vollstaendige Inhalt", extracted.content_text or "") + self.assertIn("Kurzbeschreibung", extracted.summary or "") + self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images) + self.assertIn("Pressekontakt", extracted.press_contact or "") + self.assertIsNone(extracted.extraction_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/docs/PROJECT_PLAN.md b/docs/PROJECT_PLAN.md new file mode 100644 index 0000000..c758f5e --- /dev/null +++ b/docs/PROJECT_PLAN.md @@ -0,0 +1,67 @@ +# Projektplan (Neustart) + +## Leitentscheidungen +- Bestehendes Repository wird weiterverwendet. +- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern. +- Hetzner bleibt Laufzeitplattform. +- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation. +- Auth initial nur mit einem User/Password. + +## Zielbild +Eine modulare News-Pipeline mit klaren Stufen: +1. Feed-Ingestion +2. Inhaltsanalyse und Normalisierung +3. Rewrite/Anreicherung +4. Legal- und Qualitaetschecks +5. WordPress-Publikation (`pending`) +6. 
Monitoring/Logging + +## Grobe Zeitplanung (ohne Fixtermine) +- Phase 0: ca. 1 Woche +- Phase 1: ca. 2-4 Wochen +- Phase 2: ca. 2-3 Wochen +- Phase 3: fortlaufend + +## Phasen + +### Phase 0 - Grundlagen (jetzt) +- Doku und Wiki strukturieren +- Source-Policy definieren +- Redirect fuer `news.vanityontour.de` setzen +- GitHub Project als zentrale Planung scharfstellen + +### Phase 1 - MVP Core +- Neues FastAPI-Projektgeruest +- SQLite-Datenmodell (feeds, articles, runs, source_policy) +- Feed-Import mit Duplikaterkennung +- Admin-Login (ein User) +- Manuelle Review vor Publish + +### Phase 2 - Automation +- Job-Queue (asynchron) +- Regelbasierte Scheduler +- Retry/Dead-Letter-Handling +- Robustes Error-Reporting + +### Phase 3 - Compliance und Skalierung +- Source-Whitelisting mit Pflichtfeldern +- Pflicht-Attribution pro Artikel +- Qualitaetsmetriken und Audit-Logs +- Optional: Passkey/WebAuthn + +## Architekturprinzipien +- Idempotente Jobs +- Trennung von UI, API, Worker +- Strikte Validierung bei Quell-/Lizenzdaten +- Expliziter Publish-Schritt, kein blindes Autoposting + +## Risiken +- Lizenz-/Nutzungsbedingungen je Quelle variieren stark +- Feeds aendern Struktur/Verfuegbarkeit +- WordPress-API und Auth koennen regressionsanfaellig sein + +## Erfolgsmetriken +- Zeit von Feed-Eingang bis Review-Ready +- Quote sauber attribuierter Artikel +- Fehlerrate pro Pipeline-Stufe +- Anzahl manueller Eingriffe pro Woche diff --git a/docs/SOURCE_POLICY.md b/docs/SOURCE_POLICY.md new file mode 100644 index 0000000..d1d2e0c --- /dev/null +++ b/docs/SOURCE_POLICY.md @@ -0,0 +1,81 @@ +# Source Policy und Feed-Vorschlaege + +## Grundsatz +Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt. 
+ +## Pflichtdaten pro Quelle +- Quellname +- Feed-URL +- Originalartikel-URL +- Autor/Herausgeber (wenn vorhanden) +- Lizenz/Nutzungsgrundlage +- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung) +- Datum der letzten Pruefung +- Link auf Nutzungsbedingungen + +## Einstufung (Ampel) +- Gruen: Nutzung fuer geplantes Modell klar erlaubt +- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich +- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag + +## Verbindliche Regeln +- Keine neue Quelle ohne Eintrag im Source-Register +- Kein automatischer Publish bei Gelb/Rot +- Bilder separat pruefen (Textrecht != Bildrecht) +- Quartalsweiser Re-Check der Terms + +## Ersteinschaetzung (Stand: 16.02.2026) + +### Rot +1. Reuters / Thomson Reuters +- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung. +- Folge: Nur mit explizitem Vertrag/Lizenz. +- Referenz: + - https://www.thomsonreuters.com/en/terms-of-use + +2. tagesschau.de RSS +- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert). +- Folge: Nicht fuer das geplante Modell geeignet. +- Referenz: + - https://www.tagesschau.de/infoservices/rssfeeds + +### Gelb +1. Presseportal / ots +- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung. +- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte). +- Referenz: + - https://www.presseportal.de/nutzungsbedingungen + - https://www.presseportal.de/feeds/ + +2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz +- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert. +- Folge: Je Behoerde einzeln pruefen und dokumentieren. 
+- Beispiele: + - https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html + - https://bmas.bund.de/EN/Services/RSS/rss.html + +### Gruen (mit korrekter Attribution) +1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0` +- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen). +- Folge: Sehr gut fuer stabile Automatisierung geeignet. +- Referenz: + - https://www.govdata.de/dl-de/by-2-0 + - https://data.gov.de/informationen/lizenzen + - https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0 + +2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel +- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet. +- Folge: Geeignet, wenn Drittinhalte ausgenommen werden. +- Referenz: + - https://commission.europa.eu/legal-notice_en + - https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html + +## Quelle im Register freischalten (Definition of Done) +- Terms-Link hinterlegt +- Lizenzklasse (Gruen/Gelb/Rot) gesetzt +- Pflicht-Attribution dokumentiert +- Bildrechtsregel dokumentiert +- Letzte Pruefung und Verantwortlicher gepflegt + +## Hinweis +Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Prüfung sinnvoll. diff --git a/docs/TODO.md b/docs/TODO.md new file mode 100644 index 0000000..ad9b549 --- /dev/null +++ b/docs/TODO.md @@ -0,0 +1,33 @@ +# ToDo (Ein-Entwickler Setup) + +## Jetzt +- [ ] GitHub Project #3 Felder/Views fuer Neustart vereinheitlichen +- [ ] Alte/obsolet gewordene Issues kennzeichnen (z. B. 
User-Verwaltung) +- [ ] Redirect `news.vanityontour.de -> vanityontour.de` aktiv halten +- [ ] Wiki-Basis fertigstellen und verlinken + +## MVP +- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI) +- [x] Datenmodell in SQLite anlegen +- [x] Feed-Ingestion Service bauen (ETag/Last-Modified) +- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash +- [x] Login mit 1 Admin-Account implementieren +- [ ] Artikel-Review-Maske mit Statusworkflow +- [ ] WordPress-Publisher als separaten Service implementieren + +## Recht/Qualitaet +- [ ] Source-Policy in DB + Admin-UI abbilden +- [ ] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) +- [ ] Auto-Block bei fehlender Lizenzinfo +- [ ] Pro Artikel Attribution-Block generieren + +## Betrieb +- [ ] Systemd-Service(s) fuer API/Worker erstellen +- [ ] Nginx-Routing fuer neue App einrichten +- [ ] Healthcheck-Endpunkte + Monitoring einrichten +- [ ] Backup/Restore fuer DB dokumentieren + +## Spaeter +- [ ] Passkey/WebAuthn evaluieren und optional einfuehren +- [ ] Migration auf PostgreSQL bewerten +- [ ] Teilautomatische Freigabe-Regeln definieren diff --git a/docs/wiki/Architektur.md b/docs/wiki/Architektur.md new file mode 100644 index 0000000..275b578 --- /dev/null +++ b/docs/wiki/Architektur.md @@ -0,0 +1,29 @@ +# Architektur + +## Zielarchitektur +- API: FastAPI +- Worker: Queue-basierte Hintergrundjobs +- DB: SQLite (MVP), spaeter optional PostgreSQL +- Publisher: WordPress REST API +- Frontend/Admin: schlanke Web-UI mit Login + +## Pipeline +1. Feed Fetch +2. Parse + Normalize +3. Deduplicate +4. Enrichment (Rewrite/Tags) +5. Legal/Policy Check +6. 
Publish (pending) + +## Datenobjekte (MVP) +- `sources` +- `feeds` +- `articles` +- `article_versions` +- `runs` +- `policy_checks` + +## Nichtziele (MVP) +- Multi-User und Rollen +- Vollautomatische Freigabe ohne Review +- Komplexe externe SSO-Integration diff --git a/docs/wiki/Deployment.md b/docs/wiki/Deployment.md new file mode 100644 index 0000000..91388c7 --- /dev/null +++ b/docs/wiki/Deployment.md @@ -0,0 +1,20 @@ +# Deployment (Hetzner + CloudPanel) + +## Umgebung +- Host: Hetzner +- Reverse Proxy: Nginx via CloudPanel +- Ziel-Domain: `news.vanityontour.de` + +## Aktueller Zustand +- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet. + +## Zielzustand +- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`) +- API/Worker laufen als systemd-Services +- TLS bleibt ueber CloudPanel/Nginx + +## Mindest-Checks nach Deployment +- `curl -I https://news.vanityontour.de` +- Login erreichbar +- Feed-Import laeuft +- WordPress-Testpublikation (pending) erfolgreich diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md new file mode 100644 index 0000000..300599a --- /dev/null +++ b/docs/wiki/Home.md @@ -0,0 +1,19 @@ +# Wiki Home + +## Zweck +Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`. + +## Inhalte +- `Architektur.md` +- `Deployment.md` +- `Security-Auth.md` +- `Recht-Quellen.md` +- `Operations-Runbook.md` +- `Roadmap.md` +- `Project-Board.md` + +## Projektsteuerung +- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1 + +## Prinzip +Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert. 
diff --git a/docs/wiki/Operations-Runbook.md b/docs/wiki/Operations-Runbook.md new file mode 100644 index 0000000..32bf5c4 --- /dev/null +++ b/docs/wiki/Operations-Runbook.md @@ -0,0 +1,23 @@ +# Operations Runbook + +## Daily Checks +- App erreichbar +- Queue/Worker aktiv +- Letzte Feed-Laeufe erfolgreich +- Keine auffaelligen Fehler im Log + +## Incident: Feed-Import faellt aus +1. RSS-Quelle erreichbar? +2. Parser-Fehler im Log? +3. Rate Limits oder Blockaden? +4. Retry-Queue pruefen + +## Incident: WordPress Publish faellt aus +1. WP API erreichbar? +2. Credentials gueltig? +3. Payload-Validation/Tag-Fehler? +4. Artikel in `pending` statt `failed` markieren, wenn unklar + +## Backups +- SQLite-Dump taeglich +- Konfiguration und `.env` sicher sichern diff --git a/docs/wiki/Project-Board.md b/docs/wiki/Project-Board.md new file mode 100644 index 0000000..887ac19 --- /dev/null +++ b/docs/wiki/Project-Board.md @@ -0,0 +1,28 @@ +# Project Board Workflow + +## Zentrale Steuerung +- Board: https://github.com/users/OliverGiertz/projects/3/views/1 +- Board ist die einzige Quelle fuer Planungsstatus. + +## Arbeitsmodus (1 Entwickler) +- Neue Arbeit immer als Issue anlegen +- Issue direkt ins Project aufnehmen +- Status nur im Project pflegen +- PR/Commit auf Issue referenzieren + +## Empfohlene Status-Disziplin +- `Todo`: noch nicht begonnen +- `In Progress`: aktiv in Arbeit +- `Done`: umgesetzt und dokumentiert + +## Konventionen fuer Issues +- Prefix fuer Klarheit: + - `[MVP]` + - `[Infra]` + - `[Legal]` + - `[Bug]` +- Definition of Done in jedem Issue notieren + +## Aktueller Backlog-Hinweis +- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User). +- Entsprechende Issues als `deferred` oder `closed` kennzeichnen. 
diff --git a/docs/wiki/Recht-Quellen.md b/docs/wiki/Recht-Quellen.md new file mode 100644 index 0000000..212f0d5 --- /dev/null +++ b/docs/wiki/Recht-Quellen.md @@ -0,0 +1,35 @@ +# Recht und Quellen + +## Grundregeln +- Nur freigegebene Quellen aus Source-Register +- Pflicht-Attribution pro Artikel +- Rechte fuer Bilder separat pruefen +- Kein Autopublish bei unklarer Lizenz + +## Bewertungsmodell +- Gruen: Freie Nachnutzung klar erlaubt +- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung +- Rot: Ohne Zusatzlizenz nicht geeignet + +## Aktuelle Referenzen +- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use +- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen +- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds +- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0 +- GovData Lizenzen: https://data.gov.de/informationen/lizenzen +- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en + +## Review-Checkliste je Quelle +1. Sind Bearbeitung und Veroeffentlichung erlaubt? +2. Ist kommerzielle Nutzung erlaubt? +3. Gibt es gesonderte Bildrechte? +4. Ist die Quellenangabe vorgeschrieben? +5. Gibt es Archivierungs- oder Weitergabebeschraenkungen? + +## Operativer Schutz +- Source-Register als Pflicht vor Feed-Aktivierung +- Auto-Block bei fehlenden Lizenzdaten +- Quartalsweiser Terms-Recheck + +## Hinweis +Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren. 
diff --git a/docs/wiki/Roadmap.md b/docs/wiki/Roadmap.md new file mode 100644 index 0000000..fece89e --- /dev/null +++ b/docs/wiki/Roadmap.md @@ -0,0 +1,19 @@ +# Roadmap + +## Jetzt +- Doku und Projektstruktur bereinigen +- Redirect aktiv +- Backlog auf Neustart ausrichten + +## Naechster Schritt +- FastAPI-MVP implementieren +- Login + Feed-Ingestion + Review + WordPress pending + +## Danach +- Worker/Queue +- Source-Policy Enforcement +- Monitoring/Reporting +- Optional Passkey + +## Steuerung +Alle Arbeitsitems liegen im GitHub Project #3. diff --git a/docs/wiki/Security-Auth.md b/docs/wiki/Security-Auth.md new file mode 100644 index 0000000..a9f830a --- /dev/null +++ b/docs/wiki/Security-Auth.md @@ -0,0 +1,16 @@ +# Security und Auth + +## Mindestanforderungen +- Zugriff auf die WebApp nur mit Login +- Ein aktiver Admin-User (kein Rollenmodell im MVP) +- Passwort nicht im Repo, nur als Secret auf Server + +## Empfohlene Umsetzung +- Session-basierte Auth (HTTP-only Cookies) +- Passwort gehasht (Argon2 oder bcrypt) +- Rate Limiting auf Login-Endpunkt +- CSRF-Schutz fuer Form-Aktionen + +## Spaeter (optional) +- Passkey/WebAuthn als zusaetzlicher Login-Faktor +- IP-Allowlist fuer Admin-Zugang diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c15b448 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = backend/tests +python_files = test_*.py +addopts = -q --maxfail=1 diff --git a/scripts/smoke_backend.sh b/scripts/smoke_backend.sh new file mode 100755 index 0000000..f0000ad --- /dev/null +++ b/scripts/smoke_backend.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ -z "${BASE_URL:-}" ]]; then + echo "BASE_URL fehlt (z. B. 
https://news.vanityontour.de)" + exit 1 +fi + +if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then + echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen" + exit 1 +fi + +cookie_file="$(mktemp)" +trap 'rm -f "$cookie_file"' EXIT + +echo "[1/4] Healthcheck" +curl -fsS "${BASE_URL}/health" | grep -q '"status":"ok"' + +echo "[2/4] Login" +curl -fsS -c "$cookie_file" \ + -H "Content-Type: application/json" \ + -X POST "${BASE_URL}/auth/login" \ + -d "{\"username\":\"${APP_ADMIN_USERNAME}\",\"password\":\"${APP_ADMIN_PASSWORD}\"}" \ + | grep -q '"ok":true' + +echo "[3/4] Protected Endpoint" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected" | grep -q '"ok":true' + +echo "[4/4] Pipeline Status" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status" | grep -q '"stage":"skeleton+db"' + +echo "Smoke test erfolgreich." From c52363f1a77db579c14697fbaf18ea7456341599 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 09:50:32 +0100 Subject: [PATCH 02/54] feat(admin): add article detail page with legal checklist --- backend/app/admin_ui.py | 84 ++++++++++++++++ backend/templates/admin_article_detail.html | 100 ++++++++++++++++++++ backend/templates/admin_dashboard.html | 1 + backend/tests/test_admin_ui.py | 51 ++++++++++ 4 files changed, 236 insertions(+) create mode 100644 backend/templates/admin_article_detail.html diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 9587664..bc6d9d9 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -18,6 +18,7 @@ from .repositories import ( create_feed, create_source, get_article_by_id, + get_feed_by_id, list_articles, list_feeds, list_runs, @@ -80,6 +81,57 @@ def _parse_meta_json(raw: str | None) -> dict: return {} +def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: + meta = article.get("meta", {}) + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + attribution = meta.get("attribution") if 
isinstance(meta.get("attribution"), dict) else {} + + checks: list[dict[str, str]] = [] + checks.append( + { + "label": "Original-Link vorhanden", + "status": "ok" if article.get("source_url") else "missing", + "value": article.get("source_url") or "-", + } + ) + checks.append( + { + "label": "Autor vorhanden", + "status": "ok" if article.get("author") else "missing", + "value": article.get("author") or "-", + } + ) + checks.append( + { + "label": "Bilder extrahiert", + "status": "ok" if extraction.get("images") else "missing", + "value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0", + } + ) + checks.append( + { + "label": "Pressekontakt", + "status": "ok" if extraction.get("press_contact") else "missing", + "value": extraction.get("press_contact") or "-", + } + ) + checks.append( + { + "label": "Lizenz/Terms", + "status": "ok" if attribution.get("source_license_name") and attribution.get("source_terms_url") else "missing", + "value": f"{attribution.get('source_license_name') or '-'} | {attribution.get('source_terms_url') or '-'}", + } + ) + checks.append( + { + "label": "Risiko-Status Quelle", + "status": "ok" if (feed and feed.get("source_risk_level") == "green") else "missing", + "value": feed.get("source_risk_level") if feed else "-", + } + ) + return checks + + @router.get("/admin", response_class=HTMLResponse) def admin_index(request: Request): user = _admin_user(request) @@ -167,6 +219,38 @@ def admin_dashboard(request: Request): ) +@router.get("/admin/articles/{article_id}", response_class=HTMLResponse) +def admin_article_detail(request: Request, article_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + + meta = _parse_meta_json(article.get("meta_json")) + article["meta"] = meta + 
extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + article["extraction"] = extraction + feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None + checklist = _legal_checklist(article, feed) + + return templates.TemplateResponse( + request, + "admin_article_detail.html", + { + "request": request, + "title": f"Artikel #{article_id}", + "user": user, + "article": article, + "feed": feed, + "checklist": checklist, + "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status"), ()), + }, + ) + + @router.post("/admin/sources/create") def admin_create_source( request: Request, diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html new file mode 100644 index 0000000..098c273 --- /dev/null +++ b/backend/templates/admin_article_detail.html @@ -0,0 +1,100 @@ + + + + + + {{ title }} + + + +
+
+

Artikel-Detail #{{ article.id }}

+

Angemeldet als {{ user }}

+
+
+ Zurück +
+ +
+
+
+ +
+
+

{{ article.title }}

+

Status: {{ article.status }}

+

Autor: {{ article.author or "-" }}

+

Feed: {{ feed.name if feed else "-" }}

+

Quelle: {{ article.source_url }}

+ {% if article.canonical_url %} +

Canonical: {{ article.canonical_url }}

+ {% endif %} + {% if article.summary %} +

Summary: {{ article.summary }}

+ {% endif %} +
+ +
+

Rechts-Checkliste

+ + + + + + {% for c in checklist %} + + + + + + {% endfor %} + +
Kriterium | Status | Wert
{{ c.label }} + {% if c.status == "ok" %} + OK + {% else %} + Fehlt + {% endif %} + {{ c.value }}
+
+ +
+

Extrahierte Daten

+

Bilder: {{ article.extraction.images|length if article.extraction.images else 0 }}

+ {% if article.extraction.images %} +
    + {% for img in article.extraction.images %} +
  • {{ img }}
  • + {% endfor %} +
+ {% endif %} + {% if article.extraction.press_contact %} +

Pressekontakt

+
{{ article.extraction.press_contact }}
+ {% endif %} + {% if article.extraction.extraction_error %} +

Extraktionsfehler: {{ article.extraction.extraction_error }}

+ {% endif %} +
+ +
+

Volltext

+
{{ article.content_raw or "-" }}
+
+ +
+

Status ändern

+
+ + + +
+
+
+ + diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 36b30a7..d416d76 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -144,6 +144,7 @@ {{ a.title }}
Autor: {{ a.author or "-" }}
Original öffnen +
Details anzeigen {% if a.canonical_url and a.canonical_url != a.source_url %}
Canonical öffnen {% endif %} diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index c6b2188..a65cbfc 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -8,6 +8,7 @@ from fastapi.testclient import TestClient from backend.app import config as config_module from backend.app.db import init_db from backend.app.main import app +from backend.app.repositories import ArticleUpsert, FeedCreate, SourceCreate, create_feed, create_source, upsert_article class TestAdminUi(unittest.TestCase): @@ -60,6 +61,56 @@ class TestAdminUi(unittest.TestCase): self.assertEqual(res.status_code, 303) self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard")) + def test_article_detail_page_renders(self) -> None: + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="id-1", + source_hash="hash-1", + title="Titel A", + source_url="https://example.org/a", + canonical_url="https://example.org/a", + published_at=None, + author="Autor A", + summary="Summary A", + content_raw="Volltext A", + content_rewritten=None, + word_count=2, + status="new", + meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}', + ) + ) + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True) + self.assertEqual(res.status_code, 200) + self.assertIn("Artikel-Detail", res.text) + self.assertIn("Rechts-Checkliste", res.text) 
+ if __name__ == "__main__": unittest.main() From 5159a6e3b4b757d645550fc1b491dda22fab715e Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:02:19 +0100 Subject: [PATCH 03/54] feat(legal): add structured attribution fields and publish legal gate --- backend/app/admin_ui.py | 57 +++++++++++++-- backend/app/db.py | 24 ++++++- backend/app/ingestion.py | 8 +++ backend/app/main.py | 43 ++++++++++++ backend/app/repositories.py | 77 +++++++++++++++++++-- backend/templates/admin_article_detail.html | 31 ++++++++- backend/templates/admin_dashboard.html | 1 + backend/tests/test_admin_ui.py | 8 +++ backend/tests/test_article_workflow.py | 10 +++ backend/tests/test_db_repositories.py | 16 +++++ 10 files changed, 259 insertions(+), 16 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index bc6d9d9..c401ad1 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -23,6 +23,7 @@ from .repositories import ( list_feeds, list_runs, list_sources, + set_article_legal_review, update_article_status, ) @@ -104,22 +105,22 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: checks.append( { "label": "Bilder extrahiert", - "status": "ok" if extraction.get("images") else "missing", + "status": "ok" if article.get("image_urls_json") else "missing", "value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0", } ) checks.append( { "label": "Pressekontakt", - "status": "ok" if extraction.get("press_contact") else "missing", - "value": extraction.get("press_contact") or "-", + "status": "ok" if article.get("press_contact") else "missing", + "value": article.get("press_contact") or extraction.get("press_contact") or "-", } ) checks.append( { "label": "Lizenz/Terms", - "status": "ok" if attribution.get("source_license_name") and attribution.get("source_terms_url") else "missing", - "value": f"{attribution.get('source_license_name') or '-'} | 
{attribution.get('source_terms_url') or '-'}", + "status": "ok" if article.get("source_license_name_snapshot") and article.get("source_terms_url_snapshot") else "missing", + "value": f"{article.get('source_license_name_snapshot') or attribution.get('source_license_name') or '-'} | {article.get('source_terms_url_snapshot') or attribution.get('source_terms_url') or '-'}", } ) checks.append( @@ -129,6 +130,13 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: "value": feed.get("source_risk_level") if feed else "-", } ) + checks.append( + { + "label": "Manuelle Rechtsfreigabe", + "status": "ok" if int(article.get("legal_checked", 0)) == 1 else "missing", + "value": article.get("legal_checked_at") or "-", + } + ) return checks @@ -193,9 +201,20 @@ def admin_dashboard(request: Request): for article in articles: meta = _parse_meta_json(article.get("meta_json")) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + images = [] + if article.get("image_urls_json"): + try: + parsed_images = json.loads(article["image_urls_json"]) + if isinstance(parsed_images, list): + images = [str(item) for item in parsed_images if item] + except Exception: + images = [] + if not images and isinstance(extraction.get("images"), list): + images = extraction.get("images") article["meta"] = meta - article["extracted_images"] = extraction.get("images") if isinstance(extraction.get("images"), list) else [] - article["press_contact"] = extraction.get("press_contact") if isinstance(extraction.get("press_contact"), str) else None + article["extracted_images"] = images + if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): + article["press_contact"] = extraction.get("press_contact") article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None return templates.TemplateResponse( @@ -232,6 +251,15 @@ def admin_article_detail(request: 
Request, article_id: int): meta = _parse_meta_json(article.get("meta_json")) article["meta"] = meta extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + if article.get("image_urls_json"): + try: + parsed_images = json.loads(article["image_urls_json"]) + if isinstance(parsed_images, list): + extraction["images"] = [str(item) for item in parsed_images if item] + except Exception: + pass + if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): + article["press_contact"] = extraction.get("press_contact") article["extraction"] = extraction feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None checklist = _legal_checklist(article, feed) @@ -251,6 +279,19 @@ def admin_article_detail(request: Request, article_id: int): ) +@router.post("/admin/articles/{article_id}/legal-review") +def admin_article_legal_review(request: Request, article_id: int, approved: str = Form("0"), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + is_approved = approved == "1" + ok = set_article_legal_review(article_id, approved=is_approved, note=note or None, actor=user) + if not ok: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) + + @router.post("/admin/sources/create") def admin_create_source( request: Request, @@ -344,6 +385,8 @@ def admin_transition_article(request: Request, article_id: int, target_status: s if article: current = article.get("status") if target_status in ALLOWED_TRANSITIONS.get(current, ()): + if target_status == "published" and int(article.get("legal_checked", 0)) != 1: + return _dashboard_redirect(msg=f"Publish blockiert fuer Artikel #{article_id}: Rechtsfreigabe fehlt", msg_type="error") update_article_status(article_id, target_status, actor=user, note=note or None) 
return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}") return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") diff --git a/backend/app/db.py b/backend/app/db.py index c914044..27bbc10 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -81,6 +81,14 @@ def init_db() -> None: summary TEXT, content_raw TEXT, content_rewritten TEXT, + image_urls_json TEXT, + press_contact TEXT, + source_name_snapshot TEXT, + source_terms_url_snapshot TEXT, + source_license_name_snapshot TEXT, + legal_checked INTEGER NOT NULL DEFAULT 0, + legal_checked_at TEXT, + legal_note TEXT, word_count INTEGER DEFAULT 0, status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), meta_json TEXT, @@ -130,8 +138,20 @@ def init_db() -> None: existing_columns = { row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() } - if "source_hash" not in existing_columns: - conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT") + migration_columns = { + "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT", + "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT", + "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT", + "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT", + "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT", + "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT", + "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0", + "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT", + "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT", + } + for column, ddl in migration_columns.items(): + if column not in existing_columns: + conn.execute(ddl) def rows_to_dicts(rows: list[sqlite3.Row]) -> 
list[dict[str, Any]]: diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 87e44c2..37703de 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -201,6 +201,14 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: summary=final_summary, content_raw=final_content_raw, content_rewritten=None, + image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None, + press_contact=extracted.press_contact, + source_name_snapshot=feed.get("source_name"), + source_terms_url_snapshot=feed.get("source_terms_url"), + source_license_name_snapshot=feed.get("source_license_name"), + legal_checked=False, + legal_checked_at=None, + legal_note=None, word_count=len((final_content_raw or "").split()), status="new", meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), diff --git a/backend/app/main.py b/backend/app/main.py index 616dd77..4fe6458 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -28,6 +28,7 @@ from .repositories import ( list_feeds as repo_list_feeds, list_runs, list_sources as repo_list_sources, + set_article_legal_review, update_article_status, upsert_article as repo_upsert_article, ) @@ -96,6 +97,14 @@ class ArticleUpsertRequest(BaseModel): summary: str | None = None content_raw: str | None = None content_rewritten: str | None = None + image_urls_json: str | None = None + press_contact: str | None = None + source_name_snapshot: str | None = None + source_terms_url_snapshot: str | None = None + source_license_name_snapshot: str | None = None + legal_checked: bool = False + legal_checked_at: str | None = None + legal_note: str | None = None word_count: int = 0 status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$") meta_json: str | None = None @@ -115,6 +124,11 @@ class ArticleReviewRequest(BaseModel): note: str | None = None +class ArticleLegalReviewRequest(BaseModel): + approved: bool + note: 
str | None = None + + ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = { "new": {"review", "rewrite", "error"}, "rewrite": {"review", "error"}, @@ -330,6 +344,14 @@ def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(re summary=payload.summary, content_raw=payload.content_raw, content_rewritten=payload.content_rewritten, + image_urls_json=payload.image_urls_json, + press_contact=payload.press_contact, + source_name_snapshot=payload.source_name_snapshot, + source_terms_url_snapshot=payload.source_terms_url_snapshot, + source_license_name_snapshot=payload.source_license_name_snapshot, + legal_checked=payload.legal_checked, + legal_checked_at=payload.legal_checked_at, + legal_note=payload.legal_note, word_count=payload.word_count, status=payload.status, meta_json=payload.meta_json, @@ -351,6 +373,11 @@ def api_article_transition(article_id: int, payload: ArticleTransitionRequest, u status_code=status.HTTP_400_BAD_REQUEST, detail=f"Ungueltiger Statuswechsel: {current_status} -> {payload.target_status}", ) + if payload.target_status == "published" and int(article.get("legal_checked", 0)) != 1: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Publish gesperrt: Rechtscheck wurde noch nicht freigegeben", + ) updated = update_article_status(article_id, payload.target_status, actor=username, note=payload.note) if not updated: @@ -358,6 +385,22 @@ def api_article_transition(article_id: int, payload: ArticleTransitionRequest, u return {"ok": True, "id": article_id, "from_status": current_status, "to_status": payload.target_status} +@app.post("/api/articles/{article_id}/legal-review") +def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + + updated = set_article_legal_review(article_id, 
approved=payload.approved, note=payload.note, actor=username) + if not updated: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + return { + "ok": True, + "id": article_id, + "legal_checked": payload.approved, + } + + @app.post("/api/articles/{article_id}/review") def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: article = get_article_by_id(article_id) diff --git a/backend/app/repositories.py b/backend/app/repositories.py index e170a20..9d9883c 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -48,6 +48,14 @@ class ArticleUpsert: summary: str | None content_raw: str | None content_rewritten: str | None + image_urls_json: str | None + press_contact: str | None + source_name_snapshot: str | None + source_terms_url_snapshot: str | None + source_license_name_snapshot: str | None + legal_checked: bool + legal_checked_at: str | None + legal_note: str | None word_count: int status: str meta_json: str | None @@ -224,7 +232,10 @@ def get_article_by_id(article_id: int) -> dict[str, Any] | None: row = conn.execute( """ SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.content_rewritten, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at + a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact, + a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot, + a.legal_checked, a.legal_checked_at, a.legal_note, + a.word_count, a.status, a.meta_json, a.created_at, a.updated_at FROM articles a WHERE a.id = ? 
""", @@ -281,6 +292,31 @@ def update_article_status( return True +def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "event": "legal_review", + "approved": approved, + "actor": actor or "system", + "note": note, + } + merged_meta = _merge_review_event(article.get("meta_json"), event) + with get_conn() as conn: + conn.execute( + """ + UPDATE articles + SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ? + WHERE id = ? + """, + (1 if approved else 0, note, merged_meta, article_id), + ) + return True + + def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: with get_conn() as conn: # 1) strongest key: source_url @@ -320,8 +356,11 @@ def upsert_article(payload: ArticleUpsert) -> int: """ INSERT INTO articles ( feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author, - summary, content_raw, content_rewritten, word_count, status, meta_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + summary, content_raw, content_rewritten, image_urls_json, press_contact, + source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, + legal_checked, legal_checked_at, legal_note, + word_count, status, meta_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
""", ( payload.feed_id, @@ -335,6 +374,14 @@ def upsert_article(payload: ArticleUpsert) -> int: payload.summary, payload.content_raw, payload.content_rewritten, + payload.image_urls_json, + payload.press_contact, + payload.source_name_snapshot, + payload.source_terms_url_snapshot, + payload.source_license_name_snapshot, + 1 if payload.legal_checked else 0, + payload.legal_checked_at, + payload.legal_note, payload.word_count, payload.status, payload.meta_json, @@ -356,6 +403,14 @@ def upsert_article(payload: ArticleUpsert) -> int: summary = ?, content_raw = ?, content_rewritten = ?, + image_urls_json = ?, + press_contact = ?, + source_name_snapshot = ?, + source_terms_url_snapshot = ?, + source_license_name_snapshot = ?, + legal_checked = ?, + legal_checked_at = ?, + legal_note = ?, word_count = ?, status = ?, meta_json = ? @@ -373,6 +428,14 @@ def upsert_article(payload: ArticleUpsert) -> int: payload.summary, payload.content_raw, payload.content_rewritten, + payload.image_urls_json, + payload.press_contact, + payload.source_name_snapshot, + payload.source_terms_url_snapshot, + payload.source_license_name_snapshot, + 1 if payload.legal_checked else 0, + payload.legal_checked_at, + payload.legal_note, payload.word_count, payload.status, payload.meta_json, @@ -392,7 +455,9 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di rows = conn.execute( """ SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name + a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, + a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note FROM articles a LEFT JOIN feeds f ON f.id = a.feed_id WHERE a.status = 
? @@ -405,7 +470,9 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di rows = conn.execute( """ SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name + a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, + a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note FROM articles a LEFT JOIN feeds f ON f.id = a.feed_id ORDER BY a.id DESC diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 098c273..62c7e70 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -26,6 +26,9 @@

Status: {{ article.status }}

Autor: {{ article.author or "-" }}

Feed: {{ feed.name if feed else "-" }}

+

Quelle Snapshot: {{ article.source_name_snapshot or "-" }}

+

Lizenz Snapshot: {{ article.source_license_name_snapshot or "-" }}

+

Terms Snapshot: {{ article.source_terms_url_snapshot or "-" }}

Quelle: {{ article.source_url }}

{% if article.canonical_url %}

Canonical: {{ article.canonical_url }}

@@ -69,9 +72,9 @@ {% endfor %} {% endif %} - {% if article.extraction.press_contact %} + {% if article.press_contact or article.extraction.press_contact %}

Pressekontakt

-
{{ article.extraction.press_contact }}
+
{{ article.press_contact or article.extraction.press_contact }}
{% endif %} {% if article.extraction.extraction_error %}

Extraktionsfehler: {{ article.extraction.extraction_error }}

@@ -83,8 +86,32 @@
{{ article.content_raw or "-" }}
+
+

Rechtsfreigabe

+

Freigabe: + {% if article.legal_checked %} + Freigegeben + {% else %} + Nicht freigegeben + {% endif %} +

+

Zeitpunkt: {{ article.legal_checked_at or "-" }}

+

Notiz: {{ article.legal_note or "-" }}

+
+ + + +
+
+

Status ändern

+ {% if not article.legal_checked %} +

Hinweis: `published` ist erst nach manueller Rechtsfreigabe erlaubt.

+ {% endif %}
Reset + Export JSON + Export CSV
@@ -143,6 +145,7 @@
{{ a.title }}
Autor: {{ a.author or "-" }}
+ Datum: {{ a.published_at or "-" }} | Alter: {{ a.days_old if a.days_old is not none else "-" }} Tage | Relevanz: {{ a.relevance }}
Original öffnen
Details anzeigen {% if a.canonical_url and a.canonical_url != a.source_url %} diff --git a/backend/tests/test_api_auth.py b/backend/tests/test_api_auth.py index aa86821..96fbe85 100644 --- a/backend/tests/test_api_auth.py +++ b/backend/tests/test_api_auth.py @@ -72,6 +72,73 @@ class TestApiAuth(unittest.TestCase): self.assertFalse(body["allowed"]) self.assertGreaterEqual(len(body["issues"]), 1) + def test_articles_export_json_and_csv_contains_relevance(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + source = self.client.post( + "/api/sources", + json={ + "name": "Export Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + self.assertEqual(source.status_code, 200) + source_id = source.json()["id"] + + feed = self.client.post( + "/api/feeds", + json={"name": "Export Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + self.assertEqual(feed.status_code, 200) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "exp-1", + "source_hash": "exp-hash-1", + "title": "Export Artikel", + "source_url": "https://example.org/article/1", + "canonical_url": "https://example.org/article/1", + "published_at": "2026-02-18T00:00:00Z", + "author": "Autor", + "summary": "Kurz", + "content_raw": "Langtext", + "image_urls_json": "[\"https://example.org/img.jpg\"]", + "press_contact": "Kontakt", + "source_name_snapshot": "Export Source", + "source_terms_url_snapshot": "https://example.org/terms", + "source_license_name_snapshot": "cc-by", + "status": "review", + }, + ) + self.assertEqual(article.status_code, 200) + + export_json = self.client.get("/api/articles/export?format=json") + 
self.assertEqual(export_json.status_code, 200) + body = export_json.json() + self.assertTrue(body.get("ok")) + self.assertGreaterEqual(body.get("count", 0), 1) + first = body["items"][0] + self.assertIn("published_at", first) + self.assertIn("days_old", first) + self.assertIn("relevance", first) + + export_csv = self.client.get("/api/articles/export?format=csv") + self.assertEqual(export_csv.status_code, 200) + self.assertIn("text/csv", export_csv.headers.get("content-type", "")) + csv_text = export_csv.text + self.assertIn("published_at", csv_text) + self.assertIn("days_old", csv_text) + self.assertIn("relevance", csv_text) + if __name__ == "__main__": unittest.main() diff --git a/backend/tests/test_relevance.py b/backend/tests/test_relevance.py new file mode 100644 index 0000000..573e312 --- /dev/null +++ b/backend/tests/test_relevance.py @@ -0,0 +1,21 @@ +from datetime import datetime, timezone +import unittest + +from backend.app.relevance import article_age_days, article_relevance + + +class TestRelevance(unittest.TestCase): + def test_article_age_and_relevance(self) -> None: + now = datetime(2026, 2, 18, 12, 0, 0, tzinfo=timezone.utc) + self.assertEqual(article_age_days("2026-02-18T10:00:00Z", now=now), 0) + self.assertEqual(article_relevance("2026-02-18T10:00:00Z", now=now), "hoch") + + self.assertEqual(article_age_days("2026-02-14T12:00:00Z", now=now), 4) + self.assertEqual(article_relevance("2026-02-14T12:00:00Z", now=now), "mittel") + + self.assertEqual(article_relevance("2025-12-01T00:00:00Z", now=now), "alt") + self.assertEqual(article_relevance(None, now=now), "unbekannt") + + +if __name__ == "__main__": + unittest.main() From efaf132936defbcda5004a5c44c650fe5fef2e58 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:11:22 +0100 Subject: [PATCH 05/54] feat(images): add thumbnail gallery with select/exclude workflow --- backend/app/admin_ui.py | 110 +++++++++++++++++--- backend/app/main.py | 13 +++ backend/app/repositories.py | 58 
+++++++++++ backend/static/admin.css | 54 ++++++++++ backend/templates/admin_article_detail.html | 45 ++++++-- backend/templates/admin_dashboard.html | 4 + backend/tests/test_admin_ui.py | 22 +++- 7 files changed, 282 insertions(+), 24 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 44cb7c5..b8a1777 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -2,6 +2,7 @@ from __future__ import annotations import json from pathlib import Path +import re from urllib.parse import urlencode from fastapi import APIRouter, Form, Request @@ -24,6 +25,7 @@ from .repositories import ( list_feeds, list_runs, list_sources, + set_article_image_decision, set_article_legal_review, update_article_status, ) @@ -83,6 +85,63 @@ def _parse_meta_json(raw: str | None) -> dict: return {} +def _read_article_images(article: dict, extraction: dict) -> list[str]: + images: list[str] = [] + if article.get("image_urls_json"): + try: + parsed_images = json.loads(article["image_urls_json"]) + if isinstance(parsed_images, list): + images = [str(item) for item in parsed_images if item] + except Exception: + images = [] + if not images and isinstance(extraction.get("images"), list): + images = [str(item) for item in extraction.get("images") if item] + # deduplicate preserving order + seen: set[str] = set() + deduped: list[str] = [] + for image in images: + if image not in seen: + seen.add(image) + deduped.append(image) + return deduped + + +def _is_probably_irrelevant_image(url: str) -> bool: + lowered = url.lower() + patterns = ( + r"logo", + r"icon", + r"sprite", + r"avatar", + r"favicon", + r"/ads/", + r"tracking", + r"pixel", + r"banner", + ) + return any(re.search(pattern, lowered) for pattern in patterns) + + +def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]: + all_images = _read_article_images(article, extraction) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) 
else {} + selected_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + excluded_urls = image_review.get("excluded_urls") if isinstance(image_review.get("excluded_urls"), list) else [] + excluded_set = {str(item) for item in excluded_urls if item} + + entries: list[dict[str, object]] = [] + for url in all_images: + entries.append( + { + "url": url, + "is_selected": selected_url == url, + "is_excluded": url in excluded_set, + "is_irrelevant_hint": _is_probably_irrelevant_image(url), + } + ) + return entries + + def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: meta = article.get("meta", {}) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} @@ -138,6 +197,15 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: "value": article.get("legal_checked_at") or "-", } ) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + checks.append( + { + "label": "Hauptbild ausgewählt", + "status": "ok" if selected_image else "missing", + "value": selected_image or "-", + } + ) return checks @@ -202,18 +270,12 @@ def admin_dashboard(request: Request): for article in articles: meta = _parse_meta_json(article.get("meta_json")) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} - images = [] - if article.get("image_urls_json"): - try: - parsed_images = json.loads(article["image_urls_json"]) - if isinstance(parsed_images, list): - images = [str(item) for item in parsed_images if item] - except Exception: - images = [] - if not images and isinstance(extraction.get("images"), list): - images = extraction.get("images") + images = _read_article_images(article, extraction) article["meta"] = meta article["extracted_images"] = images + 
article["image_entries"] = _build_image_entries(article, extraction, meta) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): article["press_contact"] = extraction.get("press_contact") article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None @@ -254,16 +316,13 @@ def admin_article_detail(request: Request, article_id: int): meta = _parse_meta_json(article.get("meta_json")) article["meta"] = meta extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} - if article.get("image_urls_json"): - try: - parsed_images = json.loads(article["image_urls_json"]) - if isinstance(parsed_images, list): - extraction["images"] = [str(item) for item in parsed_images if item] - except Exception: - pass + extraction["images"] = _read_article_images(article, extraction) if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): article["press_contact"] = extraction.get("press_contact") article["extraction"] = extraction + article["image_entries"] = _build_image_entries(article, extraction, meta) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None @@ -284,6 +343,23 @@ def admin_article_detail(request: Request, article_id: int): ) +@router.post("/admin/articles/{article_id}/images/decision") +def 
admin_article_image_decision( + request: Request, + article_id: int, + image_url: str = Form(...), + action: str = Form(...), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + ok = set_article_image_decision(article_id=article_id, image_url=image_url, action=action, actor=user) + if not ok: + return _dashboard_redirect(msg=f"Bildaktion fehlgeschlagen fuer Artikel #{article_id}", msg_type="error") + return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) + + @router.post("/admin/articles/{article_id}/legal-review") def admin_article_legal_review(request: Request, article_id: int, approved: str = Form("0"), note: str = Form("")): user = _admin_user(request) diff --git a/backend/app/main.py b/backend/app/main.py index 277630b..177c312 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -336,6 +336,17 @@ def api_export_articles( articles = repo_list_articles(limit=500, status_filter=status_filter) rows = [] for article in articles: + meta: dict = {} + if article.get("meta_json"): + try: + parsed = json.loads(article["meta_json"]) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + days_old = article_age_days(article.get("published_at")) rows.append( { @@ -353,6 +364,7 @@ def api_export_articles( "source_terms_url_snapshot": article.get("source_terms_url_snapshot"), "press_contact": article.get("press_contact"), "image_urls_json": article.get("image_urls_json"), + "selected_image_url": selected_image_url, "legal_checked": bool(int(article.get("legal_checked", 0))), "legal_checked_at": article.get("legal_checked_at"), "legal_note": article.get("legal_note"), @@ -377,6 +389,7 @@ def api_export_articles( "source_terms_url_snapshot", 
"press_contact", "image_urls_json", + "selected_image_url", "legal_checked", "legal_checked_at", "legal_note", diff --git a/backend/app/repositories.py b/backend/app/repositories.py index 9d9883c..164fc79 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -262,6 +262,16 @@ def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str: return json.dumps(meta, ensure_ascii=False) +def _load_meta(meta_json: str | None) -> dict[str, Any]: + if not meta_json: + return {} + try: + parsed = json.loads(meta_json) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + + def update_article_status( article_id: int, new_status: str, @@ -317,6 +327,54 @@ def set_article_legal_review(article_id: int, approved: bool, note: str | None, return True +def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + url = (image_url or "").strip() + if not url: + return False + if action not in {"select", "exclude", "restore"}: + return False + + meta = _load_meta(article.get("meta_json")) + image_review = meta.get("image_review") + if not isinstance(image_review, dict): + image_review = {} + + excluded = image_review.get("excluded_urls") + if not isinstance(excluded, list): + excluded = [] + excluded_set = {str(item) for item in excluded if item} + + selected_url = image_review.get("selected_url") + if not isinstance(selected_url, str): + selected_url = None + + if action == "select": + selected_url = url + excluded_set.discard(url) + elif action == "exclude": + excluded_set.add(url) + if selected_url == url: + selected_url = None + elif action == "restore": + excluded_set.discard(url) + + image_review["selected_url"] = selected_url + image_review["excluded_urls"] = sorted(excluded_set) + image_review["updated_at"] = datetime.now(timezone.utc).isoformat() + image_review["updated_by"] = 
actor or "system" + meta["image_review"] = image_review + + with get_conn() as conn: + conn.execute( + "UPDATE articles SET meta_json = ? WHERE id = ?", + (json.dumps(meta, ensure_ascii=False), article_id), + ) + return True + + def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: with get_conn() as conn: # 1) strongest key: source_url diff --git a/backend/static/admin.css b/backend/static/admin.css index 348264f..402c067 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -179,6 +179,60 @@ button.secondary { background: #f8fafc; } +.thumb { + width: 72px; + height: 72px; + object-fit: cover; + border-radius: 8px; + border: 1px solid #cbd5e1; + margin-top: 6px; +} + +.image-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); + gap: 10px; +} + +.image-card { + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + background: #fff; +} + +.image-card img { + width: 100%; + height: 120px; + object-fit: cover; + border-radius: 6px; + border: 1px solid #e2e8f0; + background: #f8fafc; +} + +.image-meta { + margin-top: 6px; + display: flex; + gap: 6px; + flex-wrap: wrap; +} + +.image-actions { + margin-top: 8px; + display: flex; + gap: 6px; + flex-wrap: wrap; +} + +.image-selected { + border-color: #10b981; + box-shadow: 0 0 0 1px rgba(16, 185, 129, 0.25); +} + +.image-excluded { + opacity: 0.65; +} + @media (max-width: 920px) { .stats { grid-template-columns: repeat(2, minmax(0, 1fr)); diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index d2b1b67..7acb5c1 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -67,13 +67,46 @@

Extrahierte Daten

-

Bilder: {{ article.extraction.images|length if article.extraction.images else 0 }}

- {% if article.extraction.images %} -
    - {% for img in article.extraction.images %} -
  • {{ img }}
  • +

    Bilder: {{ article.image_entries|length if article.image_entries else 0 }}

    + {% if article.selected_image_url %} +

    Ausgewähltes Hauptbild: {{ article.selected_image_url }}

    + {% endif %} + {% if article.image_entries %} +
    + {% for image in article.image_entries %} +
    + + Artikelbild + +
    + {% if image.is_selected %}Ausgewählt{% endif %} + {% if image.is_excluded %}Ausgeblendet{% endif %} + {% if image.is_irrelevant_hint %}evtl. irrelevant{% endif %} +
    +
    +
    + + + +
    + {% if not image.is_excluded %} +
    + + + +
    + {% else %} +
    + + + +
    + {% endif %} +
    + +
    {% endfor %} -
+ {% endif %} {% if article.press_contact or article.extraction.press_contact %}

Pressekontakt

diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 27bcaf5..34a0f84 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -155,6 +155,10 @@
{{ a.status }}
Legal: {{ "OK" if a.legal_checked else "offen" }}
+ {% if a.selected_image_url %} +
Hauptbild gesetzt
+ Hauptbild + {% endif %} {% if a.summary %}
Summary: {{ a.summary }}
{% endif %} diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index f52d917..ac8a615 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -8,7 +8,15 @@ from fastapi.testclient import TestClient from backend.app import config as config_module from backend.app.db import init_db from backend.app.main import app -from backend.app.repositories import ArticleUpsert, FeedCreate, SourceCreate, create_feed, create_source, upsert_article +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + upsert_article, +) class TestAdminUi(unittest.TestCase): @@ -119,6 +127,18 @@ class TestAdminUi(unittest.TestCase): self.assertIn("Artikel-Detail", res.text) self.assertIn("Rechts-Checkliste", res.text) + decision = self.client.post( + f"/admin/articles/{article_id}/images/decision", + data={"image_url": "https://example.org/img.jpg", "action": "select"}, + follow_redirects=True, + ) + self.assertEqual(decision.status_code, 200) + self.assertIn("Ausgewähltes Hauptbild", decision.text) + + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertIn("selected_url", article.get("meta_json", "")) + if __name__ == "__main__": unittest.main() From 910ca72c818ebd85b039dce7ef1367204cbc957c Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:16:30 +0100 Subject: [PATCH 06/54] fix(ui): render article images via authenticated proxy thumbnails --- backend/app/admin_ui.py | 33 ++++++++++++++++++++- backend/templates/admin_article_detail.html | 5 +++- backend/templates/admin_dashboard.html | 2 +- backend/tests/test_admin_ui.py | 32 ++++++++++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index b8a1777..04c6db4 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -4,9 +4,10 @@ import json from pathlib import Path import re 
from urllib.parse import urlencode +from urllib.request import Request as UrlRequest, urlopen from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.responses import HTMLResponse, RedirectResponse, Response from fastapi.templating import Jinja2Templates from .auth import create_session_token, verify_credentials, verify_session_token @@ -41,6 +42,7 @@ ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { "published": ("error",), "error": ("review", "rewrite"), } +IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0" def _admin_user(request: Request) -> str | None: @@ -134,6 +136,7 @@ def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[di entries.append( { "url": url, + "proxy_url": f"/admin/images/proxy?{urlencode({'url': url})}", "is_selected": selected_url == url, "is_excluded": url in excluded_set, "is_irrelevant_hint": _is_probably_irrelevant_image(url), @@ -276,6 +279,9 @@ def admin_dashboard(request: Request): article["image_entries"] = _build_image_entries(article, extraction, meta) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + article["selected_image_proxy_url"] = ( + f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None + ) if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): article["press_contact"] = extraction.get("press_contact") article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None @@ -323,6 +329,9 @@ def admin_article_detail(request: Request, article_id: int): article["image_entries"] = _build_image_entries(article, extraction, meta) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) 
else {} article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + article["selected_image_proxy_url"] = ( + f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None + ) article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None @@ -360,6 +369,28 @@ def admin_article_image_decision( return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) +@router.get("/admin/images/proxy") +def admin_image_proxy(request: Request, url: str): + user = _admin_user(request) + if not user: + return Response(status_code=401) + + if not (url.startswith("http://") or url.startswith("https://")): + return Response(status_code=400) + + try: + req = UrlRequest(url=url, headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Referer": url}) + with urlopen(req, timeout=10) as resp: + body = resp.read() + content_type = resp.headers.get("Content-Type", "application/octet-stream") + except Exception: + return Response(status_code=404) + + if not content_type.lower().startswith("image/"): + return Response(status_code=415) + return Response(content=body, media_type=content_type) + + @router.post("/admin/articles/{article_id}/legal-review") def admin_article_legal_review(request: Request, article_id: int, approved: str = Form("0"), note: str = Form("")): user = _admin_user(request) diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 7acb5c1..bdfc0af 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -70,13 +70,16 @@

Bilder: {{ article.image_entries|length if article.image_entries else 0 }}

{% if article.selected_image_url %}

Ausgewähltes Hauptbild: {{ article.selected_image_url }}

+ {% if article.selected_image_proxy_url %} + Ausgewähltes Hauptbild + {% endif %} {% endif %} {% if article.image_entries %}
{% for image in article.image_entries %}
- Artikelbild + Artikelbild
{% if image.is_selected %}Ausgewählt{% endif %} diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 34a0f84..6709f65 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -157,7 +157,7 @@
Legal: {{ "OK" if a.legal_checked else "offen" }}
{% if a.selected_image_url %}
Hauptbild gesetzt
- Hauptbild + Hauptbild {% endif %} {% if a.summary %}
Summary: {{ a.summary }}
diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index ac8a615..666f0ea 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -2,6 +2,7 @@ import os import tempfile import unittest from pathlib import Path +from unittest.mock import patch from fastapi.testclient import TestClient @@ -139,6 +140,37 @@ class TestAdminUi(unittest.TestCase): self.assertIsNotNone(article) self.assertIn("selected_url", article.get("meta_json", "")) + @patch("backend.app.admin_ui.urlopen") + def test_image_proxy_returns_image_data(self, mock_urlopen) -> None: + class _FakeHeaders: + def get(self, key: str, default=None): + if key.lower() == "content-type": + return "image/jpeg" + return default + + class _FakeResponse: + headers = _FakeHeaders() + + def read(self): + return b"\xff\xd8\xff\xd9" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + mock_urlopen.return_value = _FakeResponse() + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get("/admin/images/proxy?url=https%3A%2F%2Fexample.org%2Fimg.jpg") + self.assertEqual(res.status_code, 200) + self.assertIn("image/jpeg", res.headers.get("content-type", "")) + if __name__ == "__main__": unittest.main() From fb3465fb10d860344393fa770307c9b1c36ee321 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:20:47 +0100 Subject: [PATCH 07/54] fix(images): add proxy fallback to direct source url rendering --- backend/app/admin_ui.py | 25 ++++++++++++++++----- backend/static/admin.css | 5 +++++ backend/templates/admin_article_detail.html | 2 +- backend/templates/admin_dashboard.html | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 04c6db4..e108d3d 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -3,6 +3,7 @@ from __future__ import 
annotations import json from pathlib import Path import re +from urllib.parse import urlparse from urllib.parse import urlencode from urllib.request import Request as UrlRequest, urlopen @@ -124,6 +125,14 @@ def _is_probably_irrelevant_image(url: str) -> bool: return any(re.search(pattern, lowered) for pattern in patterns) +def _is_http_image_url(url: str) -> bool: + try: + parsed = urlparse(url) + except Exception: + return False + return parsed.scheme in {"http", "https"} and bool(parsed.netloc) + + def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]: all_images = _read_article_images(article, extraction) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} @@ -371,15 +380,19 @@ def admin_article_image_decision( @router.get("/admin/images/proxy") def admin_image_proxy(request: Request, url: str): - user = _admin_user(request) - if not user: - return Response(status_code=401) - - if not (url.startswith("http://") or url.startswith("https://")): + if not _is_http_image_url(url): return Response(status_code=400) try: - req = UrlRequest(url=url, headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Referer": url}) + referer = request.headers.get("referer", "") + req = UrlRequest( + url=url, + headers={ + "User-Agent": IMAGE_PROXY_USER_AGENT, + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + "Referer": referer or url, + }, + ) with urlopen(req, timeout=10) as resp: body = resp.read() content_type = resp.headers.get("Content-Type", "application/octet-stream") diff --git a/backend/static/admin.css b/backend/static/admin.css index 402c067..705aeda 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -210,6 +210,11 @@ button.secondary { background: #f8fafc; } +.img-failed { + opacity: 0.3; + filter: grayscale(1); +} + .image-meta { margin-top: 6px; display: flex; diff --git a/backend/templates/admin_article_detail.html 
b/backend/templates/admin_article_detail.html index bdfc0af..a38937b 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -79,7 +79,7 @@ {% for image in article.image_entries %}
- Artikelbild + Artikelbild
{% if image.is_selected %}Ausgewählt{% endif %} diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 6709f65..5dad3f5 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -157,7 +157,7 @@
Legal: {{ "OK" if a.legal_checked else "offen" }}
{% if a.selected_image_url %}
Hauptbild gesetzt
- Hauptbild + Hauptbild {% endif %} {% if a.summary %}
Summary: {{ a.summary }}
From 26e3d26b93b440b5b3460f6023b39ba8b99dddaf Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:40:39 +0100 Subject: [PATCH 08/54] feat(images): auto-select relevant article images and tidy detail header --- backend/app/ingestion.py | 82 ++++++++++++++++++++- backend/static/admin.css | 21 ++++++ backend/templates/admin_article_detail.html | 24 +++--- backend/tests/test_ingestion.py | 1 + 4 files changed, 115 insertions(+), 13 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 37703de..8a7696a 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -4,8 +4,10 @@ from dataclasses import dataclass from datetime import datetime, timezone import hashlib import json +import re import time from typing import Any +from urllib.parse import unquote, urlparse import feedparser @@ -67,6 +69,72 @@ def _parsed_get(parsed: object, key: str, default: object = None) -> object: return getattr(parsed, key, default) +def _normalize_tokens(text: str) -> set[str]: + normalized = re.sub(r"[^a-z0-9]+", " ", text.lower()) + return {token for token in normalized.split() if len(token) >= 4} + + +def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: + source_host = (urlparse(source_url).hostname or "").lower() + is_presseportal = "presseportal.de" in source_host + title_tokens = _normalize_tokens(title) + blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel") + + ranked: list[dict[str, Any]] = [] + for url in images: + parsed = urlparse(url) + path = unquote(parsed.path.lower()) + full = f"{parsed.netloc.lower()}{path}" + score = 0 + reasons: list[str] = [] + + if any(token in full for token in blocked_patterns): + score -= 150 + reasons.append("blocked-pattern") + + if is_presseportal and "/thumbnail/story_big/" in path: + score += 120 + reasons.append("presseportal-story-big") + elif is_presseportal and 
"/thumbnail/highlight/" in path: + score += 45 + reasons.append("presseportal-highlight") + elif is_presseportal and "/thumbnail/liste/" in path: + score -= 40 + reasons.append("presseportal-list") + + if "crop=" in (parsed.query or "").lower(): + score -= 10 + reasons.append("cropped-preview") + + path_tokens = _normalize_tokens(path.replace("-", " ")) + overlap = len(title_tokens.intersection(path_tokens)) + if overlap > 0: + score += min(30, overlap * 6) + reasons.append(f"title-match:{overlap}") + + ranked.append({"url": url, "score": score, "reasons": reasons}) + + ranked.sort(key=lambda item: item["score"], reverse=True) + return ranked + + +def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]: + # dedupe incoming order first + deduped: list[str] = [] + seen: set[str] = set() + for image in images: + if image and image not in seen: + seen.add(image) + deduped.append(image) + + ranked = _rank_image_candidates(source_url, title, deduped) + kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep] + if not kept and ranked: + kept = [ranked[0]["url"]] + primary = kept[0] if kept else None + return kept, primary, ranked + + def run_ingestion(feed_id: int | None = None) -> IngestionStats: run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) feeds_processed = 0 @@ -167,6 +235,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: final_summary = extracted.summary or (summary[:1000] if summary else None) final_content_raw = extracted.content_text or content_raw final_canonical = extracted.canonical_url or entry.get("link") + selected_images, primary_image, ranked_images = _select_relevant_images( + link, + final_title, + extracted.images, + max_keep=3, + ) source_hash = _entry_hash( entry, @@ -188,6 +262,12 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: } extraction_meta: dict[str, Any] 
= extracted_article_to_meta(extracted) extraction_meta["fetched_from"] = link + extraction_meta["image_selection"] = { + "primary": primary_image, + "selected_count": len(selected_images), + "total_candidates": len(extracted.images), + "ranked": ranked_images, + } article_id = upsert_article( ArticleUpsert( feed_id=int(feed["id"]), @@ -201,7 +281,7 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: summary=final_summary, content_raw=final_content_raw, content_rewritten=None, - image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None, + image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, press_contact=extracted.press_contact, source_name_snapshot=feed.get("source_name"), source_terms_url_snapshot=feed.get("source_terms_url"), diff --git a/backend/static/admin.css b/backend/static/admin.css index 705aeda..16d55be 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -179,6 +179,27 @@ button.secondary { background: #f8fafc; } +.detail-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 8px 12px; + margin-bottom: 10px; +} + +.detail-item { + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + display: grid; + gap: 4px; +} + +.detail-item .k { + font-size: 12px; + color: #64748b; +} + .thumb { width: 72px; height: 72px; diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index a38937b..29c054b 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -23,19 +23,19 @@

{{ article.title }}

-

Status: {{ article.status }}

-

Artikel-Datum: {{ article.published_at or "-" }}

-

Alter: {{ article.days_old if article.days_old is not none else "-" }} Tage

-

Relevanz: {{ article.relevance }}

-

Autor: {{ article.author or "-" }}

-

Feed: {{ feed.name if feed else "-" }}

-

Quelle Snapshot: {{ article.source_name_snapshot or "-" }}

-

Lizenz Snapshot: {{ article.source_license_name_snapshot or "-" }}

-

Terms Snapshot: {{ article.source_terms_url_snapshot or "-" }}

+
+
Status{{ article.status }}
+
Artikel-Datum{{ article.published_at or "-" }}
+
Alter{{ article.days_old if article.days_old is not none else "-" }} Tage
+
Relevanz{{ article.relevance }}
+
Autor{{ article.author or "-" }}
+
Feed{{ feed.name if feed else "-" }}
+
Quelle Snapshot{{ article.source_name_snapshot or "-" }}
+
Lizenz Snapshot{{ article.source_license_name_snapshot or "-" }}
+
Terms Snapshot{{ article.source_terms_url_snapshot or "-" }}
+

Quelle: {{ article.source_url }}

- {% if article.canonical_url %} -

Canonical: {{ article.canonical_url }}

- {% endif %} + {% if article.canonical_url %}

Canonical: {{ article.canonical_url }}

{% endif %} {% if article.summary %}

Summary: {{ article.summary }}

{% endif %} diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py index 05b2c2b..342c216 100644 --- a/backend/tests/test_ingestion.py +++ b/backend/tests/test_ingestion.py @@ -85,6 +85,7 @@ class TestIngestion(unittest.TestCase): self.assertEqual(article["author"], "Autorin A") self.assertIn("Original Volltext", article["content_raw"] or "") self.assertIn("Pressekontakt", article["meta_json"] or "") + self.assertIsNotNone(article["image_urls_json"]) @patch("backend.app.ingestion.extract_article") @patch("backend.app.ingestion.feedparser.parse") From dcdf4d954a78b6a32c6f89a29d128f0eabc783ae Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:43:17 +0100 Subject: [PATCH 09/54] feat(ui): show auto image ranking reasons in article detail --- backend/app/admin_ui.py | 1 + backend/templates/admin_article_detail.html | 23 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index e108d3d..f276b8c 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -335,6 +335,7 @@ def admin_article_detail(request: Request, article_id: int): if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): article["press_contact"] = extraction.get("press_contact") article["extraction"] = extraction + article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {} article["image_entries"] = _build_image_entries(article, extraction, meta) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 29c054b..86fb7af 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html 
@@ -75,6 +75,29 @@ {% endif %} {% endif %} {% if article.image_entries %} + {% if article.image_selection %} +
+ Automatische Bildauswahl (Score + Gründe) +
Primärbild (Auto): {{ article.image_selection.primary or "-" }}
+
Ausgewählt: {{ article.image_selection.selected_count or 0 }} / Kandidaten: {{ article.image_selection.total_candidates or 0 }}
+ {% if article.image_selection.ranked %} + + + + + + {% for r in article.image_selection.ranked %} + + + + + + {% endfor %} + +
BildScoreGründe
{{ r.url }}{{ r.score }}{{ r.reasons|join(", ") if r.reasons else "-" }}
+ {% endif %} +
+ {% endif %}
{% for image in article.image_entries %}
From 1cee56205e3febeb40f3142c0539a977d6a6c9bf Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 10:49:43 +0100 Subject: [PATCH 10/54] feat(publisher): add wordpress draft queue with retry and admin controls --- backend/app/admin_ui.py | 32 ++++ backend/app/config.py | 5 + backend/app/db.py | 50 ++++++ backend/app/ingestion.py | 5 + backend/app/main.py | 50 ++++++ backend/app/publisher.py | 103 ++++++++++++ backend/app/repositories.py | 177 +++++++++++++++++++- backend/app/wordpress.py | 111 ++++++++++++ backend/templates/admin_article_detail.html | 25 +++ backend/templates/admin_dashboard.html | 37 ++++ backend/tests/test_admin_ui.py | 5 + backend/tests/test_db_repositories.py | 10 ++ backend/tests/test_publisher.py | 112 +++++++++++++ 13 files changed, 719 insertions(+), 3 deletions(-) create mode 100644 backend/app/publisher.py create mode 100644 backend/app/wordpress.py create mode 100644 backend/tests/test_publisher.py diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index f276b8c..8d8e879 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -15,6 +15,7 @@ from .auth import create_session_token, verify_credentials, verify_session_token from .config import get_settings from .ingestion import run_ingestion from .policy import evaluate_source_policy +from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance from .repositories import ( FeedCreate, @@ -25,6 +26,7 @@ from .repositories import ( get_feed_by_id, list_articles, list_feeds, + list_publish_jobs, list_runs, list_sources, set_article_image_decision, @@ -273,6 +275,7 @@ def admin_dashboard(request: Request): source_policy = {s["id"]: evaluate_source_policy(s) for s in sources} feeds = list_feeds() runs = list_runs(limit=30) + publish_jobs = list_publish_jobs(limit=30) status_filter = request.query_params.get("status_filter") if status_filter in {"new", "rewrite", "review", "approved", "published", "error"}: 
articles = list_articles(limit=100, status_filter=status_filter) @@ -308,6 +311,7 @@ def admin_dashboard(request: Request): "source_policy": source_policy, "feeds": feeds, "runs": runs, + "publish_jobs": publish_jobs, "articles": articles, "status_options": ["new", "rewrite", "review", "approved", "published", "error"], "allowed_transitions": ALLOWED_TRANSITIONS, @@ -358,6 +362,8 @@ def admin_article_detail(request: Request, article_id: int): "feed": feed, "checklist": checklist, "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status"), ()), + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), }, ) @@ -379,6 +385,32 @@ def admin_article_image_decision( return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) +@router.post("/admin/articles/{article_id}/publish-enqueue") +def admin_enqueue_publish(request: Request, article_id: int, max_attempts: str = Form("3")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + job_id = enqueue_publish(article_id=article_id, max_attempts=max(1, int(max_attempts))) + except Exception as exc: + return _dashboard_redirect(msg=f"Publish Queue Fehler fuer Artikel #{article_id}: {exc}", msg_type="error") + return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Publish-Job%20#{job_id}%20erstellt&type=success", status_code=303) + + +@router.post("/admin/publisher/run") +def admin_run_publisher(request: Request, max_jobs: str = Form("10")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + stats = run_publisher(max_jobs=max(1, int(max_jobs))) + except Exception as exc: + return _dashboard_redirect(msg=f"Publisher Fehler: {exc}", msg_type="error") + return _dashboard_redirect( + msg=f"Publisher: processed={stats.processed}, success={stats.success}, failed={stats.failed}, requeued={stats.requeued}" + ) + + 
@router.get("/admin/images/proxy") def admin_image_proxy(request: Request, url: str): if not _is_http_image_url(url): diff --git a/backend/app/config.py b/backend/app/config.py index f32b8c4..40deedb 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -23,6 +23,11 @@ class Settings(BaseSettings): app_db_path: str = "backend/data/rss_news.db" + wordpress_base_url: str | None = None + wordpress_username: str | None = None + wordpress_app_password: str | None = None + wordpress_default_status: str = "draft" + @lru_cache(maxsize=1) def get_settings() -> Settings: diff --git a/backend/app/db.py b/backend/app/db.py index 27bbc10..d2ebfd5 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -68,6 +68,21 @@ def init_db() -> None: details TEXT ); + CREATE TABLE IF NOT EXISTS publish_jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + article_id INTEGER NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + attempts INTEGER NOT NULL DEFAULT 0, + max_attempts INTEGER NOT NULL DEFAULT 3, + error_message TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + started_at TEXT, + finished_at TEXT, + FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE + ); + CREATE TABLE IF NOT EXISTS articles ( id INTEGER PRIMARY KEY AUTOINCREMENT, feed_id INTEGER, @@ -89,6 +104,11 @@ def init_db() -> None: legal_checked INTEGER NOT NULL DEFAULT 0, legal_checked_at TEXT, legal_note TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + publish_attempts INTEGER NOT NULL DEFAULT 0, + publish_last_error TEXT, + published_to_wp_at TEXT, word_count INTEGER DEFAULT 0, status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), meta_json TEXT, @@ -110,6 +130,7 @@ def init_db() -> None: CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id); CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); CREATE INDEX IF 
NOT EXISTS idx_articles_published_at ON articles(published_at); + CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at AFTER UPDATE ON sources @@ -148,11 +169,40 @@ def init_db() -> None: "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0", "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT", "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT", + "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER", + "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT", + "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0", + "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT", + "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT", } for column, ddl in migration_columns.items(): if column not in existing_columns: conn.execute(ddl) + table_rows = conn.execute( + "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'" + ).fetchall() + if not table_rows: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS publish_jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + article_id INTEGER NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + attempts INTEGER NOT NULL DEFAULT 0, + max_attempts INTEGER NOT NULL DEFAULT 3, + error_message TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + started_at TEXT, + finished_at TEXT, + FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); + """ + ) + def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]: return [dict(r) for r in rows] diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 
8a7696a..872a1b0 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -289,6 +289,11 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: legal_checked=False, legal_checked_at=None, legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, word_count=len((final_content_raw or "").split()), status="new", meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), diff --git a/backend/app/main.py b/backend/app/main.py index 177c312..c0a0143 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -16,6 +16,7 @@ from .config import get_settings from .db import init_db from .ingestion import run_ingestion from .policy import evaluate_source_policy, is_source_allowed +from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance from .repositories import ( ArticleUpsert, @@ -30,6 +31,7 @@ from .repositories import ( get_feed_by_id, get_run_by_id, get_source_by_id, + list_publish_jobs, list_articles as repo_list_articles, list_feeds as repo_list_feeds, list_runs, @@ -111,6 +113,11 @@ class ArticleUpsertRequest(BaseModel): legal_checked: bool = False legal_checked_at: str | None = None legal_note: str | None = None + wp_post_id: int | None = None + wp_post_url: str | None = None + publish_attempts: int = 0 + publish_last_error: str | None = None + published_to_wp_at: str | None = None word_count: int = 0 status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$") meta_json: str | None = None @@ -135,6 +142,15 @@ class ArticleLegalReviewRequest(BaseModel): note: str | None = None +class PublisherEnqueueRequest(BaseModel): + article_id: int + max_attempts: int = 3 + + +class PublisherRunRequest(BaseModel): + max_jobs: int = 10 + + ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = { "new": {"review", "rewrite", "error"}, "rewrite": 
{"review", "error"}, @@ -446,6 +462,11 @@ def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(re legal_checked=payload.legal_checked, legal_checked_at=payload.legal_checked_at, legal_note=payload.legal_note, + wp_post_id=payload.wp_post_id, + wp_post_url=payload.wp_post_url, + publish_attempts=payload.publish_attempts, + publish_last_error=payload.publish_last_error, + published_to_wp_at=payload.published_to_wp_at, word_count=payload.word_count, status=payload.status, meta_json=payload.meta_json, @@ -495,6 +516,35 @@ def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest } +@app.get("/api/publisher/jobs") +def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": list_publish_jobs(limit=limit), "requested_by": username} + + +@app.post("/api/publisher/enqueue") +def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(payload.article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + job_id = enqueue_publish(article_id=payload.article_id, max_attempts=payload.max_attempts) + return {"ok": True, "job_id": job_id, "article_id": payload.article_id, "requested_by": username} + + +@app.post("/api/publisher/run") +def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict: + stats = run_publisher(max_jobs=payload.max_jobs) + return { + "ok": True, + "requested_by": username, + "stats": { + "processed": stats.processed, + "success": stats.success, + "failed": stats.failed, + "requeued": stats.requeued, + }, + } + + @app.post("/api/articles/{article_id}/review") def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: article = get_article_by_id(article_id) diff --git a/backend/app/publisher.py 
b/backend/app/publisher.py new file mode 100644 index 0000000..06cc8f2 --- /dev/null +++ b/backend/app/publisher.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .repositories import ( + claim_next_publish_job, + complete_publish_job, + create_publish_job, + fail_publish_job, + get_article_by_id, + mark_article_publish_result, + PublishJobCreate, +) +from .wordpress import publish_article_draft, selected_image_exists + + +@dataclass(frozen=True) +class PublisherStats: + processed: int + success: int + failed: int + requeued: int + + +def enqueue_publish(article_id: int, max_attempts: int = 3) -> int: + return create_publish_job(PublishJobCreate(article_id=article_id, max_attempts=max_attempts)) + + +def _can_publish(article: dict) -> tuple[bool, str | None]: + if article.get("status") not in {"approved", "published"}: + return False, "Artikelstatus muss 'approved' sein" + if int(article.get("legal_checked", 0)) != 1: + return False, "Rechtsfreigabe fehlt" + if not selected_image_exists(article): + return False, "Hauptbild nicht gesetzt" + return True, None + + +def run_publisher(max_jobs: int = 10) -> PublisherStats: + processed = 0 + success = 0 + failed = 0 + requeued = 0 + + for _ in range(max(1, max_jobs)): + job = claim_next_publish_job() + if not job: + break + processed += 1 + job_id = int(job["id"]) + article_id = int(job["article_id"]) + + article = get_article_by_id(article_id) + if not article: + fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False) + failed += 1 + continue + + allowed, reason = _can_publish(article) + if not allowed: + fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False) + mark_article_publish_result( + article_id, + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + error=reason or "blocked", + increment_attempts=True, + set_published_status=False, + ) + failed += 1 + continue + + try: + wp_post_id, 
wp_post_url = publish_article_draft(article) + complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url) + mark_article_publish_result( + article_id, + wp_post_id=wp_post_id, + wp_post_url=wp_post_url, + error=None, + increment_attempts=True, + set_published_status=True, + ) + success += 1 + except Exception as exc: + attempts = int(job.get("attempts", 1)) + max_attempts = int(job.get("max_attempts", 3)) + should_requeue = attempts < max_attempts + fail_publish_job(job_id, str(exc), requeue=should_requeue) + mark_article_publish_result( + article_id, + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + error=str(exc), + increment_attempts=True, + set_published_status=False, + ) + if should_requeue: + requeued += 1 + else: + failed += 1 + + return PublisherStats(processed=processed, success=success, failed=failed, requeued=requeued) diff --git a/backend/app/repositories.py b/backend/app/repositories.py index 164fc79..ca25821 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -56,11 +56,22 @@ class ArticleUpsert: legal_checked: bool legal_checked_at: str | None legal_note: str | None + wp_post_id: int | None + wp_post_url: str | None + publish_attempts: int + publish_last_error: str | None + published_to_wp_at: str | None word_count: int status: str meta_json: str | None +@dataclass(frozen=True) +class PublishJobCreate: + article_id: int + max_attempts: int = 3 + + def create_source(payload: SourceCreate) -> int: with get_conn() as conn: cur = conn.execute( @@ -235,6 +246,7 @@ def get_article_by_id(article_id: int) -> dict[str, Any] | None: a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at, a.word_count, a.status, a.meta_json, 
a.created_at, a.updated_at FROM articles a WHERE a.id = ? @@ -375,6 +387,147 @@ def set_article_image_decision(article_id: int, image_url: str, action: str, act return True +def create_publish_job(payload: PublishJobCreate) -> int: + with get_conn() as conn: + existing = conn.execute( + """ + SELECT id FROM publish_jobs + WHERE article_id = ? AND status IN ('queued', 'running') + ORDER BY id DESC + LIMIT 1 + """, + (payload.article_id,), + ).fetchone() + if existing: + return int(existing["id"]) + + cur = conn.execute( + """ + INSERT INTO publish_jobs (article_id, status, attempts, max_attempts) + VALUES (?, 'queued', 0, ?) + """, + (payload.article_id, max(1, payload.max_attempts)), + ) + return int(cur.lastrowid) + + +def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]: + safe_limit = max(1, min(limit, 500)) + with get_conn() as conn: + rows = conn.execute( + """ + SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url, + j.created_at, j.started_at, j.finished_at, a.title AS article_title + FROM publish_jobs j + LEFT JOIN articles a ON a.id = j.article_id + ORDER BY j.id DESC + LIMIT ? + """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) + + +def claim_next_publish_job() -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url + FROM publish_jobs + WHERE status = 'queued' AND attempts < max_attempts + ORDER BY id ASC + LIMIT 1 + """ + ).fetchone() + if not row: + return None + job_id = int(row["id"]) + conn.execute( + """ + UPDATE publish_jobs + SET status = 'running', + attempts = attempts + 1, + started_at = datetime('now'), + finished_at = NULL + WHERE id = ? + """, + (job_id,), + ) + claimed = conn.execute( + """ + SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url + FROM publish_jobs + WHERE id = ? 
+ """, + (job_id,), + ).fetchone() + return dict(claimed) if claimed else None + + +def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE publish_jobs + SET status = 'success', + wp_post_id = ?, + wp_post_url = ?, + error_message = NULL, + finished_at = datetime('now') + WHERE id = ? + """, + (wp_post_id, wp_post_url, job_id), + ) + + +def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None: + next_status = "queued" if requeue else "failed" + with get_conn() as conn: + conn.execute( + """ + UPDATE publish_jobs + SET status = ?, + error_message = ?, + finished_at = datetime('now') + WHERE id = ? + """, + (next_status, error_message[:2000], job_id), + ) + + +def mark_article_publish_result( + article_id: int, + *, + wp_post_id: int | None, + wp_post_url: str | None, + error: str | None, + increment_attempts: bool, + set_published_status: bool, +) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE articles + SET wp_post_id = ?, + wp_post_url = ?, + publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END, + publish_last_error = ?, + published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END, + status = CASE WHEN ? THEN 'published' ELSE status END + WHERE id = ? 
+ """, + ( + wp_post_id, + wp_post_url, + 1 if increment_attempts else 0, + error[:2000] if error else None, + wp_post_id, + 1 if set_published_status else 0, + article_id, + ), + ) + + def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: with get_conn() as conn: # 1) strongest key: source_url @@ -417,8 +570,9 @@ def upsert_article(payload: ArticleUpsert) -> int: summary, content_raw, content_rewritten, image_urls_json, press_contact, source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, legal_checked, legal_checked_at, legal_note, + wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at, word_count, status, meta_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( payload.feed_id, @@ -440,6 +594,11 @@ def upsert_article(payload: ArticleUpsert) -> int: 1 if payload.legal_checked else 0, payload.legal_checked_at, payload.legal_note, + payload.wp_post_id, + payload.wp_post_url, + payload.publish_attempts, + payload.publish_last_error, + payload.published_to_wp_at, payload.word_count, payload.status, payload.meta_json, @@ -469,6 +628,11 @@ def upsert_article(payload: ArticleUpsert) -> int: legal_checked = ?, legal_checked_at = ?, legal_note = ?, + wp_post_id = ?, + wp_post_url = ?, + publish_attempts = ?, + publish_last_error = ?, + published_to_wp_at = ?, word_count = ?, status = ?, meta_json = ? 
@@ -494,6 +658,11 @@ def upsert_article(payload: ArticleUpsert) -> int: 1 if payload.legal_checked else 0, payload.legal_checked_at, payload.legal_note, + payload.wp_post_id, + payload.wp_post_url, + payload.publish_attempts, + payload.publish_last_error, + payload.published_to_wp_at, payload.word_count, payload.status, payload.meta_json, @@ -515,7 +684,8 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, - a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at FROM articles a LEFT JOIN feeds f ON f.id = a.feed_id WHERE a.status = ? 
@@ -530,7 +700,8 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, - a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at FROM articles a LEFT JOIN feeds f ON f.id = a.feed_id ORDER BY a.id DESC diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py new file mode 100644 index 0000000..adb4d9c --- /dev/null +++ b/backend/app/wordpress.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import base64 +import json +from typing import Any +from urllib.request import Request, urlopen + +from .config import get_settings + + +def _auth_header(username: str, app_password: str) -> str: + token = base64.b64encode(f"{username}:{app_password}".encode("utf-8")).decode("ascii") + return f"Basic {token}" + + +def _wp_request( + *, + base_url: str, + auth_header: str, + method: str, + endpoint: str, + payload: dict[str, Any] | None = None, +) -> dict[str, Any]: + url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}" + data = json.dumps(payload).encode("utf-8") if payload is not None else None + req = Request( + url=url, + data=data, + method=method, + headers={ + "Authorization": auth_header, + "Content-Type": "application/json; charset=utf-8", + "Accept": "application/json", + "User-Agent": "rss-news-publisher/1.0", + }, + ) + with urlopen(req, timeout=20) as resp: + raw = resp.read().decode("utf-8", errors="replace") + parsed = json.loads(raw) if raw else {} + return parsed if 
isinstance(parsed, dict) else {} + + +def _selected_image_url_from_meta(meta_json: str | None) -> str | None: + if not meta_json: + return None + try: + meta = json.loads(meta_json) + except Exception: + return None + if not isinstance(meta, dict): + return None + image_review = meta.get("image_review") + if not isinstance(image_review, dict): + return None + selected = image_review.get("selected_url") + return selected if isinstance(selected, str) and selected.strip() else None + + +def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: + settings = get_settings() + if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: + raise RuntimeError("WordPress Konfiguration fehlt (base_url, username, app_password)") + + auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) + + source_url = article.get("source_url") or "" + canonical_url = article.get("canonical_url") or source_url + title = (article.get("title") or "Ohne Titel").strip() + body = (article.get("content_rewritten") or article.get("content_raw") or "").strip() + if not body: + body = article.get("summary") or "" + + footer = "\n\n
\n

Quelle: " + footer += f"{source_url}

" + if canonical_url and canonical_url != source_url: + footer += f"\n

Canonical: {canonical_url}

" + content = f"{body}{footer}" + + payload = { + "title": title, + "content": content, + "status": settings.wordpress_default_status, + } + + wp_post_id = article.get("wp_post_id") + if wp_post_id: + result = _wp_request( + base_url=settings.wordpress_base_url, + auth_header=auth, + method="POST", + endpoint=f"posts/{int(wp_post_id)}", + payload=payload, + ) + else: + result = _wp_request( + base_url=settings.wordpress_base_url, + auth_header=auth, + method="POST", + endpoint="posts", + payload=payload, + ) + + post_id = int(result.get("id", 0)) + if post_id <= 0: + raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}") + post_url = result.get("link") + return post_id, post_url if isinstance(post_url, str) else None + + +def selected_image_exists(article: dict[str, Any]) -> bool: + return _selected_image_url_from_meta(article.get("meta_json")) is not None diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 86fb7af..a5943ef 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -21,6 +21,12 @@
+ {% if flash_msg %} +
+ {{ flash_msg }} +
+ {% endif %} +

{{ article.title }}

@@ -39,6 +45,16 @@ {% if article.summary %}

Summary: {{ article.summary }}

{% endif %} +

WordPress Post: + {% if article.wp_post_url %} + #{{ article.wp_post_id }} + {% elif article.wp_post_id %} + #{{ article.wp_post_id }} + {% else %} + - + {% endif %} +

+

Publish Attempts: {{ article.publish_attempts or 0 }} | Letzter Fehler: {{ article.publish_last_error or "-" }}

@@ -184,6 +200,15 @@
+ +
+

WordPress Publish Queue

+

Voraussetzungen: Status `approved`, Rechtsfreigabe aktiv, Hauptbild gesetzt.

+
+ + +
+
diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 5dad3f5..d47628f 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -91,6 +91,14 @@
+
+

Publisher ausführen

+
+ + +
+
+

Quellen + Policy

@@ -239,6 +247,35 @@
+ +
+

Publish Jobs

+ + + + + + {% for j in publish_jobs %} + + + + + + + + + {% endfor %} + +
IDArtikelStatusAttemptsWP PostFehler
{{ j.id }}#{{ j.article_id }} {{ j.article_title or "-" }}{{ j.status }}{{ j.attempts }}/{{ j.max_attempts }} + {% if j.wp_post_url %} + #{{ j.wp_post_id }} + {% elif j.wp_post_id %} + #{{ j.wp_post_id }} + {% else %} + - + {% endif %} + {{ j.error_message or "-" }}
+
diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index 666f0ea..af47046 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -112,6 +112,11 @@ class TestAdminUi(unittest.TestCase): legal_checked=False, legal_checked_at=None, legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, word_count=2, status="new", meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}', diff --git a/backend/tests/test_db_repositories.py b/backend/tests/test_db_repositories.py index 5b60358..91436c6 100644 --- a/backend/tests/test_db_repositories.py +++ b/backend/tests/test_db_repositories.py @@ -85,6 +85,11 @@ class TestSQLiteRepositories(unittest.TestCase): legal_checked=False, legal_checked_at=None, legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, word_count=120, status="review", meta_json='{"lang":"de"}', @@ -114,6 +119,11 @@ class TestSQLiteRepositories(unittest.TestCase): legal_checked=True, legal_checked_at="2026-02-18T00:10:00Z", legal_note="ok", + wp_post_id=123, + wp_post_url="https://example.org/wp/123", + publish_attempts=1, + publish_last_error=None, + published_to_wp_at="2026-02-18T00:12:00Z", word_count=140, status="approved", meta_json='{"lang":"de","v":2}', diff --git a/backend/tests/test_publisher.py b/backend/tests/test_publisher.py new file mode 100644 index 0000000..a32150e --- /dev/null +++ b/backend/tests/test_publisher.py @@ -0,0 +1,112 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestPublisher(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = 
tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "publisher.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + os.environ["WORDPRESS_BASE_URL"] = "https://example.org" + os.environ["WORDPRESS_USERNAME"] = "wp-user" + os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + for key in ( + "APP_DB_PATH", + "APP_ADMIN_USERNAME", + "APP_ADMIN_PASSWORD", + "WORDPRESS_BASE_URL", + "WORDPRESS_USERNAME", + "WORDPRESS_APP_PASSWORD", + ): + os.environ.pop(key, None) + self.tmp_dir.cleanup() + + def _create_publishable_article(self) -> int: + source = self.client.post( + "/api/sources", + json={ + "name": "WP Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + source_id = source.json()["id"] + feed = self.client.post( + "/api/feeds", + json={"name": "WP Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "pub-1", + "source_hash": "pub-hash-1", + "title": "Publish Artikel", + "source_url": "https://example.org/article/1", + "canonical_url": "https://example.org/article/1", + "published_at": "2026-02-18T00:00:00Z", + "author": "Autor", + "summary": "Kurz", + "content_raw": "Langtext", + "image_urls_json": "[\"https://example.org/img.jpg\"]", + "press_contact": "Kontakt", + "source_name_snapshot": "WP Source", + "source_terms_url_snapshot": "https://example.org/terms", + "source_license_name_snapshot": 
"cc-by", + "legal_checked": True, + "status": "approved", + "meta_json": "{\"image_review\":{\"selected_url\":\"https://example.org/img.jpg\"}}", + }, + ) + return article.json()["id"] + + @patch("backend.app.publisher.publish_article_draft") + def test_enqueue_and_run_publisher(self, mock_publish) -> None: + mock_publish.return_value = (777, "https://example.org/?p=777") + article_id = self._create_publishable_article() + + enqueue = self.client.post("/api/publisher/enqueue", json={"article_id": article_id, "max_attempts": 3}) + self.assertEqual(enqueue.status_code, 200) + + run = self.client.post("/api/publisher/run", json={"max_jobs": 5}) + self.assertEqual(run.status_code, 200) + stats = run.json()["stats"] + self.assertEqual(stats["success"], 1) + + article = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(article.status_code, 200) + item = article.json()["item"] + self.assertEqual(item["status"], "published") + self.assertEqual(item["wp_post_id"], 777) + self.assertIn("?p=777", item["wp_post_url"] or "") + + jobs = self.client.get("/api/publisher/jobs") + self.assertEqual(jobs.status_code, 200) + self.assertGreaterEqual(len(jobs.json()["items"]), 1) + + +if __name__ == "__main__": + unittest.main() From 592d6991664f716c866c9fcaee9fbe69583c7534 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 11:00:57 +0100 Subject: [PATCH 11/54] chore(config): load shared rss-news .env for wordpress and keys --- backend/app/config.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/app/config.py b/backend/app/config.py index 40deedb..e194bcc 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,5 +1,7 @@ from functools import lru_cache +from pathlib import Path +from dotenv import load_dotenv from pydantic_settings import BaseSettings, SettingsConfigDict @@ -31,4 +33,13 @@ class Settings(BaseSettings): @lru_cache(maxsize=1) def get_settings() -> Settings: + # Prefer shared legacy env from the original 
rss-news workspace if present. + env_candidates = ( + Path("/Users/oliver/Documents/rss-news/.env"), + Path("backend/.env"), + Path(".env"), + ) + for env_path in env_candidates: + if env_path.exists(): + load_dotenv(env_path, override=False) return Settings() From fee5e76842b92d2b42c086b683ad913400a8717e Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 11:03:53 +0100 Subject: [PATCH 12/54] feat(ui): add publish readiness indicators and WP env key aliases --- backend/app/admin_ui.py | 19 +++++++++++++++++++ backend/app/config.py | 7 ++++--- backend/templates/admin_article_detail.html | 15 +++++++++++++-- backend/templates/admin_dashboard.html | 4 ++++ 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 8d8e879..fba1b91 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -156,6 +156,19 @@ def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[di return entries +def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]: + reasons: list[str] = [] + if article.get("status") not in {"approved", "published"}: + reasons.append("Status ist nicht 'approved'") + if int(article.get("legal_checked", 0)) != 1: + reasons.append("Rechtsfreigabe fehlt") + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + if not selected_image: + reasons.append("Hauptbild nicht ausgewählt") + return len(reasons) == 0, reasons + + def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: meta = article.get("meta", {}) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} @@ -287,6 +300,9 @@ def admin_dashboard(request: Request): extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} images = 
_read_article_images(article, extraction) article["meta"] = meta + ready, reasons = _publish_readiness(article, meta) + article["publish_ready"] = ready + article["publish_blockers"] = reasons article["extracted_images"] = images article["image_entries"] = _build_image_entries(article, extraction, meta) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} @@ -339,6 +355,9 @@ def admin_article_detail(request: Request, article_id: int): if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): article["press_contact"] = extraction.get("press_contact") article["extraction"] = extraction + publish_ready, publish_blockers = _publish_readiness(article, meta) + article["publish_ready"] = publish_ready + article["publish_blockers"] = publish_blockers article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {} article["image_entries"] = _build_image_entries(article, extraction, meta) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} diff --git a/backend/app/config.py b/backend/app/config.py index e194bcc..fc52ec3 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -2,6 +2,7 @@ from functools import lru_cache from pathlib import Path from dotenv import load_dotenv +from pydantic import AliasChoices, Field from pydantic_settings import BaseSettings, SettingsConfigDict @@ -25,9 +26,9 @@ class Settings(BaseSettings): app_db_path: str = "backend/data/rss_news.db" - wordpress_base_url: str | None = None - wordpress_username: str | None = None - wordpress_app_password: str | None = None + wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL")) + wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME")) + wordpress_app_password: str | None = Field(default=None, 
validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD")) wordpress_default_status: str = "draft" diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index a5943ef..6f06d36 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -203,10 +203,21 @@

WordPress Publish Queue

-

Voraussetzungen: Status `approved`, Rechtsfreigabe aktiv, Hauptbild gesetzt.

+ {% if article.publish_ready %} +

Publish bereit

+ {% else %} +

Publish blockiert

+ {% if article.publish_blockers %} +
    + {% for reason in article.publish_blockers %} +
  • {{ reason }}
  • + {% endfor %} +
+ {% endif %} + {% endif %}
- +
diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index d47628f..fe811a5 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -163,6 +163,10 @@
{{ a.status }}
Legal: {{ "OK" if a.legal_checked else "offen" }}
+
Publish: {{ "bereit" if a.publish_ready else "blockiert" }}
+ {% if not a.publish_ready and a.publish_blockers %} +
{{ a.publish_blockers|join(", ") }}
+ {% endif %} {% if a.selected_image_url %}
Hauptbild gesetzt
Hauptbild From ba83b245101d88624bdcec9769fe743fe1513a86 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 11:11:49 +0100 Subject: [PATCH 13/54] chore: finalize current state and prepare next wordpress-focused roadmap --- backend/data/rss_news.db | Bin 94208 -> 204800 bytes docs/PROJECT_PLAN.md | 26 +++++++++++++++++++++++++- docs/TODO.md | 25 +++++++++++++++---------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/backend/data/rss_news.db b/backend/data/rss_news.db index 1b1c3900225265e8b460abed889fc85eda838c00..792930712d0b6bc67bf4176b67dc08588b62428c 100644 GIT binary patch literal 204800 zcmeFadyE}dn%LLPH(NvY?9OVLi7anTuV=-c>AHREzHjR_O|nUG_#AdqoQIW0)vc=A zSA8Enb?;@1RD>OBG~Uc6UL)RQU~kqtjy=i#u}I*+wvxcg0x@F)LA*&8@djA~c3|xv z!2+?7AdZ1x11G=lJ5~45x0~IRsF6lfnjzU;k5lJ-=R4o~vHG>Ejb31dov7vZto+nx zrw$#O`if;uO-;@7Cl~)Y#J_)4}kd zJS8LRrL%(<=fCwCMt*i=KK6yd5l9B!8-6GE z{p85PM!^(Me|UHPSH61e_?c&)o&L3Y&-I!?^6li`p{m!X+x?;%d%Dth`=3OEJ965GShfpZeY+p@4dyU%yV0%--gNu5TF~vvkD=RWa(_1*0zG3X_XU?3N{{Be?zMw}}|2i@Te!cX>;2#spA-SHM zaocU}yFCff8}5z4&B@Uk-Swa?tYyd`68T|3HJr)UfLeVIh&J7BZ@ty=8)1V%cjpKC zOl>2mZ4M9c5R`(J2n~hq~W`x0~CUGljbT`+V!QEhoC^10#=k5i~ zCafpnmLo4H$E-&|&=wLFKgUne9+d1LXKWWv-tP1QLzpI~uLmd!ghhYyPS60~LEWD@ zc8b3bpZ2F_qtpM4f71U_2&525A&^2Kg+K~{6apy(QV66FNFk6yAcepOhQNbkhd(p* z`IVJt&YYb3LOy>p!n_!4^m<#}rNzbV?d{B#UfJqIJ-30Po>fS{R4aszNe6g^U z-8l)l^x&!KQ&YtUPfbk$jvceG&bzHG?68meV$|&}@+Y@`x7W>V^xk;++b`D4TffwF z+x3S3qNBhWpN&&p%4c`EbZYjWP92^7+lOZV;p`vJ{)^eaKl`^o@HVF}rVvOWkU}7Z zKnj5r0x1Mi2&525A&^2Kg+K~{6ar5Y0<%XCfAM4-?6>}U$LoIP$l+(Flfbl@r=|}- zGaW~Z9OkaA^3dTk(;B#P=)~c}Pftr=+R>mZ{{KU#|Hahoukcs;UkZT~0x1Mi z2&525A&^2Kg+K~{6apy(QV66F_!L3l!Dk5rnEr(av+94(AHLJsXtz3EqxqO__wD4p zf#ZGD!CqY|I7?1$^b(N?rKRlHrTD4s>Ca4^{QQHbn9(34;4x>jGYDYEhqFuB{4SSf z_~4=0{?zPy{FVNfLLh}e3V{>?DFjjoq!36UkU}7ZKnj5r0x1Mi2s}9mTs-n%cKR{Z z`$=QNq4P&D7(C`MJFNub|37;C$8e^mpbX|UJ5t?C zE^c)w$aHVL(e3sFnbEiAh6S9$V58ZShsJ6)4X;WuvC%q5=FV%puw&KZi_X5aV2v(S 
z!1N)OIw=~BFE-vrxwvCm6PxkYYj3@C?8w0oA)j3;?C|Btr4$jU`S|HTgu@3x1a;~; z#Ze*=UrH&$rvnk*-aKRw;hp-USOdFWSjvx4g!s~Kh#=&B2Q}J3Bx9neGTO8@-V4xO2ru0J^S;7Yyz@;BZ&^Q&)8O@HotjeP#lqeRB7)}m7^I@$bUDd%Jh zStnn03vMY__G|fSzEmhzyoz7U1y0Q`N6{`6{H7EwTV!7fMYQ>V9FZ!iw)ytQ{ ztXHetbfe8KqN7Fm&xh~Ukr+36*7>N@?b_$1Ja%-?mPdm2lC{!kTUV&S-Lb-l??u*y zpx^7(HmF`f4f=;akU^q=*R6irw^nN#6wwUYkF}P^dN|?S*}|IREERK<5BYMIaG7kQ z9d^F5qc(|kcE^SJ!MZQCyjPYiYw-I=0|)Ec!}r=0=WQ=og^YDE@S?sOS+WM}{qOwx z;Oh%kquuV@b$gAwY{_a6-KFZP+x8lPY>|T9Z`2zUT%n4o*_Q=N@Aj-^s$PfgMifL# z3_|7eRcEUT`f&3157q z?lOUhb1z*Fw!2ZM(|Tdied)CqyvSO7=>@-W*Xs7}HG>!D!e+^*N$p7jwBv5azv-U2`4RE>x?b?fGuS&K0XcpbrD;8p z)IXM+kD@dquP_Ny6ZtXT8P&PvwYc)p$f(1r$8uk*{>a@<#K~^cT>&iJ-0y>vFpT9+ ztMFMVIip13i)El{|e<&u38E5#oVRl!}n^N-Fg7M+@Jx*{eH`O z6^hyJHQLxx>PyzmPPbR<_pr+=U28O(pcN>&(vLdOWb1m+ z@AjgA_5$}Gg*$A^*is(n*#SVyYgvp_z_{=@3@v8I!Q|qWtP2m{!?e}(J5jhevJ&fT zr`MggY*=|T1|di`mA(wWsR!E+-`!}7U|_*Qz9}qTcnGgtZ3JGhVEG+xDQ~*(ZZuT} zpqM_;YzG^yAo`NwM)JZW+`R|_O(!dE0%aeIThOw;3Kz5P(j$cP&aVO6wq*Hk*Q&ei z`&K>bJp5r`nO)9Uxk`R?cLv`x>}_wTJRYwb+tnXCtJAMPNlsVxOQp~as&=*%7VJX4 zRJ6Tnv1FIMe3m{ExpFmITfEk3yFHH4HXHcGTsGM}eoT(dqj}&@Db^jB3GTqD$K!!7 zY`ye?N-Zzli1`Ncx^~Cmjra)b?|CpbX^#WJGr}MowQISXy*>r{UsyB)zp%LV z(z%q??xg7hews)cAC}WP#X`fe_fRRpO;pC zpPQEW|C!T2J2l&!T{``az_s`C6=3$ zMN~@Va_Pu{-pG{-4ldbjp_t1ZKFAwmPTWI_hl59^1njgyr27A-_8_MVOCgX#Aca5* zffNEM1X2j35J(}ALLh~}J`gyF&Y9}}pDO+T)XLQ9GpGLTQ@?-uwb{ROYGt;6`j1cl z_1VRJ5GB2xLLh}e3V{>?DFjjoq!36UkU}7ZKnj7MJ_xKFd2njFS_=Fy7Z$TMFXxvX zznCxPE3R9qR7<6z?|5#pQ1N^xtX5p76c)*i2=m!OA?M|aq#$|u8671b)WWb-saDHw zv63y7%3&>A!JnNihT+RoiC^@9)){5DxTP~1O=$3ph>v*M;I!e*?b1uKt zf>PjlIj>g8=G=nM%qqN7_Om7a336mYRlQtLs%8sb(ecTJ3dq-Risch#E>5K!R4Tc! 
zKvI+&1hssv8s|$d34xB2uSr^RIa{sxK_x6ykLoBqn)h|pztynEooU+ee1;wzE^~u^S=gLRSTtYAFvS(zK zG4pazB+)FFEeF6L$mUt9=lCUNUkxio^3H&CCCIvYH(x6`uIpD0o4N2vtrECFA*|#b zr(7-i%&rWUxM5KB0b)5@V&}?EP|lIemUq1}Ls7)PSaGYl>Y)o@-AOKI^ZC%JF-Yj= z!4#+Fc)mx7MwX2zd!>A(>Xn0>8~PO{=#r6BW7wJ(6dcdVs{jAgznYp2XLG0i)$DJa z&Yk|tr^DI*{`CKM`uBeN4syEU6apy(QV66FNFk6yAca5*ffNEM1X2imS|N}>GF_kE zw@lc$xBp=8jK}-uj!XyperG)5|1m$KG5^ky>Ds=Z8H@Tq(8R{${tq^>@yP!JzB3m4 zf3SDPqyGUwaB{!>9IF;#^#9cV|7ktp>AF$~q!36UkU}7ZKnj5r0x1Mi2&52r!Vpk@ zZtDL}{r^9q{{PwEnVS87sQUlMv;TDV4`+XW_P?F|uV?>DE~fvb5J(}ALLh}e3V{>? zDFjjoq!36UkU}7ZKnj5r0zYmDoI3mq(_4M7+30Qr{><^ipPQzo!pZpG6Y;;t?DFjjoq!36UkU}7ZKnj763Ib;i=cg%PShBM@+o@RD>Qcd3qRrss^?Y`zkUO&T zdSxkBT5^hqce!3z%I6OK{Nen`(eZgcyW~_f|NrzKOr8FN)Bo4(&&)2)UYp&V{r>E4 z&;GsHKl-SanvR=7Aca5*ffNEM1X2j35J(}ALLh}e3V{>?DFmJf1fD(ovgp!9+i`5C zu$C<^WwT4U!k76wo1GcETUpCi-UbLF*cX{lJ`0dx28@NUk| z7S5S?wZr!g{ewfDLr+h?H+^;L(mwn0_My)nf2R7(iSIt&iyHNM5cQ(^dbiV$YC(6s zzva8Vz+ZQJ@s;zpR+iURthHO0FI`%>W$pg#j5YG#^2IeCynXY6ym#Ze6%RD_%*7kG ztd-^SuUNNk+!?!ZZspSD>tk1B>~rnT_H*;romW%GViqkH=-X$t!0^$CFnUe`EPxOk)ItIIa!%ZyVvg~zn+>H zJh>A;I0hB-?)-x@w^uB)3wyjX zw$s-5w+&Ea4Y z`^|gz*LTcq6sh!uhQGk6`qlYk$Im?b?DW6)#h&XmgXEjZzsKWmuPvXux-!5@>+Fuf zEjcEaudl7h>ArdE^0nn#->|N%e8XD4y>{dBbq2Y%a(!)K;;nch$*YqO#IPI!#adhW z`Yr>F!hH89dW~K)(6z?OxFCtw-+meoqA3QKQGhdpx(@iTw3ir{C@+8)02oxww4$>YA0ET$MR{ zI@AD?gXfM)ubC3JYMlEP}gI$Mc26a%g zwh`1egW<_cNYP-svJ2MOlQ64zdggMw(=%sg!P?$h-|BRG(4_=PhSvs1z>2qzae6^( zt0$Xph{A54P9AQ$jH`Pce|&%h>%Go8A5La9K|hcHRxV$q ztj64^;m->Nzp!$31z|$?zR=@Iq%w>lVO}h1oW9S$egEk3GfzJ~{k=O%D}{;5pJQV* zQm#BP`p97Gy?a~x(=Bt-#?f?_Q~2$E&^PCdk=u=ST~1lIUqfi?%8#MjXmWqI6USkW zrHl`LqQXX7b6NM^P@i+%Xm<4zW4k^7&h5j;&zw0k{r!^)eA(@;{&i#w{Ceq$!9Qjz zBRD@cDUmAW0FOgt60wr=F(e@d)h3^8bk~EnFyg@}8@`j!WM}dcomh5B*?+#*V$Y+C97b!5HN#`|W0< z7p%jTJNo;&qSHEk?7+68s6;13ExLK=*qJYXdHNfQi191kfV0)+9GMZ4Uk{HV=Z>3a z1$P#BbpD-<>Emafd1m_i&zUie9u+U+(D)eU-V;OdgW#{{!e$aAp2WTZquL9-Mi|u| zk3@~`=6W-@8|)Cg2At}-dqJ~_T9(KbBQL0YQICS4EhH>{j-R4EDA_^I*e-I9@IphF 
zCLy2)C<;WNQAKQzRen4tXdK^9fxxGazBx7fKg`}d{YR&tI`wayn*Pjhf9CpA|77M5 zPyUk=|M1vnxS9T!LLh}e3V{>?PZ9zTPX8Phpf658$g6N62EarGdcEVRArF0deaIz7 z?|vzk0*1!J(YvvcY~)?hct)RywebaOIQ;zgRz7$9%oo2n{j1kh>==D~^ym380c7;f z6O%$l3_LqE#z}6ws|Di8YC8wfU&r{Wr~)k)i|)RPZ+>z?s{5|DP{Obt>>)esqJK|J zAbDnQQEj;S`k6`PV$!BD@s%B#;N)`lSMx?i_~F)%$?);zokINi<<;}c7gok5HUIYU zpFRG}^Ut36?(&E;A{k)B8UEqavGmj^hk*dIq~CRZ63G5TCrC)-r<;Zs!@9AY1=$`{LG8EWsO&G%+DIX zH|iFgnGmcJf}FVI4nOGr(Y2{V zpZojE`TW1N^7f_AuRMbR?XlVwmhx^kpAVf{SjmTeK3^(2HOKQk$Ejp%90l&khndc7|_U2>QJ)rrODAFIW%X^Kco7 zr;oY&oTCj3w_De8H+y}oXM4YD+17=J?{&LD)9*y#;>h~sP7sYj%4 zKYVwijVEdY*O!&aWZwVH->|L)8xek`ZqRnhnQSg2i?QDS&aYd<#}rC4d(UHLuUVY! z^t$uBc;K0QGhj+=na?X%8-W)rSbm2|>a6d(8_h@`u-4e;cCgV3qAxYtvd8kmBu03V zTid`D#!dF$Z?LwO^;MT;-o<Wk#w!P+rDqrqt3%02A09UjFqe8NAYIx zy)a@!`EFN0=nlqS!=8Vi=!jN}d%Ca8tZiNEw27$!>P%K<@(R!gUvY!(f~pzo;=}ja zp>>_Pdv3D8AZiAle9?D3u$x))v(DHH@TanVeveO0tHbAeOV%ofLQVvCbe_K3-n1GU zcs0R*`<;3qVEQ~q^ ztXwve&1TJcR^Uc1kM%M$*VzS}mRoMO+p~a2ml+7fknND!C@QhvvTh|2fpw7$^n)eq z5`LRL4-s$D^iTyAUt(xsHV)jDj*viUNNt9jKC%I;ZcC^W01A1nZQTj#c$xs#$g&qK zW(F=M8_W{i5TeyL2%~EX+8{oU=tOowsEdIY_1(xGLoC5bz6k{hdqKuJH-ZpU93$%IoLknb8C=j*5xO(Vg;FzPzFZE6n zVDp((MH&zR{CW7Up>MKJ+Bj_oojPn??ueQbBa0slcAV$WTyVHoQO(rOF7Qb`Q z_LUYZHHazDTJ-QoeSDNb#Qs33+DkDFV>bA7H+zk)#h_4Yos|v)2=8<@+O3Y)Xj+?m zD`?LVE#%fYrGid+Xtp2zsM%}CrAtbX;GOM}V}0J`ch@ZSm7pEn`;xWUX-mwCkdAo2 zWbl?1a^7TK=Yj@Dcmu*DC~+#fk%{ALaTTOHG4p)XyTPGqfY1YyLPztC`HorX$} zZQkpBFKD*EVBOU9$;USy{)qjzE+QoK+pZPk;*Nt2oRsdzJG*4|*??t3*o82PQ6a|A zQFczxnp4SGufXi?2aTQ*5ka>2Ov~;f;&dDJj3vm^MQY^;wxJ!(4peObmIaGlh=i0v z4FTt7QxScQ=7kmH3MCj=k{n8br|j`h=lV6G27ApjtO^btKACcFc(24~C^ z_;RtD;DJ&?gR#Yolg*jc38EmXL2g|`ZFOGdOsSYR4`i%mD9G?MghJwEx0paZzzdpC zk+JjQBOQ&Os&f!fD02(4!v;f$SUVeVN!bhN@Cas~nVB2l6GvaAwv08nv=KBRRv?|S za;wb_2*KI`7oYF*m|+O!Xn^p$3nP3%WP!#2&&HL72797n3WyLOw-~uwcTI&$7vWN5 z8Y5nU0Pzkd^4Jhl5}1J@%-Af%Oe(o!zz;(4|7%D!5Oc2|)$Lb1(Uy=5kSZau99}v8 z7HSgcb-#hiqvX8~PXzgIMxED#O(=<=Y&~Ej+UEJ)`yr$%8^e z_e7B9%i{`>s8%Xz^*FCW6*ATe=c{c6EqE~ktB8#cf0GHRMAC29yGE`A&r}o@QZE>U 
zbb}bu*o1Mrup^PJ(bZ%f?MESmWh9&rBlyL#qoBVWN4LE#%F8IFsz?1T!mxB(x8>N& zabJ!=iyqUAz+&O7iO4jw$L5GA>Bpg%*W9M?rG)U=M&zEkz}6p7DcK(h^pq!H33H->BDL{>D4=|IYT*^nZ2ukfqliT#<_p z-uh2mY$Ct=az6h~W9ncg5eEj8%`cX6PPUMBY9+T;s|0RPz$oH4Ig1ITPz;K!>U6vId5Hp! z?%DE)NMxL3>k4tJ9iutNs(aII6FmO#2Qmo9lY=Wq2cenhm!Q3monw40kM(e36`4qR zX|%)6S9aK6tg|~V%n#OmspY+bWn=LBlZC^C6(n02w}Rxz)|nZ6TqZLkaK6ylL{4w_ zhPU>em%jJ7`!jP579irof>ppqU^GB$V5CVVwg8uDckaRkkn60~Ai7H!m)rIlXugp_ z2ftBoVC+IPmzQKfz1y>v37`uxj|9<@;d?DH30>*5LIhu<^4u~ihhdjuS=(|muoetg zj1D+ZRyGg^MDOle&4)h{naPOmU>v+#sI&QM*FuUA^{(BA>F_4vbPyS?kSxYp-r5qb zDW)0~R2jaGQG{Qvnz7qzp*j$P>ys#d?Y>BAnVEAE)QSwy$7;ZU6N{IYrPy*KBGwHgP7azVpu7g)kv$OV6%MWkzgI9D$wDFD ze)ujNOO$cpv?9U@U|?Kz77r(j#d*eNGRBr=L;*92ME-!)t7SxNd8jWQ+DyCNV2fPD zhy|;|r-c~o1_0O{v}OSnFKY4`6?<3R=6#W0KnPsM%q=tX{&#-sRaAYERYaLV8xt6# zNE!{-kAF`T)=}(P5Xo2wkV<1LwWSj=pF)uzYP9cv=QoWqYkd{*q$}bldOl+Lg2+XF zf3w#@qZ7-4hp%m`s7Mbvq~o2qDQXflSOri~)QZBw>I~ zasX6G6(gEVNo*EDhGoxOcQ<+{dVzJB`PacX1T>X7+L(NH@PLc`HpT%_X*gbfBSK!3 zZwr7EnCP)$!SQKQ)aEzxNl*){O+hA+{O!wPYi~C7bD4M#W6F@2F9a4c?0q%0^Br?k zlP}qWW7-WwXBS|rG$s$!15c<0m%TtVKshg%FGSzxIMz&R3&zC_EZn+jEwy%oZ(>>8 zmADLGuEuVoDajNKJ`ztIB&SFYt98E9+G;`=bzaGwt(7;F1zBh13RBTJ#q*hGV=!kM z*g@wqL!b@@+Z4e#2DuP0JvGK82s#HIs-k>RCZM<};9eD-x&8ik{)(9W8$_hC5wTp3 zSPB&e9|jXN@2c~H8OlmBelVlL`rx41z(zj)*#`4Cw z8Ok1zt|84ZiakehY-2^dmm|GOMpSPLYlE12A;1CT3&fB2`<#nf*)$|}lOqZU#H_)R z6T}z+vycnSZsIHvYyyoL8=VNFjO8!&|byw6ZBL(!t~RM&$_TF zjsR8?v@i%`qHAHujA-8lJJQw28y=cqBpw`0wQO6rBS$o`bHuPjQxAIEu6}$B3a%G{ z#IU*ebFfQSb&!EO0c05ga%Q-9#yXzMI$6;=_&$`f!9T3=s;s^aH9#yN|3cS#|KX1| zfQzv<0X{iMs;vVQD6-H#nVAh%!gfYjl8iON?6AU!iiB|C7&5X03lf$#$`2K9NA@Y1 zdM>9^W2}Oztru^czr89dV{k8qBe#mGHLiYBNlv5Jhpb}oLFeYABr_IDi}_guSIn_G zXl!A&hBhJ{OO_q4hM!ohy$nqUSI|s{*M++2rxXX01M#)Ki-!<B zpQ$?8BF3Z$L5Cq{)`Din`f{Oaoz2c$4o0*>HeV_i$jQIxzA?B_b@Gl?adPtinHBpQ z89+;H8K29%&gZ_eVKXb+6VfDX3krfyNEC6%-kgiQ z{*vmz?{haL9WTJ3o!m(B8|3Dr6q1own1)HtdyIFW1cQVv6DV58mY&Bah)(Szfn035bu7YMSs(rp(t%l`lp_X$CPRTgsE9;4Hy#w 
zcn6*vLQCLz1|~JPwME{ehy>za6p{*%Y-K2y5wMY$guM#YT@zm0*hHsQ31U_Fc(ftx zOzz%<#L1K6!^Zlr3cfHM<4+fNk9dE(mQi}~RNT;Yb1z*dJS6ILS}!cRFTM7H7g>uhz2GgM{&33#($@MEvHppfdFX3~&cUdyCyC7w?;AFE-KA(Agt3HQ@>-(USkKL78&`1bq(jecY2uZD@w%Y~&1K~KID z5a3iShN16587h80TOjf&TL~*csaC8ODo)vl3T=tx!MYs;$1T-J0fc0;VA5jxcQkU>9 z71u7g$bzWb97N=%q1RgupZvBB+U#r6bVe7jFQpuP7)rT&jh{rkR2|V14gVs}*)E>j zIx@6Gyc{^-YwsZ-e6c~s=Eqe@@fIkEqMP6g-eL~@Z6rFVut>1uAeu0C<_P(6h9O^+ zVk#B~AzypXM8r$Zx*_Z{B2uhYv>QyiiiPONSIXc24k@@<#Y+^5G24^*bQ~8vlm{ycxm3s%m|~joF(g(pn)yZ zBBn#^D8>(V5eu&HM%nfAIPW%4_S; zhUJ?YD@2gg@Wf)rg^*zNFvc-43M?HIs0i>9dNZ=`mQ0PgjP1%@QDQkVQ4h-^c3MG( z*q(b^10R5{7)2t?h-wys^oc5ruGB+GW#{6juS&pNyo6+%CAdOC*6VC-i8+uXxGM$# zBLY0!ZrH>eo|zMqK81IXKnWi-VjBBrY*;N_kr+NP8jmhTj<$RuHaf~w_)Q0O*)xfM zftPE@iUb^tx}7=CU5NR8V|CeMib4_uIE*UqUo+OcBx=K0%sbeU+c-ERPDcH`OQYMF zune)@$lMZ)N&*Wc?!XIxta{AN1QAEYn9J7S&fbXb2mQL>`bAKgNie4jR#`PtU{}Xr zi#K1f{}P@7L8{nRG(4e%6>@MmlE$>cG{ib7yF@x)KR(+zLKEFCxM7&0e9=Ukd4Y-Y z(6HLEM*z)Eaak?>;_AjEHi(`TxBMo;7<0qxU>l>=avy5|L{^9!<$v(Q+=_INxAwPU(sb-+GDx2{MWIJgr@e3^&=8g_;pD0-4&4K;-Zv*#1{w_`h**c-d@X@VSu zPHlnV@voO6*SF3G&2EEBzzX3)RRGEnUTHKt7}{H{QT+7{Sy3yW3Hw3EFe4Lc3fH+P zp?Z-0w%;X~IDqYltq|@;1fUoiub?Gta3sMgwjC>PQ-(M0m$n2t$HXdz52sNM4(jeR zs{bGPYg0%5+VTJUiL1Xq^#lI2$o-E>dKkAG>}d+vTc_Puu-*GrsF2azOy0tWG!0A& zx7CcJ`3yz%0h7;wDRb}H%#3J}N@hRgd_J71VrE<}e&VJI!pM-B4dY!VPz>Q6J|eOi zRBI^FNoYJ5gnDBRxvw(SrinL{jEbQ9Ki}9*cb0gWKNozObm#O8#W_Lg85)@Tp41tV z*l&^TGsM}8pG$s6QNrsaF~y?B7-A3y5Tg-KdqgRfk-oKm4=I}#Eg2!KCa+|LSOUQB|B1V=AR*pGP41%Fhc z1_5^H7b^N2(cCE#L5?6U(Lxf!?&&=Vfxxr{VzM+*Jb4&fJnQb_mo{&3nXME9h{WiK zCicyt8ku-Xxywq3ok_MY&VvE()o%d48|;^%_~jzrJa$emE_+51L89N}P>MQZpw)Bx z8h|OQ6I{jaKl%ZqIbwlF6IFZ?K(vXwF!PoyG*%*DW6MmGwF1&8TAXrKe~j@VBZ(M8 z-S|SHkn!a$oEc#EC6lwrk&KaB!oAh@!80a%hA4q|AN~;FA!l)#2_BUET7H$|lvr(8 zpzcyjv%?m^@<EDA}n)E zFEcX=Og$wMF%Q^QWa|>c)A)>0hi-s=Qf+T9l65~(`pwSN-bwNT2IiudA$Vf_^ z#)LE!!I)G4I@#epP|eINrmz6Xz(s(Iw%mI7qlZ6)_(QRp0aV6pf*AGc0h%Sw6~3)x z7<9#eAmpnpX$z7WU@Ut>km^2|V=L5I4m-~cEGQ_>)R~mf5!ovt)`~f!^Dx14VuUgn 
zL>_W?{$jGnxNL=DVMNqpP_!Ix?wBHqn#GWo_(gI7>6C!>QaR0r4~ zlnD|~Oi5yy8%emtR&00%bh2sAq0F*-513OpQVmaPJLBvTGr5VmNm$a*Dgs0$SSOyb zwTXKEr%#>wgQ@Ame{<^a-{gm%|YC}ra$++`u@K5LbjY2 zS9^&}Qhb56py=mvfmbP(ysA?T3tqnF;5y8Dm1-3qlw0(QT(44BAXhCG%GJDA#wk^F z-7J156HLNs>cMC^_~L#rHo;Y~k7BqH>wQ;Qk$N%O;^ED#y^07$u2}4Z)vtX^laPim~<<2Xn`-^zV{QyB4`po>O9hmMFvVfforkXjxxxxm|nZE+k1T zRV$h|X8i0D*ciiwlGxh{w~Nht)-OSon_EkqR}l&l7%Hn3-m~oMU_#YaLMysP^8i)J@NFL7* zKejvjy5axUnPVi7)NrzH)@m=`d9eK-y@`+YKgWmNdhq#!xme4ULRB!nLS}v`U#b-F zqgF%e+B#r#DWA{f3uKnK1<&FScdJ;tH1F7MO= zR7^t3msn+^(W^GHXf0n@J|BzcabSkJazq3i;U(k^yLVlLGdL$sq24}-KDx5Ucr-Z@ z6J1+mlKUH=WF@whBvSCBC>0V{0xE)aqQ?kFQ)zz<#_unIeVQmJA-Kky3eq9?kb0vI zk)h2tHik?ZpDGzgJ*%0rt!DCJrVL+oe z5&YnWlx`4(ROAvB7?RV^K1jwdzb2X_Ix5~YbNXc|DxPTdi4g@(LUoq=p~n}qG>-{Zc@q3kcr7=fuv#G!dM5ho0NH{P{@e9Yv9P>rZ%Mkf@l1qXsE$VR-D z`H?XwPQ}E11mR645@&cYlu-$hhw2#mMfg|Xzilvi{HBDi->EgdJ5^4FEMG7kVW7d>bL{BeMDhaJx`nz$;}ZxlZhMm6q?sE z5Qyh#@_~);Y?Mz*gftR4%E&m0|D z{p^Hcf)A`uq;dT4_PIV{nN8fS3@LJ^I0J#C(Z|HUYUU`eOdb%#Q`8*#y?{m?nGdnQ zh3t)lPlTEtf>BXTy~!xrwxA1`b#`v0(Or3CZeBhaJ9Alr_{YR;xB|ZxLjhw8%NoeJ zq#!W?xdiaGB?Vr#s;hd8ffBAE#rd`pDjpQvH(EoykBn(>b+|a=YO1RNDOMu*wzQ0p z^jyPY@AvCkk|MD_#QK$l9*a?HFt{!q2e4ExNHkB)Fbk-zVMKqUm(-1?kjKpQRAXlL z;^$HX48pJhh2TV6d*GsiXNZiV5{L<6=&c{1AcYbH@a8xQZi)#DED%r{KYtx}pC;`{ znI8ku_;F%XIxT^`Sii)S;8fKFrQ0C<0aidB0R%&7L@b&=`~d3t51NyH=@`w{Blq5HjmSS{p9uU_Gl?A}aq9#({gn z$5#k2cdhH>M_Q{_tztf_Grb}?Sk~&p_b}Qq#S(%1w=Y~oB4EOdj2Mk{2|R0>R|RB6 zdDgl<>c-VtQW}#XgUt1dE!zxXc6R%GBjq!}3Fs3nJWa zH18oc5ti>0*r{k5pF!C(u{G}-&^k8dedNFK^(7LZC`Dsk{VraW+L$fi=G`irCQ$eia{xK7Kv`?NQOZI`~An(@LjJ_U(AL1 zqUQ%$JIrQtHZktL4N}zz<1J=ug+jhi&IgMFQh?=(ST)-SmO`YUj5b`+mLLv0v?CjS zi@I(10=v<+w-aNe4gJ^OG?MeNrn=FnOMKZzZ?T-K74u=XWQReGIOA#|w5tT#+5y!D z{ep)fAy-=z!3H|a6lBsIc{^;{am1z_*mw2vNNB2!)gP>H+l^MdEL)b?wL^}zjXY%! 
zVh-(oi<%sDyY+Gn0p-Q)LUy5)&n}P!@BtSD`fe4rm_%s{pgVk+O1BcnCN@mMPlL?+paC=Y26Pp-MjU<*Em<2L7MaqliVL;aEh}V zm#iC2H$;koCJbdWaNJ)=PAYSkWfx61fO#d$CC(a2?IO=f{{Phqa=^z&*KuY(l0u5y8NqU&k1 z;lc~Ky>q$FhhFX{n=O43A@<2;6Q4wgeX=Em1Y-H(2R@a%(wu!5wdeZyMi=Xy4(1UX z_c`=TE+MsLw%SVAl9}0))X-}}XoN0^c`j=2P6iG(1M^iE!C0v3ll=bvob~aQgpU{| zJH#h2;;}yX+FNhEg;q#D7X8{Q7JcG#^i400UZRycX7|FkKIQ7pN61DGtS1gQ=_gtF zdhL@UjhlHSz}<~@#MK5w|Bve+4Ezxym}rH7hKQayervR(j=lCW6d81Q1lF?V2;?(W z5^b~2m$MQBJ%`vfq73lLn?|scmv1KJE>KcFf^u^*dF=nR(dJ{Z_A%C-Tjua6!0cKu zA@03BHh*z_qfLE8>zVNP$--0%G9q-cgnjAq@ZjWfl_125W7k|qtQOVK_Q)8vbH!>v zRm2o>2_eIh(o6KJ z;zF*PU2v*dLGc4`rkOj2=yEk%aO?oV6gyAo+SN*-U{~|Gaw(|g!feIeG55hr@Zey+ z0RjqZTcjrJ^}|v?$^4uTF+V0y27q~+RD58YcguFASnzEC?olXJ7jDq zkDsUfSXRFe!#aRKIhf|B+s5)(0_;Z&lO19N=l-!icwie#{)5?AK6cJ{U~%?|va;9? zk$il&lNIa#n*V?FpHChA=O_M#*aAM0|NV`rqm-4RkKlKol0Jfq4}SSSEfdc2ADpDD z)Sb6$2McE@=1S#?RMg9rom}YUve}>Y7DRLQd1*vM2n{8W`oxM74b~3qb5z}sv=LI!sigEM#K8VB2CBz-RWf0y1zf&{x9uBI^SA>_ zc#X>zP@f3LxUHa+{pwoR`rXDRF~!nGP7)A^za(soltm5xpD@EX<2nCWg~?Zb+}X^G z%*8Y_Tw$XQG9M{sve_peAr4C6q0)71ileMq>*GKI}qds9!XFqmA;0UtMS=*IRr&b7UKvmYq(q3Af#2A18OY+ zgTLpo6x0YfyiL%eOC%xy*AQ7#+E)5cp)Ywv;gP>Z{+ML^o2*s|a8=}2JeRoGfzuNVXbSfrNBvXrqYZQ8z39j#1RtgM|cKla74xO{Wz<4VdA_>ZeM(4 z2@0nsj0EZKc6=7Boo7r##J4&GxMH)Rw4i$*jN0vVQe%r4IE*C&$j0pj`FN`ly*zV` zz)6Xn));&Cks9t|Tfs-o+$HE+dg^EZE0xsw2=OSOwH_K-vMH3`ia`f#fl$)@V9oto z$B$BnaDtHj&S7##QaYN_(ftW?N=Nm4rgZeDnvRmVWbz>5gbo@d&{ZQYT_i%4E{`=z(t$Tjk5T**bbBPrOD zsU;arvb&~rBbg>dZ-yf5M$FPMXC4q%L^n`*Tr>Y+uf(`7D7PL84FuZ9OdO`{F^Puy zC|Ei%q2$BuBAdP(7lMeJat!!nQjgDc*wb#NnxSb@6NtSe6+jXrm~32G#H?1!p#gy% z(vFBRz*QnHfUIs=u+c#3mNW#B#K=BSc|=*ONhrH2MF<2Ka-Z7((@6T%8W|Ka5uHGw z#K2~~lK7xP)(|Y)(ocb>Y-wLA0sly502Gl`(Z7%6~fBm^)FMlW~yolY|3_;k0V zInL6|Efv->(p!aE(WZfsZihZS_$*nnMCWb_vd-Nhz^%b%Ngj)|tG?2Y?i-m;r%#5H zWV(om1sp(Jn!UuxI>+W|{mGl%dmt47s>FvfBxOiz*lDs4emNq1hkB$_M1jGU*dOPzs&0bX4M{3pMVNgrk7x8M3go}QQQNBj} zb=ah5l94hM=y5T|i+H&1lh&hoVIn_nNm@(L79Z}I0}6M0%YF2xyX9Bhym=x7f}eT&+Xoemue!GAp}Zr7*K z>n15gQWr*!Op;}nko 
zu0Yw<4x1vGh1`E7&NehDkK~@dYifKlKsVM-5R1V?-y|4mX+_PhH+4|yN=qJ=W_l-? zl_)~8Hc6(CFOdF~d_8FugjIW@@T#epn_q&=C51i&^yf^4NkM@mC&`qoVBv9@kmWd2 zFexEKlGR}%Jp4B+d$)I>9$>&Sb`+OH>qw5(KvDBlP08pp^ z)o7&fsq!QqSJFipN;zIOsA0%V7p%ahA(Tr1r)bOh(Hg$i>L#sc&<)N0@x_~93Z*M9 zaXMs6;_W7-K2AH7mZqcoNFffR+vy>ngSN$)yQVr6Wrii!wjCz{CaiJ@b1ArD3@ieD zWt4o@4@kXET2*UxS+L_OJ(|I$wR>BdgGV*~rmziFZ^!lug3K(*4r`Kb%-J7)O?L&) zF?;zVD)DgdRnD4Jw3V+f6MRplQ&P|7s2e0Y_eP^h{eC$(BsJ5YNUBc9hygr>$WopZ z`Y4QJ1anq(e69&$N$>`S?kVsku@#FCbP~uUp0YuTXHB& zRTfReiJ+KN}056#6 zZMI22IFge)sIJRGfD_BsL%qqTwB`fzO0swQ^5rCI zVYXFCv(rprsSCw6D66LlyeYP;h_ovx*aIf46^o1kmbIYWpWKoO(Npk^2)SKs@L1_}IsF}mzbGGC-tw`pK zt<~3IJW0*2CADqI#3kDcP_)!6q-;3mQOTo15(lO|B|}55aWj;K>G7c)anPV(7iDQc z1*SG};FOumoFPyX^~#~CSf)*l1<7uX(|4H*tU&}xQwUvcSM*=eB6;u#spkUS8c=bw zYU>=dgmWtgD{iVnPObF*WDnQ?mc3=^Uk`!FvCKQQ>r6Ek$ z{$^jkCsfhC5&7Dae=3 zBJA)r1VU}U0PRH+kW4mSyG}yzwV;PkE3a#85dw?IB&Lu64KJmVfbI{=2B+eh0{kSj zSc@VkMIeRTq<%9|plQU|lpqAi9rY8noG}EqiC8GI_PSrZQjBp5S=cPCRhuPQq z^oGw~MY?0WxO*N1N7yygDPbXFlvSu8WkJw|K*l6nZsudlmYcZ|_wb^{vH)Fcsgu0F zPoCWYx_lh^LE7_7TN{nGRgriU)AQ^J#5xx;lnE;r@&|}@epL>nNGc{<`H3{2k+mSq zm;h(mKEz4Jdt_!F0Apmp$BvMmNYEUKkluez5XW;^XE@SNtkI0D;zNce*7JrPkjPwNfu%fNe_2p>3_UD6(f41;~qw+KB)^EMRo8Bi*Rv+Hw!2 zhl1ZK0LM%LgOShgrqWu3uxv`b48CMaz#y#hz17JR1&{5y{gQws9<&n=_L-oKBY zAfY$+#l#|(Nl}S%Rr{-=%1LcGk<^j9>uoxm)8G~pvZz4~0&mdfkaN}MD^Wf&_|1SI zkUDxMe~JDog6KuD;Zj`|y&aFpEnX7+xQnVEi9rZ^tddLv5TdOx8zH|PSm0MbOhYiJkKG1ws3TIk?-baW~W9X9ZTU>?*DAbJ|S~e4J6os_8 zPyb&}&||ldYfp z)@4j5YP&Ws>GQ;aXq_sQK*kb#!jeETfj7hp^93YeoRXozC!44F8 z6kOCbHA41yPf<$HZ53mT)Oe)^v8rh(b!uc0wMy=ZZb<(hDU28x3y%2I5+v1cSp{an z<7&;4uTTulqMx`}pGK8s4l1|?=9*PdIiShOKvT2}2ZG(0mo0hh{V$xs`{)M@zspG#nEY17it+$W*rj@E7gKh;CVT(R>;x<*ROfGN~KmT``Hrz1i5@USM_p1iToh1==gq_ zVpY|wQ!E=T5cvEg7U{*X2DIgP*}c0qq$jBwOfe!%;O*z6e<~v7i_ayp;=XAVk}H01 zU28Cv%6^T6C7a&Dc>?__q3u$^%g*}FM2%ZycR9C0s|4j;EESYgJYa2O{8@XK{hbWD z+fmG_C_#4<6Ozj7l)l{5I6kE*A8c+jkD42mi=d`^mn616{+h(gxY=7Uv|e-%i;@`M zaAgoiCYuHMzbHlTgrr}Uc64IPiaooTLFL#5%c*3foL$fo9w#!Gb~w7Dd~<++Qnnh9 
zE3emr%S59=bXMho1#30wiJ4%BdyI0^b1Nic|a3dk?YGc>Rc4TlOFTIYlp9PK$gSFg*1As+vBJpdQxB+q5 z$6OJEs@o#y1iv5O6vpf^I{4F9EA9yUQkJPr$fe~EB83W*Iv@Iy%bb$F5}Y*{XKjP- zeF_Ktg0$toSR1!6)*UI3eVi$?0N*fMzi)I=9{-(1lKv(j*ZlT z7G*?}!_oPeOk~TmTTv(EK~AZZ{*$sl;%Y?OroPz=(tIq=BR3 zxPB?jRqVX)Q#-+tOna}A1$!K7fR=0dO7KWH%B=b=iTw*~!)|SO3H9DUuBH%d8@56y z34)|)gd|N=Y@Avxq!<9tA5pTsL0QF*DD{13Sw$g$HwojA_Sn~4JYDxBE)mfx4*0^@ zVdS7YkmzpcB2jHxMXQ*tR!eaUN{TTCVF!NkG$cy<=cIwHc;ZIQdCtqCg37__focfQvX5y1xK$0MzudtPS=1!|3wp%I0{D)4E z|8WzRBn`B1z)EbpmR;MIoec@+fi~H$-WYJW{c@(P36h9zR{v@!H0TdoxyHue7n06LB*cF>Uytp`+)u%hb{lki50vwMAVT;N2--tZ{NrXd2i8W0Q#=rm9*0}(to-*e3yelMO#rhS;ThKddsYG# zv8sqQ0%H&q=phj1xJ^rIBjfv#+D#VE*rmlOlPHR7*x<55XUeH7PLu`IS+FS_$BEgDkiJU z%q4XvB|S$8KE+TuvmB>Bm?lYa|E@TyI~;UIHqn=?=)O3l29e!3eS@tn!82^5(oG%{ zthSuhsC)&B@okNER+0w3;D~%*Fwr!r5&*MR=LqK2>I)NeV zJq#h|!!*(2w$Krc2~My~3S;mU>AFW^6OSg1y2M(KrY>!0lg6~t)@KL--Rt1nB>$PH zU2*kF95vR|mW5rXeZQ}HOU99k4-30HhE??skF+H+QLLI4xz)#?tB&x5#Hvw-zy!%E zG5*s~XF!lRXGKAxn9K@Da2Y-*SzeK%DOYVWw3Jp1g489+OXnfhe8@7h66>~5pf&765|D3jm|X)cfps0@o1uv~ zJj~khHiAEwvWY)+M+k?j?iEik?E9dVN9klEK<;4GxIThpNE2wLcL0%_#S%#uc7F%D_U)<_P6o?4;z91E`lyBMQv zGpiyLsa&Nk`G%(t<0l3QaX|tR%!dF$J=qe`!(osIA!EQwGR>qzbTW>5@P?Fn=n{;R ztl*Sxe_PIWWJpw>PU*=&ZIicTjx173yqo$tabK-qF>c^DGY4+S#-JsV04U_%B=3rR z^9m>u8)*mBT0{iRr;F1l#Ke1F5()<3i!+5dA4&3_=FZ6AE84%B?wQ~fsUMWW9SqM& zV+1);_#oSw11n9@C~$<|UvR14*2s>gq#R}p*90nq1*TH z9Z-^u>q;|ILl|x~BxjmAN!AnT7v$^llI%)M{>5>Obi~C2a-kV$zb2+A4k#l=FzJyn zLn$Y|21#g-bjpN&ldLNOs<3B>W>-=YbpRC@AiPKio5%>t@r6+X1wnZgkK$4?>YC4J zL4p8}=^aH23EI%5jHJ@hxNplV2|Y zAz@=(%epm)Lxt`Z;+iTCsv0Yikd;RM9D0zNPq$c zDa1*Z2tEo!Q&l1k0$qhf#-s&zX~qwBrhMtQuFy%m-zFGHMzYq_C0D6hWg4WFtRil? 
zq7_g3D{ef@?g?8m+P=qe`f7R{IgX>P*`MfPEM*8#E0hk>d}4@Js01Hi znCDkxcXmQ{G;&AU-2S**)5{um$;H~u4A~|24!?fr_OXubh%VXf`JagJ8qX;^^dBhN zwNfo>7wDcz{02c2WiOYfyF@7ro$&EHOc=LW!6{b#uuLb*sz<{Ezv|g8ZFc#~%Z9mH zjzH+@q7W^+yV(%wYYP>Lwv@=VqJ>JpFvMAIN{Iz7m|o=Cwo>Z}HR^=?@;X|KsjSnZ z4JkGSRAU{8eiRskw;M@hl(TCH#sOWNl@je6iu8QV`^3ByJdZ{R)gpkaRNP`MTgjI4 zguBRe;6c<>w55;>Jzk~bgeYtDMU~w1ruIzLA`*jXgH^^#Ob1x1aYQz$_MM=qs~`r2 z)|_gJof58u{Y| zo5|kfcU#Op$7+XnYq!P_6VWk3WFDWV3YmCfQ7t=Z-~nS=b_2$4G-)CNMcO{@($zu3 zJNE9{dEn2*@gh51@VmN|4|vo**~mF?**-czW-oucY(}F;?>&S=I)~;yRAj;@oWu@5 z^$F<1@Cm2iKNe8!cBUUsOc_k`(;e3FSYpvf43iyP4*Ky}A3Sj1g3>UCV>djRU^NqQ z1RspGK92u?DlQ+5fes1E6xkL>uhtB7Jwy<-c$(w`<4sWa3xTjr6mYElSc2_4YRbd7 zft?ToDW!{sB?p;JT>d^Oc#4b2muAS~5*L+SeN1AyyGf~gV*wDu;-x`pcr{0en^l^k zsULb(9q*LR7mvOa@FaAgqvnKNisENxmXY_`_#KStb{TOJu~153O0w5Vx8`mI_N@*- zn^HtLc$!;yt=_|Oj6V>^mc}of!)h!syyq}(FK^<9XfP!)TBsPxOIT@0hDY(o0yD9R3oOK~Di&QF-aE>k?^^ZT$t`LUnWQK&1!EF2W@^%T&?^Ln zgOS7?m^eO-z9NQw)D|MH)e%3$i!Ko<+rz{gWBQSkh5=a{BiI;_wJ`#=px79UwHUvA zE$}*>P3p0mvofd&bcGmf0_4dvzfLYL!;@btb-ixhvahM9MoeJM%n}V@tBd7A!OG|D zycDkDcB8q36<^}Qzp~x;20`50B(AfW;_*g3!RRh?84b<>EolG@PSiTBMgDBL?R$&z zmi(?Mk*rKN*s<{9-oDhP0<8XssiFN~q@nmEV#Zy49*vz!XE*+d?@g zV|oc`?d?_z^b5^b!b-^y+!1mcx2Pe1YO+BM_B5gMWW{!xae?VNv(?rV(g9RHplM)V`e=G$KNju%uyfi?5gsv^{J2!} zk@Sk)W;AgscdF)}WYT(I{r}YI&rVHG{a|Y92mG+he}5;Kq4i$^nko4v(^?LlW;}`SAe8qJum1?O} z^c^y>3;$^Igy~cD;EU4mFR2*xkA69y|K7>8VvyTJfs|5tV^xtv!M2I)cPeX6L7MO7 zVvqQ++DKY4sNqU<@m-?DF15T@G|n=u7&KNmC#@KiRt!oj2Bj5)B(Y^@#h^iOpqNlK zP?u2(bjn}p3>qdE+iF~2yKwqgj%f0J6cj{7_2_1U!|3lF5`jNkk$*a zNnC0jVf-da&vWS=E9PlxDGdvAQAf#Ce$`vU#MG&^u20KeBDK*PSg|&EHGAtZ-OH%EOqHhwlWI*}w z-KJzn@UCT2{xq3xBp99t#WTb+R~o*7hagy7%uJT8X>6886-n_no>VZz+hj6eBUd+` zr+L@(2$Q9fc|mzw3c;EPd%)|Nns$^`4P0~&-;*ENMrJP2S-7U=jAKXA z#>jDqlQvgyNf^(Uc=k5QG9U-1DMcj31;mGBLCh}1l0|Mnv&Su7k%>t<5KGl^f{Iac zkofAtu+dmt_st-H)Y;~M3ODzuR$-DC#6T8F`2?9<^ zUFu<$l?xh4zcb^OBruA@Aw38mS)NWXN-G(P^9^W7wg7LLHa}dDfmeyiQfR*-VI`b9 zJT`Pl3^_yfxU`~aVJyfuJc;bDzODPB(7MKfP?Tzu)Pf63{D3A!_zK=6P}%qa4c2LO 
zV5!UuQOuR_uO-C=iLma|>4MX)Kqhl0KGQ-UO_*NdP}MYRN%d?iT78ow5b+@jHfdya zTQXQ6*j>|0Mtdsb7zAfg|NhL>)b#(cvqbqB6u!Uu3;FzCxcuFx52)}BN_b~WlsXD( zVOXkEt7W%HL|>^K64{q86|xw(T7~+8#D-LcS1RmC_?DtvzWKCirXF;;raYD`sXM(9q z`>`kl;AEe|v%1g*g?)_V5@{(ZK0?`(Zq;%cM)Lji@oSVE6DK!``d}(a+gcc#NCDnc z_QF7#CdBtIbhZf0207_}qu5GX!ZqN=9N;AQwDGUYnNMDUKnll+cVCjXspBX|c-VYN zNLqsXgc?kikiO2`g!YrgMFnOhSH`nV0cgMg(I>J1&$;Al2=%;8mI5^!3A2<&ZsIv7 zIf9I4j+vA&kf_wy8!u0DTiGPDAjpB!PI8iFX5o}i;G)10aVjZ$iI~u7)8-xWK=p9| z9P=htrX?oz&;md5dZA2l7F8@u#Lvr4KnH<30BUI--H4d;|F?H;J#t-Fey21s5D_uv zVK4~>29kp|#Mxt5r<%i7XSwIBQ^_^H6Sp%GEVtI>AzD zD?PR?)v4J$j4?Fal+Z_TnG!!Hw@n>wmI^`uU3Qis!kRX9lXYkC6efIL^=(qs4D&j) zcDTmzC|jt)LF-~Catr4IdsNDD9=;v;&$b}~c-VGEAsCNhclha65aV8}u&nYZy42+v zhz7)FODHd)2k0Cy`aMxs(V)m&R zC~eQ{P!c4oubAG)b&`oYh87O#Q&lg^YnQL6UZ1rLq<|!Q-QE8v9I&qynW=PU86Xi z<55G{E=L)OJ0_h5Bj(7u2C4E$by&8{Y-Miw;LkI8YHdd{m`O!4n}iFEmIFxzylC>M zmrvQkK0_y+XM$wYj{u5Tad-M#k==FLG>#R{FIx)leVXjT6U<-p>d_DxlUlUvwU7&Y z9}0-36u#ep$)ZzyyWDFowEXmS>~L3iJg5S3mm}OxH=1>hi|1>Au|LTav@{4P)X7u-eMv-8rhMw z>la@8>q!s_n4DCB*?@eJH$XJb4cV!W>6JEUK96^W0DwpUsqjinjv^EkxSQ|&nC^$% zKa_vxbOO^8RtX}K38GaDBD%JtGut30m{~!f_^H+X`lj0K1mcCF(81)#ayt;TxG6RuQLyy6EwMCIZGU?G#x@QUg6Ufsx5>3x@jUO;E2J=Ez*GWfzZ(fR zgR!6(LEZGYfr-70kLMY~nHk~=d!5U(bkgBcE8SMWy=Wm;$5Lgwu-r`WwyB5a($Miu zC!{xB5s&+-u^7uUJ?R=lNQ{kfIq$?(P6z22xC6vBPA(o0W{qE7Fv0?Iq&5>nY@YRf zc#-TO+Mi-2kjw^muo5xLjTo>U6vy=kd9!MUgS!-f$7VY;cLmK>=S-x>5CeCc&JbQc{cNCH0iR`zCyUG2-?@w)^hf=W zuE&&6JsPxs?f6lz=bbw;$T-u?Ysrr+hT}nvHKk2uyf~?KC&CN#iS>ik%^e+W2oQZc zKSJH!L=Q|T@aB0Xfc?rW_A{C1XXZlTnmQTj%HY)B{P$)}H)uN}NaDzNY~2bRGohkb z7>|hyuuA|yF!O-zZVORmtY+R_XF2I8>Y97fZ=X&#=%}xXAmW^^5VUYjfE#aK4L!*o zGZ-A5N{bBW4fKYNQ_M#%-Qx8?nZx+Cv_0lZqv1SXb-KxISh{3?nCjO~&xqYM4RIT! 
zhLXyxVmiFp4KPnrr%H{d{R#U3H;B7eN`f8nG_u!3GE@c0lA1!SUH&l77N$2PgyQMC zxYZe8Sgo2PR*ApQ=@|x6PBgtbc@Y{PW5v#)DXpm&w}m&-e1W)#7LZsgmO9Py&^h_{ zPr8oo=%j=H^@U9TIqCrIL(aTsLA&P}Q|kPC7-K*NYab!t!JYP?Gh9Qnr`HuzTkfVT zQfq}NQMvzLn@PhWS?M>fmP%<$OSoQswb=Lat2L#1O>#h&@M$dci)h5NrhU%(cr?OP z-scjNI>FJh)ar%dC1|u@uCa~ga6-=K!mOQXB+E*x+^W`rV6WxFVv;n&YN65$^W_{~ zw`R3a%eOY4k2~!+M2AoxaQIA?w6_b}JvpzY0g&^#^>QJ%UX1dZ+i89Oe*DNeP`IdpNTt zVZZaTd(cLPU6ZsBEf#WRBoU1IUTo^=y;!(%S14X@Cbe9x6w!}BMd&$Lq>4tR61`TV z9ae*lFKnuRfmK#a{FBs4;~r4OL0xC5zjlQfF){U(lwgB3Iqo#$kUJ(y&q3BvM@}I# zXT{{AW^+gBWGSx~sS}qIITw{1VXIKB(RU!PI&o5ktJQinUrQSNx3R=xCO4~Lx*$wZ zO2Ac$lr43##xz^^#HpX;8=jJhV`*&5)KPP}+{H4+^1)F$2p`4V^gnP=B@Oc*54nE| zEDShP4vB%Vq4u`@9*JNubF!RBTYdWyQdcy8&dm5C_S)~28&b~CwwOC=(UfZ%En~mh)I1IdY`_jtN zueL%@sYTPk)5x`C0e^H(-f&0XRXQ!+3SAZ)MN9M*&!&0KJoDvd@cMq_7k&R?6^Z@b z*=(J~9K7h97SQCFfq>MU>4i&|pjRd(=!>b8bppMLhHa*6smxr$?4B{Pm%jBJPxJpy z8f8o8|8xES%f||q2vb^JJLRJwLmCex*1yY$GS`MBvyggZZasyf(oldoPvy|HU;s@Z z`Eur#$k3k65sio9_H&oL#N|Pu0`I<7deBD#^;JaU=y$s>5AfC82%?g_i5BYkg!<=k zeQ2rdF0v*lZc^8-%YXZnR&htK_Ih+f)Rm;S?;cTp4>Of747ZJW*5%T~^+OXhc}Dq3 zIZ*0j`Z=ZlG#FJ@RTQzlH1ozFt*0i^IX+F0aODh)ksON-1&@_377#d97X#<#A~tKj z%g5xgk;bF2_+z=F2-IQAr+IUJjR5Z+w4Djg6Kuye4k^W9H?Gf2IdlMuf`8tcbYH!G ztG}_XUa4Azx{<0BKn)gdq)&xABOGMPP}G_&GmJ@O*;U+|0dc&(qhWFklh>a$4Z=L5 zf**}GwBYp#&=!8oC7@w#S3M+{H@Y5G&pV_8uJbw0DU3e_GB$ z)I*PiLg38I^X$XTb&?-p8LhJ}fu|t{e>K_5(|h!28MPX zW6c31hE!*Op$Rz z&&3<9v>t4A<3m<;yG?l2em*!rr`g{C{^+-x(p?VY_-?(<-$j}oM0liMYzbkNRT9LH zyA(0Iq~wq8goN1eA_%%fAt!Z1uuLw(Yh9nOhJtk2DMLCPx=tZHDp&Iq#>Ql4N1s~` zj-B(yrahZDM^v{^e#`*^X%xfLB6e(V^`;N;jk*rk zn%|wHZ-_G~r@7+@hm3NVwuhS^@gEMOUCE<}%rEYXRp|*K_;UWjIitqOq%goO2#UB_ za(9!T1puPl&HuEsefM#2=aVPFPonk8`+6Kt8%GnTmN3-`HwolcvBK63bJ)zTdE?#x zv4Z9}Uiqf0_oYE|R5tR}BDIK1Kyx@Ni)~gyAp1-3R4mTFJC^!naXVU6NH5K9W2WN~ zd6&tnHB2HTlc(Qq$(}he;vT^Jjkxn{iQ4s(wqh!%vOWHMw&IQu%K2$zSWUm^;v3(d z?G7)i?G40M^5sgiQOV^)idG9aHmY>ZZPshRdq(wYwN=lza>Y%5E~`|Az_YM_TK(Vd zjh^AD1th5{`)V)~V2Sd7R1ub>Tqq}{RwK;SbAoU!mckm4u3^3y*TJGK7i+Z(MOc>F 
zgaae|L=wTL1y-1UISR-@^nysQVZs;Lp>?uoNd(fc*TVa<_cvW8pPb|UjcS|&i7^W6 z<*8mSx-+JV*5E{FUnLy14Q!SdhVw)oG4FP}VR zBFWi0h&x7%s>NcVh=ywBn!LYQV)aehyi-I`Da4J4AYf@TkxmZDCZY;DRQ!LpTi*jk zSgLDEg3zfT*@Wes`kd3XZ3L1UH#`22jf{;)FT?LnN}=kb4{KUyX=iSTxTvX z@;3uzps8Md;P2ZwNtTTOMiw=BJ#NrKR-42(zm;WPFcw%kXS`?P5ngK57qrz<|NOVq z5Bv)0k(3f){?vZzO<(+19<8M2@SSw6N5v zH+R#0OTMDFXsh}hhSZI9lBC^^+k;Pr3D^H0D<cEm{teSKUNvz6aw6ddxt{G zaA+&OfG|y0wsH{{E+X8a?xrdf!wpVcgCMw7dYN0?P*HBiKx+`z{($Q%NyYQ;x<|y= zxAT)P-JZ8%NrE!mWpSA+{^CEeY816TcF~^4doM?O6ewBX<^Gsnyu6D7D0>>$3z2wW zY^ZLsis2F8%W@TVR4IcipFXY@;-bhBw+F*|fBw-oI7m9g@s(9zK{Y^f+^TuP-W3$x z<5nT1w@_0HoQta27-$Rl6PE`-6pF~3%1LZqR^Q{|3&!t+KIWU|*Qya-hTS&682b%htP!i!kp zMWCHlcoFKTf|^|6Md+~eMXjVs2wTqMTCDIQR(KI)=B)4{RHI4$Q)+Rq@FJXrt1E!a z-W6ViO7gAdA62@-i*WbS6G40Lpy|}_23?f8$!dC8%PPZyo-$6Lql)LAoe~bYr8Q%kf&*)X{6@Ff#lag9yoRX}pI>xWo=+P>^fNQWgRU=rqhk-+rzn z;}o<+?i*zeJM1&Txmv0_6)nImX3GJrQmHM_Q>mY*ndhwpJ{&D+>=o6q)0l3;n8X=WJ8E89f z$slIS$Pi;gnWVm`-)!-VBo3Y)y~5>5O1@~MH6N33Ee+@Lu~$JKV<)ooN!P-Y+8+Vg z2{FECh-T-BtuqCHQETf9E)Zz^9ZfU1XZmJt~K=~Cxro>}nb)Nge1Zj=kISiOf< zpY0CR=!(xgoFno1{BUX=H3Qt&9wjOTd%UTqGy$F3^1f50zrx2yQc=`EG?C2@_{p z$i9l38m~A;wph>OTJueyQ}KX<(~g%M4MH|EYhJs4m+2zTyhHD?SxF*K0CTgq8a$GJ zEn;NuV?r(iT-FTYF+fEefc3-~9z~Z0khMnmz;KiyBYby_dn=zB&mAa1bJWte(sh(EF?obRO~T!`Sr|Ku_=W zx_$aD>VOb4YJ4NDTxjB)o&_mb)E;3aezH1Ms)JZN$c+x|N{e$a%E@3}CCs36ijyWY zcgK?Ig9>F@AcN`_gOX%AYum#v+o^$`?8L1@qq-}Xy%}SmXnw(cU~f5mr2E`_wWg2~ zfDq^u#bBiTK{-ur)B(aI&8ubGK$z z3dA6N@JgvD9zx*{%~}>*YtO?w){0ssF%P`TL|j*z4cXb+gvXK zW#3(5bsH5Sw7gBL+m!#`4XMP$vZY`q?)Jg_f^B!He@7B~$tcZ$2jwSUBN4uqyWB+b zHCK(1_Fu>D)ZNRQ^a6E$jP%lqei%^n)p^;Y!uXZNp6S9)6Lzp(@z@z&fE&$3OSwGos+l&28v62c*potyLY7t$EVmN1 zCKMKgH6hzsd*gDh94B#ZGhJZV>ZsfVII(z0ZDGF~Ta^HN3$H%eLV14Bj?C;g%_5vuvALvbj85Mr~u+qtsa#NVFappSZoyH zuog9{VYyZ;HS%%37PnfbQ6Xcb5q>1D10jv*Mlbhw<3kiE32md0y9$dx&ncP;*&78- zJ1P5b3a4_FXPf2OHfc&+tW(8WtdyzKqj0iX%jH6#k~^E!xGI zAA^J|%L~JG&JOHjmHpq<6FB*GqXQ;MQ%D7-YE`K?K4=26(tH*RaB~f9T@ESXrzARTH{vF!ENfrzxue`O%txSZrOAbi%^@&~y9dX{7MfA{ 
zMy|9`s#Wt*!JF5NVPDq6Ao>HT;5UO`&~$Fl|2%2(yU}2b-*t_8?EXNHx*70E50EH* za9iHHC%icLct3urxhB?K)<$p`9{QyQZ6xEs`8D(RCoCAs&u63-wWrTnIxNU(4aiWY_77scN`}0 ztzKK*%TkJ-bin)c{yDzRm1cFx_qqSS^QZse8vkGYZ$*I>1y&STQD8-Z6$MrlSW#d_ zffWT-6j)JUMS&Fs-VO!&@BE`{@BHAscdosApXkoqMW~yUx1a`P2V%jsLIyx1zv`0xJrvD6pcy ziUKPNtSGRez={GZ3aluwqQHs*-&G3y(|_=T!h7D2Fn9ltN?KOxTR16?U@B4Y81$o3 ztt_#X=EiiZYFM_FOH5eh@*82)hnvcU4T>l>M|Cf zzx@7}-QC@vefi#(|LVVeaqZnd`!CN5g};6O-+b>cwtw&suU&gL<>ZsR4Z=Q{DWb2H zlV&SVSze=_Z&spasZc6Zo}eV$>DKQCKcc(BMsWYmLd1p!07vw!x#+l9ive_(v}Ug7XtatRv{svGQ*l~B_Vd|d*E$v^9frkR-Yy^;u z;70 z>V2pO7T1(c1G}^<)ThNK0g7=86tZ~NVSM?y88j8E6R&@&n2aSsFEdsEc63ZTnnQ}O zyMcknsResf$#lAy5FAADWw^CJqDt8C1?hVOeuS_#z*%)K64DP1h1uM-ba%5=WUw_n zK6PAcllVkfnU2{(HseKn1R^VZ0u6bZ{{v@eK#w4N9wrB_bq*aF#EziJfdmFtMUNQ? zaBtgzRZJjnaDxtsj%@zl;(I@I$IV~JtoidVhhP4?e}Y}}%`Xdu-|hd~KVQp9FcpbytSdtP3Ym4cjUP%Li)NO2R{QV#PBiw}KcZ3uZRWr{1`+xnn0Z0N^ zD@)8nC&$z*eqyJ|5;M7;&ScL-2zD02<^V{<(d<5(8UT7O?v9Kdgm?@Xs(lQKA}ilb zURcL_simPHY<*WQ4L3YUQ2A1qxBL4ZH+E|T{v?gLI6P)BnEAP7;wIeVCY~j}mA>f& zEXzQ_M#-RW^A$9Iyw&A(huEc0gySmBA>2M$&LGo#c=Yw2L4ty&BJA1SUS9~zz_scy z;-r$a0+5PIN0%^oUV}eM_KC0syZVn{I|Y$>JLXF?z>)?Sg7_)?Tts;UhQcAb4p2$X zQ_d$-8ol7wm>mM?)Tu}V8x=s|HuM(!VttfsTn9{ZoVI2FH!(w{bqdbfyD(R+WS$Zy z(!lz5-HLk#dYu83GRF4J?Z-Qhj(+#S-R;MLhp=|QmJY0xqXO((UR38;7J4z;cmv9S zxY>ExJ!r>JVb`6v`tVNj0@EQtjs1wL*FT=On*Z#-|6p}H+aQsC?&`Tn-_6`pGky16 zx}E887-UW@GI&0&bLyB$^(lSfl0pB4vmqU; zfuA)P9~>keML1e7f*0E9aQ=(A#{;DQUKjM)wni)3OJwOu#yztpgTK=Ik>0LHktLWA z+@!~yA5${Z0+N`!Xph-S^#v!rM`n&87z>&c_;=}HQ)=0~Q^DIq?FSpz|LPWhc+`2` z{|La&2Bt#rQD4y2jArVJXFlngzL2I8fVtLIPY(N-Nqp@V29<0RQ(6wGnMNl1O!@(c zk|DVGvH@i|4+UJ#w49-m%4j1Mu7D0Km9y*0FCFH@kv-uow3lAR$^SM8OzxB-o3Y_Ge?52%;KN zyIVep>EnTJC{1=-Z83F~7|~*2rw_jCD_~!R@Ot8UbpxZtr(9fwn0 z4bb7vry@Yb3aCW6bV~MKYdrwRq-zqp*V{}fA)QW7(t3Q z-3%nL(54NF^g` z(M8nm25Tvotp(fwgd^>4&wNh}*i87-x4{RaL(xcIRr7?s9*!$_%`$Cxle;swFWpq_ z(A9Ggi??yXtjGND5V__r5H)TAnY=A~Bwfe2+ah(!Z5X=@C4?8ghfO6WQ)2~hj5!T8 zGZb$;^c*Sc&1pk%A{TID-{1}>|1cPhffxS}JHWIGR+05I?Z&@0m}g-3+a@5v(SHu2 
z{CX*8`Zv3O`)(O_1N2Hi74(T-&2Qv#IsNOti!KsjN0`${tTZEt-=e>aIUr-b3e2dW zhgpqj_h@z~t;fayZeB_Tzs6)RG)c^2Qw8Mcn(bJ20Bl^(-jwQS8GZ0G2N|j`1_?Hm zZHJux8eQbxr2Fgu^RY|R6xl{9X2xQQ^RGi)@hqE>$}&sX(-RyY!JQ7!?EAy(xA=njKIm~Gb!gAoFefZtaOAf+_*vSp*T?QL zSH%dFZ*|*!NTV8r?t!+dLlc}GG>YVa>_=G8ZQf!AGCXl>F`$FgiSm?zz&KL(Q z&|=LPP;~yld+=a95Sab_7%*V&oOp0X9F$K0_@t?Dzt6YsW4b-U*EjB>#2#SAXpBcO zzDv4=+-F-~+Try`$obHw)rw!TIfm(+riEUA*lzUkg6ec0?e#lNw4;z!?Ta|ty5nAZ z-vpQe)FV3lNGH)Cq_Z?fRKCjTmyhznE!UBmfI|z&w@%XPfUMdy_RsQ7D6^uFesUndF;tzRlO0}|*3NH=C zys$wWFe;`Hj>2l3pm+mED0IY8$Dk9v;Ct|Y>6dQsR1QD*isK!_Vova7a=T;BU5jS! zFS^b@#44g^6WziOwJe~u_Z)EJ1$f4iF5@1O)&M5*7(VA~9CUL89*z<4fuUHwekvT^ zQHURO_fvTc3foPA7E|MW=GzEq`se zx!dn!afJt+_$9Ev&E4rlwzG}&`h>W2oP}g^iEX7MUo0lITG(hN0KgM2538+u92P1` zwcN;8bJc2;fz&P@oBRK@xy1CPwqdsua-bqtA^t*xd;WY%tim}>D=>MJs!&n|XtMVT zgff&ulsqw=X{lN~cR%k*#!IVMtTqY|qf)3f0lY4h!&*IVh1F_3SE|?Hr)F)l>DtP% ztH;lRP`%CehG299jz5-xvD+DAw*ZcBM!0uvgw6%xo;YbSDLQy@8c+Qk%W33G)oP_! z2$Mzu{O)p|t**8TVY!-A3i({EP>GV$mNUOVwm=uLa6oUUL6f!*>R*EOup1ueybil| zUb~fXeZVylme{GwT2Lj<)eCu+5f>xGyHU)Aaa?PKNnEbgYlTv+Qmdb~py@qqM@yPET{$ z%OwQoQW`##E=;eb*ycheRJ!V`)EVSzAC*^;Vpsbterpsv=~z44Js_QGc+P^BJ$;(hzjSb8(2yMuWnDeU%koO(^A`T z{)6y%&A`>N@ts5UG~3-b2;lnv*J`r+$qRZN<0>ZSW}UzSp7EovTcaVN48mTDTquZ+ zcQLrmmsqB&}3|2t~ng|4wD0&ZLxPiKL;$ zk`kJiwmQkL+qmS-X?!^5W=l>YUFvp;`r%qus9r%%ABSiDghhyi;LXMX=)&!UN|M3n z?S=x}${$pG@Ni6{FP!e0lp-28lO`EAtzH}Nahgv^HYmS#lz>(i!dKkFg~@$8P-Up5 zDUN}bUa8nRh5j{tze za!??~!Or7TX_;<%X-1er9=KD-#CBbB@Ka?)Sl>X)9wi$=)N?(Ep5m(PIvKre)=V0-l0z)4a~rncLYt-z z4~SSR^Oi^gBiOg5mW3x*O%jjdF)~pM)`C%?&0Kmb zIpnNsDu#YH9v_S>34q1BT8l}b(sF-Vi57BV#yM?sa^Z8I!>7oOBJQkIa#V*Y*tm>a z2yPOlqIq+ZEIl?ss)UBixYLimMm|5A5@zMd*ckUbae0&=tzvm@M|rKa?&Ve61GU*! 
zuEinqAV{56rudlRlm0=t)ic5qI%JXFN3LM2i(0}g-$;k-#zeZd2Tl8Phqm)we)P@k zNZ27fWs~wBagdQMqD&fTo~g|wcNEnsc_LV=SS){jDhi4elicHMXdhS<&7SBr{Z5~5 zgkBdA7=Znh&7%Rm9!!F1Q6phf5!izmtwg%u-hhxf`@jGM0MX&f=_JoedU3BWrg%hu zDN-!Rcva;4anc;SRBi3Z2OsBckqivhbkuQG4N%B5nOw>S?6JqR9 zaF5owF2w7OT8n5WHpz_bI2za}QI`3!D$BMy9t-tfCq9XoDvdk)`Wt9>`4#tT15U=5H)|BAiu8$?x7Ka;xQ`|_NDO*h5M7LUbApI}u@i18?+BNkRwy8Tj z<9EShBos5qdT@^m8AhI&g)+W)7X~HC)}$zR(^5rW45RaF$5Qv&J~>fkfVvdkafJum znU#x5U0jcVFQabR-PZ2!(;1VW@--6tc)-$7^L2gmlf|G;m~1!1HSxm668t}J61rUhe?6+3jx*XY`|^g2rl zI|vkF63gS@DGnv*UyCT43xg3*F47WqYv)P-ZLxz`*_N5H)wB?k%>q<_uKH74q{Qf5a(3)BobVeY zLvx4K$I4NLiEThF#J)<|w)5qQ^_+}J)mW_oemoSO#B3n_!$#e;1@ADJ2DDj8B75^Y zU~lbeyZvj8i(n8ud{!y6(;kh@vnJxS(TO~YCwbhW!k<(zuGC_&{f851~P>W4M~ z`MM{}Zk%8x`6wlbN=kUIDPptsF`LAOKY&m8rD^3I(e;Lvlsd*2HJ-G4FN3@NF+OLH ztnl%0#QEU}%>$Wld1FpreHg=f8f5YeelfXaG`^9xixg--dVySlENTxw89(6?#2ae9 z5-Sh*R8|IybHxD3V4fkLpq?bf#u(zH$;{ocq=M-}S?)#j8O8FZv)C1@N3v4`J=uv{ zhemb47?}7?rNC=`!F`-WU|kq{*?hI8aCn@lN8~Ly=&`1469p~-a#JWqMZ%}h zJmvi~_8}oN4d_m+HI0Wl!`FakPoM_#OT0C+D)*RCQM31UhqNgU=q^VL0h{O2t(r3* z+y!ex`T{q#KLU~Ou3T@P$Tsj zBYT8CwC{Qg%XKPJy3Ucr3Lo|33DSv)BE$A`;wzr`2nDJM0nr?AS{~psWL&*Q{`>ow zUc+#^vA>6v6?{ng=y7Kr?*S_*0`0HW~0FGKvEY*T?S+$b4<6qmS z)p98xY}P74ITtV##?^56^H#ePb||shAWjn~-ah@oYy5;u03Iy#7Je_cs*RAJFoVm8 z$a0sb=+gXz<&8?YTqqa*8rY3oZtYY*p)s9qR>ZZCeF=-zVid;JCS?(o3f5cFDi;fd z&F4aX2?>(x8vjhy%Gq`*ZZ|21$gSs(RYNHMe{?XsvAMY*=f?bYSF$3ag?}ZkRim(! 
zkCi50tdL(G=ffxgA+V5d77LZ?87d;?t09)zf`KF{oFP4l=TqAf_8mS)*JG!Uwmu9w z=YU2YsKCUc4QF&LmGYHVxmB%)wPGzF7L%kIRtuG8m@nsm7iv}uwR~$+dVRJGaS*!M zVhQN5@jqQ;?rA=KZ9QMhtw*(-82^lmsW+2au2zb|R?^6Y#T?73CapNE#TI6rVsOzp zEuhIU>k6qk(+ig_Sty^cWMP?8=yHzSrEdk3Y=ETOxS(|PABngb5{pRZ~Lj<-CLYvIyjfvcKe@wRFP{y&_}Fth*w delta 4052 zcmeHKU2GKB8QnX(UUt{s9RmhTO_Z?%#3XokW_Etp#@HC+Rv1D;t14{`zB6;jyJ9_S zX4b4i(l}Os^3yhwWSZ0+Ld;6XTCe%-0$4;oip>)8+A|py5>OB!f+rESVsP{KlGzgyc#CQXk;7tYVkE*e#4jXf=u~2WK1Givz8mjNEQ^1Q+`J|})*Bj+14Cv7frn5} zD+X(_Mq1Kj(-c`*GzG+Jf*>J9V_BrDiVj(o=Q&O?bWIi0CR9XCiuqYgLzQG+OY4SC zMk>?=k_QQC0qK&Uq!meL6@_P6T{a~{OlxV);1@wdft<*yqN3vCe&TArKRkD5-%7Q2xoTw>+tU)ual9D9yCpABc(uh2RlT=w^QP@wTD-fy%hZK%1 zUWF>bOOzx=!>iUtc+UNa3=(+!Pd1zl22!4%~*R7oW&uL~+dBBwxsGYnOLq9F$TEN)Ee3R4@& zMn>JYvt1KMBK7~@@c*F=UlAjX9;1&3=)V$Cn4z!GZ_yX%Y5GliiauU$W)9J@Sa3P8 z(ViSCl~11QC+b2^1<1?`^c2yhAJFcz*XqmD*OdCj(O4i{vn~*SA>I=^8LyAF*Q|TF z0bCE3>Cp91DBd36%8BUj!}vN4qAThm7X$P;dRyXZqBg!G4(h&Nx3%_i?5*hK$VH+q zKaNaZYy|9r2n9|DB8S0ga_yBlxoUR5o?TEECu^s`jzDA#>>%++z~qi@5DDQM8$cN_ z?trldf7kszsn}XS2ls4;a~A!HGbn5Xh~$4 z4am;tET*+kbk^Xp<3Po0`#^Mz$-*Mzpqzy)Mng`)GLTJ%kD%OMri~f348|#?vWwz_tdrz%L{`s++>E;FTZ+-~+=CMs=gq}$paq{d0HNW(Zc;aS6u$N@DBu$YIe1)b=-$}flVP@;dCel5Jxj)GSHWkn zXo44Fl|A+*d0U+H*Kl7;=T=m5>_TC1mOh@|j9YxVD~qQ>C$SHLR2 zNfob2Yk(icrFU5^pb+_(2Id&MZ zpwg$iKLG3^(x)(oY|;_dEIbeW1w4pfa>2>^3Ot`J*hOZwM-%+|A@D#2seAqm5-UFr zs5_^dgUY+vJ!hr}!5IA0KZ1vSsJg3Ak?5!AkxN#2;T9>Yk}EJBhj7P z_$}prd)LE3R!-ybjbID@P6xFL|6wE5>OPH{um(W}AInhp;BRT5Ddd+krGe)Fg9mgV zyXVd1Scj3@!{~YFIO|&UBHZZ|3-)6zogHxAMTP_fBCAZ+M&`PfY_T}vWKyY8sgxvQ zMpSKA9@@#`p45nq90#QgRD`*F%8#`<*+Qw!AuAjjXj=r zWsAf4md-6GYcroB+UGEPM;U_#@3DxQN= zAda_Xs1Tl3sGd1wB~H4nZ6k|H3#ulu+&78deVaX#XvGIsQ&BuJ0YY8oUA~TDFaf42 z6nv4U?pxwpg(tnv5cF!SI0jb+DSYG|Otlm9ZD7NF~8&vax%9z35I7PiW%giP%R^V=H=Zdojc6vnJdGO%&#se#3ReXnid{g_#>GEgN6%E`rLnXTI_B#Z> z^AdH{!}orIG_}IBV*AW%*?bk)bxpwy7`;rrSOxa*1huqw)&!W%eZJY@krR~fb~PNQ z@Uj~vX7|gc;H=qwY`!@Z+1IIA17UEcG1%z=ShLxC_nJKD%b_;|@0g+b8a{ewA)d-h 
zl}5*nk5epN`4kkyBqtNgRq>U_sclQG(R_X_(_$b!=i~}jONQ~T;P;=Pn%(``@Df4X zUy9z5!SSwt=ksj-*BQQ-AvAp60{wU}L0am8SE)^R$>oGQwreEJ&C&SH=KlbV=LGg< Q27eCBQvhC3lv}m+A4wL;uK)l5 diff --git a/docs/PROJECT_PLAN.md b/docs/PROJECT_PLAN.md index c758f5e..3c61216 100644 --- a/docs/PROJECT_PLAN.md +++ b/docs/PROJECT_PLAN.md @@ -13,7 +13,7 @@ Eine modulare News-Pipeline mit klaren Stufen: 2. Inhaltsanalyse und Normalisierung 3. Rewrite/Anreicherung 4. Legal- und Qualitaetschecks -5. WordPress-Publikation (`pending`) +5. WordPress-Publikation (Draft-first, Queue + Retry) 6. Monitoring/Logging ## Grobe Zeitplanung (ohne Fixtermine) @@ -36,12 +36,14 @@ Eine modulare News-Pipeline mit klaren Stufen: - Feed-Import mit Duplikaterkennung - Admin-Login (ein User) - Manuelle Review vor Publish +- Admin-UI fuer Rechtscheck, Bildauswahl, Relevanzbewertung ### Phase 2 - Automation - Job-Queue (asynchron) - Regelbasierte Scheduler - Retry/Dead-Letter-Handling - Robustes Error-Reporting +- WordPress-Publisher (Draft) mit Mapping `article_id -> wp_post_id` ### Phase 3 - Compliance und Skalierung - Source-Whitelisting mit Pflichtfeldern @@ -49,6 +51,28 @@ Eine modulare News-Pipeline mit klaren Stufen: - Qualitaetsmetriken und Audit-Logs - Optional: Passkey/WebAuthn +## Aktueller Stand (Snapshot) +- Backend/API + Admin-UI lauffaehig +- Feed-Ingestion inkl. Originalartikel-Extraktion (Autor, Pressekontakt, Bilder) +- Bildkuration: + - automatische Scoring-Reduktion (u. a. Presseportal `story_big` priorisiert) + - manuelle Auswahl/Ausblendung im UI +- Rechts-/Publish-Gates aktiv: + - `legal_checked` Pflicht + - Hauptbild-Auswahl Pflicht + - Status-Workflow bis `published` +- WordPress-Publishing: + - Queue + Retry + Job-Historie + - Draft-Erstellung/Update erfolgreich getestet +- Exporte: + - JSON/CSV inkl. Datum/Alter/Relevanz + Attribution/Legal-Felder + +## Naechste Iteration (konkret) +1. WordPress `featured_media` Upload aus ausgewaehltem Hauptbild +2. 
Publish-HTML je Artikel verfeinern (strukturierter Body + konsistenter Quellenblock) +3. Publisher als periodischen Worker (Timer/Cron/Systemd) auf Hetzner betreiben +4. Monitoring/Alerting fuer Queue-Fehler + WP-API Fehlercodes + ## Architekturprinzipien - Idempotente Jobs - Trennung von UI, API, Worker diff --git a/docs/TODO.md b/docs/TODO.md index ad9b549..fee4a67 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -1,10 +1,10 @@ # ToDo (Ein-Entwickler Setup) ## Jetzt -- [ ] GitHub Project #3 Felder/Views fuer Neustart vereinheitlichen -- [ ] Alte/obsolet gewordene Issues kennzeichnen (z. B. User-Verwaltung) -- [ ] Redirect `news.vanityontour.de -> vanityontour.de` aktiv halten -- [ ] Wiki-Basis fertigstellen und verlinken +- [ ] WordPress Beitragsbild-Upload implementieren (`featured_media` aus ausgewaehltem Hauptbild) +- [ ] WordPress-HTML-Ausgabe pro Artikel weiter verbessern (sauberes Layout, Quellenblock, Shortcodes falls noetig) +- [ ] Publisher Fehlertexte fuer WP-Auth/Media/API in UI klarer darstellen +- [ ] End-to-end Publish Smoke-Test dokumentieren (lokal + Hetzner) ## MVP - [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI) @@ -12,14 +12,18 @@ - [x] Feed-Ingestion Service bauen (ETag/Last-Modified) - [x] Duplikaterkennung ueber `source_url`, `guid`, Hash - [x] Login mit 1 Admin-Account implementieren -- [ ] Artikel-Review-Maske mit Statusworkflow -- [ ] WordPress-Publisher als separaten Service implementieren +- [x] Artikel-Review-Maske mit Statusworkflow +- [x] WordPress-Publisher als separaten Service implementieren (Queue + Retry + Mapping) +- [x] Bildvorschau + manuelle Bildauswahl im Admin-UI +- [x] Automatische Bildreduktion/Scoring fuer Presseportal-Quellen +- [x] Artikel-Datum + Relevanzscore im UI/Export ## Recht/Qualitaet -- [ ] Source-Policy in DB + Admin-UI abbilden -- [ ] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) -- [ ] Auto-Block bei fehlender Lizenzinfo -- [ ] Pro Artikel Attribution-Block generieren +- [x] 
Source-Policy in DB + Admin-UI abbilden +- [x] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) +- [x] Auto-Block bei fehlender Lizenzinfo +- [x] Pro Artikel Attribution-Block generieren +- [x] Manuelle Rechtsfreigabe als Publish-Gate ## Betrieb - [ ] Systemd-Service(s) fuer API/Worker erstellen @@ -31,3 +35,4 @@ - [ ] Passkey/WebAuthn evaluieren und optional einfuehren - [ ] Migration auf PostgreSQL bewerten - [ ] Teilautomatische Freigabe-Regeln definieren +- [ ] KI-Rewrite mit Prompt-Versionierung + Qualitaetsmetriken wieder aktivieren From e68b6a41fdaf7f33835e80632cb4522b4b7e0ee2 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:07:08 +0100 Subject: [PATCH 14/54] feat(wordpress): upload selected image and set featured_media on draft publish --- backend/app/wordpress.py | 87 +++++++++++++++++++++++++++++++++ backend/tests/test_wordpress.py | 63 ++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 backend/tests/test_wordpress.py diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index adb4d9c..756a346 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -2,7 +2,10 @@ from __future__ import annotations import base64 import json +import mimetypes +from pathlib import Path from typing import Any +from urllib.parse import urlparse from urllib.request import Request, urlopen from .config import get_settings @@ -56,6 +59,77 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None: return selected if isinstance(selected, str) and selected.strip() else None +def _download_image_bytes(url: str) -> tuple[bytes, str]: + req = Request( + url=url, + headers={ + "User-Agent": "rss-news-publisher/1.0", + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + }, + ) + with urlopen(req, timeout=20) as resp: + raw = resp.read() + content_type = resp.headers.get("Content-Type", "application/octet-stream") + if not content_type.lower().startswith("image/"): + raise 
RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({content_type})") + return raw, content_type + + +def _guess_filename(image_url: str, content_type: str) -> str: + parsed = urlparse(image_url) + stem = Path(parsed.path).name or "article-image" + if "." not in stem: + ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg" + stem = f"{stem}{ext}" + return stem + + +def _upload_featured_media( + *, + base_url: str, + auth_header: str, + image_url: str, + article_title: str, + source_url: str, +) -> int: + image_bytes, content_type = _download_image_bytes(image_url) + filename = _guess_filename(image_url, content_type) + + media_url = f"{base_url.rstrip('/')}/wp-json/wp/v2/media" + media_req = Request( + url=media_url, + data=image_bytes, + method="POST", + headers={ + "Authorization": auth_header, + "Content-Type": content_type, + "Content-Disposition": f'attachment; filename="{filename}"', + "Accept": "application/json", + "User-Agent": "rss-news-publisher/1.0", + }, + ) + with urlopen(media_req, timeout=30) as resp: + media_raw = resp.read().decode("utf-8", errors="replace") + media_payload = json.loads(media_raw) if media_raw else {} + media_id = int(media_payload.get("id", 0)) if isinstance(media_payload, dict) else 0 + if media_id <= 0: + raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}") + + # Optional metadata update for traceability. 
+ _wp_request( + base_url=base_url, + auth_header=auth_header, + method="POST", + endpoint=f"media/{media_id}", + payload={ + "title": f"{article_title[:120]} - Bild", + "caption": f"Quelle: {source_url}", + "alt_text": article_title[:200], + }, + ) + return media_id + + def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: settings = get_settings() if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: @@ -76,11 +150,24 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: footer += f"\n

Canonical: {canonical_url}

" content = f"{body}{footer}" + featured_media_id = None + selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) + if selected_image_url: + featured_media_id = _upload_featured_media( + base_url=settings.wordpress_base_url, + auth_header=auth, + image_url=selected_image_url, + article_title=title, + source_url=source_url, + ) + payload = { "title": title, "content": content, "status": settings.wordpress_default_status, } + if featured_media_id: + payload["featured_media"] = featured_media_id wp_post_id = article.get("wp_post_id") if wp_post_id: diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py new file mode 100644 index 0000000..f12c6e1 --- /dev/null +++ b/backend/tests/test_wordpress.py @@ -0,0 +1,63 @@ +import os +import unittest +from unittest.mock import patch + +from backend.app import config as config_module +from backend.app.wordpress import publish_article_draft + + +class TestWordpressPublish(unittest.TestCase): + def setUp(self) -> None: + os.environ["WORDPRESS_BASE_URL"] = "https://example.org" + os.environ["WORDPRESS_USERNAME"] = "wp-user" + os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" + config_module.get_settings.cache_clear() + + def tearDown(self) -> None: + for key in ("WORDPRESS_BASE_URL", "WORDPRESS_USERNAME", "WORDPRESS_APP_PASSWORD"): + os.environ.pop(key, None) + config_module.get_settings.cache_clear() + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_sets_featured_media_when_selected_image_exists(self, mock_wp_request, mock_upload_media) -> None: + mock_upload_media.return_value = 456 + mock_wp_request.return_value = {"id": 321, "link": "https://example.org/?p=321"} + + article = { + "title": "Testartikel", + "content_raw": "Inhalt", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": '{"image_review":{"selected_url":"https://example.com/image.jpg"}}', + } + 
post_id, post_url = publish_article_draft(article) + + self.assertEqual(post_id, 321) + self.assertIn("?p=321", post_url or "") + self.assertTrue(mock_upload_media.called) + payload = mock_wp_request.call_args.kwargs["payload"] + self.assertEqual(payload.get("featured_media"), 456) + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_without_selected_image_has_no_featured_media(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 654, "link": "https://example.org/?p=654"} + + article = { + "title": "Testartikel", + "content_raw": "Inhalt", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + post_id, _ = publish_article_draft(article) + + self.assertEqual(post_id, 654) + self.assertFalse(mock_upload_media.called) + payload = mock_wp_request.call_args.kwargs["payload"] + self.assertNotIn("featured_media", payload) + + +if __name__ == "__main__": + unittest.main() From 24d8e5ad0fdbd76608cdb1920e2c3149288655df Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:09:00 +0100 Subject: [PATCH 15/54] feat(wordpress): improve post html structure and excerpt generation --- backend/app/wordpress.py | 86 ++++++++++++++++++++++++++++----- backend/tests/test_wordpress.py | 4 ++ 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 756a346..fbf4443 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -1,9 +1,11 @@ from __future__ import annotations import base64 +from html import escape import json import mimetypes from pathlib import Path +import re from typing import Any from urllib.parse import urlparse from urllib.request import Request, urlopen @@ -130,6 +132,75 @@ def _upload_featured_media( return media_id +def _as_paragraph_html(text: str) -> str: + chunks = [chunk.strip() for chunk in 
re.split(r"\n{2,}", text.strip()) if chunk.strip()] + if not chunks: + return "" + lines = [] + for chunk in chunks: + compact = re.sub(r"\s*\n\s*", " ", chunk) + lines.append(f"

{escape(compact)}

") + return "\n".join(lines) + + +def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: + source_url = article.get("source_url") or "" + canonical_url = article.get("canonical_url") or source_url + summary = (article.get("summary") or "").strip() + body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() + if not body_text: + body_text = summary + + # Keep existing HTML if already present, otherwise wrap plain text into paragraphs. + has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) + body_html = body_text if has_html else _as_paragraph_html(body_text) + if not body_html: + body_html = "

Kein Inhalt verfügbar.

" + + author = (article.get("author") or "").strip() + published_at = (article.get("published_at") or "").strip() + source_name = (article.get("source_name_snapshot") or "").strip() + license_name = (article.get("source_license_name_snapshot") or "").strip() + terms_url = (article.get("source_terms_url_snapshot") or "").strip() + press_contact = (article.get("press_contact") or "").strip() + + lead_html = f"

{escape(summary)}

\n" if summary else "" + + facts: list[str] = [] + if author: + facts.append(f"
  • Autor: {escape(author)}
  • ") + if published_at: + facts.append(f"
  • Veröffentlicht (Quelle): {escape(published_at)}
  • ") + if source_name: + facts.append(f"
  • Quelle: {escape(source_name)}
  • ") + if license_name: + facts.append(f"
  • Lizenz: {escape(license_name)}
  • ") + if terms_url: + facts.append(f"
  • Lizenzhinweise: {escape(terms_url)}
  • ") + + facts_html = ( + "

    Artikeldetails

    \n
      \n" + "\n".join(facts) + "\n
    \n" + if facts + else "" + ) + press_contact_html = ( + f"

    Pressekontakt

    \n

    {escape(press_contact)}

    \n" if press_contact else "" + ) + attribution_html = ( + "
    \n
    \n" + "

    Quelle

    \n" + f"

    Originalartikel: {escape(source_url)}

    \n" + ) + if canonical_url and canonical_url != source_url: + attribution_html += f"

    Canonical: {escape(canonical_url)}

    \n" + attribution_html += "
    " + + content = f"{lead_html}{body_html}\n\n{facts_html}{press_contact_html}{attribution_html}".strip() + excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() + excerpt = excerpt_source[:220] if excerpt_source else None + return content, excerpt + + def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: settings = get_settings() if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: @@ -137,18 +208,9 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) - source_url = article.get("source_url") or "" - canonical_url = article.get("canonical_url") or source_url title = (article.get("title") or "Ohne Titel").strip() - body = (article.get("content_rewritten") or article.get("content_raw") or "").strip() - if not body: - body = article.get("summary") or "" - - footer = "\n\n
    \n

    Quelle: " - footer += f"{source_url}

    " - if canonical_url and canonical_url != source_url: - footer += f"\n

    Canonical: {canonical_url}

    " - content = f"{body}{footer}" + content, excerpt = _build_post_content(article) + source_url = article.get("source_url") or "" featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) @@ -166,6 +228,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: "content": content, "status": settings.wordpress_default_status, } + if excerpt: + payload["excerpt"] = excerpt if featured_media_id: payload["featured_media"] = featured_media_id diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index f12c6e1..2c9094e 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -38,6 +38,9 @@ class TestWordpressPublish(unittest.TestCase): self.assertTrue(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertEqual(payload.get("featured_media"), 456) + self.assertIn("

    Quelle

    ", payload.get("content", "")) + self.assertIn("Originalartikel", payload.get("content", "")) + self.assertEqual(payload.get("excerpt"), "Inhalt") @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -57,6 +60,7 @@ class TestWordpressPublish(unittest.TestCase): self.assertFalse(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertNotIn("featured_media", payload) + self.assertIn("

    Inhalt

    ", payload.get("content", "")) if __name__ == "__main__": From 8d7375c99fe8c20b2e08a2e5379570f507bea3fe Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:11:43 +0100 Subject: [PATCH 16/54] feat(ui): classify publisher errors with actionable hints --- backend/app/admin_ui.py | 21 ++++++++++++++++ backend/static/admin.css | 34 ++++++++++++++++++++++++++ backend/templates/admin_dashboard.html | 12 +++++++-- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index fba1b91..bde281d 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -169,6 +169,23 @@ def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]: return len(reasons) == 0, reasons +def _classify_publish_error(error_message: str | None) -> tuple[str, str]: + text = (error_message or "").lower() + if not text.strip(): + return "ok", "-" + if "rechtsfreigabe fehlt" in text or "hauptbild nicht gesetzt" in text or "status ist nicht" in text: + return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Rechtsfreigabe/Hauptbild)." + if "401" in text or "403" in text or "authorization" in text or "forbidden" in text or "unauthorized" in text: + return "auth", "WordPress Nutzer/App-Passwort prüfen." + if "404" in text and ("media" in text or "posts" in text or "wp-json" in text): + return "api", "WordPress REST-Endpunkt prüfen (`/wp-json/wp/v2`)." + if "timed out" in text or "timeout" in text or "nodename nor servname provided" in text or "name or service not known" in text: + return "dns", "DNS/Netzwerk zur WordPress-Domain prüfen." + if "media-upload fehlgeschlagen" in text or "liefert kein bild" in text or "featured_media" in text: + return "media", "Bild-URL/Format prüfen oder anderes Hauptbild auswählen." + return "unknown", "Fehlerdetails prüfen und bei Bedarf Job erneut starten." 
+ + def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: meta = article.get("meta", {}) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} @@ -289,6 +306,10 @@ def admin_dashboard(request: Request): feeds = list_feeds() runs = list_runs(limit=30) publish_jobs = list_publish_jobs(limit=30) + for job in publish_jobs: + category, hint = _classify_publish_error(job.get("error_message")) + job["error_category"] = category + job["error_hint"] = hint status_filter = request.query_params.get("status_filter") if status_filter in {"new", "rewrite", "review", "approved", "published", "error"}: articles = list_articles(limit=100, status_filter=status_filter) diff --git a/backend/static/admin.css b/backend/static/admin.css index 16d55be..053263a 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -131,6 +131,40 @@ button.secondary { color: #991b1b; } +.badge.errcat { + margin-bottom: 4px; +} + +.badge.errcat-policy { + background: #fee2e2; + color: #991b1b; +} + +.badge.errcat-auth { + background: #ffedd5; + color: #9a3412; +} + +.badge.errcat-dns { + background: #dbeafe; + color: #1e40af; +} + +.badge.errcat-media { + background: #fef9c3; + color: #854d0e; +} + +.badge.errcat-api { + background: #ede9fe; + color: #5b21b6; +} + +.badge.errcat-unknown { + background: #e2e8f0; + color: #334155; +} + .alert { margin-bottom: 12px; padding: 10px; diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index fe811a5..f318355 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -256,7 +256,7 @@

    Publish Jobs

    - + {% for j in publish_jobs %} @@ -274,7 +274,15 @@ - {% endif %} - + + {% endfor %} From 35ccceb26058cfbf3268568e9fdc3fdc6f0fb5ee Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:43:22 +0100 Subject: [PATCH 17/54] feat(workflow): simplify article flow and add automated rewrite step --- backend/app/admin_ui.py | 98 ++++++++++++----- backend/app/config.py | 2 + backend/app/main.py | 113 ++++++++++++-------- backend/app/publisher.py | 4 +- backend/app/rewrite.py | 83 ++++++++++++++ backend/app/wordpress.py | 19 +++- backend/app/workflow.py | 39 +++++++ backend/templates/admin_article_detail.html | 9 +- backend/templates/admin_dashboard.html | 16 ++- backend/tests/test_article_workflow.py | 38 +++---- backend/tests/test_wordpress.py | 18 ++++ 11 files changed, 332 insertions(+), 107 deletions(-) create mode 100644 backend/app/rewrite.py create mode 100644 backend/app/workflow.py diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index bde281d..e9bfae4 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -17,6 +17,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance +from .rewrite import rewrite_article_text from .repositories import ( FeedCreate, SourceCreate, @@ -31,19 +32,21 @@ from .repositories import ( list_sources, set_article_image_decision, set_article_legal_review, + upsert_article, update_article_status, + ArticleUpsert, ) +from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status settings = get_settings() router = APIRouter(tags=["admin-ui"]) templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates")) ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { - "new": ("review", "rewrite", "error"), - "rewrite": ("review", "error"), - "review": ("approved", "rewrite", "error"), - 
"approved": ("published", "error"), - "published": ("error",), - "error": ("review", "rewrite"), + "new": ("rewrite", "close"), + "rewrite": ("publish", "close"), + "publish": ("published", "close"), + "published": ("close",), + "close": ("rewrite",), } IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0" @@ -158,10 +161,8 @@ def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[di def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]: reasons: list[str] = [] - if article.get("status") not in {"approved", "published"}: - reasons.append("Status ist nicht 'approved'") - if int(article.get("legal_checked", 0)) != 1: - reasons.append("Rechtsfreigabe fehlt") + if internal_to_ui_status(article.get("status")) not in {"publish", "published"}: + reasons.append("Status ist nicht 'publish'") image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None if not selected_image: @@ -311,8 +312,9 @@ def admin_dashboard(request: Request): job["error_category"] = category job["error_hint"] = hint status_filter = request.query_params.get("status_filter") - if status_filter in {"new", "rewrite", "review", "approved", "published", "error"}: - articles = list_articles(limit=100, status_filter=status_filter) + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + if status_filter in set(UI_STATUSES): + articles = list_articles(limit=100, status_filter=internal_filter) else: status_filter = "" articles = list_articles(limit=100) @@ -336,6 +338,7 @@ def admin_dashboard(request: Request): article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) + article["status_ui"] = 
internal_to_ui_status(article.get("status")) return templates.TemplateResponse( request, @@ -350,7 +353,7 @@ def admin_dashboard(request: Request): "runs": runs, "publish_jobs": publish_jobs, "articles": articles, - "status_options": ["new", "rewrite", "review", "approved", "published", "error"], + "status_options": list(UI_STATUSES), "allowed_transitions": ALLOWED_TRANSITIONS, "status_filter": status_filter, "flash_msg": request.query_params.get("msg", ""), @@ -388,6 +391,7 @@ def admin_article_detail(request: Request, article_id: int): ) article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) + article["status_ui"] = internal_to_ui_status(article.get("status")) feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None checklist = _legal_checklist(article, feed) @@ -401,7 +405,7 @@ def admin_article_detail(request: Request, article_id: int): "article": article, "feed": feed, "checklist": checklist, - "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status"), ()), + "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status_ui"), ()), "flash_msg": request.query_params.get("msg", ""), "flash_type": request.query_params.get("type", "success"), }, @@ -565,12 +569,56 @@ def admin_review_article(request: Request, article_id: int, decision: str = Form if not user: return RedirectResponse(url="/admin/login", status_code=303) + return _dashboard_redirect(msg="Review-Aktion wurde durch Rewrite ersetzt", msg_type="error") + + +@router.post("/admin/articles/{article_id}/rewrite-run") +def admin_rewrite_run(request: Request, article_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) article = get_article_by_id(article_id) - if article and article.get("status") == "review" and decision in {"approve", "reject"}: - target = "approved" if decision == "approve" else "rewrite" - 
update_article_status(article_id, target, actor=user, note=note or None, decision=decision) - return _dashboard_redirect(msg=f"Artikel #{article_id}: {decision}") - return _dashboard_redirect(msg=f"Review-Aktion ungueltig fuer Artikel #{article_id}", msg_type="error") + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + if internal_to_ui_status(article.get("status")) not in {"new", "rewrite"}: + return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error") + try: + rewritten = rewrite_article_text(article) + except Exception as exc: + return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") + + upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title"), + source_url=article.get("source_url"), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + publish_attempts=int(article.get("publish_attempts", 0)), + publish_last_error=article.get("publish_last_error"), + published_to_wp_at=article.get("published_to_wp_at"), + word_count=len(rewritten.split()), + status="approved", + 
meta_json=article.get("meta_json"), + ) + ) + return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") @router.post("/admin/articles/{article_id}/transition") @@ -581,10 +629,10 @@ def admin_transition_article(request: Request, article_id: int, target_status: s article = get_article_by_id(article_id) if article: - current = article.get("status") - if target_status in ALLOWED_TRANSITIONS.get(current, ()): - if target_status == "published" and int(article.get("legal_checked", 0)) != 1: - return _dashboard_redirect(msg=f"Publish blockiert fuer Artikel #{article_id}: Rechtsfreigabe fehlt", msg_type="error") - update_article_status(article_id, target_status, actor=user, note=note or None) - return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}") + current_ui = internal_to_ui_status(article.get("status")) + target_internal = ui_to_internal_status(target_status) + target_ui = internal_to_ui_status(target_internal) + if target_ui in ALLOWED_TRANSITIONS.get(current_ui, ()): + update_article_status(article_id, target_internal, actor=user, note=note or None) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}") return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") diff --git a/backend/app/config.py b/backend/app/config.py index fc52ec3..43629ba 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -30,6 +30,8 @@ class Settings(BaseSettings): wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME")) wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD")) wordpress_default_status: str = "draft" + openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY")) + openai_model: str = "gpt-4o-mini" @lru_cache(maxsize=1) diff --git a/backend/app/main.py 
b/backend/app/main.py index c0a0143..4dcee28 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -18,6 +18,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy, is_source_allowed from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance +from .rewrite import rewrite_article_text from .repositories import ( ArticleUpsert, FeedCreate, @@ -40,6 +41,7 @@ from .repositories import ( update_article_status, upsert_article as repo_upsert_article, ) +from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status settings = get_settings() @@ -119,7 +121,7 @@ class ArticleUpsertRequest(BaseModel): publish_last_error: str | None = None published_to_wp_at: str | None = None word_count: int = 0 - status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$") + status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$") meta_json: str | None = None @@ -128,7 +130,7 @@ class IngestionRunRequest(BaseModel): class ArticleTransitionRequest(BaseModel): - target_status: str = Field(pattern="^(new|rewrite|review|approved|published|error)$") + target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$") note: str | None = None @@ -152,12 +154,11 @@ class PublisherRunRequest(BaseModel): ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = { - "new": {"review", "rewrite", "error"}, - "rewrite": {"review", "error"}, - "review": {"approved", "rewrite", "error"}, + "new": {"rewrite", "error"}, + "rewrite": {"approved", "error"}, "approved": {"published", "error"}, "published": {"error"}, - "error": {"review", "rewrite"}, + "error": {"rewrite"}, } @@ -340,7 +341,11 @@ def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depen @app.get("/api/articles") def api_list_articles(limit: int = 100, status_filter: str | None = 
None, username: str = Depends(require_auth)) -> dict: - return {"ok": True, "items": repo_list_articles(limit=limit, status_filter=status_filter), "requested_by": username} + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + items = repo_list_articles(limit=limit, status_filter=internal_filter) + for item in items: + item["status_ui"] = internal_to_ui_status(item.get("status")) + return {"ok": True, "items": items, "requested_by": username} @app.get("/api/articles/export") @@ -349,7 +354,8 @@ def api_export_articles( status_filter: str | None = None, username: str = Depends(require_auth), ): - articles = repo_list_articles(limit=500, status_filter=status_filter) + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + articles = repo_list_articles(limit=500, status_filter=internal_filter) rows = [] for article in articles: meta: dict = {} @@ -436,6 +442,7 @@ def api_get_article(article_id: int, username: str = Depends(require_auth)) -> d article = get_article_by_id(article_id) if not article: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + article["status_ui"] = internal_to_ui_status(article.get("status")) return {"ok": True, "item": article, "requested_by": username} @@ -468,7 +475,7 @@ def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(re publish_last_error=payload.publish_last_error, published_to_wp_at=payload.published_to_wp_at, word_count=payload.word_count, - status=payload.status, + status=ui_to_internal_status(payload.status), meta_json=payload.meta_json, ) ) @@ -482,22 +489,64 @@ def api_article_transition(article_id: int, payload: ArticleTransitionRequest, u raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") current_status = article.get("status") - allowed_targets = ALLOWED_ARTICLE_TRANSITIONS.get(current_status, set()) - if payload.target_status not in allowed_targets: + 
current_ui = internal_to_ui_status(current_status) + target_internal = ui_to_internal_status(payload.target_status) + target_ui = internal_to_ui_status(target_internal) + allowed_targets = ALLOWED_UI_TRANSITIONS.get(current_ui, set()) + if target_ui not in allowed_targets: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Ungueltiger Statuswechsel: {current_status} -> {payload.target_status}", - ) - if payload.target_status == "published" and int(article.get("legal_checked", 0)) != 1: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="Publish gesperrt: Rechtscheck wurde noch nicht freigegeben", + detail=f"Ungueltiger Statuswechsel: {current_ui} -> {target_ui}", ) - updated = update_article_status(article_id, payload.target_status, actor=username, note=payload.note) + updated = update_article_status(article_id, target_internal, actor=username, note=payload.note) if not updated: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - return {"ok": True, "id": article_id, "from_status": current_status, "to_status": payload.target_status} + return {"ok": True, "id": article_id, "from_status": current_ui, "to_status": target_ui} + + +@app.post("/api/articles/{article_id}/rewrite-run") +def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'") + + rewritten = rewrite_article_text(article) + # upsert via status update + existing fields by lightweight path: + repo_upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + 
source_hash=article.get("source_hash"), + title=article.get("title"), + source_url=article.get("source_url"), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + publish_attempts=int(article.get("publish_attempts", 0)), + publish_last_error=article.get("publish_last_error"), + published_to_wp_at=article.get("published_to_wp_at"), + word_count=len(rewritten.split()), + status="approved", + meta_json=article.get("meta_json"), + ) + ) + return {"ok": True, "id": article_id, "status": "publish"} @app.post("/api/articles/{article_id}/legal-review") @@ -547,31 +596,7 @@ def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(requ @app.post("/api/articles/{article_id}/review") def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - if article.get("status") != "review": - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Review nur fuer Status 'review' erlaubt (aktuell: {article.get('status')})", - ) - - target_status = "approved" if payload.decision == "approve" else "rewrite" - updated = update_article_status( 
- article_id, - target_status, - actor=username, - note=payload.note, - decision=payload.decision, - ) - if not updated: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - return { - "ok": True, - "id": article_id, - "decision": payload.decision, - "to_status": target_status, - } + raise HTTPException(status_code=status.HTTP_410_GONE, detail="Review-Endpoint ersetzt durch Rewrite-Workflow") @app.post("/api/ingestion/run") diff --git a/backend/app/publisher.py b/backend/app/publisher.py index 06cc8f2..e27bd1b 100644 --- a/backend/app/publisher.py +++ b/backend/app/publisher.py @@ -28,9 +28,7 @@ def enqueue_publish(article_id: int, max_attempts: int = 3) -> int: def _can_publish(article: dict) -> tuple[bool, str | None]: if article.get("status") not in {"approved", "published"}: - return False, "Artikelstatus muss 'approved' sein" - if int(article.get("legal_checked", 0)) != 1: - return False, "Rechtsfreigabe fehlt" + return False, "Artikelstatus muss 'publish' sein" if not selected_image_exists(article): return False, "Hauptbild nicht gesetzt" return True, None diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py new file mode 100644 index 0000000..8c313ad --- /dev/null +++ b/backend/app/rewrite.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import json +import re +from typing import Any +from urllib.request import Request, urlopen + +from .config import get_settings + + +def _sanitize_source_text(text: str) -> str: + raw = (text or "").strip() + if not raw: + return "" + + lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] + if len(lines) > 3: + lines = lines[3:] + + joined = "\n".join(lines) + # Remove press contact block at end from "Pressekontakt" onward. 
+ joined = re.sub( + r"\n?\s*Pressekontakt[\s\S]*$", + "", + joined, + flags=re.IGNORECASE, + ).strip() + return joined + + +def rewrite_article_text(article: dict[str, Any]) -> str: + settings = get_settings() + api_key = settings.openai_api_key + if not api_key: + raise RuntimeError("OPENAI_API_KEY fehlt") + + source_text = _sanitize_source_text(article.get("content_raw") or "") + if not source_text: + source_text = (article.get("summary") or "").strip() + if not source_text: + raise RuntimeError("Kein Quelltext für Rewrite verfügbar") + + title = (article.get("title") or "").strip() + prompt = ( + "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " + "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " + "ohne Pressekontakt, ohne Quellenblock. " + "Nutze klare Absätze und Zwischenüberschriften in HTML (

    ,

    ,

    • falls passend). " + "Inhaltlich korrekt bleiben, nichts erfinden.\n\n" + f"Titel: {title}\n\n" + f"Originaltext:\n{source_text}" + ) + + payload = { + "model": settings.openai_model, + "temperature": 0.4, + "messages": [ + {"role": "system", "content": "Du bist ein deutscher News-Redakteur."}, + {"role": "user", "content": prompt}, + ], + } + req = Request( + url="https://api.openai.com/v1/chat/completions", + method="POST", + data=json.dumps(payload).encode("utf-8"), + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + with urlopen(req, timeout=60) as resp: + raw = resp.read().decode("utf-8", errors="replace") + data = json.loads(raw) + choices = data.get("choices") + if not isinstance(choices, list) or not choices: + raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}") + message = choices[0].get("message", {}) + content = message.get("content") + if not isinstance(content, str) or not content.strip(): + raise RuntimeError("OpenAI lieferte keinen Rewrite-Text") + return content.strip() + diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index fbf4443..f450efe 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -143,11 +143,24 @@ def _as_paragraph_html(text: str) -> str: return "\n".join(lines) +def _sanitize_publish_text(text: str) -> str: + raw = (text or "").strip() + if not raw: + return "" + lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] + if len(lines) > 3: + lines = lines[3:] + merged = "\n".join(lines) + merged = re.sub(r"\n?\s*Pressekontakt[\s\S]*$", "", merged, flags=re.IGNORECASE).strip() + return merged + + def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: source_url = article.get("source_url") or "" canonical_url = article.get("canonical_url") or source_url summary = (article.get("summary") or "").strip() body_text = (article.get("content_rewritten") or article.get("content_raw") or 
"").strip() + body_text = _sanitize_publish_text(body_text) if not body_text: body_text = summary @@ -162,7 +175,6 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: source_name = (article.get("source_name_snapshot") or "").strip() license_name = (article.get("source_license_name_snapshot") or "").strip() terms_url = (article.get("source_terms_url_snapshot") or "").strip() - press_contact = (article.get("press_contact") or "").strip() lead_html = f"

      {escape(summary)}

      \n" if summary else "" @@ -183,9 +195,6 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: if facts else "" ) - press_contact_html = ( - f"

      Pressekontakt

      \n

      {escape(press_contact)}

      \n" if press_contact else "" - ) attribution_html = ( "
      \n
      \n" "

      Quelle

      \n" @@ -195,7 +204,7 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: attribution_html += f"

      Canonical: {escape(canonical_url)}

      \n" attribution_html += "
      " - content = f"{lead_html}{body_html}\n\n{facts_html}{press_contact_html}{attribution_html}".strip() + content = f"{lead_html}{body_html}\n\n{facts_html}{attribution_html}".strip() excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() excerpt = excerpt_source[:220] if excerpt_source else None return content, excerpt diff --git a/backend/app/workflow.py b/backend/app/workflow.py new file mode 100644 index 0000000..6ef989d --- /dev/null +++ b/backend/app/workflow.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +UI_STATUSES = ("new", "rewrite", "publish", "published", "close") + + +def internal_to_ui_status(status: str | None) -> str: + value = (status or "").strip() + if value == "approved": + return "publish" + if value == "error": + return "close" + if value == "review": + return "rewrite" + if value in {"new", "rewrite", "published"}: + return value + return value or "new" + + +def ui_to_internal_status(status: str | None) -> str: + value = (status or "").strip() + if value == "publish": + return "approved" + if value == "close": + return "error" + if value in {"new", "rewrite", "published"}: + return value + if value in {"approved", "error", "review"}: + return value + return value + + +ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { + "new": {"rewrite", "close"}, + "rewrite": {"publish", "close"}, + "publish": {"published", "close"}, + "published": {"close"}, + "close": {"rewrite"}, +} + diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 6f06d36..bf88d2e 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -30,7 +30,7 @@

      {{ article.title }}

      -
      Status{{ article.status }}
      +
      Status{{ article.status_ui }}
      Artikel-Datum{{ article.published_at or "-" }}
      Alter{{ article.days_old if article.days_old is not none else "-" }} Tage
      Relevanz{{ article.relevance }}
      @@ -187,8 +187,10 @@

      Status ändern

      - {% if not article.legal_checked %} -

      Hinweis: `published` ist erst nach manueller Rechtsfreigabe erlaubt.

      + {% if article.status_ui in ["new", "rewrite"] %} +
      + + {% endif %}
      diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index f318355..1ed5c6e 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -144,7 +144,7 @@

    IDArtikelStatusAttemptsWP PostFehler
    IDArtikelStatusAttemptsWP PostFehlerHinweis
    {{ j.error_message or "-" }} + {% if j.error_message %} + {{ j.error_category }} +
    {{ j.error_message }}
    + {% else %} + - + {% endif %} +
    {{ j.error_hint or "-" }}
    - + {% for a in articles %} @@ -160,7 +160,7 @@
    Canonical öffnen {% endif %} - +
    IDArtikelStatusDetailsReviewTransition
    IDArtikelStatusDetailsRewriteTransition
    {{ a.status }}{{ a.status_ui }}
    Legal: {{ "OK" if a.legal_checked else "offen" }}
    Publish: {{ "bereit" if a.publish_ready else "blockiert" }}
    @@ -202,11 +202,9 @@ {% endif %}
    - {% if a.status == "review" %} -
    - - - + {% if a.status_ui in ["new", "rewrite"] %} + +
    {% else %} - @@ -215,11 +213,11 @@
    - {% if allowed_transitions.get(a.status, []) %} + {% if allowed_transitions.get(a.status_ui, []) %} {% else %} keine Aktion diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py index ce11214..29dd212 100644 --- a/backend/tests/test_article_workflow.py +++ b/backend/tests/test_article_workflow.py @@ -4,6 +4,7 @@ import unittest from pathlib import Path from fastapi.testclient import TestClient +from unittest.mock import patch from backend.app import config as config_module from backend.app.db import init_db @@ -66,39 +67,40 @@ class TestArticleWorkflow(unittest.TestCase): def test_valid_transition_chain(self) -> None: article_id = self._create_article() - t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "review"}) + t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) self.assertEqual(t1.status_code, 200) - r1 = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve", "note": "ok"}) - self.assertEqual(r1.status_code, 200) - self.assertEqual(r1.json()["to_status"], "approved") - - blocked_publish = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) - self.assertEqual(blocked_publish.status_code, 400) - - legal = self.client.post( - f"/api/articles/{article_id}/legal-review", - json={"approved": True, "note": "Rechte geprueft"}, - ) - self.assertEqual(legal.status_code, 200) - - t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "publish"}) self.assertEqual(t2.status_code, 200) + t3 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(t3.status_code, 200) + final = self.client.get(f"/api/articles/{article_id}") self.assertEqual(final.status_code, 200) 
self.assertEqual(final.json()["item"]["status"], "published") - self.assertEqual(final.json()["item"]["legal_checked"], 1) + self.assertEqual(final.json()["item"]["status_ui"], "published") def test_invalid_transition_rejected(self) -> None: article_id = self._create_article() bad = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) self.assertEqual(bad.status_code, 400) - def test_review_only_allowed_in_review_status(self) -> None: + def test_legacy_review_endpoint_is_gone(self) -> None: article_id = self._create_article() bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"}) - self.assertEqual(bad.status_code, 400) + self.assertEqual(bad.status_code, 410) + + @patch("backend.app.main.rewrite_article_text") + def test_rewrite_run_sets_publish_status(self, mock_rewrite) -> None: + mock_rewrite.return_value = "

    Neu

    Umschreibung

    " + article_id = self._create_article() + self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) + r = self.client.post(f"/api/articles/{article_id}/rewrite-run") + self.assertEqual(r.status_code, 200) + self.assertEqual(r.json()["status"], "publish") + final = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(final.json()["item"]["status_ui"], "publish") if __name__ == "__main__": diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index 2c9094e..833b159 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -62,6 +62,24 @@ class TestWordpressPublish(unittest.TestCase): self.assertNotIn("featured_media", payload) self.assertIn("

    Inhalt

    ", payload.get("content", "")) + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_strips_feed_header_and_press_contact(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 100, "link": "https://example.org/?p=100"} + article = { + "title": "Header Test", + "content_raw": "21.02.2026 10:00\nFirma GmbH\n(ots)\nDas ist der eigentliche Text.\nPressekontakt: Test Person", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + publish_article_draft(article) + payload = mock_wp_request.call_args.kwargs["payload"] + content = payload.get("content", "") + self.assertNotIn("Firma GmbH", content) + self.assertNotIn("Pressekontakt", content) + self.assertIn("eigentliche Text", content) + if __name__ == "__main__": unittest.main() From 50f737f434d23f3352fa040ae3a619c2ae025f6b Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:58:40 +0100 Subject: [PATCH 18/54] feat(admin): add connectivity diagnostics page for domains and endpoints --- backend/app/admin_ui.py | 133 ++++++++++++++++++++++ backend/templates/admin_connectivity.html | 84 ++++++++++++++ backend/templates/admin_dashboard.html | 9 +- backend/tests/test_admin_ui.py | 47 ++++++++ 4 files changed, 270 insertions(+), 3 deletions(-) create mode 100644 backend/templates/admin_connectivity.html diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index e9bfae4..a725ba8 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -3,6 +3,9 @@ from __future__ import annotations import json from pathlib import Path import re +import socket +import ssl +import time from urllib.parse import urlparse from urllib.parse import urlencode from urllib.request import Request as UrlRequest, urlopen @@ -254,6 +257,113 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: return checks +def 
_build_connectivity_targets() -> list[dict[str, str]]: + targets: list[dict[str, str]] = [] + seen: set[tuple[str, str]] = set() + + def add_target(label: str, kind: str, value: str) -> None: + normalized = (value or "").strip() + if not normalized: + return + key = (kind, normalized.lower()) + if key in seen: + return + seen.add(key) + targets.append({"label": label, "kind": kind, "value": normalized}) + + add_target("OpenAI API", "host", "api.openai.com") + if settings.wordpress_base_url: + parsed = urlparse(settings.wordpress_base_url) + if parsed.hostname: + add_target("WordPress Host", "host", parsed.hostname) + wp_api_url = f"{settings.wordpress_base_url.rstrip('/')}/wp-json/wp/v2" + add_target("WordPress REST", "url", wp_api_url) + + for feed in list_feeds(): + name = (feed.get("name") or "").strip() or f"Feed #{feed.get('id')}" + feed_url = str(feed.get("url") or "").strip() + if not feed_url: + continue + parsed = urlparse(feed_url) + if parsed.hostname: + add_target(f"{name} (Feed)", "host", parsed.hostname) + add_target(f"{name} (Feed URL)", "url", feed_url) + + return targets + + +def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]: + kind = target.get("kind", "") + value = str(target.get("value") or "") + row: dict[str, object] = { + "label": target.get("label") or "-", + "kind": kind, + "target": value, + "dns_ok": False, + "dns_info": "-", + "tcp_ok": False, + "tcp_info": "-", + "http_ok": False, + "http_info": "-", + "duration_ms": 0, + "ok": False, + } + started = time.perf_counter() + try: + hostname = value if kind == "host" else (urlparse(value).hostname or "") + port = 443 + if kind == "url": + parsed = urlparse(value) + if parsed.scheme not in {"http", "https"}: + row["http_info"] = f"unsupported scheme: {parsed.scheme or '-'}" + return row + port = 443 if parsed.scheme == "https" else 80 + if not hostname: + row["dns_info"] = "host fehlt" + return row + + try: + addr_info = socket.getaddrinfo(hostname, None, 
proto=socket.IPPROTO_TCP) + ips = sorted({entry[4][0] for entry in addr_info if entry and len(entry) > 4 and entry[4]}) + row["dns_ok"] = True + row["dns_info"] = ", ".join(ips[:3]) if ips else "resolved" + except Exception as exc: + row["dns_info"] = str(exc) + return row + + try: + socket.create_connection((hostname, port), timeout=4).close() + row["tcp_ok"] = True + row["tcp_info"] = f"port {port} erreichbar" + except Exception as exc: + row["tcp_info"] = str(exc) + return row + + if kind == "host": + row["http_ok"] = True + row["http_info"] = "n/a (host-only)" + row["ok"] = True + return row + + try: + req = UrlRequest( + url=value, + headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Accept": "*/*"}, + ) + with urlopen(req, timeout=6, context=ssl.create_default_context()) as resp: + code = getattr(resp, "status", None) or resp.getcode() + row["http_ok"] = True + row["http_info"] = f"HTTP {code}" + except Exception as exc: + row["http_info"] = str(exc) + return row + + row["ok"] = bool(row["dns_ok"] and row["tcp_ok"] and row["http_ok"]) + return row + finally: + row["duration_ms"] = int((time.perf_counter() - started) * 1000) + + @router.get("/admin", response_class=HTMLResponse) def admin_index(request: Request): user = _admin_user(request) @@ -362,6 +472,29 @@ def admin_dashboard(request: Request): ) +@router.get("/admin/connectivity", response_class=HTMLResponse) +def admin_connectivity(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + checks = [_run_connectivity_check(target) for target in _build_connectivity_targets()] + ok_count = len([c for c in checks if c.get("ok")]) + error_count = len(checks) - ok_count + return templates.TemplateResponse( + request, + "admin_connectivity.html", + { + "request": request, + "title": "Connectivity Check", + "user": user, + "checks": checks, + "ok_count": ok_count, + "error_count": error_count, + }, + ) + + 
@router.get("/admin/articles/{article_id}", response_class=HTMLResponse) def admin_article_detail(request: Request, article_id: int): user = _admin_user(request) diff --git a/backend/templates/admin_connectivity.html b/backend/templates/admin_connectivity.html new file mode 100644 index 0000000..5fc0392 --- /dev/null +++ b/backend/templates/admin_connectivity.html @@ -0,0 +1,84 @@ + + + + + + {{ title }} + + + +
    +
    +

    Connectivity Check

    +

    Angemeldet als {{ user }}

    +
    +
    + Zurück + + + +
    +
    + +
    +
    +
    +
    Checks
    +
    {{ checks|length }}
    +
    +
    +
    OK
    +
    {{ ok_count }}
    +
    +
    +
    Fehler
    +
    {{ error_count }}
    +
    +
    +
    Zeitpunkt
    +
    Live
    +
    +
    + +
    +

    Ziele

    +

    Geprüft werden DNS-Auflösung, TCP-Erreichbarkeit und bei URLs ein HTTP-Request.

    +
    + +
    +
    + +
    +

    Ergebnis

    + + + + + + {% for c in checks %} + + + + + + + + + + + {% endfor %} + +
    StatusNameTypZielDNSTCPHTTPDauer
    {% if c.ok %}OK{% else %}Fehler{% endif %}{{ c.label }}{{ c.kind }}{{ c.target }} + {% if c.dns_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.dns_info }}
    +
    + {% if c.tcp_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.tcp_info }}
    +
    + {% if c.http_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.http_info }}
    +
    {{ c.duration_ms }} ms
    +
    +
    + + diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 1ed5c6e..e43a9ae 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -12,9 +12,12 @@

    rss-news Admin Dashboard

    Angemeldet als {{ user }}

    -
    - -
    +
    + Connectivity Check +
    + +
    +
    diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index af47046..557121a 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -176,6 +176,53 @@ class TestAdminUi(unittest.TestCase): self.assertEqual(res.status_code, 200) self.assertIn("image/jpeg", res.headers.get("content-type", "")) + @patch("backend.app.admin_ui._run_connectivity_check") + @patch("backend.app.admin_ui._build_connectivity_targets") + def test_connectivity_page_renders(self, mock_targets, mock_check) -> None: + mock_targets.return_value = [ + {"label": "OpenAI API", "kind": "host", "value": "api.openai.com"}, + {"label": "WordPress REST", "kind": "url", "value": "https://example.org/wp-json/wp/v2"}, + ] + mock_check.side_effect = [ + { + "label": "OpenAI API", + "kind": "host", + "target": "api.openai.com", + "dns_ok": True, + "dns_info": "1.2.3.4", + "tcp_ok": True, + "tcp_info": "port 443 erreichbar", + "http_ok": True, + "http_info": "n/a (host-only)", + "duration_ms": 12, + "ok": True, + }, + { + "label": "WordPress REST", + "kind": "url", + "target": "https://example.org/wp-json/wp/v2", + "dns_ok": False, + "dns_info": "Name or service not known", + "tcp_ok": False, + "tcp_info": "-", + "http_ok": False, + "http_info": "-", + "duration_ms": 10, + "ok": False, + }, + ] + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get("/admin/connectivity", follow_redirects=True) + self.assertEqual(res.status_code, 200) + self.assertIn("Connectivity Check", res.text) + self.assertIn("OpenAI API", res.text) + self.assertIn("WordPress REST", res.text) + if __name__ == "__main__": unittest.main() From 88b2ee1d010cd117afdacf65a26eb788f9deab77 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 14:03:49 +0100 Subject: [PATCH 19/54] feat(admin): add feed/source management, rewrite editor, reopen flow, and WP block output --- backend/app/admin_ui.py | 
218 +++++++++++++++++--- backend/app/repositories.py | 74 +++++++ backend/app/wordpress.py | 88 +++++--- backend/app/workflow.py | 3 +- backend/static/admin.css | 2 +- backend/templates/admin_article_detail.html | 14 ++ backend/templates/admin_dashboard.html | 93 +++++++++ backend/tests/test_admin_ui.py | 126 +++++++++++ backend/tests/test_article_workflow.py | 7 +- 9 files changed, 555 insertions(+), 70 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index a725ba8..89819cf 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -23,7 +23,11 @@ from .relevance import article_age_days, article_relevance from .rewrite import rewrite_article_text from .repositories import ( FeedCreate, + FeedUpdate, SourceCreate, + SourceUpdate, + delete_feed, + delete_source, create_feed, create_source, get_article_by_id, @@ -36,6 +40,8 @@ from .repositories import ( set_article_image_decision, set_article_legal_review, upsert_article, + update_feed, + update_source, update_article_status, ArticleUpsert, ) @@ -48,10 +54,11 @@ ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { "new": ("rewrite", "close"), "rewrite": ("publish", "close"), "publish": ("published", "close"), - "published": ("close",), + "published": ("rewrite", "close"), "close": ("rewrite",), } IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0" +_UNSET = object() def _admin_user(request: Request) -> str | None: @@ -364,6 +371,51 @@ def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]: row["duration_ms"] = int((time.perf_counter() - started) * 1000) +def _upsert_article_from_existing( + article: dict, + *, + content_rewritten: str | None = None, + status: str | None = None, + wp_post_id: int | None | object = _UNSET, + wp_post_url: str | None | object = _UNSET, + publish_attempts: int | object = _UNSET, + publish_last_error: str | None | object = _UNSET, + published_to_wp_at: str | None | object = _UNSET, +) -> None: + rewritten = article.get("content_rewritten") 
if content_rewritten is None else content_rewritten + upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title"), + source_url=article.get("source_url"), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id") if wp_post_id is _UNSET else wp_post_id, + wp_post_url=article.get("wp_post_url") if wp_post_url is _UNSET else wp_post_url, + publish_attempts=int(article.get("publish_attempts", 0)) if publish_attempts is _UNSET else publish_attempts, + publish_last_error=article.get("publish_last_error") if publish_last_error is _UNSET else publish_last_error, + published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at, + word_count=len(str(rewritten or "").split()), + status=article.get("status") if status is None else status, + meta_json=article.get("meta_json"), + ) + ) + + @router.get("/admin", response_class=HTMLResponse) def admin_index(request: Request): user = _admin_user(request) @@ -427,7 +479,7 @@ def admin_dashboard(request: Request): articles = list_articles(limit=100, status_filter=internal_filter) else: status_filter = "" - articles = list_articles(limit=100) + articles = [a for a in list_articles(limit=250) if 
internal_to_ui_status(a.get("status")) != "close"][:100] for article in articles: meta = _parse_meta_json(article.get("meta_json")) extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} @@ -659,6 +711,54 @@ def admin_create_source( return _dashboard_redirect(msg="Quelle gespeichert") +@router.post("/admin/sources/{source_id}/update") +def admin_update_source( + request: Request, + source_id: int, + name: str = Form(...), + base_url: str = Form(""), + terms_url: str = Form(""), + license_name: str = Form(""), + risk_level: str = Form("yellow"), + is_enabled: str = Form("1"), + notes: str = Form(""), + last_reviewed_at: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + ok = update_source( + source_id, + SourceUpdate( + name=name, + base_url=base_url or None, + terms_url=terms_url or None, + license_name=license_name or None, + risk_level=risk_level, + is_enabled=is_enabled == "1", + notes=notes or None, + last_reviewed_at=last_reviewed_at or None, + ), + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Quelle #{source_id} Update fehlgeschlagen: {exc}", msg_type="error") + if not ok: + return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Quelle #{source_id} aktualisiert") + + +@router.post("/admin/sources/{source_id}/delete") +def admin_delete_source(request: Request, source_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + ok = delete_source(source_id) + if not ok: + return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Quelle #{source_id} gelöscht") + + @router.post("/admin/feeds/create") def admin_create_feed( request: Request, @@ -684,6 +784,46 @@ def admin_create_feed( return _dashboard_redirect(msg="Feed gespeichert") 
+@router.post("/admin/feeds/{feed_id}/update") +def admin_update_feed( + request: Request, + feed_id: int, + name: str = Form(...), + url: str = Form(...), + source_id: str = Form(""), + is_enabled: str = Form("1"), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + ok = update_feed( + feed_id, + FeedUpdate( + name=name, + url=url, + source_id=_to_optional_int(source_id), + is_enabled=is_enabled == "1", + ), + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Feed #{feed_id} Update fehlgeschlagen: {exc}", msg_type="error") + if not ok: + return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Feed #{feed_id} aktualisiert") + + +@router.post("/admin/feeds/{feed_id}/delete") +def admin_delete_feed(request: Request, feed_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + ok = delete_feed(feed_id) + if not ok: + return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Feed #{feed_id} gelöscht") + + @router.post("/admin/ingestion/run") def admin_run_ingestion(request: Request, feed_id: str = Form("")): user = _admin_user(request) @@ -719,41 +859,51 @@ def admin_rewrite_run(request: Request, article_id: int): rewritten = rewrite_article_text(article) except Exception as exc: return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") - - upsert_article( - ArticleUpsert( - feed_id=article.get("feed_id"), - source_article_id=article.get("source_article_id"), - source_hash=article.get("source_hash"), - title=article.get("title"), - source_url=article.get("source_url"), - canonical_url=article.get("canonical_url"), - published_at=article.get("published_at"), - author=article.get("author"), - summary=article.get("summary"), - 
content_raw=article.get("content_raw"), - content_rewritten=rewritten, - image_urls_json=article.get("image_urls_json"), - press_contact=article.get("press_contact"), - source_name_snapshot=article.get("source_name_snapshot"), - source_terms_url_snapshot=article.get("source_terms_url_snapshot"), - source_license_name_snapshot=article.get("source_license_name_snapshot"), - legal_checked=bool(int(article.get("legal_checked", 0))), - legal_checked_at=article.get("legal_checked_at"), - legal_note=article.get("legal_note"), - wp_post_id=article.get("wp_post_id"), - wp_post_url=article.get("wp_post_url"), - publish_attempts=int(article.get("publish_attempts", 0)), - publish_last_error=article.get("publish_last_error"), - published_to_wp_at=article.get("published_to_wp_at"), - word_count=len(rewritten.split()), - status="approved", - meta_json=article.get("meta_json"), - ) - ) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved") return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") +@router.post("/admin/articles/{article_id}/rewrite-save") +def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + text = (content_rewritten or "").strip() + if not text: + return RedirectResponse( + url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20darf%20nicht%20leer%20sein&type=error", + status_code=303, + ) + _upsert_article_from_existing(article, content_rewritten=text) + return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20gespeichert&type=success", status_code=303) + + +@router.post("/admin/articles/{article_id}/reopen") +def admin_reopen_article(request: Request, article_id: int): + user = 
_admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + _upsert_article_from_existing( + article, + status="rewrite", + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + ) + return RedirectResponse( + url=f"/admin/articles/{article_id}?msg=Artikel%20zurueck%20in%20Rewrite-Workflow%20gesetzt&type=success", + status_code=303, + ) + + @router.post("/admin/articles/{article_id}/transition") def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")): user = _admin_user(request) diff --git a/backend/app/repositories.py b/backend/app/repositories.py index ca25821..bff6e87 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -28,6 +28,26 @@ class FeedCreate: is_enabled: bool +@dataclass(frozen=True) +class SourceUpdate: + name: str + base_url: str | None + terms_url: str | None + license_name: str | None + risk_level: str + is_enabled: bool + notes: str | None + last_reviewed_at: str | None + + +@dataclass(frozen=True) +class FeedUpdate: + name: str + url: str + source_id: int | None + is_enabled: bool + + @dataclass(frozen=True) class RunCreate: run_type: str @@ -118,6 +138,35 @@ def get_source_by_id(source_id: int) -> dict[str, Any] | None: return dict(row) if row else None +def update_source(source_id: int, payload: SourceUpdate) -> bool: + with get_conn() as conn: + cur = conn.execute( + """ + UPDATE sources + SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ? + WHERE id = ? 
+ """, + ( + payload.name.strip(), + payload.base_url, + payload.terms_url, + payload.license_name, + payload.risk_level, + 1 if payload.is_enabled else 0, + payload.notes, + payload.last_reviewed_at, + source_id, + ), + ) + return cur.rowcount > 0 + + +def delete_source(source_id: int) -> bool: + with get_conn() as conn: + cur = conn.execute("DELETE FROM sources WHERE id = ?", (source_id,)) + return cur.rowcount > 0 + + def create_feed(payload: FeedCreate) -> int: with get_conn() as conn: cur = conn.execute( @@ -177,6 +226,31 @@ def get_feed_by_id(feed_id: int) -> dict[str, Any] | None: return dict(row) if row else None +def update_feed(feed_id: int, payload: FeedUpdate) -> bool: + with get_conn() as conn: + cur = conn.execute( + """ + UPDATE feeds + SET name = ?, url = ?, source_id = ?, is_enabled = ? + WHERE id = ? + """, + ( + payload.name.strip(), + payload.url.strip(), + payload.source_id, + 1 if payload.is_enabled else 0, + feed_id, + ), + ) + return cur.rowcount > 0 + + +def delete_feed(feed_id: int) -> bool: + with get_conn() as conn: + cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,)) + return cur.rowcount > 0 + + def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None: with get_conn() as conn: conn.execute( diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index f450efe..c257747 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -61,17 +61,18 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None: return selected if isinstance(selected, str) and selected.strip() else None -def _download_image_bytes(url: str) -> tuple[bytes, str]: - req = Request( - url=url, - headers={ - "User-Agent": "rss-news-publisher/1.0", - "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", - }, - ) +def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: + headers = { + "User-Agent": "rss-news-publisher/1.0", + "Accept": 
"image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + } + if referer: + headers["Referer"] = referer + req = Request(url=url, headers=headers) with urlopen(req, timeout=20) as resp: raw = resp.read() content_type = resp.headers.get("Content-Type", "application/octet-stream") + content_type = content_type.split(";")[0].strip() if content_type else "application/octet-stream" if not content_type.lower().startswith("image/"): raise RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({content_type})") return raw, content_type @@ -94,7 +95,7 @@ def _upload_featured_media( article_title: str, source_url: str, ) -> int: - image_bytes, content_type = _download_image_bytes(image_url) + image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None) filename = _guess_filename(image_url, content_type) media_url = f"{base_url.rstrip('/')}/wp-json/wp/v2/media" @@ -143,6 +144,29 @@ def _as_paragraph_html(text: str) -> str: return "\n".join(lines) +def _as_block_paragraphs(text: str) -> str: + chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] + if not chunks: + return "" + lines = [] + for chunk in chunks: + compact = re.sub(r"\s*\n\s*", " ", chunk) + lines.append(f"

    {escape(compact)}

    ") + return "\n".join(lines) + + +def _as_block_heading(level: int, text: str) -> str: + safe_level = min(6, max(1, int(level))) + return f'{escape(text)}' + + +def _as_block_list(items: list[str]) -> str: + if not items: + return "" + content = "".join(f"
  • {item}
  • " for item in items) + return f"
      {content}
    " + + def _sanitize_publish_text(text: str) -> str: raw = (text or "").strip() if not raw: @@ -164,11 +188,13 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: if not body_text: body_text = summary - # Keep existing HTML if already present, otherwise wrap plain text into paragraphs. + # Keep existing HTML if already present, otherwise wrap plain text into block paragraphs. has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) - body_html = body_text if has_html else _as_paragraph_html(body_text) + body_html = body_text if has_html else _as_block_paragraphs(body_text) if not body_html: - body_html = "

    Kein Inhalt verfügbar.

    " + body_html = "

    Kein Inhalt verfügbar.

    " + elif has_html: + body_html = f"\n{body_html}\n" author = (article.get("author") or "").strip() published_at = (article.get("published_at") or "").strip() @@ -176,35 +202,35 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: license_name = (article.get("source_license_name_snapshot") or "").strip() terms_url = (article.get("source_terms_url_snapshot") or "").strip() - lead_html = f"

    {escape(summary)}

    \n" if summary else "" + lead_html = f"

    {escape(summary)}

    \n" if summary else "" facts: list[str] = [] if author: - facts.append(f"
  • Autor: {escape(author)}
  • ") + facts.append(f"Autor: {escape(author)}") if published_at: - facts.append(f"
  • Veröffentlicht (Quelle): {escape(published_at)}
  • ") + facts.append(f"Veröffentlicht (Quelle): {escape(published_at)}") if source_name: - facts.append(f"
  • Quelle: {escape(source_name)}
  • ") + facts.append(f"Quelle: {escape(source_name)}") if license_name: - facts.append(f"
  • Lizenz: {escape(license_name)}
  • ") + facts.append(f"Lizenz: {escape(license_name)}") if terms_url: - facts.append(f"
  • Lizenzhinweise: {escape(terms_url)}
  • ") + facts.append(f"Lizenzhinweise: {escape(terms_url)}") - facts_html = ( - "

    Artikeldetails

    \n
      \n" + "\n".join(facts) + "\n
    \n" - if facts - else "" - ) - attribution_html = ( - "
    \n
    \n" - "

    Quelle

    \n" - f"

    Originalartikel: {escape(source_url)}

    \n" - ) + facts_html = "" + if facts: + facts_html = _as_block_heading(3, "Artikeldetails") + "\n" + _as_block_list(facts) + + attribution_parts = [ + _as_block_heading(3, "Quelle"), + f'

    Originalartikel: {escape(source_url)}

    ', + ] if canonical_url and canonical_url != source_url: - attribution_html += f"

    Canonical: {escape(canonical_url)}

    \n" - attribution_html += "
    " + attribution_parts.append( + f'

    Canonical: {escape(canonical_url)}

    ' + ) + attribution_html = "\n".join(attribution_parts) - content = f"{lead_html}{body_html}\n\n{facts_html}{attribution_html}".strip() + content = f"{lead_html}{body_html}\n\n{facts_html}\n{attribution_html}".strip() excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() excerpt = excerpt_source[:220] if excerpt_source else None return content, excerpt diff --git a/backend/app/workflow.py b/backend/app/workflow.py index 6ef989d..b6cd4bb 100644 --- a/backend/app/workflow.py +++ b/backend/app/workflow.py @@ -33,7 +33,6 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { "new": {"rewrite", "close"}, "rewrite": {"publish", "close"}, "publish": {"published", "close"}, - "published": {"close"}, + "published": {"rewrite", "close"}, "close": {"rewrite"}, } - diff --git a/backend/static/admin.css b/backend/static/admin.css index 053263a..0b31bb5 100644 --- a/backend/static/admin.css +++ b/backend/static/admin.css @@ -94,7 +94,7 @@ th, td { vertical-align: top; } -input, select, button { +input, select, button, textarea { padding: 8px; border-radius: 6px; border: 1px solid #cbd5e1; diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index bf88d2e..70d5c57 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -164,6 +164,15 @@
    {{ article.content_raw or "-" }}
    +
    +

    Rewrite-Text (editierbar)

    +
    + + +
    +

    Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.

    +
    +

    Rechtsfreigabe

    Freigabe: @@ -192,6 +201,11 @@ {% endif %} + {% if article.status_ui == "published" %} +

    + +
    + {% endif %}
    +
    +

    Quellen verwalten

    + + + + + + {% for s in sources %} + {% set source_form_id = 'source-update-' ~ s.id %} + + + + + + + + {% endfor %} + +
    IDNameURLsMetaAktionen
    #{{ s.id }} + + + + + + + + + + + +
    + + + +
    + +
    +
    +
    +
    + +
    +

    Feeds verwalten

    + + + + + + {% for f in feeds %} + {% set feed_form_id = 'feed-update-' ~ f.id %} + + + + + + + + + {% endfor %} + +
    IDNameURLQuelleStatusAktionen
    #{{ f.id }} + + + + + + +
    +
    + +
    +
    + +
    +
    +
    +
    +

    Artikel (Review)

    diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index 557121a..eab63fd 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -145,6 +145,132 @@ class TestAdminUi(unittest.TestCase): self.assertIsNotNone(article) self.assertIn("selected_url", article.get("meta_json", "")) + def test_manage_source_and_feed(self) -> None: + source_id = create_source( + SourceCreate( + name="Edit Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="yellow", + is_enabled=True, + notes=None, + last_reviewed_at=None, + ) + ) + feed_id = create_feed( + FeedCreate( + name="Edit Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + + update_source_res = self.client.post( + f"/admin/sources/{source_id}/update", + data={ + "name": "Edit Source 2", + "base_url": "https://example.org/new", + "terms_url": "https://example.org/new-terms", + "license_name": "cc0", + "risk_level": "green", + "is_enabled": "1", + "notes": "ok", + "last_reviewed_at": "2026-02-21T12:00:00Z", + }, + follow_redirects=False, + ) + self.assertEqual(update_source_res.status_code, 303) + + update_feed_res = self.client.post( + f"/admin/feeds/{feed_id}/update", + data={ + "name": "Edit Feed 2", + "url": "https://example.org/feed2.xml", + "source_id": str(source_id), + "is_enabled": "0", + }, + follow_redirects=False, + ) + self.assertEqual(update_feed_res.status_code, 303) + + delete_feed_res = self.client.post(f"/admin/feeds/{feed_id}/delete", follow_redirects=False) + self.assertEqual(delete_feed_res.status_code, 303) + delete_source_res = self.client.post(f"/admin/sources/{source_id}/delete", follow_redirects=False) + self.assertEqual(delete_source_res.status_code, 303) + + def test_rewrite_save_and_reopen(self) -> None: + source_id = 
create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="id-published", + source_hash="hash-published", + title="Titel Published", + source_url="https://example.org/published", + canonical_url="https://example.org/published", + published_at=None, + author="Autor A", + summary="Summary", + content_raw="Raw", + content_rewritten="

    Alt

    ", + image_urls_json=None, + press_contact=None, + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=True, + legal_checked_at="2026-02-21T10:00:00Z", + legal_note=None, + wp_post_id=123, + wp_post_url="https://example.org/?p=123", + publish_attempts=2, + publish_last_error=None, + published_to_wp_at="2026-02-21T10:10:00Z", + word_count=1, + status="published", + meta_json="{}", + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + + save_res = self.client.post( + f"/admin/articles/{article_id}/rewrite-save", + data={"content_rewritten": "

    Neu

    Text

    "}, + follow_redirects=False, + ) + self.assertEqual(save_res.status_code, 303) + + reopen_res = self.client.post(f"/admin/articles/{article_id}/reopen", follow_redirects=False) + self.assertEqual(reopen_res.status_code, 303) + + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertEqual(article.get("status"), "rewrite") + self.assertIn("Neu", article.get("content_rewritten") or "") + self.assertIsNone(article.get("wp_post_id")) + @patch("backend.app.admin_ui.urlopen") def test_image_proxy_returns_image_data(self, mock_urlopen) -> None: class _FakeHeaders: diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py index 29dd212..094b595 100644 --- a/backend/tests/test_article_workflow.py +++ b/backend/tests/test_article_workflow.py @@ -76,10 +76,13 @@ class TestArticleWorkflow(unittest.TestCase): t3 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) self.assertEqual(t3.status_code, 200) + t4 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) + self.assertEqual(t4.status_code, 200) + final = self.client.get(f"/api/articles/{article_id}") self.assertEqual(final.status_code, 200) - self.assertEqual(final.json()["item"]["status"], "published") - self.assertEqual(final.json()["item"]["status_ui"], "published") + self.assertEqual(final.json()["item"]["status"], "rewrite") + self.assertEqual(final.json()["item"]["status_ui"], "rewrite") def test_invalid_transition_rejected(self) -> None: article_id = self._create_article() From da269d08f166e488a4abfdefe91bf5f1072d015d Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 14:11:03 +0100 Subject: [PATCH 20/54] chore(admin): remove legal approval step from UI workflow --- backend/app/admin_ui.py | 23 +-------------------- backend/templates/admin_article_detail.html | 23 +-------------------- backend/templates/admin_dashboard.html | 1 - 
backend/tests/test_admin_ui.py | 2 +- 4 files changed, 3 insertions(+), 46 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 89819cf..d3ca53e 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -38,7 +38,6 @@ from .repositories import ( list_runs, list_sources, set_article_image_decision, - set_article_legal_review, upsert_article, update_feed, update_source, @@ -185,7 +184,7 @@ def _classify_publish_error(error_message: str | None) -> tuple[str, str]: if not text.strip(): return "ok", "-" if "rechtsfreigabe fehlt" in text or "hauptbild nicht gesetzt" in text or "status ist nicht" in text: - return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Rechtsfreigabe/Hauptbild)." + return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Hauptbild)." if "401" in text or "403" in text or "authorization" in text or "forbidden" in text or "unauthorized" in text: return "auth", "WordPress Nutzer/App-Passwort prüfen." if "404" in text and ("media" in text or "posts" in text or "wp-json" in text): @@ -245,13 +244,6 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: "value": feed.get("source_risk_level") if feed else "-", } ) - checks.append( - { - "label": "Manuelle Rechtsfreigabe", - "status": "ok" if int(article.get("legal_checked", 0)) == 1 else "missing", - "value": article.get("legal_checked_at") or "-", - } - ) image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None checks.append( @@ -666,19 +658,6 @@ def admin_image_proxy(request: Request, url: str): return Response(content=body, media_type=content_type) -@router.post("/admin/articles/{article_id}/legal-review") -def admin_article_legal_review(request: Request, article_id: int, approved: str = Form("0"), note: str = Form("")): - user = _admin_user(request) - if not user: - return 
RedirectResponse(url="/admin/login", status_code=303) - - is_approved = approved == "1" - ok = set_article_legal_review(article_id, approved=is_approved, note=note or None, actor=user) - if not ok: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) - - @router.post("/admin/sources/create") def admin_create_source( request: Request, diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 70d5c57..992fe20 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -58,7 +58,7 @@
    -

    Rechts-Checkliste

    +

    Checkliste

    @@ -173,27 +173,6 @@

    Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.

    -
    -

    Rechtsfreigabe

    -

    Freigabe: - {% if article.legal_checked %} - Freigegeben - {% else %} - Nicht freigegeben - {% endif %} -

    -

    Zeitpunkt: {{ article.legal_checked_at or "-" }}

    -

    Notiz: {{ article.legal_note or "-" }}

    - - - - - -
    -

    Status ändern

    {% if article.status_ui in ["new", "rewrite"] %} diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index c82f8e3..f24b76e 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -258,7 +258,6 @@
    KriteriumStatusWert
    {{ a.status_ui }} -
    Legal: {{ "OK" if a.legal_checked else "offen" }}
    Publish: {{ "bereit" if a.publish_ready else "blockiert" }}
    {% if not a.publish_ready and a.publish_blockers %}
    {{ a.publish_blockers|join(", ") }}
    diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index eab63fd..6b7d1c7 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -131,7 +131,7 @@ class TestAdminUi(unittest.TestCase): res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True) self.assertEqual(res.status_code, 200) self.assertIn("Artikel-Detail", res.text) - self.assertIn("Rechts-Checkliste", res.text) + self.assertIn("Checkliste", res.text) decision = self.client.post( f"/admin/articles/{article_id}/images/decision", From b0f995d5c9148b3f63c374e126b86bfaa0823e32 Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 14:39:47 +0100 Subject: [PATCH 21/54] feat(rewrite): add batch rewrite run, AI tags for WP, and agentur contact detection --- backend/app/admin_ui.py | 37 +++++- backend/app/main.py | 12 +- backend/app/rewrite.py | 120 ++++++++++++++++---- backend/app/source_extraction.py | 8 +- backend/app/wordpress.py | 92 ++++++++++++++- backend/templates/admin_article_detail.html | 3 + backend/templates/admin_dashboard.html | 12 ++ backend/tests/test_admin_ui.py | 65 +++++++++++ backend/tests/test_source_extraction.py | 27 +++++ backend/tests/test_wordpress.py | 34 ++++++ 10 files changed, 374 insertions(+), 36 deletions(-) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index d3ca53e..689efce 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -20,7 +20,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance -from .rewrite import rewrite_article_text +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text from .repositories import ( FeedCreate, FeedUpdate, @@ -373,6 +373,7 @@ def _upsert_article_from_existing( publish_attempts: int | object = _UNSET, publish_last_error: str | None | object = _UNSET, 
published_to_wp_at: str | None | object = _UNSET, + meta_json: str | None | object = _UNSET, ) -> None: rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten upsert_article( @@ -403,7 +404,7 @@ def _upsert_article_from_existing( published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at, word_count=len(str(rewritten or "").split()), status=article.get("status") if status is None else status, - meta_json=article.get("meta_json"), + meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json, ) ) @@ -493,6 +494,8 @@ def admin_dashboard(request: Request): article["days_old"] = article_age_days(article.get("published_at")) article["relevance"] = article_relevance(article.get("published_at")) article["status_ui"] = internal_to_ui_status(article.get("status")) + tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else [] + article["generated_tags"] = [str(t) for t in tags if t] return templates.TemplateResponse( request, @@ -836,12 +839,40 @@ def admin_rewrite_run(request: Request, article_id: int): return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error") try: rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) except Exception as exc: return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") - _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved") + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") +@router.post("/admin/rewrite/run") +def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")): + user = 
_admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + limit = max(1, min(int(max_jobs), 100)) + except Exception: + limit = 10 + planned = list_articles(limit=limit, status_filter="rewrite") + processed = 0 + success = 0 + failed = 0 + for article in planned: + processed += 1 + try: + rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) + success += 1 + except Exception: + failed += 1 + return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}") + + @router.post("/admin/articles/{article_id}/rewrite-save") def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)): user = _admin_user(request) diff --git a/backend/app/main.py b/backend/app/main.py index 4dcee28..b0bcf2a 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -18,7 +18,7 @@ from .ingestion import run_ingestion from .policy import evaluate_source_policy, is_source_allowed from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance -from .rewrite import rewrite_article_text +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text from .repositories import ( ArticleUpsert, FeedCreate, @@ -514,6 +514,12 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'") rewritten = rewrite_article_text(article) + tags: list[str] = [] + try: + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception: + tags = [] + merged_meta = merge_generated_tags(article.get("meta_json"), tags) # upsert via status 
update + existing fields by lightweight path: repo_upsert_article( ArticleUpsert( @@ -543,10 +549,10 @@ def api_article_rewrite_run(article_id: int, username: str = Depends(require_aut published_to_wp_at=article.get("published_to_wp_at"), word_count=len(rewritten.split()), status="approved", - meta_json=article.get("meta_json"), + meta_json=merged_meta, ) ) - return {"ok": True, "id": article_id, "status": "publish"} + return {"ok": True, "id": article_id, "status": "publish", "tags": tags} @app.post("/api/articles/{article_id}/legal-review") diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py index 8c313ad..759fac9 100644 --- a/backend/app/rewrite.py +++ b/backend/app/rewrite.py @@ -28,35 +28,39 @@ def _sanitize_source_text(text: str) -> str: return joined -def rewrite_article_text(article: dict[str, Any]) -> str: +def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for raw in tags: + value = re.sub(r"\s+", " ", str(raw or "").strip()) + value = re.sub(r"^[#\-•\s]+", "", value) + value = re.sub(r"[;,.:\s]+$", "", value) + if not value: + continue + if len(value) < 2 or len(value) > 40: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + out.append(value) + if len(out) >= max_tags: + break + return out + + +def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str: settings = get_settings() api_key = settings.openai_api_key if not api_key: raise RuntimeError("OPENAI_API_KEY fehlt") - source_text = _sanitize_source_text(article.get("content_raw") or "") - if not source_text: - source_text = (article.get("summary") or "").strip() - if not source_text: - raise RuntimeError("Kein Quelltext für Rewrite verfügbar") - - title = (article.get("title") or "").strip() - prompt = ( - "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. 
" - "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " - "ohne Pressekontakt, ohne Quellenblock. " - "Nutze klare Absätze und Zwischenüberschriften in HTML (

    ,

    ,

    • falls passend). " - "Inhaltlich korrekt bleiben, nichts erfinden.\n\n" - f"Titel: {title}\n\n" - f"Originaltext:\n{source_text}" - ) - payload = { "model": settings.openai_model, - "temperature": 0.4, + "temperature": temperature, "messages": [ - {"role": "system", "content": "Du bist ein deutscher News-Redakteur."}, - {"role": "user", "content": prompt}, + {"role": "system", "content": system}, + {"role": "user", "content": user}, ], } req = Request( @@ -78,6 +82,78 @@ def rewrite_article_text(article: dict[str, Any]) -> str: message = choices[0].get("message", {}) content = message.get("content") if not isinstance(content, str) or not content.strip(): - raise RuntimeError("OpenAI lieferte keinen Rewrite-Text") + raise RuntimeError("OpenAI lieferte keinen Inhalt") return content.strip() + +def rewrite_article_text(article: dict[str, Any]) -> str: + source_text = _sanitize_source_text(article.get("content_raw") or "") + if not source_text: + source_text = (article.get("summary") or "").strip() + if not source_text: + raise RuntimeError("Kein Quelltext für Rewrite verfügbar") + + title = (article.get("title") or "").strip() + prompt = ( + "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " + "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " + "ohne Pressekontakt, ohne Quellenblock. " + "Nutze klare Absätze und Zwischenüberschriften in HTML (

      ,

      ,

      • falls passend). " + "Inhaltlich korrekt bleiben, nichts erfinden.\n\n" + f"Titel: {title}\n\n" + f"Originaltext:\n{source_text}" + ) + return _openai_chat( + "Du bist ein deutscher News-Redakteur.", + prompt, + temperature=0.4, + ) + + +def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]: + source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "") + source_text = str(source_text).strip() + if not source_text: + return [] + title = (article.get("title") or "").strip() + prompt = ( + "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. " + f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. " + "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n" + f"Titel: {title}\n\n" + f"Text:\n{source_text[:3500]}" + ) + raw = _openai_chat( + "Du extrahierst präzise, kurze News-Tags auf Deutsch.", + prompt, + temperature=0.2, + ) + try: + parsed = json.loads(raw) + if isinstance(parsed, list): + return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) + except Exception: + pass + # fallback: extract first JSON-like array if model wrapped output + match = re.search(r"\[[\s\S]*\]", raw) + if match: + try: + parsed = json.loads(match.group(0)) + if isinstance(parsed, list): + return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) + except Exception: + return [] + return [] + + +def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str: + meta: dict[str, Any] = {} + if meta_json: + try: + parsed = json.loads(meta_json) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + meta["generated_tags"] = _normalize_tags(tags) + return json.dumps(meta, ensure_ascii=False) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py index 7fd65ce..925fcf6 100644 --- a/backend/app/source_extraction.py 
+++ b/backend/app/source_extraction.py @@ -157,7 +157,7 @@ def _extract_content_text(html: str) -> str | None: paragraphs = [] for match in re.finditer(r"]*>([\s\S]*?)", section, re.IGNORECASE): text = _clean_text(match.group(1)) - if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE): + if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE): paragraphs.append(text) for match in re.finditer(r"]*>([\s\S]*?)

        ", section, re.IGNORECASE): @@ -177,18 +177,18 @@ def _extract_press_contact(content_text: str | None) -> str | None: return None lines = [line.strip() for line in content_text.split("\n") if line.strip()] - marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE) + marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE) for idx, line in enumerate(lines): if marker_re.search(line): chunk = [line] for nxt in lines[idx + 1 : idx + 6]: - if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE): + if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE): break chunk.append(nxt) return _clean_text("\n".join(chunk)) match = re.search( - r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)", + r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)", content_text, re.IGNORECASE, ) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index c257747..8da5fc5 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -7,7 +7,7 @@ import mimetypes from pathlib import Path import re from typing import Any -from urllib.parse import urlparse +from urllib.parse import quote_plus, urlparse from urllib.request import Request, urlopen from .config import get_settings @@ -25,7 +25,7 @@ def _wp_request( method: str, endpoint: str, payload: dict[str, Any] | None = None, -) -> dict[str, Any]: +) -> Any: url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}" data = json.dumps(payload).encode("utf-8") if payload is not None else None req = Request( @@ -41,8 +41,7 @@ def _wp_request( ) with urlopen(req, timeout=20) as resp: raw = resp.read().decode("utf-8", errors="replace") - parsed = json.loads(raw) if raw else {} - return parsed if isinstance(parsed, dict) else {} + return json.loads(raw) if raw else {} def 
_selected_image_url_from_meta(meta_json: str | None) -> str | None: @@ -61,6 +60,81 @@ def _selected_image_url_from_meta(meta_json: str | None) -> str | None: return selected if isinstance(selected, str) and selected.strip() else None +def _selected_tags_from_meta(meta_json: str | None) -> list[str]: + if not meta_json: + return [] + try: + meta = json.loads(meta_json) + except Exception: + return [] + if not isinstance(meta, dict): + return [] + raw_tags = meta.get("generated_tags") + if not isinstance(raw_tags, list): + return [] + tags: list[str] = [] + seen: set[str] = set() + for item in raw_tags: + value = str(item or "").strip() + if not value: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + tags.append(value) + if len(tags) >= 12: + break + return tags + + +def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]: + ids: list[int] = [] + seen: set[int] = set() + for tag in tags: + name = tag.strip() + if not name: + continue + try: + endpoint = f"tags?search={quote_plus(name)}&per_page=20" + result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint) + tag_id: int | None = None + if isinstance(result, list): + for row in result: + if not isinstance(row, dict): + continue + row_name = str(row.get("name") or "") + rid = int(row.get("id", 0) or 0) + if rid <= 0: + continue + if row_name.casefold() == name.casefold(): + tag_id = rid + break + if tag_id is None: + for row in result: + if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0: + tag_id = int(row.get("id", 0)) + break + if tag_id is None: + created = _wp_request( + base_url=base_url, + auth_header=auth_header, + method="POST", + endpoint="tags", + payload={"name": name}, + ) + if isinstance(created, dict): + rid = int(created.get("id", 0) or 0) + if rid > 0: + tag_id = rid + if tag_id is not None and tag_id > 0 and tag_id not in seen: + seen.add(tag_id) + ids.append(tag_id) + except 
Exception: + continue + return ids + + def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: headers = { "User-Agent": "rss-news-publisher/1.0", @@ -269,6 +343,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: payload["featured_media"] = featured_media_id wp_post_id = article.get("wp_post_id") + tag_ids = _resolve_wp_tag_ids( + base_url=settings.wordpress_base_url, + auth_header=auth, + tags=_selected_tags_from_meta(article.get("meta_json")), + ) + if tag_ids: + payload["tags"] = tag_ids + if wp_post_id: result = _wp_request( base_url=settings.wordpress_base_url, @@ -286,6 +368,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: payload=payload, ) + if not isinstance(result, dict): + raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}") post_id = int(result.get("id", 0)) if post_id <= 0: raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}") diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html index 992fe20..1c16658 100644 --- a/backend/templates/admin_article_detail.html +++ b/backend/templates/admin_article_detail.html @@ -170,6 +170,9 @@ + {% if article.meta.generated_tags %} +

        Generierte Tags: {{ article.meta.generated_tags|join("; ") }}

        + {% endif %}

        Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.

        diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index f24b76e..15f3daf 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -102,6 +102,15 @@ +
        +

        Rewrite Run (geplante Artikel)

        +

        Verarbeitet alle Artikel im Status rewrite und setzt sie auf publish.

        +
        + + +
        +
        +

        Quellen + Policy

        @@ -269,6 +278,9 @@ {% if a.summary %}
        Summary: {{ a.summary }}
        {% endif %} + {% if a.generated_tags %} +
        Tags: {{ a.generated_tags|join("; ") }}
        + {% endif %} {% if a.content_raw %}
        Volltext anzeigen diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py index 6b7d1c7..c7b6ebf 100644 --- a/backend/tests/test_admin_ui.py +++ b/backend/tests/test_admin_ui.py @@ -271,6 +271,71 @@ class TestAdminUi(unittest.TestCase): self.assertIn("Neu", article.get("content_rewritten") or "") self.assertIsNone(article.get("wp_post_id")) + @patch("backend.app.admin_ui.generate_article_tags") + @patch("backend.app.admin_ui.rewrite_article_text") + def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None: + mock_rewrite_text.return_value = "

        Neu

        Text

        " + mock_tags.return_value = ["Rheingas", "Monheim"] + + source_id = create_source( + SourceCreate( + name="Batch Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at=None, + ) + ) + feed_id = create_feed( + FeedCreate( + name="Batch Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="batch-1", + source_hash="batch-hash-1", + title="Batch Titel", + source_url="https://example.org/batch", + canonical_url="https://example.org/batch", + published_at=None, + author="Autor", + summary="Summary", + content_raw="Raw", + content_rewritten=None, + image_urls_json=None, + press_contact=None, + source_name_snapshot="Batch Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=1, + status="rewrite", + meta_json="{}", + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False) + self.assertEqual(res.status_code, 303) + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertEqual(article.get("status"), "approved") + self.assertIn("generated_tags", article.get("meta_json", "")) + @patch("backend.app.admin_ui.urlopen") def test_image_proxy_returns_image_data(self, mock_urlopen) -> None: class _FakeHeaders: diff --git a/backend/tests/test_source_extraction.py b/backend/tests/test_source_extraction.py index f6787ff..5cafde7 100644 --- a/backend/tests/test_source_extraction.py +++ 
b/backend/tests/test_source_extraction.py @@ -26,6 +26,25 @@ SAMPLE_HTML = """ """ +SAMPLE_HTML_AGENTUR = """ + + + + + + + +
        +

        Inhalt der Meldung.

        +

        Agentur

        +

        Agenturname GmbH

        +

        presse@agentur.example

        +

        Original-Content von Beispiel

        +
        + + +""" + class _FakeHeaders: @staticmethod @@ -64,6 +83,14 @@ class TestSourceExtraction(unittest.TestCase): self.assertIn("Pressekontakt", extracted.press_contact or "") self.assertIsNone(extracted.extraction_error) + @patch("backend.app.source_extraction.urlopen") + def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None: + mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR) + extracted = extract_article("https://www.presseportal.de/pm/155103/6210401") + self.assertIn("Agentur", extracted.press_contact or "") + self.assertIn("Agenturname", extracted.press_contact or "") + self.assertIn("presse@agentur.example", extracted.press_contact or "") + if __name__ == "__main__": unittest.main() diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index 833b159..4cafc55 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -80,6 +80,40 @@ class TestWordpressPublish(unittest.TestCase): self.assertNotIn("Pressekontakt", content) self.assertIn("eigentliche Text", content) + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None: + def _fake_wp_request(**kwargs): + endpoint = kwargs.get("endpoint", "") + method = kwargs.get("method", "") + if method == "GET" and endpoint.startswith("tags?search="): + if "Rheingas" in endpoint: + return [{"id": 11, "name": "Rheingas"}] + return [] + if method == "POST" and endpoint == "tags": + name = (kwargs.get("payload") or {}).get("name") + if name == "Gasflasche": + return {"id": 12, "name": "Gasflasche"} + return {"id": 13, "name": str(name)} + if method == "POST" and endpoint == "posts": + return {"id": 900, "link": "https://example.org/?p=900"} + return {} + + mock_wp_request.side_effect = _fake_wp_request + article = { + "title": "Tag Test", + "content_raw": "Inhalt", + "source_url": 
"https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}', + } + post_id, _ = publish_article_draft(article) + self.assertEqual(post_id, 900) + post_calls = [call for call in mock_wp_request.call_args_list if call.kwargs.get("endpoint") == "posts"] + self.assertEqual(len(post_calls), 1) + payload = post_calls[0].kwargs.get("payload", {}) + self.assertEqual(payload.get("tags"), [11, 12]) + if __name__ == "__main__": unittest.main() From 93f52f72b948aa997491cce0ae7a33e89197a88c Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 14:51:36 +0100 Subject: [PATCH 22/54] fix(ingestion): preserve article workflow data and skip closed items on re-import --- backend/app/ingestion.py | 111 +++++++++++++++++++++-------- backend/app/repositories.py | 7 ++ backend/tests/test_ingestion.py | 119 +++++++++++++++++++++++++++++++- 3 files changed, 206 insertions(+), 31 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 872a1b0..1ba6b6c 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -16,6 +16,7 @@ from .repositories import ( ArticleUpsert, RunCreate, create_run, + find_existing_article_for_upsert, finish_run, get_feed_by_id, list_enabled_feeds, @@ -135,6 +136,20 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_ return kept, primary, ranked +def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if existing_meta_json: + try: + parsed = json.loads(existing_meta_json) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + meta["attribution"] = attribution + meta["extraction"] = extraction_meta + return json.dumps(meta, ensure_ascii=False) + + def run_ingestion(feed_id: int | None = None) -> IngestionStats: run_id = create_run(RunCreate(run_type="ingestion", status="running", 
details="started")) feeds_processed = 0 @@ -268,37 +283,73 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: "total_candidates": len(extracted.images), "ranked": ranked_images, } - article_id = upsert_article( - ArticleUpsert( - feed_id=int(feed["id"]), - source_article_id=entry.get("id") or entry.get("guid"), - source_hash=source_hash, - title=final_title, - source_url=link, - canonical_url=final_canonical, - published_at=_entry_published_iso(entry), - author=final_author, - summary=final_summary, - content_raw=final_content_raw, - content_rewritten=None, - image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, - press_contact=extracted.press_contact, - source_name_snapshot=feed.get("source_name"), - source_terms_url_snapshot=feed.get("source_terms_url"), - source_license_name_snapshot=feed.get("source_license_name"), - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=len((final_content_raw or "").split()), - status="new", - meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), - ) + base_payload = ArticleUpsert( + feed_id=int(feed["id"]), + source_article_id=entry.get("id") or entry.get("guid"), + source_hash=source_hash, + title=final_title, + source_url=link, + canonical_url=final_canonical, + published_at=_entry_published_iso(entry), + author=final_author, + summary=final_summary, + content_raw=final_content_raw, + content_rewritten=None, + image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, + press_contact=extracted.press_contact, + source_name_snapshot=feed.get("source_name"), + source_terms_url_snapshot=feed.get("source_terms_url"), + source_license_name_snapshot=feed.get("source_license_name"), + legal_checked=False, + legal_checked_at=None, + legal_note=None, + 
wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=len((final_content_raw or "").split()), + status="new", + meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), ) + existing = find_existing_article_for_upsert(base_payload) + if existing and existing.get("status") == "error": + # Explicitly closed article: ignore on subsequent ingestion runs. + continue + + payload = base_payload + if existing: + payload = ArticleUpsert( + feed_id=base_payload.feed_id, + source_article_id=base_payload.source_article_id, + source_hash=base_payload.source_hash, + title=base_payload.title, + source_url=base_payload.source_url, + canonical_url=base_payload.canonical_url, + published_at=base_payload.published_at, + author=base_payload.author, + summary=base_payload.summary, + content_raw=base_payload.content_raw, + content_rewritten=existing.get("content_rewritten"), + image_urls_json=base_payload.image_urls_json, + press_contact=base_payload.press_contact or existing.get("press_contact"), + source_name_snapshot=base_payload.source_name_snapshot, + source_terms_url_snapshot=base_payload.source_terms_url_snapshot, + source_license_name_snapshot=base_payload.source_license_name_snapshot, + legal_checked=bool(int(existing.get("legal_checked", 0))), + legal_checked_at=existing.get("legal_checked_at"), + legal_note=existing.get("legal_note"), + wp_post_id=existing.get("wp_post_id"), + wp_post_url=existing.get("wp_post_url"), + publish_attempts=int(existing.get("publish_attempts", 0)), + publish_last_error=existing.get("publish_last_error"), + published_to_wp_at=existing.get("published_to_wp_at"), + word_count=base_payload.word_count, + status=existing.get("status") or "new", + meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta), + ) + + article_id = upsert_article(payload) if article_id: articles_upserted += 1 feed_upserts += 1 diff 
--git a/backend/app/repositories.py b/backend/app/repositories.py index bff6e87..0ee5380 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -633,6 +633,13 @@ def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: return None +def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None: + article_id = _resolve_existing_article_id(payload) + if article_id is None: + return None + return get_article_by_id(article_id) + + def upsert_article(payload: ArticleUpsert) -> int: existing_id = _resolve_existing_article_id(payload) with get_conn() as conn: diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py index 342c216..a36b62e 100644 --- a/backend/tests/test_ingestion.py +++ b/backend/tests/test_ingestion.py @@ -7,7 +7,16 @@ from unittest.mock import patch from backend.app import config as config_module from backend.app.db import init_db from backend.app.ingestion import run_ingestion -from backend.app.repositories import FeedCreate, SourceCreate, create_feed, create_source, list_articles +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + list_articles, + upsert_article, +) from backend.app.source_extraction import ExtractedArticle @@ -118,6 +127,114 @@ class TestIngestion(unittest.TestCase): mock_parse.assert_not_called() mock_extract_article.assert_not_called() + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None: + existing_closed_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="closed-1", + source_hash="closed-hash-1", + title="Alt Closed", + source_url="https://example.org/closed-article", + canonical_url="https://example.org/closed-article", + published_at=None, + author="Autor", + summary="Alt", 
+ content_raw="Alt Raw", + content_rewritten="

        Alt Rewrite Closed

        ", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=42, + wp_post_url="https://wp.local/?p=42", + publish_attempts=2, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:00:00Z", + word_count=3, + status="error", # UI: close + meta_json='{"generated_tags":["AltTag"]}', + ) + ) + existing_published_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="published-1", + source_hash="published-hash-1", + title="Alt Published", + source_url="https://example.org/published-article", + canonical_url="https://example.org/published-article", + published_at=None, + author="Autor", + summary="Alt", + content_raw="Alt Raw", + content_rewritten="

        Alt Rewrite Published

        ", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=77, + wp_post_url="https://wp.local/?p=77", + publish_attempts=3, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:10:00Z", + word_count=3, + status="published", + meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}', + ) + ) + + mock_extract_article.return_value = ExtractedArticle( + title="Neu Titel", + author="Neu Autor", + canonical_url=None, + summary="Neu Summary", + content_text="Neu Volltext", + images=["https://example.org/a.jpg"], + press_contact=None, + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-2", + "modified": "Tue, 18 Feb 2026 11:00:00 GMT", + "entries": [ + { + "id": "closed-1", + "title": "Closed Entry", + "link": "https://example.org/closed-article", + "summary": "X", + }, + { + "id": "published-1", + "title": "Published Entry", + "link": "https://example.org/published-article", + "summary": "Y", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + closed_row = get_article_by_id(existing_closed_id) or {} + self.assertEqual(closed_row["status"], "error") + self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "") + self.assertEqual(closed_row.get("wp_post_id"), 42) + + published_row = get_article_by_id(existing_published_id) or {} + self.assertEqual(published_row["status"], "published") + self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "") + self.assertEqual(published_row.get("wp_post_id"), 77) + self.assertIn("generated_tags", published_row.get("meta_json") or "") + if __name__ == "__main__": unittest.main() From 6332a9a3991216106fc716e8ae8d0545cbe9bfe5 Mon Sep 17 00:00:00 
2001 From: Oliver G Date: Sat, 21 Feb 2026 14:55:20 +0100 Subject: [PATCH 23/54] feat(wordpress): publish true Gutenberg blocks and remove auto summary/details sections --- backend/app/wordpress.py | 84 +++++++++++++++------------------ backend/tests/test_wordpress.py | 26 ++++++++-- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 8da5fc5..150bcd1 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -229,6 +229,42 @@ def _as_block_paragraphs(text: str) -> str: return "\n".join(lines) +def _strip_html_tags(raw: str) -> str: + text = re.sub(r"<[^>]+>", " ", raw or "") + return re.sub(r"\s+", " ", text).strip() + + +def _html_to_wp_blocks(html: str) -> str: + src = (html or "").strip() + if not src: + return "" + pattern = re.compile( + r"]*>[\s\S]*?|]*>[\s\S]*?

        |]*>[\s\S]*?|]*>[\s\S]*?", + re.IGNORECASE, + ) + blocks: list[str] = [] + for match in pattern.finditer(src): + block_html = match.group(0).strip() + if not block_html: + continue + tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE) + tag = (tag_match.group(1).lower() if tag_match else "") + if tag == "p": + blocks.append(f"{block_html}") + elif tag in {"ul", "ol"}: + ordered = tag == "ol" + if ordered: + blocks.append(f'{block_html}') + else: + blocks.append(f"{block_html}") + elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit(): + level = int(tag[1]) + blocks.append(f'{block_html}') + if blocks: + return "\n".join(blocks) + return _as_block_paragraphs(_strip_html_tags(src)) + + def _as_block_heading(level: int, text: str) -> str: safe_level = min(6, max(1, int(level))) return f'{escape(text)}' @@ -254,60 +290,18 @@ def _sanitize_publish_text(text: str) -> str: def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: - source_url = article.get("source_url") or "" - canonical_url = article.get("canonical_url") or source_url summary = (article.get("summary") or "").strip() body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() body_text = _sanitize_publish_text(body_text) if not body_text: body_text = summary - # Keep existing HTML if already present, otherwise wrap plain text into block paragraphs. has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) - body_html = body_text if has_html else _as_block_paragraphs(body_text) + body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) if not body_html: body_html = "

        Kein Inhalt verfügbar.

        " - elif has_html: - body_html = f"\n{body_html}\n" - - author = (article.get("author") or "").strip() - published_at = (article.get("published_at") or "").strip() - source_name = (article.get("source_name_snapshot") or "").strip() - license_name = (article.get("source_license_name_snapshot") or "").strip() - terms_url = (article.get("source_terms_url_snapshot") or "").strip() - - lead_html = f"

        {escape(summary)}

        \n" if summary else "" - - facts: list[str] = [] - if author: - facts.append(f"Autor: {escape(author)}") - if published_at: - facts.append(f"Veröffentlicht (Quelle): {escape(published_at)}") - if source_name: - facts.append(f"Quelle: {escape(source_name)}") - if license_name: - facts.append(f"Lizenz: {escape(license_name)}") - if terms_url: - facts.append(f"Lizenzhinweise: {escape(terms_url)}") - - facts_html = "" - if facts: - facts_html = _as_block_heading(3, "Artikeldetails") + "\n" + _as_block_list(facts) - - attribution_parts = [ - _as_block_heading(3, "Quelle"), - f'

        Originalartikel: {escape(source_url)}

        ', - ] - if canonical_url and canonical_url != source_url: - attribution_parts.append( - f'

        Canonical: {escape(canonical_url)}

        ' - ) - attribution_html = "\n".join(attribution_parts) - - content = f"{lead_html}{body_html}\n\n{facts_html}\n{attribution_html}".strip() - excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() - excerpt = excerpt_source[:220] if excerpt_source else None - return content, excerpt + content = body_html.strip() + return content, None def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index 4cafc55..20b0618 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -38,9 +38,9 @@ class TestWordpressPublish(unittest.TestCase): self.assertTrue(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertEqual(payload.get("featured_media"), 456) - self.assertIn("

        Quelle

        ", payload.get("content", "")) - self.assertIn("Originalartikel", payload.get("content", "")) - self.assertEqual(payload.get("excerpt"), "Inhalt") + self.assertIn("", payload.get("content", "")) + self.assertIn("

        Inhalt

        ", payload.get("content", "")) + self.assertNotIn("excerpt", payload) @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -79,6 +79,7 @@ class TestWordpressPublish(unittest.TestCase): self.assertNotIn("Firma GmbH", content) self.assertNotIn("Pressekontakt", content) self.assertIn("eigentliche Text", content) + self.assertNotIn("Artikeldetails", content) @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -114,6 +115,25 @@ class TestWordpressPublish(unittest.TestCase): payload = post_calls[0].kwargs.get("payload", {}) self.assertEqual(payload.get("tags"), [11, 12]) + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"} + article = { + "title": "Block Test", + "content_rewritten": "

        Überschrift

        Absatz 1

        • A
        • B
        ", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + publish_article_draft(article) + payload = mock_wp_request.call_args.kwargs["payload"] + content = payload.get("content", "") + self.assertIn("", content) + self.assertIn("", content) + self.assertNotIn("", content) + if __name__ == "__main__": unittest.main() From 6192f8e527da20874b81a095b1f59e76b9a919dc Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 09:40:15 +0000 Subject: [PATCH 24/54] feat(automation): autonomous pipeline with Telegram bot and N8N integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add full auto pipeline: RSS ingest → GPT relevance score → AI rewrite → WP draft - Add Telegram bot with inline buttons (rewrite/discard/override) and commands (/run, /rejected, /status) - Add smart publish scheduler: max 2 drafts/day, spread over week (09:00 & 14:00 CET) - Add N8N API endpoints (/api/n8n/pipeline, /api/n8n/ingest) with X-API-Key auth - Add GPT-based relevance scoring (0-100) for VanLife/Camping/Outdoor topics - Remove Ampel risk-level policy check from ingestion (all enabled feeds are used) - Add Telegram webhook endpoint and setup endpoint - Add delete_wp_post() for Telegram discard action - Add DB migrations for relevance_score and scheduled_publish_at columns - Update .env.example with all new configuration variables - Add docs/AUTOMATION.md with full setup and usage documentation Co-Authored-By: Claude Sonnet 4.6 --- backend/.env.example | 35 +++ backend/app/config.py | 14 ++ backend/app/db.py | 2 + backend/app/ingestion.py | 25 -- backend/app/main.py | 80 +++++++ backend/app/pipeline.py | 407 +++++++++++++++++++++++++++++++++ backend/app/rewrite.py | 41 ++++ backend/app/scheduler.py | 139 ++++++++++++ backend/app/telegram_bot.py | 438 ++++++++++++++++++++++++++++++++++++ backend/app/wordpress.py | 15 ++ docs/AUTOMATION.md | 190 
++++++++++++++++ 11 files changed, 1361 insertions(+), 25 deletions(-) create mode 100644 backend/app/pipeline.py create mode 100644 backend/app/scheduler.py create mode 100644 backend/app/telegram_bot.py create mode 100644 docs/AUTOMATION.md diff --git a/backend/.env.example b/backend/.env.example index 74e9c4b..c2dd235 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,3 +1,4 @@ +# ─── App ──────────────────────────────────────────────────────────────────── APP_ENV=development APP_NAME=rss-news-backend APP_SECRET_KEY=replace-with-a-long-random-secret @@ -8,3 +9,37 @@ APP_ADMIN_PASSWORD=change-me SESSION_COOKIE_NAME=rss_news_session SESSION_MAX_AGE_SECONDS=28800 + +# ─── WordPress ────────────────────────────────────────────────────────────── +WP_BASE_URL=https://your-site.tld +WP_USERNAME=your-wp-username +WP_PASSWORD=your-wp-app-password +# Status für neue Beiträge: draft | future | publish +WORDPRESS_DEFAULT_STATUS=draft + +# ─── OpenAI ───────────────────────────────────────────────────────────────── +OPENAI_API_KEY=sk-... +# gpt-4o-mini empfohlen (Kosten/Qualität) +OPENAI_MODEL=gpt-4o-mini + +# ─── Telegram Bot ──────────────────────────────────────────────────────────── +# Bot-Token von @BotFather +TELEGRAM_BOT_TOKEN=123456789:ABC... 
+# Chat-ID deines persönlichen Chats oder einer Gruppe +TELEGRAM_CHAT_ID=123456789 +# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen) +TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars + +# ─── N8N API-Key ───────────────────────────────────────────────────────────── +# Wird von N8N im Header X-API-Key mitgeschickt +N8N_API_KEY=replace-with-strong-random-key + +# ─── Pipeline-Einstellungen ────────────────────────────────────────────────── +# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100) +PIPELINE_RELEVANCE_AUTO=80 +# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden +PIPELINE_RELEVANCE_WARN=60 +# Maximale Drafts/Veröffentlichungen pro Tag +PIPELINE_MAX_DRAFTS_PER_DAY=2 +# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET) +PIPELINE_PUBLISH_HOURS=9,14 diff --git a/backend/app/config.py b/backend/app/config.py index 43629ba..d56ce11 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -33,6 +33,20 @@ class Settings(BaseSettings): openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY")) openai_model: str = "gpt-4o-mini" + # Telegram Bot + telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN")) + telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID")) + telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET")) + + # N8N API authentication + n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY")) + + # Pipeline behaviour + pipeline_relevance_auto: int = 80 # >= this: auto-process + pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject + pipeline_max_drafts_per_day: int = 2 + pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) + @lru_cache(maxsize=1) def get_settings() -> Settings: diff --git 
a/backend/app/db.py b/backend/app/db.py index d2ebfd5..1b394c3 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -160,6 +160,8 @@ def init_db() -> None: row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() } migration_columns = { + "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER", + "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT", "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT", "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT", "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT", diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 1ba6b6c..510fd10 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -11,7 +11,6 @@ from urllib.parse import unquote, urlparse import feedparser -from .policy import evaluate_source_policy from .repositories import ( ArticleUpsert, RunCreate, @@ -169,30 +168,6 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: continue feeds_processed += 1 - source_snapshot = { - "id": feed.get("source_id"), - "name": feed.get("source_name"), - "base_url": feed.get("source_base_url"), - "terms_url": feed.get("source_terms_url"), - "license_name": feed.get("source_license_name"), - "risk_level": feed.get("source_risk_level"), - "last_reviewed_at": feed.get("source_last_reviewed_at"), - "is_enabled": feed.get("source_is_enabled"), - } - policy_issues = evaluate_source_policy(source_snapshot) - if policy_issues: - feed_results.append( - { - "feed_id": int(feed["id"]), - "feed_url": feed["url"], - "status": "blocked", - "policy_issues": policy_issues, - "entries_seen": 0, - "upserts": 0, - } - ) - continue - parsed = None feed_error = None for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1): diff --git a/backend/app/main.py b/backend/app/main.py index b0bcf2a..51aab6b 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -15,10 +15,12 @@ 
from .auth import create_session_token, verify_credentials, verify_session_token from .config import get_settings from .db import init_db from .ingestion import run_ingestion +from .pipeline import run_auto_pipeline from .policy import evaluate_source_policy, is_source_allowed from .publisher import enqueue_publish, run_publisher from .relevance import article_age_days, article_relevance from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text +from .telegram_bot import handle_update, setup_webhook from .repositories import ( ArticleUpsert, FeedCreate, @@ -620,3 +622,81 @@ def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(requ }, "requested_by": username, } + + +# --------------------------------------------------------------------------- +# N8N Automation endpoint (API-Key auth, no session cookie required) +# --------------------------------------------------------------------------- + +def _require_api_key(request: Request) -> None: + api_key = request.headers.get("X-API-Key") or request.query_params.get("api_key") + expected = settings.n8n_api_key + if not expected: + raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert") + if api_key != expected: + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key") + + +@app.post("/api/n8n/pipeline") +def api_n8n_pipeline(request: Request) -> dict: + """Trigger the full auto pipeline. Called by N8N (2x/day or on demand).""" + _require_api_key(request) + try: + result = run_auto_pipeline(trigger="n8n") + return {"ok": True, "stats": result} + except Exception as exc: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc + + +@app.post("/api/n8n/ingest") +def api_n8n_ingest(request: Request) -> dict: + """Run only the ingestion step (no rewrite/publish). 
For N8N.""" + _require_api_key(request) + stats = run_ingestion() + return { + "ok": stats.status == "success", + "stats": { + "feeds_processed": stats.feeds_processed, + "entries_seen": stats.entries_seen, + "articles_upserted": stats.articles_upserted, + }, + } + + +# --------------------------------------------------------------------------- +# Telegram Webhook +# --------------------------------------------------------------------------- + +@app.post("/telegram/webhook") +async def telegram_webhook(request: Request) -> dict: + """Receive updates from Telegram Bot API.""" + # Verify secret token + secret = settings.telegram_webhook_secret + if secret: + incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "") + if incoming != secret: + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret") + + body = await request.body() + try: + update = json.loads(body.decode("utf-8")) + except Exception: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") + + try: + handle_update(update) + except Exception as exc: + import logging + logging.getLogger(__name__).error("Telegram update handler error: %s", exc) + + return {"ok": True} + + +@app.post("/api/telegram/setup-webhook") +def api_setup_telegram_webhook(request: Request) -> dict: + """Register the Telegram webhook URL. Call once after deployment.""" + username = require_auth(request) + base_url = str(request.base_url).rstrip("/") + webhook_url = f"{base_url}/telegram/webhook" + result = setup_webhook(webhook_url) + return {"ok": True, "webhook_url": webhook_url, "telegram_response": result, "requested_by": username} diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py new file mode 100644 index 0000000..f86ef35 --- /dev/null +++ b/backend/app/pipeline.py @@ -0,0 +1,407 @@ +"""Autonomous RSS-News pipeline. + +Full automated flow: +1. Run RSS ingestion +2. 
For each new article: + - Auto-select primary image + - Score relevance via GPT + - < warn threshold: reject (error status) → Telegram rejected summary + - warn..auto threshold: Telegram warning with override button + - >= auto threshold: rewrite → create WP draft → Telegram notification +3. Send pipeline summary to Telegram +""" +from __future__ import annotations + +import json +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +from .config import get_settings +from .ingestion import run_ingestion +from .publisher import enqueue_publish, run_publisher +from .repositories import ( + ArticleUpsert, + get_article_by_id, + list_articles, + set_article_image_decision, + update_article_status, + upsert_article as repo_upsert_article, +) +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance +from .scheduler import reserve_publish_slot +from .wordpress import publish_article_draft, selected_image_exists + +logger = logging.getLogger(__name__) + + +@dataclass +class PipelineStats: + ingested: int = 0 + processed: int = 0 + drafts_created: int = 0 + rejected: int = 0 + warnings: int = 0 + errors: int = 0 + rejected_articles: list[dict[str, Any]] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _auto_select_image(article: dict[str, Any]) -> bool: + """Auto-select the primary image from ingestion metadata if not already selected.""" + meta_json = article.get("meta_json") or "{}" + try: + meta = json.loads(meta_json) + except Exception: + return False + + # Already selected? 
+ image_review = meta.get("image_review") or {} + if isinstance(image_review, dict) and image_review.get("selected_url"): + return True + + # Try to get primary from ingestion extraction + extraction = meta.get("extraction") or {} + image_selection = extraction.get("image_selection") or {} + primary = image_selection.get("primary") + + if not primary: + # Fallback: use first URL from image_urls_json + image_urls_json = article.get("image_urls_json") or "[]" + try: + urls = json.loads(image_urls_json) + if urls: + primary = urls[0] + except Exception: + pass + + if primary: + set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline") + return True + return False + + +def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None: + """Persist relevance score and reason in article meta_json and relevance_score column.""" + article = get_article_by_id(article_id) + if not article: + return + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + meta = {} + meta["relevance"] = relevance + new_meta = json.dumps(meta, ensure_ascii=False) + from .db import get_conn + with get_conn() as conn: + conn.execute( + "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?", + (new_meta, relevance.get("score", 0), article_id), + ) + + +def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: + """Rewrite article and create WP draft. 
Returns (wp_post_id, wp_post_url).""" + article_id = int(article["id"]) + + # Rewrite + rewritten = rewrite_article_text(article) + tags: list[str] = [] + try: + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception: + pass + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + + # Save rewritten content + approved status + repo_upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title", ""), + source_url=article.get("source_url", ""), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + publish_attempts=int(article.get("publish_attempts", 0)), + publish_last_error=article.get("publish_last_error"), + published_to_wp_at=article.get("published_to_wp_at"), + word_count=len(rewritten.split()), + status="approved", + meta_json=merged_meta, + ) + ) + + # Reload after save to get updated meta_json + fresh = get_article_by_id(article_id) + if not fresh: + raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden") + + # Create WP draft + wp_post_id, wp_post_url = publish_article_draft(fresh) + + # Update WP info in DB + from .repositories import mark_article_publish_result + 
mark_article_publish_result( + article_id, + wp_post_id=wp_post_id, + wp_post_url=wp_post_url, + error=None, + increment_attempts=True, + set_published_status=False, + ) + + return wp_post_id, wp_post_url + + +# --------------------------------------------------------------------------- +# Public pipeline functions +# --------------------------------------------------------------------------- + +def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: + """Run the full automated pipeline and return stats dict.""" + from . import telegram_bot as tg + + settings = get_settings() + stats = PipelineStats() + + tg.notify_pipeline_started(trigger) + + # Step 1: Ingestion + try: + ingest_result = run_ingestion() + stats.ingested = ingest_result.articles_upserted + except Exception as exc: + tg.notify_error(f"Ingestion fehlgeschlagen: {exc}") + logger.error("Ingestion error: %s", exc) + stats.errors += 1 + + # Step 2: Process new articles + new_articles = list_articles(limit=100, status_filter="new") + + for article in new_articles: + article_id = int(article["id"]) + try: + _process_article(article, stats, settings) + except Exception as exc: + logger.error("Fehler bei Artikel #%d: %s", article_id, exc) + tg.notify_error(f"Fehler bei Artikel #{article_id} ({article.get('title','?')[:50]}): {exc}") + stats.errors += 1 + # Rate limiting between OpenAI calls + time.sleep(1) + + # Step 3: Send rejected summary if any + if stats.rejected_articles: + try: + tg.notify_rejected_summary(stats.rejected_articles) + except Exception as exc: + logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc) + + # Step 4: Summary + result = { + "ingested": stats.ingested, + "processed": stats.processed, + "drafts_created": stats.drafts_created, + "rejected": stats.rejected, + "warnings": stats.warnings, + "errors": stats.errors, + } + tg.notify_pipeline_done(result) + return result + + +def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> 
None: + """Process a single new article through the pipeline.""" + from . import telegram_bot as tg + + article_id = int(article["id"]) + + # Auto-select image + _auto_select_image(article) + + # Score relevance + try: + relevance = score_article_relevance(article) + except Exception as exc: + logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc) + relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []} + + score = relevance.get("score", 0) + reason = relevance.get("reason", "") + _store_relevance(article_id, relevance) + + stats.processed += 1 + + if score < settings.pipeline_relevance_warn: + # Reject + update_article_status( + article_id, + "error", + actor="pipeline", + note=f"Abgelehnt: Score {score}/100 — {reason}", + ) + stats.rejected += 1 + # Reload for summary (now has relevance in meta) + updated = get_article_by_id(article_id) + if updated: + stats.rejected_articles.append(updated) + + elif score < settings.pipeline_relevance_auto: + # Warning zone: inform user, don't auto-process + stats.warnings += 1 + try: + tg.notify_relevance_warning(article, score, reason) + except Exception as exc: + logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc) + + else: + # Auto-process: rewrite + WP draft + try: + # Reload article to get updated image_review + fresh = get_article_by_id(article_id) + if not fresh: + return + wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) + stats.drafts_created += 1 + + # Reserve publish slot + slot = reserve_publish_slot(article_id) + + # Reload for notification + final = get_article_by_id(article_id) + if final: + try: + tg.notify_new_draft(final, score=score, suggested_publish_at=slot) + except Exception as exc: + logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc) + + except Exception as exc: + logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) + update_article_status(article_id, "error", 
actor="pipeline", note=f"Draft-Fehler: {exc}") + raise + + +# --------------------------------------------------------------------------- +# Callback actions (called from telegram_bot._handle_callback) +# --------------------------------------------------------------------------- + +def rewrite_and_update_draft(article_id: int) -> None: + """Rewrite article and update the existing WP draft.""" + article = get_article_by_id(article_id) + if not article: + raise RuntimeError(f"Artikel #{article_id} nicht gefunden") + _auto_select_image(article) + fresh = get_article_by_id(article_id) + _do_rewrite_and_draft(fresh) + + +def discard_article(article_id: int) -> None: + """Discard a draft: delete WP post if exists, set article to error.""" + article = get_article_by_id(article_id) + if not article: + return + + wp_post_id = article.get("wp_post_id") + if wp_post_id: + try: + from .wordpress import delete_wp_post + delete_wp_post(int(wp_post_id)) + except Exception as exc: + logger.warning("WP Post #%d konnte nicht gelöscht werden: %s", wp_post_id, exc) + + update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen") + + +def override_rejected_article(article_id: int) -> None: + """Force-process a previously rejected article.""" + from . 
def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]:
    """Return articles rejected in the last ``days`` days.

    An article counts as rejected when its status is ``'error'`` and a
    relevance score is stored under ``$.relevance.score`` in ``meta_json``.
    The time window is evaluated by SQLite against ``updated_at``.

    Args:
        days: Size of the look-back window in days.

    Returns:
        At most 20 rows (id, title, meta_json, source_url, created_at) as
        dicts, most recently updated first.
    """
    from .db import get_conn, rows_to_dicts

    # The cutoff is computed inside SQL via date('now', '-N days'); the
    # previously computed Python-side cutoff string was never used.
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, meta_json, source_url, created_at
            FROM articles
            WHERE status = 'error'
              AND json_extract(meta_json, '$.relevance.score') IS NOT NULL
              AND date(updated_at) >= date('now', ?)
            ORDER BY updated_at DESC
            LIMIT 20
            """,
            (f"-{days} days",),
        ).fetchall()
    return rows_to_dicts(rows)
" + "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n" + "Antworte NUR mit einem JSON-Objekt:\n" + '{"score": <0-100>, "reason": "", "topics": ["", ""]}\n\n' + f"Titel: {title}\n\n" + f"Text (Auszug):\n{text[:2000]}" + ) + raw = _openai_chat( + "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.", + prompt, + temperature=0.1, + ) + try: + match = re.search(r"\{[\s\S]*\}", raw) + if match: + parsed = json.loads(match.group(0)) + score = max(0, min(100, int(parsed.get("score", 0)))) + return { + "score": score, + "reason": str(parsed.get("reason", "")), + "topics": [str(t) for t in (parsed.get("topics") or [])], + } + except Exception: + pass + return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []} + + def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str: meta: dict[str, Any] = {} if meta_json: diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py new file mode 100644 index 0000000..d4c6aaf --- /dev/null +++ b/backend/app/scheduler.py @@ -0,0 +1,139 @@ +"""Smart publishing scheduler. + +Calculates suggested publish slots for new WordPress drafts. 
def _next_free_hour(target_date: date) -> int | None:
    """Return the first preferred hour still free on ``target_date``.

    A day is considered full when either the number of articles already
    scheduled on it has reached ``pipeline_max_drafts_per_day`` or every
    preferred hour is taken. Articles with status ``'error'`` do not count.

    Args:
        target_date: Calendar day to inspect.

    Returns:
        The first unused preferred hour, or ``None`` if the day is full.
    """
    settings = get_settings()
    max_per_day = settings.pipeline_max_drafts_per_day
    hours = _preferred_hours()

    date_str = target_date.isoformat()
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT scheduled_publish_at FROM articles
            WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
              AND status NOT IN ('error')
            """,
            (date_str + "T00:00:00", date_str + "T23:59:59"),
        ).fetchall()

    # Enforce the configured daily cap. Without this check the limit was
    # only implied by the number of preferred hours, so configuring more
    # hours than the cap silently exceeded it.
    if max_per_day is not None and len(rows) >= int(max_per_day):
        return None

    used_hours: set[int] = set()
    for row in rows:
        ts = row["scheduled_publish_at"] or ""
        try:
            used_hours.add(datetime.fromisoformat(ts).hour)
        except (TypeError, ValueError):
            # Unparseable timestamp: it still counts toward the cap above,
            # but it cannot block a specific hour.
            continue

    for h in hours:
        if h not in used_hours:
            return h
    return None  # every preferred hour is taken
def _call(method: str, payload: dict[str, Any]) -> dict[str, Any]:
    """POST ``payload`` as JSON to a Telegram Bot API method.

    Args:
        method: Bot API method name, e.g. ``"sendMessage"``.
        payload: JSON-serialisable request body.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: If no bot token is configured, the HTTP request fails
            (including timeouts), or the response body is not valid JSON.
    """
    settings = get_settings()
    token = settings.telegram_bot_token
    if not token:
        raise RuntimeError("TELEGRAM_BOT_TOKEN nicht konfiguriert")

    url = _BASE.format(token=token, method=method)
    data = json.dumps(payload).encode("utf-8")
    req = Request(
        url=url,
        data=data,
        method="POST",
        headers={"Content-Type": "application/json", "Accept": "application/json"},
    )
    try:
        with urlopen(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
        return json.loads(raw)
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError as well as a timeout raised while
        # reading the body; ValueError covers json.JSONDecodeError. All of
        # them are logged and surfaced uniformly as RuntimeError.
        logger.error("Telegram API Fehler (%s): %s", method, exc)
        raise RuntimeError(f"Telegram API Fehler: {exc}") from exc
image URL no longer valid) + return send_message(caption, reply_markup=reply_markup, parse_mode=parse_mode) + + +def answer_callback_query(callback_query_id: str, text: str = "") -> None: + try: + _call("answerCallbackQuery", {"callback_query_id": callback_query_id, "text": text}) + except Exception as exc: + logger.warning("answerCallbackQuery fehlgeschlagen: %s", exc) + + +def edit_message_reply_markup(chat_id: str, message_id: int, reply_markup: dict | None = None) -> None: + payload: dict[str, Any] = {"chat_id": chat_id, "message_id": message_id} + if reply_markup: + payload["reply_markup"] = reply_markup + else: + payload["reply_markup"] = {"inline_keyboard": []} + try: + _call("editMessageReplyMarkup", payload) + except Exception as exc: + logger.warning("editMessageReplyMarkup fehlgeschlagen: %s", exc) + + +def setup_webhook(webhook_url: str) -> dict: + settings = get_settings() + payload: dict[str, Any] = {"url": webhook_url, "allowed_updates": ["message", "callback_query"]} + if settings.telegram_webhook_secret: + payload["secret_token"] = settings.telegram_webhook_secret + return _call("setWebhook", payload) + + +def delete_webhook() -> dict: + return _call("deleteWebhook", {}) + + +# --------------------------------------------------------------------------- +# Notification helpers +# --------------------------------------------------------------------------- + +def _format_tags(meta_json: str | None) -> str: + if not meta_json: + return "" + try: + meta = json.loads(meta_json) + tags = meta.get("generated_tags") or [] + if tags: + return " ".join(f"#{t.replace(' ', '_')}" for t in tags[:6]) + except Exception: + pass + return "" + + +def _score_emoji(score: int) -> str: + if score >= 85: + return "🟢" + if score >= 70: + return "🟡" + return "🔴" + + +def notify_new_draft( + article: dict[str, Any], + score: int, + suggested_publish_at: str | None = None, +) -> None: + """Send Telegram notification for a newly created WP draft.""" + title = 
def notify_relevance_warning(article: dict[str, Any], score: int, reason: str) -> None:
    """Send a Telegram warning for a borderline article.

    The message carries two inline buttons: one to process the article
    anyway (``override:<id>``) and one to reject it (``reject:<id>``).

    Args:
        article: Article row; ``title``, ``id`` and ``source_url`` are read.
        score: Relevance score shown in the message.
        reason: Explanation for the score, shown in the message.
    """
    from html import escape

    title = (article.get("title") or "Ohne Titel").strip()
    art_id = article.get("id")
    source_url = (article.get("source_url") or "").strip()

    # send_message defaults to parse_mode="HTML", so feed-supplied and
    # model-generated text must be escaped; a stray "<" or "&" would
    # otherwise make Telegram reject the whole message.
    lines = [
        "⚠️ Artikel mit niedrigem Relevanz-Score",
        f"📰 {escape(title, quote=False)}",
        f"{_score_emoji(score)} Score: {score}/100",
        f"💬 {escape(str(reason or ''), quote=False)}",
    ]
    if source_url:
        # Actually link to the source; previously the URL was read but the
        # line contained only the plain label.
        lines.append(f'🔗 <a href="{escape(source_url, quote=True)}">Originalartikel</a>')
    text = "\n".join(lines)

    keyboard = _inline_keyboard([
        [
            {"text": "➕ Trotzdem verarbeiten", "callback_data": f"override:{art_id}"},
            {"text": "❌ Ablehnen", "callback_data": f"reject:{art_id}"},
        ]
    ])
    send_message(text, reply_markup=keyboard)
def handle_update(update: dict[str, Any]) -> None:
    """Process an incoming Telegram update.

    Dispatches ``callback_query`` updates to ``_handle_callback`` and
    ``message`` updates to ``_handle_message``; other update types are
    ignored. When a chat id is configured, updates from any other chat are
    dropped, because anyone can message the bot and commands such as
    ``/run`` start the whole pipeline.

    Args:
        update: Decoded Telegram ``Update`` object.
    """
    if "callback_query" in update:
        payload = update["callback_query"]
        origin = (payload or {}).get("message") or {}
        handler = _handle_callback
    elif "message" in update:
        payload = update["message"]
        origin = payload or {}
        handler = _handle_message
    else:
        return

    # NOTE(review): the webhook endpoint may already verify the secret token,
    # but that only proves the request came from Telegram, not who sent it.
    allowed_chat = str(get_settings().telegram_chat_id or "").strip()
    sender_chat = str((origin.get("chat") or {}).get("id", "")).strip()
    if allowed_chat and sender_chat != allowed_chat:
        logger.warning(
            "Telegram-Update aus nicht autorisiertem Chat %s ignoriert",
            sender_chat or "?",
        )
        return

    handler(payload)
def _handle_callback(callback_query: dict[str, Any]) -> None:
    """Handle an inline-button press from a pipeline notification.

    ``callback_data`` has the form ``<action>:<article_id>`` with actions
    ``rewrite``, ``discard``, ``override`` and ``reject``. The callback is
    answered first and the buttons are removed from the message before the
    action runs. Failures of the action itself are reported through
    ``notify_error`` instead of propagating to the caller.

    Args:
        callback_query: Decoded Telegram ``CallbackQuery`` object.
    """
    from . import pipeline as _pipeline
    from .repositories import get_article_by_id, update_article_status

    query_id = callback_query.get("id", "")
    data = (callback_query.get("data") or "").strip()
    # "message" may be absent or explicitly null; treat both as empty rather
    # than failing on attribute access.
    origin = callback_query.get("message") or {}
    chat_id = str((origin.get("chat") or {}).get("id", ""))
    message_id = int(origin.get("message_id") or 0)

    if ":" not in data:
        answer_callback_query(query_id, "Ungültige Aktion")
        return

    action, _, raw_id = data.partition(":")
    try:
        article_id = int(raw_id)
    except ValueError:
        answer_callback_query(query_id, "Ungültige Artikel-ID")
        return

    article = get_article_by_id(article_id)
    if not article:
        answer_callback_query(query_id, "Artikel nicht gefunden")
        return

    if action == "rewrite":
        answer_callback_query(query_id, "✏️ Artikel wird neu geschrieben …")
        edit_message_reply_markup(chat_id, message_id)
        try:
            _pipeline.rewrite_and_update_draft(article_id)
            updated = get_article_by_id(article_id)
            if updated:
                from .scheduler import suggest_publish_slot
                slot = suggest_publish_slot()
                notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot)
        except Exception as exc:
            notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}")

    elif action == "discard":
        answer_callback_query(query_id, "❌ Artikel verworfen")
        edit_message_reply_markup(chat_id, message_id)
        try:
            _pipeline.discard_article(article_id)
        except Exception as exc:
            notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}")

    elif action == "override":
        answer_callback_query(query_id, "➕ Artikel wird verarbeitet …")
        edit_message_reply_markup(chat_id, message_id)
        try:
            _pipeline.override_rejected_article(article_id)
        except Exception as exc:
            notify_error(f"Override #{article_id} fehlgeschlagen: {exc}")

    elif action == "reject":
        answer_callback_query(query_id, "🚫 Abgelehnt")
        edit_message_reply_markup(chat_id, message_id)
        # Guarded like the other actions so a failed status change is
        # reported to the chat instead of escaping to the webhook caller.
        try:
            update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram")
        except Exception as exc:
            notify_error(f"Ablehnen #{article_id} fehlgeschlagen: {exc}")

    else:
        answer_callback_query(query_id, "Unbekannte Aktion")
Umgebungsvariablen setzen + +Kopiere `backend/.env.example` nach `backend/.env` und fülle alle Felder aus: + +```bash +cp backend/.env.example backend/.env +nano backend/.env +``` + +Wichtige Variablen: + +| Variable | Beschreibung | +|----------|-------------| +| `TELEGRAM_BOT_TOKEN` | Bot-Token von @BotFather | +| `TELEGRAM_CHAT_ID` | Deine persönliche Chat-ID | +| `TELEGRAM_WEBHOOK_SECRET` | Zufälliger String (≥ 20 Zeichen) | +| `N8N_API_KEY` | Starker zufälliger API-Key | +| `OPENAI_API_KEY` | OpenAI API-Key | +| `WP_BASE_URL` | WordPress-URL | +| `WP_USERNAME` | WordPress-Benutzername | +| `WP_PASSWORD` | WordPress App-Passwort | + +### 2. Telegram-Webhook registrieren + +Nach dem Deployment einmalig aufrufen: + +```bash +curl -X POST https://news.vanityontour.de/api/telegram/setup-webhook \ + -H "Cookie: rss_news_session=" +``` + +Oder über die Admin-UI: Settings → Telegram Webhook einrichten. + +### 3. N8N Workflow einrichten + +In N8N einen neuen Workflow erstellen: + +**Trigger:** Cron +- Zeitplan 1: `0 8 * * *` (täglich 08:00) +- Zeitplan 2: `0 16 * * *` (täglich 16:00) + +**Aktion:** HTTP Request +- Method: `POST` +- URL: `https://news.vanityontour.de/api/n8n/pipeline` +- Header: `X-API-Key: ` + +**Fehlerbehandlung:** Bei HTTP-Fehler → E-Mail/Telegram-Alert + +--- + +## Telegram-Befehle + +| Befehl | Funktion | +|--------|----------| +| `/run` | Pipeline manuell starten | +| `/rejected` | Abgelehnte Artikel der letzten 3 Tage anzeigen | +| `/status` | Aktuellen Pipeline-Status | +| `/help` | Alle Befehle anzeigen | + +--- + +## Telegram-Benachrichtigungen + +### Neuer Draft erstellt +Wenn ein Artikel erfolgreich verarbeitet wurde: + +``` +✅ Neuer Draft erstellt +📰 [Artikel-Titel] +🟢 Relevanz-Score: 87/100 +📅 Vorgeschlagene Veröffentlichung: Mo, 24.03.2026 um 09:00 Uhr +🏷 #VanLife #Camping #Wohnmobil +🔗 Draft in WordPress öffnen + + [✏️ Neu schreiben] [❌ Verwerfen] +``` + +### Relevanz-Warnung (Score 60–79) +``` +⚠️ Artikel mit niedrigem Relevanz-Score +📰 
[Artikel-Titel] +🟡 Score: 72/100 +💬 Artikel behandelt hauptsächlich... +🔗 Originalartikel + + [➕ Trotzdem verarbeiten] [❌ Ablehnen] +``` + +### Abgelehnte Artikel (Ende jedes Runs) +Liste der abgelehnten Artikel (maximal 10 Einträge) mit Override-Buttons für die ersten 5. + +--- + +## Relevanz-Score + +Der GPT-basierte Score bewertet die Themenrelevanz für den VanLife/Camping-Blog: + +| Score | Aktion | +|-------|--------| +| 80–100 | Automatisch verarbeiten | +| 60–79 | Telegram-Warnung, manueller Override | +| 0–59 | Automatisch abgelehnt | + +Themen, die hoch bewertet werden: Campingplätze, Stellplätze, Wohnmobile, Van-Ausbau, +Outdoor-Equipment, Wandern, Naturreisen, Roadtrips, Camping-Tipps. + +Schwellwerte sind in `.env` konfigurierbar: +``` +PIPELINE_RELEVANCE_AUTO=80 +PIPELINE_RELEVANCE_WARN=60 +``` + +--- + +## Veröffentlichungsplan + +- Maximal **2 Beiträge pro Tag** +- Bevorzugte Zeiten: **09:00 und 14:00 Uhr** (CET) +- Gleichmäßig über die Woche verteilt +- Der Vorschlag erscheint in der Telegram-Nachricht +- Manuell in WordPress setzen oder über WP Scheduling-Plugin automatisieren + +Einstellbar via: +``` +PIPELINE_MAX_DRAFTS_PER_DAY=2 +PIPELINE_PUBLISH_HOURS=9,14 +``` + +--- + +## API-Endpunkte (N8N / extern) + +Alle externen Endpunkte benötigen den Header `X-API-Key: <N8N_API_KEY>`. + +| Methode | Endpunkt | Funktion | +|---------|----------|----------| +| `POST` | `/api/n8n/pipeline` | Komplette Pipeline starten | +| `POST` | `/api/n8n/ingest` | Nur RSS-Import (ohne Rewrite) | + +--- + +## Deployment (Hetzner via GitHub) + +Das Deployment läuft automatisch über GitHub Actions beim Push auf `main`: + +1. GitHub Action führt Tests aus +2. Bei Erfolg: SSH-Deploy auf Hetzner +3. `pip install -r requirements.txt` +4. 
Systemd-Dienst `rss-app` neu starten + +Workflow-Dateien: `.github/workflows/test.yml` und `.github/workflows/deploy.yml` + +--- + +## Troubleshooting + +**Pipeline läuft, aber keine Telegram-Nachrichten:** +- `TELEGRAM_BOT_TOKEN` und `TELEGRAM_CHAT_ID` prüfen +- Webhook-Status prüfen: `GET https://api.telegram.org/bot/getWebhookInfo` + +**N8N bekommt 401:** +- `N8N_API_KEY` in `.env` und N8N-Workflow-Header müssen übereinstimmen + +**Alle Artikel werden abgelehnt:** +- `PIPELINE_RELEVANCE_WARN` temporär auf 40 senken zum Testen +- Über `/rejected` + Override-Button manuell testen + +**Artikel werden doppelt importiert:** +- Deduplication läuft über `source_url` (eindeutig). Bereits verarbeitete Artikel werden nie erneut als Draft angelegt. From 0a9c0b10d6a18244cfbf8e129aecd739fb33d12f Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 09:41:34 +0000 Subject: [PATCH 25/54] test(ingestion): update test for removed Ampel risk-level check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ampel system removed – all enabled feeds are now processed regardless of risk_level. Updated test to verify feeds with any risk_level are processed instead of blocked. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/tests/test_ingestion.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py index a36b62e..82bd2ca 100644 --- a/backend/tests/test_ingestion.py +++ b/backend/tests/test_ingestion.py @@ -98,11 +98,11 @@ class TestIngestion(unittest.TestCase): @patch("backend.app.ingestion.extract_article") @patch("backend.app.ingestion.feedparser.parse") - def test_ingestion_blocks_non_green_source(self, mock_parse, mock_extract_article) -> None: - # Re-create source/feed with yellow risk to verify enforcement + def test_ingestion_processes_any_enabled_source(self, mock_parse, mock_extract_article) -> None: + # Ampel/risk-level system removed – all enabled feeds are processed regardless of risk_level source_id = create_source( SourceCreate( - name="Blocked Source", + name="Any Risk Source", base_url="https://example.net", terms_url="https://example.net/terms", license_name="custom", @@ -112,20 +112,25 @@ class TestIngestion(unittest.TestCase): last_reviewed_at="2026-02-18T00:00:00Z", ) ) - blocked_feed_id = create_feed( + feed_id = create_feed( FeedCreate( - name="Blocked Feed", + name="Any Risk Feed", url="https://example.net/feed.xml", source_id=source_id, is_enabled=True, ) ) - stats = run_ingestion(feed_id=blocked_feed_id) + mock_parse.return_value = type("FP", (), {"entries": [], "etag": None, "modified": None})() + mock_extract_article.return_value = type("E", (), { + "title": None, "author": None, "summary": None, "content_text": None, + "canonical_url": None, "images": [], "press_contact": None, + })() + + stats = run_ingestion(feed_id=feed_id) self.assertEqual(stats.status, "success") - self.assertEqual(stats.articles_upserted, 0) - mock_parse.assert_not_called() - mock_extract_article.assert_not_called() + # Feed was processed (feedparser was called), even with yellow risk_level + mock_parse.assert_called_once() 
@patch("backend.app.ingestion.extract_article") @patch("backend.app.ingestion.feedparser.parse") From d9ab5994660dd750187ceae1355ca0f56490537e Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 09:43:55 +0000 Subject: [PATCH 26/54] fix(deploy): correct service name and app path for Hetzner Service is rss-news-api (not rss-app), app lives at /opt/rss-news. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/deploy.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 4ac1c42..af3394f 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -21,12 +21,13 @@ jobs: port: 22 envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD script: | - cd rss-news + cd /opt/rss-news git pull origin main source .venv/bin/activate pip install -r requirements.txt pip install -r backend/requirements.txt || true - sudo systemctl restart rss-app + sudo systemctl restart rss-news-api + sleep 3 BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh env: APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }} From 1020526e76d38171e6607da9446093d58d6c0a84 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 10:03:13 +0000 Subject: [PATCH 27/54] fix(pipeline): run N8N pipeline endpoint async to avoid HTTP timeout Pipeline runs in background via asyncio. Endpoint returns immediately, results arrive via Telegram notifications. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/main.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/backend/app/main.py b/backend/app/main.py index 51aab6b..276ffa9 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -638,14 +638,22 @@ def _require_api_key(request: Request) -> None: @app.post("/api/n8n/pipeline") -def api_n8n_pipeline(request: Request) -> dict: - """Trigger the full auto pipeline. Called by N8N (2x/day or on demand).""" +async def api_n8n_pipeline(request: Request) -> dict: + """Trigger the full auto pipeline in background. Returns immediately. + Called by N8N (2x/day or on demand). Results arrive via Telegram.""" _require_api_key(request) - try: - result = run_auto_pipeline(trigger="n8n") - return {"ok": True, "stats": result} - except Exception as exc: - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc + import asyncio + import logging + + async def _run(): + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n")) + except Exception as exc: + logging.getLogger(__name__).error("Background pipeline error: %s", exc) + + asyncio.create_task(_run()) + return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"} @app.post("/api/n8n/ingest") From e9c472b722655b6772428c8cda5a0c8be71b44d3 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 11:08:32 +0000 Subject: [PATCH 28/54] fix(telegram): async webhook handler + deduplicate callback responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Webhook returns 200 immediately, processing runs in background task → Telegram no longer retries, eliminates duplicate callbacks and 400 errors - Consolidate answer_callback_query call to top of handler (before heavy work) - Add logger.info/error for callback actions to aid debugging Co-Authored-By: Claude Sonnet 4.6 --- 
backend/app/main.py | 21 +++++++++++++++------ backend/app/telegram_bot.py | 25 ++++++++++++++++--------- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/backend/app/main.py b/backend/app/main.py index 276ffa9..e954de6 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -677,7 +677,14 @@ def api_n8n_ingest(request: Request) -> dict: @app.post("/telegram/webhook") async def telegram_webhook(request: Request) -> dict: - """Receive updates from Telegram Bot API.""" + """Receive updates from Telegram Bot API. + + Returns 200 immediately so Telegram never retries the same update. + Actual processing runs in a background task. + """ + import asyncio + import logging + # Verify secret token secret = settings.telegram_webhook_secret if secret: @@ -691,12 +698,14 @@ async def telegram_webhook(request: Request) -> dict: except Exception: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") - try: - handle_update(update) - except Exception as exc: - import logging - logging.getLogger(__name__).error("Telegram update handler error: %s", exc) + async def _process(): + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor(None, lambda: handle_update(update)) + except Exception as exc: + logging.getLogger(__name__).error("Telegram update handler error: %s", exc) + asyncio.create_task(_process()) return {"ok": True} diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index 6d88105..28e0693 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -400,9 +400,19 @@ def _handle_callback(callback_query: dict[str, Any]) -> None: answer_callback_query(query_id, "Artikel nicht gefunden") return + # Answer Telegram immediately so the spinning indicator stops + action_labels = { + "rewrite": "✏️ Artikel wird neu geschrieben …", + "discard": "❌ Artikel verworfen", + "override": "➕ Artikel wird verarbeitet …", + "reject": "🚫 Abgelehnt", + } + answer_callback_query(query_id, 
action_labels.get(action, "")) + edit_message_reply_markup(chat_id, message_id) + + logger.info("Callback: action=%s article_id=%s", action, article_id) + if action == "rewrite": - answer_callback_query(query_id, "✏️ Artikel wird neu geschrieben …") - edit_message_reply_markup(chat_id, message_id) try: _pipeline.rewrite_and_update_draft(article_id) updated = get_article_by_id(article_id) @@ -411,28 +421,25 @@ def _handle_callback(callback_query: dict[str, Any]) -> None: slot = suggest_publish_slot() notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot) except Exception as exc: + logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc) notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}") elif action == "discard": - answer_callback_query(query_id, "❌ Artikel verworfen") - edit_message_reply_markup(chat_id, message_id) try: _pipeline.discard_article(article_id) except Exception as exc: + logger.error("Discard #%d fehlgeschlagen: %s", article_id, exc) notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}") elif action == "override": - answer_callback_query(query_id, "➕ Artikel wird verarbeitet …") - edit_message_reply_markup(chat_id, message_id) try: _pipeline.override_rejected_article(article_id) except Exception as exc: + logger.error("Override #%d fehlgeschlagen: %s", article_id, exc) notify_error(f"Override #{article_id} fehlgeschlagen: {exc}") elif action == "reject": - answer_callback_query(query_id, "🚫 Abgelehnt") - edit_message_reply_markup(chat_id, message_id) update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram") else: - answer_callback_query(query_id, "Unbekannte Aktion") + logger.warning("Unbekannte Callback-Aktion: %s", action) From 970f509ad48aed05ac849350c17286de861b9e3e Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sat, 21 Mar 2026 11:15:39 +0000 Subject: [PATCH 29/54] feat(wordpress): store suggested publish date directly in WP draft Reserve the 
publish slot before creating the WP draft so the scheduled_publish_at timestamp is available when building the post payload. WordPress receives the `date` field (e.g. 2026-03-24T09:00:00) which sets the scheduled publish time on the draft. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/pipeline.py | 13 ++++++++----- backend/app/wordpress.py | 3 +++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index f86ef35..e85f944 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -276,16 +276,16 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An else: # Auto-process: rewrite + WP draft try: - # Reload article to get updated image_review + # Reserve publish slot FIRST so it's available when WP draft is created + slot = reserve_publish_slot(article_id) + + # Reload article to get updated image_review + scheduled_publish_at fresh = get_article_by_id(article_id) if not fresh: return wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) stats.drafts_created += 1 - # Reserve publish slot - slot = reserve_publish_slot(article_id) - # Reload for notification final = get_article_by_id(article_id) if final: @@ -357,8 +357,11 @@ def override_rejected_article(article_id: int) -> None: except Exception: score = 0 - wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) + # Reserve publish slot FIRST so it's in the DB when WP draft is created slot = reserve_publish_slot(article_id) + fresh = get_article_by_id(article_id) + + wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) final = get_article_by_id(article_id) if final: diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 704d428..b031e9d 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -335,6 +335,9 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: payload["excerpt"] = excerpt if featured_media_id: payload["featured_media"] = featured_media_id 
+ scheduled_at = article.get("scheduled_publish_at") + if scheduled_at: + payload["date"] = scheduled_at # e.g. "2026-03-24T09:00:00" wp_post_id = article.get("wp_post_id") tag_ids = _resolve_wp_tag_ids( From a64bf31ff63df35232f04ba637ade8e9db3a45ee Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Thu, 26 Mar 2026 06:34:49 +0000 Subject: [PATCH 30/54] fix(telegram): restore webhook to RSS-News backend and forward app-release commands The N8N App Release Telegram Trigger had overwritten the webhook registration, pointing it to N8N instead of the RSS-News backend. This caused all callback_query events (inline buttons) to be lost, breaking the override/rewrite/discard buttons. Changes: - Re-register webhook to https://news.vanityontour.de/telegram/webhook with both message and callback_query in allowed_updates - Add _forward_to_n8n_app_release() to proxy unknown bot commands (e.g. /release) to the N8N App Release webhook, keeping that workflow functional without needing its own Telegram Trigger Co-Authored-By: Claude Sonnet 4.6 --- backend/app/telegram_bot.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index 28e0693..bebda41 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -13,6 +13,7 @@ from .config import get_settings logger = logging.getLogger(__name__) _BASE = "https://api.telegram.org/bot{token}/{method}" +_N8N_APP_RELEASE_WEBHOOK = "https://n8n.vanityontour.de/webhook/tg-app-release-bot-v1/webhook" # --------------------------------------------------------------------------- @@ -121,6 +122,22 @@ def delete_webhook() -> dict: return _call("deleteWebhook", {}) +def _forward_to_n8n_app_release(update: dict[str, Any]) -> None: + """Forward a Telegram update to the N8N App Release webhook.""" + try: + data = json.dumps(update).encode("utf-8") + req = Request( + url=_N8N_APP_RELEASE_WEBHOOK, + data=data, + method="POST", + headers={"Content-Type": 
"application/json"}, + ) + with urlopen(req, timeout=5) as _: + pass + except Exception as exc: + logger.debug("N8N App-Release-Forward fehlgeschlagen: %s", exc) + + # --------------------------------------------------------------------------- # Notification helpers # --------------------------------------------------------------------------- @@ -374,6 +391,10 @@ def _handle_message(message: dict[str, Any]) -> None: "/help — Diese Hilfe" ) + else: + # Unbekannter Befehl → an N8N App-Release-Workflow weiterleiten + _forward_to_n8n_app_release({"message": message}) + def _handle_callback(callback_query: dict[str, Any]) -> None: from . import pipeline as _pipeline From 013af2ab62b9a6297fa98f7c43d7d0c2263cc3e9 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Thu, 26 Mar 2026 07:22:47 +0000 Subject: [PATCH 31/54] fix(pipeline): set warning-zone articles to review status to prevent re-warnings Articles scoring between warn and auto threshold stayed in "new" status, causing repeated warning notifications on every /run call. Now they are set to "review" status after the first warning is sent. The override callback already resets status to "new" before processing, so the existing flow works correctly. Also include "review" articles in /rejected command output so they can be acted on. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/pipeline.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index e85f944..6753491 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -266,7 +266,13 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An stats.rejected_articles.append(updated) elif score < settings.pipeline_relevance_auto: - # Warning zone: inform user, don't auto-process + # Warning zone: set status to "review" so repeated /run calls don't re-warn + update_article_status( + article_id, + "review", + actor="pipeline", + note=f"Niedrige Relevanz: Score {score}/100 — {reason}", + ) stats.warnings += 1 try: tg.notify_relevance_warning(article, score, reason) @@ -382,7 +388,7 @@ def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]: """ SELECT id, title, meta_json, source_url, created_at FROM articles - WHERE status = 'error' + WHERE status IN ('error', 'review') AND json_extract(meta_json, '$.relevance.score') IS NOT NULL AND date(updated_at) >= date('now', ?) ORDER BY updated_at DESC From 12932bca9078aa9244d3e75ed3adece7537c56e0 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Thu, 26 Mar 2026 07:36:09 +0000 Subject: [PATCH 32/54] fix(rewrite): attribute claims to source instead of using first-person 'wir' Rewrites must not use 'wir haben erforscht/berechnet' since the content comes from a third-party source. The prompt now passes the source name and instructs GPT to attribute all claims to the original publisher (e.g. 'laut PiNCAMP', 'die Auswertung zeigt'). 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/rewrite.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py index 6c1d37b..05937e5 100644 --- a/backend/app/rewrite.py +++ b/backend/app/rewrite.py @@ -94,12 +94,16 @@ def rewrite_article_text(article: dict[str, Any]) -> str: raise RuntimeError("Kein Quelltext für Rewrite verfügbar") title = (article.get("title") or "").strip() + source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip() prompt = ( "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " "ohne Pressekontakt, ohne Quellenblock. " "Nutze klare Absätze und Zwischenüberschriften in HTML (

        ,

        ,

        • falls passend). " - "Inhaltlich korrekt bleiben, nichts erfinden.\n\n" + "Inhaltlich korrekt bleiben, nichts erfinden. " + f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. " + "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, " + f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n" f"Titel: {title}\n\n" f"Originaltext:\n{source_text}" ) From 1963e32ab48cf9ea2d9535869b3b433fd32e965f Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Thu, 26 Mar 2026 07:45:55 +0000 Subject: [PATCH 33/54] fix(rewrite): make image upload non-fatal and add rewrite tracing logs - wordpress.py: catch image download/upload failures and skip image instead of aborting the entire WP draft update - pipeline.py: add INFO logs at each step of _do_rewrite_and_draft to trace OpenAI call, tag generation, and WP API call - telegram_bot.py: add INFO logs around rewrite execution + exc_info on error for full traceback in logs - repositories.py: include scheduled_publish_at in get_article_by_id Co-Authored-By: Claude Sonnet 4.6 --- backend/app/pipeline.py | 4 ++++ backend/app/repositories.py | 3 ++- backend/app/telegram_bot.py | 4 +++- backend/app/wordpress.py | 20 +++++++++++++------- docs/wiki/Operations-Runbook.md | 20 ++++++++++++++++++++ 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 6753491..ef0597d 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -110,7 +110,9 @@ def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: article_id = int(article["id"]) # Rewrite + logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id) rewritten = rewrite_article_text(article) + logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split())) tags: list[str] = [] try: tags = generate_article_tags(article, 
rewritten_text=rewritten) @@ -157,7 +159,9 @@ def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden") # Create WP draft + logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s)", article_id, fresh.get("wp_post_id")) wp_post_id, wp_post_url = publish_article_draft(fresh) + logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id) # Update WP info in DB from .repositories import mark_article_publish_result diff --git a/backend/app/repositories.py b/backend/app/repositories.py index 0ee5380..9556ed3 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -321,7 +321,8 @@ def get_article_by_id(article_id: int) -> dict[str, Any] | None: a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at, - a.word_count, a.status, a.meta_json, a.created_at, a.updated_at + a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, + a.scheduled_publish_at FROM articles a WHERE a.id = ? 
""", diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index bebda41..46ddd28 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -435,14 +435,16 @@ def _handle_callback(callback_query: dict[str, Any]) -> None: if action == "rewrite": try: + logger.info("Rewrite #%d: starte rewrite_and_update_draft", article_id) _pipeline.rewrite_and_update_draft(article_id) + logger.info("Rewrite #%d: abgeschlossen, sende Benachrichtigung", article_id) updated = get_article_by_id(article_id) if updated: from .scheduler import suggest_publish_slot slot = suggest_publish_slot() notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot) except Exception as exc: - logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc) + logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc, exc_info=True) notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}") elif action == "discard": diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index b031e9d..a4f7f3a 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -318,13 +318,19 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) if selected_image_url: - featured_media_id = _upload_featured_media( - base_url=settings.wordpress_base_url, - auth_header=auth, - image_url=selected_image_url, - article_title=title, - source_url=source_url, - ) + try: + featured_media_id = _upload_featured_media( + base_url=settings.wordpress_base_url, + auth_header=auth, + image_url=selected_image_url, + article_title=title, + source_url=source_url, + ) + except Exception as img_exc: + import logging + logging.getLogger(__name__).warning( + "Bild-Upload fehlgeschlagen (wird übersprungen): %s — %s", selected_image_url, img_exc + ) payload = { "title": title, diff --git a/docs/wiki/Operations-Runbook.md 
b/docs/wiki/Operations-Runbook.md index 32bf5c4..e6c0f88 100644 --- a/docs/wiki/Operations-Runbook.md +++ b/docs/wiki/Operations-Runbook.md @@ -18,6 +18,26 @@ 3. Payload-Validation/Tag-Fehler? 4. Artikel in `pending` statt `failed` markieren, wenn unklar +## Incident: Telegram-Buttons reagieren nicht / Befehle ignoriert + +**Ursache:** N8N "App Release - Telegram Bot"-Workflow hat den Webhook überschrieben. + +**Prüfen:** +```bash +curl -s "https://api.telegram.org/bot<TELEGRAM_BOT_TOKEN>/getWebhookInfo" | python3 -m json.tool +``` +→ `url` muss auf `https://news.vanityontour.de/telegram/webhook` zeigen +→ `allowed_updates` muss `["message", "callback_query"]` enthalten + +**Webhook zurücksetzen:** +```bash +curl -s -X POST "https://api.telegram.org/bot<TELEGRAM_BOT_TOKEN>/setWebhook" \ + -H "Content-Type: application/json" \ + -d '{"url":"https://news.vanityontour.de/telegram/webhook","allowed_updates":["message","callback_query"],"secret_token":"<TELEGRAM_WEBHOOK_SECRET>"}' +``` + +Vollständige Dokumentation: `projects/webhook/telegram-webhook-reset.md` + ## Backups - SQLite-Dump taeglich - Konfiguration und `.env` sicher sichern From aaac5def27a65dd1e56854b8d058f20608569f01 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 07:08:48 +0000 Subject: [PATCH 34/54] feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit source_extraction.py: - New _extract_image_metadata(): extracts figcaption text + copyright/credit per image URL using 3 strategies (figure+figcaption, data-* attributes, adjacent credit spans) - ExtractedArticle gets new image_metadata field - extracted_article_to_meta() includes image_metadata in stored JSON pipeline.py: - After auto image selection, check if selected_url is set - Articles without usable image → status "no_image" (excluded with 
Telegram notice) - PipelineStats and summary report include no_image counter db.py: - Add "no_image" to articles status CHECK constraint - Migration: recreates articles table with updated constraint on existing DBs workflow.py / main.py: - Map no_image as own UI status with rewrite/close transitions wordpress.py: - _upload_featured_media() accepts image_caption param, sends to WP media - _get_image_meta_for_url() / _build_image_caption() helpers - _build_attribution_block(): separator + attribution paragraph at article end (original link, author, Bildnachweis/credit) - _build_post_content() appends attribution block telegram_bot.py: - notify_pipeline_done() shows 🖼️ no-image count Co-Authored-By: Claude Sonnet 4.6 --- backend/app/db.py | 85 +++++++++++++- backend/app/main.py | 4 +- backend/app/pipeline.py | 29 +++++ backend/app/source_extraction.py | 187 ++++++++++++++++++++++++++++++- backend/app/telegram_bot.py | 3 + backend/app/wordpress.py | 76 ++++++++++++- backend/app/workflow.py | 7 +- 7 files changed, 381 insertions(+), 10 deletions(-) diff --git a/backend/app/db.py b/backend/app/db.py index 1b394c3..b6ef898 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -110,7 +110,7 @@ def init_db() -> None: publish_last_error TEXT, published_to_wp_at TEXT, word_count INTEGER DEFAULT 0, - status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), meta_json TEXT, created_at TEXT NOT NULL DEFAULT (datetime('now')), updated_at TEXT NOT NULL DEFAULT (datetime('now')), @@ -181,6 +181,89 @@ def init_db() -> None: if column not in existing_columns: conn.execute(ddl) + # Migration: add 'no_image' to the status CHECK constraint if not present. + # SQLite cannot modify CHECK constraints in-place, so we recreate the table. 
+ table_sql_row = conn.execute( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'" + ).fetchone() + if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""): + conn.executescript( + """ + PRAGMA foreign_keys=OFF; + + CREATE TABLE articles_v2 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + image_urls_json TEXT, + press_contact TEXT, + source_name_snapshot TEXT, + source_terms_url_snapshot TEXT, + source_license_name_snapshot TEXT, + legal_checked INTEGER NOT NULL DEFAULT 0, + legal_checked_at TEXT, + legal_note TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + publish_attempts INTEGER NOT NULL DEFAULT 0, + publish_last_error TEXT, + published_to_wp_at TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), + meta_json TEXT, + relevance_score INTEGER, + scheduled_publish_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + INSERT INTO articles_v2 SELECT + id, feed_id, source_article_id, source_hash, title, source_url, + canonical_url, published_at, author, summary, content_raw, + content_rewritten, image_urls_json, press_contact, + source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, + legal_checked, legal_checked_at, legal_note, + wp_post_id, wp_post_url, publish_attempts, publish_last_error, + published_to_wp_at, word_count, status, meta_json, + relevance_score, scheduled_publish_at, created_at, updated_at + FROM articles; + + DROP TABLE articles; + ALTER TABLE articles_v2 RENAME TO articles; + + CREATE INDEX IF 
NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + PRAGMA foreign_keys=ON; + """ + ) + table_rows = conn.execute( "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'" ).fetchall() diff --git a/backend/app/main.py b/backend/app/main.py index e954de6..264a2be 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -123,7 +123,7 @@ class ArticleUpsertRequest(BaseModel): publish_last_error: str | None = None published_to_wp_at: str | None = None word_count: int = 0 - status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error)$") + status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") meta_json: str | None = None @@ -132,7 +132,7 @@ class IngestionRunRequest(BaseModel): class ArticleTransitionRequest(BaseModel): - target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error)$") + target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") note: str | None = None diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index ef0597d..100c70c 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -45,6 +45,7 @@ class 
PipelineStats: rejected: int = 0 warnings: int = 0 errors: int = 0 + no_image: int = 0 rejected_articles: list[dict[str, Any]] = field(default_factory=list) @@ -226,6 +227,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: "processed": stats.processed, "drafts_created": stats.drafts_created, "rejected": stats.rejected, + "no_image": stats.no_image, "warnings": stats.warnings, "errors": stats.errors, } @@ -242,6 +244,33 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An # Auto-select image _auto_select_image(article) + # Reload to get updated image_review + article = get_article_by_id(article_id) or article + + # Exclude articles without a usable image + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + meta = {} + has_image = bool((meta.get("image_review") or {}).get("selected_url")) + if not has_image: + update_article_status( + article_id, + "no_image", + actor="pipeline", + note="Kein Bild vorhanden – Artikel ausgeschlossen", + ) + stats.no_image += 1 + logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id) + try: + tg.send_message( + f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n" + f"📰 {(article.get('title') or '')[:80]}" + ) + except Exception: + pass + return + # Score relevance try: relevance = score_article_relevance(article) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py index 925fcf6..d3cbed8 100644 --- a/backend/app/source_extraction.py +++ b/backend/app/source_extraction.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from html import unescape import re from typing import Any @@ -21,6 +21,7 @@ class ExtractedArticle: images: list[str] press_contact: str | None extraction_error: str | None = None + image_metadata: dict[str, dict] = field(default_factory=dict) def _clean_text(raw: str | None) -> str | None: @@ -197,6 +198,187 @@ 
def _extract_press_contact(content_text: str | None) -> str | None: return None +# CSS class keywords that indicate a copyright/credit element inside a figcaption +_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + +# Inline text patterns that signal a credit/copyright notice +_CREDIT_TEXT_RE = re.compile( + r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})", + re.IGNORECASE, +) + +# data-* attribute names that carry credit/caption information directly on +_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright") +_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description") + +# Class keywords for adjacent sibling credit spans/divs after an +_ADJ_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + + +def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]: + """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}. + + Uses three progressive strategies: + 1.
          with +
          + 2. data-* attributes on tags not already covered + 3. tags whose immediately following HTML contains a credit element + """ + result: dict[str, dict] = {} + + try: + # ------------------------------------------------------------------ + # Strategy 1:
          blocks containing and
          + # ------------------------------------------------------------------ + for fig_match in re.finditer(r"]*>([\s\S]*?)
          ", html, re.IGNORECASE): + fig_html = fig_match.group(1) + + # Locate image src (src or lazy-loaded data-src) + img_match = re.search( + r"]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + fig_html, + re.IGNORECASE, + ) + if not img_match: + continue + img_src = urljoin(page_url, img_match.group(1).strip()) + + # Locate figcaption + figcap_match = re.search( + r"]*>([\s\S]*?)
          ", + fig_html, + re.IGNORECASE, + ) + if not figcap_match: + continue + figcap_html = figcap_match.group(1) + + # --- Extract credit --- + credit: str | None = None + + # Try credit via class attribute on an inner element + credit_elem_match = re.search( + r"<(?:span|p|div)[^>]*" + + _CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + figcap_html, + re.IGNORECASE, + ) + if credit_elem_match: + credit = _clean_text(credit_elem_match.group(1)) + + # Fallback: scan plain text of figcaption for credit patterns + if not credit: + figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html)) + cred_text_match = _CREDIT_TEXT_RE.search(figcap_text) + if cred_text_match: + credit = _clean_text(cred_text_match.group(1)) + + # --- Extract caption (full figcaption text) --- + caption = _clean_text(figcap_html) + + # Only store entries that carry at least one piece of metadata + if caption or credit: + entry: dict[str, str] = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 2: data-* attributes on tags + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + # Resolve image URL (prefer src over data-src) + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by Strategy 1 + if img_src in result: + continue + + credit: str | None = None + caption: str | None = None + + for attr in _IMG_DATA_CREDIT_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + credit = 
_clean_text(attr_match.group(1)) + break + + for attr in _IMG_DATA_CAPTION_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + caption = _clean_text(attr_match.group(1)) + break + + if caption or credit: + entry = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 3: followed within 200 chars by a credit element + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by earlier strategies + if img_src in result: + continue + + # Look at the 200 characters of HTML immediately after the img tag + after_start = img_match.end() + after_html = html[after_start : after_start + 200] + + adj_match = re.search( + r"<(?:span|p|div)[^>]*" + + _ADJ_CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + after_html, + re.IGNORECASE, + ) + if adj_match: + credit = _clean_text(adj_match.group(1)) + if credit: + result[img_src] = {"credit": credit} + + except Exception: + return {} + + return result + + def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: try: req = Request( @@ -232,6 +414,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> summary = _clean_text(content_text[:320]) images = _extract_images(html, url) press_contact = _extract_press_contact(content_text) + image_metadata = _extract_image_metadata(html, url) return ExtractedArticle( 
title=title, @@ -242,6 +425,7 @@ def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> images=images, press_contact=press_contact, extraction_error=None, + image_metadata=image_metadata, ) @@ -254,4 +438,5 @@ def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: "images": article.images, "press_contact": article.press_contact, "extraction_error": article.extraction_error, + "image_metadata": article.image_metadata, } diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index 46ddd28..c92b009 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: processed = stats.get("processed", 0) drafts = stats.get("drafts_created", 0) rejected = stats.get("rejected", 0) + no_image = stats.get("no_image", 0) warnings = stats.get("warnings", 0) errors = stats.get("errors", 0) @@ -300,6 +301,8 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: ] if rejected: lines.append(f"🚫 Abgelehnt: {rejected}") + if no_image: + lines.append(f"🖼️ Kein Bild: {no_image}") if warnings: lines.append(f"⚠️ Warnungen: {warnings}") if errors: diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index a4f7f3a..a1ef8f5 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -161,6 +161,32 @@ def _guess_filename(image_url: str, content_type: str) -> str: return stem +def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict: + """Return the caption/credit dict for a specific image URL from extraction metadata.""" + if not meta_json or not image_url: + return {} + try: + meta = json.loads(meta_json) + image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {} + return image_metadata.get(image_url) or {} + except Exception: + return {} + + +def _build_image_caption(image_meta: dict, source_url: str) -> str: + """Build a WP caption string from image metadata and source URL.""" 
+ caption = (image_meta.get("caption") or "").strip() + credit = (image_meta.get("credit") or "").strip() + parts = [] + if caption: + parts.append(caption) + if credit: + parts.append(credit) + if not parts: + parts.append(f"Quelle: {source_url}") + return " | ".join(parts) + + def _upload_featured_media( *, base_url: str, @@ -168,6 +194,7 @@ def _upload_featured_media( image_url: str, article_title: str, source_url: str, + image_caption: str = "", ) -> int: image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None) filename = _guess_filename(image_url, content_type) @@ -192,7 +219,6 @@ def _upload_featured_media( if media_id <= 0: raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}") - # Optional metadata update for traceability. _wp_request( base_url=base_url, auth_header=auth_header, @@ -200,7 +226,7 @@ def _upload_featured_media( endpoint=f"media/{media_id}", payload={ "title": f"{article_title[:120]} - Bild", - "caption": f"Quelle: {source_url}", + "caption": image_caption or f"Quelle: {source_url}", "alt_text": article_title[:200], }, ) @@ -289,6 +315,45 @@ def _sanitize_publish_text(text: str) -> str: return merged +def _build_attribution_block(article: dict[str, Any]) -> str: + """Build a WP Gutenberg attribution block for the bottom of the article.""" + source_url = (article.get("canonical_url") or article.get("source_url") or "").strip() + source_name = (article.get("source_name_snapshot") or "").strip() + author = (article.get("author") or "").strip() + + # Get image credit from extraction metadata + credit = "" + try: + meta = json.loads(article.get("meta_json") or "{}") + selected_url = (meta.get("image_review") or {}).get("selected_url") or "" + if selected_url: + img_meta = (meta.get("extraction") or {}).get("image_metadata") or {} + credit = (img_meta.get(selected_url) or {}).get("credit") or "" + except Exception: + pass + + parts: list[str] = [] + if source_url: + label = source_name or 
source_url + parts.append(f'Originalartikel: {escape(label)}') + if author: + parts.append(f"Autor: {escape(author)}") + if credit: + parts.append(f"Bildnachweis: {escape(credit)}") + + if not parts: + return "" + + inner = "  |  ".join(parts) + return ( + "\n" + "
          \n" + f'' + f'

          {inner}

          ' + "" + ) + + def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: summary = (article.get("summary") or "").strip() body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() @@ -300,7 +365,9 @@ def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) if not body_html: body_html = "

          Kein Inhalt verfügbar.

          " - content = body_html.strip() + + attribution = _build_attribution_block(article) + content = (body_html + attribution).strip() return content, None @@ -318,6 +385,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) if selected_image_url: + image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url) + image_caption = _build_image_caption(image_meta, source_url) try: featured_media_id = _upload_featured_media( base_url=settings.wordpress_base_url, @@ -325,6 +394,7 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: image_url=selected_image_url, article_title=title, source_url=source_url, + image_caption=image_caption, ) except Exception as img_exc: import logging diff --git a/backend/app/workflow.py b/backend/app/workflow.py index b6cd4bb..83e9b63 100644 --- a/backend/app/workflow.py +++ b/backend/app/workflow.py @@ -1,6 +1,6 @@ from __future__ import annotations -UI_STATUSES = ("new", "rewrite", "publish", "published", "close") +UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image") def internal_to_ui_status(status: str | None) -> str: @@ -11,7 +11,7 @@ def internal_to_ui_status(status: str | None) -> str: return "close" if value == "review": return "rewrite" - if value in {"new", "rewrite", "published"}: + if value in {"new", "rewrite", "published", "no_image"}: return value return value or "new" @@ -22,7 +22,7 @@ def ui_to_internal_status(status: str | None) -> str: return "approved" if value == "close": return "error" - if value in {"new", "rewrite", "published"}: + if value in {"new", "rewrite", "published", "no_image"}: return value if value in {"approved", "error", "review"}: return value @@ -35,4 +35,5 @@ ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { "publish": {"published", "close"}, "published": {"rewrite", "close"}, "close": {"rewrite"}, 
+ "no_image": {"rewrite", "close"}, } From 0d07a9804dd42d963347a304d8e3ff5d83651284 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 07:10:30 +0000 Subject: [PATCH 35/54] fix(ingestion): resolve Google Alerts redirect URLs before article fetch Google Alerts feed entries use google.com/url?...&url=&... tracking links. The extractor was fetching the Google redirect page instead of the actual article, resulting in empty content and no images. _resolve_google_redirect() extracts the real URL from the 'url' query parameter before passing it to extract_article(). Non-Google URLs are returned unchanged. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/ingestion.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 510fd10..3710276 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -7,7 +7,7 @@ import json import re import time from typing import Any -from urllib.parse import unquote, urlparse +from urllib.parse import unquote, urlencode, urlparse, parse_qs import feedparser @@ -38,6 +38,31 @@ class IngestionStats: MAX_FEED_FETCH_RETRIES = 3 +def _resolve_google_redirect(url: str) -> str: + """Extract the real article URL from Google redirect URLs. + + Google Alerts feed entries use tracking links like: + https://www.google.com/url?rct=j&sa=t&url=&ct=ga&... + + This function returns the decoded real URL if detected, otherwise the + original URL unchanged. 
+ """ + try: + parsed = urlparse(url) + host = (parsed.hostname or "").lower() + if host not in ("www.google.com", "google.com"): + return url + if parsed.path not in ("/url", "/url/"): + return url + params = parse_qs(parsed.query, keep_blank_values=False) + real_urls = params.get("url") + if real_urls: + return unquote(real_urls[0]) + except Exception: + pass + return url + + def _entry_published_iso(entry: dict) -> str | None: published = entry.get("published_parsed") or entry.get("updated_parsed") if not published: @@ -215,6 +240,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: link = entry.get("link") if not link: continue + # Resolve Google redirect URLs (google.com/url?...&url=&...) + link = _resolve_google_redirect(link) summary, content_raw = _entry_text(entry) title = entry.get("title") or "Ohne Titel" From 8e65485f0c7d89dc8e225ed3add5e7b91da4ee55 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 08:08:07 +0000 Subject: [PATCH 36/54] fix(ingestion): strip HTML tags from feed entry titles Google Alerts wraps matched keywords in ... tags. Strip all HTML tags from the title before storing. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/ingestion.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 3710276..d76f4c4 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -244,7 +244,9 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: link = _resolve_google_redirect(link) summary, content_raw = _entry_text(entry) - title = entry.get("title") or "Ohne Titel" + # Strip HTML tags from title (Google Alerts wraps matched keywords in ) + raw_title = entry.get("title") or "Ohne Titel" + title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel" extracted = extract_article(link) final_title = extracted.title or title From 82f2df610db4106e12bcca2089f7537002125064 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 08:24:40 +0000 Subject: [PATCH 37/54] fix(wordpress): fuzzy URL match for image metadata and simplify caption builder Image metadata keys may have query params (e.g. ?w=1200) that differ from the selected_url stored in image_review. Fall back to comparing URLs without query string so the figcaption text is correctly found. Also simplified _build_image_caption: figcaption text already contains the credit info, so just use caption directly instead of appending the redundant credit prefix marker. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/wordpress.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index a1ef8f5..6ae686e 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -166,25 +166,30 @@ def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict: if not meta_json or not image_url: return {} try: + from urllib.parse import urlparse meta = json.loads(meta_json) image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {} - return image_metadata.get(image_url) or {} + # Exact match first + if image_url in image_metadata: + return image_metadata[image_url] + # Fuzzy match: compare without query string (handles ?w=1200 variants) + base_url = urlparse(image_url)._replace(query="").geturl() + for key, val in image_metadata.items(): + key_base = urlparse(key)._replace(query="").geturl() + if key_base == base_url: + return val + return {} except Exception: return {} def _build_image_caption(image_meta: dict, source_url: str) -> str: """Build a WP caption string from image metadata and source URL.""" + # caption from figcaption typically already contains the credit text caption = (image_meta.get("caption") or "").strip() - credit = (image_meta.get("credit") or "").strip() - parts = [] if caption: - parts.append(caption) - if credit: - parts.append(credit) - if not parts: - parts.append(f"Quelle: {source_url}") - return " | ".join(parts) + return caption + return f"Quelle: {source_url}" def _upload_featured_media( From d1cb809852e16a4642af1ae2964cdbce1e37ef95 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 08:28:44 +0000 Subject: [PATCH 38/54] fix(wordpress): fix attribution block source name and image credit lookup - Derive real source hostname from canonical URL when feed name is generic (e.g. 
"Google Alerts"), so the link shows "moin.de" instead of "Google Alerts" - Use _get_image_meta_for_url() (fuzzy URL matching) for image credit lookup - Use caption field for Bildnachweis since it already contains embedded credits Co-Authored-By: Claude Sonnet 4.6 --- backend/app/wordpress.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 6ae686e..313719d 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -322,18 +322,30 @@ def _sanitize_publish_text(text: str) -> str: def _build_attribution_block(article: dict[str, Any]) -> str: """Build a WP Gutenberg attribution block for the bottom of the article.""" + from urllib.parse import urlparse source_url = (article.get("canonical_url") or article.get("source_url") or "").strip() source_name = (article.get("source_name_snapshot") or "").strip() author = (article.get("author") or "").strip() - # Get image credit from extraction metadata + # If the feed name is "Google Alerts" (or similar generic names), derive the + # real source name from the hostname of the canonical URL. + if not source_name or source_name.lower() in ("google alerts", "google"): + try: + hostname = urlparse(source_url).hostname or "" + source_name = hostname.removeprefix("www.") + except Exception: + pass + + # Get image credit from extraction metadata (uses fuzzy URL match) + meta_json = article.get("meta_json") credit = "" try: - meta = json.loads(article.get("meta_json") or "{}") + meta = json.loads(meta_json or "{}") selected_url = (meta.get("image_review") or {}).get("selected_url") or "" if selected_url: - img_meta = (meta.get("extraction") or {}).get("image_metadata") or {} - credit = (img_meta.get(selected_url) or {}).get("credit") or "" + img_meta = _get_image_meta_for_url(meta_json, selected_url) + # caption already contains embedded credit text (e.g. 
"Foto: IMAGO/Zoonar") + credit = img_meta.get("caption") or img_meta.get("credit") or "" except Exception: pass From 45c533c674c3d683a3dbdf28e7e0660f289cd291 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 08:41:28 +0000 Subject: [PATCH 39/54] fix(wordpress): extract credit portion from caption for attribution block When the credit field only captured a marker prefix (e.g. "Foto:") due to CSS-class-based extraction picking up only the label element, fall back to regex-extracting the credit line from the full figcaption caption text. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/wordpress.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 313719d..fcd1add 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -344,8 +344,22 @@ def _build_attribution_block(article: dict[str, Any]) -> str: selected_url = (meta.get("image_review") or {}).get("selected_url") or "" if selected_url: img_meta = _get_image_meta_for_url(meta_json, selected_url) - # caption already contains embedded credit text (e.g. "Foto: IMAGO/Zoonar") - credit = img_meta.get("caption") or img_meta.get("credit") or "" + raw_credit = (img_meta.get("credit") or "").strip() + caption_text = (img_meta.get("caption") or "").strip() + # If credit is just a prefix marker (e.g. "Foto:"), extract the credit + # portion from the full caption text instead. 
+ if raw_credit and not raw_credit.rstrip(":").strip(): + raw_credit = "" + if raw_credit: + credit = raw_credit + elif caption_text: + # Extract credit markers like "Foto: IMAGO/…", "© Agentur", "Bild: …" + import re as _re + m = _re.search( + r"(©[^\n]{1,120}|(?:Foto|Bild|Credit|Fotograf|Photo)\s*:[^\n]{1,120})", + caption_text, + ) + credit = m.group(1).strip() if m else "" except Exception: pass From 1a8d0775c77463797577468b23994f02c0a2e78a Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 27 Mar 2026 08:47:09 +0000 Subject: [PATCH 40/54] fix(wordpress): correctly detect bare credit marker prefix before caption fallback Co-Authored-By: Claude Sonnet 4.6 --- backend/app/wordpress.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index fcd1add..912e85c 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -346,9 +346,10 @@ def _build_attribution_block(article: dict[str, Any]) -> str: img_meta = _get_image_meta_for_url(meta_json, selected_url) raw_credit = (img_meta.get("credit") or "").strip() caption_text = (img_meta.get("caption") or "").strip() - # If credit is just a prefix marker (e.g. "Foto:"), extract the credit - # portion from the full caption text instead. - if raw_credit and not raw_credit.rstrip(":").strip(): + # If credit is just a bare marker prefix (e.g. "Foto:", "Bild:"), + # clear it and extract the full credit from the caption text instead. 
+ _BARE_MARKERS = {"foto", "bild", "credit", "fotograf", "fotografie", "photo", "bildnachweis"} + if raw_credit.endswith(":") and raw_credit[:-1].strip().lower() in _BARE_MARKERS: raw_credit = "" if raw_credit: credit = raw_credit From 8c6022fead4d45f4ced2e7b8695f5c69b4818524 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sun, 29 Mar 2026 14:14:03 +0000 Subject: [PATCH 41/54] fix(pipeline): always reserve publish slot before WP draft creation If scheduled_publish_at is not set when _do_rewrite_and_draft runs (e.g. rewrite_and_update_draft called on a review article), reserve a slot now so the WP draft always receives a future date. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 100c70c..059ccf5 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -159,8 +159,17 @@ def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: if not fresh: raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden") + # Ensure a publish slot is reserved — reserve one now if not yet set + if not fresh.get("scheduled_publish_at"): + from .scheduler import reserve_publish_slot + logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id) + reserve_publish_slot(article_id) + fresh = get_article_by_id(article_id) + if not fresh: + raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden") + # Create WP draft - logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s)", article_id, fresh.get("wp_post_id")) + logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at")) wp_post_id, wp_post_url = publish_article_draft(fresh) logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, 
wp_post_id) From 426a799371cf8aa0f748a821a06b26b9834820c5 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Sun, 29 Mar 2026 14:29:25 +0000 Subject: [PATCH 42/54] fix(wordpress): use status=future for posts with a future scheduled_publish_at MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WordPress ignores the date field for draft posts and shows "Sofort veröffentlichen" instead. Setting status=future causes WP to display and honour the scheduled date, auto-publishing the post at the given time as intended. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/wordpress.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 912e85c..8b6d5a0 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -446,6 +446,14 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: scheduled_at = article.get("scheduled_publish_at") if scheduled_at: payload["date"] = scheduled_at # e.g. "2026-03-24T09:00:00" + # Use status "future" so WP schedules auto-publishing at the given date. + # WP ignores date for drafts and shows "Sofort veröffentlichen" instead. + try: + from datetime import datetime as _dt + if _dt.fromisoformat(scheduled_at) > _dt.now(): + payload["status"] = "future" + except Exception: + pass wp_post_id = article.get("wp_post_id") tag_ids = _resolve_wp_tag_ids( From 764e7bff6ab4907cedaf91680334d78b48039f32 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Tue, 7 Apr 2026 09:09:44 +0000 Subject: [PATCH 43/54] fix(ingestion): skip data: URIs and known placeholder images - ingestion.py: filter out data:image/... inline URIs before ranking - ingestion.py: penalise (-300) known placeholder paths (some-default.jpg etc.) 
- wordpress.py: _is_usable_image_url rejects data: URIs and placeholder paths Co-Authored-By: Claude Sonnet 4.6 --- backend/app/ingestion.py | 35 ++++++++++++++++++++- backend/app/wordpress.py | 68 +++++++++++++++++++++++++++++++++++----- 2 files changed, 94 insertions(+), 9 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index d76f4c4..30140ca 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -38,6 +38,26 @@ class IngestionStats: MAX_FEED_FETCH_RETRIES = 3 +def _normalize_article_url(url: str) -> str: + """Strip AMP and tracking query parameters from article URLs. + + Removes ?outputType=valid_amp and other AMP/tracking params so that + AMP and non-AMP versions of the same article are deduplicated. + """ + _AMP_PARAMS = {"outputtype", "amp", "outputformat"} + try: + from urllib.parse import parse_qs, urlencode + parsed = urlparse(url) + if not parsed.query: + return url + params = parse_qs(parsed.query, keep_blank_values=True) + filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS} + new_query = urlencode(filtered, doseq=True) + return parsed._replace(query=new_query).geturl() + except Exception: + return url + + def _resolve_google_redirect(url: str) -> str: """Extract the real article URL from Google redirect URLs. 
@@ -103,16 +123,27 @@ def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> li source_host = (urlparse(source_url).hostname or "").lower() is_presseportal = "presseportal.de" in source_host title_tokens = _normalize_tokens(title) - blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel") + blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif") + # Known placeholder/default images that should never be used as featured image + placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + ranked: list[dict[str, Any]] = [] for url in images: + # Skip inline data: URIs (e.g. base64-encoded SVG placeholders) + if url.startswith("data:"): + continue + parsed = urlparse(url) path = unquote(parsed.path.lower()) full = f"{parsed.netloc.lower()}{path}" score = 0 reasons: list[str] = [] + if any(token in full for token in placeholder_patterns): + score -= 300 + reasons.append("placeholder-image") + if any(token in full for token in blocked_patterns): score -= 150 reasons.append("blocked-pattern") @@ -242,6 +273,8 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: continue # Resolve Google redirect URLs (google.com/url?...&url=&...) link = _resolve_google_redirect(link) + # Normalize AMP/tracking params (e.g. 
?outputType=valid_amp) + link = _normalize_article_url(link) summary, content_raw = _entry_text(entry) # Strip HTML tags from title (Google Alerts wraps matched keywords in ) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 8b6d5a0..cced743 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -2,11 +2,13 @@ from __future__ import annotations import base64 from html import escape +import logging import json import mimetypes from pathlib import Path import re from typing import Any +from html import unescape as _html_unescape from urllib.parse import quote_plus, urlparse from urllib.request import Request, urlopen @@ -135,9 +137,37 @@ def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> return ids +_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"} +_logger = logging.getLogger(__name__) + + +def _sanitize_image_url(url: str) -> str: + """Decode HTML entities (e.g. & → &) in image URLs from RSS feeds.""" + return _html_unescape(url) + + +_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + +def _is_usable_image_url(url: str) -> bool: + """Return False for URLs that are unlikely to work as WP featured images.""" + if not url or url.startswith("data:"): + return False + try: + path = urlparse(url).path.lower() + _, ext = path.rsplit(".", 1) if "." 
in path else ("", "") + if f".{ext}" in _BLOCKED_IMAGE_EXTS: + return False + if any(p in path for p in _PLACEHOLDER_PATTERNS): + return False + except Exception: + pass + return True + + def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: + url = _sanitize_image_url(url) headers = { - "User-Agent": "rss-news-publisher/1.0", + "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)", "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", } if referer: @@ -153,11 +183,14 @@ def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, def _guess_filename(image_url: str, content_type: str) -> str: - parsed = urlparse(image_url) + parsed = urlparse(_sanitize_image_url(image_url)) stem = Path(parsed.path).name or "article-image" if "." not in stem: ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg" stem = f"{stem}{ext}" + # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header + stem = stem.encode("ascii", errors="ignore").decode("ascii") + stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg" return stem @@ -416,24 +449,43 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) - if selected_image_url: - image_meta = _get_image_meta_for_url(article.get("meta_json"), selected_image_url) + + # Build candidate list: primary selected URL + fallbacks from image_urls_json + image_candidates: list[str] = [] + if selected_image_url and _is_usable_image_url(selected_image_url): + image_candidates.append(selected_image_url) + try: + extra_urls = json.loads(article.get("image_urls_json") or "[]") + for u in extra_urls: + if u and u not in image_candidates and _is_usable_image_url(u): + image_candidates.append(u) + except Exception: + pass + + for candidate_url in image_candidates: + image_meta = 
_get_image_meta_for_url(article.get("meta_json"), candidate_url) image_caption = _build_image_caption(image_meta, source_url) try: featured_media_id = _upload_featured_media( base_url=settings.wordpress_base_url, auth_header=auth, - image_url=selected_image_url, + image_url=candidate_url, article_title=title, source_url=source_url, image_caption=image_caption, ) + break # success — stop trying further candidates except Exception as img_exc: - import logging - logging.getLogger(__name__).warning( - "Bild-Upload fehlgeschlagen (wird übersprungen): %s — %s", selected_image_url, img_exc + _logger.warning( + "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc ) + if not featured_media_id and image_candidates: + _logger.warning( + "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)", + len(image_candidates), article.get("id"), title[:60], + ) + payload = { "title": title, "content": content, From 8fa46312e84e56e6334ae536ae9c7ee42c19dd85 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Wed, 8 Apr 2026 09:29:24 +0000 Subject: [PATCH 44/54] fix(scheduler): query WordPress future posts to avoid double-booking slots The scheduler previously only checked the local SQLite DB for occupied slots. Posts created outside the pipeline (e.g. recovery scripts) were invisible, causing newly scheduled articles to land on already-taken WP dates. _fetch_wp_occupied_slots() now queries WP /wp/v2/posts?status=future before each slot assignment. All scheduling functions accept a wp_occupied set. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/scheduler.py | 194 +++++++++++++++++++++++++++------------ 1 file changed, 134 insertions(+), 60 deletions(-) diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index d4c6aaf..a92ba08 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -3,11 +3,15 @@ Calculates suggested publish slots for new WordPress drafts. 
Rules: - Maximum N drafts per day (configurable, default 2) -- Prefer slots spread across the week for steady traffic -- Preferred hours: configurable (default 09:00 and 14:00 CET) +- Preferred slots: configurable hours (default 09:00 and 14:00 CET) +- New articles queue up after the last already-scheduled article +- Checks both local DB AND WordPress future posts to avoid double-booking """ from __future__ import annotations +import base64 +import json +import urllib.request from datetime import date, datetime, timedelta, timezone from typing import Any @@ -15,7 +19,7 @@ from .config import get_settings from .db import get_conn -# CET offset (UTC+1 winter / UTC+2 summer – we use a fixed +1 for simplicity) +# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity) _CET_OFFSET = timedelta(hours=1) @@ -31,35 +35,87 @@ def _preferred_hours() -> list[int]: return [9, 14] -def _count_scheduled_on_day(target_date: date) -> int: - """Count articles already scheduled for publication on a given date.""" - date_str = target_date.isoformat() +def _fetch_wp_occupied_slots() -> set[tuple[str, int]]: + """Fetch all future-scheduled WordPress posts and return occupied (date_iso, hour) pairs. + + This prevents the scheduler from assigning a slot that is already taken + by a WP post that was not created via this pipeline (e.g. manually or via recovery scripts). + Returns an empty set on any error so the scheduler degrades gracefully. 
+ """ + settings = get_settings() + try: + auth = base64.b64encode( + f"{settings.wordpress_username}:{settings.wordpress_password}".encode() + ).decode() + url = ( + f"{settings.wordpress_base_url}/wp-json/wp/v2/posts" + f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date" + ) + req = urllib.request.Request(url, headers={"Authorization": f"Basic {auth}"}) + with urllib.request.urlopen(req, timeout=10) as resp: + posts = json.loads(resp.read()) + occupied: set[tuple[str, int]] = set() + for p in posts: + try: + dt = datetime.fromisoformat(p["date"]) + occupied.add((dt.date().isoformat(), dt.hour)) + except Exception: + pass + return occupied + except Exception: + return set() + + +def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None: + """Return the date of the latest already-scheduled slot (DB + WP).""" + today = _today_cet() + + # Latest from local DB with get_conn() as conn: row = conn.execute( """ - SELECT COUNT(*) AS cnt + SELECT MAX(scheduled_publish_at) AS last_slot FROM articles - WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? - AND status NOT IN ('error') + WHERE scheduled_publish_at IS NOT NULL + AND scheduled_publish_at >= ? 
+ AND status NOT IN ('error', 'no_image') """, - (date_str + "T00:00:00", date_str + "T23:59:59"), + (today.isoformat() + "T00:00:00",), ).fetchone() - return int(row["cnt"]) if row else 0 + db_last: date | None = None + if row and row["last_slot"]: + try: + db_last = datetime.fromisoformat(row["last_slot"]).date() + except Exception: + pass + + # Latest from WP + wp_last: date | None = None + for d_str, _ in wp_occupied: + try: + d = date.fromisoformat(d_str) + if d >= today and (wp_last is None or d > wp_last): + wp_last = d + except Exception: + pass + + if db_last and wp_last: + return max(db_last, wp_last) + return db_last or wp_last -def _next_free_hour(target_date: date) -> int | None: - """Return first preferred hour that is not yet used on target_date, or None if day is full.""" - settings = get_settings() - max_per_day = settings.pipeline_max_drafts_per_day +def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None: + """Return first preferred hour not yet used on target_date (DB + WP), or None if day is full.""" hours = _preferred_hours() - date_str = target_date.isoformat() + + # Hours used in local DB with get_conn() as conn: rows = conn.execute( """ SELECT scheduled_publish_at FROM articles WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? - AND status NOT IN ('error') + AND status NOT IN ('error', 'no_image') """, (date_str + "T00:00:00", date_str + "T23:59:59"), ).fetchall() @@ -72,68 +128,86 @@ def _next_free_hour(target_date: date) -> int | None: except Exception: pass + # Hours used in WordPress + for d_str, h in wp_occupied: + if d_str == date_str: + used_hours.add(h) + for h in hours: if h not in used_hours: return h - return None # day is full + return None -def suggest_publish_slot(lookahead_days: int = 14) -> str: - """Return a suggested publish datetime string (ISO, CET) for the next free slot. - - Format: 'Mo, 24.03.2026 um 09:00 Uhr' - Also updates DB so consecutive calls return different slots. 
- """ - today = _today_cet() +def _format_slot(d: date, hour: int) -> str: weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] + wd = weekday_names[d.weekday()] + return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr" - for offset in range(1, lookahead_days + 1): - candidate = today + timedelta(days=offset) - hour = _next_free_hour(candidate) - if hour is not None: - wd = weekday_names[candidate.weekday()] - return f"{wd}, {candidate.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr" - # Fallback: just tomorrow morning +def _find_next_free_slot( + wp_occupied: set[tuple[str, int]], lookahead_days: int = 30 +) -> tuple[date, int] | None: + """Find the next free (date, hour) slot, anchored after the last scheduled article.""" + today = _today_cet() tomorrow = today + timedelta(days=1) - hours = _preferred_hours() - h = hours[0] if hours else 9 - wd = weekday_names[tomorrow.weekday()] - return f"{wd}, {tomorrow.strftime('%d.%m.%Y')} um {h:02d}:00 Uhr" + + last_date = _get_last_future_scheduled_date(wp_occupied) + start_date = last_date if (last_date and last_date >= tomorrow) else tomorrow + + for offset in range(0, lookahead_days + 1): + candidate = start_date + timedelta(days=offset) + hour = _next_free_hour(candidate, wp_occupied) + if hour is not None: + return candidate, hour + + return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 + + +def suggest_publish_slot() -> str: + """Return a suggested publish datetime string (CET) for the next free slot.""" + wp_occupied = _fetch_wp_occupied_slots() + result = _find_next_free_slot(wp_occupied) + if result: + d, hour = result + return _format_slot(d, hour) + tomorrow = _today_cet() + timedelta(days=1) + return _format_slot(tomorrow, _preferred_hours()[0] if _preferred_hours() else 9) def reserve_publish_slot(article_id: int) -> str: """Reserve a publish slot for an article and persist it in the DB. - Returns the suggested publish datetime string. 
+ If the article already has a scheduled_publish_at, keep it unchanged. + Returns the formatted publish datetime string. """ - today = _today_cet() - lookahead_days = 14 - weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] + # Check if already has a slot + with get_conn() as conn: + row = conn.execute( + "SELECT scheduled_publish_at FROM articles WHERE id = ?", + (article_id,), + ).fetchone() + existing_slot = row["scheduled_publish_at"] if row else None + if existing_slot: + try: + dt = datetime.fromisoformat(existing_slot) + return _format_slot(dt.date(), dt.hour) + except Exception: + pass # invalid slot, re-assign below - for offset in range(1, lookahead_days + 1): - candidate = today + timedelta(days=offset) - hour = _next_free_hour(candidate) - if hour is not None: - # Reserve this slot by writing to the article - iso_ts = f"{candidate.isoformat()}T{hour:02d}:00:00" - with get_conn() as conn: - conn.execute( - "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?", - (iso_ts, article_id), - ) - wd = weekday_names[candidate.weekday()] - return f"{wd}, {candidate.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr" + wp_occupied = _fetch_wp_occupied_slots() + result = _find_next_free_slot(wp_occupied, lookahead_days=30) + if result: + candidate, hour = result + else: + candidate = _today_cet() + timedelta(days=1) + hours = _preferred_hours() + hour = hours[0] if hours else 9 - # Fallback - tomorrow = today + timedelta(days=1) - hours = _preferred_hours() - h = hours[0] if hours else 9 - iso_ts = f"{tomorrow.isoformat()}T{h:02d}:00:00" + iso_ts = f"{candidate.isoformat()}T{hour:02d}:00:00" with get_conn() as conn: conn.execute( "UPDATE articles SET scheduled_publish_at = ? 
WHERE id = ?", (iso_ts, article_id), ) - wd = weekday_names[tomorrow.weekday()] - return f"{wd}, {tomorrow.strftime('%d.%m.%Y')} um {h:02d}:00 Uhr" + return _format_slot(candidate, hour) From 94bd93a18a4206ce38dc0a0adcdff3c2ae350e26 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Wed, 8 Apr 2026 09:34:27 +0000 Subject: [PATCH 45/54] fix(scheduler): fill schedule gaps instead of always appending to end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the scheduler started searching from the last scheduled post date, skipping all free slots in between (e.g. a free slot on Apr 20 would be ignored if the last post was on May 18). Now starts scanning from tomorrow, finding the first available slot regardless of whether earlier dates have gaps — fills the calendar naturally. Also extended lookahead from 30 to 60 days. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/scheduler.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index a92ba08..a1028fc 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -146,17 +146,18 @@ def _format_slot(d: date, hour: int) -> str: def _find_next_free_slot( - wp_occupied: set[tuple[str, int]], lookahead_days: int = 30 + wp_occupied: set[tuple[str, int]], lookahead_days: int = 60 ) -> tuple[date, int] | None: - """Find the next free (date, hour) slot, anchored after the last scheduled article.""" + """Find the next free (date, hour) slot. + + Starts from tomorrow and scans forward, filling any gaps in the schedule + rather than always appending after the last existing post. 
+ """ today = _today_cet() tomorrow = today + timedelta(days=1) - last_date = _get_last_future_scheduled_date(wp_occupied) - start_date = last_date if (last_date and last_date >= tomorrow) else tomorrow - for offset in range(0, lookahead_days + 1): - candidate = start_date + timedelta(days=offset) + candidate = tomorrow + timedelta(days=offset) hour = _next_free_hour(candidate, wp_occupied) if hour is not None: return candidate, hour From 09dcf6ce368079df10dbbc5749c901e806f32f69 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Wed, 8 Apr 2026 09:42:02 +0000 Subject: [PATCH 46/54] feat(pipeline): add two-stage article quality gate (min word count) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 1 (before OpenAI rewrite): reject if raw content < pipeline_min_words_raw (default 120) Stage 2 (after rewrite): reject if rewritten text < pipeline_min_words_rewritten (default 150) Both stages set status='error' with a descriptive note and skip WP draft creation. The reserved publish slot is released so it stays available for the next article. Quality rejections don't abort the pipeline — processing continues with the next article. 
New config settings (overridable via .env): PIPELINE_MIN_WORDS_RAW=120 PIPELINE_MIN_WORDS_REWRITTEN=150 Co-Authored-By: Claude Sonnet 4.6 --- backend/app/config.py | 2 ++ backend/app/pipeline.py | 35 ++++++++++++++++++++++++++++++++++- backend/app/scheduler.py | 9 +++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/backend/app/config.py b/backend/app/config.py index d56ce11..713669e 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -46,6 +46,8 @@ class Settings(BaseSettings): pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject pipeline_max_drafts_per_day: int = 2 pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) + pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) + pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) @lru_cache(maxsize=1) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 059ccf5..9cd9059 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -109,10 +109,35 @@ def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None: def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: """Rewrite article and create WP draft. 
Returns (wp_post_id, wp_post_url).""" article_id = int(article["id"]) + settings = get_settings() + + # ── Quality gate 1: raw content length ────────────────────────────────── + import re as _re + raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "") + raw_words = len(raw_text.split()) + if raw_words < settings.pipeline_min_words_raw: + note = ( + f"Zu wenig Rohinhalt: {raw_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_raw})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) # Rewrite - logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id) + logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words) rewritten = rewrite_article_text(article) + + # ── Quality gate 2: rewritten content length ───────────────────────────── + rewritten_words = len(rewritten.split()) + if rewritten_words < settings.pipeline_min_words_rewritten: + note = ( + f"Rewrite zu kurz: {rewritten_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_rewritten})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split())) tags: list[str] = [] try: @@ -342,6 +367,14 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An except Exception as exc: logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc) + except ValueError as exc: + # Quality gate rejection (too short etc.) 
— status already set in _do_rewrite_and_draft + # Release the reserved slot so it's available for the next article + from .scheduler import release_publish_slot + release_publish_slot(article_id) + stats.rejected_articles.append(get_article_by_id(article_id) or {}) + logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) + except Exception as exc: logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}") diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index a1028fc..0ec38d8 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -165,6 +165,15 @@ def _find_next_free_slot( return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 +def release_publish_slot(article_id: int) -> None: + """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", + (article_id,), + ) + + def suggest_publish_slot() -> str: """Return a suggested publish datetime string (CET) for the next free slot.""" wp_occupied = _fetch_wp_occupied_slots() From 2d1dd14e45e83efd7a57d8e8efbaefcaeaa8a76f Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Thu, 9 Apr 2026 07:02:03 +0000 Subject: [PATCH 47/54] fix(pipeline): send individual Telegram notifications for quality gate rejections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add individual Telegram message when an article is rejected by quality gate (too short raw content or rewritten text), so users see each rejection in real time instead of only in the bulk summary - Add quality_gate_rejected counter to PipelineStats and result dict - Show quality gate rejections separately in pipeline-done summary (✂️ Qualitätsprüfung: N) distinct from score-based rejections Co-Authored-By: Claude Sonnet 
4.6 --- backend/app/pipeline.py | 15 ++++++++++++++- backend/app/telegram_bot.py | 5 ++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 9cd9059..1ae1ff6 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -43,6 +43,7 @@ class PipelineStats: processed: int = 0 drafts_created: int = 0 rejected: int = 0 + quality_gate_rejected: int = 0 warnings: int = 0 errors: int = 0 no_image: int = 0 @@ -261,6 +262,7 @@ def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: "processed": stats.processed, "drafts_created": stats.drafts_created, "rejected": stats.rejected, + "quality_gate_rejected": stats.quality_gate_rejected, "no_image": stats.no_image, "warnings": stats.warnings, "errors": stats.errors, @@ -372,8 +374,19 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An # Release the reserved slot so it's available for the next article from .scheduler import release_publish_slot release_publish_slot(article_id) - stats.rejected_articles.append(get_article_by_id(article_id) or {}) + stats.quality_gate_rejected += 1 logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) + # Individual Telegram notification for quality gate rejection + try: + title = (article.get("title") or "Ohne Titel")[:80] + tg.send_message( + f"✂️ Qualitätsprüfung nicht bestanden\n" + f"📰 {title}\n" + f"💯 Score: {score}/100\n" + f"⚠️ {exc}" + ) + except Exception as tg_exc: + logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc) except Exception as exc: logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py index c92b009..880a49d 100644 --- a/backend/app/telegram_bot.py +++ b/backend/app/telegram_bot.py @@ -289,6 +289,7 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: processed = stats.get("processed", 0) drafts = 
stats.get("drafts_created", 0) rejected = stats.get("rejected", 0) + quality_gate_rejected = stats.get("quality_gate_rejected", 0) no_image = stats.get("no_image", 0) warnings = stats.get("warnings", 0) errors = stats.get("errors", 0) @@ -300,7 +301,9 @@ def notify_pipeline_done(stats: dict[str, Any]) -> None: f"📝 Drafts erstellt: {drafts}", ] if rejected: - lines.append(f"🚫 Abgelehnt: {rejected}") + lines.append(f"🚫 Abgelehnt (Score): {rejected}") + if quality_gate_rejected: + lines.append(f"✂️ Qualitätsprüfung: {quality_gate_rejected}") if no_image: lines.append(f"🖼️ Kein Bild: {no_image}") if warnings: From cf2d826c8a2fc6112453b91e7eae4d68020e9f54 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 10 Apr 2026 08:22:26 +0000 Subject: [PATCH 48/54] fix(scheduler,pipeline): fix WP auth attribute name and release slot on hard errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scheduler: use wordpress_app_password (not wordpress_password) so _fetch_wp_occupied_slots() can actually authenticate against the WP REST API — previously always returned empty set silently - pipeline: release reserved publish slot when draft creation fails with a non-ValueError exception (e.g. 
WP API error), preventing permanently blocked slots on failed articles Co-Authored-By: Claude Sonnet 4.6 --- backend/app/pipeline.py | 3 +++ backend/app/scheduler.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 1ae1ff6..d9766ae 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -391,6 +391,9 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An except Exception as exc: logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}") + # Release reserved slot so it's not permanently blocked by a failed article + from .scheduler import release_publish_slot + release_publish_slot(article_id) raise diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index 0ec38d8..ff5cecf 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -45,7 +45,7 @@ def _fetch_wp_occupied_slots() -> set[tuple[str, int]]: settings = get_settings() try: auth = base64.b64encode( - f"{settings.wordpress_username}:{settings.wordpress_password}".encode() + f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode() ).decode() url = ( f"{settings.wordpress_base_url}/wp-json/wp/v2/posts" From 8676ace1026347491e186428db9788ecf2b5a244 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 10 Apr 2026 08:44:28 +0000 Subject: [PATCH 49/54] feat(pipeline): article age filter, image URL validation, schedule UI, retry button MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Article age filter (ingestion.py + config.py): - New setting pipeline_max_article_age_days=7 (0 = no limit) - Skip RSS entries older than N days before expensive extract_article() - Prevents old articles from Google Alerts re-entering pipeline 2. 
Image URL pre-validation (ingestion.py): - HEAD request probe for each primary image candidate during ingestion - Falls back to next-best candidate if primary returns 4xx - Network errors treated as OK to avoid false negatives on flaky servers 3. Stale WP draft cleanup (pipeline.py): - Quality gate rejections now delete any pre-existing WP draft (wp_post_id) - Prevents orphaned drafts when re-running articles that previously had drafts 4. Schedule overview UI (scheduler.py + admin_ui.py + admin_schedule.html): - New /admin/schedule page showing calendar grid of all booked slots - Distinguishes Pipeline-DB slots from WordPress-only slots - Link added to dashboard navigation 5. Retry for failed articles (admin_ui.py + admin_dashboard.html): - New POST /admin/articles/{id}/retry endpoint: resets to 'new', releases slot - '🔄 Wiederholen' button shown in dashboard for all 'close' (error) articles Co-Authored-By: Claude Sonnet 4.6 --- backend/app/admin_ui.py | 72 +++++++++++++ backend/app/config.py | 1 + backend/app/ingestion.py | 62 +++++++++++- backend/app/pipeline.py | 9 ++ backend/app/scheduler.py | 66 ++++++++++++ backend/templates/admin_dashboard.html | 6 ++ backend/templates/admin_schedule.html | 133 +++++++++++++++++++++++++ 7 files changed, 344 insertions(+), 5 deletions(-) create mode 100644 backend/templates/admin_schedule.html diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 689efce..26085ff 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -929,3 +929,75 @@ def admin_transition_article(request: Request, article_id: int, target_status: s update_article_status(article_id, target_internal, actor=user, note=note or None) return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}") return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") + + +@router.post("/admin/articles/{article_id}/retry") +def admin_retry_article(request: Request, article_id: 
int): + """Reset a failed article to 'new' so the pipeline picks it up on next run.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + + from .scheduler import release_publish_slot + release_publish_slot(article_id) + update_article_status(article_id, "new", actor=user, note="Manuell zurückgesetzt für erneuten Pipeline-Versuch") + return _dashboard_redirect( + msg=f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet", + status_filter="close", + ) + + +@router.get("/admin/schedule", response_class=HTMLResponse) +def admin_schedule(request: Request): + """Schedule overview: all booked slots from DB and WordPress.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + from .scheduler import get_schedule_overview, _preferred_hours, _today_cet + from datetime import timedelta + + slots = get_schedule_overview(lookahead_days=60) + today = _today_cet() + hours = _preferred_hours() + + # Build a calendar grid: for each day in the next 60 days, show each preferred hour slot + booked: dict[tuple[str, int], dict] = {(s["date"], s["hour"]): s for s in slots} + calendar_days = [] + for offset in range(0, 61): + d = today + timedelta(days=offset) + d_str = d.isoformat() + day_slots = [] + for h in hours: + key = (d_str, h) + day_slots.append({ + "hour": h, + "booked": key in booked, + "slot": booked.get(key), + }) + calendar_days.append({ + "date": d_str, + "date_fmt": d.strftime("%d.%m.%Y"), + "weekday": ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"][d.weekday()], + "slots": day_slots, + "any_booked": any(s["booked"] for s in day_slots), + }) + + return templates.TemplateResponse( + request, + "admin_schedule.html", + { + "request": request, + "title": 
"Veröffentlichungsplan", + "user": user, + "slots": slots, + "calendar_days": calendar_days, + "hours": hours, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) diff --git a/backend/app/config.py b/backend/app/config.py index 713669e..24c3902 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -48,6 +48,7 @@ class Settings(BaseSettings): pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) + pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit) @lru_cache(maxsize=1) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 30140ca..391af92 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -1,13 +1,15 @@ from __future__ import annotations from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone import hashlib import json import re import time from typing import Any from urllib.parse import unquote, urlencode, urlparse, parse_qs +import urllib.error +import urllib.request as _urllib_req import feedparser @@ -119,6 +121,26 @@ def _normalize_tokens(text: str) -> set[str]: return {token for token in normalized.split() if len(token) >= 4} +def _probe_image_url(url: str, timeout: int = 5) -> bool: + """Return True if URL responds without a 4xx/5xx error (HEAD request). + + Returns True on network/connection errors so that a flaky server does not + cause a valid image to be silently dropped. 
+ """ + try: + req = _urllib_req.Request( + url, + method="HEAD", + headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"}, + ) + with _urllib_req.urlopen(req, timeout=timeout) as resp: + return resp.status < 400 + except urllib.error.HTTPError as exc: + return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not + except Exception: + return True # network error → don't filter, let WP try later + + def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: source_host = (urlparse(source_url).hostname or "").lower() is_presseportal = "presseportal.de" in source_host @@ -184,10 +206,25 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_ deduped.append(image) ranked = _rank_image_candidates(source_url, title, deduped) - kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep] - if not kept and ranked: - kept = [ranked[0]["url"]] - primary = kept[0] if kept else None + candidates = [item["url"] for item in ranked if item["score"] > -100] + + # Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx). + # Network errors are treated as OK to avoid false negatives on flaky servers. 
+ primary = None + kept: list[str] = [] + for url in candidates[:4]: + if _probe_image_url(url): + if primary is None: + primary = url + kept.append(url) + if len(kept) >= max_keep: + break + + # Fallback: if all probes failed with network errors, use best candidate anyway + if not kept and candidates: + primary = candidates[0] + kept = candidates[:max_keep] + return kept, primary, ranked @@ -265,12 +302,27 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: feed_entries_seen = 0 feed_upserts = 0 + from .config import get_settings as _get_settings + _max_age_days = _get_settings().pipeline_max_article_age_days for entry in _parsed_get(parsed, "entries", []): entries_seen += 1 feed_entries_seen += 1 link = entry.get("link") if not link: continue + + # Age filter: skip articles older than max_age_days (0 = no limit) + if _max_age_days > 0: + published_iso = _entry_published_iso(entry) + if published_iso: + try: + published_dt = datetime.fromisoformat(published_iso) + age = datetime.now(timezone.utc) - published_dt + if age > timedelta(days=_max_age_days): + continue + except Exception: + pass # can't parse date → allow through + # Resolve Google redirect URLs (google.com/url?...&url=&...) link = _resolve_google_redirect(link) # Normalize AMP/tracking params (e.g. 
?outputType=valid_amp) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index d9766ae..93a251b 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -374,6 +374,15 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An # Release the reserved slot so it's available for the next article from .scheduler import release_publish_slot release_publish_slot(article_id) + # Clean up any stale WP draft from a previous pipeline run + stale = get_article_by_id(article_id) + if stale and stale.get("wp_post_id"): + try: + from .wordpress import delete_wp_post + delete_wp_post(int(stale["wp_post_id"])) + logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"]) + except Exception as del_exc: + logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc) stats.quality_gate_rejected += 1 logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) # Individual Telegram notification for quality gate rejection diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index ff5cecf..8f8d498 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -165,6 +165,72 @@ def _find_next_free_slot( return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 +def get_schedule_overview(lookahead_days: int = 60) -> list[dict]: + """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.""" + today = _today_cet() + hours = _preferred_hours() + + # Slots booked in local DB + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at + FROM articles + WHERE scheduled_publish_at IS NOT NULL + AND scheduled_publish_at >= ? 
+ AND status NOT IN ('error', 'no_image') + ORDER BY scheduled_publish_at + """, + (today.isoformat() + "T00:00:00",), + ).fetchall() + + db_slots: dict[tuple[str, int], dict] = {} + for row in rows: + try: + dt = datetime.fromisoformat(row["scheduled_publish_at"]) + key = (dt.date().isoformat(), dt.hour) + db_slots[key] = { + "date": dt.date().isoformat(), + "hour": dt.hour, + "formatted": _format_slot(dt.date(), dt.hour), + "source": "db", + "article_id": row["id"], + "article_title": row["title"], + "article_status": row["status"], + "wp_post_id": row["wp_post_id"], + "wp_post_url": row["wp_post_url"], + } + except Exception: + pass + + # Slots occupied in WordPress but not in local DB + wp_occupied = _fetch_wp_occupied_slots() + wp_only: list[dict] = [] + for d_str, h in sorted(wp_occupied): + if (d_str, h) in db_slots: + continue + try: + d = date.fromisoformat(d_str) + if d >= today: + wp_only.append({ + "date": d_str, + "hour": h, + "formatted": _format_slot(d, h), + "source": "wordpress", + "article_id": None, + "article_title": "(WP-Beitrag außerhalb Pipeline)", + "article_status": None, + "wp_post_id": None, + "wp_post_url": None, + }) + except Exception: + pass + + all_slots = list(db_slots.values()) + wp_only + all_slots.sort(key=lambda s: (s["date"], s["hour"])) + return all_slots + + def release_publish_slot(article_id: int) -> None: """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" with get_conn() as conn: diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 15f3daf..67738c7 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -13,6 +13,7 @@

          Angemeldet als {{ user }}

          + Veröffentlichungsplan Connectivity Check
          @@ -330,6 +331,11 @@ keine Aktion {% endif %} + {% if a.status_ui == 'close' %} +
          + + + {% endif %} {% endfor %} diff --git a/backend/templates/admin_schedule.html b/backend/templates/admin_schedule.html new file mode 100644 index 0000000..f585b00 --- /dev/null +++ b/backend/templates/admin_schedule.html @@ -0,0 +1,133 @@ + + + + + + {{ title }} + + + + +
          +
          +

          rss-news Veröffentlichungsplan

          +

          Angemeldet als {{ user }}

          +
          +
          + Dashboard + Connectivity +
          + + +
          +
          + +
          + {% if flash_msg %} +
          + {{ flash_msg }} +
          + {% endif %} + +
          +

          Slot-Übersicht (nächste 60 Tage)

          +
          + 📅 Belegte Slots gesamt: {{ slots|length }} + 🗄️ Aus Pipeline-DB: {{ slots|selectattr('source', 'eq', 'db')|list|length }} + 🌐 Nur in WordPress: {{ slots|selectattr('source', 'eq', 'wordpress')|list|length }} +
          +

        + + + + {% for h in hours %} + + {% endfor %} + + + + {% for day in calendar_days %} + {% if day.any_booked %} + + + {% for s in day.slots %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
        Tag{{ "%02d:00 Uhr"|format(h) }}
        {{ day.weekday }} {{ day.date_fmt }} + {% if s.booked %} + {% set info = s.slot %} + {% if info.source == 'db' %} + + DB +
        + {% if info.article_id %} + + {{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %} + + {% endif %} +
        Status: {{ info.article_status }} + {% if info.wp_post_url %} +
        WP öffnen + {% endif %} +
        + {% else %} + ⚠️ + WP +
        {{ info.article_title }}
        + {% endif %} + {% else %} + frei + {% endif %} +
        + {% if not slots %} +

        Keine geplanten Beiträge in den nächsten 60 Tagen.

        + {% endif %} +
        + +
        +

        Alle belegten Slots (Liste)

        + + + + + + {% for s in slots %} + + + + + + + + {% endfor %} + +
        Datum/ZeitQuelleArtikelStatusWordPress
        {{ s.formatted }} + {% if s.source == 'db' %}Pipeline-DB + {% else %}WordPress{% endif %} + + {% if s.article_id %} + {{ (s.article_title or "")[:60] }} + {% else %} + {{ s.article_title or "-" }} + {% endif %} + {{ s.article_status or "-" }} + {% if s.wp_post_url %} + Draft öffnen + {% else %}-{% endif %} +
        +
        + + + From 2d02b56b65e35a15a5ba1770b869db4cc8cd8de5 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 10 Apr 2026 08:53:44 +0000 Subject: [PATCH 50/54] =?UTF-8?q?feat(admin):=20WordPress=E2=86=92DB=20syn?= =?UTF-8?q?c=20for=20scheduled=20slots?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds sync_db_from_wordpress() that treats WordPress as source of truth: - future posts: update scheduled_publish_at to WP's actual date - draft posts: clear scheduled_publish_at (not yet scheduled) - published posts: mark article as 'published' in DB - trashed/deleted posts: clear wp_post_id + wp_post_url + slot so article can be re-processed Exposed via POST /admin/wp-sync with a sync button on the schedule page. Run after any manual rescheduling in WordPress to bring DB back in sync. Co-Authored-By: Claude Sonnet 4.6 --- backend/app/admin_ui.py | 22 +++++ backend/app/wordpress.py | 128 ++++++++++++++++++++++++++ backend/templates/admin_schedule.html | 10 ++ 3 files changed, 160 insertions(+) diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 26085ff..51c2377 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -931,6 +931,28 @@ def admin_transition_article(request: Request, article_id: int, target_status: s return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") +@router.post("/admin/wp-sync") +def admin_wp_sync(request: Request): + """Sync scheduled_publish_at and WP references in the DB from WordPress.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + from .wordpress import sync_db_from_wordpress + stats = sync_db_from_wordpress() + msg = ( + f"WP-Sync abgeschlossen: " + f"{stats['slot_updated']} Slots aktualisiert, " + f"{stats['slot_cleared_draft']} Slots geleert (Draft), " + f"{stats['marked_published']} als veröffentlicht markiert, " + 
f"{stats['wp_reference_cleared']} WP-Referenzen gelöscht (Papierkorb), " + f"{stats['already_in_sync']} bereits synchron." + ) + return RedirectResponse(url=f"/admin/schedule?msg={msg}&type=success", status_code=303) + except Exception as exc: + return RedirectResponse(url=f"/admin/schedule?msg=Sync fehlgeschlagen: {exc}&type=error", status_code=303) + + @router.post("/admin/articles/{article_id}/retry") def admin_retry_article(request: Request, article_id: int): """Reset a failed article to 'new' so the pipeline picks it up on next run.""" diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index cced743..bb96198 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -559,3 +559,131 @@ def delete_wp_post(wp_post_id: int) -> None: method="DELETE", endpoint=f"posts/{wp_post_id}?force=true", ) + + +def sync_db_from_wordpress() -> dict[str, Any]: + """Sync scheduled_publish_at and wp_post_url in the DB from WordPress. + + WordPress is treated as the source of truth for scheduling. + For each DB article that has a wp_post_id: + - If WP post exists as 'future': update scheduled_publish_at to WP date. + - If WP post exists as 'draft': clear scheduled_publish_at (not yet scheduled). + - If WP post exists as 'publish': mark article as published in DB. + - If WP post is trashed/deleted (404 or trash status): clear wp_post_id, + wp_post_url, and scheduled_publish_at so the article can be re-processed. + Returns a stats dict with counts of each action taken. 
+ """ + from .db import get_conn + + settings = get_settings() + if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: + raise RuntimeError("WordPress Konfiguration fehlt") + auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) + base_url = settings.wordpress_base_url.rstrip("/") + + # Fetch all future + draft + published WP posts in one pass (up to 300 per status) + wp_posts: dict[int, dict] = {} + for status in ("future", "draft", "publish"): + for page in range(1, 4): # max 300 per status + try: + result = _wp_request( + base_url=base_url, + auth_header=auth, + method="GET", + endpoint=f"posts?status={status}&per_page=100&page={page}&_fields=id,date,status,link", + ) + except Exception: + break + if not isinstance(result, list) or not result: + break + for post in result: + try: + wp_posts[int(post["id"])] = post + except Exception: + pass + if len(result) < 100: + break + + # Load all DB articles that have a wp_post_id + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, wp_post_id, wp_post_url, scheduled_publish_at, status + FROM articles + WHERE wp_post_id IS NOT NULL + AND status NOT IN ('no_image') + ORDER BY id + """ + ).fetchall() + + stats: dict[str, int] = { + "total_db_articles": len(rows), + "wp_posts_found": len(wp_posts), + "slot_updated": 0, + "slot_cleared_draft": 0, + "marked_published": 0, + "wp_reference_cleared": 0, + "already_in_sync": 0, + } + + for row in rows: + article_id = row["id"] + wp_post_id = int(row["wp_post_id"]) + wp_post = wp_posts.get(wp_post_id) + + if wp_post is None: + # Post not found in future/draft/publish — likely trashed or deleted + # Clear wp reference so article can be re-processed if needed + with get_conn() as conn: + conn.execute( + """UPDATE articles + SET wp_post_id = NULL, wp_post_url = NULL, scheduled_publish_at = NULL + WHERE id = ?""", + (article_id,), + ) + stats["wp_reference_cleared"] += 1 + continue + + 
wp_status = wp_post.get("status", "") + wp_date = wp_post.get("date", "") # local CET datetime, e.g. "2026-05-05T09:00:00" + wp_link = wp_post.get("link") or row["wp_post_url"] + + if wp_status == "publish": + # Already published in WP — mark as published in DB if not already + if row["status"] != "published": + with get_conn() as conn: + conn.execute( + "UPDATE articles SET status = 'published', wp_post_url = ? WHERE id = ?", + (wp_link, article_id), + ) + stats["marked_published"] += 1 + else: + stats["already_in_sync"] += 1 + + elif wp_status == "future": + # Scheduled — sync the date into scheduled_publish_at + current_slot = row["scheduled_publish_at"] or "" + # WP returns e.g. "2026-05-05T09:00:00" — compare ignoring seconds + if current_slot[:16] != wp_date[:16]: + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = ?, wp_post_url = ? WHERE id = ?", + (wp_date, wp_link, article_id), + ) + stats["slot_updated"] += 1 + else: + stats["already_in_sync"] += 1 + + elif wp_status == "draft": + # Draft without a schedule — clear scheduled_publish_at if set + if row["scheduled_publish_at"]: + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", + (article_id,), + ) + stats["slot_cleared_draft"] += 1 + else: + stats["already_in_sync"] += 1 + + return stats diff --git a/backend/templates/admin_schedule.html b/backend/templates/admin_schedule.html index f585b00..4f2513a 100644 --- a/backend/templates/admin_schedule.html +++ b/backend/templates/admin_schedule.html @@ -37,6 +37,16 @@ {% endif %} +
        +
        +

        WordPress → DB Synchronisieren

        +

        Liest alle geplanten WP-Beiträge und aktualisiert die Slots in der lokalen DB.
        Nutze dies nach manuellen Änderungen in WordPress.

        +
        +
        + +
        +
        +

        Slot-Übersicht (nächste 60 Tage)

        From cdcf441daf9dfba08187df602e2e854621588cea Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 10 Apr 2026 09:00:25 +0000 Subject: [PATCH 51/54] feat(admin): bulk-editable article list with WP ID inline editing - New /admin/article-list: paginated (50/page) table with thumbnail, title, excerpt (120 chars), status, scheduled date, and WP ID input - Sticky save bar with live change counter (JS tracks modified inputs, highlights changed cells in amber, disables save when nothing changed) - POST /admin/article-list/update: saves only changed WP IDs in one request; clears stale wp_post_url so WP-Sync repopulates it cleanly - Filter by status + free-text search (title or article ID) - Pagination with page/filter state preserved through save redirects - repositories: add list_articles_page() (offset + search) and bulk_update_wp_post_ids() - Dashboard nav: add Artikelliste link Co-Authored-By: Claude Sonnet 4.6 --- backend/app/admin_ui.py | 101 ++++++++++ backend/app/repositories.py | 61 ++++++ backend/templates/admin_article_list.html | 221 ++++++++++++++++++++++ backend/templates/admin_dashboard.html | 1 + 4 files changed, 384 insertions(+) create mode 100644 backend/templates/admin_article_list.html diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 51c2377..a25199c 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -33,6 +33,8 @@ from .repositories import ( get_article_by_id, get_feed_by_id, list_articles, + list_articles_page, + bulk_update_wp_post_ids, list_feeds, list_publish_jobs, list_runs, @@ -931,6 +933,105 @@ def admin_transition_article(request: Request, article_id: int, target_status: s return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") +_PAGE_SIZE = 50 + + +@router.get("/admin/article-list", response_class=HTMLResponse) +def admin_article_list(request: Request): + """Paginated article list with inline WP ID editing.""" + user = _admin_user(request) + if 
not user: + return RedirectResponse(url="/admin/login", status_code=303) + + page = max(1, int(request.query_params.get("page", 1))) + status_filter = request.query_params.get("status_filter", "") or None + search = request.query_params.get("search", "").strip() or None + offset = (page - 1) * _PAGE_SIZE + + articles, total = list_articles_page( + limit=_PAGE_SIZE, offset=offset, + status_filter=status_filter, search=search, + ) + + # Enrich each article with thumbnail URL + for a in articles: + meta = _parse_meta_json(a.get("meta_json")) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + sel = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + if not sel: + sel = (meta.get("extraction") or {}).get("image_selection", {}).get("primary") + a["thumb_url"] = sel + a["thumb_proxy"] = f"/admin/images/proxy?{urlencode({'url': sel})}" if sel else None + raw = (a.get("content_raw") or a.get("summary") or "").strip() + a["excerpt"] = raw[:120] + "…" if len(raw) > 120 else raw + + total_pages = max(1, (total + _PAGE_SIZE - 1) // _PAGE_SIZE) + + return templates.TemplateResponse( + request, + "admin_article_list.html", + { + "request": request, + "title": "Artikelliste", + "user": user, + "articles": articles, + "page": page, + "total_pages": total_pages, + "total": total, + "page_size": _PAGE_SIZE, + "status_filter": status_filter or "", + "search": search or "", + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.post("/admin/article-list/update") +async def admin_article_list_update(request: Request): + """Bulk update WP post IDs from the article list form.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + form = await request.form() + updates: list[tuple[int, int | None]] = [] + + # Form fields: wp_ = new value, orig_ = original value 
+ for key, new_val in form.items(): + if not key.startswith("wp_"): + continue + try: + article_id = int(key[3:]) + except ValueError: + continue + orig_val = str(form.get(f"orig_{article_id}", "")).strip() + new_val_s = str(new_val).strip() + if new_val_s == orig_val: + continue # unchanged + new_wp_id = int(new_val_s) if new_val_s else None + updates.append((article_id, new_wp_id)) + + if updates: + count = bulk_update_wp_post_ids(updates) + msg = f"{count} WP-ID(s) aktualisiert. Bitte jetzt WP-Sync ausführen um Slots & URLs zu aktualisieren." + msg_type = "success" + else: + msg = "Keine Änderungen erkannt." + msg_type = "success" + + # Preserve pagination/filter params from referer + page = form.get("page", "1") + status_filter = form.get("status_filter", "") + search = form.get("search", "") + qs: dict[str, str] = {"msg": msg, "type": msg_type, "page": page} + if status_filter: + qs["status_filter"] = status_filter + if search: + qs["search"] = search + return RedirectResponse(url=f"/admin/article-list?{urlencode(qs)}", status_code=303) + + @router.post("/admin/wp-sync") def admin_wp_sync(request: Request): """Sync scheduled_publish_at and WP references in the DB from WordPress.""" diff --git a/backend/app/repositories.py b/backend/app/repositories.py index 9556ed3..cf38055 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -757,6 +757,67 @@ def upsert_article(payload: ArticleUpsert) -> int: return int(existing_id) if existing_id else 0 +def list_articles_page( + limit: int = 50, + offset: int = 0, + status_filter: str | None = None, + search: str | None = None, +) -> tuple[list[dict[str, Any]], int]: + """Return (articles, total_count) with optional status filter and title search.""" + safe_limit = max(1, min(limit, 200)) + safe_offset = max(0, offset) + + conditions: list[str] = [] + params: list[Any] = [] + if status_filter: + conditions.append("a.status = ?") + params.append(status_filter) + if search: + 
conditions.append("(a.title LIKE ? OR a.id = ?)") + try: + params.extend([f"%{search}%", int(search)]) + except ValueError: + params.extend([f"%{search}%", -1]) + + where = f"WHERE {' AND '.join(conditions)}" if conditions else "" + select = """ + SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw, + a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at, + a.word_count, f.name AS feed_name + FROM articles a + LEFT JOIN feeds f ON f.id = a.feed_id + """ + with get_conn() as conn: + total = conn.execute( + f"SELECT COUNT(*) FROM articles a {where}", params + ).fetchone()[0] + rows = conn.execute( + f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?", + params + [safe_limit, safe_offset], + ).fetchall() + return rows_to_dicts(rows), total + + +def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int: + """Update wp_post_id (and clear stale wp_post_url) for multiple articles. + + Returns the number of rows actually updated. + Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and + scheduled_publish_at from the live WordPress data. + """ + if not updates: + return 0 + updated = 0 + with get_conn() as conn: + for article_id, new_wp_id in updates: + conn.execute( + "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?", + (new_wp_id, article_id), + ) + updated += 1 + return updated + + def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]: safe_limit = max(1, min(limit, 500)) with get_conn() as conn: diff --git a/backend/templates/admin_article_list.html b/backend/templates/admin_article_list.html new file mode 100644 index 0000000..38bfb22 --- /dev/null +++ b/backend/templates/admin_article_list.html @@ -0,0 +1,221 @@ + + + + + + {{ title }} + + + + +
        +
        +

        Artikelliste

        +

        Angemeldet als {{ user }}

        +
        +
        + Dashboard + Veröffentlichungsplan +
        + +
        +
        +
        + +
        + {% if flash_msg %} +
        + {{ flash_msg }} +
        + {% endif %} + + +
        +
        +
        +
        + + +
        +
        + + +
        +
        + + Reset +
        +
        +
        +

        {{ total }} Artikel gesamt · Seite {{ page }} / {{ total_pages }} · {{ page_size }} pro Seite

        +
        + + +
        + + + + + + + +
        + + + + + + + + + + + + {% for a in articles %} + + + + + + + + {% endfor %} + +
        BildTitel & KurztextStatusDatumWP ID
        + {% if a.thumb_proxy %} + + Vorschau + + + {% else %} +
        🖼
        + {% endif %} +
        + + {% if a.excerpt %} +
        {{ a.excerpt }}
        + {% endif %} + {% if a.feed_name %} +
        📡 {{ a.feed_name }}
        + {% endif %} +
        + {{ a.status }} + + {% if a.scheduled_publish_at %} + 📅 {{ a.scheduled_publish_at[:16] }} + {% elif a.published_at %} + {{ a.published_at[:10] }} + {% else %} + — + {% endif %} + + + + + {% if a.wp_post_url %} + ↗ WP öffnen + {% endif %} +
        +
        +
        + + + +
        + + + + diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 67738c7..0795b96 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -13,6 +13,7 @@

        Angemeldet als {{ user }}

        + Artikelliste Veröffentlichungsplan Connectivity Check
        From 1498fa715656aa55ae480236330233c0a79c007c Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Mon, 20 Apr 2026 06:20:20 +0000 Subject: [PATCH 52/54] chore: gitignore CLAUDE_CONTEXT.md (contains credentials) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fcbde33..3419386 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ internal/copy_files.sh internal/_line.txt internal/push_commit.txt internal/git.sh +CLAUDE_CONTEXT.md From 2456e4aca7cc9bce08ffb323dc969c00e4f9d9cd Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Mon, 20 Apr 2026 06:24:34 +0000 Subject: [PATCH 53/54] chore: gitignore CLAUDE.md instead of CLAUDE_CONTEXT.md --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3419386..aac3a2f 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,4 @@ internal/copy_files.sh internal/_line.txt internal/push_commit.txt internal/git.sh -CLAUDE_CONTEXT.md +CLAUDE.md From f7101418281dda8af138788d1c968413cece21da Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Fri, 8 May 2026 05:03:13 +0000 Subject: [PATCH 54/54] fix(scheduler): prevent duplicate slot assignment from concurrent pipeline runs Two bugs caused multiple articles to land on the same publish slot: 1. main.py: asyncio.create_task() returned immediately, allowing a second pipeline trigger (N8N + Telegram /run or two N8N calls) to start a second concurrent run. Added asyncio.Lock (_pipeline_lock) so any second trigger while the pipeline is running is rejected immediately. 2. scheduler.py: reserve_publish_slot() read the list of occupied slots and wrote the new slot in two separate DB connections. Concurrent threads could both see the same "free" slot before either committed its write. Fixed by wrapping the entire read-find-write cycle in a threading.Lock (_slot_lock) and a single DB connection, so the slot check and the slot assignment are atomic. 
Co-Authored-By: Claude Sonnet 4.6 --- backend/app/main.py | 22 ++++++--- backend/app/scheduler.py | 103 ++++++++++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/backend/app/main.py b/backend/app/main.py index 264a2be..b4776af 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,8 +1,10 @@ +import asyncio from contextlib import asynccontextmanager import csv from datetime import datetime, timezone import io import json +import logging from pathlib import Path from fastapi import Depends, FastAPI, HTTPException, Request, Response, status @@ -637,20 +639,26 @@ def _require_api_key(request: Request) -> None: raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key") +_pipeline_lock = asyncio.Lock() + + @app.post("/api/n8n/pipeline") async def api_n8n_pipeline(request: Request) -> dict: """Trigger the full auto pipeline in background. Returns immediately. Called by N8N (2x/day or on demand). Results arrive via Telegram.""" _require_api_key(request) - import asyncio - import logging + + if _pipeline_lock.locked(): + logging.getLogger(__name__).warning("Pipeline bereits aktiv – Trigger ignoriert") + return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"} async def _run(): - loop = asyncio.get_event_loop() - try: - await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n")) - except Exception as exc: - logging.getLogger(__name__).error("Background pipeline error: %s", exc) + async with _pipeline_lock: + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n")) + except Exception as exc: + logging.getLogger(__name__).error("Background pipeline error: %s", exc) asyncio.create_task(_run()) return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"} diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index 8f8d498..d5ea5bf 100644 --- 
a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -11,6 +11,7 @@ from __future__ import annotations import base64 import json +import threading import urllib.request from datetime import date, datetime, timedelta, timezone from typing import Any @@ -18,6 +19,9 @@ from typing import Any from .config import get_settings from .db import get_conn +# Ensures that concurrent pipeline runs (two threads) never assign the same slot. +_slot_lock = threading.Lock() + # CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity) _CET_OFFSET = timedelta(hours=1) @@ -256,34 +260,77 @@ def reserve_publish_slot(article_id: int) -> str: If the article already has a scheduled_publish_at, keep it unchanged. Returns the formatted publish datetime string. + + Uses a module-level lock so that concurrent pipeline runs (two threads) + cannot read the same "free" slot and assign it twice. """ - # Check if already has a slot - with get_conn() as conn: - row = conn.execute( - "SELECT scheduled_publish_at FROM articles WHERE id = ?", - (article_id,), - ).fetchone() - existing_slot = row["scheduled_publish_at"] if row else None - if existing_slot: - try: - dt = datetime.fromisoformat(existing_slot) - return _format_slot(dt.date(), dt.hour) - except Exception: - pass # invalid slot, re-assign below - + # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow + # and must not block other threads unnecessarily. wp_occupied = _fetch_wp_occupied_slots() - result = _find_next_free_slot(wp_occupied, lookahead_days=30) - if result: - candidate, hour = result - else: - candidate = _today_cet() + timedelta(days=1) - hours = _preferred_hours() - hour = hours[0] if hours else 9 - iso_ts = f"{candidate.isoformat()}T{hour:02d}:00:00" - with get_conn() as conn: - conn.execute( - "UPDATE articles SET scheduled_publish_at = ? 
WHERE id = ?", - (iso_ts, article_id), - ) - return _format_slot(candidate, hour) + with _slot_lock: + # Single DB connection for the entire read-find-write cycle so the + # slot we pick is still free when we write it. + with get_conn() as conn: + row = conn.execute( + "SELECT scheduled_publish_at FROM articles WHERE id = ?", + (article_id,), + ).fetchone() + existing_slot = row["scheduled_publish_at"] if row else None + if existing_slot: + try: + dt = datetime.fromisoformat(existing_slot) + return _format_slot(dt.date(), dt.hour) + except Exception: + pass # invalid — fall through and assign a fresh slot + + # Find the next free (date, hour) slot using THIS connection so we + # see all slots written during this lock window. + hours = _preferred_hours() + today = _today_cet() + tomorrow = today + timedelta(days=1) + candidate: date | None = None + chosen_hour: int | None = None + + for offset in range(0, 61): + d = tomorrow + timedelta(days=offset) + date_str = d.isoformat() + + rows = conn.execute( + """ + SELECT scheduled_publish_at FROM articles + WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? + AND status NOT IN ('error', 'no_image') + """, + (date_str + "T00:00:00", date_str + "T23:59:59"), + ).fetchall() + + used_hours: set[int] = set() + for r in rows: + ts = r["scheduled_publish_at"] or "" + try: + used_hours.add(datetime.fromisoformat(ts).hour) + except Exception: + pass + for d_str, h in wp_occupied: + if d_str == date_str: + used_hours.add(h) + + for h in hours: + if h not in used_hours: + candidate = d + chosen_hour = h + break + if candidate is not None: + break + + if candidate is None: + candidate = tomorrow + chosen_hour = hours[0] if hours else 9 + + iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00" + conn.execute( + "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?", + (iso_ts, article_id), + ) + return _format_slot(candidate, chosen_hour)