diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 5d55808..af3394f 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,9 +19,16 @@ jobs: username: oliver key: ${{ secrets.HETZNER_SSH_KEY }} port: 22 + envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD script: | - cd rss-news + cd /opt/rss-news git pull origin main source .venv/bin/activate pip install -r requirements.txt - sudo systemctl restart rss-app + pip install -r backend/requirements.txt || true + sudo systemctl restart rss-news-api + sleep 3 + BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh + env: + APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }} + APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..1d627db --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,39 @@ +name: Backend Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + backend-tests: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r backend/requirements.txt + pip install -r backend/requirements-test.txt + + - name: Run tests with coverage + env: + APP_DB_PATH: /tmp/rss_news_test.db + run: | + pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + with: + name: coverage-xml + path: coverage.xml diff --git a/.gitignore b/.gitignore index fcbde33..aac3a2f 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ internal/copy_files.sh internal/_line.txt internal/push_commit.txt internal/git.sh +CLAUDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index fa80967..66b7237 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,42 @@ -## [1.7.1] - 2025-08-28 +## [1.7.1] - 2025-08-24 -- Beschreibung... +### ✨ Security angepasst + - alle Credentials in die .env Datei verschoben + - beim Start der App werden die Credentials geprüft und beim fehlen entsprechende Meldungen ausgegeben + +--- ## [1.7.0] - 2025-08-24 -- Beschreibung... +### Multi-Select & Massenoperationen: + - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich + - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons + - ✅ Massenoperationen für ausgewählte Artikel: + - Bulk Status-Änderung für mehrere Artikel gleichzeitig + - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung + - Bulk WordPress-Upload nur für "Process"-Artikel + - Bulk Papierkorb-Funktion + +### Schnellaktionen Integration: + - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar + - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert + - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln) + +### 🔧 Verbesserungen + + - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration + - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig + - Feedback: Detaillierte Statusmeldungen bei Massenoperationen + - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl + +### 🏗️ Technische Änderungen + + - Session State Erweiterung um selected_articles Set + - Neue Bulk-Operation-Funktionen in app.py:326-467 + - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design + - Integration bestehender WordPress-Upload und Rewrite-Funktionen + +--- ## [1.6.3] - 2025-08-18 diff --git a/README.md b/README.md index 6846a41..b3c2b4a 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,63 @@ -# 📰 RSS News Bot +# rss-news (Rebuild) -Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung. +`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut. -![Version](https://img.shields.io/badge/version-1.5.2-blue) -![License](https://img.shields.io/badge/license-MIT-green) -![Python](https://img.shields.io/badge/python-3.10+-yellow) -![Streamlit](https://img.shields.io/badge/built%20with-Streamlit-ff4b4b) +Aktueller Stand: +- Alte Streamlit-App wird nicht produktiv genutzt. +- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet. +- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt. ---- +## Ziele +- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln +- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen) +- Zuverlaessige Automatisierung auf Hetzner +- Publikation nach WordPress (IONOS aktuell, spaeter offen) +- Zugriff nur nach Login (zunaechst User/Password) -## 🚀 Features +## Architektur-Richtung (MVP) +- Backend: `Python + FastAPI` +- Jobs: Queue-Worker (z. B. Redis + RQ/Celery) +- Daten: SQLite fuer MVP, spaeter optional PostgreSQL +- Auth: Session-Login mit einem Admin-User +- Publishing: WordPress REST API (Status zunaechst `pending`) -- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren) -- ✍️ **Artikel automatisch umschreiben** mit GPT-4 -- 🏷️ **Tags automatisch generieren** -- 🖼️ **Bilder aus Originalartikeln extrahieren** -- 🪄 **Optionales DALL·E-Bild generieren** -- 🔧 **Bearbeiten von Bildmetadaten** -- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)** -- 📜 **Log-Viewer-Seite integriert** -- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet** -- 📋 Artikeltabelle mit Status-Filter -- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern -- 🪄 Button für KI-Bildgenerierung +Details: `docs/PROJECT_PLAN.md` +## Projektsteuerung +- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1` +- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen. +- Wiki-Struktur liegt unter `docs/wiki/`. ---- +## Dokumentation +- Projektplan: `docs/PROJECT_PLAN.md` +- ToDo-Liste: `docs/TODO.md` +- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md` +- Wiki Home: `docs/wiki/Home.md` -## 🧱 Projektstruktur - -ss-news/ -├── app.py # Haupt-UI mit Streamlit -├── main.py # Logik für Feed-Import und Verarbeitung -├── utils/ -│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren -│ └── dalle_generator.py # DALL·E-Integration (KI-Bild) -├── pages/ -│ └── log_viewer.py # UI zur Anzeige der Logs -├── data/ -│ └── articles.json # Gespeicherte Artikel -│ └── feeds.json # Gespeicherte Feed-URLs -├── logs/ -│ └── rss_tool.log # Logging der Verarbeitung -├── versioning.py # CLI-Tool zur Versionierung & Release -├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases -├── version.py # Aktuelle Version -└── CHANGELOG.md # Änderungsprotokoll - - ---- - -## ⚙️ Installation +## Lokale Entwicklung (Legacy-Code) +Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden: ```bash -git clone https://github.com/OliverGiertz/rss-news.git -cd rss-news python -m venv .venv source .venv/bin/activate pip install -r requirements.txt -``` - ---- - -## Update -Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca - -```bash -bash update.sh -``` - - -## ▶️ Starten der App - streamlit run app.py +``` ---- +Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt. -## 🔐 Konfiguration (.env) +## Deployment-Zielbild +- Betrieb auf Hetzner +- Reverse Proxy via CloudPanel/Nginx +- Produktive Domain: `news.vanityontour.de` +- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de` -Lege eine `.env` im Projekt an (siehe `.env.example`). Erforderliche Variablen: +## Sicherheit +- Keine Secrets im Repository +- `.env` lokal/auf Server, nie committen +- Auth-Pflicht fuer die neue WebApp +- spaeter optional: Passkeys/WebAuthn -- `WP_BASE_URL`: Basis-URL deiner WordPress-Seite (z. B. https://example.com) -- Authentifizierung (eine Option wählen): - - `WP_AUTH_BASE64`: Bevorzugt. Base64 von `username:application_password` - - oder `WP_USERNAME` und `WP_PASSWORD`: Benutzer + Anwendungspasswort -- Optional: `OPENAI_API_KEY` für das Umschreiben von Artikeln +## Rechtlicher Hinweis +Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig. -Hinweis: Der Code liest ausschließlich aus `.env`. Es gibt keine hartkodierten Standard-Credentials. diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..c2dd235 --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,45 @@ +# ─── App ──────────────────────────────────────────────────────────────────── +APP_ENV=development +APP_NAME=rss-news-backend +APP_SECRET_KEY=replace-with-a-long-random-secret +APP_DB_PATH=backend/data/rss_news.db + +APP_ADMIN_USERNAME=admin +APP_ADMIN_PASSWORD=change-me + +SESSION_COOKIE_NAME=rss_news_session +SESSION_MAX_AGE_SECONDS=28800 + +# ─── WordPress ────────────────────────────────────────────────────────────── +WP_BASE_URL=https://your-site.tld +WP_USERNAME=your-wp-username +WP_PASSWORD=your-wp-app-password +# Status für neue Beiträge: draft | future | publish +WORDPRESS_DEFAULT_STATUS=draft + +# ─── OpenAI ───────────────────────────────────────────────────────────────── +OPENAI_API_KEY=sk-... +# gpt-4o-mini empfohlen (Kosten/Qualität) +OPENAI_MODEL=gpt-4o-mini + +# ─── Telegram Bot ──────────────────────────────────────────────────────────── +# Bot-Token von @BotFather +TELEGRAM_BOT_TOKEN=123456789:ABC... +# Chat-ID deines persönlichen Chats oder einer Gruppe +TELEGRAM_CHAT_ID=123456789 +# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen) +TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars + +# ─── N8N API-Key ───────────────────────────────────────────────────────────── +# Wird von N8N im Header X-API-Key mitgeschickt +N8N_API_KEY=replace-with-strong-random-key + +# ─── Pipeline-Einstellungen ────────────────────────────────────────────────── +# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100) +PIPELINE_RELEVANCE_AUTO=80 +# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden +PIPELINE_RELEVANCE_WARN=60 +# Maximale Drafts/Veröffentlichungen pro Tag +PIPELINE_MAX_DRAFTS_PER_DAY=2 +# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET) +PIPELINE_PUBLISH_HOURS=9,14 diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..7d64a65 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,82 @@ +# Backend Skeleton (FastAPI) + +Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`. + +## Start (lokal) + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r backend/requirements.txt +uvicorn backend.app.main:app --reload --port 8501 +``` + +## Admin UI +- Login: `http://127.0.0.1:8501/admin/login` +- Dashboard: `http://127.0.0.1:8501/admin/dashboard` + +## Environment +- Datei: `backend/.env` +- Vorlage: `backend/.env.example` + +## Endpoints +- `GET /health` - Healthcheck +- `POST /auth/login` - Login mit Admin-User +- `POST /auth/logout` - Logout +- `GET /auth/me` - Aktiver User +- `GET /api/protected` - Geschuetzter Test-Endpoint +- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler +- `GET /api/sources` - Quellenliste +- `POST /api/sources` - Quelle anlegen +- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle +- `GET /api/feeds` - Feedliste +- `POST /api/feeds` - Feed anlegen +- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed +- `GET /api/runs` - Import-/Job-Runs anzeigen +- `GET /api/runs/{run_id}` - Detailansicht eines Runs +- `POST /api/runs` - Run starten +- `POST /api/runs/{run_id}/finish` - Run abschliessen +- `GET /api/articles` - Artikel anzeigen +- `GET /api/articles/{article_id}` - Artikeldetail +- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren +- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln +- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject) +- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed) + +## Datenbank +- SQLite-Datei unter `backend/data/rss_news.db` +- Tabellen werden beim App-Start initialisiert. +- Tabellen: `sources`, `feeds`, `runs`, `articles` +- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash` + +## Policy-Enforcement +- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist. +- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`. +- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert. + +## Review-Workflow +- Statuskette: `new -> review -> approved -> published` +- Ablehnung im Review setzt auf `rewrite` +- Ungueltige Statuswechsel werden per API blockiert + +## Verifikation +```bash +python -m unittest backend.tests.test_db_repositories +python -m unittest backend.tests.test_ingestion +python -m unittest backend.tests.test_api_auth +``` + +## CI / Online-Auswertung +- GitHub Actions Workflow: `.github/workflows/test.yml` +- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus. + +## Hetzner Smoketest +```bash +BASE_URL="https://news.vanityontour.de" \ +APP_ADMIN_USERNAME="admin" \ +APP_ADMIN_PASSWORD="..." \ +bash scripts/smoke_backend.sh +``` + +## Hinweis +Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen. diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..3623851 --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1 @@ +"""Backend package for rss-news rebuild.""" diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..18b665e --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1 @@ +"""Application package.""" diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py new file mode 100644 index 0000000..a25199c --- /dev/null +++ b/backend/app/admin_ui.py @@ -0,0 +1,1126 @@ +from __future__ import annotations + +import json +from pathlib import Path +import re +import socket +import ssl +import time +from urllib.parse import urlparse +from urllib.parse import urlencode +from urllib.request import Request as UrlRequest, urlopen + +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse, RedirectResponse, Response +from fastapi.templating import Jinja2Templates + +from .auth import create_session_token, verify_credentials, verify_session_token +from .config import get_settings +from .ingestion import run_ingestion +from .policy import evaluate_source_policy +from .publisher import enqueue_publish, run_publisher +from .relevance import article_age_days, article_relevance +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text +from .repositories import ( + FeedCreate, + FeedUpdate, + SourceCreate, + SourceUpdate, + delete_feed, + delete_source, + create_feed, + create_source, + get_article_by_id, + get_feed_by_id, + list_articles, + list_articles_page, + bulk_update_wp_post_ids, + list_feeds, + list_publish_jobs, + list_runs, + list_sources, + set_article_image_decision, + upsert_article, + update_feed, + update_source, + update_article_status, + ArticleUpsert, +) +from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status + +settings = get_settings() +router = APIRouter(tags=["admin-ui"]) +templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates")) +ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { + "new": ("rewrite", "close"), + "rewrite": ("publish", "close"), + "publish": ("published", "close"), + "published": ("rewrite", "close"), + "close": ("rewrite",), +} +IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0" +_UNSET = object() + + +def _admin_user(request: Request) -> str | None: + token = request.cookies.get(settings.session_cookie_name) + if not token: + return None + return verify_session_token(token) + + +def _to_optional_int(raw: str | None) -> int | None: + if raw is None: + return None + value = raw.strip() + if value == "": + return None + return int(value) + + +def _dashboard_redirect( + *, + msg: str | None = None, + msg_type: str = "success", + status_filter: str | None = None, +) -> RedirectResponse: + query: dict[str, str] = {} + if msg: + query["msg"] = msg + query["type"] = msg_type + if status_filter: + query["status_filter"] = status_filter + suffix = f"?{urlencode(query)}" if query else "" + return RedirectResponse(url=f"/admin/dashboard{suffix}", status_code=303) + + +def _parse_meta_json(raw: str | None) -> dict: + if not raw: + return {} + try: + parsed = json.loads(raw) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + + +def _read_article_images(article: dict, extraction: dict) -> list[str]: + images: list[str] = [] + if article.get("image_urls_json"): + try: + parsed_images = json.loads(article["image_urls_json"]) + if isinstance(parsed_images, list): + images = [str(item) for item in parsed_images if item] + except Exception: + images = [] + if not images and isinstance(extraction.get("images"), list): + images = [str(item) for item in extraction.get("images") if item] + # deduplicate preserving order + seen: set[str] = set() + deduped: list[str] = [] + for image in images: + if image not in seen: + seen.add(image) + deduped.append(image) + return deduped + + +def _is_probably_irrelevant_image(url: str) -> bool: + lowered = url.lower() + patterns = ( + r"logo", + r"icon", + r"sprite", + r"avatar", + r"favicon", + r"/ads/", + r"tracking", + r"pixel", + r"banner", + ) + return any(re.search(pattern, lowered) for pattern in patterns) + + +def _is_http_image_url(url: str) -> bool: + try: + parsed = urlparse(url) + except Exception: + return False + return parsed.scheme in {"http", "https"} and bool(parsed.netloc) + + +def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]: + all_images = _read_article_images(article, extraction) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + excluded_urls = image_review.get("excluded_urls") if isinstance(image_review.get("excluded_urls"), list) else [] + excluded_set = {str(item) for item in excluded_urls if item} + + entries: list[dict[str, object]] = [] + for url in all_images: + entries.append( + { + "url": url, + "proxy_url": f"/admin/images/proxy?{urlencode({'url': url})}", + "is_selected": selected_url == url, + "is_excluded": url in excluded_set, + "is_irrelevant_hint": _is_probably_irrelevant_image(url), + } + ) + return entries + + +def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]: + reasons: list[str] = [] + if internal_to_ui_status(article.get("status")) not in {"publish", "published"}: + reasons.append("Status ist nicht 'publish'") + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + if not selected_image: + reasons.append("Hauptbild nicht ausgewählt") + return len(reasons) == 0, reasons + + +def _classify_publish_error(error_message: str | None) -> tuple[str, str]: + text = (error_message or "").lower() + if not text.strip(): + return "ok", "-" + if "rechtsfreigabe fehlt" in text or "hauptbild nicht gesetzt" in text or "status ist nicht" in text: + return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Hauptbild)." + if "401" in text or "403" in text or "authorization" in text or "forbidden" in text or "unauthorized" in text: + return "auth", "WordPress Nutzer/App-Passwort prüfen." + if "404" in text and ("media" in text or "posts" in text or "wp-json" in text): + return "api", "WordPress REST-Endpunkt prüfen (`/wp-json/wp/v2`)." + if "timed out" in text or "timeout" in text or "nodename nor servname provided" in text or "name or service not known" in text: + return "dns", "DNS/Netzwerk zur WordPress-Domain prüfen." + if "media-upload fehlgeschlagen" in text or "liefert kein bild" in text or "featured_media" in text: + return "media", "Bild-URL/Format prüfen oder anderes Hauptbild auswählen." + return "unknown", "Fehlerdetails prüfen und bei Bedarf Job erneut starten." + + +def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: + meta = article.get("meta", {}) + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + attribution = meta.get("attribution") if isinstance(meta.get("attribution"), dict) else {} + + checks: list[dict[str, str]] = [] + checks.append( + { + "label": "Original-Link vorhanden", + "status": "ok" if article.get("source_url") else "missing", + "value": article.get("source_url") or "-", + } + ) + checks.append( + { + "label": "Autor vorhanden", + "status": "ok" if article.get("author") else "missing", + "value": article.get("author") or "-", + } + ) + checks.append( + { + "label": "Bilder extrahiert", + "status": "ok" if article.get("image_urls_json") else "missing", + "value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0", + } + ) + checks.append( + { + "label": "Pressekontakt", + "status": "ok" if article.get("press_contact") else "missing", + "value": article.get("press_contact") or extraction.get("press_contact") or "-", + } + ) + checks.append( + { + "label": "Lizenz/Terms", + "status": "ok" if article.get("source_license_name_snapshot") and article.get("source_terms_url_snapshot") else "missing", + "value": f"{article.get('source_license_name_snapshot') or attribution.get('source_license_name') or '-'} | {article.get('source_terms_url_snapshot') or attribution.get('source_terms_url') or '-'}", + } + ) + checks.append( + { + "label": "Risiko-Status Quelle", + "status": "ok" if (feed and feed.get("source_risk_level") == "green") else "missing", + "value": feed.get("source_risk_level") if feed else "-", + } + ) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + checks.append( + { + "label": "Hauptbild ausgewählt", + "status": "ok" if selected_image else "missing", + "value": selected_image or "-", + } + ) + return checks + + +def _build_connectivity_targets() -> list[dict[str, str]]: + targets: list[dict[str, str]] = [] + seen: set[tuple[str, str]] = set() + + def add_target(label: str, kind: str, value: str) -> None: + normalized = (value or "").strip() + if not normalized: + return + key = (kind, normalized.lower()) + if key in seen: + return + seen.add(key) + targets.append({"label": label, "kind": kind, "value": normalized}) + + add_target("OpenAI API", "host", "api.openai.com") + if settings.wordpress_base_url: + parsed = urlparse(settings.wordpress_base_url) + if parsed.hostname: + add_target("WordPress Host", "host", parsed.hostname) + wp_api_url = f"{settings.wordpress_base_url.rstrip('/')}/wp-json/wp/v2" + add_target("WordPress REST", "url", wp_api_url) + + for feed in list_feeds(): + name = (feed.get("name") or "").strip() or f"Feed #{feed.get('id')}" + feed_url = str(feed.get("url") or "").strip() + if not feed_url: + continue + parsed = urlparse(feed_url) + if parsed.hostname: + add_target(f"{name} (Feed)", "host", parsed.hostname) + add_target(f"{name} (Feed URL)", "url", feed_url) + + return targets + + +def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]: + kind = target.get("kind", "") + value = str(target.get("value") or "") + row: dict[str, object] = { + "label": target.get("label") or "-", + "kind": kind, + "target": value, + "dns_ok": False, + "dns_info": "-", + "tcp_ok": False, + "tcp_info": "-", + "http_ok": False, + "http_info": "-", + "duration_ms": 0, + "ok": False, + } + started = time.perf_counter() + try: + hostname = value if kind == "host" else (urlparse(value).hostname or "") + port = 443 + if kind == "url": + parsed = urlparse(value) + if parsed.scheme not in {"http", "https"}: + row["http_info"] = f"unsupported scheme: {parsed.scheme or '-'}" + return row + port = 443 if parsed.scheme == "https" else 80 + if not hostname: + row["dns_info"] = "host fehlt" + return row + + try: + addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) + ips = sorted({entry[4][0] for entry in addr_info if entry and len(entry) > 4 and entry[4]}) + row["dns_ok"] = True + row["dns_info"] = ", ".join(ips[:3]) if ips else "resolved" + except Exception as exc: + row["dns_info"] = str(exc) + return row + + try: + socket.create_connection((hostname, port), timeout=4).close() + row["tcp_ok"] = True + row["tcp_info"] = f"port {port} erreichbar" + except Exception as exc: + row["tcp_info"] = str(exc) + return row + + if kind == "host": + row["http_ok"] = True + row["http_info"] = "n/a (host-only)" + row["ok"] = True + return row + + try: + req = UrlRequest( + url=value, + headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Accept": "*/*"}, + ) + with urlopen(req, timeout=6, context=ssl.create_default_context()) as resp: + code = getattr(resp, "status", None) or resp.getcode() + row["http_ok"] = True + row["http_info"] = f"HTTP {code}" + except Exception as exc: + row["http_info"] = str(exc) + return row + + row["ok"] = bool(row["dns_ok"] and row["tcp_ok"] and row["http_ok"]) + return row + finally: + row["duration_ms"] = int((time.perf_counter() - started) * 1000) + + +def _upsert_article_from_existing( + article: dict, + *, + content_rewritten: str | None = None, + status: str | None = None, + wp_post_id: int | None | object = _UNSET, + wp_post_url: str | None | object = _UNSET, + publish_attempts: int | object = _UNSET, + publish_last_error: str | None | object = _UNSET, + published_to_wp_at: str | None | object = _UNSET, + meta_json: str | None | object = _UNSET, +) -> None: + rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten + upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title"), + source_url=article.get("source_url"), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id") if wp_post_id is _UNSET else wp_post_id, + wp_post_url=article.get("wp_post_url") if wp_post_url is _UNSET else wp_post_url, + publish_attempts=int(article.get("publish_attempts", 0)) if publish_attempts is _UNSET else publish_attempts, + publish_last_error=article.get("publish_last_error") if publish_last_error is _UNSET else publish_last_error, + published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at, + word_count=len(str(rewritten or "").split()), + status=article.get("status") if status is None else status, + meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json, + ) + ) + + +@router.get("/admin", response_class=HTMLResponse) +def admin_index(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + return RedirectResponse(url="/admin/dashboard", status_code=303) + + +@router.get("/admin/login", response_class=HTMLResponse) +def admin_login_page(request: Request): + return templates.TemplateResponse( + request, + "admin_login.html", + {"request": request, "title": "Admin Login", "error": request.query_params.get("error")}, + ) + + +@router.post("/admin/login") +def admin_login(request: Request, username: str = Form(...), password: str = Form(...)): + if not verify_credentials(username, password): + return RedirectResponse(url="/admin/login?error=1", status_code=303) + + token = create_session_token(username) + response = RedirectResponse(url="/admin/dashboard", status_code=303) + response.set_cookie( + key=settings.session_cookie_name, + value=token, + max_age=settings.session_max_age_seconds, + httponly=True, + secure=False, + samesite="lax", + ) + return response + + +@router.post("/admin/logout") +def admin_logout(): + response = RedirectResponse(url="/admin/login", status_code=303) + response.delete_cookie(settings.session_cookie_name) + return response + + +@router.get("/admin/dashboard", response_class=HTMLResponse) +def admin_dashboard(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + sources = list_sources() + source_policy = {s["id"]: evaluate_source_policy(s) for s in sources} + feeds = list_feeds() + runs = list_runs(limit=30) + publish_jobs = list_publish_jobs(limit=30) + for job in publish_jobs: + category, hint = _classify_publish_error(job.get("error_message")) + job["error_category"] = category + job["error_hint"] = hint + status_filter = request.query_params.get("status_filter") + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + if status_filter in set(UI_STATUSES): + articles = list_articles(limit=100, status_filter=internal_filter) + else: + status_filter = "" + articles = [a for a in list_articles(limit=250) if internal_to_ui_status(a.get("status")) != "close"][:100] + for article in articles: + meta = _parse_meta_json(article.get("meta_json")) + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + images = _read_article_images(article, extraction) + article["meta"] = meta + ready, reasons = _publish_readiness(article, meta) + article["publish_ready"] = ready + article["publish_blockers"] = reasons + article["extracted_images"] = images + article["image_entries"] = _build_image_entries(article, extraction, meta) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + article["selected_image_proxy_url"] = ( + f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None + ) + if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): + article["press_contact"] = extraction.get("press_contact") + article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None + article["days_old"] = article_age_days(article.get("published_at")) + article["relevance"] = article_relevance(article.get("published_at")) + article["status_ui"] = internal_to_ui_status(article.get("status")) + tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else [] + article["generated_tags"] = [str(t) for t in tags if t] + + return templates.TemplateResponse( + request, + "admin_dashboard.html", + { + "request": request, + "title": "Admin Dashboard", + "user": user, + "sources": sources, + "source_policy": source_policy, + "feeds": feeds, + "runs": runs, + "publish_jobs": publish_jobs, + "articles": articles, + "status_options": list(UI_STATUSES), + "allowed_transitions": ALLOWED_TRANSITIONS, + "status_filter": status_filter, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.get("/admin/connectivity", response_class=HTMLResponse) +def admin_connectivity(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + checks = [_run_connectivity_check(target) for target in _build_connectivity_targets()] + ok_count = len([c for c in checks if c.get("ok")]) + error_count = len(checks) - ok_count + return templates.TemplateResponse( + request, + "admin_connectivity.html", + { + "request": request, + "title": "Connectivity Check", + "user": user, + "checks": checks, + "ok_count": ok_count, + "error_count": error_count, + }, + ) + + +@router.get("/admin/articles/{article_id}", response_class=HTMLResponse) +def admin_article_detail(request: Request, article_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + + meta = _parse_meta_json(article.get("meta_json")) + article["meta"] = meta + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + extraction["images"] = _read_article_images(article, extraction) + if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): + article["press_contact"] = extraction.get("press_contact") + article["extraction"] = extraction + publish_ready, publish_blockers = _publish_readiness(article, meta) + article["publish_ready"] = publish_ready + article["publish_blockers"] = publish_blockers + article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {} + article["image_entries"] = _build_image_entries(article, extraction, meta) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + article["selected_image_proxy_url"] = ( + f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None + ) + article["days_old"] = article_age_days(article.get("published_at")) + article["relevance"] = article_relevance(article.get("published_at")) + article["status_ui"] = internal_to_ui_status(article.get("status")) + feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None + checklist = _legal_checklist(article, feed) + + return templates.TemplateResponse( + request, + "admin_article_detail.html", + { + "request": request, + "title": f"Artikel #{article_id}", + "user": user, + "article": article, + "feed": feed, + "checklist": checklist, + "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status_ui"), ()), + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.post("/admin/articles/{article_id}/images/decision") +def admin_article_image_decision( + request: Request, + article_id: int, + image_url: str = Form(...), + action: str = Form(...), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + ok = set_article_image_decision(article_id=article_id, image_url=image_url, action=action, actor=user) + if not ok: + return _dashboard_redirect(msg=f"Bildaktion fehlgeschlagen fuer Artikel #{article_id}", msg_type="error") + return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) + + +@router.post("/admin/articles/{article_id}/publish-enqueue") +def admin_enqueue_publish(request: Request, article_id: int, max_attempts: str = Form("3")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + job_id = enqueue_publish(article_id=article_id, max_attempts=max(1, int(max_attempts))) + except Exception as exc: + return _dashboard_redirect(msg=f"Publish Queue Fehler fuer Artikel #{article_id}: {exc}", msg_type="error") + return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Publish-Job%20#{job_id}%20erstellt&type=success", status_code=303) + + +@router.post("/admin/publisher/run") +def admin_run_publisher(request: Request, max_jobs: str = Form("10")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + stats = run_publisher(max_jobs=max(1, int(max_jobs))) + except Exception as exc: + return _dashboard_redirect(msg=f"Publisher Fehler: {exc}", msg_type="error") + return _dashboard_redirect( + msg=f"Publisher: processed={stats.processed}, success={stats.success}, failed={stats.failed}, requeued={stats.requeued}" + ) + + +@router.get("/admin/images/proxy") +def admin_image_proxy(request: Request, url: str): + if not _is_http_image_url(url): + return Response(status_code=400) + + try: + referer = request.headers.get("referer", "") + req = UrlRequest( + url=url, + headers={ + "User-Agent": IMAGE_PROXY_USER_AGENT, + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + "Referer": referer or url, + }, + ) + with urlopen(req, timeout=10) as resp: + body = resp.read() + content_type = resp.headers.get("Content-Type", "application/octet-stream") + except Exception: + return Response(status_code=404) + + if not content_type.lower().startswith("image/"): + return Response(status_code=415) + return Response(content=body, media_type=content_type) + + +@router.post("/admin/sources/create") +def admin_create_source( + request: Request, + name: str = Form(...), + base_url: str = Form(""), + terms_url: str = Form(""), + license_name: str = Form(""), + risk_level: str = Form("yellow"), + last_reviewed_at: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_source( + SourceCreate( + name=name, + base_url=base_url or None, + terms_url=terms_url or None, + license_name=license_name or None, + risk_level=risk_level, + is_enabled=True, + notes=None, + last_reviewed_at=last_reviewed_at or None, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Quelle gespeichert") + + +@router.post("/admin/sources/{source_id}/update") +def admin_update_source( + request: Request, + source_id: int, + name: str = Form(...), + base_url: str = Form(""), + terms_url: str = Form(""), + license_name: str = Form(""), + risk_level: str = Form("yellow"), + is_enabled: str = Form("1"), + notes: str = Form(""), + last_reviewed_at: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + ok = update_source( + source_id, + SourceUpdate( + name=name, + base_url=base_url or None, + terms_url=terms_url or None, + license_name=license_name or None, + risk_level=risk_level, + is_enabled=is_enabled == "1", + notes=notes or None, + last_reviewed_at=last_reviewed_at or None, + ), + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Quelle #{source_id} Update fehlgeschlagen: {exc}", msg_type="error") + if not ok: + return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Quelle #{source_id} aktualisiert") + + +@router.post("/admin/sources/{source_id}/delete") +def admin_delete_source(request: Request, source_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + ok = delete_source(source_id) + if not ok: + return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Quelle #{source_id} gelöscht") + + +@router.post("/admin/feeds/create") +def admin_create_feed( + request: Request, + name: str = Form(...), + url: str = Form(...), + source_id: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_feed( + FeedCreate( + name=name, + url=url, + source_id=_to_optional_int(source_id), + is_enabled=True, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Feed gespeichert") + + +@router.post("/admin/feeds/{feed_id}/update") +def admin_update_feed( + request: Request, + feed_id: int, + name: str = Form(...), + url: str = Form(...), + source_id: str = Form(""), + is_enabled: str = Form("1"), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + ok = update_feed( + feed_id, + FeedUpdate( + name=name, + url=url, + source_id=_to_optional_int(source_id), + is_enabled=is_enabled == "1", + ), + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Feed #{feed_id} Update fehlgeschlagen: {exc}", msg_type="error") + if not ok: + return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Feed #{feed_id} aktualisiert") + + +@router.post("/admin/feeds/{feed_id}/delete") +def admin_delete_feed(request: Request, feed_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + ok = delete_feed(feed_id) + if not ok: + return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") + return _dashboard_redirect(msg=f"Feed #{feed_id} gelöscht") + + +@router.post("/admin/ingestion/run") +def admin_run_ingestion(request: Request, feed_id: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + stats = run_ingestion(feed_id=_to_optional_int(feed_id)) + except Exception as exc: + return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error") + return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}") + + +@router.post("/admin/articles/{article_id}/review") +def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + return _dashboard_redirect(msg="Review-Aktion wurde durch Rewrite ersetzt", msg_type="error") + + +@router.post("/admin/articles/{article_id}/rewrite-run") +def admin_rewrite_run(request: Request, article_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + if internal_to_ui_status(article.get("status")) not in {"new", "rewrite"}: + return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error") + try: + rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception as exc: + return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) + return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") + + +@router.post("/admin/rewrite/run") +def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + limit = max(1, min(int(max_jobs), 100)) + except Exception: + limit = 10 + planned = list_articles(limit=limit, status_filter="rewrite") + processed = 0 + success = 0 + failed = 0 + for article in planned: + processed += 1 + try: + rewritten = rewrite_article_text(article) + tags = generate_article_tags(article, rewritten_text=rewritten) + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) + success += 1 + except Exception: + failed += 1 + return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}") + + +@router.post("/admin/articles/{article_id}/rewrite-save") +def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + text = (content_rewritten or "").strip() + if not text: + return RedirectResponse( + url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20darf%20nicht%20leer%20sein&type=error", + status_code=303, + ) + _upsert_article_from_existing(article, content_rewritten=text) + return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20gespeichert&type=success", status_code=303) + + +@router.post("/admin/articles/{article_id}/reopen") +def admin_reopen_article(request: Request, article_id: int): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + _upsert_article_from_existing( + article, + status="rewrite", + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + ) + return RedirectResponse( + url=f"/admin/articles/{article_id}?msg=Artikel%20zurueck%20in%20Rewrite-Workflow%20gesetzt&type=success", + status_code=303, + ) + + +@router.post("/admin/articles/{article_id}/transition") +def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if article: + current_ui = internal_to_ui_status(article.get("status")) + target_internal = ui_to_internal_status(target_status) + target_ui = internal_to_ui_status(target_internal) + if target_ui in ALLOWED_TRANSITIONS.get(current_ui, ()): + update_article_status(article_id, target_internal, actor=user, note=note or None) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}") + return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") + + +_PAGE_SIZE = 50 + + +@router.get("/admin/article-list", response_class=HTMLResponse) +def admin_article_list(request: Request): + """Paginated article list with inline WP ID editing.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + page = max(1, int(request.query_params.get("page", 1))) + status_filter = request.query_params.get("status_filter", "") or None + search = request.query_params.get("search", "").strip() or None + offset = (page - 1) * _PAGE_SIZE + + articles, total = list_articles_page( + limit=_PAGE_SIZE, offset=offset, + status_filter=status_filter, search=search, + ) + + # Enrich each article with thumbnail URL + for a in articles: + meta = _parse_meta_json(a.get("meta_json")) + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + sel = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + if not sel: + sel = (meta.get("extraction") or {}).get("image_selection", {}).get("primary") + a["thumb_url"] = sel + a["thumb_proxy"] = f"/admin/images/proxy?{urlencode({'url': sel})}" if sel else None + raw = (a.get("content_raw") or a.get("summary") or "").strip() + a["excerpt"] = raw[:120] + "…" if len(raw) > 120 else raw + + total_pages = max(1, (total + _PAGE_SIZE - 1) // _PAGE_SIZE) + + return templates.TemplateResponse( + request, + "admin_article_list.html", + { + "request": request, + "title": "Artikelliste", + "user": user, + "articles": articles, + "page": page, + "total_pages": total_pages, + "total": total, + "page_size": _PAGE_SIZE, + "status_filter": status_filter or "", + "search": search or "", + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.post("/admin/article-list/update") +async def admin_article_list_update(request: Request): + """Bulk update WP post IDs from the article list form.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + form = await request.form() + updates: list[tuple[int, int | None]] = [] + + # Form fields: wp_ = new value, orig_ = original value + for key, new_val in form.items(): + if not key.startswith("wp_"): + continue + try: + article_id = int(key[3:]) + except ValueError: + continue + orig_val = str(form.get(f"orig_{article_id}", "")).strip() + new_val_s = str(new_val).strip() + if new_val_s == orig_val: + continue # unchanged + new_wp_id = int(new_val_s) if new_val_s else None + updates.append((article_id, new_wp_id)) + + if updates: + count = bulk_update_wp_post_ids(updates) + msg = f"{count} WP-ID(s) aktualisiert. Bitte jetzt WP-Sync ausführen um Slots & URLs zu aktualisieren." + msg_type = "success" + else: + msg = "Keine Änderungen erkannt." + msg_type = "success" + + # Preserve pagination/filter params from referer + page = form.get("page", "1") + status_filter = form.get("status_filter", "") + search = form.get("search", "") + qs: dict[str, str] = {"msg": msg, "type": msg_type, "page": page} + if status_filter: + qs["status_filter"] = status_filter + if search: + qs["search"] = search + return RedirectResponse(url=f"/admin/article-list?{urlencode(qs)}", status_code=303) + + +@router.post("/admin/wp-sync") +def admin_wp_sync(request: Request): + """Sync scheduled_publish_at and WP references in the DB from WordPress.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + from .wordpress import sync_db_from_wordpress + stats = sync_db_from_wordpress() + msg = ( + f"WP-Sync abgeschlossen: " + f"{stats['slot_updated']} Slots aktualisiert, " + f"{stats['slot_cleared_draft']} Slots geleert (Draft), " + f"{stats['marked_published']} als veröffentlicht markiert, " + f"{stats['wp_reference_cleared']} WP-Referenzen gelöscht (Papierkorb), " + f"{stats['already_in_sync']} bereits synchron." + ) + return RedirectResponse(url=f"/admin/schedule?msg={msg}&type=success", status_code=303) + except Exception as exc: + return RedirectResponse(url=f"/admin/schedule?msg=Sync fehlgeschlagen: {exc}&type=error", status_code=303) + + +@router.post("/admin/articles/{article_id}/retry") +def admin_retry_article(request: Request, article_id: int): + """Reset a failed article to 'new' so the pipeline picks it up on next run.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + + from .scheduler import release_publish_slot + release_publish_slot(article_id) + update_article_status(article_id, "new", actor=user, note="Manuell zurückgesetzt für erneuten Pipeline-Versuch") + return _dashboard_redirect( + msg=f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet", + status_filter="close", + ) + + +@router.get("/admin/schedule", response_class=HTMLResponse) +def admin_schedule(request: Request): + """Schedule overview: all booked slots from DB and WordPress.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + from .scheduler import get_schedule_overview, _preferred_hours, _today_cet + from datetime import timedelta + + slots = get_schedule_overview(lookahead_days=60) + today = _today_cet() + hours = _preferred_hours() + + # Build a calendar grid: for each day in the next 60 days, show each preferred hour slot + booked: dict[tuple[str, int], dict] = {(s["date"], s["hour"]): s for s in slots} + calendar_days = [] + for offset in range(0, 61): + d = today + timedelta(days=offset) + d_str = d.isoformat() + day_slots = [] + for h in hours: + key = (d_str, h) + day_slots.append({ + "hour": h, + "booked": key in booked, + "slot": booked.get(key), + }) + calendar_days.append({ + "date": d_str, + "date_fmt": d.strftime("%d.%m.%Y"), + "weekday": ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"][d.weekday()], + "slots": day_slots, + "any_booked": any(s["booked"] for s in day_slots), + }) + + return templates.TemplateResponse( + request, + "admin_schedule.html", + { + "request": request, + "title": "Veröffentlichungsplan", + "user": user, + "slots": slots, + "calendar_days": calendar_days, + "hours": hours, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) diff --git a/backend/app/auth.py b/backend/app/auth.py new file mode 100644 index 0000000..188397f --- /dev/null +++ b/backend/app/auth.py @@ -0,0 +1,31 @@ +import hmac +from typing import Optional + +from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired + +from .config import get_settings + + +def _serializer() -> URLSafeTimedSerializer: + settings = get_settings() + return URLSafeTimedSerializer(settings.app_secret_key, salt="rss-news-session") + + +def verify_credentials(username: str, password: str) -> bool: + settings = get_settings() + user_ok = hmac.compare_digest(username, settings.app_admin_username) + pw_ok = hmac.compare_digest(password, settings.app_admin_password) + return user_ok and pw_ok + + +def create_session_token(username: str) -> str: + return _serializer().dumps({"username": username}) + + +def verify_session_token(token: str) -> Optional[str]: + settings = get_settings() + try: + payload = _serializer().loads(token, max_age=settings.session_max_age_seconds) + except (BadSignature, SignatureExpired): + return None + return payload.get("username") diff --git a/backend/app/config.py b/backend/app/config.py new file mode 100644 index 0000000..24c3902 --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,65 @@ +from functools import lru_cache +from pathlib import Path + +from dotenv import load_dotenv +from pydantic import AliasChoices, Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + # Prefer backend-specific env file to avoid collisions with legacy root .env + model_config = SettingsConfigDict( + env_file=("backend/.env", ".env"), + env_file_encoding="utf-8", + extra="ignore", + ) + + app_env: str = "development" + app_name: str = "rss-news-backend" + app_secret_key: str = "replace-with-a-long-random-secret" + + app_admin_username: str = "admin" + app_admin_password: str = "change-me" + + session_cookie_name: str = "rss_news_session" + session_max_age_seconds: int = 28800 + + app_db_path: str = "backend/data/rss_news.db" + + wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL")) + wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME")) + wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD")) + wordpress_default_status: str = "draft" + openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY")) + openai_model: str = "gpt-4o-mini" + + # Telegram Bot + telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN")) + telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID")) + telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET")) + + # N8N API authentication + n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY")) + + # Pipeline behaviour + pipeline_relevance_auto: int = 80 # >= this: auto-process + pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject + pipeline_max_drafts_per_day: int = 2 + pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) + pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) + pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) + pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit) + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + # Prefer shared legacy env from the original rss-news workspace if present. + env_candidates = ( + Path("/Users/oliver/Documents/rss-news/.env"), + Path("backend/.env"), + Path(".env"), + ) + for env_path in env_candidates: + if env_path.exists(): + load_dotenv(env_path, override=False) + return Settings() diff --git a/backend/app/db.py b/backend/app/db.py new file mode 100644 index 0000000..b6ef898 --- /dev/null +++ b/backend/app/db.py @@ -0,0 +1,293 @@ +import sqlite3 +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Iterator + +from .config import get_settings + + +def _db_path() -> Path: + settings = get_settings() + path = Path(settings.app_db_path) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + +@contextmanager +def get_conn() -> Iterator[sqlite3.Connection]: + conn = sqlite3.connect(_db_path()) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA foreign_keys=ON;") + try: + yield conn + conn.commit() + finally: + conn.close() + + +def init_db() -> None: + with get_conn() as conn: + conn.executescript( + """ + PRAGMA journal_mode=WAL; + + CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + base_url TEXT, + terms_url TEXT, + license_name TEXT, + risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')), + is_enabled INTEGER NOT NULL DEFAULT 0, + notes TEXT, + last_reviewed_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS feeds ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER, + name TEXT NOT NULL, + url TEXT NOT NULL UNIQUE, + is_enabled INTEGER NOT NULL DEFAULT 1, + etag TEXT, + last_modified TEXT, + last_checked_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL + ); + + CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_type TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + started_at TEXT NOT NULL DEFAULT (datetime('now')), + finished_at TEXT, + details TEXT + ); + + CREATE TABLE IF NOT EXISTS publish_jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + article_id INTEGER NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + attempts INTEGER NOT NULL DEFAULT 0, + max_attempts INTEGER NOT NULL DEFAULT 3, + error_message TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + started_at TEXT, + finished_at TEXT, + FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE + ); + + CREATE TABLE IF NOT EXISTS articles ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + image_urls_json TEXT, + press_contact TEXT, + source_name_snapshot TEXT, + source_terms_url_snapshot TEXT, + source_license_name_snapshot TEXT, + legal_checked INTEGER NOT NULL DEFAULT 0, + legal_checked_at TEXT, + legal_note TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + publish_attempts INTEGER NOT NULL DEFAULT 0, + publish_last_error TEXT, + published_to_wp_at TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), + meta_json TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id); + CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); + + CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at + AFTER UPDATE ON sources + FOR EACH ROW + BEGIN + UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at + AFTER UPDATE ON feeds + FOR EACH ROW + BEGIN + UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + """ + ) + + # Lightweight migration for existing DBs created before source_hash was introduced. + existing_columns = { + row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() + } + migration_columns = { + "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER", + "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT", + "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT", + "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT", + "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT", + "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT", + "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT", + "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT", + "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0", + "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT", + "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT", + "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER", + "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT", + "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0", + "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT", + "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT", + } + for column, ddl in migration_columns.items(): + if column not in existing_columns: + conn.execute(ddl) + + # Migration: add 'no_image' to the status CHECK constraint if not present. + # SQLite cannot modify CHECK constraints in-place, so we recreate the table. + table_sql_row = conn.execute( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'" + ).fetchone() + if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""): + conn.executescript( + """ + PRAGMA foreign_keys=OFF; + + CREATE TABLE articles_v2 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + image_urls_json TEXT, + press_contact TEXT, + source_name_snapshot TEXT, + source_terms_url_snapshot TEXT, + source_license_name_snapshot TEXT, + legal_checked INTEGER NOT NULL DEFAULT 0, + legal_checked_at TEXT, + legal_note TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + publish_attempts INTEGER NOT NULL DEFAULT 0, + publish_last_error TEXT, + published_to_wp_at TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), + meta_json TEXT, + relevance_score INTEGER, + scheduled_publish_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + INSERT INTO articles_v2 SELECT + id, feed_id, source_article_id, source_hash, title, source_url, + canonical_url, published_at, author, summary, content_raw, + content_rewritten, image_urls_json, press_contact, + source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, + legal_checked, legal_checked_at, legal_note, + wp_post_id, wp_post_url, publish_attempts, publish_last_error, + published_to_wp_at, word_count, status, meta_json, + relevance_score, scheduled_publish_at, created_at, updated_at + FROM articles; + + DROP TABLE articles; + ALTER TABLE articles_v2 RENAME TO articles; + + CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + PRAGMA foreign_keys=ON; + """ + ) + + table_rows = conn.execute( + "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'" + ).fetchall() + if not table_rows: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS publish_jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + article_id INTEGER NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + attempts INTEGER NOT NULL DEFAULT 0, + max_attempts INTEGER NOT NULL DEFAULT 3, + error_message TEXT, + wp_post_id INTEGER, + wp_post_url TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + started_at TEXT, + finished_at TEXT, + FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); + """ + ) + + +def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]: + return [dict(r) for r in rows] diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py new file mode 100644 index 0000000..391af92 --- /dev/null +++ b/backend/app/ingestion.py @@ -0,0 +1,486 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +import hashlib +import json +import re +import time +from typing import Any +from urllib.parse import unquote, urlencode, urlparse, parse_qs +import urllib.error +import urllib.request as _urllib_req + +import feedparser + +from .repositories import ( + ArticleUpsert, + RunCreate, + create_run, + find_existing_article_for_upsert, + finish_run, + get_feed_by_id, + list_enabled_feeds, + update_feed_fetch_state, + upsert_article, +) +from .source_extraction import extract_article, extracted_article_to_meta + + +@dataclass(frozen=True) +class IngestionStats: + run_id: int + feeds_processed: int + entries_seen: int + articles_upserted: int + status: str + message: str + + +MAX_FEED_FETCH_RETRIES = 3 + + +def _normalize_article_url(url: str) -> str: + """Strip AMP and tracking query parameters from article URLs. + + Removes ?outputType=valid_amp and other AMP/tracking params so that + AMP and non-AMP versions of the same article are deduplicated. + """ + _AMP_PARAMS = {"outputtype", "amp", "outputformat"} + try: + from urllib.parse import parse_qs, urlencode + parsed = urlparse(url) + if not parsed.query: + return url + params = parse_qs(parsed.query, keep_blank_values=True) + filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS} + new_query = urlencode(filtered, doseq=True) + return parsed._replace(query=new_query).geturl() + except Exception: + return url + + +def _resolve_google_redirect(url: str) -> str: + """Extract the real article URL from Google redirect URLs. + + Google Alerts feed entries use tracking links like: + https://www.google.com/url?rct=j&sa=t&url=&ct=ga&... + + This function returns the decoded real URL if detected, otherwise the + original URL unchanged. + """ + try: + parsed = urlparse(url) + host = (parsed.hostname or "").lower() + if host not in ("www.google.com", "google.com"): + return url + if parsed.path not in ("/url", "/url/"): + return url + params = parse_qs(parsed.query, keep_blank_values=False) + real_urls = params.get("url") + if real_urls: + return unquote(real_urls[0]) + except Exception: + pass + return url + + +def _entry_published_iso(entry: dict) -> str | None: + published = entry.get("published_parsed") or entry.get("updated_parsed") + if not published: + return None + return datetime(*published[:6], tzinfo=timezone.utc).isoformat() + + +def _entry_text(entry: dict) -> tuple[str, str]: + summary = entry.get("summary", "") or "" + content = "" + if entry.get("content") and isinstance(entry.get("content"), list): + first = entry["content"][0] + content = first.get("value", "") if isinstance(first, dict) else "" + if not content: + content = summary + return summary, content + + +def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str: + source_id = entry.get("id") or entry.get("guid") or "" + published = _entry_published_iso(entry) or "" + fingerprint = f"{feed_id}|{source_id}|{link}|{title.strip()}|{summary.strip()}|{published}" + return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest() + + +def _parsed_get(parsed: object, key: str, default: object = None) -> object: + if isinstance(parsed, dict): + return parsed.get(key, default) + return getattr(parsed, key, default) + + +def _normalize_tokens(text: str) -> set[str]: + normalized = re.sub(r"[^a-z0-9]+", " ", text.lower()) + return {token for token in normalized.split() if len(token) >= 4} + + +def _probe_image_url(url: str, timeout: int = 5) -> bool: + """Return True if URL responds without a 4xx/5xx error (HEAD request). + + Returns True on network/connection errors so that a flaky server does not + cause a valid image to be silently dropped. + """ + try: + req = _urllib_req.Request( + url, + method="HEAD", + headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"}, + ) + with _urllib_req.urlopen(req, timeout=timeout) as resp: + return resp.status < 400 + except urllib.error.HTTPError as exc: + return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not + except Exception: + return True # network error → don't filter, let WP try later + + +def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: + source_host = (urlparse(source_url).hostname or "").lower() + is_presseportal = "presseportal.de" in source_host + title_tokens = _normalize_tokens(title) + blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif") + # Known placeholder/default images that should never be used as featured image + placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + + + ranked: list[dict[str, Any]] = [] + for url in images: + # Skip inline data: URIs (e.g. base64-encoded SVG placeholders) + if url.startswith("data:"): + continue + + parsed = urlparse(url) + path = unquote(parsed.path.lower()) + full = f"{parsed.netloc.lower()}{path}" + score = 0 + reasons: list[str] = [] + + if any(token in full for token in placeholder_patterns): + score -= 300 + reasons.append("placeholder-image") + + if any(token in full for token in blocked_patterns): + score -= 150 + reasons.append("blocked-pattern") + + if is_presseportal and "/thumbnail/story_big/" in path: + score += 120 + reasons.append("presseportal-story-big") + elif is_presseportal and "/thumbnail/highlight/" in path: + score += 45 + reasons.append("presseportal-highlight") + elif is_presseportal and "/thumbnail/liste/" in path: + score -= 40 + reasons.append("presseportal-list") + + if "crop=" in (parsed.query or "").lower(): + score -= 10 + reasons.append("cropped-preview") + + path_tokens = _normalize_tokens(path.replace("-", " ")) + overlap = len(title_tokens.intersection(path_tokens)) + if overlap > 0: + score += min(30, overlap * 6) + reasons.append(f"title-match:{overlap}") + + ranked.append({"url": url, "score": score, "reasons": reasons}) + + ranked.sort(key=lambda item: item["score"], reverse=True) + return ranked + + +def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]: + # dedupe incoming order first + deduped: list[str] = [] + seen: set[str] = set() + for image in images: + if image and image not in seen: + seen.add(image) + deduped.append(image) + + ranked = _rank_image_candidates(source_url, title, deduped) + candidates = [item["url"] for item in ranked if item["score"] > -100] + + # Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx). + # Network errors are treated as OK to avoid false negatives on flaky servers. + primary = None + kept: list[str] = [] + for url in candidates[:4]: + if _probe_image_url(url): + if primary is None: + primary = url + kept.append(url) + if len(kept) >= max_keep: + break + + # Fallback: if all probes failed with network errors, use best candidate anyway + if not kept and candidates: + primary = candidates[0] + kept = candidates[:max_keep] + + return kept, primary, ranked + + +def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if existing_meta_json: + try: + parsed = json.loads(existing_meta_json) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + meta["attribution"] = attribution + meta["extraction"] = extraction_meta + return json.dumps(meta, ensure_ascii=False) + + +def run_ingestion(feed_id: int | None = None) -> IngestionStats: + run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) + feeds_processed = 0 + entries_seen = 0 + articles_upserted = 0 + feed_results: list[dict[str, object]] = [] + + try: + if feed_id is not None: + feed = get_feed_by_id(feed_id) + feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else [] + else: + feeds = list_enabled_feeds() + + for feed in feeds: + if not feed: + continue + feeds_processed += 1 + + parsed = None + feed_error = None + for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1): + try: + parsed = feedparser.parse( + feed["url"], + etag=feed.get("etag"), + modified=feed.get("last_modified"), + ) + break + except Exception as exc: + feed_error = str(exc) + if attempt < MAX_FEED_FETCH_RETRIES: + time.sleep(0.5 * attempt) + + if parsed is None: + feed_results.append( + { + "feed_id": int(feed["id"]), + "feed_url": feed["url"], + "status": "failed", + "error": feed_error or "unknown", + "entries_seen": 0, + "upserts": 0, + } + ) + continue + + # Persist ETag/Last-Modified for conditional requests. + parsed_etag = _parsed_get(parsed, "etag") + parsed_modified = _parsed_get(parsed, "modified") + if parsed_modified and not isinstance(parsed_modified, str): + parsed_modified = str(parsed_modified) + update_feed_fetch_state( + feed_id=int(feed["id"]), + etag=parsed_etag if isinstance(parsed_etag, str) else None, + last_modified=parsed_modified if isinstance(parsed_modified, str) else None, + ) + + feed_entries_seen = 0 + feed_upserts = 0 + from .config import get_settings as _get_settings + _max_age_days = _get_settings().pipeline_max_article_age_days + for entry in _parsed_get(parsed, "entries", []): + entries_seen += 1 + feed_entries_seen += 1 + link = entry.get("link") + if not link: + continue + + # Age filter: skip articles older than max_age_days (0 = no limit) + if _max_age_days > 0: + published_iso = _entry_published_iso(entry) + if published_iso: + try: + published_dt = datetime.fromisoformat(published_iso) + age = datetime.now(timezone.utc) - published_dt + if age > timedelta(days=_max_age_days): + continue + except Exception: + pass # can't parse date → allow through + + # Resolve Google redirect URLs (google.com/url?...&url=&...) + link = _resolve_google_redirect(link) + # Normalize AMP/tracking params (e.g. ?outputType=valid_amp) + link = _normalize_article_url(link) + + summary, content_raw = _entry_text(entry) + # Strip HTML tags from title (Google Alerts wraps matched keywords in ) + raw_title = entry.get("title") or "Ohne Titel" + title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel" + extracted = extract_article(link) + + final_title = extracted.title or title + final_author = extracted.author or entry.get("author") + final_summary = extracted.summary or (summary[:1000] if summary else None) + final_content_raw = extracted.content_text or content_raw + final_canonical = extracted.canonical_url or entry.get("link") + selected_images, primary_image, ranked_images = _select_relevant_images( + link, + final_title, + extracted.images, + max_keep=3, + ) + + source_hash = _entry_hash( + entry, + int(feed["id"]), + link, + final_title, + final_summary or "", + ) + attribution = { + "source_name": feed.get("source_name"), + "source_base_url": feed.get("source_base_url"), + "source_terms_url": feed.get("source_terms_url"), + "source_license_name": feed.get("source_license_name"), + "source_risk_level": feed.get("source_risk_level"), + "original_link": link, + "feed_name": feed.get("name"), + "feed_id": int(feed["id"]), + "imported_at": datetime.now(timezone.utc).isoformat(), + } + extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted) + extraction_meta["fetched_from"] = link + extraction_meta["image_selection"] = { + "primary": primary_image, + "selected_count": len(selected_images), + "total_candidates": len(extracted.images), + "ranked": ranked_images, + } + base_payload = ArticleUpsert( + feed_id=int(feed["id"]), + source_article_id=entry.get("id") or entry.get("guid"), + source_hash=source_hash, + title=final_title, + source_url=link, + canonical_url=final_canonical, + published_at=_entry_published_iso(entry), + author=final_author, + summary=final_summary, + content_raw=final_content_raw, + content_rewritten=None, + image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, + press_contact=extracted.press_contact, + source_name_snapshot=feed.get("source_name"), + source_terms_url_snapshot=feed.get("source_terms_url"), + source_license_name_snapshot=feed.get("source_license_name"), + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=len((final_content_raw or "").split()), + status="new", + meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), + ) + existing = find_existing_article_for_upsert(base_payload) + if existing and existing.get("status") == "error": + # Explicitly closed article: ignore on subsequent ingestion runs. + continue + + payload = base_payload + if existing: + payload = ArticleUpsert( + feed_id=base_payload.feed_id, + source_article_id=base_payload.source_article_id, + source_hash=base_payload.source_hash, + title=base_payload.title, + source_url=base_payload.source_url, + canonical_url=base_payload.canonical_url, + published_at=base_payload.published_at, + author=base_payload.author, + summary=base_payload.summary, + content_raw=base_payload.content_raw, + content_rewritten=existing.get("content_rewritten"), + image_urls_json=base_payload.image_urls_json, + press_contact=base_payload.press_contact or existing.get("press_contact"), + source_name_snapshot=base_payload.source_name_snapshot, + source_terms_url_snapshot=base_payload.source_terms_url_snapshot, + source_license_name_snapshot=base_payload.source_license_name_snapshot, + legal_checked=bool(int(existing.get("legal_checked", 0))), + legal_checked_at=existing.get("legal_checked_at"), + legal_note=existing.get("legal_note"), + wp_post_id=existing.get("wp_post_id"), + wp_post_url=existing.get("wp_post_url"), + publish_attempts=int(existing.get("publish_attempts", 0)), + publish_last_error=existing.get("publish_last_error"), + published_to_wp_at=existing.get("published_to_wp_at"), + word_count=base_payload.word_count, + status=existing.get("status") or "new", + meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta), + ) + + article_id = upsert_article(payload) + if article_id: + articles_upserted += 1 + feed_upserts += 1 + + feed_results.append( + { + "feed_id": int(feed["id"]), + "feed_url": feed["url"], + "status": "success", + "entries_seen": feed_entries_seen, + "upserts": feed_upserts, + } + ) + + finish_run( + run_id=run_id, + status="success", + details=json.dumps( + { + "feeds_processed": feeds_processed, + "entries_seen": entries_seen, + "upserts": articles_upserted, + "feeds": feed_results, + }, + ensure_ascii=False, + ), + ) + return IngestionStats( + run_id=run_id, + feeds_processed=feeds_processed, + entries_seen=entries_seen, + articles_upserted=articles_upserted, + status="success", + message="Ingestion abgeschlossen", + ) + except Exception as exc: + finish_run(run_id=run_id, status="failed", details=str(exc)) + return IngestionStats( + run_id=run_id, + feeds_processed=feeds_processed, + entries_seen=entries_seen, + articles_upserted=articles_upserted, + status="failed", + message=str(exc), + ) diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 0000000..b4776af --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,727 @@ +import asyncio +from contextlib import asynccontextmanager +import csv +from datetime import datetime, timezone +import io +import json +import logging +from pathlib import Path + +from fastapi import Depends, FastAPI, HTTPException, Request, Response, status +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from fastapi.staticfiles import StaticFiles + +from .admin_ui import router as admin_router +from .auth import create_session_token, verify_credentials, verify_session_token +from .config import get_settings +from .db import init_db +from .ingestion import run_ingestion +from .pipeline import run_auto_pipeline +from .policy import evaluate_source_policy, is_source_allowed +from .publisher import enqueue_publish, run_publisher +from .relevance import article_age_days, article_relevance +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text +from .telegram_bot import handle_update, setup_webhook +from .repositories import ( + ArticleUpsert, + FeedCreate, + RunCreate, + SourceCreate, + create_feed as repo_create_feed, + create_run, + create_source as repo_create_source, + finish_run, + get_article_by_id, + get_feed_by_id, + get_run_by_id, + get_source_by_id, + list_publish_jobs, + list_articles as repo_list_articles, + list_feeds as repo_list_feeds, + list_runs, + list_sources as repo_list_sources, + set_article_legal_review, + update_article_status, + upsert_article as repo_upsert_article, +) +from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status + +settings = get_settings() + + +@asynccontextmanager +async def app_lifespan(_: FastAPI): + init_db() + yield + + +app = FastAPI(title=settings.app_name, lifespan=app_lifespan) +app.include_router(admin_router) +app.mount( + "/admin/static", + StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")), + name="admin-static", +) + + +class LoginRequest(BaseModel): + username: str + password: str + + +class SourceCreateRequest(BaseModel): + name: str = Field(min_length=1, max_length=200) + base_url: str | None = None + terms_url: str | None = None + license_name: str | None = None + risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$") + is_enabled: bool = False + notes: str | None = None + last_reviewed_at: str | None = None + + +class FeedCreateRequest(BaseModel): + name: str = Field(min_length=1, max_length=200) + url: str = Field(min_length=5, max_length=1000) + source_id: int | None = None + is_enabled: bool = True + + +class RunCreateRequest(BaseModel): + run_type: str = Field(min_length=2, max_length=100) + status: str = Field(default="queued", pattern="^(queued|running|success|failed)$") + details: str | None = None + + +class RunFinishRequest(BaseModel): + status: str = Field(pattern="^(success|failed)$") + details: str | None = None + + +class ArticleUpsertRequest(BaseModel): + feed_id: int | None = None + source_article_id: str | None = None + source_hash: str | None = None + title: str = Field(min_length=1, max_length=500) + source_url: str = Field(min_length=5, max_length=2000) + canonical_url: str | None = None + published_at: str | None = None + author: str | None = None + summary: str | None = None + content_raw: str | None = None + content_rewritten: str | None = None + image_urls_json: str | None = None + press_contact: str | None = None + source_name_snapshot: str | None = None + source_terms_url_snapshot: str | None = None + source_license_name_snapshot: str | None = None + legal_checked: bool = False + legal_checked_at: str | None = None + legal_note: str | None = None + wp_post_id: int | None = None + wp_post_url: str | None = None + publish_attempts: int = 0 + publish_last_error: str | None = None + published_to_wp_at: str | None = None + word_count: int = 0 + status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") + meta_json: str | None = None + + +class IngestionRunRequest(BaseModel): + feed_id: int | None = None + + +class ArticleTransitionRequest(BaseModel): + target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") + note: str | None = None + + +class ArticleReviewRequest(BaseModel): + decision: str = Field(pattern="^(approve|reject)$") + note: str | None = None + + +class ArticleLegalReviewRequest(BaseModel): + approved: bool + note: str | None = None + + +class PublisherEnqueueRequest(BaseModel): + article_id: int + max_attempts: int = 3 + + +class PublisherRunRequest(BaseModel): + max_jobs: int = 10 + + +ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = { + "new": {"rewrite", "error"}, + "rewrite": {"approved", "error"}, + "approved": {"published", "error"}, + "published": {"error"}, + "error": {"rewrite"}, +} + + +def require_auth(request: Request) -> str: + token = request.cookies.get(settings.session_cookie_name) + if not token: + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet") + + username = verify_session_token(token) + if not username: + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen") + + return username + + +@app.get("/health") +def health() -> dict: + return {"status": "ok", "service": settings.app_name, "db_path": settings.app_db_path} + + +@app.post("/auth/login") +def login(payload: LoginRequest, response: Response) -> dict: + if not verify_credentials(payload.username, payload.password): + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten") + + token = create_session_token(payload.username) + response.set_cookie( + key=settings.session_cookie_name, + value=token, + max_age=settings.session_max_age_seconds, + httponly=True, + secure=False, + samesite="lax", + ) + return {"ok": True, "username": payload.username} + + +@app.post("/auth/logout") +def logout(response: Response) -> dict: + response.delete_cookie(settings.session_cookie_name) + return {"ok": True} + + +@app.get("/auth/me") +def me(username: str = Depends(require_auth)) -> dict: + return {"authenticated": True, "username": username} + + +@app.get("/api/protected") +def protected(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "message": "Protected endpoint", "username": username} + + +@app.get("/api/pipeline/status") +def pipeline_status(username: str = Depends(require_auth)) -> dict: + feeds_total = len(repo_list_feeds()) + sources_total = len(repo_list_sources()) + articles_total = len(repo_list_articles(limit=500)) + return { + "ok": True, + "stage": "skeleton+db", + "requested_by": username, + "counts": { + "sources": sources_total, + "feeds": feeds_total, + "articles": articles_total, + }, + } + + +@app.get("/api/sources") +def list_sources(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": repo_list_sources(), "requested_by": username} + + +@app.get("/api/sources/{source_id}/policy-check") +def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict: + source = get_source_by_id(source_id) + if not source: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden") + issues = evaluate_source_policy(source) + return { + "ok": True, + "source_id": source_id, + "allowed": is_source_allowed(source), + "issues": issues, + "requested_by": username, + } + + +@app.post("/api/sources") +def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict: + source_id = repo_create_source( + SourceCreate( + name=payload.name, + base_url=payload.base_url, + terms_url=payload.terms_url, + license_name=payload.license_name, + risk_level=payload.risk_level, + is_enabled=payload.is_enabled, + notes=payload.notes, + last_reviewed_at=payload.last_reviewed_at, + ) + ) + return {"ok": True, "id": source_id, "requested_by": username} + + +@app.get("/api/feeds") +def list_feeds(username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": repo_list_feeds(), "requested_by": username} + + +@app.get("/api/feeds/{feed_id}/policy-check") +def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict: + feed = get_feed_by_id(feed_id) + if not feed: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden") + + source_snapshot = { + "id": feed.get("source_id"), + "name": feed.get("source_name"), + "base_url": feed.get("source_base_url"), + "terms_url": feed.get("source_terms_url"), + "license_name": feed.get("source_license_name"), + "risk_level": feed.get("source_risk_level"), + "last_reviewed_at": feed.get("source_last_reviewed_at"), + "is_enabled": feed.get("source_is_enabled"), + } + issues = evaluate_source_policy(source_snapshot) + return { + "ok": True, + "feed_id": feed_id, + "allowed": len(issues) == 0, + "issues": issues, + "requested_by": username, + } + + +@app.post("/api/feeds") +def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict: + try: + feed_id = repo_create_feed( + FeedCreate( + name=payload.name, + url=payload.url, + source_id=payload.source_id, + is_enabled=payload.is_enabled, + ) + ) + except Exception as exc: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc + + return {"ok": True, "id": feed_id, "requested_by": username} + + +@app.get("/api/runs") +def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": list_runs(limit=limit), "requested_by": username} + + +@app.get("/api/runs/{run_id}") +def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict: + run = get_run_by_id(run_id) + if not run: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden") + return {"ok": True, "item": run, "requested_by": username} + + +@app.post("/api/runs") +def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict: + run_id = create_run(RunCreate(run_type=payload.run_type, status=payload.status, details=payload.details)) + return {"ok": True, "id": run_id, "requested_by": username} + + +@app.post("/api/runs/{run_id}/finish") +def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict: + finish_run(run_id=run_id, status=payload.status, details=payload.details) + return {"ok": True, "id": run_id, "requested_by": username} + + +@app.get("/api/articles") +def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict: + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + items = repo_list_articles(limit=limit, status_filter=internal_filter) + for item in items: + item["status_ui"] = internal_to_ui_status(item.get("status")) + return {"ok": True, "items": items, "requested_by": username} + + +@app.get("/api/articles/export") +def api_export_articles( + format: str = "json", + status_filter: str | None = None, + username: str = Depends(require_auth), +): + internal_filter = ui_to_internal_status(status_filter) if status_filter else None + articles = repo_list_articles(limit=500, status_filter=internal_filter) + rows = [] + for article in articles: + meta: dict = {} + if article.get("meta_json"): + try: + parsed = json.loads(article["meta_json"]) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} + selected_image_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None + + days_old = article_age_days(article.get("published_at")) + rows.append( + { + "id": article.get("id"), + "title": article.get("title"), + "status": article.get("status"), + "published_at": article.get("published_at"), + "days_old": days_old, + "relevance": article_relevance(article.get("published_at")), + "author": article.get("author"), + "source_url": article.get("source_url"), + "canonical_url": article.get("canonical_url"), + "source_name_snapshot": article.get("source_name_snapshot"), + "source_license_name_snapshot": article.get("source_license_name_snapshot"), + "source_terms_url_snapshot": article.get("source_terms_url_snapshot"), + "press_contact": article.get("press_contact"), + "image_urls_json": article.get("image_urls_json"), + "selected_image_url": selected_image_url, + "legal_checked": bool(int(article.get("legal_checked", 0))), + "legal_checked_at": article.get("legal_checked_at"), + "legal_note": article.get("legal_note"), + } + ) + + generated_at = datetime.now(timezone.utc).isoformat() + if format == "csv": + out = io.StringIO() + fieldnames = [ + "id", + "title", + "status", + "published_at", + "days_old", + "relevance", + "author", + "source_url", + "canonical_url", + "source_name_snapshot", + "source_license_name_snapshot", + "source_terms_url_snapshot", + "press_contact", + "image_urls_json", + "selected_image_url", + "legal_checked", + "legal_checked_at", + "legal_note", + ] + writer = csv.DictWriter(out, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + return Response( + content=out.getvalue(), + media_type="text/csv; charset=utf-8", + headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'}, + ) + + return JSONResponse( + { + "ok": True, + "count": len(rows), + "generated_at": generated_at, + "status_filter": status_filter, + "items": rows, + "requested_by": username, + } + ) + + +@app.get("/api/articles/{article_id}") +def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + article["status_ui"] = internal_to_ui_status(article.get("status")) + return {"ok": True, "item": article, "requested_by": username} + + +@app.post("/api/articles/upsert") +def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict: + article_id = repo_upsert_article( + ArticleUpsert( + feed_id=payload.feed_id, + source_article_id=payload.source_article_id, + source_hash=payload.source_hash, + title=payload.title, + source_url=payload.source_url, + canonical_url=payload.canonical_url, + published_at=payload.published_at, + author=payload.author, + summary=payload.summary, + content_raw=payload.content_raw, + content_rewritten=payload.content_rewritten, + image_urls_json=payload.image_urls_json, + press_contact=payload.press_contact, + source_name_snapshot=payload.source_name_snapshot, + source_terms_url_snapshot=payload.source_terms_url_snapshot, + source_license_name_snapshot=payload.source_license_name_snapshot, + legal_checked=payload.legal_checked, + legal_checked_at=payload.legal_checked_at, + legal_note=payload.legal_note, + wp_post_id=payload.wp_post_id, + wp_post_url=payload.wp_post_url, + publish_attempts=payload.publish_attempts, + publish_last_error=payload.publish_last_error, + published_to_wp_at=payload.published_to_wp_at, + word_count=payload.word_count, + status=ui_to_internal_status(payload.status), + meta_json=payload.meta_json, + ) + ) + return {"ok": True, "id": article_id, "requested_by": username} + + +@app.post("/api/articles/{article_id}/transition") +def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + + current_status = article.get("status") + current_ui = internal_to_ui_status(current_status) + target_internal = ui_to_internal_status(payload.target_status) + target_ui = internal_to_ui_status(target_internal) + allowed_targets = ALLOWED_UI_TRANSITIONS.get(current_ui, set()) + if target_ui not in allowed_targets: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Ungueltiger Statuswechsel: {current_ui} -> {target_ui}", + ) + + updated = update_article_status(article_id, target_internal, actor=username, note=payload.note) + if not updated: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + return {"ok": True, "id": article_id, "from_status": current_ui, "to_status": target_ui} + + +@app.post("/api/articles/{article_id}/rewrite-run") +def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'") + + rewritten = rewrite_article_text(article) + tags: list[str] = [] + try: + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception: + tags = [] + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + # upsert via status update + existing fields by lightweight path: + repo_upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title"), + source_url=article.get("source_url"), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + publish_attempts=int(article.get("publish_attempts", 0)), + publish_last_error=article.get("publish_last_error"), + published_to_wp_at=article.get("published_to_wp_at"), + word_count=len(rewritten.split()), + status="approved", + meta_json=merged_meta, + ) + ) + return {"ok": True, "id": article_id, "status": "publish", "tags": tags} + + +@app.post("/api/articles/{article_id}/legal-review") +def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + + updated = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username) + if not updated: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + return { + "ok": True, + "id": article_id, + "legal_checked": payload.approved, + } + + +@app.get("/api/publisher/jobs") +def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict: + return {"ok": True, "items": list_publish_jobs(limit=limit), "requested_by": username} + + +@app.post("/api/publisher/enqueue") +def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict: + article = get_article_by_id(payload.article_id) + if not article: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") + job_id = enqueue_publish(article_id=payload.article_id, max_attempts=payload.max_attempts) + return {"ok": True, "job_id": job_id, "article_id": payload.article_id, "requested_by": username} + + +@app.post("/api/publisher/run") +def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict: + stats = run_publisher(max_jobs=payload.max_jobs) + return { + "ok": True, + "requested_by": username, + "stats": { + "processed": stats.processed, + "success": stats.success, + "failed": stats.failed, + "requeued": stats.requeued, + }, + } + + +@app.post("/api/articles/{article_id}/review") +def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: + raise HTTPException(status_code=status.HTTP_410_GONE, detail="Review-Endpoint ersetzt durch Rewrite-Workflow") + + +@app.post("/api/ingestion/run") +def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict: + stats = run_ingestion(feed_id=payload.feed_id) + return { + "ok": stats.status == "success", + "run_id": stats.run_id, + "status": stats.status, + "message": stats.message, + "stats": { + "feeds_processed": stats.feeds_processed, + "entries_seen": stats.entries_seen, + "articles_upserted": stats.articles_upserted, + }, + "requested_by": username, + } + + +# --------------------------------------------------------------------------- +# N8N Automation endpoint (API-Key auth, no session cookie required) +# --------------------------------------------------------------------------- + +def _require_api_key(request: Request) -> None: + api_key = request.headers.get("X-API-Key") or request.query_params.get("api_key") + expected = settings.n8n_api_key + if not expected: + raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert") + if api_key != expected: + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key") + + +_pipeline_lock = asyncio.Lock() + + +@app.post("/api/n8n/pipeline") +async def api_n8n_pipeline(request: Request) -> dict: + """Trigger the full auto pipeline in background. Returns immediately. + Called by N8N (2x/day or on demand). Results arrive via Telegram.""" + _require_api_key(request) + + if _pipeline_lock.locked(): + logging.getLogger(__name__).warning("Pipeline bereits aktiv – Trigger ignoriert") + return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"} + + async def _run(): + async with _pipeline_lock: + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n")) + except Exception as exc: + logging.getLogger(__name__).error("Background pipeline error: %s", exc) + + asyncio.create_task(_run()) + return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"} + + +@app.post("/api/n8n/ingest") +def api_n8n_ingest(request: Request) -> dict: + """Run only the ingestion step (no rewrite/publish). For N8N.""" + _require_api_key(request) + stats = run_ingestion() + return { + "ok": stats.status == "success", + "stats": { + "feeds_processed": stats.feeds_processed, + "entries_seen": stats.entries_seen, + "articles_upserted": stats.articles_upserted, + }, + } + + +# --------------------------------------------------------------------------- +# Telegram Webhook +# --------------------------------------------------------------------------- + +@app.post("/telegram/webhook") +async def telegram_webhook(request: Request) -> dict: + """Receive updates from Telegram Bot API. + + Returns 200 immediately so Telegram never retries the same update. + Actual processing runs in a background task. + """ + import asyncio + import logging + + # Verify secret token + secret = settings.telegram_webhook_secret + if secret: + incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "") + if incoming != secret: + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret") + + body = await request.body() + try: + update = json.loads(body.decode("utf-8")) + except Exception: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") + + async def _process(): + loop = asyncio.get_event_loop() + try: + await loop.run_in_executor(None, lambda: handle_update(update)) + except Exception as exc: + logging.getLogger(__name__).error("Telegram update handler error: %s", exc) + + asyncio.create_task(_process()) + return {"ok": True} + + +@app.post("/api/telegram/setup-webhook") +def api_setup_telegram_webhook(request: Request) -> dict: + """Register the Telegram webhook URL. Call once after deployment.""" + username = require_auth(request) + base_url = str(request.base_url).rstrip("/") + webhook_url = f"{base_url}/telegram/webhook" + result = setup_webhook(webhook_url) + return {"ok": True, "webhook_url": webhook_url, "telegram_response": result, "requested_by": username} diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py new file mode 100644 index 0000000..93a251b --- /dev/null +++ b/backend/app/pipeline.py @@ -0,0 +1,516 @@ +"""Autonomous RSS-News pipeline. + +Full automated flow: +1. Run RSS ingestion +2. For each new article: + - Auto-select primary image + - Score relevance via GPT + - < warn threshold: reject (error status) → Telegram rejected summary + - warn..auto threshold: Telegram warning with override button + - >= auto threshold: rewrite → create WP draft → Telegram notification +3. Send pipeline summary to Telegram +""" +from __future__ import annotations + +import json +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +from .config import get_settings +from .ingestion import run_ingestion +from .publisher import enqueue_publish, run_publisher +from .repositories import ( + ArticleUpsert, + get_article_by_id, + list_articles, + set_article_image_decision, + update_article_status, + upsert_article as repo_upsert_article, +) +from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance +from .scheduler import reserve_publish_slot +from .wordpress import publish_article_draft, selected_image_exists + +logger = logging.getLogger(__name__) + + +@dataclass +class PipelineStats: + ingested: int = 0 + processed: int = 0 + drafts_created: int = 0 + rejected: int = 0 + quality_gate_rejected: int = 0 + warnings: int = 0 + errors: int = 0 + no_image: int = 0 + rejected_articles: list[dict[str, Any]] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _auto_select_image(article: dict[str, Any]) -> bool: + """Auto-select the primary image from ingestion metadata if not already selected.""" + meta_json = article.get("meta_json") or "{}" + try: + meta = json.loads(meta_json) + except Exception: + return False + + # Already selected? + image_review = meta.get("image_review") or {} + if isinstance(image_review, dict) and image_review.get("selected_url"): + return True + + # Try to get primary from ingestion extraction + extraction = meta.get("extraction") or {} + image_selection = extraction.get("image_selection") or {} + primary = image_selection.get("primary") + + if not primary: + # Fallback: use first URL from image_urls_json + image_urls_json = article.get("image_urls_json") or "[]" + try: + urls = json.loads(image_urls_json) + if urls: + primary = urls[0] + except Exception: + pass + + if primary: + set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline") + return True + return False + + +def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None: + """Persist relevance score and reason in article meta_json and relevance_score column.""" + article = get_article_by_id(article_id) + if not article: + return + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + meta = {} + meta["relevance"] = relevance + new_meta = json.dumps(meta, ensure_ascii=False) + from .db import get_conn + with get_conn() as conn: + conn.execute( + "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?", + (new_meta, relevance.get("score", 0), article_id), + ) + + +def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: + """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url).""" + article_id = int(article["id"]) + settings = get_settings() + + # ── Quality gate 1: raw content length ────────────────────────────────── + import re as _re + raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "") + raw_words = len(raw_text.split()) + if raw_words < settings.pipeline_min_words_raw: + note = ( + f"Zu wenig Rohinhalt: {raw_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_raw})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) + + # Rewrite + logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words) + rewritten = rewrite_article_text(article) + + # ── Quality gate 2: rewritten content length ───────────────────────────── + rewritten_words = len(rewritten.split()) + if rewritten_words < settings.pipeline_min_words_rewritten: + note = ( + f"Rewrite zu kurz: {rewritten_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_rewritten})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) + logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split())) + tags: list[str] = [] + try: + tags = generate_article_tags(article, rewritten_text=rewritten) + except Exception: + pass + merged_meta = merge_generated_tags(article.get("meta_json"), tags) + + # Save rewritten content + approved status + repo_upsert_article( + ArticleUpsert( + feed_id=article.get("feed_id"), + source_article_id=article.get("source_article_id"), + source_hash=article.get("source_hash"), + title=article.get("title", ""), + source_url=article.get("source_url", ""), + canonical_url=article.get("canonical_url"), + published_at=article.get("published_at"), + author=article.get("author"), + summary=article.get("summary"), + content_raw=article.get("content_raw"), + content_rewritten=rewritten, + image_urls_json=article.get("image_urls_json"), + press_contact=article.get("press_contact"), + source_name_snapshot=article.get("source_name_snapshot"), + source_terms_url_snapshot=article.get("source_terms_url_snapshot"), + source_license_name_snapshot=article.get("source_license_name_snapshot"), + legal_checked=bool(int(article.get("legal_checked", 0))), + legal_checked_at=article.get("legal_checked_at"), + legal_note=article.get("legal_note"), + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + publish_attempts=int(article.get("publish_attempts", 0)), + publish_last_error=article.get("publish_last_error"), + published_to_wp_at=article.get("published_to_wp_at"), + word_count=len(rewritten.split()), + status="approved", + meta_json=merged_meta, + ) + ) + + # Reload after save to get updated meta_json + fresh = get_article_by_id(article_id) + if not fresh: + raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden") + + # Ensure a publish slot is reserved — reserve one now if not yet set + if not fresh.get("scheduled_publish_at"): + from .scheduler import reserve_publish_slot + logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id) + reserve_publish_slot(article_id) + fresh = get_article_by_id(article_id) + if not fresh: + raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden") + + # Create WP draft + logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at")) + wp_post_id, wp_post_url = publish_article_draft(fresh) + logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id) + + # Update WP info in DB + from .repositories import mark_article_publish_result + mark_article_publish_result( + article_id, + wp_post_id=wp_post_id, + wp_post_url=wp_post_url, + error=None, + increment_attempts=True, + set_published_status=False, + ) + + return wp_post_id, wp_post_url + + +# --------------------------------------------------------------------------- +# Public pipeline functions +# --------------------------------------------------------------------------- + +def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: + """Run the full automated pipeline and return stats dict.""" + from . import telegram_bot as tg + + settings = get_settings() + stats = PipelineStats() + + tg.notify_pipeline_started(trigger) + + # Step 1: Ingestion + try: + ingest_result = run_ingestion() + stats.ingested = ingest_result.articles_upserted + except Exception as exc: + tg.notify_error(f"Ingestion fehlgeschlagen: {exc}") + logger.error("Ingestion error: %s", exc) + stats.errors += 1 + + # Step 2: Process new articles + new_articles = list_articles(limit=100, status_filter="new") + + for article in new_articles: + article_id = int(article["id"]) + try: + _process_article(article, stats, settings) + except Exception as exc: + logger.error("Fehler bei Artikel #%d: %s", article_id, exc) + tg.notify_error(f"Fehler bei Artikel #{article_id} ({article.get('title','?')[:50]}): {exc}") + stats.errors += 1 + # Rate limiting between OpenAI calls + time.sleep(1) + + # Step 3: Send rejected summary if any + if stats.rejected_articles: + try: + tg.notify_rejected_summary(stats.rejected_articles) + except Exception as exc: + logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc) + + # Step 4: Summary + result = { + "ingested": stats.ingested, + "processed": stats.processed, + "drafts_created": stats.drafts_created, + "rejected": stats.rejected, + "quality_gate_rejected": stats.quality_gate_rejected, + "no_image": stats.no_image, + "warnings": stats.warnings, + "errors": stats.errors, + } + tg.notify_pipeline_done(result) + return result + + +def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None: + """Process a single new article through the pipeline.""" + from . import telegram_bot as tg + + article_id = int(article["id"]) + + # Auto-select image + _auto_select_image(article) + + # Reload to get updated image_review + article = get_article_by_id(article_id) or article + + # Exclude articles without a usable image + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + meta = {} + has_image = bool((meta.get("image_review") or {}).get("selected_url")) + if not has_image: + update_article_status( + article_id, + "no_image", + actor="pipeline", + note="Kein Bild vorhanden – Artikel ausgeschlossen", + ) + stats.no_image += 1 + logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id) + try: + tg.send_message( + f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n" + f"📰 {(article.get('title') or '')[:80]}" + ) + except Exception: + pass + return + + # Score relevance + try: + relevance = score_article_relevance(article) + except Exception as exc: + logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc) + relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []} + + score = relevance.get("score", 0) + reason = relevance.get("reason", "") + _store_relevance(article_id, relevance) + + stats.processed += 1 + + if score < settings.pipeline_relevance_warn: + # Reject + update_article_status( + article_id, + "error", + actor="pipeline", + note=f"Abgelehnt: Score {score}/100 — {reason}", + ) + stats.rejected += 1 + # Reload for summary (now has relevance in meta) + updated = get_article_by_id(article_id) + if updated: + stats.rejected_articles.append(updated) + + elif score < settings.pipeline_relevance_auto: + # Warning zone: set status to "review" so repeated /run calls don't re-warn + update_article_status( + article_id, + "review", + actor="pipeline", + note=f"Niedrige Relevanz: Score {score}/100 — {reason}", + ) + stats.warnings += 1 + try: + tg.notify_relevance_warning(article, score, reason) + except Exception as exc: + logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc) + + else: + # Auto-process: rewrite + WP draft + try: + # Reserve publish slot FIRST so it's available when WP draft is created + slot = reserve_publish_slot(article_id) + + # Reload article to get updated image_review + scheduled_publish_at + fresh = get_article_by_id(article_id) + if not fresh: + return + wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) + stats.drafts_created += 1 + + # Reload for notification + final = get_article_by_id(article_id) + if final: + try: + tg.notify_new_draft(final, score=score, suggested_publish_at=slot) + except Exception as exc: + logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc) + + except ValueError as exc: + # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft + # Release the reserved slot so it's available for the next article + from .scheduler import release_publish_slot + release_publish_slot(article_id) + # Clean up any stale WP draft from a previous pipeline run + stale = get_article_by_id(article_id) + if stale and stale.get("wp_post_id"): + try: + from .wordpress import delete_wp_post + delete_wp_post(int(stale["wp_post_id"])) + logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"]) + except Exception as del_exc: + logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc) + stats.quality_gate_rejected += 1 + logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) + # Individual Telegram notification for quality gate rejection + try: + title = (article.get("title") or "Ohne Titel")[:80] + tg.send_message( + f"✂️ Qualitätsprüfung nicht bestanden\n" + f"📰 {title}\n" + f"💯 Score: {score}/100\n" + f"⚠️ {exc}" + ) + except Exception as tg_exc: + logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc) + + except Exception as exc: + logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) + update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}") + # Release reserved slot so it's not permanently blocked by a failed article + from .scheduler import release_publish_slot + release_publish_slot(article_id) + raise + + +# --------------------------------------------------------------------------- +# Callback actions (called from telegram_bot._handle_callback) +# --------------------------------------------------------------------------- + +def rewrite_and_update_draft(article_id: int) -> None: + """Rewrite article and update the existing WP draft.""" + article = get_article_by_id(article_id) + if not article: + raise RuntimeError(f"Artikel #{article_id} nicht gefunden") + _auto_select_image(article) + fresh = get_article_by_id(article_id) + _do_rewrite_and_draft(fresh) + + +def discard_article(article_id: int) -> None: + """Discard a draft: delete WP post if exists, set article to error.""" + article = get_article_by_id(article_id) + if not article: + return + + wp_post_id = article.get("wp_post_id") + if wp_post_id: + try: + from .wordpress import delete_wp_post + delete_wp_post(int(wp_post_id)) + except Exception as exc: + logger.warning("WP Post #%d konnte nicht gelöscht werden: %s", wp_post_id, exc) + + update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen") + + +def override_rejected_article(article_id: int) -> None: + """Force-process a previously rejected article.""" + from . import telegram_bot as tg + + article = get_article_by_id(article_id) + if not article: + raise RuntimeError(f"Artikel #{article_id} nicht gefunden") + + # Reset to new so processing is allowed + update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram") + + # Reload + fresh = get_article_by_id(article_id) + if not fresh: + return + + _auto_select_image(fresh) + fresh = get_article_by_id(article_id) + + # Get existing score or re-score + try: + meta = json.loads(fresh.get("meta_json") or "{}") + score = int((meta.get("relevance") or {}).get("score", 0)) + except Exception: + score = 0 + + # Reserve publish slot FIRST so it's in the DB when WP draft is created + slot = reserve_publish_slot(article_id) + fresh = get_article_by_id(article_id) + + wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) + + final = get_article_by_id(article_id) + if final: + tg.notify_new_draft(final, score=score, suggested_publish_at=slot) + + +# --------------------------------------------------------------------------- +# Status helpers (used by /status command) +# --------------------------------------------------------------------------- + +def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]: + """Return articles rejected in the last N days.""" + from .db import get_conn + from .db import rows_to_dicts + cutoff = datetime.now(timezone.utc).isoformat()[:10] + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, title, meta_json, source_url, created_at + FROM articles + WHERE status IN ('error', 'review') + AND json_extract(meta_json, '$.relevance.score') IS NOT NULL + AND date(updated_at) >= date('now', ?) + ORDER BY updated_at DESC + LIMIT 20 + """, + (f"-{days} days",), + ).fetchall() + return rows_to_dicts(rows) + + +def get_pipeline_status_text() -> str: + """Return a text summary of current pipeline state.""" + from .repositories import list_articles as _list + new_count = len(_list(limit=500, status_filter="new")) + approved_count = len(_list(limit=500, status_filter="approved")) + published_count = len(_list(limit=500, status_filter="published")) + error_count = len(_list(limit=500, status_filter="error")) + + return ( + f"📊 Pipeline-Status\n" + f"🆕 Neu / wartend: {new_count}\n" + f"✅ Draft / freigegeben: {approved_count}\n" + f"📢 Veröffentlicht: {published_count}\n" + f"🚫 Fehler / abgelehnt: {error_count}" + ) diff --git a/backend/app/policy.py b/backend/app/policy.py new file mode 100644 index 0000000..af6e65c --- /dev/null +++ b/backend/app/policy.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from typing import Any + + +def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]: + issues: list[str] = [] + if not source: + issues.append("Keine Quelle zugeordnet") + return issues + + risk_level = (source.get("risk_level") or "").strip().lower() + if risk_level != "green": + issues.append(f"Quelle nicht freigegeben (risk_level={risk_level or 'unset'})") + + terms_url = (source.get("terms_url") or "").strip() + if not terms_url: + issues.append("terms_url fehlt") + + license_name = (source.get("license_name") or "").strip() + if not license_name: + issues.append("license_name fehlt") + + last_reviewed_at = (source.get("last_reviewed_at") or "").strip() + if not last_reviewed_at: + issues.append("last_reviewed_at fehlt") + + if int(source.get("is_enabled", 0) or 0) != 1: + issues.append("Quelle ist deaktiviert") + + return issues + + +def is_source_allowed(source: dict[str, Any] | None) -> bool: + return len(evaluate_source_policy(source)) == 0 diff --git a/backend/app/publisher.py b/backend/app/publisher.py new file mode 100644 index 0000000..e27bd1b --- /dev/null +++ b/backend/app/publisher.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .repositories import ( + claim_next_publish_job, + complete_publish_job, + create_publish_job, + fail_publish_job, + get_article_by_id, + mark_article_publish_result, + PublishJobCreate, +) +from .wordpress import publish_article_draft, selected_image_exists + + +@dataclass(frozen=True) +class PublisherStats: + processed: int + success: int + failed: int + requeued: int + + +def enqueue_publish(article_id: int, max_attempts: int = 3) -> int: + return create_publish_job(PublishJobCreate(article_id=article_id, max_attempts=max_attempts)) + + +def _can_publish(article: dict) -> tuple[bool, str | None]: + if article.get("status") not in {"approved", "published"}: + return False, "Artikelstatus muss 'publish' sein" + if not selected_image_exists(article): + return False, "Hauptbild nicht gesetzt" + return True, None + + +def run_publisher(max_jobs: int = 10) -> PublisherStats: + processed = 0 + success = 0 + failed = 0 + requeued = 0 + + for _ in range(max(1, max_jobs)): + job = claim_next_publish_job() + if not job: + break + processed += 1 + job_id = int(job["id"]) + article_id = int(job["article_id"]) + + article = get_article_by_id(article_id) + if not article: + fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False) + failed += 1 + continue + + allowed, reason = _can_publish(article) + if not allowed: + fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False) + mark_article_publish_result( + article_id, + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + error=reason or "blocked", + increment_attempts=True, + set_published_status=False, + ) + failed += 1 + continue + + try: + wp_post_id, wp_post_url = publish_article_draft(article) + complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url) + mark_article_publish_result( + article_id, + wp_post_id=wp_post_id, + wp_post_url=wp_post_url, + error=None, + increment_attempts=True, + set_published_status=True, + ) + success += 1 + except Exception as exc: + attempts = int(job.get("attempts", 1)) + max_attempts = int(job.get("max_attempts", 3)) + should_requeue = attempts < max_attempts + fail_publish_job(job_id, str(exc), requeue=should_requeue) + mark_article_publish_result( + article_id, + wp_post_id=article.get("wp_post_id"), + wp_post_url=article.get("wp_post_url"), + error=str(exc), + increment_attempts=True, + set_published_status=False, + ) + if should_requeue: + requeued += 1 + else: + failed += 1 + + return PublisherStats(processed=processed, success=success, failed=failed, requeued=requeued) diff --git a/backend/app/relevance.py b/backend/app/relevance.py new file mode 100644 index 0000000..8f69693 --- /dev/null +++ b/backend/app/relevance.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from datetime import datetime, timezone + + +def _parse_iso_datetime(value: str | None) -> datetime | None: + if not value: + return None + raw = value.strip() + if not raw: + return None + if raw.endswith("Z"): + raw = raw[:-1] + "+00:00" + try: + parsed = datetime.fromisoformat(raw) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed + + +def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None: + published = _parse_iso_datetime(published_at) + if not published: + return None + ref = now or datetime.now(timezone.utc) + delta = ref - published + if delta.total_seconds() < 0: + return 0 + return delta.days + + +def article_relevance(published_at: str | None, now: datetime | None = None) -> str: + days = article_age_days(published_at, now=now) + if days is None: + return "unbekannt" + if days <= 2: + return "hoch" + if days <= 7: + return "mittel" + if days <= 30: + return "niedrig" + return "alt" diff --git a/backend/app/repositories.py b/backend/app/repositories.py new file mode 100644 index 0000000..cf38055 --- /dev/null +++ b/backend/app/repositories.py @@ -0,0 +1,855 @@ +from __future__ import annotations + +from dataclasses import dataclass +import json +from datetime import datetime, timezone +from typing import Any + +from .db import get_conn, rows_to_dicts + + +@dataclass(frozen=True) +class SourceCreate: + name: str + base_url: str | None + terms_url: str | None + license_name: str | None + risk_level: str + is_enabled: bool + notes: str | None + last_reviewed_at: str | None + + +@dataclass(frozen=True) +class FeedCreate: + name: str + url: str + source_id: int | None + is_enabled: bool + + +@dataclass(frozen=True) +class SourceUpdate: + name: str + base_url: str | None + terms_url: str | None + license_name: str | None + risk_level: str + is_enabled: bool + notes: str | None + last_reviewed_at: str | None + + +@dataclass(frozen=True) +class FeedUpdate: + name: str + url: str + source_id: int | None + is_enabled: bool + + +@dataclass(frozen=True) +class RunCreate: + run_type: str + status: str + details: str | None = None + + +@dataclass(frozen=True) +class ArticleUpsert: + feed_id: int | None + source_article_id: str | None + source_hash: str | None + title: str + source_url: str + canonical_url: str | None + published_at: str | None + author: str | None + summary: str | None + content_raw: str | None + content_rewritten: str | None + image_urls_json: str | None + press_contact: str | None + source_name_snapshot: str | None + source_terms_url_snapshot: str | None + source_license_name_snapshot: str | None + legal_checked: bool + legal_checked_at: str | None + legal_note: str | None + wp_post_id: int | None + wp_post_url: str | None + publish_attempts: int + publish_last_error: str | None + published_to_wp_at: str | None + word_count: int + status: str + meta_json: str | None + + +@dataclass(frozen=True) +class PublishJobCreate: + article_id: int + max_attempts: int = 3 + + +def create_source(payload: SourceCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + """ + INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + payload.name.strip(), + payload.base_url, + payload.terms_url, + payload.license_name, + payload.risk_level, + 1 if payload.is_enabled else 0, + payload.notes, + payload.last_reviewed_at, + ), + ) + return int(cur.lastrowid) + + +def list_sources() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at + FROM sources + ORDER BY id DESC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def get_source_by_id(source_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at + FROM sources + WHERE id = ? + """, + (source_id,), + ).fetchone() + return dict(row) if row else None + + +def update_source(source_id: int, payload: SourceUpdate) -> bool: + with get_conn() as conn: + cur = conn.execute( + """ + UPDATE sources + SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ? + WHERE id = ? + """, + ( + payload.name.strip(), + payload.base_url, + payload.terms_url, + payload.license_name, + payload.risk_level, + 1 if payload.is_enabled else 0, + payload.notes, + payload.last_reviewed_at, + source_id, + ), + ) + return cur.rowcount > 0 + + +def delete_source(source_id: int) -> bool: + with get_conn() as conn: + cur = conn.execute("DELETE FROM sources WHERE id = ?", (source_id,)) + return cur.rowcount > 0 + + +def create_feed(payload: FeedCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)", + (payload.name.strip(), payload.url.strip(), payload.source_id, 1 if payload.is_enabled else 0), + ) + return int(cur.lastrowid) + + +def list_feeds() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, + f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name, + s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + ORDER BY f.id DESC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def list_enabled_feeds() -> list[dict[str, Any]]: + with get_conn() as conn: + rows = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, + s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, + s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + WHERE f.is_enabled = 1 + ORDER BY f.id ASC + """ + ).fetchall() + return rows_to_dicts(rows) + + +def get_feed_by_id(feed_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, + s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, + s.risk_level AS source_risk_level, s.base_url AS source_base_url, + s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled + FROM feeds f + LEFT JOIN sources s ON s.id = f.source_id + WHERE f.id = ? + """, + (feed_id,), + ).fetchone() + return dict(row) if row else None + + +def update_feed(feed_id: int, payload: FeedUpdate) -> bool: + with get_conn() as conn: + cur = conn.execute( + """ + UPDATE feeds + SET name = ?, url = ?, source_id = ?, is_enabled = ? + WHERE id = ? + """, + ( + payload.name.strip(), + payload.url.strip(), + payload.source_id, + 1 if payload.is_enabled else 0, + feed_id, + ), + ) + return cur.rowcount > 0 + + +def delete_feed(feed_id: int) -> bool: + with get_conn() as conn: + cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,)) + return cur.rowcount > 0 + + +def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE feeds + SET etag = ?, last_modified = ?, last_checked_at = datetime('now') + WHERE id = ? + """, + (etag, last_modified, feed_id), + ) + + +def create_run(payload: RunCreate) -> int: + with get_conn() as conn: + cur = conn.execute( + "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)", + (payload.run_type, payload.status, payload.details), + ) + return int(cur.lastrowid) + + +def finish_run(run_id: int, status: str, details: str | None = None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE runs + SET status = ?, details = ?, finished_at = datetime('now') + WHERE id = ? + """, + (status, details, run_id), + ) + + +def list_runs(limit: int = 50) -> list[dict[str, Any]]: + safe_limit = max(1, min(limit, 500)) + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, run_type, status, started_at, finished_at, details + FROM runs + ORDER BY id DESC + LIMIT ? + """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) + + +def get_run_by_id(run_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT id, run_type, status, started_at, finished_at, details + FROM runs + WHERE id = ? + """, + (run_id,), + ).fetchone() + return dict(row) if row else None + + +def get_article_by_id(article_id: int) -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, + a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact, + a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot, + a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at, + a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, + a.scheduled_publish_at + FROM articles a + WHERE a.id = ? + """, + (article_id,), + ).fetchone() + return dict(row) if row else None + + +def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if meta_json: + try: + meta = json.loads(meta_json) + if not isinstance(meta, dict): + meta = {} + except Exception: + meta = {} + + events = meta.get("review_events") + if not isinstance(events, list): + events = [] + events.append(event) + meta["review_events"] = events + return json.dumps(meta, ensure_ascii=False) + + +def _load_meta(meta_json: str | None) -> dict[str, Any]: + if not meta_json: + return {} + try: + parsed = json.loads(meta_json) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + + +def update_article_status( + article_id: int, + new_status: str, + *, + actor: str | None = None, + note: str | None = None, + decision: str | None = None, +) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "from_status": article.get("status"), + "to_status": new_status, + "actor": actor or "system", + "note": note, + "decision": decision, + } + merged_meta = _merge_review_event(article.get("meta_json"), event) + + with get_conn() as conn: + conn.execute( + "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?", + (new_status, merged_meta, article_id), + ) + return True + + +def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "event": "legal_review", + "approved": approved, + "actor": actor or "system", + "note": note, + } + merged_meta = _merge_review_event(article.get("meta_json"), event) + with get_conn() as conn: + conn.execute( + """ + UPDATE articles + SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ? + WHERE id = ? + """, + (1 if approved else 0, note, merged_meta, article_id), + ) + return True + + +def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + url = (image_url or "").strip() + if not url: + return False + if action not in {"select", "exclude", "restore"}: + return False + + meta = _load_meta(article.get("meta_json")) + image_review = meta.get("image_review") + if not isinstance(image_review, dict): + image_review = {} + + excluded = image_review.get("excluded_urls") + if not isinstance(excluded, list): + excluded = [] + excluded_set = {str(item) for item in excluded if item} + + selected_url = image_review.get("selected_url") + if not isinstance(selected_url, str): + selected_url = None + + if action == "select": + selected_url = url + excluded_set.discard(url) + elif action == "exclude": + excluded_set.add(url) + if selected_url == url: + selected_url = None + elif action == "restore": + excluded_set.discard(url) + + image_review["selected_url"] = selected_url + image_review["excluded_urls"] = sorted(excluded_set) + image_review["updated_at"] = datetime.now(timezone.utc).isoformat() + image_review["updated_by"] = actor or "system" + meta["image_review"] = image_review + + with get_conn() as conn: + conn.execute( + "UPDATE articles SET meta_json = ? WHERE id = ?", + (json.dumps(meta, ensure_ascii=False), article_id), + ) + return True + + +def create_publish_job(payload: PublishJobCreate) -> int: + with get_conn() as conn: + existing = conn.execute( + """ + SELECT id FROM publish_jobs + WHERE article_id = ? AND status IN ('queued', 'running') + ORDER BY id DESC + LIMIT 1 + """, + (payload.article_id,), + ).fetchone() + if existing: + return int(existing["id"]) + + cur = conn.execute( + """ + INSERT INTO publish_jobs (article_id, status, attempts, max_attempts) + VALUES (?, 'queued', 0, ?) + """, + (payload.article_id, max(1, payload.max_attempts)), + ) + return int(cur.lastrowid) + + +def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]: + safe_limit = max(1, min(limit, 500)) + with get_conn() as conn: + rows = conn.execute( + """ + SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url, + j.created_at, j.started_at, j.finished_at, a.title AS article_title + FROM publish_jobs j + LEFT JOIN articles a ON a.id = j.article_id + ORDER BY j.id DESC + LIMIT ? + """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) + + +def claim_next_publish_job() -> dict[str, Any] | None: + with get_conn() as conn: + row = conn.execute( + """ + SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url + FROM publish_jobs + WHERE status = 'queued' AND attempts < max_attempts + ORDER BY id ASC + LIMIT 1 + """ + ).fetchone() + if not row: + return None + job_id = int(row["id"]) + conn.execute( + """ + UPDATE publish_jobs + SET status = 'running', + attempts = attempts + 1, + started_at = datetime('now'), + finished_at = NULL + WHERE id = ? + """, + (job_id,), + ) + claimed = conn.execute( + """ + SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url + FROM publish_jobs + WHERE id = ? + """, + (job_id,), + ).fetchone() + return dict(claimed) if claimed else None + + +def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE publish_jobs + SET status = 'success', + wp_post_id = ?, + wp_post_url = ?, + error_message = NULL, + finished_at = datetime('now') + WHERE id = ? + """, + (wp_post_id, wp_post_url, job_id), + ) + + +def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None: + next_status = "queued" if requeue else "failed" + with get_conn() as conn: + conn.execute( + """ + UPDATE publish_jobs + SET status = ?, + error_message = ?, + finished_at = datetime('now') + WHERE id = ? + """, + (next_status, error_message[:2000], job_id), + ) + + +def mark_article_publish_result( + article_id: int, + *, + wp_post_id: int | None, + wp_post_url: str | None, + error: str | None, + increment_attempts: bool, + set_published_status: bool, +) -> None: + with get_conn() as conn: + conn.execute( + """ + UPDATE articles + SET wp_post_id = ?, + wp_post_url = ?, + publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END, + publish_last_error = ?, + published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END, + status = CASE WHEN ? THEN 'published' ELSE status END + WHERE id = ? + """, + ( + wp_post_id, + wp_post_url, + 1 if increment_attempts else 0, + error[:2000] if error else None, + wp_post_id, + 1 if set_published_status else 0, + article_id, + ), + ) + + +def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: + with get_conn() as conn: + # 1) strongest key: source_url + row = conn.execute( + "SELECT id FROM articles WHERE source_url = ?", + (payload.source_url.strip(),), + ).fetchone() + if row: + return int(row["id"]) + + # 2) stable feed+guid combo + if payload.feed_id is not None and payload.source_article_id: + row = conn.execute( + "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?", + (payload.feed_id, payload.source_article_id), + ).fetchone() + if row: + return int(row["id"]) + + # 3) content hash fallback + if payload.source_hash: + row = conn.execute( + "SELECT id FROM articles WHERE source_hash = ?", + (payload.source_hash,), + ).fetchone() + if row: + return int(row["id"]) + + return None + + +def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None: + article_id = _resolve_existing_article_id(payload) + if article_id is None: + return None + return get_article_by_id(article_id) + + +def upsert_article(payload: ArticleUpsert) -> int: + existing_id = _resolve_existing_article_id(payload) + with get_conn() as conn: + if existing_id is None: + conn.execute( + """ + INSERT INTO articles ( + feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author, + summary, content_raw, content_rewritten, image_urls_json, press_contact, + source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, + legal_checked, legal_checked_at, legal_note, + wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at, + word_count, status, meta_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + payload.feed_id, + payload.source_article_id, + payload.source_hash, + payload.title.strip(), + payload.source_url.strip(), + payload.canonical_url, + payload.published_at, + payload.author, + payload.summary, + payload.content_raw, + payload.content_rewritten, + payload.image_urls_json, + payload.press_contact, + payload.source_name_snapshot, + payload.source_terms_url_snapshot, + payload.source_license_name_snapshot, + 1 if payload.legal_checked else 0, + payload.legal_checked_at, + payload.legal_note, + payload.wp_post_id, + payload.wp_post_url, + payload.publish_attempts, + payload.publish_last_error, + payload.published_to_wp_at, + payload.word_count, + payload.status, + payload.meta_json, + ), + ) + else: + conn.execute( + """ + UPDATE articles + SET + feed_id = ?, + source_article_id = ?, + source_hash = ?, + title = ?, + source_url = ?, + canonical_url = ?, + published_at = ?, + author = ?, + summary = ?, + content_raw = ?, + content_rewritten = ?, + image_urls_json = ?, + press_contact = ?, + source_name_snapshot = ?, + source_terms_url_snapshot = ?, + source_license_name_snapshot = ?, + legal_checked = ?, + legal_checked_at = ?, + legal_note = ?, + wp_post_id = ?, + wp_post_url = ?, + publish_attempts = ?, + publish_last_error = ?, + published_to_wp_at = ?, + word_count = ?, + status = ?, + meta_json = ? + WHERE id = ? + """, + ( + payload.feed_id, + payload.source_article_id, + payload.source_hash, + payload.title.strip(), + payload.source_url.strip(), + payload.canonical_url, + payload.published_at, + payload.author, + payload.summary, + payload.content_raw, + payload.content_rewritten, + payload.image_urls_json, + payload.press_contact, + payload.source_name_snapshot, + payload.source_terms_url_snapshot, + payload.source_license_name_snapshot, + 1 if payload.legal_checked else 0, + payload.legal_checked_at, + payload.legal_note, + payload.wp_post_id, + payload.wp_post_url, + payload.publish_attempts, + payload.publish_last_error, + payload.published_to_wp_at, + payload.word_count, + payload.status, + payload.meta_json, + existing_id, + ), + ) + row = conn.execute("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)).fetchone() + if row: + return int(row["id"]) + return int(existing_id) if existing_id else 0 + + +def list_articles_page( + limit: int = 50, + offset: int = 0, + status_filter: str | None = None, + search: str | None = None, +) -> tuple[list[dict[str, Any]], int]: + """Return (articles, total_count) with optional status filter and title search.""" + safe_limit = max(1, min(limit, 200)) + safe_offset = max(0, offset) + + conditions: list[str] = [] + params: list[Any] = [] + if status_filter: + conditions.append("a.status = ?") + params.append(status_filter) + if search: + conditions.append("(a.title LIKE ? OR a.id = ?)") + try: + params.extend([f"%{search}%", int(search)]) + except ValueError: + params.extend([f"%{search}%", -1]) + + where = f"WHERE {' AND '.join(conditions)}" if conditions else "" + select = """ + SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw, + a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at, + a.word_count, f.name AS feed_name + FROM articles a + LEFT JOIN feeds f ON f.id = a.feed_id + """ + with get_conn() as conn: + total = conn.execute( + f"SELECT COUNT(*) FROM articles a {where}", params + ).fetchone()[0] + rows = conn.execute( + f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?", + params + [safe_limit, safe_offset], + ).fetchall() + return rows_to_dicts(rows), total + + +def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int: + """Update wp_post_id (and clear stale wp_post_url) for multiple articles. + + Returns the number of rows actually updated. + Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and + scheduled_publish_at from the live WordPress data. + """ + if not updates: + return 0 + updated = 0 + with get_conn() as conn: + for article_id, new_wp_id in updates: + conn.execute( + "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?", + (new_wp_id, article_id), + ) + updated += 1 + return updated + + +def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]: + safe_limit = max(1, min(limit, 500)) + with get_conn() as conn: + if status_filter: + rows = conn.execute( + """ + SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, + a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, + a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at + FROM articles a + LEFT JOIN feeds f ON f.id = a.feed_id + WHERE a.status = ? + ORDER BY a.id DESC + LIMIT ? + """, + (status_filter, safe_limit), + ).fetchall() + else: + rows = conn.execute( + """ + SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, + a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, + a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, + a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, + a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at + FROM articles a + LEFT JOIN feeds f ON f.id = a.feed_id + ORDER BY a.id DESC + LIMIT ? + """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py new file mode 100644 index 0000000..05937e5 --- /dev/null +++ b/backend/app/rewrite.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import json +import re +from typing import Any +from urllib.request import Request, urlopen + +from .config import get_settings + + +def _sanitize_source_text(text: str) -> str: + raw = (text or "").strip() + if not raw: + return "" + + lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] + if len(lines) > 3: + lines = lines[3:] + + joined = "\n".join(lines) + # Remove press contact block at end from "Pressekontakt" onward. + joined = re.sub( + r"\n?\s*Pressekontakt[\s\S]*$", + "", + joined, + flags=re.IGNORECASE, + ).strip() + return joined + + +def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for raw in tags: + value = re.sub(r"\s+", " ", str(raw or "").strip()) + value = re.sub(r"^[#\-•\s]+", "", value) + value = re.sub(r"[;,.:\s]+$", "", value) + if not value: + continue + if len(value) < 2 or len(value) > 40: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + out.append(value) + if len(out) >= max_tags: + break + return out + + +def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str: + settings = get_settings() + api_key = settings.openai_api_key + if not api_key: + raise RuntimeError("OPENAI_API_KEY fehlt") + + payload = { + "model": settings.openai_model, + "temperature": temperature, + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + } + req = Request( + url="https://api.openai.com/v1/chat/completions", + method="POST", + data=json.dumps(payload).encode("utf-8"), + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + with urlopen(req, timeout=60) as resp: + raw = resp.read().decode("utf-8", errors="replace") + data = json.loads(raw) + choices = data.get("choices") + if not isinstance(choices, list) or not choices: + raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}") + message = choices[0].get("message", {}) + content = message.get("content") + if not isinstance(content, str) or not content.strip(): + raise RuntimeError("OpenAI lieferte keinen Inhalt") + return content.strip() + + +def rewrite_article_text(article: dict[str, Any]) -> str: + source_text = _sanitize_source_text(article.get("content_raw") or "") + if not source_text: + source_text = (article.get("summary") or "").strip() + if not source_text: + raise RuntimeError("Kein Quelltext für Rewrite verfügbar") + + title = (article.get("title") or "").strip() + source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip() + prompt = ( + "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " + "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " + "ohne Pressekontakt, ohne Quellenblock. " + "Nutze klare Absätze und Zwischenüberschriften in HTML (

,

,

  • falls passend). " + "Inhaltlich korrekt bleiben, nichts erfinden. " + f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. " + "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, " + f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n" + f"Titel: {title}\n\n" + f"Originaltext:\n{source_text}" + ) + return _openai_chat( + "Du bist ein deutscher News-Redakteur.", + prompt, + temperature=0.4, + ) + + +def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]: + source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "") + source_text = str(source_text).strip() + if not source_text: + return [] + title = (article.get("title") or "").strip() + prompt = ( + "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. " + f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. " + "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n" + f"Titel: {title}\n\n" + f"Text:\n{source_text[:3500]}" + ) + raw = _openai_chat( + "Du extrahierst präzise, kurze News-Tags auf Deutsch.", + prompt, + temperature=0.2, + ) + try: + parsed = json.loads(raw) + if isinstance(parsed, list): + return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) + except Exception: + pass + # fallback: extract first JSON-like array if model wrapped output + match = re.search(r"\[[\s\S]*\]", raw) + if match: + try: + parsed = json.loads(match.group(0)) + if isinstance(parsed, list): + return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) + except Exception: + return [] + return [] + + +def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]: + """Score article relevance for VanLife/Camping/Outdoor blog (0-100). + + Returns {"score": int, "reason": str, "topics": list[str]}. + Raises RuntimeError on OpenAI failure. + """ + title = (article.get("title") or "").strip() + text = _sanitize_source_text(article.get("content_raw") or "") + if not text: + text = (article.get("summary") or "").strip() + + prompt = ( + "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. " + "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, " + "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. " + "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n" + "Antworte NUR mit einem JSON-Objekt:\n" + '{"score": <0-100>, "reason": "", "topics": ["", ""]}\n\n' + f"Titel: {title}\n\n" + f"Text (Auszug):\n{text[:2000]}" + ) + raw = _openai_chat( + "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.", + prompt, + temperature=0.1, + ) + try: + match = re.search(r"\{[\s\S]*\}", raw) + if match: + parsed = json.loads(match.group(0)) + score = max(0, min(100, int(parsed.get("score", 0)))) + return { + "score": score, + "reason": str(parsed.get("reason", "")), + "topics": [str(t) for t in (parsed.get("topics") or [])], + } + except Exception: + pass + return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []} + + +def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str: + meta: dict[str, Any] = {} + if meta_json: + try: + parsed = json.loads(meta_json) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + meta["generated_tags"] = _normalize_tags(tags) + return json.dumps(meta, ensure_ascii=False) diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py new file mode 100644 index 0000000..d5ea5bf --- /dev/null +++ b/backend/app/scheduler.py @@ -0,0 +1,336 @@ +"""Smart publishing scheduler. + +Calculates suggested publish slots for new WordPress drafts. +Rules: +- Maximum N drafts per day (configurable, default 2) +- Preferred slots: configurable hours (default 09:00 and 14:00 CET) +- New articles queue up after the last already-scheduled article +- Checks both local DB AND WordPress future posts to avoid double-booking +""" +from __future__ import annotations + +import base64 +import json +import threading +import urllib.request +from datetime import date, datetime, timedelta, timezone +from typing import Any + +from .config import get_settings +from .db import get_conn + +# Ensures that concurrent pipeline runs (two threads) never assign the same slot. +_slot_lock = threading.Lock() + + +# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity) +_CET_OFFSET = timedelta(hours=1) + + +def _today_cet() -> date: + return (datetime.now(timezone.utc) + _CET_OFFSET).date() + + +def _preferred_hours() -> list[int]: + settings = get_settings() + try: + return [int(h.strip()) for h in settings.pipeline_publish_hours.split(",") if h.strip()] + except Exception: + return [9, 14] + + +def _fetch_wp_occupied_slots() -> set[tuple[str, int]]: + """Fetch all future-scheduled WordPress posts and return occupied (date_iso, hour) pairs. + + This prevents the scheduler from assigning a slot that is already taken + by a WP post that was not created via this pipeline (e.g. manually or via recovery scripts). + Returns an empty set on any error so the scheduler degrades gracefully. + """ + settings = get_settings() + try: + auth = base64.b64encode( + f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode() + ).decode() + url = ( + f"{settings.wordpress_base_url}/wp-json/wp/v2/posts" + f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date" + ) + req = urllib.request.Request(url, headers={"Authorization": f"Basic {auth}"}) + with urllib.request.urlopen(req, timeout=10) as resp: + posts = json.loads(resp.read()) + occupied: set[tuple[str, int]] = set() + for p in posts: + try: + dt = datetime.fromisoformat(p["date"]) + occupied.add((dt.date().isoformat(), dt.hour)) + except Exception: + pass + return occupied + except Exception: + return set() + + +def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None: + """Return the date of the latest already-scheduled slot (DB + WP).""" + today = _today_cet() + + # Latest from local DB + with get_conn() as conn: + row = conn.execute( + """ + SELECT MAX(scheduled_publish_at) AS last_slot + FROM articles + WHERE scheduled_publish_at IS NOT NULL + AND scheduled_publish_at >= ? + AND status NOT IN ('error', 'no_image') + """, + (today.isoformat() + "T00:00:00",), + ).fetchone() + db_last: date | None = None + if row and row["last_slot"]: + try: + db_last = datetime.fromisoformat(row["last_slot"]).date() + except Exception: + pass + + # Latest from WP + wp_last: date | None = None + for d_str, _ in wp_occupied: + try: + d = date.fromisoformat(d_str) + if d >= today and (wp_last is None or d > wp_last): + wp_last = d + except Exception: + pass + + if db_last and wp_last: + return max(db_last, wp_last) + return db_last or wp_last + + +def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None: + """Return first preferred hour not yet used on target_date (DB + WP), or None if day is full.""" + hours = _preferred_hours() + date_str = target_date.isoformat() + + # Hours used in local DB + with get_conn() as conn: + rows = conn.execute( + """ + SELECT scheduled_publish_at FROM articles + WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? + AND status NOT IN ('error', 'no_image') + """, + (date_str + "T00:00:00", date_str + "T23:59:59"), + ).fetchall() + + used_hours: set[int] = set() + for row in rows: + ts = row["scheduled_publish_at"] or "" + try: + used_hours.add(datetime.fromisoformat(ts).hour) + except Exception: + pass + + # Hours used in WordPress + for d_str, h in wp_occupied: + if d_str == date_str: + used_hours.add(h) + + for h in hours: + if h not in used_hours: + return h + return None + + +def _format_slot(d: date, hour: int) -> str: + weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] + wd = weekday_names[d.weekday()] + return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr" + + +def _find_next_free_slot( + wp_occupied: set[tuple[str, int]], lookahead_days: int = 60 +) -> tuple[date, int] | None: + """Find the next free (date, hour) slot. + + Starts from tomorrow and scans forward, filling any gaps in the schedule + rather than always appending after the last existing post. + """ + today = _today_cet() + tomorrow = today + timedelta(days=1) + + for offset in range(0, lookahead_days + 1): + candidate = tomorrow + timedelta(days=offset) + hour = _next_free_hour(candidate, wp_occupied) + if hour is not None: + return candidate, hour + + return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 + + +def get_schedule_overview(lookahead_days: int = 60) -> list[dict]: + """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.""" + today = _today_cet() + hours = _preferred_hours() + + # Slots booked in local DB + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at + FROM articles + WHERE scheduled_publish_at IS NOT NULL + AND scheduled_publish_at >= ? + AND status NOT IN ('error', 'no_image') + ORDER BY scheduled_publish_at + """, + (today.isoformat() + "T00:00:00",), + ).fetchall() + + db_slots: dict[tuple[str, int], dict] = {} + for row in rows: + try: + dt = datetime.fromisoformat(row["scheduled_publish_at"]) + key = (dt.date().isoformat(), dt.hour) + db_slots[key] = { + "date": dt.date().isoformat(), + "hour": dt.hour, + "formatted": _format_slot(dt.date(), dt.hour), + "source": "db", + "article_id": row["id"], + "article_title": row["title"], + "article_status": row["status"], + "wp_post_id": row["wp_post_id"], + "wp_post_url": row["wp_post_url"], + } + except Exception: + pass + + # Slots occupied in WordPress but not in local DB + wp_occupied = _fetch_wp_occupied_slots() + wp_only: list[dict] = [] + for d_str, h in sorted(wp_occupied): + if (d_str, h) in db_slots: + continue + try: + d = date.fromisoformat(d_str) + if d >= today: + wp_only.append({ + "date": d_str, + "hour": h, + "formatted": _format_slot(d, h), + "source": "wordpress", + "article_id": None, + "article_title": "(WP-Beitrag außerhalb Pipeline)", + "article_status": None, + "wp_post_id": None, + "wp_post_url": None, + }) + except Exception: + pass + + all_slots = list(db_slots.values()) + wp_only + all_slots.sort(key=lambda s: (s["date"], s["hour"])) + return all_slots + + +def release_publish_slot(article_id: int) -> None: + """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", + (article_id,), + ) + + +def suggest_publish_slot() -> str: + """Return a suggested publish datetime string (CET) for the next free slot.""" + wp_occupied = _fetch_wp_occupied_slots() + result = _find_next_free_slot(wp_occupied) + if result: + d, hour = result + return _format_slot(d, hour) + tomorrow = _today_cet() + timedelta(days=1) + return _format_slot(tomorrow, _preferred_hours()[0] if _preferred_hours() else 9) + + +def reserve_publish_slot(article_id: int) -> str: + """Reserve a publish slot for an article and persist it in the DB. + + If the article already has a scheduled_publish_at, keep it unchanged. + Returns the formatted publish datetime string. + + Uses a module-level lock so that concurrent pipeline runs (two threads) + cannot read the same "free" slot and assign it twice. + """ + # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow + # and must not block other threads unnecessarily. + wp_occupied = _fetch_wp_occupied_slots() + + with _slot_lock: + # Single DB connection for the entire read-find-write cycle so the + # slot we pick is still free when we write it. + with get_conn() as conn: + row = conn.execute( + "SELECT scheduled_publish_at FROM articles WHERE id = ?", + (article_id,), + ).fetchone() + existing_slot = row["scheduled_publish_at"] if row else None + if existing_slot: + try: + dt = datetime.fromisoformat(existing_slot) + return _format_slot(dt.date(), dt.hour) + except Exception: + pass # invalid — fall through and assign a fresh slot + + # Find the next free (date, hour) slot using THIS connection so we + # see all slots written during this lock window. + hours = _preferred_hours() + today = _today_cet() + tomorrow = today + timedelta(days=1) + candidate: date | None = None + chosen_hour: int | None = None + + for offset in range(0, 61): + d = tomorrow + timedelta(days=offset) + date_str = d.isoformat() + + rows = conn.execute( + """ + SELECT scheduled_publish_at FROM articles + WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? + AND status NOT IN ('error', 'no_image') + """, + (date_str + "T00:00:00", date_str + "T23:59:59"), + ).fetchall() + + used_hours: set[int] = set() + for r in rows: + ts = r["scheduled_publish_at"] or "" + try: + used_hours.add(datetime.fromisoformat(ts).hour) + except Exception: + pass + for d_str, h in wp_occupied: + if d_str == date_str: + used_hours.add(h) + + for h in hours: + if h not in used_hours: + candidate = d + chosen_hour = h + break + if candidate is not None: + break + + if candidate is None: + candidate = tomorrow + chosen_hour = hours[0] if hours else 9 + + iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00" + conn.execute( + "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?", + (iso_ts, article_id), + ) + return _format_slot(candidate, chosen_hour) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py new file mode 100644 index 0000000..d3cbed8 --- /dev/null +++ b/backend/app/source_extraction.py @@ -0,0 +1,442 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from html import unescape +import re +from typing import Any +from urllib.parse import urljoin +from urllib.request import Request, urlopen + +DEFAULT_TIMEOUT_SECONDS = 10 +DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)" + + +@dataclass(frozen=True) +class ExtractedArticle: + title: str | None + author: str | None + canonical_url: str | None + summary: str | None + content_text: str | None + images: list[str] + press_contact: str | None + extraction_error: str | None = None + image_metadata: dict[str, dict] = field(default_factory=dict) + + +def _clean_text(raw: str | None) -> str | None: + if not raw: + return None + text = unescape(raw) + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text or None + + +def _strip_noise(html: str) -> str: + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + return html + + +def _meta_content(html: str, attr: str, value: str) -> str | None: + pattern = re.compile( + rf"]+{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern.search(html) + if match: + return _clean_text(match.group(1)) + + # handle reversed attribute order + pattern_rev = re.compile( + rf"]+content\s*=\s*[\"']([^\"']+)[\"'][^>]*{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern_rev.search(html) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_title(html: str) -> str | None: + title = _meta_content(html, "property", "og:title") + if title: + return title + + match = re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) + if match: + cleaned = _clean_text(match.group(1)) + if cleaned: + return cleaned + + match = re.search(r"]*>([\s\S]*?)

", html, re.IGNORECASE) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_canonical(html: str) -> str | None: + match = re.search( + r"]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + + match = re.search( + r"]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_author(html: str) -> str | None: + for attr, value in (("name", "author"), ("property", "article:author"), ("property", "og:article:author")): + author = _meta_content(html, attr, value) + if author: + return author + + for pattern in ( + r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})", + r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + author = _clean_text(match.group(1)) + if author: + return author + return None + + +def _extract_images(html: str, page_url: str) -> list[str]: + images: list[str] = [] + seen: set[str] = set() + + for prop in ("og:image", "twitter:image"): + pattern = re.compile( + rf"]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + for match in pattern.finditer(html): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + for match in re.finditer(r"]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + return images + + +def _extract_content_text(html: str) -> str | None: + section = None + for pattern in ( + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + section = match.group(1) + break + + if not section: + section = html + + paragraphs = [] + for match in re.finditer(r"]*>([\s\S]*?)", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE): + paragraphs.append(text) + + for match in re.finditer(r"]*>([\s\S]*?)

", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and len(text) > 2: + paragraphs.append(text) + + if paragraphs: + return "\n".join(paragraphs) + + stripped = _clean_text(section) + return stripped + + +def _extract_press_contact(content_text: str | None) -> str | None: + if not content_text: + return None + + lines = [line.strip() for line in content_text.split("\n") if line.strip()] + marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE) + for idx, line in enumerate(lines): + if marker_re.search(line): + chunk = [line] + for nxt in lines[idx + 1 : idx + 6]: + if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE): + break + chunk.append(nxt) + return _clean_text("\n".join(chunk)) + + match = re.search( + r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)", + content_text, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +# CSS class keywords that indicate a copyright/credit element inside a figcaption +_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + +# Inline text patterns that signal a credit/copyright notice +_CREDIT_TEXT_RE = re.compile( + r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})", + re.IGNORECASE, +) + +# data-* attribute names that carry credit/caption information directly on +_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright") +_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description") + +# Class keywords for adjacent sibling credit spans/divs after an +_ADJ_CREDIT_CLASS_RE = re.compile( + r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", + re.IGNORECASE, +) + + +def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]: + """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}. + + Uses three progressive strategies: + 1.
with +
+ 2. data-* attributes on tags not already covered + 3. tags whose immediately following HTML contains a credit element + """ + result: dict[str, dict] = {} + + try: + # ------------------------------------------------------------------ + # Strategy 1:
blocks containing and
+ # ------------------------------------------------------------------ + for fig_match in re.finditer(r"]*>([\s\S]*?)
", html, re.IGNORECASE): + fig_html = fig_match.group(1) + + # Locate image src (src or lazy-loaded data-src) + img_match = re.search( + r"]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + fig_html, + re.IGNORECASE, + ) + if not img_match: + continue + img_src = urljoin(page_url, img_match.group(1).strip()) + + # Locate figcaption + figcap_match = re.search( + r"]*>([\s\S]*?)
", + fig_html, + re.IGNORECASE, + ) + if not figcap_match: + continue + figcap_html = figcap_match.group(1) + + # --- Extract credit --- + credit: str | None = None + + # Try credit via class attribute on an inner element + credit_elem_match = re.search( + r"<(?:span|p|div)[^>]*" + + _CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + figcap_html, + re.IGNORECASE, + ) + if credit_elem_match: + credit = _clean_text(credit_elem_match.group(1)) + + # Fallback: scan plain text of figcaption for credit patterns + if not credit: + figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html)) + cred_text_match = _CREDIT_TEXT_RE.search(figcap_text) + if cred_text_match: + credit = _clean_text(cred_text_match.group(1)) + + # --- Extract caption (full figcaption text) --- + caption = _clean_text(figcap_html) + + # Only store entries that carry at least one piece of metadata + if caption or credit: + entry: dict[str, str] = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 2: data-* attributes on tags + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + # Resolve image URL (prefer src over data-src) + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by Strategy 1 + if img_src in result: + continue + + credit: str | None = None + caption: str | None = None + + for attr in _IMG_DATA_CREDIT_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + credit = _clean_text(attr_match.group(1)) + break + + for attr in _IMG_DATA_CAPTION_ATTRS: + attr_match = re.search( + rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', + img_attrs, + re.IGNORECASE, + ) + if attr_match: + caption = _clean_text(attr_match.group(1)) + break + + if caption or credit: + entry = {} + if caption: + entry["caption"] = caption + if credit: + entry["credit"] = credit + result[img_src] = entry + + # ------------------------------------------------------------------ + # Strategy 3: followed within 200 chars by a credit element + # ------------------------------------------------------------------ + for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): + img_attrs = img_match.group(1) + + src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) + if not src_match: + continue + img_src = urljoin(page_url, src_match.group(1).strip()) + + # Skip images already handled by earlier strategies + if img_src in result: + continue + + # Look at the 200 characters of HTML immediately after the img tag + after_start = img_match.end() + after_html = html[after_start : after_start + 200] + + adj_match = re.search( + r"<(?:span|p|div)[^>]*" + + _ADJ_CREDIT_CLASS_RE.pattern + + r"[^>]*>([\s\S]*?)", + after_html, + re.IGNORECASE, + ) + if adj_match: + credit = _clean_text(adj_match.group(1)) + if credit: + result[img_src] = {"credit": credit} + + except Exception: + return {} + + return result + + +def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: + try: + req = Request( + url=url, + headers={ + "User-Agent": DEFAULT_USER_AGENT, + "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", + }, + ) + with urlopen(req, timeout=timeout_seconds) as resp: + raw = resp.read() + charset = resp.headers.get_content_charset() or "utf-8" + html = raw.decode(charset, errors="replace") + except Exception as exc: + return ExtractedArticle( + title=None, + author=None, + canonical_url=None, + summary=None, + content_text=None, + images=[], + press_contact=None, + extraction_error=str(exc), + ) + + html = _strip_noise(html) + title = _extract_title(html) + author = _extract_author(html) + canonical_url = _extract_canonical(html) + summary = _meta_content(html, "name", "description") + content_text = _extract_content_text(html) + if not summary and content_text: + summary = _clean_text(content_text[:320]) + images = _extract_images(html, url) + press_contact = _extract_press_contact(content_text) + image_metadata = _extract_image_metadata(html, url) + + return ExtractedArticle( + title=title, + author=author, + canonical_url=canonical_url, + summary=summary, + content_text=content_text, + images=images, + press_contact=press_contact, + extraction_error=None, + image_metadata=image_metadata, + ) + + +def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: + return { + "title": article.title, + "author": article.author, + "canonical_url": article.canonical_url, + "summary": article.summary, + "images": article.images, + "press_contact": article.press_contact, + "extraction_error": article.extraction_error, + "image_metadata": article.image_metadata, + } diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py new file mode 100644 index 0000000..880a49d --- /dev/null +++ b/backend/app/telegram_bot.py @@ -0,0 +1,474 @@ +"""Telegram Bot integration for RSS-News pipeline notifications and controls.""" +from __future__ import annotations + +import json +import logging +from typing import Any +from urllib.error import URLError +from urllib.parse import urlencode +from urllib.request import Request, urlopen + +from .config import get_settings + +logger = logging.getLogger(__name__) + +_BASE = "https://api.telegram.org/bot{token}/{method}" +_N8N_APP_RELEASE_WEBHOOK = "https://n8n.vanityontour.de/webhook/tg-app-release-bot-v1/webhook" + + +# --------------------------------------------------------------------------- +# Low-level API helpers +# --------------------------------------------------------------------------- + +def _call(method: str, payload: dict[str, Any]) -> dict[str, Any]: + settings = get_settings() + token = settings.telegram_bot_token + if not token: + raise RuntimeError("TELEGRAM_BOT_TOKEN nicht konfiguriert") + url = _BASE.format(token=token, method=method) + data = json.dumps(payload).encode("utf-8") + req = Request( + url=url, + data=data, + method="POST", + headers={"Content-Type": "application/json", "Accept": "application/json"}, + ) + try: + with urlopen(req, timeout=15) as resp: + raw = resp.read().decode("utf-8", errors="replace") + return json.loads(raw) + except URLError as exc: + logger.error("Telegram API Fehler (%s): %s", method, exc) + raise RuntimeError(f"Telegram API Fehler: {exc}") from exc + + +def _chat_id() -> str: + settings = get_settings() + cid = settings.telegram_chat_id + if not cid: + raise RuntimeError("TELEGRAM_CHAT_ID nicht konfiguriert") + return cid + + +def _inline_keyboard(buttons: list[list[dict[str, str]]]) -> dict: + return {"inline_keyboard": buttons} + + +# --------------------------------------------------------------------------- +# Public send functions +# --------------------------------------------------------------------------- + +def send_message(text: str, reply_markup: dict | None = None, parse_mode: str = "HTML") -> dict: + payload: dict[str, Any] = { + "chat_id": _chat_id(), + "text": text, + "parse_mode": parse_mode, + "disable_web_page_preview": False, + } + if reply_markup: + payload["reply_markup"] = reply_markup + return _call("sendMessage", payload) + + +def send_photo_message( + photo_url: str, + caption: str, + reply_markup: dict | None = None, + parse_mode: str = "HTML", +) -> dict: + payload: dict[str, Any] = { + "chat_id": _chat_id(), + "photo": photo_url, + "caption": caption, + "parse_mode": parse_mode, + } + if reply_markup: + payload["reply_markup"] = reply_markup + try: + return _call("sendPhoto", payload) + except Exception: + # Fall back to text message if photo fails (e.g. image URL no longer valid) + return send_message(caption, reply_markup=reply_markup, parse_mode=parse_mode) + + +def answer_callback_query(callback_query_id: str, text: str = "") -> None: + try: + _call("answerCallbackQuery", {"callback_query_id": callback_query_id, "text": text}) + except Exception as exc: + logger.warning("answerCallbackQuery fehlgeschlagen: %s", exc) + + +def edit_message_reply_markup(chat_id: str, message_id: int, reply_markup: dict | None = None) -> None: + payload: dict[str, Any] = {"chat_id": chat_id, "message_id": message_id} + if reply_markup: + payload["reply_markup"] = reply_markup + else: + payload["reply_markup"] = {"inline_keyboard": []} + try: + _call("editMessageReplyMarkup", payload) + except Exception as exc: + logger.warning("editMessageReplyMarkup fehlgeschlagen: %s", exc) + + +def setup_webhook(webhook_url: str) -> dict: + settings = get_settings() + payload: dict[str, Any] = {"url": webhook_url, "allowed_updates": ["message", "callback_query"]} + if settings.telegram_webhook_secret: + payload["secret_token"] = settings.telegram_webhook_secret + return _call("setWebhook", payload) + + +def delete_webhook() -> dict: + return _call("deleteWebhook", {}) + + +def _forward_to_n8n_app_release(update: dict[str, Any]) -> None: + """Forward a Telegram update to the N8N App Release webhook.""" + try: + data = json.dumps(update).encode("utf-8") + req = Request( + url=_N8N_APP_RELEASE_WEBHOOK, + data=data, + method="POST", + headers={"Content-Type": "application/json"}, + ) + with urlopen(req, timeout=5) as _: + pass + except Exception as exc: + logger.debug("N8N App-Release-Forward fehlgeschlagen: %s", exc) + + +# --------------------------------------------------------------------------- +# Notification helpers +# --------------------------------------------------------------------------- + +def _format_tags(meta_json: str | None) -> str: + if not meta_json: + return "" + try: + meta = json.loads(meta_json) + tags = meta.get("generated_tags") or [] + if tags: + return " ".join(f"#{t.replace(' ', '_')}" for t in tags[:6]) + except Exception: + pass + return "" + + +def _score_emoji(score: int) -> str: + if score >= 85: + return "🟢" + if score >= 70: + return "🟡" + return "🔴" + + +def notify_new_draft( + article: dict[str, Any], + score: int, + suggested_publish_at: str | None = None, +) -> None: + """Send Telegram notification for a newly created WP draft.""" + title = (article.get("title") or "Ohne Titel").strip() + wp_url = article.get("wp_post_url") or "" + tags_str = _format_tags(article.get("meta_json")) + art_id = article.get("id") + + score_line = f"{_score_emoji(score)} Relevanz-Score: {score}/100" + publish_line = f"📅 Vorgeschlagene Veröffentlichung: {suggested_publish_at}" if suggested_publish_at else "" + link_line = f'🔗 Draft in WordPress öffnen' if wp_url else "" + tags_line = f"🏷 {tags_str}" if tags_str else "" + + text_parts = [ + f"✅ Neuer Draft erstellt", + f"📰 {title}", + score_line, + ] + if publish_line: + text_parts.append(publish_line) + if tags_line: + text_parts.append(tags_line) + if link_line: + text_parts.append(link_line) + + text = "\n".join(text_parts) + + keyboard = _inline_keyboard([ + [ + {"text": "✏️ Neu schreiben", "callback_data": f"rewrite:{art_id}"}, + {"text": "❌ Verwerfen", "callback_data": f"discard:{art_id}"}, + ] + ]) + + # Try with image first + meta = {} + try: + meta = json.loads(article.get("meta_json") or "{}") + except Exception: + pass + image_url = None + image_review = meta.get("image_review") or {} + if isinstance(image_review, dict): + image_url = image_review.get("selected_url") + if not image_url: + image_sel = (meta.get("extraction") or {}).get("image_selection") or {} + image_url = image_sel.get("primary") + + if image_url: + send_photo_message(image_url, caption=text, reply_markup=keyboard) + else: + send_message(text, reply_markup=keyboard) + + +def notify_relevance_warning(article: dict[str, Any], score: int, reason: str) -> None: + """Send Telegram warning for borderline articles (score between warn and auto thresholds).""" + title = (article.get("title") or "Ohne Titel").strip() + art_id = article.get("id") + source_url = article.get("source_url") or "" + + text = ( + f"⚠️ Artikel mit niedrigem Relevanz-Score\n" + f"📰 {title}\n" + f"{_score_emoji(score)} Score: {score}/100\n" + f"💬 {reason}\n" + f'🔗 Originalartikel' + ) + keyboard = _inline_keyboard([ + [ + {"text": "➕ Trotzdem verarbeiten", "callback_data": f"override:{art_id}"}, + {"text": "❌ Ablehnen", "callback_data": f"reject:{art_id}"}, + ] + ]) + send_message(text, reply_markup=keyboard) + + +def notify_rejected_summary(articles: list[dict[str, Any]]) -> None: + """Send summary of rejected articles for this pipeline run.""" + if not articles: + return + lines = [f"🚫 {len(articles)} Artikel abgelehnt (Score < {get_settings().pipeline_relevance_warn})\n"] + for art in articles[:10]: + title = (art.get("title") or "Ohne Titel")[:60] + score = _get_relevance_score(art) + reason = _get_rejection_reason(art) + art_id = art.get("id") + lines.append(f"• {title} (Score: {score}) — {reason}") + if len(articles) > 10: + lines.append(f"... und {len(articles) - 10} weitere") + + text = "\n".join(lines) + # Build override buttons for first 5 + rows = [] + for art in articles[:5]: + art_id = art.get("id") + title = (art.get("title") or "")[:25] + rows.append([{"text": f"➕ {title}…", "callback_data": f"override:{art_id}"}]) + + keyboard = _inline_keyboard(rows) if rows else None + send_message(text, reply_markup=keyboard) + + +def notify_error(message: str) -> None: + """Send error alert to Telegram.""" + try: + send_message(f"🔴 Fehler im RSS-Pipeline\n{message}") + except Exception as exc: + logger.error("Telegram Fehler-Benachrichtigung fehlgeschlagen: %s", exc) + + +def notify_pipeline_started(trigger: str = "auto") -> None: + icon = "🤖" if trigger == "auto" else "👤" + try: + send_message(f"{icon} Pipeline gestartet (Auslöser: {trigger})") + except Exception: + pass + + +def notify_pipeline_done(stats: dict[str, Any]) -> None: + ingested = stats.get("ingested", 0) + processed = stats.get("processed", 0) + drafts = stats.get("drafts_created", 0) + rejected = stats.get("rejected", 0) + quality_gate_rejected = stats.get("quality_gate_rejected", 0) + no_image = stats.get("no_image", 0) + warnings = stats.get("warnings", 0) + errors = stats.get("errors", 0) + + lines = [ + "📊 Pipeline abgeschlossen", + f"📥 Neue Artikel importiert: {ingested}", + f"⚙️ Verarbeitet: {processed}", + f"📝 Drafts erstellt: {drafts}", + ] + if rejected: + lines.append(f"🚫 Abgelehnt (Score): {rejected}") + if quality_gate_rejected: + lines.append(f"✂️ Qualitätsprüfung: {quality_gate_rejected}") + if no_image: + lines.append(f"🖼️ Kein Bild: {no_image}") + if warnings: + lines.append(f"⚠️ Warnungen: {warnings}") + if errors: + lines.append(f"🔴 Fehler: {errors}") + + try: + send_message("\n".join(lines)) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Helper to read relevance info from meta_json +# --------------------------------------------------------------------------- + +def _get_relevance_score(article: dict[str, Any]) -> int: + try: + meta = json.loads(article.get("meta_json") or "{}") + return int(meta.get("relevance", {}).get("score", 0)) + except Exception: + return 0 + + +def _get_rejection_reason(article: dict[str, Any]) -> str: + try: + meta = json.loads(article.get("meta_json") or "{}") + return str(meta.get("relevance", {}).get("reason", ""))[:80] + except Exception: + return "" + + +# --------------------------------------------------------------------------- +# Incoming update handler (called by webhook endpoint) +# --------------------------------------------------------------------------- + +def handle_update(update: dict[str, Any]) -> None: + """Process an incoming Telegram update.""" + # Import here to avoid circular imports + from . import pipeline as _pipeline + + if "callback_query" in update: + _handle_callback(update["callback_query"]) + elif "message" in update: + _handle_message(update["message"]) + + +def _handle_message(message: dict[str, Any]) -> None: + from . import pipeline as _pipeline + + text = (message.get("text") or "").strip() + if not text.startswith("/"): + return + + cmd = text.split()[0].lower().lstrip("/") + if "@" in cmd: + cmd = cmd.split("@")[0] + + if cmd == "run": + send_message("🤖 Pipeline wird manuell gestartet …") + try: + stats = _pipeline.run_auto_pipeline(trigger="manual") + notify_pipeline_done(stats) + except Exception as exc: + notify_error(f"/run fehlgeschlagen: {exc}") + + elif cmd == "rejected": + try: + articles = _pipeline.get_recently_rejected(days=3) + if not articles: + send_message("✅ Keine abgelehnten Artikel in den letzten 3 Tagen.") + else: + notify_rejected_summary(articles) + except Exception as exc: + notify_error(f"/rejected fehlgeschlagen: {exc}") + + elif cmd == "status": + try: + status_text = _pipeline.get_pipeline_status_text() + send_message(status_text) + except Exception as exc: + notify_error(f"/status fehlgeschlagen: {exc}") + + elif cmd == "help": + send_message( + "📋 Verfügbare Befehle\n" + "/run — Pipeline manuell starten\n" + "/rejected — Abgelehnte Artikel der letzten 3 Tage\n" + "/status — Pipeline-Status\n" + "/help — Diese Hilfe" + ) + + else: + # Unbekannter Befehl → an N8N App-Release-Workflow weiterleiten + _forward_to_n8n_app_release({"message": message}) + + +def _handle_callback(callback_query: dict[str, Any]) -> None: + from . import pipeline as _pipeline + from .repositories import get_article_by_id, update_article_status + + query_id = callback_query.get("id", "") + data = (callback_query.get("data") or "").strip() + chat_id = str(callback_query.get("message", {}).get("chat", {}).get("id", "")) + message_id = int(callback_query.get("message", {}).get("message_id", 0)) + + if ":" not in data: + answer_callback_query(query_id, "Ungültige Aktion") + return + + action, _, raw_id = data.partition(":") + try: + article_id = int(raw_id) + except ValueError: + answer_callback_query(query_id, "Ungültige Artikel-ID") + return + + article = get_article_by_id(article_id) + if not article: + answer_callback_query(query_id, "Artikel nicht gefunden") + return + + # Answer Telegram immediately so the spinning indicator stops + action_labels = { + "rewrite": "✏️ Artikel wird neu geschrieben …", + "discard": "❌ Artikel verworfen", + "override": "➕ Artikel wird verarbeitet …", + "reject": "🚫 Abgelehnt", + } + answer_callback_query(query_id, action_labels.get(action, "")) + edit_message_reply_markup(chat_id, message_id) + + logger.info("Callback: action=%s article_id=%s", action, article_id) + + if action == "rewrite": + try: + logger.info("Rewrite #%d: starte rewrite_and_update_draft", article_id) + _pipeline.rewrite_and_update_draft(article_id) + logger.info("Rewrite #%d: abgeschlossen, sende Benachrichtigung", article_id) + updated = get_article_by_id(article_id) + if updated: + from .scheduler import suggest_publish_slot + slot = suggest_publish_slot() + notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot) + except Exception as exc: + logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc, exc_info=True) + notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}") + + elif action == "discard": + try: + _pipeline.discard_article(article_id) + except Exception as exc: + logger.error("Discard #%d fehlgeschlagen: %s", article_id, exc) + notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}") + + elif action == "override": + try: + _pipeline.override_rejected_article(article_id) + except Exception as exc: + logger.error("Override #%d fehlgeschlagen: %s", article_id, exc) + notify_error(f"Override #{article_id} fehlgeschlagen: {exc}") + + elif action == "reject": + update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram") + + else: + logger.warning("Unbekannte Callback-Aktion: %s", action) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py new file mode 100644 index 0000000..bb96198 --- /dev/null +++ b/backend/app/wordpress.py @@ -0,0 +1,689 @@ +from __future__ import annotations + +import base64 +from html import escape +import logging +import json +import mimetypes +from pathlib import Path +import re +from typing import Any +from html import unescape as _html_unescape +from urllib.parse import quote_plus, urlparse +from urllib.request import Request, urlopen + +from .config import get_settings + + +def _auth_header(username: str, app_password: str) -> str: + token = base64.b64encode(f"{username}:{app_password}".encode("utf-8")).decode("ascii") + return f"Basic {token}" + + +def _wp_request( + *, + base_url: str, + auth_header: str, + method: str, + endpoint: str, + payload: dict[str, Any] | None = None, +) -> Any: + url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}" + data = json.dumps(payload).encode("utf-8") if payload is not None else None + req = Request( + url=url, + data=data, + method=method, + headers={ + "Authorization": auth_header, + "Content-Type": "application/json; charset=utf-8", + "Accept": "application/json", + "User-Agent": "rss-news-publisher/1.0", + }, + ) + with urlopen(req, timeout=20) as resp: + raw = resp.read().decode("utf-8", errors="replace") + return json.loads(raw) if raw else {} + + +def _selected_image_url_from_meta(meta_json: str | None) -> str | None: + if not meta_json: + return None + try: + meta = json.loads(meta_json) + except Exception: + return None + if not isinstance(meta, dict): + return None + image_review = meta.get("image_review") + if not isinstance(image_review, dict): + return None + selected = image_review.get("selected_url") + return selected if isinstance(selected, str) and selected.strip() else None + + +def _selected_tags_from_meta(meta_json: str | None) -> list[str]: + if not meta_json: + return [] + try: + meta = json.loads(meta_json) + except Exception: + return [] + if not isinstance(meta, dict): + return [] + raw_tags = meta.get("generated_tags") + if not isinstance(raw_tags, list): + return [] + tags: list[str] = [] + seen: set[str] = set() + for item in raw_tags: + value = str(item or "").strip() + if not value: + continue + key = value.casefold() + if key in seen: + continue + seen.add(key) + tags.append(value) + if len(tags) >= 12: + break + return tags + + +def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]: + ids: list[int] = [] + seen: set[int] = set() + for tag in tags: + name = tag.strip() + if not name: + continue + try: + endpoint = f"tags?search={quote_plus(name)}&per_page=20" + result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint) + tag_id: int | None = None + if isinstance(result, list): + for row in result: + if not isinstance(row, dict): + continue + row_name = str(row.get("name") or "") + rid = int(row.get("id", 0) or 0) + if rid <= 0: + continue + if row_name.casefold() == name.casefold(): + tag_id = rid + break + if tag_id is None: + for row in result: + if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0: + tag_id = int(row.get("id", 0)) + break + if tag_id is None: + created = _wp_request( + base_url=base_url, + auth_header=auth_header, + method="POST", + endpoint="tags", + payload={"name": name}, + ) + if isinstance(created, dict): + rid = int(created.get("id", 0) or 0) + if rid > 0: + tag_id = rid + if tag_id is not None and tag_id > 0 and tag_id not in seen: + seen.add(tag_id) + ids.append(tag_id) + except Exception: + continue + return ids + + +_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"} +_logger = logging.getLogger(__name__) + + +def _sanitize_image_url(url: str) -> str: + """Decode HTML entities (e.g. & → &) in image URLs from RSS feeds.""" + return _html_unescape(url) + + +_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") + +def _is_usable_image_url(url: str) -> bool: + """Return False for URLs that are unlikely to work as WP featured images.""" + if not url or url.startswith("data:"): + return False + try: + path = urlparse(url).path.lower() + _, ext = path.rsplit(".", 1) if "." in path else ("", "") + if f".{ext}" in _BLOCKED_IMAGE_EXTS: + return False + if any(p in path for p in _PLACEHOLDER_PATTERNS): + return False + except Exception: + pass + return True + + +def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: + url = _sanitize_image_url(url) + headers = { + "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)", + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + } + if referer: + headers["Referer"] = referer + req = Request(url=url, headers=headers) + with urlopen(req, timeout=20) as resp: + raw = resp.read() + content_type = resp.headers.get("Content-Type", "application/octet-stream") + content_type = content_type.split(";")[0].strip() if content_type else "application/octet-stream" + if not content_type.lower().startswith("image/"): + raise RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({content_type})") + return raw, content_type + + +def _guess_filename(image_url: str, content_type: str) -> str: + parsed = urlparse(_sanitize_image_url(image_url)) + stem = Path(parsed.path).name or "article-image" + if "." not in stem: + ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg" + stem = f"{stem}{ext}" + # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header + stem = stem.encode("ascii", errors="ignore").decode("ascii") + stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg" + return stem + + +def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict: + """Return the caption/credit dict for a specific image URL from extraction metadata.""" + if not meta_json or not image_url: + return {} + try: + from urllib.parse import urlparse + meta = json.loads(meta_json) + image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {} + # Exact match first + if image_url in image_metadata: + return image_metadata[image_url] + # Fuzzy match: compare without query string (handles ?w=1200 variants) + base_url = urlparse(image_url)._replace(query="").geturl() + for key, val in image_metadata.items(): + key_base = urlparse(key)._replace(query="").geturl() + if key_base == base_url: + return val + return {} + except Exception: + return {} + + +def _build_image_caption(image_meta: dict, source_url: str) -> str: + """Build a WP caption string from image metadata and source URL.""" + # caption from figcaption typically already contains the credit text + caption = (image_meta.get("caption") or "").strip() + if caption: + return caption + return f"Quelle: {source_url}" + + +def _upload_featured_media( + *, + base_url: str, + auth_header: str, + image_url: str, + article_title: str, + source_url: str, + image_caption: str = "", +) -> int: + image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None) + filename = _guess_filename(image_url, content_type) + + media_url = f"{base_url.rstrip('/')}/wp-json/wp/v2/media" + media_req = Request( + url=media_url, + data=image_bytes, + method="POST", + headers={ + "Authorization": auth_header, + "Content-Type": content_type, + "Content-Disposition": f'attachment; filename="{filename}"', + "Accept": "application/json", + "User-Agent": "rss-news-publisher/1.0", + }, + ) + with urlopen(media_req, timeout=30) as resp: + media_raw = resp.read().decode("utf-8", errors="replace") + media_payload = json.loads(media_raw) if media_raw else {} + media_id = int(media_payload.get("id", 0)) if isinstance(media_payload, dict) else 0 + if media_id <= 0: + raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}") + + _wp_request( + base_url=base_url, + auth_header=auth_header, + method="POST", + endpoint=f"media/{media_id}", + payload={ + "title": f"{article_title[:120]} - Bild", + "caption": image_caption or f"Quelle: {source_url}", + "alt_text": article_title[:200], + }, + ) + return media_id + + +def _as_paragraph_html(text: str) -> str: + chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] + if not chunks: + return "" + lines = [] + for chunk in chunks: + compact = re.sub(r"\s*\n\s*", " ", chunk) + lines.append(f"

{escape(compact)}

") + return "\n".join(lines) + + +def _as_block_paragraphs(text: str) -> str: + chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] + if not chunks: + return "" + lines = [] + for chunk in chunks: + compact = re.sub(r"\s*\n\s*", " ", chunk) + lines.append(f"

{escape(compact)}

") + return "\n".join(lines) + + +def _strip_html_tags(raw: str) -> str: + text = re.sub(r"<[^>]+>", " ", raw or "") + return re.sub(r"\s+", " ", text).strip() + + +def _html_to_wp_blocks(html: str) -> str: + src = (html or "").strip() + if not src: + return "" + pattern = re.compile( + r"]*>[\s\S]*?|]*>[\s\S]*?

|]*>[\s\S]*?|]*>[\s\S]*?", + re.IGNORECASE, + ) + blocks: list[str] = [] + for match in pattern.finditer(src): + block_html = match.group(0).strip() + if not block_html: + continue + tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE) + tag = (tag_match.group(1).lower() if tag_match else "") + if tag == "p": + blocks.append(f"{block_html}") + elif tag in {"ul", "ol"}: + ordered = tag == "ol" + if ordered: + blocks.append(f'{block_html}') + else: + blocks.append(f"{block_html}") + elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit(): + level = int(tag[1]) + blocks.append(f'{block_html}') + if blocks: + return "\n".join(blocks) + return _as_block_paragraphs(_strip_html_tags(src)) + + +def _as_block_heading(level: int, text: str) -> str: + safe_level = min(6, max(1, int(level))) + return f'{escape(text)}' + + +def _as_block_list(items: list[str]) -> str: + if not items: + return "" + content = "".join(f"
  • {item}
  • " for item in items) + return f"
      {content}
    " + + +def _sanitize_publish_text(text: str) -> str: + raw = (text or "").strip() + if not raw: + return "" + lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] + if len(lines) > 3: + lines = lines[3:] + merged = "\n".join(lines) + merged = re.sub(r"\n?\s*Pressekontakt[\s\S]*$", "", merged, flags=re.IGNORECASE).strip() + return merged + + +def _build_attribution_block(article: dict[str, Any]) -> str: + """Build a WP Gutenberg attribution block for the bottom of the article.""" + from urllib.parse import urlparse + source_url = (article.get("canonical_url") or article.get("source_url") or "").strip() + source_name = (article.get("source_name_snapshot") or "").strip() + author = (article.get("author") or "").strip() + + # If the feed name is "Google Alerts" (or similar generic names), derive the + # real source name from the hostname of the canonical URL. + if not source_name or source_name.lower() in ("google alerts", "google"): + try: + hostname = urlparse(source_url).hostname or "" + source_name = hostname.removeprefix("www.") + except Exception: + pass + + # Get image credit from extraction metadata (uses fuzzy URL match) + meta_json = article.get("meta_json") + credit = "" + try: + meta = json.loads(meta_json or "{}") + selected_url = (meta.get("image_review") or {}).get("selected_url") or "" + if selected_url: + img_meta = _get_image_meta_for_url(meta_json, selected_url) + raw_credit = (img_meta.get("credit") or "").strip() + caption_text = (img_meta.get("caption") or "").strip() + # If credit is just a bare marker prefix (e.g. "Foto:", "Bild:"), + # clear it and extract the full credit from the caption text instead. + _BARE_MARKERS = {"foto", "bild", "credit", "fotograf", "fotografie", "photo", "bildnachweis"} + if raw_credit.endswith(":") and raw_credit[:-1].strip().lower() in _BARE_MARKERS: + raw_credit = "" + if raw_credit: + credit = raw_credit + elif caption_text: + # Extract credit markers like "Foto: IMAGO/…", "© Agentur", "Bild: …" + import re as _re + m = _re.search( + r"(©[^\n]{1,120}|(?:Foto|Bild|Credit|Fotograf|Photo)\s*:[^\n]{1,120})", + caption_text, + ) + credit = m.group(1).strip() if m else "" + except Exception: + pass + + parts: list[str] = [] + if source_url: + label = source_name or source_url + parts.append(f'Originalartikel: {escape(label)}') + if author: + parts.append(f"Autor: {escape(author)}") + if credit: + parts.append(f"Bildnachweis: {escape(credit)}") + + if not parts: + return "" + + inner = "  |  ".join(parts) + return ( + "\n" + "
    \n" + f'' + f'

    {inner}

    ' + "" + ) + + +def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: + summary = (article.get("summary") or "").strip() + body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() + body_text = _sanitize_publish_text(body_text) + if not body_text: + body_text = summary + + has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) + body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) + if not body_html: + body_html = "

    Kein Inhalt verfügbar.

    " + + attribution = _build_attribution_block(article) + content = (body_html + attribution).strip() + return content, None + + +def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: + settings = get_settings() + if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: + raise RuntimeError("WordPress Konfiguration fehlt (base_url, username, app_password)") + + auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) + + title = (article.get("title") or "Ohne Titel").strip() + content, excerpt = _build_post_content(article) + source_url = article.get("source_url") or "" + + featured_media_id = None + selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) + + # Build candidate list: primary selected URL + fallbacks from image_urls_json + image_candidates: list[str] = [] + if selected_image_url and _is_usable_image_url(selected_image_url): + image_candidates.append(selected_image_url) + try: + extra_urls = json.loads(article.get("image_urls_json") or "[]") + for u in extra_urls: + if u and u not in image_candidates and _is_usable_image_url(u): + image_candidates.append(u) + except Exception: + pass + + for candidate_url in image_candidates: + image_meta = _get_image_meta_for_url(article.get("meta_json"), candidate_url) + image_caption = _build_image_caption(image_meta, source_url) + try: + featured_media_id = _upload_featured_media( + base_url=settings.wordpress_base_url, + auth_header=auth, + image_url=candidate_url, + article_title=title, + source_url=source_url, + image_caption=image_caption, + ) + break # success — stop trying further candidates + except Exception as img_exc: + _logger.warning( + "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc + ) + + if not featured_media_id and image_candidates: + _logger.warning( + "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)", + len(image_candidates), article.get("id"), title[:60], + ) + + payload = { + "title": title, + "content": content, + "status": settings.wordpress_default_status, + } + if excerpt: + payload["excerpt"] = excerpt + if featured_media_id: + payload["featured_media"] = featured_media_id + scheduled_at = article.get("scheduled_publish_at") + if scheduled_at: + payload["date"] = scheduled_at # e.g. "2026-03-24T09:00:00" + # Use status "future" so WP schedules auto-publishing at the given date. + # WP ignores date for drafts and shows "Sofort veröffentlichen" instead. + try: + from datetime import datetime as _dt + if _dt.fromisoformat(scheduled_at) > _dt.now(): + payload["status"] = "future" + except Exception: + pass + + wp_post_id = article.get("wp_post_id") + tag_ids = _resolve_wp_tag_ids( + base_url=settings.wordpress_base_url, + auth_header=auth, + tags=_selected_tags_from_meta(article.get("meta_json")), + ) + if tag_ids: + payload["tags"] = tag_ids + + if wp_post_id: + result = _wp_request( + base_url=settings.wordpress_base_url, + auth_header=auth, + method="POST", + endpoint=f"posts/{int(wp_post_id)}", + payload=payload, + ) + else: + result = _wp_request( + base_url=settings.wordpress_base_url, + auth_header=auth, + method="POST", + endpoint="posts", + payload=payload, + ) + + if not isinstance(result, dict): + raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}") + post_id = int(result.get("id", 0)) + if post_id <= 0: + raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}") + post_url = result.get("link") + return post_id, post_url if isinstance(post_url, str) else None + + +def selected_image_exists(article: dict[str, Any]) -> bool: + return _selected_image_url_from_meta(article.get("meta_json")) is not None + + +def delete_wp_post(wp_post_id: int) -> None: + """Permanently delete a WordPress post (moves to trash, then deletes).""" + settings = get_settings() + if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: + raise RuntimeError("WordPress Konfiguration fehlt") + auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) + # force=true skips trash + _wp_request( + base_url=settings.wordpress_base_url, + auth_header=auth, + method="DELETE", + endpoint=f"posts/{wp_post_id}?force=true", + ) + + +def sync_db_from_wordpress() -> dict[str, Any]: + """Sync scheduled_publish_at and wp_post_url in the DB from WordPress. + + WordPress is treated as the source of truth for scheduling. + For each DB article that has a wp_post_id: + - If WP post exists as 'future': update scheduled_publish_at to WP date. + - If WP post exists as 'draft': clear scheduled_publish_at (not yet scheduled). + - If WP post exists as 'publish': mark article as published in DB. + - If WP post is trashed/deleted (404 or trash status): clear wp_post_id, + wp_post_url, and scheduled_publish_at so the article can be re-processed. + Returns a stats dict with counts of each action taken. + """ + from .db import get_conn + + settings = get_settings() + if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: + raise RuntimeError("WordPress Konfiguration fehlt") + auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) + base_url = settings.wordpress_base_url.rstrip("/") + + # Fetch all future + draft + published WP posts in one pass (up to 300 per status) + wp_posts: dict[int, dict] = {} + for status in ("future", "draft", "publish"): + for page in range(1, 4): # max 300 per status + try: + result = _wp_request( + base_url=base_url, + auth_header=auth, + method="GET", + endpoint=f"posts?status={status}&per_page=100&page={page}&_fields=id,date,status,link", + ) + except Exception: + break + if not isinstance(result, list) or not result: + break + for post in result: + try: + wp_posts[int(post["id"])] = post + except Exception: + pass + if len(result) < 100: + break + + # Load all DB articles that have a wp_post_id + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, wp_post_id, wp_post_url, scheduled_publish_at, status + FROM articles + WHERE wp_post_id IS NOT NULL + AND status NOT IN ('no_image') + ORDER BY id + """ + ).fetchall() + + stats: dict[str, int] = { + "total_db_articles": len(rows), + "wp_posts_found": len(wp_posts), + "slot_updated": 0, + "slot_cleared_draft": 0, + "marked_published": 0, + "wp_reference_cleared": 0, + "already_in_sync": 0, + } + + for row in rows: + article_id = row["id"] + wp_post_id = int(row["wp_post_id"]) + wp_post = wp_posts.get(wp_post_id) + + if wp_post is None: + # Post not found in future/draft/publish — likely trashed or deleted + # Clear wp reference so article can be re-processed if needed + with get_conn() as conn: + conn.execute( + """UPDATE articles + SET wp_post_id = NULL, wp_post_url = NULL, scheduled_publish_at = NULL + WHERE id = ?""", + (article_id,), + ) + stats["wp_reference_cleared"] += 1 + continue + + wp_status = wp_post.get("status", "") + wp_date = wp_post.get("date", "") # local CET datetime, e.g. "2026-05-05T09:00:00" + wp_link = wp_post.get("link") or row["wp_post_url"] + + if wp_status == "publish": + # Already published in WP — mark as published in DB if not already + if row["status"] != "published": + with get_conn() as conn: + conn.execute( + "UPDATE articles SET status = 'published', wp_post_url = ? WHERE id = ?", + (wp_link, article_id), + ) + stats["marked_published"] += 1 + else: + stats["already_in_sync"] += 1 + + elif wp_status == "future": + # Scheduled — sync the date into scheduled_publish_at + current_slot = row["scheduled_publish_at"] or "" + # WP returns e.g. "2026-05-05T09:00:00" — compare ignoring seconds + if current_slot[:16] != wp_date[:16]: + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = ?, wp_post_url = ? WHERE id = ?", + (wp_date, wp_link, article_id), + ) + stats["slot_updated"] += 1 + else: + stats["already_in_sync"] += 1 + + elif wp_status == "draft": + # Draft without a schedule — clear scheduled_publish_at if set + if row["scheduled_publish_at"]: + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", + (article_id,), + ) + stats["slot_cleared_draft"] += 1 + else: + stats["already_in_sync"] += 1 + + return stats diff --git a/backend/app/workflow.py b/backend/app/workflow.py new file mode 100644 index 0000000..83e9b63 --- /dev/null +++ b/backend/app/workflow.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image") + + +def internal_to_ui_status(status: str | None) -> str: + value = (status or "").strip() + if value == "approved": + return "publish" + if value == "error": + return "close" + if value == "review": + return "rewrite" + if value in {"new", "rewrite", "published", "no_image"}: + return value + return value or "new" + + +def ui_to_internal_status(status: str | None) -> str: + value = (status or "").strip() + if value == "publish": + return "approved" + if value == "close": + return "error" + if value in {"new", "rewrite", "published", "no_image"}: + return value + if value in {"approved", "error", "review"}: + return value + return value + + +ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { + "new": {"rewrite", "close"}, + "rewrite": {"publish", "close"}, + "publish": {"published", "close"}, + "published": {"rewrite", "close"}, + "close": {"rewrite"}, + "no_image": {"rewrite", "close"}, +} diff --git a/backend/data/rss_news.db b/backend/data/rss_news.db new file mode 100644 index 0000000..7929307 Binary files /dev/null and b/backend/data/rss_news.db differ diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt new file mode 100644 index 0000000..cf39f84 --- /dev/null +++ b/backend/requirements-test.txt @@ -0,0 +1,3 @@ +pytest==8.3.5 +pytest-cov==6.0.0 +httpx==0.28.1 diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..f4ffe61 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,8 @@ +fastapi==0.116.1 +uvicorn[standard]==0.35.0 +itsdangerous==2.2.0 +pydantic-settings==2.10.1 +python-dotenv==1.1.1 +feedparser==6.0.11 +jinja2==3.1.4 +python-multipart==0.0.20 diff --git a/backend/static/admin.css b/backend/static/admin.css new file mode 100644 index 0000000..0b31bb5 --- /dev/null +++ b/backend/static/admin.css @@ -0,0 +1,303 @@ +body { + margin: 0; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + background: #f4f6f8; + color: #1f2937; +} + +.topbar { + display: flex; + justify-content: space-between; + align-items: center; + padding: 20px 28px; + background: #0f172a; + color: #f8fafc; +} + +.container { + padding: 20px 28px 28px 28px; +} + +.login { + max-width: 520px; + margin: 60px auto; +} + +.card { + background: #ffffff; + border-radius: 10px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); + padding: 16px; + margin-bottom: 16px; +} + +.stats { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; + margin-bottom: 16px; +} + +.stat { + background: #ffffff; + border-radius: 10px; + padding: 12px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); +} + +.stat .label { + font-size: 12px; + color: #64748b; +} + +.stat .value { + font-size: 24px; + font-weight: 700; +} + +.grid.two { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.stack { + display: grid; + gap: 10px; +} + +.row { + display: flex; + gap: 8px; + align-items: center; +} + +.filter-row { + margin-bottom: 10px; +} + +.inline { + display: flex; + gap: 6px; + align-items: center; +} + +table { + width: 100%; + border-collapse: collapse; +} + +th, td { + text-align: left; + padding: 8px; + border-bottom: 1px solid #e5e7eb; + vertical-align: top; +} + +input, select, button, textarea { + padding: 8px; + border-radius: 6px; + border: 1px solid #cbd5e1; + font: inherit; +} + +button { + background: #0ea5e9; + border-color: #0ea5e9; + color: white; + cursor: pointer; +} + +button.secondary { + background: #64748b; + border-color: #64748b; +} + +.badge { + display: inline-block; + padding: 2px 8px; + border-radius: 999px; + background: #e2e8f0; + font-size: 12px; +} + +.badge.ok { + background: #dcfce7; + color: #166534; +} + +.badge.bad { + background: #fee2e2; + color: #991b1b; +} + +.badge.errcat { + margin-bottom: 4px; +} + +.badge.errcat-policy { + background: #fee2e2; + color: #991b1b; +} + +.badge.errcat-auth { + background: #ffedd5; + color: #9a3412; +} + +.badge.errcat-dns { + background: #dbeafe; + color: #1e40af; +} + +.badge.errcat-media { + background: #fef9c3; + color: #854d0e; +} + +.badge.errcat-api { + background: #ede9fe; + color: #5b21b6; +} + +.badge.errcat-unknown { + background: #e2e8f0; + color: #334155; +} + +.alert { + margin-bottom: 12px; + padding: 10px; + border-radius: 8px; + background: #fee2e2; + color: #991b1b; +} + +.flash { + font-weight: 600; +} + +.flash-success { + border-left: 4px solid #10b981; +} + +.flash-error { + border-left: 4px solid #ef4444; +} + +.subtle { + color: #64748b; + font-size: 12px; + margin-top: 4px; +} + +.pre { + white-space: pre-wrap; + line-height: 1.35; + max-height: 220px; + overflow: auto; + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + margin-top: 6px; +} + +.linkbtn { + display: inline-block; + padding: 8px 10px; + border-radius: 6px; + text-decoration: none; + border: 1px solid #cbd5e1; + color: #334155; + background: #f8fafc; +} + +.detail-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 8px 12px; + margin-bottom: 10px; +} + +.detail-item { + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + display: grid; + gap: 4px; +} + +.detail-item .k { + font-size: 12px; + color: #64748b; +} + +.thumb { + width: 72px; + height: 72px; + object-fit: cover; + border-radius: 8px; + border: 1px solid #cbd5e1; + margin-top: 6px; +} + +.image-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); + gap: 10px; +} + +.image-card { + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + background: #fff; +} + +.image-card img { + width: 100%; + height: 120px; + object-fit: cover; + border-radius: 6px; + border: 1px solid #e2e8f0; + background: #f8fafc; +} + +.img-failed { + opacity: 0.3; + filter: grayscale(1); +} + +.image-meta { + margin-top: 6px; + display: flex; + gap: 6px; + flex-wrap: wrap; +} + +.image-actions { + margin-top: 8px; + display: flex; + gap: 6px; + flex-wrap: wrap; +} + +.image-selected { + border-color: #10b981; + box-shadow: 0 0 0 1px rgba(16, 185, 129, 0.25); +} + +.image-excluded { + opacity: 0.65; +} + +@media (max-width: 920px) { + .stats { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + .grid.two { + grid-template-columns: 1fr; + } +} diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html new file mode 100644 index 0000000..1c16658 --- /dev/null +++ b/backend/templates/admin_article_detail.html @@ -0,0 +1,224 @@ + + + + + + {{ title }} + + + +
    +
    +

    Artikel-Detail #{{ article.id }}

    +

    Angemeldet als {{ user }}

    +
    +
    + Zurück +
    + +
    +
    +
    + +
    + {% if flash_msg %} +
    + {{ flash_msg }} +
    + {% endif %} + +
    +

    {{ article.title }}

    +
    +
    Status{{ article.status_ui }}
    +
    Artikel-Datum{{ article.published_at or "-" }}
    +
    Alter{{ article.days_old if article.days_old is not none else "-" }} Tage
    +
    Relevanz{{ article.relevance }}
    +
    Autor{{ article.author or "-" }}
    +
    Feed{{ feed.name if feed else "-" }}
    +
    Quelle Snapshot{{ article.source_name_snapshot or "-" }}
    +
    Lizenz Snapshot{{ article.source_license_name_snapshot or "-" }}
    +
    Terms Snapshot{{ article.source_terms_url_snapshot or "-" }}
    +
    +

    Quelle: {{ article.source_url }}

    + {% if article.canonical_url %}

    Canonical: {{ article.canonical_url }}

    {% endif %} + {% if article.summary %} +

    Summary: {{ article.summary }}

    + {% endif %} +

    WordPress Post: + {% if article.wp_post_url %} + #{{ article.wp_post_id }} + {% elif article.wp_post_id %} + #{{ article.wp_post_id }} + {% else %} + - + {% endif %} +

    +

    Publish Attempts: {{ article.publish_attempts or 0 }} | Letzter Fehler: {{ article.publish_last_error or "-" }}

    +
    + +
    +

    Checkliste

    + + + + + + {% for c in checklist %} + + + + + + {% endfor %} + +
    KriteriumStatusWert
    {{ c.label }} + {% if c.status == "ok" %} + OK + {% else %} + Fehlt + {% endif %} + {{ c.value }}
    +
    + +
    +

    Extrahierte Daten

    +

    Bilder: {{ article.image_entries|length if article.image_entries else 0 }}

    + {% if article.selected_image_url %} +

    Ausgewähltes Hauptbild: {{ article.selected_image_url }}

    + {% if article.selected_image_proxy_url %} + Ausgewähltes Hauptbild + {% endif %} + {% endif %} + {% if article.image_entries %} + {% if article.image_selection %} +
    + Automatische Bildauswahl (Score + Gründe) +
    Primärbild (Auto): {{ article.image_selection.primary or "-" }}
    +
    Ausgewählt: {{ article.image_selection.selected_count or 0 }} / Kandidaten: {{ article.image_selection.total_candidates or 0 }}
    + {% if article.image_selection.ranked %} + + + + + + {% for r in article.image_selection.ranked %} + + + + + + {% endfor %} + +
    BildScoreGründe
    {{ r.url }}{{ r.score }}{{ r.reasons|join(", ") if r.reasons else "-" }}
    + {% endif %} +
    + {% endif %} +
    + {% for image in article.image_entries %} +
    + + Artikelbild + +
    + {% if image.is_selected %}Ausgewählt{% endif %} + {% if image.is_excluded %}Ausgeblendet{% endif %} + {% if image.is_irrelevant_hint %}evtl. irrelevant{% endif %} +
    +
    +
    + + + +
    + {% if not image.is_excluded %} +
    + + + +
    + {% else %} +
    + + + +
    + {% endif %} +
    + +
    + {% endfor %} +
    + {% endif %} + {% if article.press_contact or article.extraction.press_contact %} +

    Pressekontakt

    +
    {{ article.press_contact or article.extraction.press_contact }}
    + {% endif %} + {% if article.extraction.extraction_error %} +

    Extraktionsfehler: {{ article.extraction.extraction_error }}

    + {% endif %} +
    + +
    +

    Volltext

    +
    {{ article.content_raw or "-" }}
    +
    + +
    +

    Rewrite-Text (editierbar)

    +
    + + +
    + {% if article.meta.generated_tags %} +

    Generierte Tags: {{ article.meta.generated_tags|join("; ") }}

    + {% endif %} +

    Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.

    +
    + +
    +

    Status ändern

    + {% if article.status_ui in ["new", "rewrite"] %} +
    + +
    + {% endif %} + {% if article.status_ui == "published" %} +
    + +
    + {% endif %} +
    + + + +
    +
    + +
    +

    WordPress Publish Queue

    + {% if article.publish_ready %} +

    Publish bereit

    + {% else %} +

    Publish blockiert

    + {% if article.publish_blockers %} +
      + {% for reason in article.publish_blockers %} +
    • {{ reason }}
    • + {% endfor %} +
    + {% endif %} + {% endif %} +

    Voraussetzungen: Status `publish` und Hauptbild gesetzt.

    +
    + + +
    +
    +
    + + diff --git a/backend/templates/admin_article_list.html b/backend/templates/admin_article_list.html new file mode 100644 index 0000000..38bfb22 --- /dev/null +++ b/backend/templates/admin_article_list.html @@ -0,0 +1,221 @@ + + + + + + {{ title }} + + + + +
    +
    +

    Artikelliste

    +

    Angemeldet als {{ user }}

    +
    +
    + Dashboard + Veröffentlichungsplan +
    + +
    +
    +
    + +
    + {% if flash_msg %} +
    + {{ flash_msg }} +
    + {% endif %} + + +
    +
    +
    +
    + + +
    +
    + + +
    +
    + + Reset +
    +
    +
    +

    {{ total }} Artikel gesamt · Seite {{ page }} / {{ total_pages }} · {{ page_size }} pro Seite

    +
    + + +
    + + + + + + + +
    + + + + + + + + + + + + {% for a in articles %} + + + + + + + + {% endfor %} + +
    BildTitel & KurztextStatusDatumWP ID
    + {% if a.thumb_proxy %} + + Vorschau + + + {% else %} +
    🖼
    + {% endif %} +
    + + {% if a.excerpt %} +
    {{ a.excerpt }}
    + {% endif %} + {% if a.feed_name %} +
    📡 {{ a.feed_name }}
    + {% endif %} +
    + {{ a.status }} + + {% if a.scheduled_publish_at %} + 📅 {{ a.scheduled_publish_at[:16] }} + {% elif a.published_at %} + {{ a.published_at[:10] }} + {% else %} + — + {% endif %} + + + + + {% if a.wp_post_url %} + ↗ WP öffnen + {% endif %} +
    +
    +
    + + + +
    + + + + diff --git a/backend/templates/admin_connectivity.html b/backend/templates/admin_connectivity.html new file mode 100644 index 0000000..5fc0392 --- /dev/null +++ b/backend/templates/admin_connectivity.html @@ -0,0 +1,84 @@ + + + + + + {{ title }} + + + +
    +
    +

    Connectivity Check

    +

    Angemeldet als {{ user }}

    +
    +
    + Zurück +
    + +
    +
    +
    + +
    +
    +
    +
    Checks
    +
    {{ checks|length }}
    +
    +
    +
    OK
    +
    {{ ok_count }}
    +
    +
    +
    Fehler
    +
    {{ error_count }}
    +
    +
    +
    Zeitpunkt
    +
    Live
    +
    +
    + +
    +

    Ziele

    +

    Geprüft werden DNS-Auflösung, TCP-Erreichbarkeit und bei URLs ein HTTP-Request.

    +
    + +
    +
    + +
    +

    Ergebnis

    + + + + + + {% for c in checks %} + + + + + + + + + + + {% endfor %} + +
    StatusNameTypZielDNSTCPHTTPDauer
    {% if c.ok %}OK{% else %}Fehler{% endif %}{{ c.label }}{{ c.kind }}{{ c.target }} + {% if c.dns_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.dns_info }}
    +
    + {% if c.tcp_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.tcp_info }}
    +
    + {% if c.http_ok %}OK{% else %}FAIL{% endif %} +
    {{ c.http_info }}
    +
    {{ c.duration_ms }} ms
    +
    +
    + + diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html new file mode 100644 index 0000000..0795b96 --- /dev/null +++ b/backend/templates/admin_dashboard.html @@ -0,0 +1,405 @@ + + + + + + {{ title }} + + + +
    +
    +

    rss-news Admin Dashboard

    +

    Angemeldet als {{ user }}

    +
    + +
    + +
    + {% if flash_msg %} +
    + {{ flash_msg }} +
    + {% endif %} + +
    +
    +
    Quellen
    +
    {{ sources|length }}
    +
    +
    +
    Feeds
    +
    {{ feeds|length }}
    +
    +
    +
    Artikel
    +
    {{ articles|length }}
    +
    +
    +
    Runs
    +
    {{ runs|length }}
    +
    +
    + +
    +
    +

    Quelle anlegen

    +
    + + + + + + + +
    +
    + +
    +

    Feed anlegen

    +
    + + + + + +
    +
    +
    + +
    +

    Ingestion starten

    +
    + + +
    +
    + +
    +

    Publisher ausführen

    +
    + + +
    +
    + +
    +

    Rewrite Run (geplante Artikel)

    +

    Verarbeitet alle Artikel im Status rewrite und setzt sie auf publish.

    +
    + + +
    +
    + +
    +

    Quellen + Policy

    + + + + + + {% for s in sources %} + + + + + + + + + {% endfor %} + +
    IDNameRiskLizenzTermsPolicy
    {{ s.id }}{{ s.name }}{{ s.risk_level }}{{ s.license_name or "-" }}{{ s.terms_url or "-" }} + {% if source_policy[s.id] %} + BLOCKED ({{ source_policy[s.id]|length }}) +
    {{ source_policy[s.id]|join(", ") }}
    + {% else %} + OK + {% endif %} +
    +
    + +
    +

    Quellen verwalten

    + + + + + + {% for s in sources %} + {% set source_form_id = 'source-update-' ~ s.id %} + + + + + + + + {% endfor %} + +
    IDNameURLsMetaAktionen
    #{{ s.id }} + + + + + + + + + + + +
    +
    + +
    +
    + +
    +
    +
    +
    + +
    +

    Feeds verwalten

    + + + + + + {% for f in feeds %} + {% set feed_form_id = 'feed-update-' ~ f.id %} + + + + + + + + + {% endfor %} + +
    IDNameURLQuelleStatusAktionen
    #{{ f.id }} + + + + + + +
    +
    + +
    +
    + +
    +
    +
    +
    + +
    +

    Artikel (Review)

    +
    + + + + Reset + Export JSON + Export CSV +
    + + + + + + {% for a in articles %} + + + + + + + + + {% endfor %} + +
    IDArtikelStatusDetailsRewriteTransition
    {{ a.id }} + {{ a.title }}
    + Autor: {{ a.author or "-" }}
    + Datum: {{ a.published_at or "-" }} | Alter: {{ a.days_old if a.days_old is not none else "-" }} Tage | Relevanz: {{ a.relevance }}
    + Original öffnen +
    Details anzeigen + {% if a.canonical_url and a.canonical_url != a.source_url %} +
    Canonical öffnen + {% endif %} +
    {{ a.status_ui }} +
    Publish: {{ "bereit" if a.publish_ready else "blockiert" }}
    + {% if not a.publish_ready and a.publish_blockers %} +
    {{ a.publish_blockers|join(", ") }}
    + {% endif %} + {% if a.selected_image_url %} +
    Hauptbild gesetzt
    + Hauptbild + {% endif %} + {% if a.summary %} +
    Summary: {{ a.summary }}
    + {% endif %} + {% if a.generated_tags %} +
    Tags: {{ a.generated_tags|join("; ") }}
    + {% endif %} + {% if a.content_raw %} +
    + Volltext anzeigen +
    {{ a.content_raw }}
    +
    + {% endif %} +
    Bilder: {{ a.extracted_images|length }}
    + {% if a.extracted_images %} +
    + Bild-URLs +
      + {% for img in a.extracted_images %} +
    • {{ img }}
    • + {% endfor %} +
    +
    + {% endif %} + {% if a.press_contact %} +
    + Pressekontakt +
    {{ a.press_contact }}
    +
    + {% endif %} + {% if a.extraction_error %} +
    Extraktionsfehler: {{ a.extraction_error }}
    + {% endif %} +
    + {% if a.status_ui in ["new", "rewrite"] %} +
    + +
    + {% else %} + - + {% endif %} +
    +
    + + {% if allowed_transitions.get(a.status_ui, []) %} + + {% else %} + keine Aktion + {% endif %} +
    + {% if a.status_ui == 'close' %} +
    + +
    + {% endif %} +
    +
    + +
    +

    Runs

    + + + + + + {% for r in runs %} + + + + + + + + {% endfor %} + +
    IDTypStatusStartEnde
    {{ r.id }}{{ r.run_type }}{{ r.status }}{{ r.started_at }}{{ r.finished_at or "-" }}
    +
    + +
    +

    Publish Jobs

    + + + + + + {% for j in publish_jobs %} + + + + + + + + + + {% endfor %} + +
    IDArtikelStatusAttemptsWP PostFehlerHinweis
    {{ j.id }}#{{ j.article_id }} {{ j.article_title or "-" }}{{ j.status }}{{ j.attempts }}/{{ j.max_attempts }} + {% if j.wp_post_url %} + #{{ j.wp_post_id }} + {% elif j.wp_post_id %} + #{{ j.wp_post_id }} + {% else %} + - + {% endif %} + + {% if j.error_message %} + {{ j.error_category }} +
    {{ j.error_message }}
    + {% else %} + - + {% endif %} +
    {{ j.error_hint or "-" }}
    +
    +
    + + diff --git a/backend/templates/admin_login.html b/backend/templates/admin_login.html new file mode 100644 index 0000000..10e55e7 --- /dev/null +++ b/backend/templates/admin_login.html @@ -0,0 +1,27 @@ + + + + + + {{ title }} + + + +
    +

    rss-news Admin

    +

    Bitte anmelden, um das Tool zu verwalten.

    + {% if error %} +
    Login fehlgeschlagen. Bitte pruefen.
    + {% endif %} +
    + + + +
    +
    + + diff --git a/backend/templates/admin_schedule.html b/backend/templates/admin_schedule.html new file mode 100644 index 0000000..4f2513a --- /dev/null +++ b/backend/templates/admin_schedule.html @@ -0,0 +1,143 @@ + + + + + + {{ title }} + + + + +
    +
    +

    rss-news Veröffentlichungsplan

    +

    Angemeldet als {{ user }}

    +
    +
    + Dashboard + Connectivity +
    + +
    +
    +
    + +
    + {% if flash_msg %} +
    + {{ flash_msg }} +
    + {% endif %} + +
    +
    +

    WordPress → DB Synchronisieren

    +

    Liest alle geplanten WP-Beiträge und aktualisiert die Slots in der lokalen DB.
    Nutze dies nach manuellen Änderungen in WordPress.

    +
    +
    + +
    +
    + +
    +

    Slot-Übersicht (nächste 60 Tage)

    +
    + 📅 Belegte Slots gesamt: {{ slots|length }} + 🗄️ Aus Pipeline-DB: {{ slots|selectattr('source', 'eq', 'db')|list|length }} + 🌐 Nur in WordPress: {{ slots|selectattr('source', 'eq', 'wordpress')|list|length }} +
    + + + + + {% for h in hours %} + + {% endfor %} + + + + {% for day in calendar_days %} + {% if day.any_booked %} + + + {% for s in day.slots %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
    Tag{{ "%02d:00 Uhr"|format(h) }}
    {{ day.weekday }} {{ day.date_fmt }} + {% if s.booked %} + {% set info = s.slot %} + {% if info.source == 'db' %} + + DB +
    + {% if info.article_id %} + + {{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %} + + {% endif %} +
    Status: {{ info.article_status }} + {% if info.wp_post_url %} +
    WP öffnen + {% endif %} +
    + {% else %} + ⚠️ + WP +
    {{ info.article_title }}
    + {% endif %} + {% else %} + frei + {% endif %} +
    + {% if not slots %} +

    Keine geplanten Beiträge in den nächsten 60 Tagen.

    + {% endif %} +
    + +
    +

    Alle belegten Slots (Liste)

    + + + + + + {% for s in slots %} + + + + + + + + {% endfor %} + +
    Datum/ZeitQuelleArtikelStatusWordPress
    {{ s.formatted }} + {% if s.source == 'db' %}Pipeline-DB + {% else %}WordPress{% endif %} + + {% if s.article_id %} + {{ (s.article_title or "")[:60] }} + {% else %} + {{ s.article_title or "-" }} + {% endif %} + {{ s.article_status or "-" }} + {% if s.wp_post_url %} + Draft öffnen + {% else %}-{% endif %} +
    +
    +
    + + diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..46816dd --- /dev/null +++ b/backend/tests/__init__.py @@ -0,0 +1 @@ +"""Tests package.""" diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py new file mode 100644 index 0000000..c7b6ebf --- /dev/null +++ b/backend/tests/test_admin_ui.py @@ -0,0 +1,419 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + upsert_article, +) + + +class TestAdminUi(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "admin_ui.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_admin_login_and_dashboard(self) -> None: + login_page = self.client.get("/admin/login") + self.assertEqual(login_page.status_code, 200) + self.assertIn("rss-news Admin", login_page.text) + + login = self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + self.assertEqual(login.status_code, 200) + self.assertIn("Admin Dashboard", login.text) + + def test_dashboard_redirects_if_not_logged_in(self) -> None: + res = self.client.get("/admin/dashboard", follow_redirects=False) + self.assertEqual(res.status_code, 303) + self.assertEqual(res.headers.get("location"), "/admin/login") + + def test_create_feed_with_empty_source_id_does_not_error(self) -> None: + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + # empty source_id used to cause validation issues in form parsing + res = self.client.post( + "/admin/feeds/create", + data={"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""}, + follow_redirects=False, + ) + self.assertEqual(res.status_code, 303) + self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard")) + + def test_article_detail_page_renders(self) -> None: + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="id-1", + source_hash="hash-1", + title="Titel A", + source_url="https://example.org/a", + canonical_url="https://example.org/a", + published_at=None, + author="Autor A", + summary="Summary A", + content_raw="Volltext A", + content_rewritten=None, + image_urls_json='["https://example.org/img.jpg"]', + press_contact="Kontakt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=2, + status="new", + meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}', + ) + ) + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True) + self.assertEqual(res.status_code, 200) + self.assertIn("Artikel-Detail", res.text) + self.assertIn("Checkliste", res.text) + + decision = self.client.post( + f"/admin/articles/{article_id}/images/decision", + data={"image_url": "https://example.org/img.jpg", "action": "select"}, + follow_redirects=True, + ) + self.assertEqual(decision.status_code, 200) + self.assertIn("Ausgewähltes Hauptbild", decision.text) + + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertIn("selected_url", article.get("meta_json", "")) + + def test_manage_source_and_feed(self) -> None: + source_id = create_source( + SourceCreate( + name="Edit Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="yellow", + is_enabled=True, + notes=None, + last_reviewed_at=None, + ) + ) + feed_id = create_feed( + FeedCreate( + name="Edit Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + + update_source_res = self.client.post( + f"/admin/sources/{source_id}/update", + data={ + "name": "Edit Source 2", + "base_url": "https://example.org/new", + "terms_url": "https://example.org/new-terms", + "license_name": "cc0", + "risk_level": "green", + "is_enabled": "1", + "notes": "ok", + "last_reviewed_at": "2026-02-21T12:00:00Z", + }, + follow_redirects=False, + ) + self.assertEqual(update_source_res.status_code, 303) + + update_feed_res = self.client.post( + f"/admin/feeds/{feed_id}/update", + data={ + "name": "Edit Feed 2", + "url": "https://example.org/feed2.xml", + "source_id": str(source_id), + "is_enabled": "0", + }, + follow_redirects=False, + ) + self.assertEqual(update_feed_res.status_code, 303) + + delete_feed_res = self.client.post(f"/admin/feeds/{feed_id}/delete", follow_redirects=False) + self.assertEqual(delete_feed_res.status_code, 303) + delete_source_res = self.client.post(f"/admin/sources/{source_id}/delete", follow_redirects=False) + self.assertEqual(delete_source_res.status_code, 303) + + def test_rewrite_save_and_reopen(self) -> None: + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="id-published", + source_hash="hash-published", + title="Titel Published", + source_url="https://example.org/published", + canonical_url="https://example.org/published", + published_at=None, + author="Autor A", + summary="Summary", + content_raw="Raw", + content_rewritten="

    Alt

    ", + image_urls_json=None, + press_contact=None, + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=True, + legal_checked_at="2026-02-21T10:00:00Z", + legal_note=None, + wp_post_id=123, + wp_post_url="https://example.org/?p=123", + publish_attempts=2, + publish_last_error=None, + published_to_wp_at="2026-02-21T10:10:00Z", + word_count=1, + status="published", + meta_json="{}", + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + + save_res = self.client.post( + f"/admin/articles/{article_id}/rewrite-save", + data={"content_rewritten": "

    Neu

    Text

    "}, + follow_redirects=False, + ) + self.assertEqual(save_res.status_code, 303) + + reopen_res = self.client.post(f"/admin/articles/{article_id}/reopen", follow_redirects=False) + self.assertEqual(reopen_res.status_code, 303) + + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertEqual(article.get("status"), "rewrite") + self.assertIn("Neu", article.get("content_rewritten") or "") + self.assertIsNone(article.get("wp_post_id")) + + @patch("backend.app.admin_ui.generate_article_tags") + @patch("backend.app.admin_ui.rewrite_article_text") + def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None: + mock_rewrite_text.return_value = "

    Neu

    Text

    " + mock_tags.return_value = ["Rheingas", "Monheim"] + + source_id = create_source( + SourceCreate( + name="Batch Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at=None, + ) + ) + feed_id = create_feed( + FeedCreate( + name="Batch Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="batch-1", + source_hash="batch-hash-1", + title="Batch Titel", + source_url="https://example.org/batch", + canonical_url="https://example.org/batch", + published_at=None, + author="Autor", + summary="Summary", + content_raw="Raw", + content_rewritten=None, + image_urls_json=None, + press_contact=None, + source_name_snapshot="Batch Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=1, + status="rewrite", + meta_json="{}", + ) + ) + self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) + res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False) + self.assertEqual(res.status_code, 303) + article = get_article_by_id(article_id) + self.assertIsNotNone(article) + self.assertEqual(article.get("status"), "approved") + self.assertIn("generated_tags", article.get("meta_json", "")) + + @patch("backend.app.admin_ui.urlopen") + def test_image_proxy_returns_image_data(self, mock_urlopen) -> None: + class _FakeHeaders: + def get(self, key: str, default=None): + if key.lower() == "content-type": + return "image/jpeg" + return default + + class _FakeResponse: + headers = _FakeHeaders() + + def read(self): + return b"\xff\xd8\xff\xd9" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + mock_urlopen.return_value = _FakeResponse() + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get("/admin/images/proxy?url=https%3A%2F%2Fexample.org%2Fimg.jpg") + self.assertEqual(res.status_code, 200) + self.assertIn("image/jpeg", res.headers.get("content-type", "")) + + @patch("backend.app.admin_ui._run_connectivity_check") + @patch("backend.app.admin_ui._build_connectivity_targets") + def test_connectivity_page_renders(self, mock_targets, mock_check) -> None: + mock_targets.return_value = [ + {"label": "OpenAI API", "kind": "host", "value": "api.openai.com"}, + {"label": "WordPress REST", "kind": "url", "value": "https://example.org/wp-json/wp/v2"}, + ] + mock_check.side_effect = [ + { + "label": "OpenAI API", + "kind": "host", + "target": "api.openai.com", + "dns_ok": True, + "dns_info": "1.2.3.4", + "tcp_ok": True, + "tcp_info": "port 443 erreichbar", + "http_ok": True, + "http_info": "n/a (host-only)", + "duration_ms": 12, + "ok": True, + }, + { + "label": "WordPress REST", + "kind": "url", + "target": "https://example.org/wp-json/wp/v2", + "dns_ok": False, + "dns_info": "Name or service not known", + "tcp_ok": False, + "tcp_info": "-", + "http_ok": False, + "http_info": "-", + "duration_ms": 10, + "ok": False, + }, + ] + + self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + res = self.client.get("/admin/connectivity", follow_redirects=True) + self.assertEqual(res.status_code, 200) + self.assertIn("Connectivity Check", res.text) + self.assertIn("OpenAI API", res.text) + self.assertIn("WordPress REST", res.text) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_api_auth.py b/backend/tests/test_api_auth.py new file mode 100644 index 0000000..96fbe85 --- /dev/null +++ b/backend/tests/test_api_auth.py @@ -0,0 +1,144 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestApiAuth(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "api.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_login_and_protected_endpoint(self) -> None: + r = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(r.status_code, 200) + + p = self.client.get("/api/protected") + self.assertEqual(p.status_code, 200) + self.assertTrue(p.json().get("ok")) + + def test_protected_requires_auth(self) -> None: + r = self.client.get("/api/protected") + self.assertEqual(r.status_code, 401) + + def test_run_detail_endpoint(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post("/api/runs", json={"run_type": "ingestion", "status": "running"}) + self.assertEqual(created.status_code, 200) + run_id = created.json()["id"] + + detail = self.client.get(f"/api/runs/{run_id}") + self.assertEqual(detail.status_code, 200) + self.assertEqual(detail.json()["item"]["id"], run_id) + + def test_source_policy_check_endpoint(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post( + "/api/sources", + json={ + "name": "Policy Source", + "risk_level": "yellow", + "is_enabled": True, + }, + ) + self.assertEqual(created.status_code, 200) + source_id = created.json()["id"] + + check = self.client.get(f"/api/sources/{source_id}/policy-check") + self.assertEqual(check.status_code, 200) + body = check.json() + self.assertFalse(body["allowed"]) + self.assertGreaterEqual(len(body["issues"]), 1) + + def test_articles_export_json_and_csv_contains_relevance(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + source = self.client.post( + "/api/sources", + json={ + "name": "Export Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + self.assertEqual(source.status_code, 200) + source_id = source.json()["id"] + + feed = self.client.post( + "/api/feeds", + json={"name": "Export Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + self.assertEqual(feed.status_code, 200) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "exp-1", + "source_hash": "exp-hash-1", + "title": "Export Artikel", + "source_url": "https://example.org/article/1", + "canonical_url": "https://example.org/article/1", + "published_at": "2026-02-18T00:00:00Z", + "author": "Autor", + "summary": "Kurz", + "content_raw": "Langtext", + "image_urls_json": "[\"https://example.org/img.jpg\"]", + "press_contact": "Kontakt", + "source_name_snapshot": "Export Source", + "source_terms_url_snapshot": "https://example.org/terms", + "source_license_name_snapshot": "cc-by", + "status": "review", + }, + ) + self.assertEqual(article.status_code, 200) + + export_json = self.client.get("/api/articles/export?format=json") + self.assertEqual(export_json.status_code, 200) + body = export_json.json() + self.assertTrue(body.get("ok")) + self.assertGreaterEqual(body.get("count", 0), 1) + first = body["items"][0] + self.assertIn("published_at", first) + self.assertIn("days_old", first) + self.assertIn("relevance", first) + + export_csv = self.client.get("/api/articles/export?format=csv") + self.assertEqual(export_csv.status_code, 200) + self.assertIn("text/csv", export_csv.headers.get("content-type", "")) + csv_text = export_csv.text + self.assertIn("published_at", csv_text) + self.assertIn("days_old", csv_text) + self.assertIn("relevance", csv_text) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py new file mode 100644 index 0000000..094b595 --- /dev/null +++ b/backend/tests/test_article_workflow.py @@ -0,0 +1,110 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient +from unittest.mock import patch + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestArticleWorkflow(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "workflow.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def _create_article(self) -> int: + source = self.client.post( + "/api/sources", + json={ + "name": "Workflow Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + source_id = source.json()["id"] + + feed = self.client.post( + "/api/feeds", + json={"name": "Workflow Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "wf-1", + "source_url": "https://example.org/a1", + "title": "Workflow Artikel", + "summary": "s", + "content_raw": "c", + "status": "new", + }, + ) + return article.json()["id"] + + def test_valid_transition_chain(self) -> None: + article_id = self._create_article() + + t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) + self.assertEqual(t1.status_code, 200) + + t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "publish"}) + self.assertEqual(t2.status_code, 200) + + t3 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(t3.status_code, 200) + + t4 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) + self.assertEqual(t4.status_code, 200) + + final = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(final.status_code, 200) + self.assertEqual(final.json()["item"]["status"], "rewrite") + self.assertEqual(final.json()["item"]["status_ui"], "rewrite") + + def test_invalid_transition_rejected(self) -> None: + article_id = self._create_article() + bad = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(bad.status_code, 400) + + def test_legacy_review_endpoint_is_gone(self) -> None: + article_id = self._create_article() + bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"}) + self.assertEqual(bad.status_code, 410) + + @patch("backend.app.main.rewrite_article_text") + def test_rewrite_run_sets_publish_status(self, mock_rewrite) -> None: + mock_rewrite.return_value = "

    Neu

    Umschreibung

    " + article_id = self._create_article() + self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) + r = self.client.post(f"/api/articles/{article_id}/rewrite-run") + self.assertEqual(r.status_code, 200) + self.assertEqual(r.json()["status"], "publish") + final = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(final.json()["item"]["status_ui"], "publish") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_db_repositories.py b/backend/tests/test_db_repositories.py new file mode 100644 index 0000000..91436c6 --- /dev/null +++ b/backend/tests/test_db_repositories.py @@ -0,0 +1,145 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + RunCreate, + SourceCreate, + create_feed, + create_run, + create_source, + finish_run, + list_articles, + list_feeds, + list_runs, + list_sources, + upsert_article, +) + + +class TestSQLiteRepositories(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + self.db_path = str(Path(self.tmp_dir.name) / "test.db") + os.environ["APP_DB_PATH"] = self.db_path + config_module.get_settings.cache_clear() + init_db() + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + def test_end_to_end_basic_crud(self) -> None: + source_id = create_source( + SourceCreate( + name="GovData", + base_url="https://data.gov.de", + terms_url="https://www.govdata.de/dl-de/by-2-0", + license_name="dl-de/by-2-0", + risk_level="green", + is_enabled=True, + notes="test source", + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.assertGreater(source_id, 0) + + feed_id = create_feed( + FeedCreate( + name="GovData RSS", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + self.assertGreater(feed_id, 0) + + run_id = create_run(RunCreate(run_type="ingest", status="running", details="start")) + self.assertGreater(run_id, 0) + finish_run(run_id=run_id, status="success", details="ok") + + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung", + content_raw="Originaltext", + content_rewritten="Umschreibung", + image_urls_json='["https://example.org/img.jpg"]', + press_contact="Pressekontakt X", + source_name_snapshot="GovData", + source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0", + source_license_name_snapshot="dl-de/by-2-0", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=120, + status="review", + meta_json='{"lang":"de"}', + ) + ) + self.assertGreater(article_id, 0) + + # Upsert with same source_url updates same row + article_id_2 = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel aktualisiert", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung 2", + content_raw="Originaltext 2", + content_rewritten="Umschreibung 2", + image_urls_json='["https://example.org/img2.jpg"]', + press_contact="Pressekontakt Y", + source_name_snapshot="GovData", + source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0", + source_license_name_snapshot="dl-de/by-2-0", + legal_checked=True, + legal_checked_at="2026-02-18T00:10:00Z", + legal_note="ok", + wp_post_id=123, + wp_post_url="https://example.org/wp/123", + publish_attempts=1, + publish_last_error=None, + published_to_wp_at="2026-02-18T00:12:00Z", + word_count=140, + status="approved", + meta_json='{"lang":"de","v":2}', + ) + ) + self.assertEqual(article_id, article_id_2) + + self.assertEqual(len(list_sources()), 1) + self.assertEqual(len(list_feeds()), 1) + self.assertEqual(len(list_runs()), 1) + + articles = list_articles() + self.assertEqual(len(articles), 1) + self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert") + self.assertEqual(articles[0]["status"], "approved") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py new file mode 100644 index 0000000..82bd2ca --- /dev/null +++ b/backend/tests/test_ingestion.py @@ -0,0 +1,245 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.ingestion import run_ingestion +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + list_articles, + upsert_article, +) +from backend.app.source_extraction import ExtractedArticle + + +class TestIngestion(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "ingestion.db") + config_module.get_settings.cache_clear() + init_db() + + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None: + mock_extract_article.return_value = ExtractedArticle( + title="Artikel 1 original", + author="Autorin A", + canonical_url="https://example.org/article/1", + summary="Original Summary", + content_text="Original Volltext", + images=["https://example.org/a.jpg"], + press_contact="Pressekontakt: Team A", + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-1", + "modified": "Tue, 18 Feb 2026 10:00:00 GMT", + "entries": [ + { + "id": "item-1", + "title": "Artikel 1", + "link": "https://example.org/article/1", + "summary": "A", + }, + { + "id": "item-1", + "title": "Artikel 1 aktualisiert", + "link": "https://example.org/article/1-neu", + "summary": "B", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + self.assertEqual(stats.entries_seen, 2) + self.assertEqual(len(list_articles()), 1) + article = list_articles()[0] + self.assertEqual(article["title"], "Artikel 1 original") + self.assertEqual(article["author"], "Autorin A") + self.assertIn("Original Volltext", article["content_raw"] or "") + self.assertIn("Pressekontakt", article["meta_json"] or "") + self.assertIsNotNone(article["image_urls_json"]) + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_processes_any_enabled_source(self, mock_parse, mock_extract_article) -> None: + # Ampel/risk-level system removed – all enabled feeds are processed regardless of risk_level + source_id = create_source( + SourceCreate( + name="Any Risk Source", + base_url="https://example.net", + terms_url="https://example.net/terms", + license_name="custom", + risk_level="yellow", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + feed_id = create_feed( + FeedCreate( + name="Any Risk Feed", + url="https://example.net/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + mock_parse.return_value = type("FP", (), {"entries": [], "etag": None, "modified": None})() + mock_extract_article.return_value = type("E", (), { + "title": None, "author": None, "summary": None, "content_text": None, + "canonical_url": None, "images": [], "press_contact": None, + })() + + stats = run_ingestion(feed_id=feed_id) + self.assertEqual(stats.status, "success") + # Feed was processed (feedparser was called), even with yellow risk_level + mock_parse.assert_called_once() + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None: + existing_closed_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="closed-1", + source_hash="closed-hash-1", + title="Alt Closed", + source_url="https://example.org/closed-article", + canonical_url="https://example.org/closed-article", + published_at=None, + author="Autor", + summary="Alt", + content_raw="Alt Raw", + content_rewritten="

    Alt Rewrite Closed

    ", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=42, + wp_post_url="https://wp.local/?p=42", + publish_attempts=2, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:00:00Z", + word_count=3, + status="error", # UI: close + meta_json='{"generated_tags":["AltTag"]}', + ) + ) + existing_published_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="published-1", + source_hash="published-hash-1", + title="Alt Published", + source_url="https://example.org/published-article", + canonical_url="https://example.org/published-article", + published_at=None, + author="Autor", + summary="Alt", + content_raw="Alt Raw", + content_rewritten="

    Alt Rewrite Published

    ", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=77, + wp_post_url="https://wp.local/?p=77", + publish_attempts=3, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:10:00Z", + word_count=3, + status="published", + meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}', + ) + ) + + mock_extract_article.return_value = ExtractedArticle( + title="Neu Titel", + author="Neu Autor", + canonical_url=None, + summary="Neu Summary", + content_text="Neu Volltext", + images=["https://example.org/a.jpg"], + press_contact=None, + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-2", + "modified": "Tue, 18 Feb 2026 11:00:00 GMT", + "entries": [ + { + "id": "closed-1", + "title": "Closed Entry", + "link": "https://example.org/closed-article", + "summary": "X", + }, + { + "id": "published-1", + "title": "Published Entry", + "link": "https://example.org/published-article", + "summary": "Y", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + closed_row = get_article_by_id(existing_closed_id) or {} + self.assertEqual(closed_row["status"], "error") + self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "") + self.assertEqual(closed_row.get("wp_post_id"), 42) + + published_row = get_article_by_id(existing_published_id) or {} + self.assertEqual(published_row["status"], "published") + self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "") + self.assertEqual(published_row.get("wp_post_id"), 77) + self.assertIn("generated_tags", published_row.get("meta_json") or "") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_publisher.py b/backend/tests/test_publisher.py new file mode 100644 index 0000000..a32150e --- /dev/null +++ b/backend/tests/test_publisher.py @@ -0,0 +1,112 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestPublisher(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "publisher.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + os.environ["WORDPRESS_BASE_URL"] = "https://example.org" + os.environ["WORDPRESS_USERNAME"] = "wp-user" + os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + for key in ( + "APP_DB_PATH", + "APP_ADMIN_USERNAME", + "APP_ADMIN_PASSWORD", + "WORDPRESS_BASE_URL", + "WORDPRESS_USERNAME", + "WORDPRESS_APP_PASSWORD", + ): + os.environ.pop(key, None) + self.tmp_dir.cleanup() + + def _create_publishable_article(self) -> int: + source = self.client.post( + "/api/sources", + json={ + "name": "WP Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + source_id = source.json()["id"] + feed = self.client.post( + "/api/feeds", + json={"name": "WP Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "pub-1", + "source_hash": "pub-hash-1", + "title": "Publish Artikel", + "source_url": "https://example.org/article/1", + "canonical_url": "https://example.org/article/1", + "published_at": "2026-02-18T00:00:00Z", + "author": "Autor", + "summary": "Kurz", + "content_raw": "Langtext", + "image_urls_json": "[\"https://example.org/img.jpg\"]", + "press_contact": "Kontakt", + "source_name_snapshot": "WP Source", + "source_terms_url_snapshot": "https://example.org/terms", + "source_license_name_snapshot": "cc-by", + "legal_checked": True, + "status": "approved", + "meta_json": "{\"image_review\":{\"selected_url\":\"https://example.org/img.jpg\"}}", + }, + ) + return article.json()["id"] + + @patch("backend.app.publisher.publish_article_draft") + def test_enqueue_and_run_publisher(self, mock_publish) -> None: + mock_publish.return_value = (777, "https://example.org/?p=777") + article_id = self._create_publishable_article() + + enqueue = self.client.post("/api/publisher/enqueue", json={"article_id": article_id, "max_attempts": 3}) + self.assertEqual(enqueue.status_code, 200) + + run = self.client.post("/api/publisher/run", json={"max_jobs": 5}) + self.assertEqual(run.status_code, 200) + stats = run.json()["stats"] + self.assertEqual(stats["success"], 1) + + article = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(article.status_code, 200) + item = article.json()["item"] + self.assertEqual(item["status"], "published") + self.assertEqual(item["wp_post_id"], 777) + self.assertIn("?p=777", item["wp_post_url"] or "") + + jobs = self.client.get("/api/publisher/jobs") + self.assertEqual(jobs.status_code, 200) + self.assertGreaterEqual(len(jobs.json()["items"]), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_relevance.py b/backend/tests/test_relevance.py new file mode 100644 index 0000000..573e312 --- /dev/null +++ b/backend/tests/test_relevance.py @@ -0,0 +1,21 @@ +from datetime import datetime, timezone +import unittest + +from backend.app.relevance import article_age_days, article_relevance + + +class TestRelevance(unittest.TestCase): + def test_article_age_and_relevance(self) -> None: + now = datetime(2026, 2, 18, 12, 0, 0, tzinfo=timezone.utc) + self.assertEqual(article_age_days("2026-02-18T10:00:00Z", now=now), 0) + self.assertEqual(article_relevance("2026-02-18T10:00:00Z", now=now), "hoch") + + self.assertEqual(article_age_days("2026-02-14T12:00:00Z", now=now), 4) + self.assertEqual(article_relevance("2026-02-14T12:00:00Z", now=now), "mittel") + + self.assertEqual(article_relevance("2025-12-01T00:00:00Z", now=now), "alt") + self.assertEqual(article_relevance(None, now=now), "unbekannt") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_source_extraction.py b/backend/tests/test_source_extraction.py new file mode 100644 index 0000000..5cafde7 --- /dev/null +++ b/backend/tests/test_source_extraction.py @@ -0,0 +1,96 @@ +import unittest +from unittest.mock import patch + +from backend.app.source_extraction import extract_article + + +SAMPLE_HTML = """ + + + + + + + + + + + +
    +

    Dies ist der vollstaendige Inhalt des Artikels.

    +

    Weitere relevante Informationen fuer die Meldung.

    +

    Pressekontakt

    +

    Musterfirma GmbH, Kontakt: presse@example.org

    +
    + + +""" + +SAMPLE_HTML_AGENTUR = """ + + + + + + + +
    +

    Inhalt der Meldung.

    +

    Agentur

    +

    Agenturname GmbH

    +

    presse@agentur.example

    +

    Original-Content von Beispiel

    +
    + + +""" + + +class _FakeHeaders: + @staticmethod + def get_content_charset(): + return "utf-8" + + +class _FakeResponse: + headers = _FakeHeaders() + + def __init__(self, body: str): + self._body = body.encode("utf-8") + + def read(self): + return self._body + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +class TestSourceExtraction(unittest.TestCase): + @patch("backend.app.source_extraction.urlopen") + def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None: + mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML) + + extracted = extract_article("https://www.presseportal.de/pm/118273/6158137") + self.assertEqual(extracted.title, "Demo Meldung von Presseportal") + self.assertEqual(extracted.author, "Max Mustermann") + self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137") + self.assertIn("vollstaendige Inhalt", extracted.content_text or "") + self.assertIn("Kurzbeschreibung", extracted.summary or "") + self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images) + self.assertIn("Pressekontakt", extracted.press_contact or "") + self.assertIsNone(extracted.extraction_error) + + @patch("backend.app.source_extraction.urlopen") + def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None: + mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR) + extracted = extract_article("https://www.presseportal.de/pm/155103/6210401") + self.assertIn("Agentur", extracted.press_contact or "") + self.assertIn("Agenturname", extracted.press_contact or "") + self.assertIn("presse@agentur.example", extracted.press_contact or "") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py new file mode 100644 index 0000000..20b0618 --- /dev/null +++ b/backend/tests/test_wordpress.py @@ -0,0 +1,139 @@ +import os +import unittest +from unittest.mock import patch + +from backend.app import config as config_module +from backend.app.wordpress import publish_article_draft + + +class TestWordpressPublish(unittest.TestCase): + def setUp(self) -> None: + os.environ["WORDPRESS_BASE_URL"] = "https://example.org" + os.environ["WORDPRESS_USERNAME"] = "wp-user" + os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" + config_module.get_settings.cache_clear() + + def tearDown(self) -> None: + for key in ("WORDPRESS_BASE_URL", "WORDPRESS_USERNAME", "WORDPRESS_APP_PASSWORD"): + os.environ.pop(key, None) + config_module.get_settings.cache_clear() + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_sets_featured_media_when_selected_image_exists(self, mock_wp_request, mock_upload_media) -> None: + mock_upload_media.return_value = 456 + mock_wp_request.return_value = {"id": 321, "link": "https://example.org/?p=321"} + + article = { + "title": "Testartikel", + "content_raw": "Inhalt", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": '{"image_review":{"selected_url":"https://example.com/image.jpg"}}', + } + post_id, post_url = publish_article_draft(article) + + self.assertEqual(post_id, 321) + self.assertIn("?p=321", post_url or "") + self.assertTrue(mock_upload_media.called) + payload = mock_wp_request.call_args.kwargs["payload"] + self.assertEqual(payload.get("featured_media"), 456) + self.assertIn("", payload.get("content", "")) + self.assertIn("

    Inhalt

    ", payload.get("content", "")) + self.assertNotIn("excerpt", payload) + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_without_selected_image_has_no_featured_media(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 654, "link": "https://example.org/?p=654"} + + article = { + "title": "Testartikel", + "content_raw": "Inhalt", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + post_id, _ = publish_article_draft(article) + + self.assertEqual(post_id, 654) + self.assertFalse(mock_upload_media.called) + payload = mock_wp_request.call_args.kwargs["payload"] + self.assertNotIn("featured_media", payload) + self.assertIn("

    Inhalt

    ", payload.get("content", "")) + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_strips_feed_header_and_press_contact(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 100, "link": "https://example.org/?p=100"} + article = { + "title": "Header Test", + "content_raw": "21.02.2026 10:00\nFirma GmbH\n(ots)\nDas ist der eigentliche Text.\nPressekontakt: Test Person", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + publish_article_draft(article) + payload = mock_wp_request.call_args.kwargs["payload"] + content = payload.get("content", "") + self.assertNotIn("Firma GmbH", content) + self.assertNotIn("Pressekontakt", content) + self.assertIn("eigentliche Text", content) + self.assertNotIn("Artikeldetails", content) + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None: + def _fake_wp_request(**kwargs): + endpoint = kwargs.get("endpoint", "") + method = kwargs.get("method", "") + if method == "GET" and endpoint.startswith("tags?search="): + if "Rheingas" in endpoint: + return [{"id": 11, "name": "Rheingas"}] + return [] + if method == "POST" and endpoint == "tags": + name = (kwargs.get("payload") or {}).get("name") + if name == "Gasflasche": + return {"id": 12, "name": "Gasflasche"} + return {"id": 13, "name": str(name)} + if method == "POST" and endpoint == "posts": + return {"id": 900, "link": "https://example.org/?p=900"} + return {} + + mock_wp_request.side_effect = _fake_wp_request + article = { + "title": "Tag Test", + "content_raw": "Inhalt", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}', + } + post_id, _ = publish_article_draft(article) + self.assertEqual(post_id, 900) + post_calls = [call for call in mock_wp_request.call_args_list if call.kwargs.get("endpoint") == "posts"] + self.assertEqual(len(post_calls), 1) + payload = post_calls[0].kwargs.get("payload", {}) + self.assertEqual(payload.get("tags"), [11, 12]) + + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"} + article = { + "title": "Block Test", + "content_rewritten": "

    Überschrift

    Absatz 1

    • A
    • B
    ", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + publish_article_draft(article) + payload = mock_wp_request.call_args.kwargs["payload"] + content = payload.get("content", "") + self.assertIn("", content) + self.assertIn("", content) + self.assertNotIn("", content) + + +if __name__ == "__main__": + unittest.main() diff --git a/docs/AUTOMATION.md b/docs/AUTOMATION.md new file mode 100644 index 0000000..8008857 --- /dev/null +++ b/docs/AUTOMATION.md @@ -0,0 +1,190 @@ +# Automatischer Pipeline-Betrieb + +## Überblick + +Das System läuft vollautomatisch und benötigt nur noch gelegentliche Telegram-Interaktion. + +``` +N8N (2× täglich, 08:00 + 16:00 Uhr) + └─► POST /api/n8n/pipeline (X-API-Key Header) + ├── RSS Ingestion (alle aktivierten Feeds) + ├── Relevanz-Score per GPT (0–100) + │ ├── Score ≥ 80 → Rewrite + WP-Draft + Telegram + │ ├── Score 60–79 → Telegram-Warnung + manueller Override möglich + │ └── Score < 60 → Abgelehnt + tägliche Telegram-Liste + └── Pipeline-Zusammenfassung via Telegram +``` + +--- + +## Einrichtung + +### 1. Umgebungsvariablen setzen + +Kopiere `backend/.env.example` nach `backend/.env` und fülle alle Felder aus: + +```bash +cp backend/.env.example backend/.env +nano backend/.env +``` + +Wichtige Variablen: + +| Variable | Beschreibung | +|----------|-------------| +| `TELEGRAM_BOT_TOKEN` | Bot-Token von @BotFather | +| `TELEGRAM_CHAT_ID` | Deine persönliche Chat-ID | +| `TELEGRAM_WEBHOOK_SECRET` | Zufälliger String (≥ 20 Zeichen) | +| `N8N_API_KEY` | Starker zufälliger API-Key | +| `OPENAI_API_KEY` | OpenAI API-Key | +| `WP_BASE_URL` | WordPress-URL | +| `WP_USERNAME` | WordPress-Benutzername | +| `WP_PASSWORD` | WordPress App-Passwort | + +### 2. Telegram-Webhook registrieren + +Nach dem Deployment einmalig aufrufen: + +```bash +curl -X POST https://news.vanityontour.de/api/telegram/setup-webhook \ + -H "Cookie: rss_news_session=" +``` + +Oder über die Admin-UI: Settings → Telegram Webhook einrichten. + +### 3. N8N Workflow einrichten + +In N8N einen neuen Workflow erstellen: + +**Trigger:** Cron +- Zeitplan 1: `0 8 * * *` (täglich 08:00) +- Zeitplan 2: `0 16 * * *` (täglich 16:00) + +**Aktion:** HTTP Request +- Method: `POST` +- URL: `https://news.vanityontour.de/api/n8n/pipeline` +- Header: `X-API-Key: ` + +**Fehlerbehandlung:** Bei HTTP-Fehler → E-Mail/Telegram-Alert + +--- + +## Telegram-Befehle + +| Befehl | Funktion | +|--------|----------| +| `/run` | Pipeline manuell starten | +| `/rejected` | Abgelehnte Artikel der letzten 3 Tage anzeigen | +| `/status` | Aktuellen Pipeline-Status | +| `/help` | Alle Befehle anzeigen | + +--- + +## Telegram-Benachrichtigungen + +### Neuer Draft erstellt +Wenn ein Artikel erfolgreich verarbeitet wurde: + +``` +✅ Neuer Draft erstellt +📰 [Artikel-Titel] +🟢 Relevanz-Score: 87/100 +📅 Vorgeschlagene Veröffentlichung: Mo, 24.03.2026 um 09:00 Uhr +🏷 #VanLife #Camping #Wohnmobil +🔗 Draft in WordPress öffnen + + [✏️ Neu schreiben] [❌ Verwerfen] +``` + +### Relevanz-Warnung (Score 60–79) +``` +⚠️ Artikel mit niedrigem Relevanz-Score +📰 [Artikel-Titel] +🟡 Score: 72/100 +💬 Artikel behandelt hauptsächlich... +🔗 Originalartikel + + [➕ Trotzdem verarbeiten] [❌ Ablehnen] +``` + +### Abgelehnte Artikel (Ende jedes Runs) +Liste aller abgelehnten Artikel mit Override-Buttons für jeden einzelnen. + +--- + +## Relevanz-Score + +Der GPT-basierte Score bewertet die Themenrelevanz für den VanLife/Camping-Blog: + +| Score | Aktion | +|-------|--------| +| 80–100 | Automatisch verarbeiten | +| 60–79 | Telegram-Warnung, manueller Override | +| 0–59 | Automatisch abgelehnt | + +Themen die hoch scored werden: Campingplätze, Stellplätze, Wohnmobile, Van-Ausbau, +Outdoor-Equipment, Wandern, Naturreisen, Roadtrips, Camping-Tipps. + +Schwellwerte sind in `.env` konfigurierbar: +``` +PIPELINE_RELEVANCE_AUTO=80 +PIPELINE_RELEVANCE_WARN=60 +``` + +--- + +## Veröffentlichungsplan + +- Maximal **2 Beiträge pro Tag** +- Bevorzugte Zeiten: **09:00 und 14:00 Uhr** (CET) +- Gleichmäßig über die Woche verteilt +- Der Vorschlag erscheint in der Telegram-Nachricht +- Manuell in WordPress setzen oder über WP Scheduling-Plugin automatisieren + +Einstellbar via: +``` +PIPELINE_MAX_DRAFTS_PER_DAY=2 +PIPELINE_PUBLISH_HOURS=9,14 +``` + +--- + +## API-Endpunkte (N8N / extern) + +Alle externen Endpunkte benötigen den Header `X-API-Key: `. + +| Methode | Endpunkt | Funktion | +|---------|----------|----------| +| `POST` | `/api/n8n/pipeline` | Komplette Pipeline starten | +| `POST` | `/api/n8n/ingest` | Nur RSS-Import (ohne Rewrite) | + +--- + +## Deployment (Hetzner via GitHub) + +Das Deployment läuft automatisch über GitHub Actions beim Push auf `main`: + +1. GitHub Action führt Tests aus +2. Bei Erfolg: SSH-Deploy auf Hetzner +3. `pip install -r requirements.txt` +4. Systemd-Dienst `rss-app` neu starten + +Workflow-Dateien: `.github/workflows/test.yml` und `.github/workflows/deploy.yml` + +--- + +## Troubleshooting + +**Pipeline läuft, aber keine Telegram-Nachrichten:** +- `TELEGRAM_BOT_TOKEN` und `TELEGRAM_CHAT_ID` prüfen +- Webhook-Status prüfen: `GET https://api.telegram.org/bot/getWebhookInfo` + +**N8N bekommt 401:** +- `N8N_API_KEY` in `.env` und N8N-Workflow-Header müssen übereinstimmen + +**Alle Artikel werden abgelehnt:** +- `PIPELINE_RELEVANCE_WARN` temporär auf 40 senken zum Testen +- Über `/rejected` + Override-Button manuell testen + +**Artikel werden doppelt importiert:** +- Deduplication läuft über `source_url` (eindeutig). Bereits verarbeitete Artikel werden nie erneut als Draft angelegt. diff --git a/docs/PROJECT_PLAN.md b/docs/PROJECT_PLAN.md new file mode 100644 index 0000000..3c61216 --- /dev/null +++ b/docs/PROJECT_PLAN.md @@ -0,0 +1,91 @@ +# Projektplan (Neustart) + +## Leitentscheidungen +- Bestehendes Repository wird weiterverwendet. +- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern. +- Hetzner bleibt Laufzeitplattform. +- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation. +- Auth initial nur mit einem User/Password. + +## Zielbild +Eine modulare News-Pipeline mit klaren Stufen: +1. Feed-Ingestion +2. Inhaltsanalyse und Normalisierung +3. Rewrite/Anreicherung +4. Legal- und Qualitaetschecks +5. WordPress-Publikation (Draft-first, Queue + Retry) +6. Monitoring/Logging + +## Grobe Zeitplanung (ohne Fixtermine) +- Phase 0: ca. 1 Woche +- Phase 1: ca. 2-4 Wochen +- Phase 2: ca. 2-3 Wochen +- Phase 3: fortlaufend + +## Phasen + +### Phase 0 - Grundlagen (jetzt) +- Doku und Wiki strukturieren +- Source-Policy definieren +- Redirect fuer `news.vanityontour.de` setzen +- GitHub Project als zentrale Planung scharfstellen + +### Phase 1 - MVP Core +- Neues FastAPI-Projektgeruest +- SQLite-Datenmodell (feeds, articles, runs, source_policy) +- Feed-Import mit Duplikaterkennung +- Admin-Login (ein User) +- Manuelle Review vor Publish +- Admin-UI fuer Rechtscheck, Bildauswahl, Relevanzbewertung + +### Phase 2 - Automation +- Job-Queue (asynchron) +- Regelbasierte Scheduler +- Retry/Dead-Letter-Handling +- Robustes Error-Reporting +- WordPress-Publisher (Draft) mit Mapping `article_id -> wp_post_id` + +### Phase 3 - Compliance und Skalierung +- Source-Whitelisting mit Pflichtfeldern +- Pflicht-Attribution pro Artikel +- Qualitaetsmetriken und Audit-Logs +- Optional: Passkey/WebAuthn + +## Aktueller Stand (Snapshot) +- Backend/API + Admin-UI lauffaehig +- Feed-Ingestion inkl. Originalartikel-Extraktion (Autor, Pressekontakt, Bilder) +- Bildkuration: + - automatische Scoring-Reduktion (u. a. Presseportal `story_big` priorisiert) + - manuelle Auswahl/Ausblendung im UI +- Rechts-/Publish-Gates aktiv: + - `legal_checked` Pflicht + - Hauptbild-Auswahl Pflicht + - Status-Workflow bis `published` +- WordPress-Publishing: + - Queue + Retry + Job-Historie + - Draft-Erstellung/Update erfolgreich getestet +- Exporte: + - JSON/CSV inkl. Datum/Alter/Relevanz + Attribution/Legal-Felder + +## Naechste Iteration (konkret) +1. WordPress `featured_media` Upload aus ausgewaehltem Hauptbild +2. Publish-HTML je Artikel verfeinern (strukturierter Body + konsistenter Quellenblock) +3. Publisher als periodischen Worker (Timer/Cron/Systemd) auf Hetzner betreiben +4. Monitoring/Alerting fuer Queue-Fehler + WP-API Fehlercodes + +## Architekturprinzipien +- Idempotente Jobs +- Trennung von UI, API, Worker +- Strikte Validierung bei Quell-/Lizenzdaten +- Expliziter Publish-Schritt, kein blindes Autoposting + +## Risiken +- Lizenz-/Nutzungsbedingungen je Quelle variieren stark +- Feeds aendern Struktur/Verfuegbarkeit +- WordPress-API und Auth koennen regressionsanfaellig sein + +## Erfolgsmetriken +- Zeit von Feed-Eingang bis Review-Ready +- Quote sauber attribuierter Artikel +- Fehlerrate pro Pipeline-Stufe +- Anzahl manueller Eingriffe pro Woche diff --git a/docs/SOURCE_POLICY.md b/docs/SOURCE_POLICY.md new file mode 100644 index 0000000..d1d2e0c --- /dev/null +++ b/docs/SOURCE_POLICY.md @@ -0,0 +1,81 @@ +# Source Policy und Feed-Vorschlaege + +## Grundsatz +Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt. + +## Pflichtdaten pro Quelle +- Quellname +- Feed-URL +- Originalartikel-URL +- Autor/Herausgeber (wenn vorhanden) +- Lizenz/Nutzungsgrundlage +- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung) +- Datum der letzten Pruefung +- Link auf Nutzungsbedingungen + +## Einstufung (Ampel) +- Gruen: Nutzung fuer geplantes Modell klar erlaubt +- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich +- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag + +## Verbindliche Regeln +- Keine neue Quelle ohne Eintrag im Source-Register +- Kein automatischer Publish bei Gelb/Rot +- Bilder separat pruefen (Textrecht != Bildrecht) +- Quartalsweiser Re-Check der Terms + +## Ersteinschaetzung (Stand: 16.02.2026) + +### Rot +1. Reuters / Thomson Reuters +- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung. +- Folge: Nur mit explizitem Vertrag/Lizenz. +- Referenz: + - https://www.thomsonreuters.com/en/terms-of-use + +2. tagesschau.de RSS +- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert). +- Folge: Nicht fuer das geplante Modell geeignet. +- Referenz: + - https://www.tagesschau.de/infoservices/rssfeeds + +### Gelb +1. Presseportal / ots +- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung. +- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte). +- Referenz: + - https://www.presseportal.de/nutzungsbedingungen + - https://www.presseportal.de/feeds/ + +2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz +- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert. +- Folge: Je Behoerde einzeln pruefen und dokumentieren. +- Beispiele: + - https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html + - https://bmas.bund.de/EN/Services/RSS/rss.html + +### Gruen (mit korrekter Attribution) +1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0` +- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen). +- Folge: Sehr gut fuer stabile Automatisierung geeignet. +- Referenz: + - https://www.govdata.de/dl-de/by-2-0 + - https://data.gov.de/informationen/lizenzen + - https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0 + +2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel +- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet. +- Folge: Geeignet, wenn Drittinhalte ausgenommen werden. +- Referenz: + - https://commission.europa.eu/legal-notice_en + - https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html + +## Quelle im Register freischalten (Definition of Done) +- Terms-Link hinterlegt +- Lizenzklasse (Gruen/Gelb/Rot) gesetzt +- Pflicht-Attribution dokumentiert +- Bildrechtsregel dokumentiert +- Letzte Pruefung und Verantwortlicher gepflegt + +## Hinweis +Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Prüfung sinnvoll. diff --git a/docs/TODO.md b/docs/TODO.md new file mode 100644 index 0000000..fee4a67 --- /dev/null +++ b/docs/TODO.md @@ -0,0 +1,38 @@ +# ToDo (Ein-Entwickler Setup) + +## Jetzt +- [ ] WordPress Beitragsbild-Upload implementieren (`featured_media` aus ausgewaehltem Hauptbild) +- [ ] WordPress-HTML-Ausgabe pro Artikel weiter verbessern (sauberes Layout, Quellenblock, Shortcodes falls noetig) +- [ ] Publisher Fehlertexte fuer WP-Auth/Media/API in UI klarer darstellen +- [ ] End-to-end Publish Smoke-Test dokumentieren (lokal + Hetzner) + +## MVP +- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI) +- [x] Datenmodell in SQLite anlegen +- [x] Feed-Ingestion Service bauen (ETag/Last-Modified) +- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash +- [x] Login mit 1 Admin-Account implementieren +- [x] Artikel-Review-Maske mit Statusworkflow +- [x] WordPress-Publisher als separaten Service implementieren (Queue + Retry + Mapping) +- [x] Bildvorschau + manuelle Bildauswahl im Admin-UI +- [x] Automatische Bildreduktion/Scoring fuer Presseportal-Quellen +- [x] Artikel-Datum + Relevanzscore im UI/Export + +## Recht/Qualitaet +- [x] Source-Policy in DB + Admin-UI abbilden +- [x] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) +- [x] Auto-Block bei fehlender Lizenzinfo +- [x] Pro Artikel Attribution-Block generieren +- [x] Manuelle Rechtsfreigabe als Publish-Gate + +## Betrieb +- [ ] Systemd-Service(s) fuer API/Worker erstellen +- [ ] Nginx-Routing fuer neue App einrichten +- [ ] Healthcheck-Endpunkte + Monitoring einrichten +- [ ] Backup/Restore fuer DB dokumentieren + +## Spaeter +- [ ] Passkey/WebAuthn evaluieren und optional einfuehren +- [ ] Migration auf PostgreSQL bewerten +- [ ] Teilautomatische Freigabe-Regeln definieren +- [ ] KI-Rewrite mit Prompt-Versionierung + Qualitaetsmetriken wieder aktivieren diff --git a/docs/wiki/Architektur.md b/docs/wiki/Architektur.md new file mode 100644 index 0000000..275b578 --- /dev/null +++ b/docs/wiki/Architektur.md @@ -0,0 +1,29 @@ +# Architektur + +## Zielarchitektur +- API: FastAPI +- Worker: Queue-basierte Hintergrundjobs +- DB: SQLite (MVP), spaeter optional PostgreSQL +- Publisher: WordPress REST API +- Frontend/Admin: schlanke Web-UI mit Login + +## Pipeline +1. Feed Fetch +2. Parse + Normalize +3. Deduplicate +4. Enrichment (Rewrite/Tags) +5. Legal/Policy Check +6. Publish (pending) + +## Datenobjekte (MVP) +- `sources` +- `feeds` +- `articles` +- `article_versions` +- `runs` +- `policy_checks` + +## Nichtziele (MVP) +- Multi-User und Rollen +- Vollautomatische Freigabe ohne Review +- Komplexe externe SSO-Integration diff --git a/docs/wiki/Deployment.md b/docs/wiki/Deployment.md new file mode 100644 index 0000000..91388c7 --- /dev/null +++ b/docs/wiki/Deployment.md @@ -0,0 +1,20 @@ +# Deployment (Hetzner + CloudPanel) + +## Umgebung +- Host: Hetzner +- Reverse Proxy: Nginx via CloudPanel +- Ziel-Domain: `news.vanityontour.de` + +## Aktueller Zustand +- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet. + +## Zielzustand +- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`) +- API/Worker laufen als systemd-Services +- TLS bleibt ueber CloudPanel/Nginx + +## Mindest-Checks nach Deployment +- `curl -I https://news.vanityontour.de` +- Login erreichbar +- Feed-Import laeuft +- WordPress-Testpublikation (pending) erfolgreich diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md new file mode 100644 index 0000000..300599a --- /dev/null +++ b/docs/wiki/Home.md @@ -0,0 +1,19 @@ +# Wiki Home + +## Zweck +Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`. + +## Inhalte +- `Architektur.md` +- `Deployment.md` +- `Security-Auth.md` +- `Recht-Quellen.md` +- `Operations-Runbook.md` +- `Roadmap.md` +- `Project-Board.md` + +## Projektsteuerung +- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1 + +## Prinzip +Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert. diff --git a/docs/wiki/Operations-Runbook.md b/docs/wiki/Operations-Runbook.md new file mode 100644 index 0000000..e6c0f88 --- /dev/null +++ b/docs/wiki/Operations-Runbook.md @@ -0,0 +1,43 @@ +# Operations Runbook + +## Daily Checks +- App erreichbar +- Queue/Worker aktiv +- Letzte Feed-Laeufe erfolgreich +- Keine auffaelligen Fehler im Log + +## Incident: Feed-Import faellt aus +1. RSS-Quelle erreichbar? +2. Parser-Fehler im Log? +3. Rate Limits oder Blockaden? +4. Retry-Queue pruefen + +## Incident: WordPress Publish faellt aus +1. WP API erreichbar? +2. Credentials gueltig? +3. Payload-Validation/Tag-Fehler? +4. Artikel in `pending` statt `failed` markieren, wenn unklar + +## Incident: Telegram-Buttons reagieren nicht / Befehle ignoriert + +**Ursache:** N8N "App Release - Telegram Bot"-Workflow hat den Webhook überschrieben. + +**Prüfen:** +```bash +curl -s "https://api.telegram.org/bot8403822424:AAGp8gZoNIGZv3IIan45q7P9HfM868qzXi4/getWebhookInfo" | python3 -m json.tool +``` +→ `url` muss auf `https://news.vanityontour.de/telegram/webhook` zeigen +→ `allowed_updates` muss `["message", "callback_query"]` enthalten + +**Webhook zurücksetzen:** +```bash +curl -s -X POST "https://api.telegram.org/bot8403822424:AAGp8gZoNIGZv3IIan45q7P9HfM868qzXi4/setWebhook" \ + -H "Content-Type: application/json" \ + -d '{"url":"https://news.vanityontour.de/telegram/webhook","allowed_updates":["message","callback_query"],"secret_token":"RWWAaBwfCUX9Y573JVkB9zAeloHsZZoruXOBBgUtsvU"}' +``` + +Vollständige Dokumentation: `projects/webhook/telegram-webhook-reset.md` + +## Backups +- SQLite-Dump taeglich +- Konfiguration und `.env` sicher sichern diff --git a/docs/wiki/Project-Board.md b/docs/wiki/Project-Board.md new file mode 100644 index 0000000..887ac19 --- /dev/null +++ b/docs/wiki/Project-Board.md @@ -0,0 +1,28 @@ +# Project Board Workflow + +## Zentrale Steuerung +- Board: https://github.com/users/OliverGiertz/projects/3/views/1 +- Board ist die einzige Quelle fuer Planungsstatus. + +## Arbeitsmodus (1 Entwickler) +- Neue Arbeit immer als Issue anlegen +- Issue direkt ins Project aufnehmen +- Status nur im Project pflegen +- PR/Commit auf Issue referenzieren + +## Empfohlene Status-Disziplin +- `Todo`: noch nicht begonnen +- `In Progress`: aktiv in Arbeit +- `Done`: umgesetzt und dokumentiert + +## Konventionen fuer Issues +- Prefix fuer Klarheit: + - `[MVP]` + - `[Infra]` + - `[Legal]` + - `[Bug]` +- Definition of Done in jedem Issue notieren + +## Aktueller Backlog-Hinweis +- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User). +- Entsprechende Issues als `deferred` oder `closed` kennzeichnen. diff --git a/docs/wiki/Recht-Quellen.md b/docs/wiki/Recht-Quellen.md new file mode 100644 index 0000000..212f0d5 --- /dev/null +++ b/docs/wiki/Recht-Quellen.md @@ -0,0 +1,35 @@ +# Recht und Quellen + +## Grundregeln +- Nur freigegebene Quellen aus Source-Register +- Pflicht-Attribution pro Artikel +- Rechte fuer Bilder separat pruefen +- Kein Autopublish bei unklarer Lizenz + +## Bewertungsmodell +- Gruen: Freie Nachnutzung klar erlaubt +- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung +- Rot: Ohne Zusatzlizenz nicht geeignet + +## Aktuelle Referenzen +- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use +- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen +- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds +- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0 +- GovData Lizenzen: https://data.gov.de/informationen/lizenzen +- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en + +## Review-Checkliste je Quelle +1. Sind Bearbeitung und Veroeffentlichung erlaubt? +2. Ist kommerzielle Nutzung erlaubt? +3. Gibt es gesonderte Bildrechte? +4. Ist die Quellenangabe vorgeschrieben? +5. Gibt es Archivierungs- oder Weitergabebeschraenkungen? + +## Operativer Schutz +- Source-Register als Pflicht vor Feed-Aktivierung +- Auto-Block bei fehlenden Lizenzdaten +- Quartalsweiser Terms-Recheck + +## Hinweis +Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren. diff --git a/docs/wiki/Roadmap.md b/docs/wiki/Roadmap.md new file mode 100644 index 0000000..fece89e --- /dev/null +++ b/docs/wiki/Roadmap.md @@ -0,0 +1,19 @@ +# Roadmap + +## Jetzt +- Doku und Projektstruktur bereinigen +- Redirect aktiv +- Backlog auf Neustart ausrichten + +## Naechster Schritt +- FastAPI-MVP implementieren +- Login + Feed-Ingestion + Review + WordPress pending + +## Danach +- Worker/Queue +- Source-Policy Enforcement +- Monitoring/Reporting +- Optional Passkey + +## Steuerung +Alle Arbeitsitems liegen im GitHub Project #3. diff --git a/docs/wiki/Security-Auth.md b/docs/wiki/Security-Auth.md new file mode 100644 index 0000000..a9f830a --- /dev/null +++ b/docs/wiki/Security-Auth.md @@ -0,0 +1,16 @@ +# Security und Auth + +## Mindestanforderungen +- Zugriff auf die WebApp nur mit Login +- Ein aktiver Admin-User (kein Rollenmodell im MVP) +- Passwort nicht im Repo, nur als Secret auf Server + +## Empfohlene Umsetzung +- Session-basierte Auth (HTTP-only Cookies) +- Passwort gehasht (Argon2 oder bcrypt) +- Rate Limiting auf Login-Endpunkt +- CSRF-Schutz fuer Form-Aktionen + +## Spaeter (optional) +- Passkey/WebAuthn als zusaetzlicher Login-Faktor +- IP-Allowlist fuer Admin-Zugang diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c15b448 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = backend/tests +python_files = test_*.py +addopts = -q --maxfail=1 diff --git a/scripts/smoke_backend.sh b/scripts/smoke_backend.sh new file mode 100755 index 0000000..f0000ad --- /dev/null +++ b/scripts/smoke_backend.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ -z "${BASE_URL:-}" ]]; then + echo "BASE_URL fehlt (z. B. https://news.vanityontour.de)" + exit 1 +fi + +if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then + echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen" + exit 1 +fi + +cookie_file="$(mktemp)" +trap 'rm -f "$cookie_file"' EXIT + +echo "[1/4] Healthcheck" +curl -fsS "${BASE_URL}/health" | grep -q '"status":"ok"' + +echo "[2/4] Login" +curl -fsS -c "$cookie_file" \ + -H "Content-Type: application/json" \ + -X POST "${BASE_URL}/auth/login" \ + -d "{\"username\":\"${APP_ADMIN_USERNAME}\",\"password\":\"${APP_ADMIN_PASSWORD}\"}" \ + | grep -q '"ok":true' + +echo "[3/4] Protected Endpoint" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected" | grep -q '"ok":true' + +echo "[4/4] Pipeline Status" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status" | grep -q '"stage":"skeleton+db"' + +echo "Smoke test erfolgreich."