diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index af3394f..5d55808 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,16 +19,9 @@ jobs: username: oliver key: ${{ secrets.HETZNER_SSH_KEY }} port: 22 - envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD script: | - cd /opt/rss-news + cd rss-news git pull origin main source .venv/bin/activate pip install -r requirements.txt - pip install -r backend/requirements.txt || true - sudo systemctl restart rss-news-api - sleep 3 - BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh - env: - APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }} - APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }} + sudo systemctl restart rss-app diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 1d627db..0000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Backend Tests - -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - backend-tests: - runs-on: ubuntu-latest - timeout-minutes: 15 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r backend/requirements.txt - pip install -r backend/requirements-test.txt - - - name: Run tests with coverage - env: - APP_DB_PATH: /tmp/rss_news_test.db - run: | - pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml - - - name: Upload coverage artifact - uses: actions/upload-artifact@v4 - with: - name: coverage-xml - path: coverage.xml diff --git a/.gitignore b/.gitignore index aac3a2f..fcbde33 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,3 @@ internal/copy_files.sh internal/_line.txt internal/push_commit.txt internal/git.sh -CLAUDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 66b7237..fa80967 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,42 +1,10 @@ -## [1.7.1] - 2025-08-24 +## [1.7.1] - 2025-08-28 -### ✨ Security angepasst - - alle Credentials in die .env Datei verschoben - - beim Start der App werden die Credentials geprüft und beim fehlen entsprechende Meldungen ausgegeben - ---- +- Beschreibung... ## [1.7.0] - 2025-08-24 -### Multi-Select & Massenoperationen: - - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich - - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons - - ✅ Massenoperationen für ausgewählte Artikel: - - Bulk Status-Änderung für mehrere Artikel gleichzeitig - - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung - - Bulk WordPress-Upload nur für "Process"-Artikel - - Bulk Papierkorb-Funktion - -### Schnellaktionen Integration: - - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar - - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert - - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln) - -### 🔧 Verbesserungen - - - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration - - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig - - Feedback: Detaillierte Statusmeldungen bei Massenoperationen - - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl - -### 🏗️ Technische Änderungen - - - Session State Erweiterung um selected_articles Set - - Neue Bulk-Operation-Funktionen in app.py:326-467 - - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design - - Integration bestehender WordPress-Upload und Rewrite-Funktionen - ---- +- Beschreibung... ## [1.6.3] - 2025-08-18 diff --git a/README.md b/README.md index b3c2b4a..6846a41 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,89 @@ -# rss-news (Rebuild) +# 📰 RSS News Bot -`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut. +Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung. -Aktueller Stand: -- Alte Streamlit-App wird nicht produktiv genutzt. -- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet. -- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt. +![Version](https://img.shields.io/badge/version-1.5.2-blue) +![License](https://img.shields.io/badge/license-MIT-green) +![Python](https://img.shields.io/badge/python-3.10+-yellow) +![Streamlit](https://img.shields.io/badge/built%20with-Streamlit-ff4b4b) -## Ziele -- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln -- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen) -- Zuverlaessige Automatisierung auf Hetzner -- Publikation nach WordPress (IONOS aktuell, spaeter offen) -- Zugriff nur nach Login (zunaechst User/Password) +--- -## Architektur-Richtung (MVP) -- Backend: `Python + FastAPI` -- Jobs: Queue-Worker (z. B. Redis + RQ/Celery) -- Daten: SQLite fuer MVP, spaeter optional PostgreSQL -- Auth: Session-Login mit einem Admin-User -- Publishing: WordPress REST API (Status zunaechst `pending`) +## 🚀 Features -Details: `docs/PROJECT_PLAN.md` +- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren) +- ✍️ **Artikel automatisch umschreiben** mit GPT-4 +- 🏷️ **Tags automatisch generieren** +- 🖼️ **Bilder aus Originalartikeln extrahieren** +- 🪄 **Optionales DALL·E-Bild generieren** +- 🔧 **Bearbeiten von Bildmetadaten** +- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)** +- 📜 **Log-Viewer-Seite integriert** +- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet** +- 📋 Artikeltabelle mit Status-Filter +- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern +- 🪄 Button für KI-Bildgenerierung -## Projektsteuerung -- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1` -- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen. -- Wiki-Struktur liegt unter `docs/wiki/`. -## Dokumentation -- Projektplan: `docs/PROJECT_PLAN.md` -- ToDo-Liste: `docs/TODO.md` -- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md` -- Wiki Home: `docs/wiki/Home.md` +--- -## Lokale Entwicklung (Legacy-Code) -Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden: +## 🧱 Projektstruktur + +ss-news/ +├── app.py # Haupt-UI mit Streamlit +├── main.py # Logik für Feed-Import und Verarbeitung +├── utils/ +│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren +│ └── dalle_generator.py # DALL·E-Integration (KI-Bild) +├── pages/ +│ └── log_viewer.py # UI zur Anzeige der Logs +├── data/ +│ └── articles.json # Gespeicherte Artikel +│ └── feeds.json # Gespeicherte Feed-URLs +├── logs/ +│ └── rss_tool.log # Logging der Verarbeitung +├── versioning.py # CLI-Tool zur Versionierung & Release +├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases +├── version.py # Aktuelle Version +└── CHANGELOG.md # Änderungsprotokoll + + +--- + +## ⚙️ Installation ```bash +git clone https://github.com/OliverGiertz/rss-news.git +cd rss-news python -m venv .venv source .venv/bin/activate pip install -r requirements.txt -streamlit run app.py ``` -Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt. +--- -## Deployment-Zielbild -- Betrieb auf Hetzner -- Reverse Proxy via CloudPanel/Nginx -- Produktive Domain: `news.vanityontour.de` -- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de` +## Update +Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca -## Sicherheit -- Keine Secrets im Repository -- `.env` lokal/auf Server, nie committen -- Auth-Pflicht fuer die neue WebApp -- spaeter optional: Passkeys/WebAuthn +```bash +bash update.sh +``` -## Rechtlicher Hinweis -Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig. +## ▶️ Starten der App + +streamlit run app.py + +--- + +## 🔐 Konfiguration (.env) + +Lege eine `.env` im Projekt an (siehe `.env.example`). Erforderliche Variablen: + +- `WP_BASE_URL`: Basis-URL deiner WordPress-Seite (z. B. https://example.com) +- Authentifizierung (eine Option wählen): + - `WP_AUTH_BASE64`: Bevorzugt. Base64 von `username:application_password` + - oder `WP_USERNAME` und `WP_PASSWORD`: Benutzer + Anwendungspasswort +- Optional: `OPENAI_API_KEY` für das Umschreiben von Artikeln + +Hinweis: Der Code liest ausschließlich aus `.env`. Es gibt keine hartkodierten Standard-Credentials. diff --git a/backend/.env.example b/backend/.env.example deleted file mode 100644 index c2dd235..0000000 --- a/backend/.env.example +++ /dev/null @@ -1,45 +0,0 @@ -# ─── App ──────────────────────────────────────────────────────────────────── -APP_ENV=development -APP_NAME=rss-news-backend -APP_SECRET_KEY=replace-with-a-long-random-secret -APP_DB_PATH=backend/data/rss_news.db - -APP_ADMIN_USERNAME=admin -APP_ADMIN_PASSWORD=change-me - -SESSION_COOKIE_NAME=rss_news_session -SESSION_MAX_AGE_SECONDS=28800 - -# ─── WordPress ────────────────────────────────────────────────────────────── -WP_BASE_URL=https://your-site.tld -WP_USERNAME=your-wp-username -WP_PASSWORD=your-wp-app-password -# Status für neue Beiträge: draft | future | publish -WORDPRESS_DEFAULT_STATUS=draft - -# ─── OpenAI ───────────────────────────────────────────────────────────────── -OPENAI_API_KEY=sk-... -# gpt-4o-mini empfohlen (Kosten/Qualität) -OPENAI_MODEL=gpt-4o-mini - -# ─── Telegram Bot ──────────────────────────────────────────────────────────── -# Bot-Token von @BotFather -TELEGRAM_BOT_TOKEN=123456789:ABC... -# Chat-ID deines persönlichen Chats oder einer Gruppe -TELEGRAM_CHAT_ID=123456789 -# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen) -TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars - -# ─── N8N API-Key ───────────────────────────────────────────────────────────── -# Wird von N8N im Header X-API-Key mitgeschickt -N8N_API_KEY=replace-with-strong-random-key - -# ─── Pipeline-Einstellungen ────────────────────────────────────────────────── -# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100) -PIPELINE_RELEVANCE_AUTO=80 -# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden -PIPELINE_RELEVANCE_WARN=60 -# Maximale Drafts/Veröffentlichungen pro Tag -PIPELINE_MAX_DRAFTS_PER_DAY=2 -# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET) -PIPELINE_PUBLISH_HOURS=9,14 diff --git a/backend/README.md b/backend/README.md deleted file mode 100644 index 7d64a65..0000000 --- a/backend/README.md +++ /dev/null @@ -1,82 +0,0 @@ -# Backend Skeleton (FastAPI) - -Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`. - -## Start (lokal) - -```bash -python -m venv .venv -source .venv/bin/activate -pip install -r backend/requirements.txt -uvicorn backend.app.main:app --reload --port 8501 -``` - -## Admin UI -- Login: `http://127.0.0.1:8501/admin/login` -- Dashboard: `http://127.0.0.1:8501/admin/dashboard` - -## Environment -- Datei: `backend/.env` -- Vorlage: `backend/.env.example` - -## Endpoints -- `GET /health` - Healthcheck -- `POST /auth/login` - Login mit Admin-User -- `POST /auth/logout` - Logout -- `GET /auth/me` - Aktiver User -- `GET /api/protected` - Geschuetzter Test-Endpoint -- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler -- `GET /api/sources` - Quellenliste -- `POST /api/sources` - Quelle anlegen -- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle -- `GET /api/feeds` - Feedliste -- `POST /api/feeds` - Feed anlegen -- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed -- `GET /api/runs` - Import-/Job-Runs anzeigen -- `GET /api/runs/{run_id}` - Detailansicht eines Runs -- `POST /api/runs` - Run starten -- `POST /api/runs/{run_id}/finish` - Run abschliessen -- `GET /api/articles` - Artikel anzeigen -- `GET /api/articles/{article_id}` - Artikeldetail -- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren -- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln -- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject) -- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed) - -## Datenbank -- SQLite-Datei unter `backend/data/rss_news.db` -- Tabellen werden beim App-Start initialisiert. -- Tabellen: `sources`, `feeds`, `runs`, `articles` -- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash` - -## Policy-Enforcement -- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist. -- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`. -- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert. - -## Review-Workflow -- Statuskette: `new -> review -> approved -> published` -- Ablehnung im Review setzt auf `rewrite` -- Ungueltige Statuswechsel werden per API blockiert - -## Verifikation -```bash -python -m unittest backend.tests.test_db_repositories -python -m unittest backend.tests.test_ingestion -python -m unittest backend.tests.test_api_auth -``` - -## CI / Online-Auswertung -- GitHub Actions Workflow: `.github/workflows/test.yml` -- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus. - -## Hetzner Smoketest -```bash -BASE_URL="https://news.vanityontour.de" \ -APP_ADMIN_USERNAME="admin" \ -APP_ADMIN_PASSWORD="..." \ -bash scripts/smoke_backend.sh -``` - -## Hinweis -Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen. diff --git a/backend/__init__.py b/backend/__init__.py deleted file mode 100644 index 3623851..0000000 --- a/backend/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Backend package for rss-news rebuild.""" diff --git a/backend/app/__init__.py b/backend/app/__init__.py deleted file mode 100644 index 18b665e..0000000 --- a/backend/app/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Application package.""" diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py deleted file mode 100644 index a25199c..0000000 --- a/backend/app/admin_ui.py +++ /dev/null @@ -1,1126 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -import re -import socket -import ssl -import time -from urllib.parse import urlparse -from urllib.parse import urlencode -from urllib.request import Request as UrlRequest, urlopen - -from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse, RedirectResponse, Response -from fastapi.templating import Jinja2Templates - -from .auth import create_session_token, verify_credentials, verify_session_token -from .config import get_settings -from .ingestion import run_ingestion -from .policy import evaluate_source_policy -from .publisher import enqueue_publish, run_publisher -from .relevance import article_age_days, article_relevance -from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text -from .repositories import ( - FeedCreate, - FeedUpdate, - SourceCreate, - SourceUpdate, - delete_feed, - delete_source, - create_feed, - create_source, - get_article_by_id, - get_feed_by_id, - list_articles, - list_articles_page, - bulk_update_wp_post_ids, - list_feeds, - list_publish_jobs, - list_runs, - list_sources, - set_article_image_decision, - upsert_article, - update_feed, - update_source, - update_article_status, - ArticleUpsert, -) -from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status - -settings = get_settings() -router = APIRouter(tags=["admin-ui"]) -templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates")) -ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { - "new": ("rewrite", "close"), - "rewrite": ("publish", "close"), - "publish": ("published", "close"), - "published": ("rewrite", "close"), - "close": ("rewrite",), -} -IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0" -_UNSET = object() - - -def _admin_user(request: Request) -> str | None: - token = request.cookies.get(settings.session_cookie_name) - if not token: - return None - return verify_session_token(token) - - -def _to_optional_int(raw: str | None) -> int | None: - if raw is None: - return None - value = raw.strip() - if value == "": - return None - return int(value) - - -def _dashboard_redirect( - *, - msg: str | None = None, - msg_type: str = "success", - status_filter: str | None = None, -) -> RedirectResponse: - query: dict[str, str] = {} - if msg: - query["msg"] = msg - query["type"] = msg_type - if status_filter: - query["status_filter"] = status_filter - suffix = f"?{urlencode(query)}" if query else "" - return RedirectResponse(url=f"/admin/dashboard{suffix}", status_code=303) - - -def _parse_meta_json(raw: str | None) -> dict: - if not raw: - return {} - try: - parsed = json.loads(raw) - return parsed if isinstance(parsed, dict) else {} - except Exception: - return {} - - -def _read_article_images(article: dict, extraction: dict) -> list[str]: - images: list[str] = [] - if article.get("image_urls_json"): - try: - parsed_images = json.loads(article["image_urls_json"]) - if isinstance(parsed_images, list): - images = [str(item) for item in parsed_images if item] - except Exception: - images = [] - if not images and isinstance(extraction.get("images"), list): - images = [str(item) for item in extraction.get("images") if item] - # deduplicate preserving order - seen: set[str] = set() - deduped: list[str] = [] - for image in images: - if image not in seen: - seen.add(image) - deduped.append(image) - return deduped - - -def _is_probably_irrelevant_image(url: str) -> bool: - lowered = url.lower() - patterns = ( - r"logo", - r"icon", - r"sprite", - r"avatar", - r"favicon", - r"/ads/", - r"tracking", - r"pixel", - r"banner", - ) - return any(re.search(pattern, lowered) for pattern in patterns) - - -def _is_http_image_url(url: str) -> bool: - try: - parsed = urlparse(url) - except Exception: - return False - return parsed.scheme in {"http", "https"} and bool(parsed.netloc) - - -def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]: - all_images = _read_article_images(article, extraction) - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - selected_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - excluded_urls = image_review.get("excluded_urls") if isinstance(image_review.get("excluded_urls"), list) else [] - excluded_set = {str(item) for item in excluded_urls if item} - - entries: list[dict[str, object]] = [] - for url in all_images: - entries.append( - { - "url": url, - "proxy_url": f"/admin/images/proxy?{urlencode({'url': url})}", - "is_selected": selected_url == url, - "is_excluded": url in excluded_set, - "is_irrelevant_hint": _is_probably_irrelevant_image(url), - } - ) - return entries - - -def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]: - reasons: list[str] = [] - if internal_to_ui_status(article.get("status")) not in {"publish", "published"}: - reasons.append("Status ist nicht 'publish'") - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - if not selected_image: - reasons.append("Hauptbild nicht ausgewählt") - return len(reasons) == 0, reasons - - -def _classify_publish_error(error_message: str | None) -> tuple[str, str]: - text = (error_message or "").lower() - if not text.strip(): - return "ok", "-" - if "rechtsfreigabe fehlt" in text or "hauptbild nicht gesetzt" in text or "status ist nicht" in text: - return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Hauptbild)." - if "401" in text or "403" in text or "authorization" in text or "forbidden" in text or "unauthorized" in text: - return "auth", "WordPress Nutzer/App-Passwort prüfen." - if "404" in text and ("media" in text or "posts" in text or "wp-json" in text): - return "api", "WordPress REST-Endpunkt prüfen (`/wp-json/wp/v2`)." - if "timed out" in text or "timeout" in text or "nodename nor servname provided" in text or "name or service not known" in text: - return "dns", "DNS/Netzwerk zur WordPress-Domain prüfen." - if "media-upload fehlgeschlagen" in text or "liefert kein bild" in text or "featured_media" in text: - return "media", "Bild-URL/Format prüfen oder anderes Hauptbild auswählen." - return "unknown", "Fehlerdetails prüfen und bei Bedarf Job erneut starten." - - -def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]: - meta = article.get("meta", {}) - extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} - attribution = meta.get("attribution") if isinstance(meta.get("attribution"), dict) else {} - - checks: list[dict[str, str]] = [] - checks.append( - { - "label": "Original-Link vorhanden", - "status": "ok" if article.get("source_url") else "missing", - "value": article.get("source_url") or "-", - } - ) - checks.append( - { - "label": "Autor vorhanden", - "status": "ok" if article.get("author") else "missing", - "value": article.get("author") or "-", - } - ) - checks.append( - { - "label": "Bilder extrahiert", - "status": "ok" if article.get("image_urls_json") else "missing", - "value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0", - } - ) - checks.append( - { - "label": "Pressekontakt", - "status": "ok" if article.get("press_contact") else "missing", - "value": article.get("press_contact") or extraction.get("press_contact") or "-", - } - ) - checks.append( - { - "label": "Lizenz/Terms", - "status": "ok" if article.get("source_license_name_snapshot") and article.get("source_terms_url_snapshot") else "missing", - "value": f"{article.get('source_license_name_snapshot') or attribution.get('source_license_name') or '-'} | {article.get('source_terms_url_snapshot') or attribution.get('source_terms_url') or '-'}", - } - ) - checks.append( - { - "label": "Risiko-Status Quelle", - "status": "ok" if (feed and feed.get("source_risk_level") == "green") else "missing", - "value": feed.get("source_risk_level") if feed else "-", - } - ) - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - checks.append( - { - "label": "Hauptbild ausgewählt", - "status": "ok" if selected_image else "missing", - "value": selected_image or "-", - } - ) - return checks - - -def _build_connectivity_targets() -> list[dict[str, str]]: - targets: list[dict[str, str]] = [] - seen: set[tuple[str, str]] = set() - - def add_target(label: str, kind: str, value: str) -> None: - normalized = (value or "").strip() - if not normalized: - return - key = (kind, normalized.lower()) - if key in seen: - return - seen.add(key) - targets.append({"label": label, "kind": kind, "value": normalized}) - - add_target("OpenAI API", "host", "api.openai.com") - if settings.wordpress_base_url: - parsed = urlparse(settings.wordpress_base_url) - if parsed.hostname: - add_target("WordPress Host", "host", parsed.hostname) - wp_api_url = f"{settings.wordpress_base_url.rstrip('/')}/wp-json/wp/v2" - add_target("WordPress REST", "url", wp_api_url) - - for feed in list_feeds(): - name = (feed.get("name") or "").strip() or f"Feed #{feed.get('id')}" - feed_url = str(feed.get("url") or "").strip() - if not feed_url: - continue - parsed = urlparse(feed_url) - if parsed.hostname: - add_target(f"{name} (Feed)", "host", parsed.hostname) - add_target(f"{name} (Feed URL)", "url", feed_url) - - return targets - - -def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]: - kind = target.get("kind", "") - value = str(target.get("value") or "") - row: dict[str, object] = { - "label": target.get("label") or "-", - "kind": kind, - "target": value, - "dns_ok": False, - "dns_info": "-", - "tcp_ok": False, - "tcp_info": "-", - "http_ok": False, - "http_info": "-", - "duration_ms": 0, - "ok": False, - } - started = time.perf_counter() - try: - hostname = value if kind == "host" else (urlparse(value).hostname or "") - port = 443 - if kind == "url": - parsed = urlparse(value) - if parsed.scheme not in {"http", "https"}: - row["http_info"] = f"unsupported scheme: {parsed.scheme or '-'}" - return row - port = 443 if parsed.scheme == "https" else 80 - if not hostname: - row["dns_info"] = "host fehlt" - return row - - try: - addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP) - ips = sorted({entry[4][0] for entry in addr_info if entry and len(entry) > 4 and entry[4]}) - row["dns_ok"] = True - row["dns_info"] = ", ".join(ips[:3]) if ips else "resolved" - except Exception as exc: - row["dns_info"] = str(exc) - return row - - try: - socket.create_connection((hostname, port), timeout=4).close() - row["tcp_ok"] = True - row["tcp_info"] = f"port {port} erreichbar" - except Exception as exc: - row["tcp_info"] = str(exc) - return row - - if kind == "host": - row["http_ok"] = True - row["http_info"] = "n/a (host-only)" - row["ok"] = True - return row - - try: - req = UrlRequest( - url=value, - headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Accept": "*/*"}, - ) - with urlopen(req, timeout=6, context=ssl.create_default_context()) as resp: - code = getattr(resp, "status", None) or resp.getcode() - row["http_ok"] = True - row["http_info"] = f"HTTP {code}" - except Exception as exc: - row["http_info"] = str(exc) - return row - - row["ok"] = bool(row["dns_ok"] and row["tcp_ok"] and row["http_ok"]) - return row - finally: - row["duration_ms"] = int((time.perf_counter() - started) * 1000) - - -def _upsert_article_from_existing( - article: dict, - *, - content_rewritten: str | None = None, - status: str | None = None, - wp_post_id: int | None | object = _UNSET, - wp_post_url: str | None | object = _UNSET, - publish_attempts: int | object = _UNSET, - publish_last_error: str | None | object = _UNSET, - published_to_wp_at: str | None | object = _UNSET, - meta_json: str | None | object = _UNSET, -) -> None: - rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten - upsert_article( - ArticleUpsert( - feed_id=article.get("feed_id"), - source_article_id=article.get("source_article_id"), - source_hash=article.get("source_hash"), - title=article.get("title"), - source_url=article.get("source_url"), - canonical_url=article.get("canonical_url"), - published_at=article.get("published_at"), - author=article.get("author"), - summary=article.get("summary"), - content_raw=article.get("content_raw"), - content_rewritten=rewritten, - image_urls_json=article.get("image_urls_json"), - press_contact=article.get("press_contact"), - source_name_snapshot=article.get("source_name_snapshot"), - source_terms_url_snapshot=article.get("source_terms_url_snapshot"), - source_license_name_snapshot=article.get("source_license_name_snapshot"), - legal_checked=bool(int(article.get("legal_checked", 0))), - legal_checked_at=article.get("legal_checked_at"), - legal_note=article.get("legal_note"), - wp_post_id=article.get("wp_post_id") if wp_post_id is _UNSET else wp_post_id, - wp_post_url=article.get("wp_post_url") if wp_post_url is _UNSET else wp_post_url, - publish_attempts=int(article.get("publish_attempts", 0)) if publish_attempts is _UNSET else publish_attempts, - publish_last_error=article.get("publish_last_error") if publish_last_error is _UNSET else publish_last_error, - published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at, - word_count=len(str(rewritten or "").split()), - status=article.get("status") if status is None else status, - meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json, - ) - ) - - -@router.get("/admin", response_class=HTMLResponse) -def admin_index(request: Request): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - return RedirectResponse(url="/admin/dashboard", status_code=303) - - -@router.get("/admin/login", response_class=HTMLResponse) -def admin_login_page(request: Request): - return templates.TemplateResponse( - request, - "admin_login.html", - {"request": request, "title": "Admin Login", "error": request.query_params.get("error")}, - ) - - -@router.post("/admin/login") -def admin_login(request: Request, username: str = Form(...), password: str = Form(...)): - if not verify_credentials(username, password): - return RedirectResponse(url="/admin/login?error=1", status_code=303) - - token = create_session_token(username) - response = RedirectResponse(url="/admin/dashboard", status_code=303) - response.set_cookie( - key=settings.session_cookie_name, - value=token, - max_age=settings.session_max_age_seconds, - httponly=True, - secure=False, - samesite="lax", - ) - return response - - -@router.post("/admin/logout") -def admin_logout(): - response = RedirectResponse(url="/admin/login", status_code=303) - response.delete_cookie(settings.session_cookie_name) - return response - - -@router.get("/admin/dashboard", response_class=HTMLResponse) -def admin_dashboard(request: Request): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - sources = list_sources() - source_policy = {s["id"]: evaluate_source_policy(s) for s in sources} - feeds = list_feeds() - runs = list_runs(limit=30) - publish_jobs = list_publish_jobs(limit=30) - for job in publish_jobs: - category, hint = _classify_publish_error(job.get("error_message")) - job["error_category"] = category - job["error_hint"] = hint - status_filter = request.query_params.get("status_filter") - internal_filter = ui_to_internal_status(status_filter) if status_filter else None - if status_filter in set(UI_STATUSES): - articles = list_articles(limit=100, status_filter=internal_filter) - else: - status_filter = "" - articles = [a for a in list_articles(limit=250) if internal_to_ui_status(a.get("status")) != "close"][:100] - for article in articles: - meta = _parse_meta_json(article.get("meta_json")) - extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} - images = _read_article_images(article, extraction) - article["meta"] = meta - ready, reasons = _publish_readiness(article, meta) - article["publish_ready"] = ready - article["publish_blockers"] = reasons - article["extracted_images"] = images - article["image_entries"] = _build_image_entries(article, extraction, meta) - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - article["selected_image_proxy_url"] = ( - f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None - ) - if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): - article["press_contact"] = extraction.get("press_contact") - article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None - article["days_old"] = article_age_days(article.get("published_at")) - article["relevance"] = article_relevance(article.get("published_at")) - article["status_ui"] = internal_to_ui_status(article.get("status")) - tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else [] - article["generated_tags"] = [str(t) for t in tags if t] - - return templates.TemplateResponse( - request, - "admin_dashboard.html", - { - "request": request, - "title": "Admin Dashboard", - "user": user, - "sources": sources, - "source_policy": source_policy, - "feeds": feeds, - "runs": runs, - "publish_jobs": publish_jobs, - "articles": articles, - "status_options": list(UI_STATUSES), - "allowed_transitions": ALLOWED_TRANSITIONS, - "status_filter": status_filter, - "flash_msg": request.query_params.get("msg", ""), - "flash_type": request.query_params.get("type", "success"), - }, - ) - - -@router.get("/admin/connectivity", response_class=HTMLResponse) -def admin_connectivity(request: Request): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - checks = [_run_connectivity_check(target) for target in _build_connectivity_targets()] - ok_count = len([c for c in checks if c.get("ok")]) - error_count = len(checks) - ok_count - return templates.TemplateResponse( - request, - "admin_connectivity.html", - { - "request": request, - "title": "Connectivity Check", - "user": user, - "checks": checks, - "ok_count": ok_count, - "error_count": error_count, - }, - ) - - -@router.get("/admin/articles/{article_id}", response_class=HTMLResponse) -def admin_article_detail(request: Request, article_id: int): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - article = get_article_by_id(article_id) - if not article: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - - meta = _parse_meta_json(article.get("meta_json")) - article["meta"] = meta - extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} - extraction["images"] = _read_article_images(article, extraction) - if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str): - article["press_contact"] = extraction.get("press_contact") - article["extraction"] = extraction - publish_ready, publish_blockers = _publish_readiness(article, meta) - article["publish_ready"] = publish_ready - article["publish_blockers"] = publish_blockers - article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {} - article["image_entries"] = _build_image_entries(article, extraction, meta) - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - article["selected_image_proxy_url"] = ( - f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None - ) - article["days_old"] = article_age_days(article.get("published_at")) - article["relevance"] = article_relevance(article.get("published_at")) - article["status_ui"] = internal_to_ui_status(article.get("status")) - feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None - checklist = _legal_checklist(article, feed) - - return templates.TemplateResponse( - request, - "admin_article_detail.html", - { - "request": request, - "title": f"Artikel #{article_id}", - "user": user, - "article": article, - "feed": feed, - "checklist": checklist, - "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status_ui"), ()), - "flash_msg": request.query_params.get("msg", ""), - "flash_type": request.query_params.get("type", "success"), - }, - ) - - -@router.post("/admin/articles/{article_id}/images/decision") -def admin_article_image_decision( - request: Request, - article_id: int, - image_url: str = Form(...), - action: str = Form(...), -): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - ok = set_article_image_decision(article_id=article_id, image_url=image_url, action=action, actor=user) - if not ok: - return _dashboard_redirect(msg=f"Bildaktion fehlgeschlagen fuer Artikel #{article_id}", msg_type="error") - return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303) - - -@router.post("/admin/articles/{article_id}/publish-enqueue") -def admin_enqueue_publish(request: Request, article_id: int, max_attempts: str = Form("3")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - job_id = enqueue_publish(article_id=article_id, max_attempts=max(1, int(max_attempts))) - except Exception as exc: - return _dashboard_redirect(msg=f"Publish Queue Fehler fuer Artikel #{article_id}: {exc}", msg_type="error") - return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Publish-Job%20#{job_id}%20erstellt&type=success", status_code=303) - - -@router.post("/admin/publisher/run") -def admin_run_publisher(request: Request, max_jobs: str = Form("10")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - stats = run_publisher(max_jobs=max(1, int(max_jobs))) - except Exception as exc: - return _dashboard_redirect(msg=f"Publisher Fehler: {exc}", msg_type="error") - return _dashboard_redirect( - msg=f"Publisher: processed={stats.processed}, success={stats.success}, failed={stats.failed}, requeued={stats.requeued}" - ) - - -@router.get("/admin/images/proxy") -def admin_image_proxy(request: Request, url: str): - if not _is_http_image_url(url): - return Response(status_code=400) - - try: - referer = request.headers.get("referer", "") - req = UrlRequest( - url=url, - headers={ - "User-Agent": IMAGE_PROXY_USER_AGENT, - "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", - "Referer": referer or url, - }, - ) - with urlopen(req, timeout=10) as resp: - body = resp.read() - content_type = resp.headers.get("Content-Type", "application/octet-stream") - except Exception: - return Response(status_code=404) - - if not content_type.lower().startswith("image/"): - return Response(status_code=415) - return Response(content=body, media_type=content_type) - - -@router.post("/admin/sources/create") -def admin_create_source( - request: Request, - name: str = Form(...), - base_url: str = Form(""), - terms_url: str = Form(""), - license_name: str = Form(""), - risk_level: str = Form("yellow"), - last_reviewed_at: str = Form(""), -): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - try: - create_source( - SourceCreate( - name=name, - base_url=base_url or None, - terms_url=terms_url or None, - license_name=license_name or None, - risk_level=risk_level, - is_enabled=True, - notes=None, - last_reviewed_at=last_reviewed_at or None, - ) - ) - except Exception as exc: - return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error") - return _dashboard_redirect(msg="Quelle gespeichert") - - -@router.post("/admin/sources/{source_id}/update") -def admin_update_source( - request: Request, - source_id: int, - name: str = Form(...), - base_url: str = Form(""), - terms_url: str = Form(""), - license_name: str = Form(""), - risk_level: str = Form("yellow"), - is_enabled: str = Form("1"), - notes: str = Form(""), - last_reviewed_at: str = Form(""), -): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - ok = update_source( - source_id, - SourceUpdate( - name=name, - base_url=base_url or None, - terms_url=terms_url or None, - license_name=license_name or None, - risk_level=risk_level, - is_enabled=is_enabled == "1", - notes=notes or None, - last_reviewed_at=last_reviewed_at or None, - ), - ) - except Exception as exc: - return _dashboard_redirect(msg=f"Quelle #{source_id} Update fehlgeschlagen: {exc}", msg_type="error") - if not ok: - return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") - return _dashboard_redirect(msg=f"Quelle #{source_id} aktualisiert") - - -@router.post("/admin/sources/{source_id}/delete") -def admin_delete_source(request: Request, source_id: int): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - ok = delete_source(source_id) - if not ok: - return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error") - return _dashboard_redirect(msg=f"Quelle #{source_id} gelöscht") - - -@router.post("/admin/feeds/create") -def admin_create_feed( - request: Request, - name: str = Form(...), - url: str = Form(...), - source_id: str = Form(""), -): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - try: - create_feed( - FeedCreate( - name=name, - url=url, - source_id=_to_optional_int(source_id), - is_enabled=True, - ) - ) - except Exception as exc: - return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error") - return _dashboard_redirect(msg="Feed gespeichert") - - -@router.post("/admin/feeds/{feed_id}/update") -def admin_update_feed( - request: Request, - feed_id: int, - name: str = Form(...), - url: str = Form(...), - source_id: str = Form(""), - is_enabled: str = Form("1"), -): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - ok = update_feed( - feed_id, - FeedUpdate( - name=name, - url=url, - source_id=_to_optional_int(source_id), - is_enabled=is_enabled == "1", - ), - ) - except Exception as exc: - return _dashboard_redirect(msg=f"Feed #{feed_id} Update fehlgeschlagen: {exc}", msg_type="error") - if not ok: - return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") - return _dashboard_redirect(msg=f"Feed #{feed_id} aktualisiert") - - -@router.post("/admin/feeds/{feed_id}/delete") -def admin_delete_feed(request: Request, feed_id: int): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - ok = delete_feed(feed_id) - if not ok: - return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error") - return _dashboard_redirect(msg=f"Feed #{feed_id} gelöscht") - - -@router.post("/admin/ingestion/run") -def admin_run_ingestion(request: Request, feed_id: str = Form("")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - stats = run_ingestion(feed_id=_to_optional_int(feed_id)) - except Exception as exc: - return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error") - return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}") - - -@router.post("/admin/articles/{article_id}/review") -def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - return _dashboard_redirect(msg="Review-Aktion wurde durch Rewrite ersetzt", msg_type="error") - - -@router.post("/admin/articles/{article_id}/rewrite-run") -def admin_rewrite_run(request: Request, article_id: int): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - article = get_article_by_id(article_id) - if not article: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - if internal_to_ui_status(article.get("status")) not in {"new", "rewrite"}: - return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error") - try: - rewritten = rewrite_article_text(article) - tags = generate_article_tags(article, rewritten_text=rewritten) - except Exception as exc: - return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error") - merged_meta = merge_generated_tags(article.get("meta_json"), tags) - _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) - return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish") - - -@router.post("/admin/rewrite/run") -def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - limit = max(1, min(int(max_jobs), 100)) - except Exception: - limit = 10 - planned = list_articles(limit=limit, status_filter="rewrite") - processed = 0 - success = 0 - failed = 0 - for article in planned: - processed += 1 - try: - rewritten = rewrite_article_text(article) - tags = generate_article_tags(article, rewritten_text=rewritten) - merged_meta = merge_generated_tags(article.get("meta_json"), tags) - _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta) - success += 1 - except Exception: - failed += 1 - return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}") - - -@router.post("/admin/articles/{article_id}/rewrite-save") -def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - article = get_article_by_id(article_id) - if not article: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - text = (content_rewritten or "").strip() - if not text: - return RedirectResponse( - url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20darf%20nicht%20leer%20sein&type=error", - status_code=303, - ) - _upsert_article_from_existing(article, content_rewritten=text) - return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20gespeichert&type=success", status_code=303) - - -@router.post("/admin/articles/{article_id}/reopen") -def admin_reopen_article(request: Request, article_id: int): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - article = get_article_by_id(article_id) - if not article: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - _upsert_article_from_existing( - article, - status="rewrite", - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - ) - return RedirectResponse( - url=f"/admin/articles/{article_id}?msg=Artikel%20zurueck%20in%20Rewrite-Workflow%20gesetzt&type=success", - status_code=303, - ) - - -@router.post("/admin/articles/{article_id}/transition") -def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")): - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - article = get_article_by_id(article_id) - if article: - current_ui = internal_to_ui_status(article.get("status")) - target_internal = ui_to_internal_status(target_status) - target_ui = internal_to_ui_status(target_internal) - if target_ui in ALLOWED_TRANSITIONS.get(current_ui, ()): - update_article_status(article_id, target_internal, actor=user, note=note or None) - return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}") - return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") - - -_PAGE_SIZE = 50 - - -@router.get("/admin/article-list", response_class=HTMLResponse) -def admin_article_list(request: Request): - """Paginated article list with inline WP ID editing.""" - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - page = max(1, int(request.query_params.get("page", 1))) - status_filter = request.query_params.get("status_filter", "") or None - search = request.query_params.get("search", "").strip() or None - offset = (page - 1) * _PAGE_SIZE - - articles, total = list_articles_page( - limit=_PAGE_SIZE, offset=offset, - status_filter=status_filter, search=search, - ) - - # Enrich each article with thumbnail URL - for a in articles: - meta = _parse_meta_json(a.get("meta_json")) - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - sel = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - if not sel: - sel = (meta.get("extraction") or {}).get("image_selection", {}).get("primary") - a["thumb_url"] = sel - a["thumb_proxy"] = f"/admin/images/proxy?{urlencode({'url': sel})}" if sel else None - raw = (a.get("content_raw") or a.get("summary") or "").strip() - a["excerpt"] = raw[:120] + "…" if len(raw) > 120 else raw - - total_pages = max(1, (total + _PAGE_SIZE - 1) // _PAGE_SIZE) - - return templates.TemplateResponse( - request, - "admin_article_list.html", - { - "request": request, - "title": "Artikelliste", - "user": user, - "articles": articles, - "page": page, - "total_pages": total_pages, - "total": total, - "page_size": _PAGE_SIZE, - "status_filter": status_filter or "", - "search": search or "", - "flash_msg": request.query_params.get("msg", ""), - "flash_type": request.query_params.get("type", "success"), - }, - ) - - -@router.post("/admin/article-list/update") -async def admin_article_list_update(request: Request): - """Bulk update WP post IDs from the article list form.""" - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - form = await request.form() - updates: list[tuple[int, int | None]] = [] - - # Form fields: wp_ = new value, orig_ = original value - for key, new_val in form.items(): - if not key.startswith("wp_"): - continue - try: - article_id = int(key[3:]) - except ValueError: - continue - orig_val = str(form.get(f"orig_{article_id}", "")).strip() - new_val_s = str(new_val).strip() - if new_val_s == orig_val: - continue # unchanged - new_wp_id = int(new_val_s) if new_val_s else None - updates.append((article_id, new_wp_id)) - - if updates: - count = bulk_update_wp_post_ids(updates) - msg = f"{count} WP-ID(s) aktualisiert. Bitte jetzt WP-Sync ausführen um Slots & URLs zu aktualisieren." - msg_type = "success" - else: - msg = "Keine Änderungen erkannt." - msg_type = "success" - - # Preserve pagination/filter params from referer - page = form.get("page", "1") - status_filter = form.get("status_filter", "") - search = form.get("search", "") - qs: dict[str, str] = {"msg": msg, "type": msg_type, "page": page} - if status_filter: - qs["status_filter"] = status_filter - if search: - qs["search"] = search - return RedirectResponse(url=f"/admin/article-list?{urlencode(qs)}", status_code=303) - - -@router.post("/admin/wp-sync") -def admin_wp_sync(request: Request): - """Sync scheduled_publish_at and WP references in the DB from WordPress.""" - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - try: - from .wordpress import sync_db_from_wordpress - stats = sync_db_from_wordpress() - msg = ( - f"WP-Sync abgeschlossen: " - f"{stats['slot_updated']} Slots aktualisiert, " - f"{stats['slot_cleared_draft']} Slots geleert (Draft), " - f"{stats['marked_published']} als veröffentlicht markiert, " - f"{stats['wp_reference_cleared']} WP-Referenzen gelöscht (Papierkorb), " - f"{stats['already_in_sync']} bereits synchron." - ) - return RedirectResponse(url=f"/admin/schedule?msg={msg}&type=success", status_code=303) - except Exception as exc: - return RedirectResponse(url=f"/admin/schedule?msg=Sync fehlgeschlagen: {exc}&type=error", status_code=303) - - -@router.post("/admin/articles/{article_id}/retry") -def admin_retry_article(request: Request, article_id: int): - """Reset a failed article to 'new' so the pipeline picks it up on next run.""" - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - article = get_article_by_id(article_id) - if not article: - return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") - - from .scheduler import release_publish_slot - release_publish_slot(article_id) - update_article_status(article_id, "new", actor=user, note="Manuell zurückgesetzt für erneuten Pipeline-Versuch") - return _dashboard_redirect( - msg=f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet", - status_filter="close", - ) - - -@router.get("/admin/schedule", response_class=HTMLResponse) -def admin_schedule(request: Request): - """Schedule overview: all booked slots from DB and WordPress.""" - user = _admin_user(request) - if not user: - return RedirectResponse(url="/admin/login", status_code=303) - - from .scheduler import get_schedule_overview, _preferred_hours, _today_cet - from datetime import timedelta - - slots = get_schedule_overview(lookahead_days=60) - today = _today_cet() - hours = _preferred_hours() - - # Build a calendar grid: for each day in the next 60 days, show each preferred hour slot - booked: dict[tuple[str, int], dict] = {(s["date"], s["hour"]): s for s in slots} - calendar_days = [] - for offset in range(0, 61): - d = today + timedelta(days=offset) - d_str = d.isoformat() - day_slots = [] - for h in hours: - key = (d_str, h) - day_slots.append({ - "hour": h, - "booked": key in booked, - "slot": booked.get(key), - }) - calendar_days.append({ - "date": d_str, - "date_fmt": d.strftime("%d.%m.%Y"), - "weekday": ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"][d.weekday()], - "slots": day_slots, - "any_booked": any(s["booked"] for s in day_slots), - }) - - return templates.TemplateResponse( - request, - "admin_schedule.html", - { - "request": request, - "title": "Veröffentlichungsplan", - "user": user, - "slots": slots, - "calendar_days": calendar_days, - "hours": hours, - "flash_msg": request.query_params.get("msg", ""), - "flash_type": request.query_params.get("type", "success"), - }, - ) diff --git a/backend/app/auth.py b/backend/app/auth.py deleted file mode 100644 index 188397f..0000000 --- a/backend/app/auth.py +++ /dev/null @@ -1,31 +0,0 @@ -import hmac -from typing import Optional - -from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired - -from .config import get_settings - - -def _serializer() -> URLSafeTimedSerializer: - settings = get_settings() - return URLSafeTimedSerializer(settings.app_secret_key, salt="rss-news-session") - - -def verify_credentials(username: str, password: str) -> bool: - settings = get_settings() - user_ok = hmac.compare_digest(username, settings.app_admin_username) - pw_ok = hmac.compare_digest(password, settings.app_admin_password) - return user_ok and pw_ok - - -def create_session_token(username: str) -> str: - return _serializer().dumps({"username": username}) - - -def verify_session_token(token: str) -> Optional[str]: - settings = get_settings() - try: - payload = _serializer().loads(token, max_age=settings.session_max_age_seconds) - except (BadSignature, SignatureExpired): - return None - return payload.get("username") diff --git a/backend/app/config.py b/backend/app/config.py deleted file mode 100644 index 24c3902..0000000 --- a/backend/app/config.py +++ /dev/null @@ -1,65 +0,0 @@ -from functools import lru_cache -from pathlib import Path - -from dotenv import load_dotenv -from pydantic import AliasChoices, Field -from pydantic_settings import BaseSettings, SettingsConfigDict - - -class Settings(BaseSettings): - # Prefer backend-specific env file to avoid collisions with legacy root .env - model_config = SettingsConfigDict( - env_file=("backend/.env", ".env"), - env_file_encoding="utf-8", - extra="ignore", - ) - - app_env: str = "development" - app_name: str = "rss-news-backend" - app_secret_key: str = "replace-with-a-long-random-secret" - - app_admin_username: str = "admin" - app_admin_password: str = "change-me" - - session_cookie_name: str = "rss_news_session" - session_max_age_seconds: int = 28800 - - app_db_path: str = "backend/data/rss_news.db" - - wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL")) - wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME")) - wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD")) - wordpress_default_status: str = "draft" - openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY")) - openai_model: str = "gpt-4o-mini" - - # Telegram Bot - telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN")) - telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID")) - telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET")) - - # N8N API authentication - n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY")) - - # Pipeline behaviour - pipeline_relevance_auto: int = 80 # >= this: auto-process - pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject - pipeline_max_drafts_per_day: int = 2 - pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) - pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) - pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) - pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit) - - -@lru_cache(maxsize=1) -def get_settings() -> Settings: - # Prefer shared legacy env from the original rss-news workspace if present. - env_candidates = ( - Path("/Users/oliver/Documents/rss-news/.env"), - Path("backend/.env"), - Path(".env"), - ) - for env_path in env_candidates: - if env_path.exists(): - load_dotenv(env_path, override=False) - return Settings() diff --git a/backend/app/db.py b/backend/app/db.py deleted file mode 100644 index b6ef898..0000000 --- a/backend/app/db.py +++ /dev/null @@ -1,293 +0,0 @@ -import sqlite3 -from contextlib import contextmanager -from pathlib import Path -from typing import Any, Iterator - -from .config import get_settings - - -def _db_path() -> Path: - settings = get_settings() - path = Path(settings.app_db_path) - path.parent.mkdir(parents=True, exist_ok=True) - return path - - -@contextmanager -def get_conn() -> Iterator[sqlite3.Connection]: - conn = sqlite3.connect(_db_path()) - conn.row_factory = sqlite3.Row - conn.execute("PRAGMA foreign_keys=ON;") - try: - yield conn - conn.commit() - finally: - conn.close() - - -def init_db() -> None: - with get_conn() as conn: - conn.executescript( - """ - PRAGMA journal_mode=WAL; - - CREATE TABLE IF NOT EXISTS sources ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL, - base_url TEXT, - terms_url TEXT, - license_name TEXT, - risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')), - is_enabled INTEGER NOT NULL DEFAULT 0, - notes TEXT, - last_reviewed_at TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - updated_at TEXT NOT NULL DEFAULT (datetime('now')) - ); - - CREATE TABLE IF NOT EXISTS feeds ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - source_id INTEGER, - name TEXT NOT NULL, - url TEXT NOT NULL UNIQUE, - is_enabled INTEGER NOT NULL DEFAULT 1, - etag TEXT, - last_modified TEXT, - last_checked_at TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - updated_at TEXT NOT NULL DEFAULT (datetime('now')), - FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL - ); - - CREATE TABLE IF NOT EXISTS runs ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - run_type TEXT NOT NULL, - status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), - started_at TEXT NOT NULL DEFAULT (datetime('now')), - finished_at TEXT, - details TEXT - ); - - CREATE TABLE IF NOT EXISTS publish_jobs ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - article_id INTEGER NOT NULL, - status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), - attempts INTEGER NOT NULL DEFAULT 0, - max_attempts INTEGER NOT NULL DEFAULT 3, - error_message TEXT, - wp_post_id INTEGER, - wp_post_url TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - started_at TEXT, - finished_at TEXT, - FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE - ); - - CREATE TABLE IF NOT EXISTS articles ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - feed_id INTEGER, - source_article_id TEXT, - source_hash TEXT, - title TEXT NOT NULL, - source_url TEXT NOT NULL, - canonical_url TEXT, - published_at TEXT, - author TEXT, - summary TEXT, - content_raw TEXT, - content_rewritten TEXT, - image_urls_json TEXT, - press_contact TEXT, - source_name_snapshot TEXT, - source_terms_url_snapshot TEXT, - source_license_name_snapshot TEXT, - legal_checked INTEGER NOT NULL DEFAULT 0, - legal_checked_at TEXT, - legal_note TEXT, - wp_post_id INTEGER, - wp_post_url TEXT, - publish_attempts INTEGER NOT NULL DEFAULT 0, - publish_last_error TEXT, - published_to_wp_at TEXT, - word_count INTEGER DEFAULT 0, - status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), - meta_json TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - updated_at TEXT NOT NULL DEFAULT (datetime('now')), - FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, - UNIQUE(source_url) - ); - - CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); - CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); - CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id - ON articles(feed_id, source_article_id) - WHERE source_article_id IS NOT NULL; - CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash - ON articles(source_hash) - WHERE source_hash IS NOT NULL; - CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); - CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id); - CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); - CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); - CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); - - CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at - AFTER UPDATE ON sources - FOR EACH ROW - BEGIN - UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id; - END; - - CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at - AFTER UPDATE ON feeds - FOR EACH ROW - BEGIN - UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id; - END; - - CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at - AFTER UPDATE ON articles - FOR EACH ROW - BEGIN - UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; - END; - """ - ) - - # Lightweight migration for existing DBs created before source_hash was introduced. - existing_columns = { - row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() - } - migration_columns = { - "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER", - "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT", - "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT", - "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT", - "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT", - "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT", - "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT", - "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT", - "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0", - "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT", - "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT", - "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER", - "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT", - "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0", - "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT", - "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT", - } - for column, ddl in migration_columns.items(): - if column not in existing_columns: - conn.execute(ddl) - - # Migration: add 'no_image' to the status CHECK constraint if not present. - # SQLite cannot modify CHECK constraints in-place, so we recreate the table. - table_sql_row = conn.execute( - "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'" - ).fetchone() - if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""): - conn.executescript( - """ - PRAGMA foreign_keys=OFF; - - CREATE TABLE articles_v2 ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - feed_id INTEGER, - source_article_id TEXT, - source_hash TEXT, - title TEXT NOT NULL, - source_url TEXT NOT NULL, - canonical_url TEXT, - published_at TEXT, - author TEXT, - summary TEXT, - content_raw TEXT, - content_rewritten TEXT, - image_urls_json TEXT, - press_contact TEXT, - source_name_snapshot TEXT, - source_terms_url_snapshot TEXT, - source_license_name_snapshot TEXT, - legal_checked INTEGER NOT NULL DEFAULT 0, - legal_checked_at TEXT, - legal_note TEXT, - wp_post_id INTEGER, - wp_post_url TEXT, - publish_attempts INTEGER NOT NULL DEFAULT 0, - publish_last_error TEXT, - published_to_wp_at TEXT, - word_count INTEGER DEFAULT 0, - status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')), - meta_json TEXT, - relevance_score INTEGER, - scheduled_publish_at TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - updated_at TEXT NOT NULL DEFAULT (datetime('now')), - FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL, - UNIQUE(source_url) - ); - - INSERT INTO articles_v2 SELECT - id, feed_id, source_article_id, source_hash, title, source_url, - canonical_url, published_at, author, summary, content_raw, - content_rewritten, image_urls_json, press_contact, - source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, - legal_checked, legal_checked_at, legal_note, - wp_post_id, wp_post_url, publish_attempts, publish_last_error, - published_to_wp_at, word_count, status, meta_json, - relevance_score, scheduled_publish_at, created_at, updated_at - FROM articles; - - DROP TABLE articles; - ALTER TABLE articles_v2 RENAME TO articles; - - CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); - CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); - CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id - ON articles(feed_id, source_article_id) - WHERE source_article_id IS NOT NULL; - CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash - ON articles(source_hash) - WHERE source_hash IS NOT NULL; - CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); - CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); - - CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at - AFTER UPDATE ON articles - FOR EACH ROW - BEGIN - UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; - END; - - PRAGMA foreign_keys=ON; - """ - ) - - table_rows = conn.execute( - "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'" - ).fetchall() - if not table_rows: - conn.executescript( - """ - CREATE TABLE IF NOT EXISTS publish_jobs ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - article_id INTEGER NOT NULL, - status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), - attempts INTEGER NOT NULL DEFAULT 0, - max_attempts INTEGER NOT NULL DEFAULT 3, - error_message TEXT, - wp_post_id INTEGER, - wp_post_url TEXT, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - started_at TEXT, - finished_at TEXT, - FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE - ); - CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at); - """ - ) - - -def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]: - return [dict(r) for r in rows] diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py deleted file mode 100644 index 391af92..0000000 --- a/backend/app/ingestion.py +++ /dev/null @@ -1,486 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from datetime import datetime, timedelta, timezone -import hashlib -import json -import re -import time -from typing import Any -from urllib.parse import unquote, urlencode, urlparse, parse_qs -import urllib.error -import urllib.request as _urllib_req - -import feedparser - -from .repositories import ( - ArticleUpsert, - RunCreate, - create_run, - find_existing_article_for_upsert, - finish_run, - get_feed_by_id, - list_enabled_feeds, - update_feed_fetch_state, - upsert_article, -) -from .source_extraction import extract_article, extracted_article_to_meta - - -@dataclass(frozen=True) -class IngestionStats: - run_id: int - feeds_processed: int - entries_seen: int - articles_upserted: int - status: str - message: str - - -MAX_FEED_FETCH_RETRIES = 3 - - -def _normalize_article_url(url: str) -> str: - """Strip AMP and tracking query parameters from article URLs. - - Removes ?outputType=valid_amp and other AMP/tracking params so that - AMP and non-AMP versions of the same article are deduplicated. - """ - _AMP_PARAMS = {"outputtype", "amp", "outputformat"} - try: - from urllib.parse import parse_qs, urlencode - parsed = urlparse(url) - if not parsed.query: - return url - params = parse_qs(parsed.query, keep_blank_values=True) - filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS} - new_query = urlencode(filtered, doseq=True) - return parsed._replace(query=new_query).geturl() - except Exception: - return url - - -def _resolve_google_redirect(url: str) -> str: - """Extract the real article URL from Google redirect URLs. - - Google Alerts feed entries use tracking links like: - https://www.google.com/url?rct=j&sa=t&url=&ct=ga&... - - This function returns the decoded real URL if detected, otherwise the - original URL unchanged. - """ - try: - parsed = urlparse(url) - host = (parsed.hostname or "").lower() - if host not in ("www.google.com", "google.com"): - return url - if parsed.path not in ("/url", "/url/"): - return url - params = parse_qs(parsed.query, keep_blank_values=False) - real_urls = params.get("url") - if real_urls: - return unquote(real_urls[0]) - except Exception: - pass - return url - - -def _entry_published_iso(entry: dict) -> str | None: - published = entry.get("published_parsed") or entry.get("updated_parsed") - if not published: - return None - return datetime(*published[:6], tzinfo=timezone.utc).isoformat() - - -def _entry_text(entry: dict) -> tuple[str, str]: - summary = entry.get("summary", "") or "" - content = "" - if entry.get("content") and isinstance(entry.get("content"), list): - first = entry["content"][0] - content = first.get("value", "") if isinstance(first, dict) else "" - if not content: - content = summary - return summary, content - - -def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str: - source_id = entry.get("id") or entry.get("guid") or "" - published = _entry_published_iso(entry) or "" - fingerprint = f"{feed_id}|{source_id}|{link}|{title.strip()}|{summary.strip()}|{published}" - return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest() - - -def _parsed_get(parsed: object, key: str, default: object = None) -> object: - if isinstance(parsed, dict): - return parsed.get(key, default) - return getattr(parsed, key, default) - - -def _normalize_tokens(text: str) -> set[str]: - normalized = re.sub(r"[^a-z0-9]+", " ", text.lower()) - return {token for token in normalized.split() if len(token) >= 4} - - -def _probe_image_url(url: str, timeout: int = 5) -> bool: - """Return True if URL responds without a 4xx/5xx error (HEAD request). - - Returns True on network/connection errors so that a flaky server does not - cause a valid image to be silently dropped. - """ - try: - req = _urllib_req.Request( - url, - method="HEAD", - headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"}, - ) - with _urllib_req.urlopen(req, timeout=timeout) as resp: - return resp.status < 400 - except urllib.error.HTTPError as exc: - return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not - except Exception: - return True # network error → don't filter, let WP try later - - -def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: - source_host = (urlparse(source_url).hostname or "").lower() - is_presseportal = "presseportal.de" in source_host - title_tokens = _normalize_tokens(title) - blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif") - # Known placeholder/default images that should never be used as featured image - placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") - - - ranked: list[dict[str, Any]] = [] - for url in images: - # Skip inline data: URIs (e.g. base64-encoded SVG placeholders) - if url.startswith("data:"): - continue - - parsed = urlparse(url) - path = unquote(parsed.path.lower()) - full = f"{parsed.netloc.lower()}{path}" - score = 0 - reasons: list[str] = [] - - if any(token in full for token in placeholder_patterns): - score -= 300 - reasons.append("placeholder-image") - - if any(token in full for token in blocked_patterns): - score -= 150 - reasons.append("blocked-pattern") - - if is_presseportal and "/thumbnail/story_big/" in path: - score += 120 - reasons.append("presseportal-story-big") - elif is_presseportal and "/thumbnail/highlight/" in path: - score += 45 - reasons.append("presseportal-highlight") - elif is_presseportal and "/thumbnail/liste/" in path: - score -= 40 - reasons.append("presseportal-list") - - if "crop=" in (parsed.query or "").lower(): - score -= 10 - reasons.append("cropped-preview") - - path_tokens = _normalize_tokens(path.replace("-", " ")) - overlap = len(title_tokens.intersection(path_tokens)) - if overlap > 0: - score += min(30, overlap * 6) - reasons.append(f"title-match:{overlap}") - - ranked.append({"url": url, "score": score, "reasons": reasons}) - - ranked.sort(key=lambda item: item["score"], reverse=True) - return ranked - - -def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]: - # dedupe incoming order first - deduped: list[str] = [] - seen: set[str] = set() - for image in images: - if image and image not in seen: - seen.add(image) - deduped.append(image) - - ranked = _rank_image_candidates(source_url, title, deduped) - candidates = [item["url"] for item in ranked if item["score"] > -100] - - # Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx). - # Network errors are treated as OK to avoid false negatives on flaky servers. - primary = None - kept: list[str] = [] - for url in candidates[:4]: - if _probe_image_url(url): - if primary is None: - primary = url - kept.append(url) - if len(kept) >= max_keep: - break - - # Fallback: if all probes failed with network errors, use best candidate anyway - if not kept and candidates: - primary = candidates[0] - kept = candidates[:max_keep] - - return kept, primary, ranked - - -def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str: - meta: dict[str, Any] = {} - if existing_meta_json: - try: - parsed = json.loads(existing_meta_json) - if isinstance(parsed, dict): - meta = parsed - except Exception: - meta = {} - meta["attribution"] = attribution - meta["extraction"] = extraction_meta - return json.dumps(meta, ensure_ascii=False) - - -def run_ingestion(feed_id: int | None = None) -> IngestionStats: - run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) - feeds_processed = 0 - entries_seen = 0 - articles_upserted = 0 - feed_results: list[dict[str, object]] = [] - - try: - if feed_id is not None: - feed = get_feed_by_id(feed_id) - feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else [] - else: - feeds = list_enabled_feeds() - - for feed in feeds: - if not feed: - continue - feeds_processed += 1 - - parsed = None - feed_error = None - for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1): - try: - parsed = feedparser.parse( - feed["url"], - etag=feed.get("etag"), - modified=feed.get("last_modified"), - ) - break - except Exception as exc: - feed_error = str(exc) - if attempt < MAX_FEED_FETCH_RETRIES: - time.sleep(0.5 * attempt) - - if parsed is None: - feed_results.append( - { - "feed_id": int(feed["id"]), - "feed_url": feed["url"], - "status": "failed", - "error": feed_error or "unknown", - "entries_seen": 0, - "upserts": 0, - } - ) - continue - - # Persist ETag/Last-Modified for conditional requests. - parsed_etag = _parsed_get(parsed, "etag") - parsed_modified = _parsed_get(parsed, "modified") - if parsed_modified and not isinstance(parsed_modified, str): - parsed_modified = str(parsed_modified) - update_feed_fetch_state( - feed_id=int(feed["id"]), - etag=parsed_etag if isinstance(parsed_etag, str) else None, - last_modified=parsed_modified if isinstance(parsed_modified, str) else None, - ) - - feed_entries_seen = 0 - feed_upserts = 0 - from .config import get_settings as _get_settings - _max_age_days = _get_settings().pipeline_max_article_age_days - for entry in _parsed_get(parsed, "entries", []): - entries_seen += 1 - feed_entries_seen += 1 - link = entry.get("link") - if not link: - continue - - # Age filter: skip articles older than max_age_days (0 = no limit) - if _max_age_days > 0: - published_iso = _entry_published_iso(entry) - if published_iso: - try: - published_dt = datetime.fromisoformat(published_iso) - age = datetime.now(timezone.utc) - published_dt - if age > timedelta(days=_max_age_days): - continue - except Exception: - pass # can't parse date → allow through - - # Resolve Google redirect URLs (google.com/url?...&url=&...) - link = _resolve_google_redirect(link) - # Normalize AMP/tracking params (e.g. ?outputType=valid_amp) - link = _normalize_article_url(link) - - summary, content_raw = _entry_text(entry) - # Strip HTML tags from title (Google Alerts wraps matched keywords in ) - raw_title = entry.get("title") or "Ohne Titel" - title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel" - extracted = extract_article(link) - - final_title = extracted.title or title - final_author = extracted.author or entry.get("author") - final_summary = extracted.summary or (summary[:1000] if summary else None) - final_content_raw = extracted.content_text or content_raw - final_canonical = extracted.canonical_url or entry.get("link") - selected_images, primary_image, ranked_images = _select_relevant_images( - link, - final_title, - extracted.images, - max_keep=3, - ) - - source_hash = _entry_hash( - entry, - int(feed["id"]), - link, - final_title, - final_summary or "", - ) - attribution = { - "source_name": feed.get("source_name"), - "source_base_url": feed.get("source_base_url"), - "source_terms_url": feed.get("source_terms_url"), - "source_license_name": feed.get("source_license_name"), - "source_risk_level": feed.get("source_risk_level"), - "original_link": link, - "feed_name": feed.get("name"), - "feed_id": int(feed["id"]), - "imported_at": datetime.now(timezone.utc).isoformat(), - } - extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted) - extraction_meta["fetched_from"] = link - extraction_meta["image_selection"] = { - "primary": primary_image, - "selected_count": len(selected_images), - "total_candidates": len(extracted.images), - "ranked": ranked_images, - } - base_payload = ArticleUpsert( - feed_id=int(feed["id"]), - source_article_id=entry.get("id") or entry.get("guid"), - source_hash=source_hash, - title=final_title, - source_url=link, - canonical_url=final_canonical, - published_at=_entry_published_iso(entry), - author=final_author, - summary=final_summary, - content_raw=final_content_raw, - content_rewritten=None, - image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, - press_contact=extracted.press_contact, - source_name_snapshot=feed.get("source_name"), - source_terms_url_snapshot=feed.get("source_terms_url"), - source_license_name_snapshot=feed.get("source_license_name"), - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=len((final_content_raw or "").split()), - status="new", - meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), - ) - existing = find_existing_article_for_upsert(base_payload) - if existing and existing.get("status") == "error": - # Explicitly closed article: ignore on subsequent ingestion runs. - continue - - payload = base_payload - if existing: - payload = ArticleUpsert( - feed_id=base_payload.feed_id, - source_article_id=base_payload.source_article_id, - source_hash=base_payload.source_hash, - title=base_payload.title, - source_url=base_payload.source_url, - canonical_url=base_payload.canonical_url, - published_at=base_payload.published_at, - author=base_payload.author, - summary=base_payload.summary, - content_raw=base_payload.content_raw, - content_rewritten=existing.get("content_rewritten"), - image_urls_json=base_payload.image_urls_json, - press_contact=base_payload.press_contact or existing.get("press_contact"), - source_name_snapshot=base_payload.source_name_snapshot, - source_terms_url_snapshot=base_payload.source_terms_url_snapshot, - source_license_name_snapshot=base_payload.source_license_name_snapshot, - legal_checked=bool(int(existing.get("legal_checked", 0))), - legal_checked_at=existing.get("legal_checked_at"), - legal_note=existing.get("legal_note"), - wp_post_id=existing.get("wp_post_id"), - wp_post_url=existing.get("wp_post_url"), - publish_attempts=int(existing.get("publish_attempts", 0)), - publish_last_error=existing.get("publish_last_error"), - published_to_wp_at=existing.get("published_to_wp_at"), - word_count=base_payload.word_count, - status=existing.get("status") or "new", - meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta), - ) - - article_id = upsert_article(payload) - if article_id: - articles_upserted += 1 - feed_upserts += 1 - - feed_results.append( - { - "feed_id": int(feed["id"]), - "feed_url": feed["url"], - "status": "success", - "entries_seen": feed_entries_seen, - "upserts": feed_upserts, - } - ) - - finish_run( - run_id=run_id, - status="success", - details=json.dumps( - { - "feeds_processed": feeds_processed, - "entries_seen": entries_seen, - "upserts": articles_upserted, - "feeds": feed_results, - }, - ensure_ascii=False, - ), - ) - return IngestionStats( - run_id=run_id, - feeds_processed=feeds_processed, - entries_seen=entries_seen, - articles_upserted=articles_upserted, - status="success", - message="Ingestion abgeschlossen", - ) - except Exception as exc: - finish_run(run_id=run_id, status="failed", details=str(exc)) - return IngestionStats( - run_id=run_id, - feeds_processed=feeds_processed, - entries_seen=entries_seen, - articles_upserted=articles_upserted, - status="failed", - message=str(exc), - ) diff --git a/backend/app/main.py b/backend/app/main.py deleted file mode 100644 index b4776af..0000000 --- a/backend/app/main.py +++ /dev/null @@ -1,727 +0,0 @@ -import asyncio -from contextlib import asynccontextmanager -import csv -from datetime import datetime, timezone -import io -import json -import logging -from pathlib import Path - -from fastapi import Depends, FastAPI, HTTPException, Request, Response, status -from fastapi.responses import JSONResponse -from pydantic import BaseModel, Field -from fastapi.staticfiles import StaticFiles - -from .admin_ui import router as admin_router -from .auth import create_session_token, verify_credentials, verify_session_token -from .config import get_settings -from .db import init_db -from .ingestion import run_ingestion -from .pipeline import run_auto_pipeline -from .policy import evaluate_source_policy, is_source_allowed -from .publisher import enqueue_publish, run_publisher -from .relevance import article_age_days, article_relevance -from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text -from .telegram_bot import handle_update, setup_webhook -from .repositories import ( - ArticleUpsert, - FeedCreate, - RunCreate, - SourceCreate, - create_feed as repo_create_feed, - create_run, - create_source as repo_create_source, - finish_run, - get_article_by_id, - get_feed_by_id, - get_run_by_id, - get_source_by_id, - list_publish_jobs, - list_articles as repo_list_articles, - list_feeds as repo_list_feeds, - list_runs, - list_sources as repo_list_sources, - set_article_legal_review, - update_article_status, - upsert_article as repo_upsert_article, -) -from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status - -settings = get_settings() - - -@asynccontextmanager -async def app_lifespan(_: FastAPI): - init_db() - yield - - -app = FastAPI(title=settings.app_name, lifespan=app_lifespan) -app.include_router(admin_router) -app.mount( - "/admin/static", - StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")), - name="admin-static", -) - - -class LoginRequest(BaseModel): - username: str - password: str - - -class SourceCreateRequest(BaseModel): - name: str = Field(min_length=1, max_length=200) - base_url: str | None = None - terms_url: str | None = None - license_name: str | None = None - risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$") - is_enabled: bool = False - notes: str | None = None - last_reviewed_at: str | None = None - - -class FeedCreateRequest(BaseModel): - name: str = Field(min_length=1, max_length=200) - url: str = Field(min_length=5, max_length=1000) - source_id: int | None = None - is_enabled: bool = True - - -class RunCreateRequest(BaseModel): - run_type: str = Field(min_length=2, max_length=100) - status: str = Field(default="queued", pattern="^(queued|running|success|failed)$") - details: str | None = None - - -class RunFinishRequest(BaseModel): - status: str = Field(pattern="^(success|failed)$") - details: str | None = None - - -class ArticleUpsertRequest(BaseModel): - feed_id: int | None = None - source_article_id: str | None = None - source_hash: str | None = None - title: str = Field(min_length=1, max_length=500) - source_url: str = Field(min_length=5, max_length=2000) - canonical_url: str | None = None - published_at: str | None = None - author: str | None = None - summary: str | None = None - content_raw: str | None = None - content_rewritten: str | None = None - image_urls_json: str | None = None - press_contact: str | None = None - source_name_snapshot: str | None = None - source_terms_url_snapshot: str | None = None - source_license_name_snapshot: str | None = None - legal_checked: bool = False - legal_checked_at: str | None = None - legal_note: str | None = None - wp_post_id: int | None = None - wp_post_url: str | None = None - publish_attempts: int = 0 - publish_last_error: str | None = None - published_to_wp_at: str | None = None - word_count: int = 0 - status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") - meta_json: str | None = None - - -class IngestionRunRequest(BaseModel): - feed_id: int | None = None - - -class ArticleTransitionRequest(BaseModel): - target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$") - note: str | None = None - - -class ArticleReviewRequest(BaseModel): - decision: str = Field(pattern="^(approve|reject)$") - note: str | None = None - - -class ArticleLegalReviewRequest(BaseModel): - approved: bool - note: str | None = None - - -class PublisherEnqueueRequest(BaseModel): - article_id: int - max_attempts: int = 3 - - -class PublisherRunRequest(BaseModel): - max_jobs: int = 10 - - -ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = { - "new": {"rewrite", "error"}, - "rewrite": {"approved", "error"}, - "approved": {"published", "error"}, - "published": {"error"}, - "error": {"rewrite"}, -} - - -def require_auth(request: Request) -> str: - token = request.cookies.get(settings.session_cookie_name) - if not token: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet") - - username = verify_session_token(token) - if not username: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen") - - return username - - -@app.get("/health") -def health() -> dict: - return {"status": "ok", "service": settings.app_name, "db_path": settings.app_db_path} - - -@app.post("/auth/login") -def login(payload: LoginRequest, response: Response) -> dict: - if not verify_credentials(payload.username, payload.password): - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten") - - token = create_session_token(payload.username) - response.set_cookie( - key=settings.session_cookie_name, - value=token, - max_age=settings.session_max_age_seconds, - httponly=True, - secure=False, - samesite="lax", - ) - return {"ok": True, "username": payload.username} - - -@app.post("/auth/logout") -def logout(response: Response) -> dict: - response.delete_cookie(settings.session_cookie_name) - return {"ok": True} - - -@app.get("/auth/me") -def me(username: str = Depends(require_auth)) -> dict: - return {"authenticated": True, "username": username} - - -@app.get("/api/protected") -def protected(username: str = Depends(require_auth)) -> dict: - return {"ok": True, "message": "Protected endpoint", "username": username} - - -@app.get("/api/pipeline/status") -def pipeline_status(username: str = Depends(require_auth)) -> dict: - feeds_total = len(repo_list_feeds()) - sources_total = len(repo_list_sources()) - articles_total = len(repo_list_articles(limit=500)) - return { - "ok": True, - "stage": "skeleton+db", - "requested_by": username, - "counts": { - "sources": sources_total, - "feeds": feeds_total, - "articles": articles_total, - }, - } - - -@app.get("/api/sources") -def list_sources(username: str = Depends(require_auth)) -> dict: - return {"ok": True, "items": repo_list_sources(), "requested_by": username} - - -@app.get("/api/sources/{source_id}/policy-check") -def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict: - source = get_source_by_id(source_id) - if not source: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden") - issues = evaluate_source_policy(source) - return { - "ok": True, - "source_id": source_id, - "allowed": is_source_allowed(source), - "issues": issues, - "requested_by": username, - } - - -@app.post("/api/sources") -def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict: - source_id = repo_create_source( - SourceCreate( - name=payload.name, - base_url=payload.base_url, - terms_url=payload.terms_url, - license_name=payload.license_name, - risk_level=payload.risk_level, - is_enabled=payload.is_enabled, - notes=payload.notes, - last_reviewed_at=payload.last_reviewed_at, - ) - ) - return {"ok": True, "id": source_id, "requested_by": username} - - -@app.get("/api/feeds") -def list_feeds(username: str = Depends(require_auth)) -> dict: - return {"ok": True, "items": repo_list_feeds(), "requested_by": username} - - -@app.get("/api/feeds/{feed_id}/policy-check") -def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict: - feed = get_feed_by_id(feed_id) - if not feed: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden") - - source_snapshot = { - "id": feed.get("source_id"), - "name": feed.get("source_name"), - "base_url": feed.get("source_base_url"), - "terms_url": feed.get("source_terms_url"), - "license_name": feed.get("source_license_name"), - "risk_level": feed.get("source_risk_level"), - "last_reviewed_at": feed.get("source_last_reviewed_at"), - "is_enabled": feed.get("source_is_enabled"), - } - issues = evaluate_source_policy(source_snapshot) - return { - "ok": True, - "feed_id": feed_id, - "allowed": len(issues) == 0, - "issues": issues, - "requested_by": username, - } - - -@app.post("/api/feeds") -def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict: - try: - feed_id = repo_create_feed( - FeedCreate( - name=payload.name, - url=payload.url, - source_id=payload.source_id, - is_enabled=payload.is_enabled, - ) - ) - except Exception as exc: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc - - return {"ok": True, "id": feed_id, "requested_by": username} - - -@app.get("/api/runs") -def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict: - return {"ok": True, "items": list_runs(limit=limit), "requested_by": username} - - -@app.get("/api/runs/{run_id}") -def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict: - run = get_run_by_id(run_id) - if not run: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden") - return {"ok": True, "item": run, "requested_by": username} - - -@app.post("/api/runs") -def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict: - run_id = create_run(RunCreate(run_type=payload.run_type, status=payload.status, details=payload.details)) - return {"ok": True, "id": run_id, "requested_by": username} - - -@app.post("/api/runs/{run_id}/finish") -def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict: - finish_run(run_id=run_id, status=payload.status, details=payload.details) - return {"ok": True, "id": run_id, "requested_by": username} - - -@app.get("/api/articles") -def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict: - internal_filter = ui_to_internal_status(status_filter) if status_filter else None - items = repo_list_articles(limit=limit, status_filter=internal_filter) - for item in items: - item["status_ui"] = internal_to_ui_status(item.get("status")) - return {"ok": True, "items": items, "requested_by": username} - - -@app.get("/api/articles/export") -def api_export_articles( - format: str = "json", - status_filter: str | None = None, - username: str = Depends(require_auth), -): - internal_filter = ui_to_internal_status(status_filter) if status_filter else None - articles = repo_list_articles(limit=500, status_filter=internal_filter) - rows = [] - for article in articles: - meta: dict = {} - if article.get("meta_json"): - try: - parsed = json.loads(article["meta_json"]) - if isinstance(parsed, dict): - meta = parsed - except Exception: - meta = {} - image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {} - selected_image_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None - - days_old = article_age_days(article.get("published_at")) - rows.append( - { - "id": article.get("id"), - "title": article.get("title"), - "status": article.get("status"), - "published_at": article.get("published_at"), - "days_old": days_old, - "relevance": article_relevance(article.get("published_at")), - "author": article.get("author"), - "source_url": article.get("source_url"), - "canonical_url": article.get("canonical_url"), - "source_name_snapshot": article.get("source_name_snapshot"), - "source_license_name_snapshot": article.get("source_license_name_snapshot"), - "source_terms_url_snapshot": article.get("source_terms_url_snapshot"), - "press_contact": article.get("press_contact"), - "image_urls_json": article.get("image_urls_json"), - "selected_image_url": selected_image_url, - "legal_checked": bool(int(article.get("legal_checked", 0))), - "legal_checked_at": article.get("legal_checked_at"), - "legal_note": article.get("legal_note"), - } - ) - - generated_at = datetime.now(timezone.utc).isoformat() - if format == "csv": - out = io.StringIO() - fieldnames = [ - "id", - "title", - "status", - "published_at", - "days_old", - "relevance", - "author", - "source_url", - "canonical_url", - "source_name_snapshot", - "source_license_name_snapshot", - "source_terms_url_snapshot", - "press_contact", - "image_urls_json", - "selected_image_url", - "legal_checked", - "legal_checked_at", - "legal_note", - ] - writer = csv.DictWriter(out, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - return Response( - content=out.getvalue(), - media_type="text/csv; charset=utf-8", - headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'}, - ) - - return JSONResponse( - { - "ok": True, - "count": len(rows), - "generated_at": generated_at, - "status_filter": status_filter, - "items": rows, - "requested_by": username, - } - ) - - -@app.get("/api/articles/{article_id}") -def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - article["status_ui"] = internal_to_ui_status(article.get("status")) - return {"ok": True, "item": article, "requested_by": username} - - -@app.post("/api/articles/upsert") -def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict: - article_id = repo_upsert_article( - ArticleUpsert( - feed_id=payload.feed_id, - source_article_id=payload.source_article_id, - source_hash=payload.source_hash, - title=payload.title, - source_url=payload.source_url, - canonical_url=payload.canonical_url, - published_at=payload.published_at, - author=payload.author, - summary=payload.summary, - content_raw=payload.content_raw, - content_rewritten=payload.content_rewritten, - image_urls_json=payload.image_urls_json, - press_contact=payload.press_contact, - source_name_snapshot=payload.source_name_snapshot, - source_terms_url_snapshot=payload.source_terms_url_snapshot, - source_license_name_snapshot=payload.source_license_name_snapshot, - legal_checked=payload.legal_checked, - legal_checked_at=payload.legal_checked_at, - legal_note=payload.legal_note, - wp_post_id=payload.wp_post_id, - wp_post_url=payload.wp_post_url, - publish_attempts=payload.publish_attempts, - publish_last_error=payload.publish_last_error, - published_to_wp_at=payload.published_to_wp_at, - word_count=payload.word_count, - status=ui_to_internal_status(payload.status), - meta_json=payload.meta_json, - ) - ) - return {"ok": True, "id": article_id, "requested_by": username} - - -@app.post("/api/articles/{article_id}/transition") -def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - - current_status = article.get("status") - current_ui = internal_to_ui_status(current_status) - target_internal = ui_to_internal_status(payload.target_status) - target_ui = internal_to_ui_status(target_internal) - allowed_targets = ALLOWED_UI_TRANSITIONS.get(current_ui, set()) - if target_ui not in allowed_targets: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Ungueltiger Statuswechsel: {current_ui} -> {target_ui}", - ) - - updated = update_article_status(article_id, target_internal, actor=username, note=payload.note) - if not updated: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - return {"ok": True, "id": article_id, "from_status": current_ui, "to_status": target_ui} - - -@app.post("/api/articles/{article_id}/rewrite-run") -def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'") - - rewritten = rewrite_article_text(article) - tags: list[str] = [] - try: - tags = generate_article_tags(article, rewritten_text=rewritten) - except Exception: - tags = [] - merged_meta = merge_generated_tags(article.get("meta_json"), tags) - # upsert via status update + existing fields by lightweight path: - repo_upsert_article( - ArticleUpsert( - feed_id=article.get("feed_id"), - source_article_id=article.get("source_article_id"), - source_hash=article.get("source_hash"), - title=article.get("title"), - source_url=article.get("source_url"), - canonical_url=article.get("canonical_url"), - published_at=article.get("published_at"), - author=article.get("author"), - summary=article.get("summary"), - content_raw=article.get("content_raw"), - content_rewritten=rewritten, - image_urls_json=article.get("image_urls_json"), - press_contact=article.get("press_contact"), - source_name_snapshot=article.get("source_name_snapshot"), - source_terms_url_snapshot=article.get("source_terms_url_snapshot"), - source_license_name_snapshot=article.get("source_license_name_snapshot"), - legal_checked=bool(int(article.get("legal_checked", 0))), - legal_checked_at=article.get("legal_checked_at"), - legal_note=article.get("legal_note"), - wp_post_id=article.get("wp_post_id"), - wp_post_url=article.get("wp_post_url"), - publish_attempts=int(article.get("publish_attempts", 0)), - publish_last_error=article.get("publish_last_error"), - published_to_wp_at=article.get("published_to_wp_at"), - word_count=len(rewritten.split()), - status="approved", - meta_json=merged_meta, - ) - ) - return {"ok": True, "id": article_id, "status": "publish", "tags": tags} - - -@app.post("/api/articles/{article_id}/legal-review") -def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - - updated = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username) - if not updated: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - return { - "ok": True, - "id": article_id, - "legal_checked": payload.approved, - } - - -@app.get("/api/publisher/jobs") -def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict: - return {"ok": True, "items": list_publish_jobs(limit=limit), "requested_by": username} - - -@app.post("/api/publisher/enqueue") -def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict: - article = get_article_by_id(payload.article_id) - if not article: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden") - job_id = enqueue_publish(article_id=payload.article_id, max_attempts=payload.max_attempts) - return {"ok": True, "job_id": job_id, "article_id": payload.article_id, "requested_by": username} - - -@app.post("/api/publisher/run") -def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict: - stats = run_publisher(max_jobs=payload.max_jobs) - return { - "ok": True, - "requested_by": username, - "stats": { - "processed": stats.processed, - "success": stats.success, - "failed": stats.failed, - "requeued": stats.requeued, - }, - } - - -@app.post("/api/articles/{article_id}/review") -def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict: - raise HTTPException(status_code=status.HTTP_410_GONE, detail="Review-Endpoint ersetzt durch Rewrite-Workflow") - - -@app.post("/api/ingestion/run") -def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict: - stats = run_ingestion(feed_id=payload.feed_id) - return { - "ok": stats.status == "success", - "run_id": stats.run_id, - "status": stats.status, - "message": stats.message, - "stats": { - "feeds_processed": stats.feeds_processed, - "entries_seen": stats.entries_seen, - "articles_upserted": stats.articles_upserted, - }, - "requested_by": username, - } - - -# --------------------------------------------------------------------------- -# N8N Automation endpoint (API-Key auth, no session cookie required) -# --------------------------------------------------------------------------- - -def _require_api_key(request: Request) -> None: - api_key = request.headers.get("X-API-Key") or request.query_params.get("api_key") - expected = settings.n8n_api_key - if not expected: - raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert") - if api_key != expected: - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key") - - -_pipeline_lock = asyncio.Lock() - - -@app.post("/api/n8n/pipeline") -async def api_n8n_pipeline(request: Request) -> dict: - """Trigger the full auto pipeline in background. Returns immediately. - Called by N8N (2x/day or on demand). Results arrive via Telegram.""" - _require_api_key(request) - - if _pipeline_lock.locked(): - logging.getLogger(__name__).warning("Pipeline bereits aktiv – Trigger ignoriert") - return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"} - - async def _run(): - async with _pipeline_lock: - loop = asyncio.get_event_loop() - try: - await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n")) - except Exception as exc: - logging.getLogger(__name__).error("Background pipeline error: %s", exc) - - asyncio.create_task(_run()) - return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"} - - -@app.post("/api/n8n/ingest") -def api_n8n_ingest(request: Request) -> dict: - """Run only the ingestion step (no rewrite/publish). For N8N.""" - _require_api_key(request) - stats = run_ingestion() - return { - "ok": stats.status == "success", - "stats": { - "feeds_processed": stats.feeds_processed, - "entries_seen": stats.entries_seen, - "articles_upserted": stats.articles_upserted, - }, - } - - -# --------------------------------------------------------------------------- -# Telegram Webhook -# --------------------------------------------------------------------------- - -@app.post("/telegram/webhook") -async def telegram_webhook(request: Request) -> dict: - """Receive updates from Telegram Bot API. - - Returns 200 immediately so Telegram never retries the same update. - Actual processing runs in a background task. - """ - import asyncio - import logging - - # Verify secret token - secret = settings.telegram_webhook_secret - if secret: - incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "") - if incoming != secret: - raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret") - - body = await request.body() - try: - update = json.loads(body.decode("utf-8")) - except Exception: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") - - async def _process(): - loop = asyncio.get_event_loop() - try: - await loop.run_in_executor(None, lambda: handle_update(update)) - except Exception as exc: - logging.getLogger(__name__).error("Telegram update handler error: %s", exc) - - asyncio.create_task(_process()) - return {"ok": True} - - -@app.post("/api/telegram/setup-webhook") -def api_setup_telegram_webhook(request: Request) -> dict: - """Register the Telegram webhook URL. Call once after deployment.""" - username = require_auth(request) - base_url = str(request.base_url).rstrip("/") - webhook_url = f"{base_url}/telegram/webhook" - result = setup_webhook(webhook_url) - return {"ok": True, "webhook_url": webhook_url, "telegram_response": result, "requested_by": username} diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py deleted file mode 100644 index 93a251b..0000000 --- a/backend/app/pipeline.py +++ /dev/null @@ -1,516 +0,0 @@ -"""Autonomous RSS-News pipeline. - -Full automated flow: -1. Run RSS ingestion -2. For each new article: - - Auto-select primary image - - Score relevance via GPT - - < warn threshold: reject (error status) → Telegram rejected summary - - warn..auto threshold: Telegram warning with override button - - >= auto threshold: rewrite → create WP draft → Telegram notification -3. Send pipeline summary to Telegram -""" -from __future__ import annotations - -import json -import logging -import time -from dataclasses import dataclass, field -from datetime import datetime, timezone -from typing import Any - -from .config import get_settings -from .ingestion import run_ingestion -from .publisher import enqueue_publish, run_publisher -from .repositories import ( - ArticleUpsert, - get_article_by_id, - list_articles, - set_article_image_decision, - update_article_status, - upsert_article as repo_upsert_article, -) -from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance -from .scheduler import reserve_publish_slot -from .wordpress import publish_article_draft, selected_image_exists - -logger = logging.getLogger(__name__) - - -@dataclass -class PipelineStats: - ingested: int = 0 - processed: int = 0 - drafts_created: int = 0 - rejected: int = 0 - quality_gate_rejected: int = 0 - warnings: int = 0 - errors: int = 0 - no_image: int = 0 - rejected_articles: list[dict[str, Any]] = field(default_factory=list) - - -# --------------------------------------------------------------------------- -# Internal helpers -# --------------------------------------------------------------------------- - -def _auto_select_image(article: dict[str, Any]) -> bool: - """Auto-select the primary image from ingestion metadata if not already selected.""" - meta_json = article.get("meta_json") or "{}" - try: - meta = json.loads(meta_json) - except Exception: - return False - - # Already selected? - image_review = meta.get("image_review") or {} - if isinstance(image_review, dict) and image_review.get("selected_url"): - return True - - # Try to get primary from ingestion extraction - extraction = meta.get("extraction") or {} - image_selection = extraction.get("image_selection") or {} - primary = image_selection.get("primary") - - if not primary: - # Fallback: use first URL from image_urls_json - image_urls_json = article.get("image_urls_json") or "[]" - try: - urls = json.loads(image_urls_json) - if urls: - primary = urls[0] - except Exception: - pass - - if primary: - set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline") - return True - return False - - -def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None: - """Persist relevance score and reason in article meta_json and relevance_score column.""" - article = get_article_by_id(article_id) - if not article: - return - try: - meta = json.loads(article.get("meta_json") or "{}") - except Exception: - meta = {} - meta["relevance"] = relevance - new_meta = json.dumps(meta, ensure_ascii=False) - from .db import get_conn - with get_conn() as conn: - conn.execute( - "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?", - (new_meta, relevance.get("score", 0), article_id), - ) - - -def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: - """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url).""" - article_id = int(article["id"]) - settings = get_settings() - - # ── Quality gate 1: raw content length ────────────────────────────────── - import re as _re - raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "") - raw_words = len(raw_text.split()) - if raw_words < settings.pipeline_min_words_raw: - note = ( - f"Zu wenig Rohinhalt: {raw_words} Wörter " - f"(Minimum: {settings.pipeline_min_words_raw})" - ) - logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) - update_article_status(article_id, "error", actor="pipeline", note=note) - raise ValueError(note) - - # Rewrite - logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words) - rewritten = rewrite_article_text(article) - - # ── Quality gate 2: rewritten content length ───────────────────────────── - rewritten_words = len(rewritten.split()) - if rewritten_words < settings.pipeline_min_words_rewritten: - note = ( - f"Rewrite zu kurz: {rewritten_words} Wörter " - f"(Minimum: {settings.pipeline_min_words_rewritten})" - ) - logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) - update_article_status(article_id, "error", actor="pipeline", note=note) - raise ValueError(note) - logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split())) - tags: list[str] = [] - try: - tags = generate_article_tags(article, rewritten_text=rewritten) - except Exception: - pass - merged_meta = merge_generated_tags(article.get("meta_json"), tags) - - # Save rewritten content + approved status - repo_upsert_article( - ArticleUpsert( - feed_id=article.get("feed_id"), - source_article_id=article.get("source_article_id"), - source_hash=article.get("source_hash"), - title=article.get("title", ""), - source_url=article.get("source_url", ""), - canonical_url=article.get("canonical_url"), - published_at=article.get("published_at"), - author=article.get("author"), - summary=article.get("summary"), - content_raw=article.get("content_raw"), - content_rewritten=rewritten, - image_urls_json=article.get("image_urls_json"), - press_contact=article.get("press_contact"), - source_name_snapshot=article.get("source_name_snapshot"), - source_terms_url_snapshot=article.get("source_terms_url_snapshot"), - source_license_name_snapshot=article.get("source_license_name_snapshot"), - legal_checked=bool(int(article.get("legal_checked", 0))), - legal_checked_at=article.get("legal_checked_at"), - legal_note=article.get("legal_note"), - wp_post_id=article.get("wp_post_id"), - wp_post_url=article.get("wp_post_url"), - publish_attempts=int(article.get("publish_attempts", 0)), - publish_last_error=article.get("publish_last_error"), - published_to_wp_at=article.get("published_to_wp_at"), - word_count=len(rewritten.split()), - status="approved", - meta_json=merged_meta, - ) - ) - - # Reload after save to get updated meta_json - fresh = get_article_by_id(article_id) - if not fresh: - raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden") - - # Ensure a publish slot is reserved — reserve one now if not yet set - if not fresh.get("scheduled_publish_at"): - from .scheduler import reserve_publish_slot - logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id) - reserve_publish_slot(article_id) - fresh = get_article_by_id(article_id) - if not fresh: - raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden") - - # Create WP draft - logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at")) - wp_post_id, wp_post_url = publish_article_draft(fresh) - logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id) - - # Update WP info in DB - from .repositories import mark_article_publish_result - mark_article_publish_result( - article_id, - wp_post_id=wp_post_id, - wp_post_url=wp_post_url, - error=None, - increment_attempts=True, - set_published_status=False, - ) - - return wp_post_id, wp_post_url - - -# --------------------------------------------------------------------------- -# Public pipeline functions -# --------------------------------------------------------------------------- - -def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]: - """Run the full automated pipeline and return stats dict.""" - from . import telegram_bot as tg - - settings = get_settings() - stats = PipelineStats() - - tg.notify_pipeline_started(trigger) - - # Step 1: Ingestion - try: - ingest_result = run_ingestion() - stats.ingested = ingest_result.articles_upserted - except Exception as exc: - tg.notify_error(f"Ingestion fehlgeschlagen: {exc}") - logger.error("Ingestion error: %s", exc) - stats.errors += 1 - - # Step 2: Process new articles - new_articles = list_articles(limit=100, status_filter="new") - - for article in new_articles: - article_id = int(article["id"]) - try: - _process_article(article, stats, settings) - except Exception as exc: - logger.error("Fehler bei Artikel #%d: %s", article_id, exc) - tg.notify_error(f"Fehler bei Artikel #{article_id} ({article.get('title','?')[:50]}): {exc}") - stats.errors += 1 - # Rate limiting between OpenAI calls - time.sleep(1) - - # Step 3: Send rejected summary if any - if stats.rejected_articles: - try: - tg.notify_rejected_summary(stats.rejected_articles) - except Exception as exc: - logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc) - - # Step 4: Summary - result = { - "ingested": stats.ingested, - "processed": stats.processed, - "drafts_created": stats.drafts_created, - "rejected": stats.rejected, - "quality_gate_rejected": stats.quality_gate_rejected, - "no_image": stats.no_image, - "warnings": stats.warnings, - "errors": stats.errors, - } - tg.notify_pipeline_done(result) - return result - - -def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None: - """Process a single new article through the pipeline.""" - from . import telegram_bot as tg - - article_id = int(article["id"]) - - # Auto-select image - _auto_select_image(article) - - # Reload to get updated image_review - article = get_article_by_id(article_id) or article - - # Exclude articles without a usable image - try: - meta = json.loads(article.get("meta_json") or "{}") - except Exception: - meta = {} - has_image = bool((meta.get("image_review") or {}).get("selected_url")) - if not has_image: - update_article_status( - article_id, - "no_image", - actor="pipeline", - note="Kein Bild vorhanden – Artikel ausgeschlossen", - ) - stats.no_image += 1 - logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id) - try: - tg.send_message( - f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n" - f"📰 {(article.get('title') or '')[:80]}" - ) - except Exception: - pass - return - - # Score relevance - try: - relevance = score_article_relevance(article) - except Exception as exc: - logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc) - relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []} - - score = relevance.get("score", 0) - reason = relevance.get("reason", "") - _store_relevance(article_id, relevance) - - stats.processed += 1 - - if score < settings.pipeline_relevance_warn: - # Reject - update_article_status( - article_id, - "error", - actor="pipeline", - note=f"Abgelehnt: Score {score}/100 — {reason}", - ) - stats.rejected += 1 - # Reload for summary (now has relevance in meta) - updated = get_article_by_id(article_id) - if updated: - stats.rejected_articles.append(updated) - - elif score < settings.pipeline_relevance_auto: - # Warning zone: set status to "review" so repeated /run calls don't re-warn - update_article_status( - article_id, - "review", - actor="pipeline", - note=f"Niedrige Relevanz: Score {score}/100 — {reason}", - ) - stats.warnings += 1 - try: - tg.notify_relevance_warning(article, score, reason) - except Exception as exc: - logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc) - - else: - # Auto-process: rewrite + WP draft - try: - # Reserve publish slot FIRST so it's available when WP draft is created - slot = reserve_publish_slot(article_id) - - # Reload article to get updated image_review + scheduled_publish_at - fresh = get_article_by_id(article_id) - if not fresh: - return - wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) - stats.drafts_created += 1 - - # Reload for notification - final = get_article_by_id(article_id) - if final: - try: - tg.notify_new_draft(final, score=score, suggested_publish_at=slot) - except Exception as exc: - logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc) - - except ValueError as exc: - # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft - # Release the reserved slot so it's available for the next article - from .scheduler import release_publish_slot - release_publish_slot(article_id) - # Clean up any stale WP draft from a previous pipeline run - stale = get_article_by_id(article_id) - if stale and stale.get("wp_post_id"): - try: - from .wordpress import delete_wp_post - delete_wp_post(int(stale["wp_post_id"])) - logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"]) - except Exception as del_exc: - logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc) - stats.quality_gate_rejected += 1 - logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) - # Individual Telegram notification for quality gate rejection - try: - title = (article.get("title") or "Ohne Titel")[:80] - tg.send_message( - f"✂️ Qualitätsprüfung nicht bestanden\n" - f"📰 {title}\n" - f"💯 Score: {score}/100\n" - f"⚠️ {exc}" - ) - except Exception as tg_exc: - logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc) - - except Exception as exc: - logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) - update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}") - # Release reserved slot so it's not permanently blocked by a failed article - from .scheduler import release_publish_slot - release_publish_slot(article_id) - raise - - -# --------------------------------------------------------------------------- -# Callback actions (called from telegram_bot._handle_callback) -# --------------------------------------------------------------------------- - -def rewrite_and_update_draft(article_id: int) -> None: - """Rewrite article and update the existing WP draft.""" - article = get_article_by_id(article_id) - if not article: - raise RuntimeError(f"Artikel #{article_id} nicht gefunden") - _auto_select_image(article) - fresh = get_article_by_id(article_id) - _do_rewrite_and_draft(fresh) - - -def discard_article(article_id: int) -> None: - """Discard a draft: delete WP post if exists, set article to error.""" - article = get_article_by_id(article_id) - if not article: - return - - wp_post_id = article.get("wp_post_id") - if wp_post_id: - try: - from .wordpress import delete_wp_post - delete_wp_post(int(wp_post_id)) - except Exception as exc: - logger.warning("WP Post #%d konnte nicht gelöscht werden: %s", wp_post_id, exc) - - update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen") - - -def override_rejected_article(article_id: int) -> None: - """Force-process a previously rejected article.""" - from . import telegram_bot as tg - - article = get_article_by_id(article_id) - if not article: - raise RuntimeError(f"Artikel #{article_id} nicht gefunden") - - # Reset to new so processing is allowed - update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram") - - # Reload - fresh = get_article_by_id(article_id) - if not fresh: - return - - _auto_select_image(fresh) - fresh = get_article_by_id(article_id) - - # Get existing score or re-score - try: - meta = json.loads(fresh.get("meta_json") or "{}") - score = int((meta.get("relevance") or {}).get("score", 0)) - except Exception: - score = 0 - - # Reserve publish slot FIRST so it's in the DB when WP draft is created - slot = reserve_publish_slot(article_id) - fresh = get_article_by_id(article_id) - - wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh) - - final = get_article_by_id(article_id) - if final: - tg.notify_new_draft(final, score=score, suggested_publish_at=slot) - - -# --------------------------------------------------------------------------- -# Status helpers (used by /status command) -# --------------------------------------------------------------------------- - -def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]: - """Return articles rejected in the last N days.""" - from .db import get_conn - from .db import rows_to_dicts - cutoff = datetime.now(timezone.utc).isoformat()[:10] - with get_conn() as conn: - rows = conn.execute( - """ - SELECT id, title, meta_json, source_url, created_at - FROM articles - WHERE status IN ('error', 'review') - AND json_extract(meta_json, '$.relevance.score') IS NOT NULL - AND date(updated_at) >= date('now', ?) - ORDER BY updated_at DESC - LIMIT 20 - """, - (f"-{days} days",), - ).fetchall() - return rows_to_dicts(rows) - - -def get_pipeline_status_text() -> str: - """Return a text summary of current pipeline state.""" - from .repositories import list_articles as _list - new_count = len(_list(limit=500, status_filter="new")) - approved_count = len(_list(limit=500, status_filter="approved")) - published_count = len(_list(limit=500, status_filter="published")) - error_count = len(_list(limit=500, status_filter="error")) - - return ( - f"📊 Pipeline-Status\n" - f"🆕 Neu / wartend: {new_count}\n" - f"✅ Draft / freigegeben: {approved_count}\n" - f"📢 Veröffentlicht: {published_count}\n" - f"🚫 Fehler / abgelehnt: {error_count}" - ) diff --git a/backend/app/policy.py b/backend/app/policy.py deleted file mode 100644 index af6e65c..0000000 --- a/backend/app/policy.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -from typing import Any - - -def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]: - issues: list[str] = [] - if not source: - issues.append("Keine Quelle zugeordnet") - return issues - - risk_level = (source.get("risk_level") or "").strip().lower() - if risk_level != "green": - issues.append(f"Quelle nicht freigegeben (risk_level={risk_level or 'unset'})") - - terms_url = (source.get("terms_url") or "").strip() - if not terms_url: - issues.append("terms_url fehlt") - - license_name = (source.get("license_name") or "").strip() - if not license_name: - issues.append("license_name fehlt") - - last_reviewed_at = (source.get("last_reviewed_at") or "").strip() - if not last_reviewed_at: - issues.append("last_reviewed_at fehlt") - - if int(source.get("is_enabled", 0) or 0) != 1: - issues.append("Quelle ist deaktiviert") - - return issues - - -def is_source_allowed(source: dict[str, Any] | None) -> bool: - return len(evaluate_source_policy(source)) == 0 diff --git a/backend/app/publisher.py b/backend/app/publisher.py deleted file mode 100644 index e27bd1b..0000000 --- a/backend/app/publisher.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass - -from .repositories import ( - claim_next_publish_job, - complete_publish_job, - create_publish_job, - fail_publish_job, - get_article_by_id, - mark_article_publish_result, - PublishJobCreate, -) -from .wordpress import publish_article_draft, selected_image_exists - - -@dataclass(frozen=True) -class PublisherStats: - processed: int - success: int - failed: int - requeued: int - - -def enqueue_publish(article_id: int, max_attempts: int = 3) -> int: - return create_publish_job(PublishJobCreate(article_id=article_id, max_attempts=max_attempts)) - - -def _can_publish(article: dict) -> tuple[bool, str | None]: - if article.get("status") not in {"approved", "published"}: - return False, "Artikelstatus muss 'publish' sein" - if not selected_image_exists(article): - return False, "Hauptbild nicht gesetzt" - return True, None - - -def run_publisher(max_jobs: int = 10) -> PublisherStats: - processed = 0 - success = 0 - failed = 0 - requeued = 0 - - for _ in range(max(1, max_jobs)): - job = claim_next_publish_job() - if not job: - break - processed += 1 - job_id = int(job["id"]) - article_id = int(job["article_id"]) - - article = get_article_by_id(article_id) - if not article: - fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False) - failed += 1 - continue - - allowed, reason = _can_publish(article) - if not allowed: - fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False) - mark_article_publish_result( - article_id, - wp_post_id=article.get("wp_post_id"), - wp_post_url=article.get("wp_post_url"), - error=reason or "blocked", - increment_attempts=True, - set_published_status=False, - ) - failed += 1 - continue - - try: - wp_post_id, wp_post_url = publish_article_draft(article) - complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url) - mark_article_publish_result( - article_id, - wp_post_id=wp_post_id, - wp_post_url=wp_post_url, - error=None, - increment_attempts=True, - set_published_status=True, - ) - success += 1 - except Exception as exc: - attempts = int(job.get("attempts", 1)) - max_attempts = int(job.get("max_attempts", 3)) - should_requeue = attempts < max_attempts - fail_publish_job(job_id, str(exc), requeue=should_requeue) - mark_article_publish_result( - article_id, - wp_post_id=article.get("wp_post_id"), - wp_post_url=article.get("wp_post_url"), - error=str(exc), - increment_attempts=True, - set_published_status=False, - ) - if should_requeue: - requeued += 1 - else: - failed += 1 - - return PublisherStats(processed=processed, success=success, failed=failed, requeued=requeued) diff --git a/backend/app/relevance.py b/backend/app/relevance.py deleted file mode 100644 index 8f69693..0000000 --- a/backend/app/relevance.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone - - -def _parse_iso_datetime(value: str | None) -> datetime | None: - if not value: - return None - raw = value.strip() - if not raw: - return None - if raw.endswith("Z"): - raw = raw[:-1] + "+00:00" - try: - parsed = datetime.fromisoformat(raw) - except ValueError: - return None - if parsed.tzinfo is None: - parsed = parsed.replace(tzinfo=timezone.utc) - return parsed - - -def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None: - published = _parse_iso_datetime(published_at) - if not published: - return None - ref = now or datetime.now(timezone.utc) - delta = ref - published - if delta.total_seconds() < 0: - return 0 - return delta.days - - -def article_relevance(published_at: str | None, now: datetime | None = None) -> str: - days = article_age_days(published_at, now=now) - if days is None: - return "unbekannt" - if days <= 2: - return "hoch" - if days <= 7: - return "mittel" - if days <= 30: - return "niedrig" - return "alt" diff --git a/backend/app/repositories.py b/backend/app/repositories.py deleted file mode 100644 index cf38055..0000000 --- a/backend/app/repositories.py +++ /dev/null @@ -1,855 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -import json -from datetime import datetime, timezone -from typing import Any - -from .db import get_conn, rows_to_dicts - - -@dataclass(frozen=True) -class SourceCreate: - name: str - base_url: str | None - terms_url: str | None - license_name: str | None - risk_level: str - is_enabled: bool - notes: str | None - last_reviewed_at: str | None - - -@dataclass(frozen=True) -class FeedCreate: - name: str - url: str - source_id: int | None - is_enabled: bool - - -@dataclass(frozen=True) -class SourceUpdate: - name: str - base_url: str | None - terms_url: str | None - license_name: str | None - risk_level: str - is_enabled: bool - notes: str | None - last_reviewed_at: str | None - - -@dataclass(frozen=True) -class FeedUpdate: - name: str - url: str - source_id: int | None - is_enabled: bool - - -@dataclass(frozen=True) -class RunCreate: - run_type: str - status: str - details: str | None = None - - -@dataclass(frozen=True) -class ArticleUpsert: - feed_id: int | None - source_article_id: str | None - source_hash: str | None - title: str - source_url: str - canonical_url: str | None - published_at: str | None - author: str | None - summary: str | None - content_raw: str | None - content_rewritten: str | None - image_urls_json: str | None - press_contact: str | None - source_name_snapshot: str | None - source_terms_url_snapshot: str | None - source_license_name_snapshot: str | None - legal_checked: bool - legal_checked_at: str | None - legal_note: str | None - wp_post_id: int | None - wp_post_url: str | None - publish_attempts: int - publish_last_error: str | None - published_to_wp_at: str | None - word_count: int - status: str - meta_json: str | None - - -@dataclass(frozen=True) -class PublishJobCreate: - article_id: int - max_attempts: int = 3 - - -def create_source(payload: SourceCreate) -> int: - with get_conn() as conn: - cur = conn.execute( - """ - INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - payload.name.strip(), - payload.base_url, - payload.terms_url, - payload.license_name, - payload.risk_level, - 1 if payload.is_enabled else 0, - payload.notes, - payload.last_reviewed_at, - ), - ) - return int(cur.lastrowid) - - -def list_sources() -> list[dict[str, Any]]: - with get_conn() as conn: - rows = conn.execute( - """ - SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at - FROM sources - ORDER BY id DESC - """ - ).fetchall() - return rows_to_dicts(rows) - - -def get_source_by_id(source_id: int) -> dict[str, Any] | None: - with get_conn() as conn: - row = conn.execute( - """ - SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at - FROM sources - WHERE id = ? - """, - (source_id,), - ).fetchone() - return dict(row) if row else None - - -def update_source(source_id: int, payload: SourceUpdate) -> bool: - with get_conn() as conn: - cur = conn.execute( - """ - UPDATE sources - SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ? - WHERE id = ? - """, - ( - payload.name.strip(), - payload.base_url, - payload.terms_url, - payload.license_name, - payload.risk_level, - 1 if payload.is_enabled else 0, - payload.notes, - payload.last_reviewed_at, - source_id, - ), - ) - return cur.rowcount > 0 - - -def delete_source(source_id: int) -> bool: - with get_conn() as conn: - cur = conn.execute("DELETE FROM sources WHERE id = ?", (source_id,)) - return cur.rowcount > 0 - - -def create_feed(payload: FeedCreate) -> int: - with get_conn() as conn: - cur = conn.execute( - "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)", - (payload.name.strip(), payload.url.strip(), payload.source_id, 1 if payload.is_enabled else 0), - ) - return int(cur.lastrowid) - - -def list_feeds() -> list[dict[str, Any]]: - with get_conn() as conn: - rows = conn.execute( - """ - SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, - f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name, - s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url, - s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled - FROM feeds f - LEFT JOIN sources s ON s.id = f.source_id - ORDER BY f.id DESC - """ - ).fetchall() - return rows_to_dicts(rows) - - -def list_enabled_feeds() -> list[dict[str, Any]]: - with get_conn() as conn: - rows = conn.execute( - """ - SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, - s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, - s.risk_level AS source_risk_level, s.base_url AS source_base_url, - s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled - FROM feeds f - LEFT JOIN sources s ON s.id = f.source_id - WHERE f.is_enabled = 1 - ORDER BY f.id ASC - """ - ).fetchall() - return rows_to_dicts(rows) - - -def get_feed_by_id(feed_id: int) -> dict[str, Any] | None: - with get_conn() as conn: - row = conn.execute( - """ - SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at, - s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url, - s.risk_level AS source_risk_level, s.base_url AS source_base_url, - s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled - FROM feeds f - LEFT JOIN sources s ON s.id = f.source_id - WHERE f.id = ? - """, - (feed_id,), - ).fetchone() - return dict(row) if row else None - - -def update_feed(feed_id: int, payload: FeedUpdate) -> bool: - with get_conn() as conn: - cur = conn.execute( - """ - UPDATE feeds - SET name = ?, url = ?, source_id = ?, is_enabled = ? - WHERE id = ? - """, - ( - payload.name.strip(), - payload.url.strip(), - payload.source_id, - 1 if payload.is_enabled else 0, - feed_id, - ), - ) - return cur.rowcount > 0 - - -def delete_feed(feed_id: int) -> bool: - with get_conn() as conn: - cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,)) - return cur.rowcount > 0 - - -def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None: - with get_conn() as conn: - conn.execute( - """ - UPDATE feeds - SET etag = ?, last_modified = ?, last_checked_at = datetime('now') - WHERE id = ? - """, - (etag, last_modified, feed_id), - ) - - -def create_run(payload: RunCreate) -> int: - with get_conn() as conn: - cur = conn.execute( - "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)", - (payload.run_type, payload.status, payload.details), - ) - return int(cur.lastrowid) - - -def finish_run(run_id: int, status: str, details: str | None = None) -> None: - with get_conn() as conn: - conn.execute( - """ - UPDATE runs - SET status = ?, details = ?, finished_at = datetime('now') - WHERE id = ? - """, - (status, details, run_id), - ) - - -def list_runs(limit: int = 50) -> list[dict[str, Any]]: - safe_limit = max(1, min(limit, 500)) - with get_conn() as conn: - rows = conn.execute( - """ - SELECT id, run_type, status, started_at, finished_at, details - FROM runs - ORDER BY id DESC - LIMIT ? - """, - (safe_limit,), - ).fetchall() - return rows_to_dicts(rows) - - -def get_run_by_id(run_id: int) -> dict[str, Any] | None: - with get_conn() as conn: - row = conn.execute( - """ - SELECT id, run_type, status, started_at, finished_at, details - FROM runs - WHERE id = ? - """, - (run_id,), - ).fetchone() - return dict(row) if row else None - - -def get_article_by_id(article_id: int) -> dict[str, Any] | None: - with get_conn() as conn: - row = conn.execute( - """ - SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact, - a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot, - a.legal_checked, a.legal_checked_at, a.legal_note, - a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at, - a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, - a.scheduled_publish_at - FROM articles a - WHERE a.id = ? - """, - (article_id,), - ).fetchone() - return dict(row) if row else None - - -def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str: - meta: dict[str, Any] = {} - if meta_json: - try: - meta = json.loads(meta_json) - if not isinstance(meta, dict): - meta = {} - except Exception: - meta = {} - - events = meta.get("review_events") - if not isinstance(events, list): - events = [] - events.append(event) - meta["review_events"] = events - return json.dumps(meta, ensure_ascii=False) - - -def _load_meta(meta_json: str | None) -> dict[str, Any]: - if not meta_json: - return {} - try: - parsed = json.loads(meta_json) - return parsed if isinstance(parsed, dict) else {} - except Exception: - return {} - - -def update_article_status( - article_id: int, - new_status: str, - *, - actor: str | None = None, - note: str | None = None, - decision: str | None = None, -) -> bool: - article = get_article_by_id(article_id) - if not article: - return False - - event = { - "timestamp": datetime.now(timezone.utc).isoformat(), - "from_status": article.get("status"), - "to_status": new_status, - "actor": actor or "system", - "note": note, - "decision": decision, - } - merged_meta = _merge_review_event(article.get("meta_json"), event) - - with get_conn() as conn: - conn.execute( - "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?", - (new_status, merged_meta, article_id), - ) - return True - - -def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool: - article = get_article_by_id(article_id) - if not article: - return False - - event = { - "timestamp": datetime.now(timezone.utc).isoformat(), - "event": "legal_review", - "approved": approved, - "actor": actor or "system", - "note": note, - } - merged_meta = _merge_review_event(article.get("meta_json"), event) - with get_conn() as conn: - conn.execute( - """ - UPDATE articles - SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ? - WHERE id = ? - """, - (1 if approved else 0, note, merged_meta, article_id), - ) - return True - - -def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool: - article = get_article_by_id(article_id) - if not article: - return False - url = (image_url or "").strip() - if not url: - return False - if action not in {"select", "exclude", "restore"}: - return False - - meta = _load_meta(article.get("meta_json")) - image_review = meta.get("image_review") - if not isinstance(image_review, dict): - image_review = {} - - excluded = image_review.get("excluded_urls") - if not isinstance(excluded, list): - excluded = [] - excluded_set = {str(item) for item in excluded if item} - - selected_url = image_review.get("selected_url") - if not isinstance(selected_url, str): - selected_url = None - - if action == "select": - selected_url = url - excluded_set.discard(url) - elif action == "exclude": - excluded_set.add(url) - if selected_url == url: - selected_url = None - elif action == "restore": - excluded_set.discard(url) - - image_review["selected_url"] = selected_url - image_review["excluded_urls"] = sorted(excluded_set) - image_review["updated_at"] = datetime.now(timezone.utc).isoformat() - image_review["updated_by"] = actor or "system" - meta["image_review"] = image_review - - with get_conn() as conn: - conn.execute( - "UPDATE articles SET meta_json = ? WHERE id = ?", - (json.dumps(meta, ensure_ascii=False), article_id), - ) - return True - - -def create_publish_job(payload: PublishJobCreate) -> int: - with get_conn() as conn: - existing = conn.execute( - """ - SELECT id FROM publish_jobs - WHERE article_id = ? AND status IN ('queued', 'running') - ORDER BY id DESC - LIMIT 1 - """, - (payload.article_id,), - ).fetchone() - if existing: - return int(existing["id"]) - - cur = conn.execute( - """ - INSERT INTO publish_jobs (article_id, status, attempts, max_attempts) - VALUES (?, 'queued', 0, ?) - """, - (payload.article_id, max(1, payload.max_attempts)), - ) - return int(cur.lastrowid) - - -def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]: - safe_limit = max(1, min(limit, 500)) - with get_conn() as conn: - rows = conn.execute( - """ - SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url, - j.created_at, j.started_at, j.finished_at, a.title AS article_title - FROM publish_jobs j - LEFT JOIN articles a ON a.id = j.article_id - ORDER BY j.id DESC - LIMIT ? - """, - (safe_limit,), - ).fetchall() - return rows_to_dicts(rows) - - -def claim_next_publish_job() -> dict[str, Any] | None: - with get_conn() as conn: - row = conn.execute( - """ - SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url - FROM publish_jobs - WHERE status = 'queued' AND attempts < max_attempts - ORDER BY id ASC - LIMIT 1 - """ - ).fetchone() - if not row: - return None - job_id = int(row["id"]) - conn.execute( - """ - UPDATE publish_jobs - SET status = 'running', - attempts = attempts + 1, - started_at = datetime('now'), - finished_at = NULL - WHERE id = ? - """, - (job_id,), - ) - claimed = conn.execute( - """ - SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url - FROM publish_jobs - WHERE id = ? - """, - (job_id,), - ).fetchone() - return dict(claimed) if claimed else None - - -def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None: - with get_conn() as conn: - conn.execute( - """ - UPDATE publish_jobs - SET status = 'success', - wp_post_id = ?, - wp_post_url = ?, - error_message = NULL, - finished_at = datetime('now') - WHERE id = ? - """, - (wp_post_id, wp_post_url, job_id), - ) - - -def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None: - next_status = "queued" if requeue else "failed" - with get_conn() as conn: - conn.execute( - """ - UPDATE publish_jobs - SET status = ?, - error_message = ?, - finished_at = datetime('now') - WHERE id = ? - """, - (next_status, error_message[:2000], job_id), - ) - - -def mark_article_publish_result( - article_id: int, - *, - wp_post_id: int | None, - wp_post_url: str | None, - error: str | None, - increment_attempts: bool, - set_published_status: bool, -) -> None: - with get_conn() as conn: - conn.execute( - """ - UPDATE articles - SET wp_post_id = ?, - wp_post_url = ?, - publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END, - publish_last_error = ?, - published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END, - status = CASE WHEN ? THEN 'published' ELSE status END - WHERE id = ? - """, - ( - wp_post_id, - wp_post_url, - 1 if increment_attempts else 0, - error[:2000] if error else None, - wp_post_id, - 1 if set_published_status else 0, - article_id, - ), - ) - - -def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: - with get_conn() as conn: - # 1) strongest key: source_url - row = conn.execute( - "SELECT id FROM articles WHERE source_url = ?", - (payload.source_url.strip(),), - ).fetchone() - if row: - return int(row["id"]) - - # 2) stable feed+guid combo - if payload.feed_id is not None and payload.source_article_id: - row = conn.execute( - "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?", - (payload.feed_id, payload.source_article_id), - ).fetchone() - if row: - return int(row["id"]) - - # 3) content hash fallback - if payload.source_hash: - row = conn.execute( - "SELECT id FROM articles WHERE source_hash = ?", - (payload.source_hash,), - ).fetchone() - if row: - return int(row["id"]) - - return None - - -def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None: - article_id = _resolve_existing_article_id(payload) - if article_id is None: - return None - return get_article_by_id(article_id) - - -def upsert_article(payload: ArticleUpsert) -> int: - existing_id = _resolve_existing_article_id(payload) - with get_conn() as conn: - if existing_id is None: - conn.execute( - """ - INSERT INTO articles ( - feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author, - summary, content_raw, content_rewritten, image_urls_json, press_contact, - source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot, - legal_checked, legal_checked_at, legal_note, - wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at, - word_count, status, meta_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - payload.feed_id, - payload.source_article_id, - payload.source_hash, - payload.title.strip(), - payload.source_url.strip(), - payload.canonical_url, - payload.published_at, - payload.author, - payload.summary, - payload.content_raw, - payload.content_rewritten, - payload.image_urls_json, - payload.press_contact, - payload.source_name_snapshot, - payload.source_terms_url_snapshot, - payload.source_license_name_snapshot, - 1 if payload.legal_checked else 0, - payload.legal_checked_at, - payload.legal_note, - payload.wp_post_id, - payload.wp_post_url, - payload.publish_attempts, - payload.publish_last_error, - payload.published_to_wp_at, - payload.word_count, - payload.status, - payload.meta_json, - ), - ) - else: - conn.execute( - """ - UPDATE articles - SET - feed_id = ?, - source_article_id = ?, - source_hash = ?, - title = ?, - source_url = ?, - canonical_url = ?, - published_at = ?, - author = ?, - summary = ?, - content_raw = ?, - content_rewritten = ?, - image_urls_json = ?, - press_contact = ?, - source_name_snapshot = ?, - source_terms_url_snapshot = ?, - source_license_name_snapshot = ?, - legal_checked = ?, - legal_checked_at = ?, - legal_note = ?, - wp_post_id = ?, - wp_post_url = ?, - publish_attempts = ?, - publish_last_error = ?, - published_to_wp_at = ?, - word_count = ?, - status = ?, - meta_json = ? - WHERE id = ? - """, - ( - payload.feed_id, - payload.source_article_id, - payload.source_hash, - payload.title.strip(), - payload.source_url.strip(), - payload.canonical_url, - payload.published_at, - payload.author, - payload.summary, - payload.content_raw, - payload.content_rewritten, - payload.image_urls_json, - payload.press_contact, - payload.source_name_snapshot, - payload.source_terms_url_snapshot, - payload.source_license_name_snapshot, - 1 if payload.legal_checked else 0, - payload.legal_checked_at, - payload.legal_note, - payload.wp_post_id, - payload.wp_post_url, - payload.publish_attempts, - payload.publish_last_error, - payload.published_to_wp_at, - payload.word_count, - payload.status, - payload.meta_json, - existing_id, - ), - ) - row = conn.execute("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)).fetchone() - if row: - return int(row["id"]) - return int(existing_id) if existing_id else 0 - - -def list_articles_page( - limit: int = 50, - offset: int = 0, - status_filter: str | None = None, - search: str | None = None, -) -> tuple[list[dict[str, Any]], int]: - """Return (articles, total_count) with optional status filter and title search.""" - safe_limit = max(1, min(limit, 200)) - safe_offset = max(0, offset) - - conditions: list[str] = [] - params: list[Any] = [] - if status_filter: - conditions.append("a.status = ?") - params.append(status_filter) - if search: - conditions.append("(a.title LIKE ? OR a.id = ?)") - try: - params.extend([f"%{search}%", int(search)]) - except ValueError: - params.extend([f"%{search}%", -1]) - - where = f"WHERE {' AND '.join(conditions)}" if conditions else "" - select = """ - SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw, - a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at, - a.word_count, f.name AS feed_name - FROM articles a - LEFT JOIN feeds f ON f.id = a.feed_id - """ - with get_conn() as conn: - total = conn.execute( - f"SELECT COUNT(*) FROM articles a {where}", params - ).fetchone()[0] - rows = conn.execute( - f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?", - params + [safe_limit, safe_offset], - ).fetchall() - return rows_to_dicts(rows), total - - -def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int: - """Update wp_post_id (and clear stale wp_post_url) for multiple articles. - - Returns the number of rows actually updated. - Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and - scheduled_publish_at from the live WordPress data. - """ - if not updates: - return 0 - updated = 0 - with get_conn() as conn: - for article_id, new_wp_id in updates: - conn.execute( - "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?", - (new_wp_id, article_id), - ) - updated += 1 - return updated - - -def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]: - safe_limit = max(1, min(limit, 500)) - with get_conn() as conn: - if status_filter: - rows = conn.execute( - """ - SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, - a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, - a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, - a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at - FROM articles a - LEFT JOIN feeds f ON f.id = a.feed_id - WHERE a.status = ? - ORDER BY a.id DESC - LIMIT ? - """, - (status_filter, safe_limit), - ).fetchall() - else: - rows = conn.execute( - """ - SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author, - a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name, - a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot, - a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note, - a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at - FROM articles a - LEFT JOIN feeds f ON f.id = a.feed_id - ORDER BY a.id DESC - LIMIT ? - """, - (safe_limit,), - ).fetchall() - return rows_to_dicts(rows) diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py deleted file mode 100644 index 05937e5..0000000 --- a/backend/app/rewrite.py +++ /dev/null @@ -1,204 +0,0 @@ -from __future__ import annotations - -import json -import re -from typing import Any -from urllib.request import Request, urlopen - -from .config import get_settings - - -def _sanitize_source_text(text: str) -> str: - raw = (text or "").strip() - if not raw: - return "" - - lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] - if len(lines) > 3: - lines = lines[3:] - - joined = "\n".join(lines) - # Remove press contact block at end from "Pressekontakt" onward. - joined = re.sub( - r"\n?\s*Pressekontakt[\s\S]*$", - "", - joined, - flags=re.IGNORECASE, - ).strip() - return joined - - -def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]: - out: list[str] = [] - seen: set[str] = set() - for raw in tags: - value = re.sub(r"\s+", " ", str(raw or "").strip()) - value = re.sub(r"^[#\-•\s]+", "", value) - value = re.sub(r"[;,.:\s]+$", "", value) - if not value: - continue - if len(value) < 2 or len(value) > 40: - continue - key = value.casefold() - if key in seen: - continue - seen.add(key) - out.append(value) - if len(out) >= max_tags: - break - return out - - -def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str: - settings = get_settings() - api_key = settings.openai_api_key - if not api_key: - raise RuntimeError("OPENAI_API_KEY fehlt") - - payload = { - "model": settings.openai_model, - "temperature": temperature, - "messages": [ - {"role": "system", "content": system}, - {"role": "user", "content": user}, - ], - } - req = Request( - url="https://api.openai.com/v1/chat/completions", - method="POST", - data=json.dumps(payload).encode("utf-8"), - headers={ - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - "Accept": "application/json", - }, - ) - with urlopen(req, timeout=60) as resp: - raw = resp.read().decode("utf-8", errors="replace") - data = json.loads(raw) - choices = data.get("choices") - if not isinstance(choices, list) or not choices: - raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}") - message = choices[0].get("message", {}) - content = message.get("content") - if not isinstance(content, str) or not content.strip(): - raise RuntimeError("OpenAI lieferte keinen Inhalt") - return content.strip() - - -def rewrite_article_text(article: dict[str, Any]) -> str: - source_text = _sanitize_source_text(article.get("content_raw") or "") - if not source_text: - source_text = (article.get("summary") or "").strip() - if not source_text: - raise RuntimeError("Kein Quelltext für Rewrite verfügbar") - - title = (article.get("title") or "").strip() - source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip() - prompt = ( - "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. " - "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, " - "ohne Pressekontakt, ohne Quellenblock. " - "Nutze klare Absätze und Zwischenüberschriften in HTML (

,

,

  • falls passend). " - "Inhaltlich korrekt bleiben, nichts erfinden. " - f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. " - "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, " - f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n" - f"Titel: {title}\n\n" - f"Originaltext:\n{source_text}" - ) - return _openai_chat( - "Du bist ein deutscher News-Redakteur.", - prompt, - temperature=0.4, - ) - - -def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]: - source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "") - source_text = str(source_text).strip() - if not source_text: - return [] - title = (article.get("title") or "").strip() - prompt = ( - "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. " - f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. " - "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n" - f"Titel: {title}\n\n" - f"Text:\n{source_text[:3500]}" - ) - raw = _openai_chat( - "Du extrahierst präzise, kurze News-Tags auf Deutsch.", - prompt, - temperature=0.2, - ) - try: - parsed = json.loads(raw) - if isinstance(parsed, list): - return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) - except Exception: - pass - # fallback: extract first JSON-like array if model wrapped output - match = re.search(r"\[[\s\S]*\]", raw) - if match: - try: - parsed = json.loads(match.group(0)) - if isinstance(parsed, list): - return _normalize_tags([str(x) for x in parsed], max_tags=max_tags) - except Exception: - return [] - return [] - - -def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]: - """Score article relevance for VanLife/Camping/Outdoor blog (0-100). - - Returns {"score": int, "reason": str, "topics": list[str]}. - Raises RuntimeError on OpenAI failure. - """ - title = (article.get("title") or "").strip() - text = _sanitize_source_text(article.get("content_raw") or "") - if not text: - text = (article.get("summary") or "").strip() - - prompt = ( - "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. " - "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, " - "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. " - "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n" - "Antworte NUR mit einem JSON-Objekt:\n" - '{"score": <0-100>, "reason": "", "topics": ["", ""]}\n\n' - f"Titel: {title}\n\n" - f"Text (Auszug):\n{text[:2000]}" - ) - raw = _openai_chat( - "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.", - prompt, - temperature=0.1, - ) - try: - match = re.search(r"\{[\s\S]*\}", raw) - if match: - parsed = json.loads(match.group(0)) - score = max(0, min(100, int(parsed.get("score", 0)))) - return { - "score": score, - "reason": str(parsed.get("reason", "")), - "topics": [str(t) for t in (parsed.get("topics") or [])], - } - except Exception: - pass - return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []} - - -def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str: - meta: dict[str, Any] = {} - if meta_json: - try: - parsed = json.loads(meta_json) - if isinstance(parsed, dict): - meta = parsed - except Exception: - meta = {} - meta["generated_tags"] = _normalize_tags(tags) - return json.dumps(meta, ensure_ascii=False) diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py deleted file mode 100644 index d5ea5bf..0000000 --- a/backend/app/scheduler.py +++ /dev/null @@ -1,336 +0,0 @@ -"""Smart publishing scheduler. - -Calculates suggested publish slots for new WordPress drafts. -Rules: -- Maximum N drafts per day (configurable, default 2) -- Preferred slots: configurable hours (default 09:00 and 14:00 CET) -- New articles queue up after the last already-scheduled article -- Checks both local DB AND WordPress future posts to avoid double-booking -""" -from __future__ import annotations - -import base64 -import json -import threading -import urllib.request -from datetime import date, datetime, timedelta, timezone -from typing import Any - -from .config import get_settings -from .db import get_conn - -# Ensures that concurrent pipeline runs (two threads) never assign the same slot. -_slot_lock = threading.Lock() - - -# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity) -_CET_OFFSET = timedelta(hours=1) - - -def _today_cet() -> date: - return (datetime.now(timezone.utc) + _CET_OFFSET).date() - - -def _preferred_hours() -> list[int]: - settings = get_settings() - try: - return [int(h.strip()) for h in settings.pipeline_publish_hours.split(",") if h.strip()] - except Exception: - return [9, 14] - - -def _fetch_wp_occupied_slots() -> set[tuple[str, int]]: - """Fetch all future-scheduled WordPress posts and return occupied (date_iso, hour) pairs. - - This prevents the scheduler from assigning a slot that is already taken - by a WP post that was not created via this pipeline (e.g. manually or via recovery scripts). - Returns an empty set on any error so the scheduler degrades gracefully. - """ - settings = get_settings() - try: - auth = base64.b64encode( - f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode() - ).decode() - url = ( - f"{settings.wordpress_base_url}/wp-json/wp/v2/posts" - f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date" - ) - req = urllib.request.Request(url, headers={"Authorization": f"Basic {auth}"}) - with urllib.request.urlopen(req, timeout=10) as resp: - posts = json.loads(resp.read()) - occupied: set[tuple[str, int]] = set() - for p in posts: - try: - dt = datetime.fromisoformat(p["date"]) - occupied.add((dt.date().isoformat(), dt.hour)) - except Exception: - pass - return occupied - except Exception: - return set() - - -def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None: - """Return the date of the latest already-scheduled slot (DB + WP).""" - today = _today_cet() - - # Latest from local DB - with get_conn() as conn: - row = conn.execute( - """ - SELECT MAX(scheduled_publish_at) AS last_slot - FROM articles - WHERE scheduled_publish_at IS NOT NULL - AND scheduled_publish_at >= ? - AND status NOT IN ('error', 'no_image') - """, - (today.isoformat() + "T00:00:00",), - ).fetchone() - db_last: date | None = None - if row and row["last_slot"]: - try: - db_last = datetime.fromisoformat(row["last_slot"]).date() - except Exception: - pass - - # Latest from WP - wp_last: date | None = None - for d_str, _ in wp_occupied: - try: - d = date.fromisoformat(d_str) - if d >= today and (wp_last is None or d > wp_last): - wp_last = d - except Exception: - pass - - if db_last and wp_last: - return max(db_last, wp_last) - return db_last or wp_last - - -def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None: - """Return first preferred hour not yet used on target_date (DB + WP), or None if day is full.""" - hours = _preferred_hours() - date_str = target_date.isoformat() - - # Hours used in local DB - with get_conn() as conn: - rows = conn.execute( - """ - SELECT scheduled_publish_at FROM articles - WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? - AND status NOT IN ('error', 'no_image') - """, - (date_str + "T00:00:00", date_str + "T23:59:59"), - ).fetchall() - - used_hours: set[int] = set() - for row in rows: - ts = row["scheduled_publish_at"] or "" - try: - used_hours.add(datetime.fromisoformat(ts).hour) - except Exception: - pass - - # Hours used in WordPress - for d_str, h in wp_occupied: - if d_str == date_str: - used_hours.add(h) - - for h in hours: - if h not in used_hours: - return h - return None - - -def _format_slot(d: date, hour: int) -> str: - weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] - wd = weekday_names[d.weekday()] - return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr" - - -def _find_next_free_slot( - wp_occupied: set[tuple[str, int]], lookahead_days: int = 60 -) -> tuple[date, int] | None: - """Find the next free (date, hour) slot. - - Starts from tomorrow and scans forward, filling any gaps in the schedule - rather than always appending after the last existing post. - """ - today = _today_cet() - tomorrow = today + timedelta(days=1) - - for offset in range(0, lookahead_days + 1): - candidate = tomorrow + timedelta(days=offset) - hour = _next_free_hour(candidate, wp_occupied) - if hour is not None: - return candidate, hour - - return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 - - -def get_schedule_overview(lookahead_days: int = 60) -> list[dict]: - """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.""" - today = _today_cet() - hours = _preferred_hours() - - # Slots booked in local DB - with get_conn() as conn: - rows = conn.execute( - """ - SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at - FROM articles - WHERE scheduled_publish_at IS NOT NULL - AND scheduled_publish_at >= ? - AND status NOT IN ('error', 'no_image') - ORDER BY scheduled_publish_at - """, - (today.isoformat() + "T00:00:00",), - ).fetchall() - - db_slots: dict[tuple[str, int], dict] = {} - for row in rows: - try: - dt = datetime.fromisoformat(row["scheduled_publish_at"]) - key = (dt.date().isoformat(), dt.hour) - db_slots[key] = { - "date": dt.date().isoformat(), - "hour": dt.hour, - "formatted": _format_slot(dt.date(), dt.hour), - "source": "db", - "article_id": row["id"], - "article_title": row["title"], - "article_status": row["status"], - "wp_post_id": row["wp_post_id"], - "wp_post_url": row["wp_post_url"], - } - except Exception: - pass - - # Slots occupied in WordPress but not in local DB - wp_occupied = _fetch_wp_occupied_slots() - wp_only: list[dict] = [] - for d_str, h in sorted(wp_occupied): - if (d_str, h) in db_slots: - continue - try: - d = date.fromisoformat(d_str) - if d >= today: - wp_only.append({ - "date": d_str, - "hour": h, - "formatted": _format_slot(d, h), - "source": "wordpress", - "article_id": None, - "article_title": "(WP-Beitrag außerhalb Pipeline)", - "article_status": None, - "wp_post_id": None, - "wp_post_url": None, - }) - except Exception: - pass - - all_slots = list(db_slots.values()) + wp_only - all_slots.sort(key=lambda s: (s["date"], s["hour"])) - return all_slots - - -def release_publish_slot(article_id: int) -> None: - """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" - with get_conn() as conn: - conn.execute( - "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", - (article_id,), - ) - - -def suggest_publish_slot() -> str: - """Return a suggested publish datetime string (CET) for the next free slot.""" - wp_occupied = _fetch_wp_occupied_slots() - result = _find_next_free_slot(wp_occupied) - if result: - d, hour = result - return _format_slot(d, hour) - tomorrow = _today_cet() + timedelta(days=1) - return _format_slot(tomorrow, _preferred_hours()[0] if _preferred_hours() else 9) - - -def reserve_publish_slot(article_id: int) -> str: - """Reserve a publish slot for an article and persist it in the DB. - - If the article already has a scheduled_publish_at, keep it unchanged. - Returns the formatted publish datetime string. - - Uses a module-level lock so that concurrent pipeline runs (two threads) - cannot read the same "free" slot and assign it twice. - """ - # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow - # and must not block other threads unnecessarily. - wp_occupied = _fetch_wp_occupied_slots() - - with _slot_lock: - # Single DB connection for the entire read-find-write cycle so the - # slot we pick is still free when we write it. - with get_conn() as conn: - row = conn.execute( - "SELECT scheduled_publish_at FROM articles WHERE id = ?", - (article_id,), - ).fetchone() - existing_slot = row["scheduled_publish_at"] if row else None - if existing_slot: - try: - dt = datetime.fromisoformat(existing_slot) - return _format_slot(dt.date(), dt.hour) - except Exception: - pass # invalid — fall through and assign a fresh slot - - # Find the next free (date, hour) slot using THIS connection so we - # see all slots written during this lock window. - hours = _preferred_hours() - today = _today_cet() - tomorrow = today + timedelta(days=1) - candidate: date | None = None - chosen_hour: int | None = None - - for offset in range(0, 61): - d = tomorrow + timedelta(days=offset) - date_str = d.isoformat() - - rows = conn.execute( - """ - SELECT scheduled_publish_at FROM articles - WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ? - AND status NOT IN ('error', 'no_image') - """, - (date_str + "T00:00:00", date_str + "T23:59:59"), - ).fetchall() - - used_hours: set[int] = set() - for r in rows: - ts = r["scheduled_publish_at"] or "" - try: - used_hours.add(datetime.fromisoformat(ts).hour) - except Exception: - pass - for d_str, h in wp_occupied: - if d_str == date_str: - used_hours.add(h) - - for h in hours: - if h not in used_hours: - candidate = d - chosen_hour = h - break - if candidate is not None: - break - - if candidate is None: - candidate = tomorrow - chosen_hour = hours[0] if hours else 9 - - iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00" - conn.execute( - "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?", - (iso_ts, article_id), - ) - return _format_slot(candidate, chosen_hour) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py deleted file mode 100644 index d3cbed8..0000000 --- a/backend/app/source_extraction.py +++ /dev/null @@ -1,442 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from html import unescape -import re -from typing import Any -from urllib.parse import urljoin -from urllib.request import Request, urlopen - -DEFAULT_TIMEOUT_SECONDS = 10 -DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)" - - -@dataclass(frozen=True) -class ExtractedArticle: - title: str | None - author: str | None - canonical_url: str | None - summary: str | None - content_text: str | None - images: list[str] - press_contact: str | None - extraction_error: str | None = None - image_metadata: dict[str, dict] = field(default_factory=dict) - - -def _clean_text(raw: str | None) -> str | None: - if not raw: - return None - text = unescape(raw) - text = re.sub(r"<[^>]+>", " ", text) - text = re.sub(r"\s+", " ", text).strip() - return text or None - - -def _strip_noise(html: str) -> str: - html = re.sub(r"", " ", html, flags=re.IGNORECASE) - html = re.sub(r"", " ", html, flags=re.IGNORECASE) - html = re.sub(r"", " ", html, flags=re.IGNORECASE) - return html - - -def _meta_content(html: str, attr: str, value: str) -> str | None: - pattern = re.compile( - rf"]+{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", - re.IGNORECASE, - ) - match = pattern.search(html) - if match: - return _clean_text(match.group(1)) - - # handle reversed attribute order - pattern_rev = re.compile( - rf"]+content\s*=\s*[\"']([^\"']+)[\"'][^>]*{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*>", - re.IGNORECASE, - ) - match = pattern_rev.search(html) - if match: - return _clean_text(match.group(1)) - return None - - -def _extract_title(html: str) -> str | None: - title = _meta_content(html, "property", "og:title") - if title: - return title - - match = re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) - if match: - cleaned = _clean_text(match.group(1)) - if cleaned: - return cleaned - - match = re.search(r"]*>([\s\S]*?)

", html, re.IGNORECASE) - if match: - return _clean_text(match.group(1)) - return None - - -def _extract_canonical(html: str) -> str | None: - match = re.search( - r"]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>", - html, - re.IGNORECASE, - ) - if match: - return _clean_text(match.group(1)) - - match = re.search( - r"]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>", - html, - re.IGNORECASE, - ) - if match: - return _clean_text(match.group(1)) - return None - - -def _extract_author(html: str) -> str | None: - for attr, value in (("name", "author"), ("property", "article:author"), ("property", "og:article:author")): - author = _meta_content(html, attr, value) - if author: - return author - - for pattern in ( - r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})", - r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<", - ): - match = re.search(pattern, html, re.IGNORECASE) - if match: - author = _clean_text(match.group(1)) - if author: - return author - return None - - -def _extract_images(html: str, page_url: str) -> list[str]: - images: list[str] = [] - seen: set[str] = set() - - for prop in ("og:image", "twitter:image"): - pattern = re.compile( - rf"]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", - re.IGNORECASE, - ) - for match in pattern.finditer(html): - src = match.group(1).strip() - abs_src = urljoin(page_url, src) - if abs_src not in seen: - seen.add(abs_src) - images.append(abs_src) - - for match in re.finditer(r"]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE): - src = match.group(1).strip() - abs_src = urljoin(page_url, src) - if abs_src not in seen: - seen.add(abs_src) - images.append(abs_src) - - return images - - -def _extract_content_text(html: str) -> str | None: - section = None - for pattern in ( - r"]*>([\s\S]*?)", - r"]*>([\s\S]*?)", - r"]*>([\s\S]*?)", - ): - match = re.search(pattern, html, re.IGNORECASE) - if match: - section = match.group(1) - break - - if not section: - section = html - - paragraphs = [] - for match in re.finditer(r"]*>([\s\S]*?)", section, re.IGNORECASE): - text = _clean_text(match.group(1)) - if text and re.search(r"\b(pressekontakt|press contact|kontakt|agentur)\b", text, re.IGNORECASE): - paragraphs.append(text) - - for match in re.finditer(r"]*>([\s\S]*?)

", section, re.IGNORECASE): - text = _clean_text(match.group(1)) - if text and len(text) > 2: - paragraphs.append(text) - - if paragraphs: - return "\n".join(paragraphs) - - stripped = _clean_text(section) - return stripped - - -def _extract_press_contact(content_text: str | None) -> str | None: - if not content_text: - return None - - lines = [line.strip() for line in content_text.split("\n") if line.strip()] - marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE) - for idx, line in enumerate(lines): - if marker_re.search(line): - chunk = [line] - for nxt in lines[idx + 1 : idx + 6]: - if re.search(r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b", nxt, re.IGNORECASE): - break - chunk.append(nxt) - return _clean_text("\n".join(chunk)) - - match = re.search( - r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)", - content_text, - re.IGNORECASE, - ) - if match: - return _clean_text(match.group(1)) - return None - - -# CSS class keywords that indicate a copyright/credit element inside a figcaption -_CREDIT_CLASS_RE = re.compile( - r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", - re.IGNORECASE, -) - -# Inline text patterns that signal a credit/copyright notice -_CREDIT_TEXT_RE = re.compile( - r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})", - re.IGNORECASE, -) - -# data-* attribute names that carry credit/caption information directly on -_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright") -_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description") - -# Class keywords for adjacent sibling credit spans/divs after an -_ADJ_CREDIT_CLASS_RE = re.compile( - r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']", - re.IGNORECASE, -) - - -def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]: - """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}. - - Uses three progressive strategies: - 1.
with +
- 2. data-* attributes on tags not already covered - 3. tags whose immediately following HTML contains a credit element - """ - result: dict[str, dict] = {} - - try: - # ------------------------------------------------------------------ - # Strategy 1:
blocks containing and
- # ------------------------------------------------------------------ - for fig_match in re.finditer(r"]*>([\s\S]*?)
", html, re.IGNORECASE): - fig_html = fig_match.group(1) - - # Locate image src (src or lazy-loaded data-src) - img_match = re.search( - r"]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>", - fig_html, - re.IGNORECASE, - ) - if not img_match: - continue - img_src = urljoin(page_url, img_match.group(1).strip()) - - # Locate figcaption - figcap_match = re.search( - r"]*>([\s\S]*?)
", - fig_html, - re.IGNORECASE, - ) - if not figcap_match: - continue - figcap_html = figcap_match.group(1) - - # --- Extract credit --- - credit: str | None = None - - # Try credit via class attribute on an inner element - credit_elem_match = re.search( - r"<(?:span|p|div)[^>]*" - + _CREDIT_CLASS_RE.pattern - + r"[^>]*>([\s\S]*?)", - figcap_html, - re.IGNORECASE, - ) - if credit_elem_match: - credit = _clean_text(credit_elem_match.group(1)) - - # Fallback: scan plain text of figcaption for credit patterns - if not credit: - figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html)) - cred_text_match = _CREDIT_TEXT_RE.search(figcap_text) - if cred_text_match: - credit = _clean_text(cred_text_match.group(1)) - - # --- Extract caption (full figcaption text) --- - caption = _clean_text(figcap_html) - - # Only store entries that carry at least one piece of metadata - if caption or credit: - entry: dict[str, str] = {} - if caption: - entry["caption"] = caption - if credit: - entry["credit"] = credit - result[img_src] = entry - - # ------------------------------------------------------------------ - # Strategy 2: data-* attributes on tags - # ------------------------------------------------------------------ - for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): - img_attrs = img_match.group(1) - - # Resolve image URL (prefer src over data-src) - src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) - if not src_match: - src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) - if not src_match: - continue - img_src = urljoin(page_url, src_match.group(1).strip()) - - # Skip images already handled by Strategy 1 - if img_src in result: - continue - - credit: str | None = None - caption: str | None = None - - for attr in _IMG_DATA_CREDIT_ATTRS: - attr_match = re.search( - rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', - img_attrs, - re.IGNORECASE, - ) - if attr_match: - credit = _clean_text(attr_match.group(1)) - break - - for attr in _IMG_DATA_CAPTION_ATTRS: - attr_match = re.search( - rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']', - img_attrs, - re.IGNORECASE, - ) - if attr_match: - caption = _clean_text(attr_match.group(1)) - break - - if caption or credit: - entry = {} - if caption: - entry["caption"] = caption - if credit: - entry["credit"] = credit - result[img_src] = entry - - # ------------------------------------------------------------------ - # Strategy 3: followed within 200 chars by a credit element - # ------------------------------------------------------------------ - for img_match in re.finditer(r"]+)>", html, re.IGNORECASE): - img_attrs = img_match.group(1) - - src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) - if not src_match: - src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE) - if not src_match: - continue - img_src = urljoin(page_url, src_match.group(1).strip()) - - # Skip images already handled by earlier strategies - if img_src in result: - continue - - # Look at the 200 characters of HTML immediately after the img tag - after_start = img_match.end() - after_html = html[after_start : after_start + 200] - - adj_match = re.search( - r"<(?:span|p|div)[^>]*" - + _ADJ_CREDIT_CLASS_RE.pattern - + r"[^>]*>([\s\S]*?)", - after_html, - re.IGNORECASE, - ) - if adj_match: - credit = _clean_text(adj_match.group(1)) - if credit: - result[img_src] = {"credit": credit} - - except Exception: - return {} - - return result - - -def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: - try: - req = Request( - url=url, - headers={ - "User-Agent": DEFAULT_USER_AGENT, - "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", - }, - ) - with urlopen(req, timeout=timeout_seconds) as resp: - raw = resp.read() - charset = resp.headers.get_content_charset() or "utf-8" - html = raw.decode(charset, errors="replace") - except Exception as exc: - return ExtractedArticle( - title=None, - author=None, - canonical_url=None, - summary=None, - content_text=None, - images=[], - press_contact=None, - extraction_error=str(exc), - ) - - html = _strip_noise(html) - title = _extract_title(html) - author = _extract_author(html) - canonical_url = _extract_canonical(html) - summary = _meta_content(html, "name", "description") - content_text = _extract_content_text(html) - if not summary and content_text: - summary = _clean_text(content_text[:320]) - images = _extract_images(html, url) - press_contact = _extract_press_contact(content_text) - image_metadata = _extract_image_metadata(html, url) - - return ExtractedArticle( - title=title, - author=author, - canonical_url=canonical_url, - summary=summary, - content_text=content_text, - images=images, - press_contact=press_contact, - extraction_error=None, - image_metadata=image_metadata, - ) - - -def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: - return { - "title": article.title, - "author": article.author, - "canonical_url": article.canonical_url, - "summary": article.summary, - "images": article.images, - "press_contact": article.press_contact, - "extraction_error": article.extraction_error, - "image_metadata": article.image_metadata, - } diff --git a/backend/app/telegram_bot.py b/backend/app/telegram_bot.py deleted file mode 100644 index 880a49d..0000000 --- a/backend/app/telegram_bot.py +++ /dev/null @@ -1,474 +0,0 @@ -"""Telegram Bot integration for RSS-News pipeline notifications and controls.""" -from __future__ import annotations - -import json -import logging -from typing import Any -from urllib.error import URLError -from urllib.parse import urlencode -from urllib.request import Request, urlopen - -from .config import get_settings - -logger = logging.getLogger(__name__) - -_BASE = "https://api.telegram.org/bot{token}/{method}" -_N8N_APP_RELEASE_WEBHOOK = "https://n8n.vanityontour.de/webhook/tg-app-release-bot-v1/webhook" - - -# --------------------------------------------------------------------------- -# Low-level API helpers -# --------------------------------------------------------------------------- - -def _call(method: str, payload: dict[str, Any]) -> dict[str, Any]: - settings = get_settings() - token = settings.telegram_bot_token - if not token: - raise RuntimeError("TELEGRAM_BOT_TOKEN nicht konfiguriert") - url = _BASE.format(token=token, method=method) - data = json.dumps(payload).encode("utf-8") - req = Request( - url=url, - data=data, - method="POST", - headers={"Content-Type": "application/json", "Accept": "application/json"}, - ) - try: - with urlopen(req, timeout=15) as resp: - raw = resp.read().decode("utf-8", errors="replace") - return json.loads(raw) - except URLError as exc: - logger.error("Telegram API Fehler (%s): %s", method, exc) - raise RuntimeError(f"Telegram API Fehler: {exc}") from exc - - -def _chat_id() -> str: - settings = get_settings() - cid = settings.telegram_chat_id - if not cid: - raise RuntimeError("TELEGRAM_CHAT_ID nicht konfiguriert") - return cid - - -def _inline_keyboard(buttons: list[list[dict[str, str]]]) -> dict: - return {"inline_keyboard": buttons} - - -# --------------------------------------------------------------------------- -# Public send functions -# --------------------------------------------------------------------------- - -def send_message(text: str, reply_markup: dict | None = None, parse_mode: str = "HTML") -> dict: - payload: dict[str, Any] = { - "chat_id": _chat_id(), - "text": text, - "parse_mode": parse_mode, - "disable_web_page_preview": False, - } - if reply_markup: - payload["reply_markup"] = reply_markup - return _call("sendMessage", payload) - - -def send_photo_message( - photo_url: str, - caption: str, - reply_markup: dict | None = None, - parse_mode: str = "HTML", -) -> dict: - payload: dict[str, Any] = { - "chat_id": _chat_id(), - "photo": photo_url, - "caption": caption, - "parse_mode": parse_mode, - } - if reply_markup: - payload["reply_markup"] = reply_markup - try: - return _call("sendPhoto", payload) - except Exception: - # Fall back to text message if photo fails (e.g. image URL no longer valid) - return send_message(caption, reply_markup=reply_markup, parse_mode=parse_mode) - - -def answer_callback_query(callback_query_id: str, text: str = "") -> None: - try: - _call("answerCallbackQuery", {"callback_query_id": callback_query_id, "text": text}) - except Exception as exc: - logger.warning("answerCallbackQuery fehlgeschlagen: %s", exc) - - -def edit_message_reply_markup(chat_id: str, message_id: int, reply_markup: dict | None = None) -> None: - payload: dict[str, Any] = {"chat_id": chat_id, "message_id": message_id} - if reply_markup: - payload["reply_markup"] = reply_markup - else: - payload["reply_markup"] = {"inline_keyboard": []} - try: - _call("editMessageReplyMarkup", payload) - except Exception as exc: - logger.warning("editMessageReplyMarkup fehlgeschlagen: %s", exc) - - -def setup_webhook(webhook_url: str) -> dict: - settings = get_settings() - payload: dict[str, Any] = {"url": webhook_url, "allowed_updates": ["message", "callback_query"]} - if settings.telegram_webhook_secret: - payload["secret_token"] = settings.telegram_webhook_secret - return _call("setWebhook", payload) - - -def delete_webhook() -> dict: - return _call("deleteWebhook", {}) - - -def _forward_to_n8n_app_release(update: dict[str, Any]) -> None: - """Forward a Telegram update to the N8N App Release webhook.""" - try: - data = json.dumps(update).encode("utf-8") - req = Request( - url=_N8N_APP_RELEASE_WEBHOOK, - data=data, - method="POST", - headers={"Content-Type": "application/json"}, - ) - with urlopen(req, timeout=5) as _: - pass - except Exception as exc: - logger.debug("N8N App-Release-Forward fehlgeschlagen: %s", exc) - - -# --------------------------------------------------------------------------- -# Notification helpers -# --------------------------------------------------------------------------- - -def _format_tags(meta_json: str | None) -> str: - if not meta_json: - return "" - try: - meta = json.loads(meta_json) - tags = meta.get("generated_tags") or [] - if tags: - return " ".join(f"#{t.replace(' ', '_')}" for t in tags[:6]) - except Exception: - pass - return "" - - -def _score_emoji(score: int) -> str: - if score >= 85: - return "🟢" - if score >= 70: - return "🟡" - return "🔴" - - -def notify_new_draft( - article: dict[str, Any], - score: int, - suggested_publish_at: str | None = None, -) -> None: - """Send Telegram notification for a newly created WP draft.""" - title = (article.get("title") or "Ohne Titel").strip() - wp_url = article.get("wp_post_url") or "" - tags_str = _format_tags(article.get("meta_json")) - art_id = article.get("id") - - score_line = f"{_score_emoji(score)} Relevanz-Score: {score}/100" - publish_line = f"📅 Vorgeschlagene Veröffentlichung: {suggested_publish_at}" if suggested_publish_at else "" - link_line = f'🔗 Draft in WordPress öffnen' if wp_url else "" - tags_line = f"🏷 {tags_str}" if tags_str else "" - - text_parts = [ - f"✅ Neuer Draft erstellt", - f"📰 {title}", - score_line, - ] - if publish_line: - text_parts.append(publish_line) - if tags_line: - text_parts.append(tags_line) - if link_line: - text_parts.append(link_line) - - text = "\n".join(text_parts) - - keyboard = _inline_keyboard([ - [ - {"text": "✏️ Neu schreiben", "callback_data": f"rewrite:{art_id}"}, - {"text": "❌ Verwerfen", "callback_data": f"discard:{art_id}"}, - ] - ]) - - # Try with image first - meta = {} - try: - meta = json.loads(article.get("meta_json") or "{}") - except Exception: - pass - image_url = None - image_review = meta.get("image_review") or {} - if isinstance(image_review, dict): - image_url = image_review.get("selected_url") - if not image_url: - image_sel = (meta.get("extraction") or {}).get("image_selection") or {} - image_url = image_sel.get("primary") - - if image_url: - send_photo_message(image_url, caption=text, reply_markup=keyboard) - else: - send_message(text, reply_markup=keyboard) - - -def notify_relevance_warning(article: dict[str, Any], score: int, reason: str) -> None: - """Send Telegram warning for borderline articles (score between warn and auto thresholds).""" - title = (article.get("title") or "Ohne Titel").strip() - art_id = article.get("id") - source_url = article.get("source_url") or "" - - text = ( - f"⚠️ Artikel mit niedrigem Relevanz-Score\n" - f"📰 {title}\n" - f"{_score_emoji(score)} Score: {score}/100\n" - f"💬 {reason}\n" - f'🔗 Originalartikel' - ) - keyboard = _inline_keyboard([ - [ - {"text": "➕ Trotzdem verarbeiten", "callback_data": f"override:{art_id}"}, - {"text": "❌ Ablehnen", "callback_data": f"reject:{art_id}"}, - ] - ]) - send_message(text, reply_markup=keyboard) - - -def notify_rejected_summary(articles: list[dict[str, Any]]) -> None: - """Send summary of rejected articles for this pipeline run.""" - if not articles: - return - lines = [f"🚫 {len(articles)} Artikel abgelehnt (Score < {get_settings().pipeline_relevance_warn})\n"] - for art in articles[:10]: - title = (art.get("title") or "Ohne Titel")[:60] - score = _get_relevance_score(art) - reason = _get_rejection_reason(art) - art_id = art.get("id") - lines.append(f"• {title} (Score: {score}) — {reason}") - if len(articles) > 10: - lines.append(f"... und {len(articles) - 10} weitere") - - text = "\n".join(lines) - # Build override buttons for first 5 - rows = [] - for art in articles[:5]: - art_id = art.get("id") - title = (art.get("title") or "")[:25] - rows.append([{"text": f"➕ {title}…", "callback_data": f"override:{art_id}"}]) - - keyboard = _inline_keyboard(rows) if rows else None - send_message(text, reply_markup=keyboard) - - -def notify_error(message: str) -> None: - """Send error alert to Telegram.""" - try: - send_message(f"🔴 Fehler im RSS-Pipeline\n{message}") - except Exception as exc: - logger.error("Telegram Fehler-Benachrichtigung fehlgeschlagen: %s", exc) - - -def notify_pipeline_started(trigger: str = "auto") -> None: - icon = "🤖" if trigger == "auto" else "👤" - try: - send_message(f"{icon} Pipeline gestartet (Auslöser: {trigger})") - except Exception: - pass - - -def notify_pipeline_done(stats: dict[str, Any]) -> None: - ingested = stats.get("ingested", 0) - processed = stats.get("processed", 0) - drafts = stats.get("drafts_created", 0) - rejected = stats.get("rejected", 0) - quality_gate_rejected = stats.get("quality_gate_rejected", 0) - no_image = stats.get("no_image", 0) - warnings = stats.get("warnings", 0) - errors = stats.get("errors", 0) - - lines = [ - "📊 Pipeline abgeschlossen", - f"📥 Neue Artikel importiert: {ingested}", - f"⚙️ Verarbeitet: {processed}", - f"📝 Drafts erstellt: {drafts}", - ] - if rejected: - lines.append(f"🚫 Abgelehnt (Score): {rejected}") - if quality_gate_rejected: - lines.append(f"✂️ Qualitätsprüfung: {quality_gate_rejected}") - if no_image: - lines.append(f"🖼️ Kein Bild: {no_image}") - if warnings: - lines.append(f"⚠️ Warnungen: {warnings}") - if errors: - lines.append(f"🔴 Fehler: {errors}") - - try: - send_message("\n".join(lines)) - except Exception: - pass - - -# --------------------------------------------------------------------------- -# Helper to read relevance info from meta_json -# --------------------------------------------------------------------------- - -def _get_relevance_score(article: dict[str, Any]) -> int: - try: - meta = json.loads(article.get("meta_json") or "{}") - return int(meta.get("relevance", {}).get("score", 0)) - except Exception: - return 0 - - -def _get_rejection_reason(article: dict[str, Any]) -> str: - try: - meta = json.loads(article.get("meta_json") or "{}") - return str(meta.get("relevance", {}).get("reason", ""))[:80] - except Exception: - return "" - - -# --------------------------------------------------------------------------- -# Incoming update handler (called by webhook endpoint) -# --------------------------------------------------------------------------- - -def handle_update(update: dict[str, Any]) -> None: - """Process an incoming Telegram update.""" - # Import here to avoid circular imports - from . import pipeline as _pipeline - - if "callback_query" in update: - _handle_callback(update["callback_query"]) - elif "message" in update: - _handle_message(update["message"]) - - -def _handle_message(message: dict[str, Any]) -> None: - from . import pipeline as _pipeline - - text = (message.get("text") or "").strip() - if not text.startswith("/"): - return - - cmd = text.split()[0].lower().lstrip("/") - if "@" in cmd: - cmd = cmd.split("@")[0] - - if cmd == "run": - send_message("🤖 Pipeline wird manuell gestartet …") - try: - stats = _pipeline.run_auto_pipeline(trigger="manual") - notify_pipeline_done(stats) - except Exception as exc: - notify_error(f"/run fehlgeschlagen: {exc}") - - elif cmd == "rejected": - try: - articles = _pipeline.get_recently_rejected(days=3) - if not articles: - send_message("✅ Keine abgelehnten Artikel in den letzten 3 Tagen.") - else: - notify_rejected_summary(articles) - except Exception as exc: - notify_error(f"/rejected fehlgeschlagen: {exc}") - - elif cmd == "status": - try: - status_text = _pipeline.get_pipeline_status_text() - send_message(status_text) - except Exception as exc: - notify_error(f"/status fehlgeschlagen: {exc}") - - elif cmd == "help": - send_message( - "📋 Verfügbare Befehle\n" - "/run — Pipeline manuell starten\n" - "/rejected — Abgelehnte Artikel der letzten 3 Tage\n" - "/status — Pipeline-Status\n" - "/help — Diese Hilfe" - ) - - else: - # Unbekannter Befehl → an N8N App-Release-Workflow weiterleiten - _forward_to_n8n_app_release({"message": message}) - - -def _handle_callback(callback_query: dict[str, Any]) -> None: - from . import pipeline as _pipeline - from .repositories import get_article_by_id, update_article_status - - query_id = callback_query.get("id", "") - data = (callback_query.get("data") or "").strip() - chat_id = str(callback_query.get("message", {}).get("chat", {}).get("id", "")) - message_id = int(callback_query.get("message", {}).get("message_id", 0)) - - if ":" not in data: - answer_callback_query(query_id, "Ungültige Aktion") - return - - action, _, raw_id = data.partition(":") - try: - article_id = int(raw_id) - except ValueError: - answer_callback_query(query_id, "Ungültige Artikel-ID") - return - - article = get_article_by_id(article_id) - if not article: - answer_callback_query(query_id, "Artikel nicht gefunden") - return - - # Answer Telegram immediately so the spinning indicator stops - action_labels = { - "rewrite": "✏️ Artikel wird neu geschrieben …", - "discard": "❌ Artikel verworfen", - "override": "➕ Artikel wird verarbeitet …", - "reject": "🚫 Abgelehnt", - } - answer_callback_query(query_id, action_labels.get(action, "")) - edit_message_reply_markup(chat_id, message_id) - - logger.info("Callback: action=%s article_id=%s", action, article_id) - - if action == "rewrite": - try: - logger.info("Rewrite #%d: starte rewrite_and_update_draft", article_id) - _pipeline.rewrite_and_update_draft(article_id) - logger.info("Rewrite #%d: abgeschlossen, sende Benachrichtigung", article_id) - updated = get_article_by_id(article_id) - if updated: - from .scheduler import suggest_publish_slot - slot = suggest_publish_slot() - notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot) - except Exception as exc: - logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc, exc_info=True) - notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}") - - elif action == "discard": - try: - _pipeline.discard_article(article_id) - except Exception as exc: - logger.error("Discard #%d fehlgeschlagen: %s", article_id, exc) - notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}") - - elif action == "override": - try: - _pipeline.override_rejected_article(article_id) - except Exception as exc: - logger.error("Override #%d fehlgeschlagen: %s", article_id, exc) - notify_error(f"Override #{article_id} fehlgeschlagen: {exc}") - - elif action == "reject": - update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram") - - else: - logger.warning("Unbekannte Callback-Aktion: %s", action) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py deleted file mode 100644 index bb96198..0000000 --- a/backend/app/wordpress.py +++ /dev/null @@ -1,689 +0,0 @@ -from __future__ import annotations - -import base64 -from html import escape -import logging -import json -import mimetypes -from pathlib import Path -import re -from typing import Any -from html import unescape as _html_unescape -from urllib.parse import quote_plus, urlparse -from urllib.request import Request, urlopen - -from .config import get_settings - - -def _auth_header(username: str, app_password: str) -> str: - token = base64.b64encode(f"{username}:{app_password}".encode("utf-8")).decode("ascii") - return f"Basic {token}" - - -def _wp_request( - *, - base_url: str, - auth_header: str, - method: str, - endpoint: str, - payload: dict[str, Any] | None = None, -) -> Any: - url = f"{base_url.rstrip('/')}/wp-json/wp/v2/{endpoint.lstrip('/')}" - data = json.dumps(payload).encode("utf-8") if payload is not None else None - req = Request( - url=url, - data=data, - method=method, - headers={ - "Authorization": auth_header, - "Content-Type": "application/json; charset=utf-8", - "Accept": "application/json", - "User-Agent": "rss-news-publisher/1.0", - }, - ) - with urlopen(req, timeout=20) as resp: - raw = resp.read().decode("utf-8", errors="replace") - return json.loads(raw) if raw else {} - - -def _selected_image_url_from_meta(meta_json: str | None) -> str | None: - if not meta_json: - return None - try: - meta = json.loads(meta_json) - except Exception: - return None - if not isinstance(meta, dict): - return None - image_review = meta.get("image_review") - if not isinstance(image_review, dict): - return None - selected = image_review.get("selected_url") - return selected if isinstance(selected, str) and selected.strip() else None - - -def _selected_tags_from_meta(meta_json: str | None) -> list[str]: - if not meta_json: - return [] - try: - meta = json.loads(meta_json) - except Exception: - return [] - if not isinstance(meta, dict): - return [] - raw_tags = meta.get("generated_tags") - if not isinstance(raw_tags, list): - return [] - tags: list[str] = [] - seen: set[str] = set() - for item in raw_tags: - value = str(item or "").strip() - if not value: - continue - key = value.casefold() - if key in seen: - continue - seen.add(key) - tags.append(value) - if len(tags) >= 12: - break - return tags - - -def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]: - ids: list[int] = [] - seen: set[int] = set() - for tag in tags: - name = tag.strip() - if not name: - continue - try: - endpoint = f"tags?search={quote_plus(name)}&per_page=20" - result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint) - tag_id: int | None = None - if isinstance(result, list): - for row in result: - if not isinstance(row, dict): - continue - row_name = str(row.get("name") or "") - rid = int(row.get("id", 0) or 0) - if rid <= 0: - continue - if row_name.casefold() == name.casefold(): - tag_id = rid - break - if tag_id is None: - for row in result: - if isinstance(row, dict) and int(row.get("id", 0) or 0) > 0: - tag_id = int(row.get("id", 0)) - break - if tag_id is None: - created = _wp_request( - base_url=base_url, - auth_header=auth_header, - method="POST", - endpoint="tags", - payload={"name": name}, - ) - if isinstance(created, dict): - rid = int(created.get("id", 0) or 0) - if rid > 0: - tag_id = rid - if tag_id is not None and tag_id > 0 and tag_id not in seen: - seen.add(tag_id) - ids.append(tag_id) - except Exception: - continue - return ids - - -_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"} -_logger = logging.getLogger(__name__) - - -def _sanitize_image_url(url: str) -> str: - """Decode HTML entities (e.g. & → &) in image URLs from RSS feeds.""" - return _html_unescape(url) - - -_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage") - -def _is_usable_image_url(url: str) -> bool: - """Return False for URLs that are unlikely to work as WP featured images.""" - if not url or url.startswith("data:"): - return False - try: - path = urlparse(url).path.lower() - _, ext = path.rsplit(".", 1) if "." in path else ("", "") - if f".{ext}" in _BLOCKED_IMAGE_EXTS: - return False - if any(p in path for p in _PLACEHOLDER_PATTERNS): - return False - except Exception: - pass - return True - - -def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]: - url = _sanitize_image_url(url) - headers = { - "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)", - "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", - } - if referer: - headers["Referer"] = referer - req = Request(url=url, headers=headers) - with urlopen(req, timeout=20) as resp: - raw = resp.read() - content_type = resp.headers.get("Content-Type", "application/octet-stream") - content_type = content_type.split(";")[0].strip() if content_type else "application/octet-stream" - if not content_type.lower().startswith("image/"): - raise RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({content_type})") - return raw, content_type - - -def _guess_filename(image_url: str, content_type: str) -> str: - parsed = urlparse(_sanitize_image_url(image_url)) - stem = Path(parsed.path).name or "article-image" - if "." not in stem: - ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg" - stem = f"{stem}{ext}" - # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header - stem = stem.encode("ascii", errors="ignore").decode("ascii") - stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg" - return stem - - -def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict: - """Return the caption/credit dict for a specific image URL from extraction metadata.""" - if not meta_json or not image_url: - return {} - try: - from urllib.parse import urlparse - meta = json.loads(meta_json) - image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {} - # Exact match first - if image_url in image_metadata: - return image_metadata[image_url] - # Fuzzy match: compare without query string (handles ?w=1200 variants) - base_url = urlparse(image_url)._replace(query="").geturl() - for key, val in image_metadata.items(): - key_base = urlparse(key)._replace(query="").geturl() - if key_base == base_url: - return val - return {} - except Exception: - return {} - - -def _build_image_caption(image_meta: dict, source_url: str) -> str: - """Build a WP caption string from image metadata and source URL.""" - # caption from figcaption typically already contains the credit text - caption = (image_meta.get("caption") or "").strip() - if caption: - return caption - return f"Quelle: {source_url}" - - -def _upload_featured_media( - *, - base_url: str, - auth_header: str, - image_url: str, - article_title: str, - source_url: str, - image_caption: str = "", -) -> int: - image_bytes, content_type = _download_image_bytes(image_url, referer=source_url or None) - filename = _guess_filename(image_url, content_type) - - media_url = f"{base_url.rstrip('/')}/wp-json/wp/v2/media" - media_req = Request( - url=media_url, - data=image_bytes, - method="POST", - headers={ - "Authorization": auth_header, - "Content-Type": content_type, - "Content-Disposition": f'attachment; filename="{filename}"', - "Accept": "application/json", - "User-Agent": "rss-news-publisher/1.0", - }, - ) - with urlopen(media_req, timeout=30) as resp: - media_raw = resp.read().decode("utf-8", errors="replace") - media_payload = json.loads(media_raw) if media_raw else {} - media_id = int(media_payload.get("id", 0)) if isinstance(media_payload, dict) else 0 - if media_id <= 0: - raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {media_payload}") - - _wp_request( - base_url=base_url, - auth_header=auth_header, - method="POST", - endpoint=f"media/{media_id}", - payload={ - "title": f"{article_title[:120]} - Bild", - "caption": image_caption or f"Quelle: {source_url}", - "alt_text": article_title[:200], - }, - ) - return media_id - - -def _as_paragraph_html(text: str) -> str: - chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] - if not chunks: - return "" - lines = [] - for chunk in chunks: - compact = re.sub(r"\s*\n\s*", " ", chunk) - lines.append(f"

{escape(compact)}

") - return "\n".join(lines) - - -def _as_block_paragraphs(text: str) -> str: - chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] - if not chunks: - return "" - lines = [] - for chunk in chunks: - compact = re.sub(r"\s*\n\s*", " ", chunk) - lines.append(f"

{escape(compact)}

") - return "\n".join(lines) - - -def _strip_html_tags(raw: str) -> str: - text = re.sub(r"<[^>]+>", " ", raw or "") - return re.sub(r"\s+", " ", text).strip() - - -def _html_to_wp_blocks(html: str) -> str: - src = (html or "").strip() - if not src: - return "" - pattern = re.compile( - r"]*>[\s\S]*?|]*>[\s\S]*?

|]*>[\s\S]*?|]*>[\s\S]*?", - re.IGNORECASE, - ) - blocks: list[str] = [] - for match in pattern.finditer(src): - block_html = match.group(0).strip() - if not block_html: - continue - tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE) - tag = (tag_match.group(1).lower() if tag_match else "") - if tag == "p": - blocks.append(f"{block_html}") - elif tag in {"ul", "ol"}: - ordered = tag == "ol" - if ordered: - blocks.append(f'{block_html}') - else: - blocks.append(f"{block_html}") - elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit(): - level = int(tag[1]) - blocks.append(f'{block_html}') - if blocks: - return "\n".join(blocks) - return _as_block_paragraphs(_strip_html_tags(src)) - - -def _as_block_heading(level: int, text: str) -> str: - safe_level = min(6, max(1, int(level))) - return f'{escape(text)}' - - -def _as_block_list(items: list[str]) -> str: - if not items: - return "" - content = "".join(f"
  • {item}
  • " for item in items) - return f"
      {content}
    " - - -def _sanitize_publish_text(text: str) -> str: - raw = (text or "").strip() - if not raw: - return "" - lines = [ln.strip() for ln in raw.splitlines() if ln.strip()] - if len(lines) > 3: - lines = lines[3:] - merged = "\n".join(lines) - merged = re.sub(r"\n?\s*Pressekontakt[\s\S]*$", "", merged, flags=re.IGNORECASE).strip() - return merged - - -def _build_attribution_block(article: dict[str, Any]) -> str: - """Build a WP Gutenberg attribution block for the bottom of the article.""" - from urllib.parse import urlparse - source_url = (article.get("canonical_url") or article.get("source_url") or "").strip() - source_name = (article.get("source_name_snapshot") or "").strip() - author = (article.get("author") or "").strip() - - # If the feed name is "Google Alerts" (or similar generic names), derive the - # real source name from the hostname of the canonical URL. - if not source_name or source_name.lower() in ("google alerts", "google"): - try: - hostname = urlparse(source_url).hostname or "" - source_name = hostname.removeprefix("www.") - except Exception: - pass - - # Get image credit from extraction metadata (uses fuzzy URL match) - meta_json = article.get("meta_json") - credit = "" - try: - meta = json.loads(meta_json or "{}") - selected_url = (meta.get("image_review") or {}).get("selected_url") or "" - if selected_url: - img_meta = _get_image_meta_for_url(meta_json, selected_url) - raw_credit = (img_meta.get("credit") or "").strip() - caption_text = (img_meta.get("caption") or "").strip() - # If credit is just a bare marker prefix (e.g. "Foto:", "Bild:"), - # clear it and extract the full credit from the caption text instead. - _BARE_MARKERS = {"foto", "bild", "credit", "fotograf", "fotografie", "photo", "bildnachweis"} - if raw_credit.endswith(":") and raw_credit[:-1].strip().lower() in _BARE_MARKERS: - raw_credit = "" - if raw_credit: - credit = raw_credit - elif caption_text: - # Extract credit markers like "Foto: IMAGO/…", "© Agentur", "Bild: …" - import re as _re - m = _re.search( - r"(©[^\n]{1,120}|(?:Foto|Bild|Credit|Fotograf|Photo)\s*:[^\n]{1,120})", - caption_text, - ) - credit = m.group(1).strip() if m else "" - except Exception: - pass - - parts: list[str] = [] - if source_url: - label = source_name or source_url - parts.append(f'Originalartikel: {escape(label)}') - if author: - parts.append(f"Autor: {escape(author)}") - if credit: - parts.append(f"Bildnachweis: {escape(credit)}") - - if not parts: - return "" - - inner = "  |  ".join(parts) - return ( - "\n" - "
    \n" - f'' - f'

    {inner}

    ' - "" - ) - - -def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: - summary = (article.get("summary") or "").strip() - body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() - body_text = _sanitize_publish_text(body_text) - if not body_text: - body_text = summary - - has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) - body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) - if not body_html: - body_html = "

    Kein Inhalt verfügbar.

    " - - attribution = _build_attribution_block(article) - content = (body_html + attribution).strip() - return content, None - - -def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: - settings = get_settings() - if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: - raise RuntimeError("WordPress Konfiguration fehlt (base_url, username, app_password)") - - auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) - - title = (article.get("title") or "Ohne Titel").strip() - content, excerpt = _build_post_content(article) - source_url = article.get("source_url") or "" - - featured_media_id = None - selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) - - # Build candidate list: primary selected URL + fallbacks from image_urls_json - image_candidates: list[str] = [] - if selected_image_url and _is_usable_image_url(selected_image_url): - image_candidates.append(selected_image_url) - try: - extra_urls = json.loads(article.get("image_urls_json") or "[]") - for u in extra_urls: - if u and u not in image_candidates and _is_usable_image_url(u): - image_candidates.append(u) - except Exception: - pass - - for candidate_url in image_candidates: - image_meta = _get_image_meta_for_url(article.get("meta_json"), candidate_url) - image_caption = _build_image_caption(image_meta, source_url) - try: - featured_media_id = _upload_featured_media( - base_url=settings.wordpress_base_url, - auth_header=auth, - image_url=candidate_url, - article_title=title, - source_url=source_url, - image_caption=image_caption, - ) - break # success — stop trying further candidates - except Exception as img_exc: - _logger.warning( - "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc - ) - - if not featured_media_id and image_candidates: - _logger.warning( - "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)", - len(image_candidates), article.get("id"), title[:60], - ) - - payload = { - "title": title, - "content": content, - "status": settings.wordpress_default_status, - } - if excerpt: - payload["excerpt"] = excerpt - if featured_media_id: - payload["featured_media"] = featured_media_id - scheduled_at = article.get("scheduled_publish_at") - if scheduled_at: - payload["date"] = scheduled_at # e.g. "2026-03-24T09:00:00" - # Use status "future" so WP schedules auto-publishing at the given date. - # WP ignores date for drafts and shows "Sofort veröffentlichen" instead. - try: - from datetime import datetime as _dt - if _dt.fromisoformat(scheduled_at) > _dt.now(): - payload["status"] = "future" - except Exception: - pass - - wp_post_id = article.get("wp_post_id") - tag_ids = _resolve_wp_tag_ids( - base_url=settings.wordpress_base_url, - auth_header=auth, - tags=_selected_tags_from_meta(article.get("meta_json")), - ) - if tag_ids: - payload["tags"] = tag_ids - - if wp_post_id: - result = _wp_request( - base_url=settings.wordpress_base_url, - auth_header=auth, - method="POST", - endpoint=f"posts/{int(wp_post_id)}", - payload=payload, - ) - else: - result = _wp_request( - base_url=settings.wordpress_base_url, - auth_header=auth, - method="POST", - endpoint="posts", - payload=payload, - ) - - if not isinstance(result, dict): - raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}") - post_id = int(result.get("id", 0)) - if post_id <= 0: - raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}") - post_url = result.get("link") - return post_id, post_url if isinstance(post_url, str) else None - - -def selected_image_exists(article: dict[str, Any]) -> bool: - return _selected_image_url_from_meta(article.get("meta_json")) is not None - - -def delete_wp_post(wp_post_id: int) -> None: - """Permanently delete a WordPress post (moves to trash, then deletes).""" - settings = get_settings() - if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: - raise RuntimeError("WordPress Konfiguration fehlt") - auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) - # force=true skips trash - _wp_request( - base_url=settings.wordpress_base_url, - auth_header=auth, - method="DELETE", - endpoint=f"posts/{wp_post_id}?force=true", - ) - - -def sync_db_from_wordpress() -> dict[str, Any]: - """Sync scheduled_publish_at and wp_post_url in the DB from WordPress. - - WordPress is treated as the source of truth for scheduling. - For each DB article that has a wp_post_id: - - If WP post exists as 'future': update scheduled_publish_at to WP date. - - If WP post exists as 'draft': clear scheduled_publish_at (not yet scheduled). - - If WP post exists as 'publish': mark article as published in DB. - - If WP post is trashed/deleted (404 or trash status): clear wp_post_id, - wp_post_url, and scheduled_publish_at so the article can be re-processed. - Returns a stats dict with counts of each action taken. - """ - from .db import get_conn - - settings = get_settings() - if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: - raise RuntimeError("WordPress Konfiguration fehlt") - auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) - base_url = settings.wordpress_base_url.rstrip("/") - - # Fetch all future + draft + published WP posts in one pass (up to 300 per status) - wp_posts: dict[int, dict] = {} - for status in ("future", "draft", "publish"): - for page in range(1, 4): # max 300 per status - try: - result = _wp_request( - base_url=base_url, - auth_header=auth, - method="GET", - endpoint=f"posts?status={status}&per_page=100&page={page}&_fields=id,date,status,link", - ) - except Exception: - break - if not isinstance(result, list) or not result: - break - for post in result: - try: - wp_posts[int(post["id"])] = post - except Exception: - pass - if len(result) < 100: - break - - # Load all DB articles that have a wp_post_id - with get_conn() as conn: - rows = conn.execute( - """ - SELECT id, wp_post_id, wp_post_url, scheduled_publish_at, status - FROM articles - WHERE wp_post_id IS NOT NULL - AND status NOT IN ('no_image') - ORDER BY id - """ - ).fetchall() - - stats: dict[str, int] = { - "total_db_articles": len(rows), - "wp_posts_found": len(wp_posts), - "slot_updated": 0, - "slot_cleared_draft": 0, - "marked_published": 0, - "wp_reference_cleared": 0, - "already_in_sync": 0, - } - - for row in rows: - article_id = row["id"] - wp_post_id = int(row["wp_post_id"]) - wp_post = wp_posts.get(wp_post_id) - - if wp_post is None: - # Post not found in future/draft/publish — likely trashed or deleted - # Clear wp reference so article can be re-processed if needed - with get_conn() as conn: - conn.execute( - """UPDATE articles - SET wp_post_id = NULL, wp_post_url = NULL, scheduled_publish_at = NULL - WHERE id = ?""", - (article_id,), - ) - stats["wp_reference_cleared"] += 1 - continue - - wp_status = wp_post.get("status", "") - wp_date = wp_post.get("date", "") # local CET datetime, e.g. "2026-05-05T09:00:00" - wp_link = wp_post.get("link") or row["wp_post_url"] - - if wp_status == "publish": - # Already published in WP — mark as published in DB if not already - if row["status"] != "published": - with get_conn() as conn: - conn.execute( - "UPDATE articles SET status = 'published', wp_post_url = ? WHERE id = ?", - (wp_link, article_id), - ) - stats["marked_published"] += 1 - else: - stats["already_in_sync"] += 1 - - elif wp_status == "future": - # Scheduled — sync the date into scheduled_publish_at - current_slot = row["scheduled_publish_at"] or "" - # WP returns e.g. "2026-05-05T09:00:00" — compare ignoring seconds - if current_slot[:16] != wp_date[:16]: - with get_conn() as conn: - conn.execute( - "UPDATE articles SET scheduled_publish_at = ?, wp_post_url = ? WHERE id = ?", - (wp_date, wp_link, article_id), - ) - stats["slot_updated"] += 1 - else: - stats["already_in_sync"] += 1 - - elif wp_status == "draft": - # Draft without a schedule — clear scheduled_publish_at if set - if row["scheduled_publish_at"]: - with get_conn() as conn: - conn.execute( - "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", - (article_id,), - ) - stats["slot_cleared_draft"] += 1 - else: - stats["already_in_sync"] += 1 - - return stats diff --git a/backend/app/workflow.py b/backend/app/workflow.py deleted file mode 100644 index 83e9b63..0000000 --- a/backend/app/workflow.py +++ /dev/null @@ -1,39 +0,0 @@ -from __future__ import annotations - -UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image") - - -def internal_to_ui_status(status: str | None) -> str: - value = (status or "").strip() - if value == "approved": - return "publish" - if value == "error": - return "close" - if value == "review": - return "rewrite" - if value in {"new", "rewrite", "published", "no_image"}: - return value - return value or "new" - - -def ui_to_internal_status(status: str | None) -> str: - value = (status or "").strip() - if value == "publish": - return "approved" - if value == "close": - return "error" - if value in {"new", "rewrite", "published", "no_image"}: - return value - if value in {"approved", "error", "review"}: - return value - return value - - -ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = { - "new": {"rewrite", "close"}, - "rewrite": {"publish", "close"}, - "publish": {"published", "close"}, - "published": {"rewrite", "close"}, - "close": {"rewrite"}, - "no_image": {"rewrite", "close"}, -} diff --git a/backend/data/rss_news.db b/backend/data/rss_news.db deleted file mode 100644 index 7929307..0000000 Binary files a/backend/data/rss_news.db and /dev/null differ diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt deleted file mode 100644 index cf39f84..0000000 --- a/backend/requirements-test.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytest==8.3.5 -pytest-cov==6.0.0 -httpx==0.28.1 diff --git a/backend/requirements.txt b/backend/requirements.txt deleted file mode 100644 index f4ffe61..0000000 --- a/backend/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -fastapi==0.116.1 -uvicorn[standard]==0.35.0 -itsdangerous==2.2.0 -pydantic-settings==2.10.1 -python-dotenv==1.1.1 -feedparser==6.0.11 -jinja2==3.1.4 -python-multipart==0.0.20 diff --git a/backend/static/admin.css b/backend/static/admin.css deleted file mode 100644 index 0b31bb5..0000000 --- a/backend/static/admin.css +++ /dev/null @@ -1,303 +0,0 @@ -body { - margin: 0; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - background: #f4f6f8; - color: #1f2937; -} - -.topbar { - display: flex; - justify-content: space-between; - align-items: center; - padding: 20px 28px; - background: #0f172a; - color: #f8fafc; -} - -.container { - padding: 20px 28px 28px 28px; -} - -.login { - max-width: 520px; - margin: 60px auto; -} - -.card { - background: #ffffff; - border-radius: 10px; - box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); - padding: 16px; - margin-bottom: 16px; -} - -.stats { - display: grid; - grid-template-columns: repeat(4, minmax(0, 1fr)); - gap: 12px; - margin-bottom: 16px; -} - -.stat { - background: #ffffff; - border-radius: 10px; - padding: 12px; - box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); -} - -.stat .label { - font-size: 12px; - color: #64748b; -} - -.stat .value { - font-size: 24px; - font-weight: 700; -} - -.grid.two { - display: grid; - grid-template-columns: 1fr 1fr; - gap: 16px; -} - -.stack { - display: grid; - gap: 10px; -} - -.row { - display: flex; - gap: 8px; - align-items: center; -} - -.filter-row { - margin-bottom: 10px; -} - -.inline { - display: flex; - gap: 6px; - align-items: center; -} - -table { - width: 100%; - border-collapse: collapse; -} - -th, td { - text-align: left; - padding: 8px; - border-bottom: 1px solid #e5e7eb; - vertical-align: top; -} - -input, select, button, textarea { - padding: 8px; - border-radius: 6px; - border: 1px solid #cbd5e1; - font: inherit; -} - -button { - background: #0ea5e9; - border-color: #0ea5e9; - color: white; - cursor: pointer; -} - -button.secondary { - background: #64748b; - border-color: #64748b; -} - -.badge { - display: inline-block; - padding: 2px 8px; - border-radius: 999px; - background: #e2e8f0; - font-size: 12px; -} - -.badge.ok { - background: #dcfce7; - color: #166534; -} - -.badge.bad { - background: #fee2e2; - color: #991b1b; -} - -.badge.errcat { - margin-bottom: 4px; -} - -.badge.errcat-policy { - background: #fee2e2; - color: #991b1b; -} - -.badge.errcat-auth { - background: #ffedd5; - color: #9a3412; -} - -.badge.errcat-dns { - background: #dbeafe; - color: #1e40af; -} - -.badge.errcat-media { - background: #fef9c3; - color: #854d0e; -} - -.badge.errcat-api { - background: #ede9fe; - color: #5b21b6; -} - -.badge.errcat-unknown { - background: #e2e8f0; - color: #334155; -} - -.alert { - margin-bottom: 12px; - padding: 10px; - border-radius: 8px; - background: #fee2e2; - color: #991b1b; -} - -.flash { - font-weight: 600; -} - -.flash-success { - border-left: 4px solid #10b981; -} - -.flash-error { - border-left: 4px solid #ef4444; -} - -.subtle { - color: #64748b; - font-size: 12px; - margin-top: 4px; -} - -.pre { - white-space: pre-wrap; - line-height: 1.35; - max-height: 220px; - overflow: auto; - background: #f8fafc; - border: 1px solid #e2e8f0; - border-radius: 8px; - padding: 8px; - margin-top: 6px; -} - -.linkbtn { - display: inline-block; - padding: 8px 10px; - border-radius: 6px; - text-decoration: none; - border: 1px solid #cbd5e1; - color: #334155; - background: #f8fafc; -} - -.detail-grid { - display: grid; - grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); - gap: 8px 12px; - margin-bottom: 10px; -} - -.detail-item { - background: #f8fafc; - border: 1px solid #e2e8f0; - border-radius: 8px; - padding: 8px; - display: grid; - gap: 4px; -} - -.detail-item .k { - font-size: 12px; - color: #64748b; -} - -.thumb { - width: 72px; - height: 72px; - object-fit: cover; - border-radius: 8px; - border: 1px solid #cbd5e1; - margin-top: 6px; -} - -.image-grid { - display: grid; - grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); - gap: 10px; -} - -.image-card { - border: 1px solid #e2e8f0; - border-radius: 8px; - padding: 8px; - background: #fff; -} - -.image-card img { - width: 100%; - height: 120px; - object-fit: cover; - border-radius: 6px; - border: 1px solid #e2e8f0; - background: #f8fafc; -} - -.img-failed { - opacity: 0.3; - filter: grayscale(1); -} - -.image-meta { - margin-top: 6px; - display: flex; - gap: 6px; - flex-wrap: wrap; -} - -.image-actions { - margin-top: 8px; - display: flex; - gap: 6px; - flex-wrap: wrap; -} - -.image-selected { - border-color: #10b981; - box-shadow: 0 0 0 1px rgba(16, 185, 129, 0.25); -} - -.image-excluded { - opacity: 0.65; -} - -@media (max-width: 920px) { - .stats { - grid-template-columns: repeat(2, minmax(0, 1fr)); - } - .grid.two { - grid-template-columns: 1fr; - } -} diff --git a/backend/templates/admin_article_detail.html b/backend/templates/admin_article_detail.html deleted file mode 100644 index 1c16658..0000000 --- a/backend/templates/admin_article_detail.html +++ /dev/null @@ -1,224 +0,0 @@ - - - - - - {{ title }} - - - -
    -
    -

    Artikel-Detail #{{ article.id }}

    -

    Angemeldet als {{ user }}

    -
    -
    - Zurück -
    - -
    -
    -
    - -
    - {% if flash_msg %} -
    - {{ flash_msg }} -
    - {% endif %} - -
    -

    {{ article.title }}

    -
    -
    Status{{ article.status_ui }}
    -
    Artikel-Datum{{ article.published_at or "-" }}
    -
    Alter{{ article.days_old if article.days_old is not none else "-" }} Tage
    -
    Relevanz{{ article.relevance }}
    -
    Autor{{ article.author or "-" }}
    -
    Feed{{ feed.name if feed else "-" }}
    -
    Quelle Snapshot{{ article.source_name_snapshot or "-" }}
    -
    Lizenz Snapshot{{ article.source_license_name_snapshot or "-" }}
    -
    Terms Snapshot{{ article.source_terms_url_snapshot or "-" }}
    -
    -

    Quelle: {{ article.source_url }}

    - {% if article.canonical_url %}

    Canonical: {{ article.canonical_url }}

    {% endif %} - {% if article.summary %} -

    Summary: {{ article.summary }}

    - {% endif %} -

    WordPress Post: - {% if article.wp_post_url %} - #{{ article.wp_post_id }} - {% elif article.wp_post_id %} - #{{ article.wp_post_id }} - {% else %} - - - {% endif %} -

    -

    Publish Attempts: {{ article.publish_attempts or 0 }} | Letzter Fehler: {{ article.publish_last_error or "-" }}

    -
    - -
    -

    Checkliste

    - - - - - - {% for c in checklist %} - - - - - - {% endfor %} - -
    KriteriumStatusWert
    {{ c.label }} - {% if c.status == "ok" %} - OK - {% else %} - Fehlt - {% endif %} - {{ c.value }}
    -
    - -
    -

    Extrahierte Daten

    -

    Bilder: {{ article.image_entries|length if article.image_entries else 0 }}

    - {% if article.selected_image_url %} -

    Ausgewähltes Hauptbild: {{ article.selected_image_url }}

    - {% if article.selected_image_proxy_url %} - Ausgewähltes Hauptbild - {% endif %} - {% endif %} - {% if article.image_entries %} - {% if article.image_selection %} -
    - Automatische Bildauswahl (Score + Gründe) -
    Primärbild (Auto): {{ article.image_selection.primary or "-" }}
    -
    Ausgewählt: {{ article.image_selection.selected_count or 0 }} / Kandidaten: {{ article.image_selection.total_candidates or 0 }}
    - {% if article.image_selection.ranked %} - - - - - - {% for r in article.image_selection.ranked %} - - - - - - {% endfor %} - -
    BildScoreGründe
    {{ r.url }}{{ r.score }}{{ r.reasons|join(", ") if r.reasons else "-" }}
    - {% endif %} -
    - {% endif %} -
    - {% for image in article.image_entries %} -
    - - Artikelbild - -
    - {% if image.is_selected %}Ausgewählt{% endif %} - {% if image.is_excluded %}Ausgeblendet{% endif %} - {% if image.is_irrelevant_hint %}evtl. irrelevant{% endif %} -
    -
    -
    - - - -
    - {% if not image.is_excluded %} -
    - - - -
    - {% else %} -
    - - - -
    - {% endif %} -
    - -
    - {% endfor %} -
    - {% endif %} - {% if article.press_contact or article.extraction.press_contact %} -

    Pressekontakt

    -
    {{ article.press_contact or article.extraction.press_contact }}
    - {% endif %} - {% if article.extraction.extraction_error %} -

    Extraktionsfehler: {{ article.extraction.extraction_error }}

    - {% endif %} -
    - -
    -

    Volltext

    -
    {{ article.content_raw or "-" }}
    -
    - -
    -

    Rewrite-Text (editierbar)

    -
    - - -
    - {% if article.meta.generated_tags %} -

    Generierte Tags: {{ article.meta.generated_tags|join("; ") }}

    - {% endif %} -

    Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.

    -
    - -
    -

    Status ändern

    - {% if article.status_ui in ["new", "rewrite"] %} -
    - -
    - {% endif %} - {% if article.status_ui == "published" %} -
    - -
    - {% endif %} -
    - - - -
    -
    - -
    -

    WordPress Publish Queue

    - {% if article.publish_ready %} -

    Publish bereit

    - {% else %} -

    Publish blockiert

    - {% if article.publish_blockers %} -
      - {% for reason in article.publish_blockers %} -
    • {{ reason }}
    • - {% endfor %} -
    - {% endif %} - {% endif %} -

    Voraussetzungen: Status `publish` und Hauptbild gesetzt.

    -
    - - -
    -
    -
    - - diff --git a/backend/templates/admin_article_list.html b/backend/templates/admin_article_list.html deleted file mode 100644 index 38bfb22..0000000 --- a/backend/templates/admin_article_list.html +++ /dev/null @@ -1,221 +0,0 @@ - - - - - - {{ title }} - - - - -
    -
    -

    Artikelliste

    -

    Angemeldet als {{ user }}

    -
    -
    - Dashboard - Veröffentlichungsplan -
    - -
    -
    -
    - -
    - {% if flash_msg %} -
    - {{ flash_msg }} -
    - {% endif %} - - -
    -
    -
    -
    - - -
    -
    - - -
    -
    - - Reset -
    -
    -
    -

    {{ total }} Artikel gesamt · Seite {{ page }} / {{ total_pages }} · {{ page_size }} pro Seite

    -
    - - -
    - - - - - - - -
    - - - - - - - - - - - - {% for a in articles %} - - - - - - - - {% endfor %} - -
    BildTitel & KurztextStatusDatumWP ID
    - {% if a.thumb_proxy %} - - Vorschau - - - {% else %} -
    🖼
    - {% endif %} -
    - - {% if a.excerpt %} -
    {{ a.excerpt }}
    - {% endif %} - {% if a.feed_name %} -
    📡 {{ a.feed_name }}
    - {% endif %} -
    - {{ a.status }} - - {% if a.scheduled_publish_at %} - 📅 {{ a.scheduled_publish_at[:16] }} - {% elif a.published_at %} - {{ a.published_at[:10] }} - {% else %} - — - {% endif %} - - - - - {% if a.wp_post_url %} - ↗ WP öffnen - {% endif %} -
    -
    -
    - - - -
    - - - - diff --git a/backend/templates/admin_connectivity.html b/backend/templates/admin_connectivity.html deleted file mode 100644 index 5fc0392..0000000 --- a/backend/templates/admin_connectivity.html +++ /dev/null @@ -1,84 +0,0 @@ - - - - - - {{ title }} - - - -
    -
    -

    Connectivity Check

    -

    Angemeldet als {{ user }}

    -
    -
    - Zurück -
    - -
    -
    -
    - -
    -
    -
    -
    Checks
    -
    {{ checks|length }}
    -
    -
    -
    OK
    -
    {{ ok_count }}
    -
    -
    -
    Fehler
    -
    {{ error_count }}
    -
    -
    -
    Zeitpunkt
    -
    Live
    -
    -
    - -
    -

    Ziele

    -

    Geprüft werden DNS-Auflösung, TCP-Erreichbarkeit und bei URLs ein HTTP-Request.

    -
    - -
    -
    - -
    -

    Ergebnis

    - - - - - - {% for c in checks %} - - - - - - - - - - - {% endfor %} - -
    StatusNameTypZielDNSTCPHTTPDauer
    {% if c.ok %}OK{% else %}Fehler{% endif %}{{ c.label }}{{ c.kind }}{{ c.target }} - {% if c.dns_ok %}OK{% else %}FAIL{% endif %} -
    {{ c.dns_info }}
    -
    - {% if c.tcp_ok %}OK{% else %}FAIL{% endif %} -
    {{ c.tcp_info }}
    -
    - {% if c.http_ok %}OK{% else %}FAIL{% endif %} -
    {{ c.http_info }}
    -
    {{ c.duration_ms }} ms
    -
    -
    - - diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html deleted file mode 100644 index 0795b96..0000000 --- a/backend/templates/admin_dashboard.html +++ /dev/null @@ -1,405 +0,0 @@ - - - - - - {{ title }} - - - -
    -
    -

    rss-news Admin Dashboard

    -

    Angemeldet als {{ user }}

    -
    - -
    - -
    - {% if flash_msg %} -
    - {{ flash_msg }} -
    - {% endif %} - -
    -
    -
    Quellen
    -
    {{ sources|length }}
    -
    -
    -
    Feeds
    -
    {{ feeds|length }}
    -
    -
    -
    Artikel
    -
    {{ articles|length }}
    -
    -
    -
    Runs
    -
    {{ runs|length }}
    -
    -
    - -
    -
    -

    Quelle anlegen

    -
    - - - - - - - -
    -
    - -
    -

    Feed anlegen

    -
    - - - - - -
    -
    -
    - -
    -

    Ingestion starten

    -
    - - -
    -
    - -
    -

    Publisher ausführen

    -
    - - -
    -
    - -
    -

    Rewrite Run (geplante Artikel)

    -

    Verarbeitet alle Artikel im Status rewrite und setzt sie auf publish.

    -
    - - -
    -
    - -
    -

    Quellen + Policy

    - - - - - - {% for s in sources %} - - - - - - - - - {% endfor %} - -
    IDNameRiskLizenzTermsPolicy
    {{ s.id }}{{ s.name }}{{ s.risk_level }}{{ s.license_name or "-" }}{{ s.terms_url or "-" }} - {% if source_policy[s.id] %} - BLOCKED ({{ source_policy[s.id]|length }}) -
    {{ source_policy[s.id]|join(", ") }}
    - {% else %} - OK - {% endif %} -
    -
    - -
    -

    Quellen verwalten

    - - - - - - {% for s in sources %} - {% set source_form_id = 'source-update-' ~ s.id %} - - - - - - - - {% endfor %} - -
    IDNameURLsMetaAktionen
    #{{ s.id }} - - - - - - - - - - - -
    -
    - -
    -
    - -
    -
    -
    -
    - -
    -

    Feeds verwalten

    - - - - - - {% for f in feeds %} - {% set feed_form_id = 'feed-update-' ~ f.id %} - - - - - - - - - {% endfor %} - -
    IDNameURLQuelleStatusAktionen
    #{{ f.id }} - - - - - - -
    -
    - -
    -
    - -
    -
    -
    -
    - -
    -

    Artikel (Review)

    -
    - - - - Reset - Export JSON - Export CSV -
    - - - - - - {% for a in articles %} - - - - - - - - - {% endfor %} - -
    IDArtikelStatusDetailsRewriteTransition
    {{ a.id }} - {{ a.title }}
    - Autor: {{ a.author or "-" }}
    - Datum: {{ a.published_at or "-" }} | Alter: {{ a.days_old if a.days_old is not none else "-" }} Tage | Relevanz: {{ a.relevance }}
    - Original öffnen -
    Details anzeigen - {% if a.canonical_url and a.canonical_url != a.source_url %} -
    Canonical öffnen - {% endif %} -
    {{ a.status_ui }} -
    Publish: {{ "bereit" if a.publish_ready else "blockiert" }}
    - {% if not a.publish_ready and a.publish_blockers %} -
    {{ a.publish_blockers|join(", ") }}
    - {% endif %} - {% if a.selected_image_url %} -
    Hauptbild gesetzt
    - Hauptbild - {% endif %} - {% if a.summary %} -
    Summary: {{ a.summary }}
    - {% endif %} - {% if a.generated_tags %} -
    Tags: {{ a.generated_tags|join("; ") }}
    - {% endif %} - {% if a.content_raw %} -
    - Volltext anzeigen -
    {{ a.content_raw }}
    -
    - {% endif %} -
    Bilder: {{ a.extracted_images|length }}
    - {% if a.extracted_images %} -
    - Bild-URLs -
      - {% for img in a.extracted_images %} -
    • {{ img }}
    • - {% endfor %} -
    -
    - {% endif %} - {% if a.press_contact %} -
    - Pressekontakt -
    {{ a.press_contact }}
    -
    - {% endif %} - {% if a.extraction_error %} -
    Extraktionsfehler: {{ a.extraction_error }}
    - {% endif %} -
    - {% if a.status_ui in ["new", "rewrite"] %} -
    - -
    - {% else %} - - - {% endif %} -
    -
    - - {% if allowed_transitions.get(a.status_ui, []) %} - - {% else %} - keine Aktion - {% endif %} -
    - {% if a.status_ui == 'close' %} -
    - -
    - {% endif %} -
    -
    - -
    -

    Runs

    - - - - - - {% for r in runs %} - - - - - - - - {% endfor %} - -
    IDTypStatusStartEnde
    {{ r.id }}{{ r.run_type }}{{ r.status }}{{ r.started_at }}{{ r.finished_at or "-" }}
    -
    - -
    -

    Publish Jobs

    - - - - - - {% for j in publish_jobs %} - - - - - - - - - - {% endfor %} - -
    IDArtikelStatusAttemptsWP PostFehlerHinweis
    {{ j.id }}#{{ j.article_id }} {{ j.article_title or "-" }}{{ j.status }}{{ j.attempts }}/{{ j.max_attempts }} - {% if j.wp_post_url %} - #{{ j.wp_post_id }} - {% elif j.wp_post_id %} - #{{ j.wp_post_id }} - {% else %} - - - {% endif %} - - {% if j.error_message %} - {{ j.error_category }} -
    {{ j.error_message }}
    - {% else %} - - - {% endif %} -
    {{ j.error_hint or "-" }}
    -
    -
    - - diff --git a/backend/templates/admin_login.html b/backend/templates/admin_login.html deleted file mode 100644 index 10e55e7..0000000 --- a/backend/templates/admin_login.html +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - {{ title }} - - - -
    -

    rss-news Admin

    -

    Bitte anmelden, um das Tool zu verwalten.

    - {% if error %} -
    Login fehlgeschlagen. Bitte pruefen.
    - {% endif %} -
    - - - -
    -
    - - diff --git a/backend/templates/admin_schedule.html b/backend/templates/admin_schedule.html deleted file mode 100644 index 4f2513a..0000000 --- a/backend/templates/admin_schedule.html +++ /dev/null @@ -1,143 +0,0 @@ - - - - - - {{ title }} - - - - -
    -
    -

    rss-news Veröffentlichungsplan

    -

    Angemeldet als {{ user }}

    -
    -
    - Dashboard - Connectivity -
    - -
    -
    -
    - -
    - {% if flash_msg %} -
    - {{ flash_msg }} -
    - {% endif %} - -
    -
    -

    WordPress → DB Synchronisieren

    -

    Liest alle geplanten WP-Beiträge und aktualisiert die Slots in der lokalen DB.
    Nutze dies nach manuellen Änderungen in WordPress.

    -
    -
    - -
    -
    - -
    -

    Slot-Übersicht (nächste 60 Tage)

    -
    - 📅 Belegte Slots gesamt: {{ slots|length }} - 🗄️ Aus Pipeline-DB: {{ slots|selectattr('source', 'eq', 'db')|list|length }} - 🌐 Nur in WordPress: {{ slots|selectattr('source', 'eq', 'wordpress')|list|length }} -
    - - - - - {% for h in hours %} - - {% endfor %} - - - - {% for day in calendar_days %} - {% if day.any_booked %} - - - {% for s in day.slots %} - - {% endfor %} - - {% endif %} - {% endfor %} - -
    Tag{{ "%02d:00 Uhr"|format(h) }}
    {{ day.weekday }} {{ day.date_fmt }} - {% if s.booked %} - {% set info = s.slot %} - {% if info.source == 'db' %} - - DB -
    - {% if info.article_id %} - - {{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %} - - {% endif %} -
    Status: {{ info.article_status }} - {% if info.wp_post_url %} -
    WP öffnen - {% endif %} -
    - {% else %} - ⚠️ - WP -
    {{ info.article_title }}
    - {% endif %} - {% else %} - frei - {% endif %} -
    - {% if not slots %} -

    Keine geplanten Beiträge in den nächsten 60 Tagen.

    - {% endif %} -
    - -
    -

    Alle belegten Slots (Liste)

    - - - - - - {% for s in slots %} - - - - - - - - {% endfor %} - -
    Datum/ZeitQuelleArtikelStatusWordPress
    {{ s.formatted }} - {% if s.source == 'db' %}Pipeline-DB - {% else %}WordPress{% endif %} - - {% if s.article_id %} - {{ (s.article_title or "")[:60] }} - {% else %} - {{ s.article_title or "-" }} - {% endif %} - {{ s.article_status or "-" }} - {% if s.wp_post_url %} - Draft öffnen - {% else %}-{% endif %} -
    -
    -
    - - diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py deleted file mode 100644 index 46816dd..0000000 --- a/backend/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests package.""" diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py deleted file mode 100644 index c7b6ebf..0000000 --- a/backend/tests/test_admin_ui.py +++ /dev/null @@ -1,419 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path -from unittest.mock import patch - -from fastapi.testclient import TestClient - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.main import app -from backend.app.repositories import ( - ArticleUpsert, - FeedCreate, - SourceCreate, - create_feed, - create_source, - get_article_by_id, - upsert_article, -) - - -class TestAdminUi(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "admin_ui.db") - os.environ["APP_ADMIN_USERNAME"] = "admin" - os.environ["APP_ADMIN_PASSWORD"] = "secret" - config_module.get_settings.cache_clear() - init_db() - self.client = TestClient(app) - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - os.environ.pop("APP_DB_PATH", None) - os.environ.pop("APP_ADMIN_USERNAME", None) - os.environ.pop("APP_ADMIN_PASSWORD", None) - self.tmp_dir.cleanup() - - def test_admin_login_and_dashboard(self) -> None: - login_page = self.client.get("/admin/login") - self.assertEqual(login_page.status_code, 200) - self.assertIn("rss-news Admin", login_page.text) - - login = self.client.post( - "/admin/login", - data={"username": "admin", "password": "secret"}, - follow_redirects=True, - ) - self.assertEqual(login.status_code, 200) - self.assertIn("Admin Dashboard", login.text) - - def test_dashboard_redirects_if_not_logged_in(self) -> None: - res = self.client.get("/admin/dashboard", follow_redirects=False) - self.assertEqual(res.status_code, 303) - self.assertEqual(res.headers.get("location"), "/admin/login") - - def test_create_feed_with_empty_source_id_does_not_error(self) -> None: - self.client.post( - "/admin/login", - data={"username": "admin", "password": "secret"}, - follow_redirects=True, - ) - # empty source_id used to cause validation issues in form parsing - res = self.client.post( - "/admin/feeds/create", - data={"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""}, - follow_redirects=False, - ) - self.assertEqual(res.status_code, 303) - self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard")) - - def test_article_detail_page_renders(self) -> None: - source_id = create_source( - SourceCreate( - name="Test Source", - base_url="https://example.org", - terms_url="https://example.org/terms", - license_name="cc-by", - risk_level="green", - is_enabled=True, - notes=None, - last_reviewed_at="2026-02-18T00:00:00Z", - ) - ) - feed_id = create_feed( - FeedCreate( - name="Test Feed", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - article_id = upsert_article( - ArticleUpsert( - feed_id=feed_id, - source_article_id="id-1", - source_hash="hash-1", - title="Titel A", - source_url="https://example.org/a", - canonical_url="https://example.org/a", - published_at=None, - author="Autor A", - summary="Summary A", - content_raw="Volltext A", - content_rewritten=None, - image_urls_json='["https://example.org/img.jpg"]', - press_contact="Kontakt", - source_name_snapshot="Test Source", - source_terms_url_snapshot="https://example.org/terms", - source_license_name_snapshot="cc-by", - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=2, - status="new", - meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}', - ) - ) - - self.client.post( - "/admin/login", - data={"username": "admin", "password": "secret"}, - follow_redirects=True, - ) - res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True) - self.assertEqual(res.status_code, 200) - self.assertIn("Artikel-Detail", res.text) - self.assertIn("Checkliste", res.text) - - decision = self.client.post( - f"/admin/articles/{article_id}/images/decision", - data={"image_url": "https://example.org/img.jpg", "action": "select"}, - follow_redirects=True, - ) - self.assertEqual(decision.status_code, 200) - self.assertIn("Ausgewähltes Hauptbild", decision.text) - - article = get_article_by_id(article_id) - self.assertIsNotNone(article) - self.assertIn("selected_url", article.get("meta_json", "")) - - def test_manage_source_and_feed(self) -> None: - source_id = create_source( - SourceCreate( - name="Edit Source", - base_url="https://example.org", - terms_url="https://example.org/terms", - license_name="cc-by", - risk_level="yellow", - is_enabled=True, - notes=None, - last_reviewed_at=None, - ) - ) - feed_id = create_feed( - FeedCreate( - name="Edit Feed", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) - - update_source_res = self.client.post( - f"/admin/sources/{source_id}/update", - data={ - "name": "Edit Source 2", - "base_url": "https://example.org/new", - "terms_url": "https://example.org/new-terms", - "license_name": "cc0", - "risk_level": "green", - "is_enabled": "1", - "notes": "ok", - "last_reviewed_at": "2026-02-21T12:00:00Z", - }, - follow_redirects=False, - ) - self.assertEqual(update_source_res.status_code, 303) - - update_feed_res = self.client.post( - f"/admin/feeds/{feed_id}/update", - data={ - "name": "Edit Feed 2", - "url": "https://example.org/feed2.xml", - "source_id": str(source_id), - "is_enabled": "0", - }, - follow_redirects=False, - ) - self.assertEqual(update_feed_res.status_code, 303) - - delete_feed_res = self.client.post(f"/admin/feeds/{feed_id}/delete", follow_redirects=False) - self.assertEqual(delete_feed_res.status_code, 303) - delete_source_res = self.client.post(f"/admin/sources/{source_id}/delete", follow_redirects=False) - self.assertEqual(delete_source_res.status_code, 303) - - def test_rewrite_save_and_reopen(self) -> None: - source_id = create_source( - SourceCreate( - name="Test Source", - base_url="https://example.org", - terms_url="https://example.org/terms", - license_name="cc-by", - risk_level="green", - is_enabled=True, - notes=None, - last_reviewed_at="2026-02-18T00:00:00Z", - ) - ) - feed_id = create_feed( - FeedCreate( - name="Test Feed", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - article_id = upsert_article( - ArticleUpsert( - feed_id=feed_id, - source_article_id="id-published", - source_hash="hash-published", - title="Titel Published", - source_url="https://example.org/published", - canonical_url="https://example.org/published", - published_at=None, - author="Autor A", - summary="Summary", - content_raw="Raw", - content_rewritten="

    Alt

    ", - image_urls_json=None, - press_contact=None, - source_name_snapshot="Test Source", - source_terms_url_snapshot="https://example.org/terms", - source_license_name_snapshot="cc-by", - legal_checked=True, - legal_checked_at="2026-02-21T10:00:00Z", - legal_note=None, - wp_post_id=123, - wp_post_url="https://example.org/?p=123", - publish_attempts=2, - publish_last_error=None, - published_to_wp_at="2026-02-21T10:10:00Z", - word_count=1, - status="published", - meta_json="{}", - ) - ) - self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) - - save_res = self.client.post( - f"/admin/articles/{article_id}/rewrite-save", - data={"content_rewritten": "

    Neu

    Text

    "}, - follow_redirects=False, - ) - self.assertEqual(save_res.status_code, 303) - - reopen_res = self.client.post(f"/admin/articles/{article_id}/reopen", follow_redirects=False) - self.assertEqual(reopen_res.status_code, 303) - - article = get_article_by_id(article_id) - self.assertIsNotNone(article) - self.assertEqual(article.get("status"), "rewrite") - self.assertIn("Neu", article.get("content_rewritten") or "") - self.assertIsNone(article.get("wp_post_id")) - - @patch("backend.app.admin_ui.generate_article_tags") - @patch("backend.app.admin_ui.rewrite_article_text") - def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None: - mock_rewrite_text.return_value = "

    Neu

    Text

    " - mock_tags.return_value = ["Rheingas", "Monheim"] - - source_id = create_source( - SourceCreate( - name="Batch Source", - base_url="https://example.org", - terms_url="https://example.org/terms", - license_name="cc-by", - risk_level="green", - is_enabled=True, - notes=None, - last_reviewed_at=None, - ) - ) - feed_id = create_feed( - FeedCreate( - name="Batch Feed", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - article_id = upsert_article( - ArticleUpsert( - feed_id=feed_id, - source_article_id="batch-1", - source_hash="batch-hash-1", - title="Batch Titel", - source_url="https://example.org/batch", - canonical_url="https://example.org/batch", - published_at=None, - author="Autor", - summary="Summary", - content_raw="Raw", - content_rewritten=None, - image_urls_json=None, - press_contact=None, - source_name_snapshot="Batch Source", - source_terms_url_snapshot="https://example.org/terms", - source_license_name_snapshot="cc-by", - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=1, - status="rewrite", - meta_json="{}", - ) - ) - self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True) - res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False) - self.assertEqual(res.status_code, 303) - article = get_article_by_id(article_id) - self.assertIsNotNone(article) - self.assertEqual(article.get("status"), "approved") - self.assertIn("generated_tags", article.get("meta_json", "")) - - @patch("backend.app.admin_ui.urlopen") - def test_image_proxy_returns_image_data(self, mock_urlopen) -> None: - class _FakeHeaders: - def get(self, key: str, default=None): - if key.lower() == "content-type": - return "image/jpeg" - return default - - class _FakeResponse: - headers = _FakeHeaders() - - def read(self): - return b"\xff\xd8\xff\xd9" - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - - mock_urlopen.return_value = _FakeResponse() - - self.client.post( - "/admin/login", - data={"username": "admin", "password": "secret"}, - follow_redirects=True, - ) - res = self.client.get("/admin/images/proxy?url=https%3A%2F%2Fexample.org%2Fimg.jpg") - self.assertEqual(res.status_code, 200) - self.assertIn("image/jpeg", res.headers.get("content-type", "")) - - @patch("backend.app.admin_ui._run_connectivity_check") - @patch("backend.app.admin_ui._build_connectivity_targets") - def test_connectivity_page_renders(self, mock_targets, mock_check) -> None: - mock_targets.return_value = [ - {"label": "OpenAI API", "kind": "host", "value": "api.openai.com"}, - {"label": "WordPress REST", "kind": "url", "value": "https://example.org/wp-json/wp/v2"}, - ] - mock_check.side_effect = [ - { - "label": "OpenAI API", - "kind": "host", - "target": "api.openai.com", - "dns_ok": True, - "dns_info": "1.2.3.4", - "tcp_ok": True, - "tcp_info": "port 443 erreichbar", - "http_ok": True, - "http_info": "n/a (host-only)", - "duration_ms": 12, - "ok": True, - }, - { - "label": "WordPress REST", - "kind": "url", - "target": "https://example.org/wp-json/wp/v2", - "dns_ok": False, - "dns_info": "Name or service not known", - "tcp_ok": False, - "tcp_info": "-", - "http_ok": False, - "http_info": "-", - "duration_ms": 10, - "ok": False, - }, - ] - - self.client.post( - "/admin/login", - data={"username": "admin", "password": "secret"}, - follow_redirects=True, - ) - res = self.client.get("/admin/connectivity", follow_redirects=True) - self.assertEqual(res.status_code, 200) - self.assertIn("Connectivity Check", res.text) - self.assertIn("OpenAI API", res.text) - self.assertIn("WordPress REST", res.text) - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_api_auth.py b/backend/tests/test_api_auth.py deleted file mode 100644 index 96fbe85..0000000 --- a/backend/tests/test_api_auth.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path - -from fastapi.testclient import TestClient - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.main import app - - -class TestApiAuth(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "api.db") - os.environ["APP_ADMIN_USERNAME"] = "admin" - os.environ["APP_ADMIN_PASSWORD"] = "secret" - config_module.get_settings.cache_clear() - init_db() - self.client = TestClient(app) - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - os.environ.pop("APP_DB_PATH", None) - os.environ.pop("APP_ADMIN_USERNAME", None) - os.environ.pop("APP_ADMIN_PASSWORD", None) - self.tmp_dir.cleanup() - - def test_login_and_protected_endpoint(self) -> None: - r = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - self.assertEqual(r.status_code, 200) - - p = self.client.get("/api/protected") - self.assertEqual(p.status_code, 200) - self.assertTrue(p.json().get("ok")) - - def test_protected_requires_auth(self) -> None: - r = self.client.get("/api/protected") - self.assertEqual(r.status_code, 401) - - def test_run_detail_endpoint(self) -> None: - login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - self.assertEqual(login.status_code, 200) - - created = self.client.post("/api/runs", json={"run_type": "ingestion", "status": "running"}) - self.assertEqual(created.status_code, 200) - run_id = created.json()["id"] - - detail = self.client.get(f"/api/runs/{run_id}") - self.assertEqual(detail.status_code, 200) - self.assertEqual(detail.json()["item"]["id"], run_id) - - def test_source_policy_check_endpoint(self) -> None: - login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - self.assertEqual(login.status_code, 200) - - created = self.client.post( - "/api/sources", - json={ - "name": "Policy Source", - "risk_level": "yellow", - "is_enabled": True, - }, - ) - self.assertEqual(created.status_code, 200) - source_id = created.json()["id"] - - check = self.client.get(f"/api/sources/{source_id}/policy-check") - self.assertEqual(check.status_code, 200) - body = check.json() - self.assertFalse(body["allowed"]) - self.assertGreaterEqual(len(body["issues"]), 1) - - def test_articles_export_json_and_csv_contains_relevance(self) -> None: - login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - self.assertEqual(login.status_code, 200) - - source = self.client.post( - "/api/sources", - json={ - "name": "Export Source", - "base_url": "https://example.org", - "terms_url": "https://example.org/terms", - "license_name": "cc-by", - "risk_level": "green", - "is_enabled": True, - "last_reviewed_at": "2026-02-18T00:00:00Z", - }, - ) - self.assertEqual(source.status_code, 200) - source_id = source.json()["id"] - - feed = self.client.post( - "/api/feeds", - json={"name": "Export Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, - ) - self.assertEqual(feed.status_code, 200) - feed_id = feed.json()["id"] - - article = self.client.post( - "/api/articles/upsert", - json={ - "feed_id": feed_id, - "source_article_id": "exp-1", - "source_hash": "exp-hash-1", - "title": "Export Artikel", - "source_url": "https://example.org/article/1", - "canonical_url": "https://example.org/article/1", - "published_at": "2026-02-18T00:00:00Z", - "author": "Autor", - "summary": "Kurz", - "content_raw": "Langtext", - "image_urls_json": "[\"https://example.org/img.jpg\"]", - "press_contact": "Kontakt", - "source_name_snapshot": "Export Source", - "source_terms_url_snapshot": "https://example.org/terms", - "source_license_name_snapshot": "cc-by", - "status": "review", - }, - ) - self.assertEqual(article.status_code, 200) - - export_json = self.client.get("/api/articles/export?format=json") - self.assertEqual(export_json.status_code, 200) - body = export_json.json() - self.assertTrue(body.get("ok")) - self.assertGreaterEqual(body.get("count", 0), 1) - first = body["items"][0] - self.assertIn("published_at", first) - self.assertIn("days_old", first) - self.assertIn("relevance", first) - - export_csv = self.client.get("/api/articles/export?format=csv") - self.assertEqual(export_csv.status_code, 200) - self.assertIn("text/csv", export_csv.headers.get("content-type", "")) - csv_text = export_csv.text - self.assertIn("published_at", csv_text) - self.assertIn("days_old", csv_text) - self.assertIn("relevance", csv_text) - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py deleted file mode 100644 index 094b595..0000000 --- a/backend/tests/test_article_workflow.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path - -from fastapi.testclient import TestClient -from unittest.mock import patch - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.main import app - - -class TestArticleWorkflow(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "workflow.db") - os.environ["APP_ADMIN_USERNAME"] = "admin" - os.environ["APP_ADMIN_PASSWORD"] = "secret" - config_module.get_settings.cache_clear() - init_db() - self.client = TestClient(app) - self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - os.environ.pop("APP_DB_PATH", None) - os.environ.pop("APP_ADMIN_USERNAME", None) - os.environ.pop("APP_ADMIN_PASSWORD", None) - self.tmp_dir.cleanup() - - def _create_article(self) -> int: - source = self.client.post( - "/api/sources", - json={ - "name": "Workflow Source", - "base_url": "https://example.org", - "terms_url": "https://example.org/terms", - "license_name": "cc-by", - "risk_level": "green", - "is_enabled": True, - "last_reviewed_at": "2026-02-18T00:00:00Z", - }, - ) - source_id = source.json()["id"] - - feed = self.client.post( - "/api/feeds", - json={"name": "Workflow Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, - ) - feed_id = feed.json()["id"] - - article = self.client.post( - "/api/articles/upsert", - json={ - "feed_id": feed_id, - "source_article_id": "wf-1", - "source_url": "https://example.org/a1", - "title": "Workflow Artikel", - "summary": "s", - "content_raw": "c", - "status": "new", - }, - ) - return article.json()["id"] - - def test_valid_transition_chain(self) -> None: - article_id = self._create_article() - - t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) - self.assertEqual(t1.status_code, 200) - - t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "publish"}) - self.assertEqual(t2.status_code, 200) - - t3 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) - self.assertEqual(t3.status_code, 200) - - t4 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) - self.assertEqual(t4.status_code, 200) - - final = self.client.get(f"/api/articles/{article_id}") - self.assertEqual(final.status_code, 200) - self.assertEqual(final.json()["item"]["status"], "rewrite") - self.assertEqual(final.json()["item"]["status_ui"], "rewrite") - - def test_invalid_transition_rejected(self) -> None: - article_id = self._create_article() - bad = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) - self.assertEqual(bad.status_code, 400) - - def test_legacy_review_endpoint_is_gone(self) -> None: - article_id = self._create_article() - bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"}) - self.assertEqual(bad.status_code, 410) - - @patch("backend.app.main.rewrite_article_text") - def test_rewrite_run_sets_publish_status(self, mock_rewrite) -> None: - mock_rewrite.return_value = "

    Neu

    Umschreibung

    " - article_id = self._create_article() - self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"}) - r = self.client.post(f"/api/articles/{article_id}/rewrite-run") - self.assertEqual(r.status_code, 200) - self.assertEqual(r.json()["status"], "publish") - final = self.client.get(f"/api/articles/{article_id}") - self.assertEqual(final.json()["item"]["status_ui"], "publish") - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_db_repositories.py b/backend/tests/test_db_repositories.py deleted file mode 100644 index 91436c6..0000000 --- a/backend/tests/test_db_repositories.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.repositories import ( - ArticleUpsert, - FeedCreate, - RunCreate, - SourceCreate, - create_feed, - create_run, - create_source, - finish_run, - list_articles, - list_feeds, - list_runs, - list_sources, - upsert_article, -) - - -class TestSQLiteRepositories(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - self.db_path = str(Path(self.tmp_dir.name) / "test.db") - os.environ["APP_DB_PATH"] = self.db_path - config_module.get_settings.cache_clear() - init_db() - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - os.environ.pop("APP_DB_PATH", None) - self.tmp_dir.cleanup() - - def test_end_to_end_basic_crud(self) -> None: - source_id = create_source( - SourceCreate( - name="GovData", - base_url="https://data.gov.de", - terms_url="https://www.govdata.de/dl-de/by-2-0", - license_name="dl-de/by-2-0", - risk_level="green", - is_enabled=True, - notes="test source", - last_reviewed_at="2026-02-18T00:00:00Z", - ) - ) - self.assertGreater(source_id, 0) - - feed_id = create_feed( - FeedCreate( - name="GovData RSS", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - self.assertGreater(feed_id, 0) - - run_id = create_run(RunCreate(run_type="ingest", status="running", details="start")) - self.assertGreater(run_id, 0) - finish_run(run_id=run_id, status="success", details="ok") - - article_id = upsert_article( - ArticleUpsert( - feed_id=feed_id, - source_article_id="abc-1", - source_hash="hash-abc-1", - title="Beispielartikel", - source_url="https://example.org/articles/1", - canonical_url="https://example.org/articles/1", - published_at="2026-02-18T00:00:00Z", - author="Max Mustermann", - summary="Kurzfassung", - content_raw="Originaltext", - content_rewritten="Umschreibung", - image_urls_json='["https://example.org/img.jpg"]', - press_contact="Pressekontakt X", - source_name_snapshot="GovData", - source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0", - source_license_name_snapshot="dl-de/by-2-0", - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=120, - status="review", - meta_json='{"lang":"de"}', - ) - ) - self.assertGreater(article_id, 0) - - # Upsert with same source_url updates same row - article_id_2 = upsert_article( - ArticleUpsert( - feed_id=feed_id, - source_article_id="abc-1", - source_hash="hash-abc-1", - title="Beispielartikel aktualisiert", - source_url="https://example.org/articles/1", - canonical_url="https://example.org/articles/1", - published_at="2026-02-18T00:00:00Z", - author="Max Mustermann", - summary="Kurzfassung 2", - content_raw="Originaltext 2", - content_rewritten="Umschreibung 2", - image_urls_json='["https://example.org/img2.jpg"]', - press_contact="Pressekontakt Y", - source_name_snapshot="GovData", - source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0", - source_license_name_snapshot="dl-de/by-2-0", - legal_checked=True, - legal_checked_at="2026-02-18T00:10:00Z", - legal_note="ok", - wp_post_id=123, - wp_post_url="https://example.org/wp/123", - publish_attempts=1, - publish_last_error=None, - published_to_wp_at="2026-02-18T00:12:00Z", - word_count=140, - status="approved", - meta_json='{"lang":"de","v":2}', - ) - ) - self.assertEqual(article_id, article_id_2) - - self.assertEqual(len(list_sources()), 1) - self.assertEqual(len(list_feeds()), 1) - self.assertEqual(len(list_runs()), 1) - - articles = list_articles() - self.assertEqual(len(articles), 1) - self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert") - self.assertEqual(articles[0]["status"], "approved") - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py deleted file mode 100644 index 82bd2ca..0000000 --- a/backend/tests/test_ingestion.py +++ /dev/null @@ -1,245 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path -from unittest.mock import patch - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.ingestion import run_ingestion -from backend.app.repositories import ( - ArticleUpsert, - FeedCreate, - SourceCreate, - create_feed, - create_source, - get_article_by_id, - list_articles, - upsert_article, -) -from backend.app.source_extraction import ExtractedArticle - - -class TestIngestion(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "ingestion.db") - config_module.get_settings.cache_clear() - init_db() - - source_id = create_source( - SourceCreate( - name="Test Source", - base_url="https://example.org", - terms_url="https://example.org/terms", - license_name="cc-by", - risk_level="green", - is_enabled=True, - notes=None, - last_reviewed_at="2026-02-18T00:00:00Z", - ) - ) - self.feed_id = create_feed( - FeedCreate( - name="Test Feed", - url="https://example.org/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - os.environ.pop("APP_DB_PATH", None) - self.tmp_dir.cleanup() - - @patch("backend.app.ingestion.extract_article") - @patch("backend.app.ingestion.feedparser.parse") - def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None: - mock_extract_article.return_value = ExtractedArticle( - title="Artikel 1 original", - author="Autorin A", - canonical_url="https://example.org/article/1", - summary="Original Summary", - content_text="Original Volltext", - images=["https://example.org/a.jpg"], - press_contact="Pressekontakt: Team A", - extraction_error=None, - ) - mock_parse.return_value = { - "etag": "etag-1", - "modified": "Tue, 18 Feb 2026 10:00:00 GMT", - "entries": [ - { - "id": "item-1", - "title": "Artikel 1", - "link": "https://example.org/article/1", - "summary": "A", - }, - { - "id": "item-1", - "title": "Artikel 1 aktualisiert", - "link": "https://example.org/article/1-neu", - "summary": "B", - }, - ], - } - - stats = run_ingestion(feed_id=self.feed_id) - self.assertEqual(stats.status, "success") - self.assertEqual(stats.entries_seen, 2) - self.assertEqual(len(list_articles()), 1) - article = list_articles()[0] - self.assertEqual(article["title"], "Artikel 1 original") - self.assertEqual(article["author"], "Autorin A") - self.assertIn("Original Volltext", article["content_raw"] or "") - self.assertIn("Pressekontakt", article["meta_json"] or "") - self.assertIsNotNone(article["image_urls_json"]) - - @patch("backend.app.ingestion.extract_article") - @patch("backend.app.ingestion.feedparser.parse") - def test_ingestion_processes_any_enabled_source(self, mock_parse, mock_extract_article) -> None: - # Ampel/risk-level system removed – all enabled feeds are processed regardless of risk_level - source_id = create_source( - SourceCreate( - name="Any Risk Source", - base_url="https://example.net", - terms_url="https://example.net/terms", - license_name="custom", - risk_level="yellow", - is_enabled=True, - notes=None, - last_reviewed_at="2026-02-18T00:00:00Z", - ) - ) - feed_id = create_feed( - FeedCreate( - name="Any Risk Feed", - url="https://example.net/feed.xml", - source_id=source_id, - is_enabled=True, - ) - ) - - mock_parse.return_value = type("FP", (), {"entries": [], "etag": None, "modified": None})() - mock_extract_article.return_value = type("E", (), { - "title": None, "author": None, "summary": None, "content_text": None, - "canonical_url": None, "images": [], "press_contact": None, - })() - - stats = run_ingestion(feed_id=feed_id) - self.assertEqual(stats.status, "success") - # Feed was processed (feedparser was called), even with yellow risk_level - mock_parse.assert_called_once() - - @patch("backend.app.ingestion.extract_article") - @patch("backend.app.ingestion.feedparser.parse") - def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None: - existing_closed_id = upsert_article( - ArticleUpsert( - feed_id=self.feed_id, - source_article_id="closed-1", - source_hash="closed-hash-1", - title="Alt Closed", - source_url="https://example.org/closed-article", - canonical_url="https://example.org/closed-article", - published_at=None, - author="Autor", - summary="Alt", - content_raw="Alt Raw", - content_rewritten="

    Alt Rewrite Closed

    ", - image_urls_json=None, - press_contact="Kontakt Alt", - source_name_snapshot="Test Source", - source_terms_url_snapshot="https://example.org/terms", - source_license_name_snapshot="cc-by", - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=42, - wp_post_url="https://wp.local/?p=42", - publish_attempts=2, - publish_last_error=None, - published_to_wp_at="2026-02-21T12:00:00Z", - word_count=3, - status="error", # UI: close - meta_json='{"generated_tags":["AltTag"]}', - ) - ) - existing_published_id = upsert_article( - ArticleUpsert( - feed_id=self.feed_id, - source_article_id="published-1", - source_hash="published-hash-1", - title="Alt Published", - source_url="https://example.org/published-article", - canonical_url="https://example.org/published-article", - published_at=None, - author="Autor", - summary="Alt", - content_raw="Alt Raw", - content_rewritten="

    Alt Rewrite Published

    ", - image_urls_json=None, - press_contact="Kontakt Alt", - source_name_snapshot="Test Source", - source_terms_url_snapshot="https://example.org/terms", - source_license_name_snapshot="cc-by", - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=77, - wp_post_url="https://wp.local/?p=77", - publish_attempts=3, - publish_last_error=None, - published_to_wp_at="2026-02-21T12:10:00Z", - word_count=3, - status="published", - meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}', - ) - ) - - mock_extract_article.return_value = ExtractedArticle( - title="Neu Titel", - author="Neu Autor", - canonical_url=None, - summary="Neu Summary", - content_text="Neu Volltext", - images=["https://example.org/a.jpg"], - press_contact=None, - extraction_error=None, - ) - mock_parse.return_value = { - "etag": "etag-2", - "modified": "Tue, 18 Feb 2026 11:00:00 GMT", - "entries": [ - { - "id": "closed-1", - "title": "Closed Entry", - "link": "https://example.org/closed-article", - "summary": "X", - }, - { - "id": "published-1", - "title": "Published Entry", - "link": "https://example.org/published-article", - "summary": "Y", - }, - ], - } - - stats = run_ingestion(feed_id=self.feed_id) - self.assertEqual(stats.status, "success") - closed_row = get_article_by_id(existing_closed_id) or {} - self.assertEqual(closed_row["status"], "error") - self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "") - self.assertEqual(closed_row.get("wp_post_id"), 42) - - published_row = get_article_by_id(existing_published_id) or {} - self.assertEqual(published_row["status"], "published") - self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "") - self.assertEqual(published_row.get("wp_post_id"), 77) - self.assertIn("generated_tags", published_row.get("meta_json") or "") - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_publisher.py b/backend/tests/test_publisher.py deleted file mode 100644 index a32150e..0000000 --- a/backend/tests/test_publisher.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -import tempfile -import unittest -from pathlib import Path -from unittest.mock import patch - -from fastapi.testclient import TestClient - -from backend.app import config as config_module -from backend.app.db import init_db -from backend.app.main import app - - -class TestPublisher(unittest.TestCase): - def setUp(self) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() - os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "publisher.db") - os.environ["APP_ADMIN_USERNAME"] = "admin" - os.environ["APP_ADMIN_PASSWORD"] = "secret" - os.environ["WORDPRESS_BASE_URL"] = "https://example.org" - os.environ["WORDPRESS_USERNAME"] = "wp-user" - os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" - config_module.get_settings.cache_clear() - init_db() - self.client = TestClient(app) - self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) - - def tearDown(self) -> None: - config_module.get_settings.cache_clear() - for key in ( - "APP_DB_PATH", - "APP_ADMIN_USERNAME", - "APP_ADMIN_PASSWORD", - "WORDPRESS_BASE_URL", - "WORDPRESS_USERNAME", - "WORDPRESS_APP_PASSWORD", - ): - os.environ.pop(key, None) - self.tmp_dir.cleanup() - - def _create_publishable_article(self) -> int: - source = self.client.post( - "/api/sources", - json={ - "name": "WP Source", - "base_url": "https://example.org", - "terms_url": "https://example.org/terms", - "license_name": "cc-by", - "risk_level": "green", - "is_enabled": True, - "last_reviewed_at": "2026-02-18T00:00:00Z", - }, - ) - source_id = source.json()["id"] - feed = self.client.post( - "/api/feeds", - json={"name": "WP Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, - ) - feed_id = feed.json()["id"] - - article = self.client.post( - "/api/articles/upsert", - json={ - "feed_id": feed_id, - "source_article_id": "pub-1", - "source_hash": "pub-hash-1", - "title": "Publish Artikel", - "source_url": "https://example.org/article/1", - "canonical_url": "https://example.org/article/1", - "published_at": "2026-02-18T00:00:00Z", - "author": "Autor", - "summary": "Kurz", - "content_raw": "Langtext", - "image_urls_json": "[\"https://example.org/img.jpg\"]", - "press_contact": "Kontakt", - "source_name_snapshot": "WP Source", - "source_terms_url_snapshot": "https://example.org/terms", - "source_license_name_snapshot": "cc-by", - "legal_checked": True, - "status": "approved", - "meta_json": "{\"image_review\":{\"selected_url\":\"https://example.org/img.jpg\"}}", - }, - ) - return article.json()["id"] - - @patch("backend.app.publisher.publish_article_draft") - def test_enqueue_and_run_publisher(self, mock_publish) -> None: - mock_publish.return_value = (777, "https://example.org/?p=777") - article_id = self._create_publishable_article() - - enqueue = self.client.post("/api/publisher/enqueue", json={"article_id": article_id, "max_attempts": 3}) - self.assertEqual(enqueue.status_code, 200) - - run = self.client.post("/api/publisher/run", json={"max_jobs": 5}) - self.assertEqual(run.status_code, 200) - stats = run.json()["stats"] - self.assertEqual(stats["success"], 1) - - article = self.client.get(f"/api/articles/{article_id}") - self.assertEqual(article.status_code, 200) - item = article.json()["item"] - self.assertEqual(item["status"], "published") - self.assertEqual(item["wp_post_id"], 777) - self.assertIn("?p=777", item["wp_post_url"] or "") - - jobs = self.client.get("/api/publisher/jobs") - self.assertEqual(jobs.status_code, 200) - self.assertGreaterEqual(len(jobs.json()["items"]), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_relevance.py b/backend/tests/test_relevance.py deleted file mode 100644 index 573e312..0000000 --- a/backend/tests/test_relevance.py +++ /dev/null @@ -1,21 +0,0 @@ -from datetime import datetime, timezone -import unittest - -from backend.app.relevance import article_age_days, article_relevance - - -class TestRelevance(unittest.TestCase): - def test_article_age_and_relevance(self) -> None: - now = datetime(2026, 2, 18, 12, 0, 0, tzinfo=timezone.utc) - self.assertEqual(article_age_days("2026-02-18T10:00:00Z", now=now), 0) - self.assertEqual(article_relevance("2026-02-18T10:00:00Z", now=now), "hoch") - - self.assertEqual(article_age_days("2026-02-14T12:00:00Z", now=now), 4) - self.assertEqual(article_relevance("2026-02-14T12:00:00Z", now=now), "mittel") - - self.assertEqual(article_relevance("2025-12-01T00:00:00Z", now=now), "alt") - self.assertEqual(article_relevance(None, now=now), "unbekannt") - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_source_extraction.py b/backend/tests/test_source_extraction.py deleted file mode 100644 index 5cafde7..0000000 --- a/backend/tests/test_source_extraction.py +++ /dev/null @@ -1,96 +0,0 @@ -import unittest -from unittest.mock import patch - -from backend.app.source_extraction import extract_article - - -SAMPLE_HTML = """ - - - - - - - - - - - -
    -

    Dies ist der vollstaendige Inhalt des Artikels.

    -

    Weitere relevante Informationen fuer die Meldung.

    -

    Pressekontakt

    -

    Musterfirma GmbH, Kontakt: presse@example.org

    -
    - - -""" - -SAMPLE_HTML_AGENTUR = """ - - - - - - - -
    -

    Inhalt der Meldung.

    -

    Agentur

    -

    Agenturname GmbH

    -

    presse@agentur.example

    -

    Original-Content von Beispiel

    -
    - - -""" - - -class _FakeHeaders: - @staticmethod - def get_content_charset(): - return "utf-8" - - -class _FakeResponse: - headers = _FakeHeaders() - - def __init__(self, body: str): - self._body = body.encode("utf-8") - - def read(self): - return self._body - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - - -class TestSourceExtraction(unittest.TestCase): - @patch("backend.app.source_extraction.urlopen") - def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None: - mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML) - - extracted = extract_article("https://www.presseportal.de/pm/118273/6158137") - self.assertEqual(extracted.title, "Demo Meldung von Presseportal") - self.assertEqual(extracted.author, "Max Mustermann") - self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137") - self.assertIn("vollstaendige Inhalt", extracted.content_text or "") - self.assertIn("Kurzbeschreibung", extracted.summary or "") - self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images) - self.assertIn("Pressekontakt", extracted.press_contact or "") - self.assertIsNone(extracted.extraction_error) - - @patch("backend.app.source_extraction.urlopen") - def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None: - mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR) - extracted = extract_article("https://www.presseportal.de/pm/155103/6210401") - self.assertIn("Agentur", extracted.press_contact or "") - self.assertIn("Agenturname", extracted.press_contact or "") - self.assertIn("presse@agentur.example", extracted.press_contact or "") - - -if __name__ == "__main__": - unittest.main() diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py deleted file mode 100644 index 20b0618..0000000 --- a/backend/tests/test_wordpress.py +++ /dev/null @@ -1,139 +0,0 @@ -import os -import unittest -from unittest.mock import patch - -from backend.app import config as config_module -from backend.app.wordpress import publish_article_draft - - -class TestWordpressPublish(unittest.TestCase): - def setUp(self) -> None: - os.environ["WORDPRESS_BASE_URL"] = "https://example.org" - os.environ["WORDPRESS_USERNAME"] = "wp-user" - os.environ["WORDPRESS_APP_PASSWORD"] = "wp-pass" - config_module.get_settings.cache_clear() - - def tearDown(self) -> None: - for key in ("WORDPRESS_BASE_URL", "WORDPRESS_USERNAME", "WORDPRESS_APP_PASSWORD"): - os.environ.pop(key, None) - config_module.get_settings.cache_clear() - - @patch("backend.app.wordpress._upload_featured_media") - @patch("backend.app.wordpress._wp_request") - def test_publish_sets_featured_media_when_selected_image_exists(self, mock_wp_request, mock_upload_media) -> None: - mock_upload_media.return_value = 456 - mock_wp_request.return_value = {"id": 321, "link": "https://example.org/?p=321"} - - article = { - "title": "Testartikel", - "content_raw": "Inhalt", - "source_url": "https://example.com/source", - "canonical_url": "https://example.com/source", - "meta_json": '{"image_review":{"selected_url":"https://example.com/image.jpg"}}', - } - post_id, post_url = publish_article_draft(article) - - self.assertEqual(post_id, 321) - self.assertIn("?p=321", post_url or "") - self.assertTrue(mock_upload_media.called) - payload = mock_wp_request.call_args.kwargs["payload"] - self.assertEqual(payload.get("featured_media"), 456) - self.assertIn("", payload.get("content", "")) - self.assertIn("

    Inhalt

    ", payload.get("content", "")) - self.assertNotIn("excerpt", payload) - - @patch("backend.app.wordpress._upload_featured_media") - @patch("backend.app.wordpress._wp_request") - def test_publish_without_selected_image_has_no_featured_media(self, mock_wp_request, mock_upload_media) -> None: - mock_wp_request.return_value = {"id": 654, "link": "https://example.org/?p=654"} - - article = { - "title": "Testartikel", - "content_raw": "Inhalt", - "source_url": "https://example.com/source", - "canonical_url": "https://example.com/source", - "meta_json": "{}", - } - post_id, _ = publish_article_draft(article) - - self.assertEqual(post_id, 654) - self.assertFalse(mock_upload_media.called) - payload = mock_wp_request.call_args.kwargs["payload"] - self.assertNotIn("featured_media", payload) - self.assertIn("

    Inhalt

    ", payload.get("content", "")) - - @patch("backend.app.wordpress._upload_featured_media") - @patch("backend.app.wordpress._wp_request") - def test_publish_strips_feed_header_and_press_contact(self, mock_wp_request, mock_upload_media) -> None: - mock_wp_request.return_value = {"id": 100, "link": "https://example.org/?p=100"} - article = { - "title": "Header Test", - "content_raw": "21.02.2026 10:00\nFirma GmbH\n(ots)\nDas ist der eigentliche Text.\nPressekontakt: Test Person", - "source_url": "https://example.com/source", - "canonical_url": "https://example.com/source", - "meta_json": "{}", - } - publish_article_draft(article) - payload = mock_wp_request.call_args.kwargs["payload"] - content = payload.get("content", "") - self.assertNotIn("Firma GmbH", content) - self.assertNotIn("Pressekontakt", content) - self.assertIn("eigentliche Text", content) - self.assertNotIn("Artikeldetails", content) - - @patch("backend.app.wordpress._upload_featured_media") - @patch("backend.app.wordpress._wp_request") - def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None: - def _fake_wp_request(**kwargs): - endpoint = kwargs.get("endpoint", "") - method = kwargs.get("method", "") - if method == "GET" and endpoint.startswith("tags?search="): - if "Rheingas" in endpoint: - return [{"id": 11, "name": "Rheingas"}] - return [] - if method == "POST" and endpoint == "tags": - name = (kwargs.get("payload") or {}).get("name") - if name == "Gasflasche": - return {"id": 12, "name": "Gasflasche"} - return {"id": 13, "name": str(name)} - if method == "POST" and endpoint == "posts": - return {"id": 900, "link": "https://example.org/?p=900"} - return {} - - mock_wp_request.side_effect = _fake_wp_request - article = { - "title": "Tag Test", - "content_raw": "Inhalt", - "source_url": "https://example.com/source", - "canonical_url": "https://example.com/source", - "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}', - } - post_id, _ = publish_article_draft(article) - self.assertEqual(post_id, 900) - post_calls = [call for call in mock_wp_request.call_args_list if call.kwargs.get("endpoint") == "posts"] - self.assertEqual(len(post_calls), 1) - payload = post_calls[0].kwargs.get("payload", {}) - self.assertEqual(payload.get("tags"), [11, 12]) - - @patch("backend.app.wordpress._upload_featured_media") - @patch("backend.app.wordpress._wp_request") - def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None: - mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"} - article = { - "title": "Block Test", - "content_rewritten": "

    Überschrift

    Absatz 1

    • A
    • B
    ", - "source_url": "https://example.com/source", - "canonical_url": "https://example.com/source", - "meta_json": "{}", - } - publish_article_draft(article) - payload = mock_wp_request.call_args.kwargs["payload"] - content = payload.get("content", "") - self.assertIn("", content) - self.assertIn("", content) - self.assertNotIn("", content) - - -if __name__ == "__main__": - unittest.main() diff --git a/docs/AUTOMATION.md b/docs/AUTOMATION.md deleted file mode 100644 index 8008857..0000000 --- a/docs/AUTOMATION.md +++ /dev/null @@ -1,190 +0,0 @@ -# Automatischer Pipeline-Betrieb - -## Überblick - -Das System läuft vollautomatisch und benötigt nur noch gelegentliche Telegram-Interaktion. - -``` -N8N (2× täglich, 08:00 + 16:00 Uhr) - └─► POST /api/n8n/pipeline (X-API-Key Header) - ├── RSS Ingestion (alle aktivierten Feeds) - ├── Relevanz-Score per GPT (0–100) - │ ├── Score ≥ 80 → Rewrite + WP-Draft + Telegram - │ ├── Score 60–79 → Telegram-Warnung + manueller Override möglich - │ └── Score < 60 → Abgelehnt + tägliche Telegram-Liste - └── Pipeline-Zusammenfassung via Telegram -``` - ---- - -## Einrichtung - -### 1. Umgebungsvariablen setzen - -Kopiere `backend/.env.example` nach `backend/.env` und fülle alle Felder aus: - -```bash -cp backend/.env.example backend/.env -nano backend/.env -``` - -Wichtige Variablen: - -| Variable | Beschreibung | -|----------|-------------| -| `TELEGRAM_BOT_TOKEN` | Bot-Token von @BotFather | -| `TELEGRAM_CHAT_ID` | Deine persönliche Chat-ID | -| `TELEGRAM_WEBHOOK_SECRET` | Zufälliger String (≥ 20 Zeichen) | -| `N8N_API_KEY` | Starker zufälliger API-Key | -| `OPENAI_API_KEY` | OpenAI API-Key | -| `WP_BASE_URL` | WordPress-URL | -| `WP_USERNAME` | WordPress-Benutzername | -| `WP_PASSWORD` | WordPress App-Passwort | - -### 2. Telegram-Webhook registrieren - -Nach dem Deployment einmalig aufrufen: - -```bash -curl -X POST https://news.vanityontour.de/api/telegram/setup-webhook \ - -H "Cookie: rss_news_session=" -``` - -Oder über die Admin-UI: Settings → Telegram Webhook einrichten. - -### 3. N8N Workflow einrichten - -In N8N einen neuen Workflow erstellen: - -**Trigger:** Cron -- Zeitplan 1: `0 8 * * *` (täglich 08:00) -- Zeitplan 2: `0 16 * * *` (täglich 16:00) - -**Aktion:** HTTP Request -- Method: `POST` -- URL: `https://news.vanityontour.de/api/n8n/pipeline` -- Header: `X-API-Key: ` - -**Fehlerbehandlung:** Bei HTTP-Fehler → E-Mail/Telegram-Alert - ---- - -## Telegram-Befehle - -| Befehl | Funktion | -|--------|----------| -| `/run` | Pipeline manuell starten | -| `/rejected` | Abgelehnte Artikel der letzten 3 Tage anzeigen | -| `/status` | Aktuellen Pipeline-Status | -| `/help` | Alle Befehle anzeigen | - ---- - -## Telegram-Benachrichtigungen - -### Neuer Draft erstellt -Wenn ein Artikel erfolgreich verarbeitet wurde: - -``` -✅ Neuer Draft erstellt -📰 [Artikel-Titel] -🟢 Relevanz-Score: 87/100 -📅 Vorgeschlagene Veröffentlichung: Mo, 24.03.2026 um 09:00 Uhr -🏷 #VanLife #Camping #Wohnmobil -🔗 Draft in WordPress öffnen - - [✏️ Neu schreiben] [❌ Verwerfen] -``` - -### Relevanz-Warnung (Score 60–79) -``` -⚠️ Artikel mit niedrigem Relevanz-Score -📰 [Artikel-Titel] -🟡 Score: 72/100 -💬 Artikel behandelt hauptsächlich... -🔗 Originalartikel - - [➕ Trotzdem verarbeiten] [❌ Ablehnen] -``` - -### Abgelehnte Artikel (Ende jedes Runs) -Liste aller abgelehnten Artikel mit Override-Buttons für jeden einzelnen. - ---- - -## Relevanz-Score - -Der GPT-basierte Score bewertet die Themenrelevanz für den VanLife/Camping-Blog: - -| Score | Aktion | -|-------|--------| -| 80–100 | Automatisch verarbeiten | -| 60–79 | Telegram-Warnung, manueller Override | -| 0–59 | Automatisch abgelehnt | - -Themen die hoch scored werden: Campingplätze, Stellplätze, Wohnmobile, Van-Ausbau, -Outdoor-Equipment, Wandern, Naturreisen, Roadtrips, Camping-Tipps. - -Schwellwerte sind in `.env` konfigurierbar: -``` -PIPELINE_RELEVANCE_AUTO=80 -PIPELINE_RELEVANCE_WARN=60 -``` - ---- - -## Veröffentlichungsplan - -- Maximal **2 Beiträge pro Tag** -- Bevorzugte Zeiten: **09:00 und 14:00 Uhr** (CET) -- Gleichmäßig über die Woche verteilt -- Der Vorschlag erscheint in der Telegram-Nachricht -- Manuell in WordPress setzen oder über WP Scheduling-Plugin automatisieren - -Einstellbar via: -``` -PIPELINE_MAX_DRAFTS_PER_DAY=2 -PIPELINE_PUBLISH_HOURS=9,14 -``` - ---- - -## API-Endpunkte (N8N / extern) - -Alle externen Endpunkte benötigen den Header `X-API-Key: `. - -| Methode | Endpunkt | Funktion | -|---------|----------|----------| -| `POST` | `/api/n8n/pipeline` | Komplette Pipeline starten | -| `POST` | `/api/n8n/ingest` | Nur RSS-Import (ohne Rewrite) | - ---- - -## Deployment (Hetzner via GitHub) - -Das Deployment läuft automatisch über GitHub Actions beim Push auf `main`: - -1. GitHub Action führt Tests aus -2. Bei Erfolg: SSH-Deploy auf Hetzner -3. `pip install -r requirements.txt` -4. Systemd-Dienst `rss-app` neu starten - -Workflow-Dateien: `.github/workflows/test.yml` und `.github/workflows/deploy.yml` - ---- - -## Troubleshooting - -**Pipeline läuft, aber keine Telegram-Nachrichten:** -- `TELEGRAM_BOT_TOKEN` und `TELEGRAM_CHAT_ID` prüfen -- Webhook-Status prüfen: `GET https://api.telegram.org/bot/getWebhookInfo` - -**N8N bekommt 401:** -- `N8N_API_KEY` in `.env` und N8N-Workflow-Header müssen übereinstimmen - -**Alle Artikel werden abgelehnt:** -- `PIPELINE_RELEVANCE_WARN` temporär auf 40 senken zum Testen -- Über `/rejected` + Override-Button manuell testen - -**Artikel werden doppelt importiert:** -- Deduplication läuft über `source_url` (eindeutig). Bereits verarbeitete Artikel werden nie erneut als Draft angelegt. diff --git a/docs/PROJECT_PLAN.md b/docs/PROJECT_PLAN.md deleted file mode 100644 index 3c61216..0000000 --- a/docs/PROJECT_PLAN.md +++ /dev/null @@ -1,91 +0,0 @@ -# Projektplan (Neustart) - -## Leitentscheidungen -- Bestehendes Repository wird weiterverwendet. -- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern. -- Hetzner bleibt Laufzeitplattform. -- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation. -- Auth initial nur mit einem User/Password. - -## Zielbild -Eine modulare News-Pipeline mit klaren Stufen: -1. Feed-Ingestion -2. Inhaltsanalyse und Normalisierung -3. Rewrite/Anreicherung -4. Legal- und Qualitaetschecks -5. WordPress-Publikation (Draft-first, Queue + Retry) -6. Monitoring/Logging - -## Grobe Zeitplanung (ohne Fixtermine) -- Phase 0: ca. 1 Woche -- Phase 1: ca. 2-4 Wochen -- Phase 2: ca. 2-3 Wochen -- Phase 3: fortlaufend - -## Phasen - -### Phase 0 - Grundlagen (jetzt) -- Doku und Wiki strukturieren -- Source-Policy definieren -- Redirect fuer `news.vanityontour.de` setzen -- GitHub Project als zentrale Planung scharfstellen - -### Phase 1 - MVP Core -- Neues FastAPI-Projektgeruest -- SQLite-Datenmodell (feeds, articles, runs, source_policy) -- Feed-Import mit Duplikaterkennung -- Admin-Login (ein User) -- Manuelle Review vor Publish -- Admin-UI fuer Rechtscheck, Bildauswahl, Relevanzbewertung - -### Phase 2 - Automation -- Job-Queue (asynchron) -- Regelbasierte Scheduler -- Retry/Dead-Letter-Handling -- Robustes Error-Reporting -- WordPress-Publisher (Draft) mit Mapping `article_id -> wp_post_id` - -### Phase 3 - Compliance und Skalierung -- Source-Whitelisting mit Pflichtfeldern -- Pflicht-Attribution pro Artikel -- Qualitaetsmetriken und Audit-Logs -- Optional: Passkey/WebAuthn - -## Aktueller Stand (Snapshot) -- Backend/API + Admin-UI lauffaehig -- Feed-Ingestion inkl. Originalartikel-Extraktion (Autor, Pressekontakt, Bilder) -- Bildkuration: - - automatische Scoring-Reduktion (u. a. Presseportal `story_big` priorisiert) - - manuelle Auswahl/Ausblendung im UI -- Rechts-/Publish-Gates aktiv: - - `legal_checked` Pflicht - - Hauptbild-Auswahl Pflicht - - Status-Workflow bis `published` -- WordPress-Publishing: - - Queue + Retry + Job-Historie - - Draft-Erstellung/Update erfolgreich getestet -- Exporte: - - JSON/CSV inkl. Datum/Alter/Relevanz + Attribution/Legal-Felder - -## Naechste Iteration (konkret) -1. WordPress `featured_media` Upload aus ausgewaehltem Hauptbild -2. Publish-HTML je Artikel verfeinern (strukturierter Body + konsistenter Quellenblock) -3. Publisher als periodischen Worker (Timer/Cron/Systemd) auf Hetzner betreiben -4. Monitoring/Alerting fuer Queue-Fehler + WP-API Fehlercodes - -## Architekturprinzipien -- Idempotente Jobs -- Trennung von UI, API, Worker -- Strikte Validierung bei Quell-/Lizenzdaten -- Expliziter Publish-Schritt, kein blindes Autoposting - -## Risiken -- Lizenz-/Nutzungsbedingungen je Quelle variieren stark -- Feeds aendern Struktur/Verfuegbarkeit -- WordPress-API und Auth koennen regressionsanfaellig sein - -## Erfolgsmetriken -- Zeit von Feed-Eingang bis Review-Ready -- Quote sauber attribuierter Artikel -- Fehlerrate pro Pipeline-Stufe -- Anzahl manueller Eingriffe pro Woche diff --git a/docs/SOURCE_POLICY.md b/docs/SOURCE_POLICY.md deleted file mode 100644 index d1d2e0c..0000000 --- a/docs/SOURCE_POLICY.md +++ /dev/null @@ -1,81 +0,0 @@ -# Source Policy und Feed-Vorschlaege - -## Grundsatz -Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt. - -## Pflichtdaten pro Quelle -- Quellname -- Feed-URL -- Originalartikel-URL -- Autor/Herausgeber (wenn vorhanden) -- Lizenz/Nutzungsgrundlage -- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung) -- Datum der letzten Pruefung -- Link auf Nutzungsbedingungen - -## Einstufung (Ampel) -- Gruen: Nutzung fuer geplantes Modell klar erlaubt -- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich -- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag - -## Verbindliche Regeln -- Keine neue Quelle ohne Eintrag im Source-Register -- Kein automatischer Publish bei Gelb/Rot -- Bilder separat pruefen (Textrecht != Bildrecht) -- Quartalsweiser Re-Check der Terms - -## Ersteinschaetzung (Stand: 16.02.2026) - -### Rot -1. Reuters / Thomson Reuters -- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung. -- Folge: Nur mit explizitem Vertrag/Lizenz. -- Referenz: - - https://www.thomsonreuters.com/en/terms-of-use - -2. tagesschau.de RSS -- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert). -- Folge: Nicht fuer das geplante Modell geeignet. -- Referenz: - - https://www.tagesschau.de/infoservices/rssfeeds - -### Gelb -1. Presseportal / ots -- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung. -- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte). -- Referenz: - - https://www.presseportal.de/nutzungsbedingungen - - https://www.presseportal.de/feeds/ - -2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz -- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert. -- Folge: Je Behoerde einzeln pruefen und dokumentieren. -- Beispiele: - - https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html - - https://bmas.bund.de/EN/Services/RSS/rss.html - -### Gruen (mit korrekter Attribution) -1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0` -- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen). -- Folge: Sehr gut fuer stabile Automatisierung geeignet. -- Referenz: - - https://www.govdata.de/dl-de/by-2-0 - - https://data.gov.de/informationen/lizenzen - - https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0 - -2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel -- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet. -- Folge: Geeignet, wenn Drittinhalte ausgenommen werden. -- Referenz: - - https://commission.europa.eu/legal-notice_en - - https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html - -## Quelle im Register freischalten (Definition of Done) -- Terms-Link hinterlegt -- Lizenzklasse (Gruen/Gelb/Rot) gesetzt -- Pflicht-Attribution dokumentiert -- Bildrechtsregel dokumentiert -- Letzte Pruefung und Verantwortlicher gepflegt - -## Hinweis -Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Prüfung sinnvoll. diff --git a/docs/TODO.md b/docs/TODO.md deleted file mode 100644 index fee4a67..0000000 --- a/docs/TODO.md +++ /dev/null @@ -1,38 +0,0 @@ -# ToDo (Ein-Entwickler Setup) - -## Jetzt -- [ ] WordPress Beitragsbild-Upload implementieren (`featured_media` aus ausgewaehltem Hauptbild) -- [ ] WordPress-HTML-Ausgabe pro Artikel weiter verbessern (sauberes Layout, Quellenblock, Shortcodes falls noetig) -- [ ] Publisher Fehlertexte fuer WP-Auth/Media/API in UI klarer darstellen -- [ ] End-to-end Publish Smoke-Test dokumentieren (lokal + Hetzner) - -## MVP -- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI) -- [x] Datenmodell in SQLite anlegen -- [x] Feed-Ingestion Service bauen (ETag/Last-Modified) -- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash -- [x] Login mit 1 Admin-Account implementieren -- [x] Artikel-Review-Maske mit Statusworkflow -- [x] WordPress-Publisher als separaten Service implementieren (Queue + Retry + Mapping) -- [x] Bildvorschau + manuelle Bildauswahl im Admin-UI -- [x] Automatische Bildreduktion/Scoring fuer Presseportal-Quellen -- [x] Artikel-Datum + Relevanzscore im UI/Export - -## Recht/Qualitaet -- [x] Source-Policy in DB + Admin-UI abbilden -- [x] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) -- [x] Auto-Block bei fehlender Lizenzinfo -- [x] Pro Artikel Attribution-Block generieren -- [x] Manuelle Rechtsfreigabe als Publish-Gate - -## Betrieb -- [ ] Systemd-Service(s) fuer API/Worker erstellen -- [ ] Nginx-Routing fuer neue App einrichten -- [ ] Healthcheck-Endpunkte + Monitoring einrichten -- [ ] Backup/Restore fuer DB dokumentieren - -## Spaeter -- [ ] Passkey/WebAuthn evaluieren und optional einfuehren -- [ ] Migration auf PostgreSQL bewerten -- [ ] Teilautomatische Freigabe-Regeln definieren -- [ ] KI-Rewrite mit Prompt-Versionierung + Qualitaetsmetriken wieder aktivieren diff --git a/docs/wiki/Architektur.md b/docs/wiki/Architektur.md deleted file mode 100644 index 275b578..0000000 --- a/docs/wiki/Architektur.md +++ /dev/null @@ -1,29 +0,0 @@ -# Architektur - -## Zielarchitektur -- API: FastAPI -- Worker: Queue-basierte Hintergrundjobs -- DB: SQLite (MVP), spaeter optional PostgreSQL -- Publisher: WordPress REST API -- Frontend/Admin: schlanke Web-UI mit Login - -## Pipeline -1. Feed Fetch -2. Parse + Normalize -3. Deduplicate -4. Enrichment (Rewrite/Tags) -5. Legal/Policy Check -6. Publish (pending) - -## Datenobjekte (MVP) -- `sources` -- `feeds` -- `articles` -- `article_versions` -- `runs` -- `policy_checks` - -## Nichtziele (MVP) -- Multi-User und Rollen -- Vollautomatische Freigabe ohne Review -- Komplexe externe SSO-Integration diff --git a/docs/wiki/Deployment.md b/docs/wiki/Deployment.md deleted file mode 100644 index 91388c7..0000000 --- a/docs/wiki/Deployment.md +++ /dev/null @@ -1,20 +0,0 @@ -# Deployment (Hetzner + CloudPanel) - -## Umgebung -- Host: Hetzner -- Reverse Proxy: Nginx via CloudPanel -- Ziel-Domain: `news.vanityontour.de` - -## Aktueller Zustand -- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet. - -## Zielzustand -- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`) -- API/Worker laufen als systemd-Services -- TLS bleibt ueber CloudPanel/Nginx - -## Mindest-Checks nach Deployment -- `curl -I https://news.vanityontour.de` -- Login erreichbar -- Feed-Import laeuft -- WordPress-Testpublikation (pending) erfolgreich diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md deleted file mode 100644 index 300599a..0000000 --- a/docs/wiki/Home.md +++ /dev/null @@ -1,19 +0,0 @@ -# Wiki Home - -## Zweck -Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`. - -## Inhalte -- `Architektur.md` -- `Deployment.md` -- `Security-Auth.md` -- `Recht-Quellen.md` -- `Operations-Runbook.md` -- `Roadmap.md` -- `Project-Board.md` - -## Projektsteuerung -- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1 - -## Prinzip -Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert. diff --git a/docs/wiki/Operations-Runbook.md b/docs/wiki/Operations-Runbook.md deleted file mode 100644 index e6c0f88..0000000 --- a/docs/wiki/Operations-Runbook.md +++ /dev/null @@ -1,43 +0,0 @@ -# Operations Runbook - -## Daily Checks -- App erreichbar -- Queue/Worker aktiv -- Letzte Feed-Laeufe erfolgreich -- Keine auffaelligen Fehler im Log - -## Incident: Feed-Import faellt aus -1. RSS-Quelle erreichbar? -2. Parser-Fehler im Log? -3. Rate Limits oder Blockaden? -4. Retry-Queue pruefen - -## Incident: WordPress Publish faellt aus -1. WP API erreichbar? -2. Credentials gueltig? -3. Payload-Validation/Tag-Fehler? -4. Artikel in `pending` statt `failed` markieren, wenn unklar - -## Incident: Telegram-Buttons reagieren nicht / Befehle ignoriert - -**Ursache:** N8N "App Release - Telegram Bot"-Workflow hat den Webhook überschrieben. - -**Prüfen:** -```bash -curl -s "https://api.telegram.org/bot8403822424:AAGp8gZoNIGZv3IIan45q7P9HfM868qzXi4/getWebhookInfo" | python3 -m json.tool -``` -→ `url` muss auf `https://news.vanityontour.de/telegram/webhook` zeigen -→ `allowed_updates` muss `["message", "callback_query"]` enthalten - -**Webhook zurücksetzen:** -```bash -curl -s -X POST "https://api.telegram.org/bot8403822424:AAGp8gZoNIGZv3IIan45q7P9HfM868qzXi4/setWebhook" \ - -H "Content-Type: application/json" \ - -d '{"url":"https://news.vanityontour.de/telegram/webhook","allowed_updates":["message","callback_query"],"secret_token":"RWWAaBwfCUX9Y573JVkB9zAeloHsZZoruXOBBgUtsvU"}' -``` - -Vollständige Dokumentation: `projects/webhook/telegram-webhook-reset.md` - -## Backups -- SQLite-Dump taeglich -- Konfiguration und `.env` sicher sichern diff --git a/docs/wiki/Project-Board.md b/docs/wiki/Project-Board.md deleted file mode 100644 index 887ac19..0000000 --- a/docs/wiki/Project-Board.md +++ /dev/null @@ -1,28 +0,0 @@ -# Project Board Workflow - -## Zentrale Steuerung -- Board: https://github.com/users/OliverGiertz/projects/3/views/1 -- Board ist die einzige Quelle fuer Planungsstatus. - -## Arbeitsmodus (1 Entwickler) -- Neue Arbeit immer als Issue anlegen -- Issue direkt ins Project aufnehmen -- Status nur im Project pflegen -- PR/Commit auf Issue referenzieren - -## Empfohlene Status-Disziplin -- `Todo`: noch nicht begonnen -- `In Progress`: aktiv in Arbeit -- `Done`: umgesetzt und dokumentiert - -## Konventionen fuer Issues -- Prefix fuer Klarheit: - - `[MVP]` - - `[Infra]` - - `[Legal]` - - `[Bug]` -- Definition of Done in jedem Issue notieren - -## Aktueller Backlog-Hinweis -- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User). -- Entsprechende Issues als `deferred` oder `closed` kennzeichnen. diff --git a/docs/wiki/Recht-Quellen.md b/docs/wiki/Recht-Quellen.md deleted file mode 100644 index 212f0d5..0000000 --- a/docs/wiki/Recht-Quellen.md +++ /dev/null @@ -1,35 +0,0 @@ -# Recht und Quellen - -## Grundregeln -- Nur freigegebene Quellen aus Source-Register -- Pflicht-Attribution pro Artikel -- Rechte fuer Bilder separat pruefen -- Kein Autopublish bei unklarer Lizenz - -## Bewertungsmodell -- Gruen: Freie Nachnutzung klar erlaubt -- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung -- Rot: Ohne Zusatzlizenz nicht geeignet - -## Aktuelle Referenzen -- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use -- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen -- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds -- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0 -- GovData Lizenzen: https://data.gov.de/informationen/lizenzen -- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en - -## Review-Checkliste je Quelle -1. Sind Bearbeitung und Veroeffentlichung erlaubt? -2. Ist kommerzielle Nutzung erlaubt? -3. Gibt es gesonderte Bildrechte? -4. Ist die Quellenangabe vorgeschrieben? -5. Gibt es Archivierungs- oder Weitergabebeschraenkungen? - -## Operativer Schutz -- Source-Register als Pflicht vor Feed-Aktivierung -- Auto-Block bei fehlenden Lizenzdaten -- Quartalsweiser Terms-Recheck - -## Hinweis -Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren. diff --git a/docs/wiki/Roadmap.md b/docs/wiki/Roadmap.md deleted file mode 100644 index fece89e..0000000 --- a/docs/wiki/Roadmap.md +++ /dev/null @@ -1,19 +0,0 @@ -# Roadmap - -## Jetzt -- Doku und Projektstruktur bereinigen -- Redirect aktiv -- Backlog auf Neustart ausrichten - -## Naechster Schritt -- FastAPI-MVP implementieren -- Login + Feed-Ingestion + Review + WordPress pending - -## Danach -- Worker/Queue -- Source-Policy Enforcement -- Monitoring/Reporting -- Optional Passkey - -## Steuerung -Alle Arbeitsitems liegen im GitHub Project #3. diff --git a/docs/wiki/Security-Auth.md b/docs/wiki/Security-Auth.md deleted file mode 100644 index a9f830a..0000000 --- a/docs/wiki/Security-Auth.md +++ /dev/null @@ -1,16 +0,0 @@ -# Security und Auth - -## Mindestanforderungen -- Zugriff auf die WebApp nur mit Login -- Ein aktiver Admin-User (kein Rollenmodell im MVP) -- Passwort nicht im Repo, nur als Secret auf Server - -## Empfohlene Umsetzung -- Session-basierte Auth (HTTP-only Cookies) -- Passwort gehasht (Argon2 oder bcrypt) -- Rate Limiting auf Login-Endpunkt -- CSRF-Schutz fuer Form-Aktionen - -## Spaeter (optional) -- Passkey/WebAuthn als zusaetzlicher Login-Faktor -- IP-Allowlist fuer Admin-Zugang diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index c15b448..0000000 --- a/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -testpaths = backend/tests -python_files = test_*.py -addopts = -q --maxfail=1 diff --git a/scripts/smoke_backend.sh b/scripts/smoke_backend.sh deleted file mode 100755 index f0000ad..0000000 --- a/scripts/smoke_backend.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -if [[ -z "${BASE_URL:-}" ]]; then - echo "BASE_URL fehlt (z. B. https://news.vanityontour.de)" - exit 1 -fi - -if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then - echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen" - exit 1 -fi - -cookie_file="$(mktemp)" -trap 'rm -f "$cookie_file"' EXIT - -echo "[1/4] Healthcheck" -curl -fsS "${BASE_URL}/health" | grep -q '"status":"ok"' - -echo "[2/4] Login" -curl -fsS -c "$cookie_file" \ - -H "Content-Type: application/json" \ - -X POST "${BASE_URL}/auth/login" \ - -d "{\"username\":\"${APP_ADMIN_USERNAME}\",\"password\":\"${APP_ADMIN_PASSWORD}\"}" \ - | grep -q '"ok":true' - -echo "[3/4] Protected Endpoint" -curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected" | grep -q '"ok":true' - -echo "[4/4] Pipeline Status" -curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status" | grep -q '"stage":"skeleton+db"' - -echo "Smoke test erfolgreich."