From 2c331d683b6e5a241c6b4d6d6e54f68846fd6caf Mon Sep 17 00:00:00 2001 From: Oliver G Date: Wed, 18 Feb 2026 09:46:44 +0100 Subject: [PATCH] feat: rebuild rss-news backend, admin ui, and legal extraction pipeline --- .github/workflows/deploy.yml | 6 + .github/workflows/test.yml | 39 +++ CHANGELOG.md | 38 ++- README.md | 114 +++---- backend/.env.example | 10 + backend/README.md | 82 +++++ backend/__init__.py | 1 + backend/app/__init__.py | 1 + backend/app/admin_ui.py | 265 +++++++++++++++ backend/app/auth.py | 31 ++ backend/app/config.py | 29 ++ backend/app/db.py | 138 ++++++++ backend/app/ingestion.py | 253 ++++++++++++++ backend/app/main.py | 404 +++++++++++++++++++++++ backend/app/policy.py | 35 ++ backend/app/repositories.py | 416 ++++++++++++++++++++++++ backend/app/source_extraction.py | 257 +++++++++++++++ backend/data/rss_news.db | Bin 0 -> 94208 bytes backend/requirements-test.txt | 3 + backend/requirements.txt | 8 + backend/static/admin.css | 189 +++++++++++ backend/templates/admin_dashboard.html | 235 +++++++++++++ backend/templates/admin_login.html | 27 ++ backend/tests/__init__.py | 1 + backend/tests/test_admin_ui.py | 65 ++++ backend/tests/test_api_auth.py | 77 +++++ backend/tests/test_article_workflow.py | 95 ++++++ backend/tests/test_db_repositories.py | 119 +++++++ backend/tests/test_ingestion.py | 122 +++++++ backend/tests/test_source_extraction.py | 69 ++++ docs/PROJECT_PLAN.md | 67 ++++ docs/SOURCE_POLICY.md | 81 +++++ docs/TODO.md | 33 ++ docs/wiki/Architektur.md | 29 ++ docs/wiki/Deployment.md | 20 ++ docs/wiki/Home.md | 19 ++ docs/wiki/Operations-Runbook.md | 23 ++ docs/wiki/Project-Board.md | 28 ++ docs/wiki/Recht-Quellen.md | 35 ++ docs/wiki/Roadmap.md | 19 ++ docs/wiki/Security-Auth.md | 16 + pytest.ini | 4 + scripts/smoke_backend.sh | 33 ++ 43 files changed, 3463 insertions(+), 73 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 backend/.env.example create mode 100644 backend/README.md create mode 100644 
backend/__init__.py create mode 100644 backend/app/__init__.py create mode 100644 backend/app/admin_ui.py create mode 100644 backend/app/auth.py create mode 100644 backend/app/config.py create mode 100644 backend/app/db.py create mode 100644 backend/app/ingestion.py create mode 100644 backend/app/main.py create mode 100644 backend/app/policy.py create mode 100644 backend/app/repositories.py create mode 100644 backend/app/source_extraction.py create mode 100644 backend/data/rss_news.db create mode 100644 backend/requirements-test.txt create mode 100644 backend/requirements.txt create mode 100644 backend/static/admin.css create mode 100644 backend/templates/admin_dashboard.html create mode 100644 backend/templates/admin_login.html create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/test_admin_ui.py create mode 100644 backend/tests/test_api_auth.py create mode 100644 backend/tests/test_article_workflow.py create mode 100644 backend/tests/test_db_repositories.py create mode 100644 backend/tests/test_ingestion.py create mode 100644 backend/tests/test_source_extraction.py create mode 100644 docs/PROJECT_PLAN.md create mode 100644 docs/SOURCE_POLICY.md create mode 100644 docs/TODO.md create mode 100644 docs/wiki/Architektur.md create mode 100644 docs/wiki/Deployment.md create mode 100644 docs/wiki/Home.md create mode 100644 docs/wiki/Operations-Runbook.md create mode 100644 docs/wiki/Project-Board.md create mode 100644 docs/wiki/Recht-Quellen.md create mode 100644 docs/wiki/Roadmap.md create mode 100644 docs/wiki/Security-Auth.md create mode 100644 pytest.ini create mode 100755 scripts/smoke_backend.sh diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 5d55808..4ac1c42 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,9 +19,15 @@ jobs: username: oliver key: ${{ secrets.HETZNER_SSH_KEY }} port: 22 + envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD script: | cd rss-news git pull origin main 
source .venv/bin/activate pip install -r requirements.txt + pip install -r backend/requirements.txt || true sudo systemctl restart rss-app + BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh + env: + APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }} + APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..1d627db --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,39 @@ +name: Backend Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + backend-tests: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r backend/requirements.txt + pip install -r backend/requirements-test.txt + + - name: Run tests with coverage + env: + APP_DB_PATH: /tmp/rss_news_test.db + run: | + pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + with: + name: coverage-xml + path: coverage.xml diff --git a/CHANGELOG.md b/CHANGELOG.md index fa80967..66b7237 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,42 @@ -## [1.7.1] - 2025-08-28 +## [1.7.1] - 2025-08-24 -- Beschreibung... +### ✨ Security angepasst + - alle Credentials in die .env Datei verschoben + - beim Start der App werden die Credentials geprüft und beim fehlen entsprechende Meldungen ausgegeben + +--- ## [1.7.0] - 2025-08-24 -- Beschreibung... 
+### Multi-Select & Massenoperationen: + - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich + - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons + - ✅ Massenoperationen für ausgewählte Artikel: + - Bulk Status-Änderung für mehrere Artikel gleichzeitig + - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung + - Bulk WordPress-Upload nur für "Process"-Artikel + - Bulk Papierkorb-Funktion + +### Schnellaktionen Integration: + - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar + - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert + - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln) + +### 🔧 Verbesserungen + + - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration + - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig + - Feedback: Detaillierte Statusmeldungen bei Massenoperationen + - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl + +### 🏗️ Technische Änderungen + + - Session State Erweiterung um selected_articles Set + - Neue Bulk-Operation-Funktionen in app.py:326-467 + - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design + - Integration bestehender WordPress-Upload und Rewrite-Funktionen + +--- ## [1.6.3] - 2025-08-18 diff --git a/README.md b/README.md index 6846a41..b3c2b4a 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,63 @@ -# 📰 RSS News Bot +# rss-news (Rebuild) -Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung. +`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut. 
-![Version](https://img.shields.io/badge/version-1.5.2-blue) -![License](https://img.shields.io/badge/license-MIT-green) -![Python](https://img.shields.io/badge/python-3.10+-yellow) -![Streamlit](https://img.shields.io/badge/built%20with-Streamlit-ff4b4b) +Aktueller Stand: +- Alte Streamlit-App wird nicht produktiv genutzt. +- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet. +- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt. ---- +## Ziele +- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln +- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen) +- Zuverlaessige Automatisierung auf Hetzner +- Publikation nach WordPress (IONOS aktuell, spaeter offen) +- Zugriff nur nach Login (zunaechst User/Password) -## 🚀 Features +## Architektur-Richtung (MVP) +- Backend: `Python + FastAPI` +- Jobs: Queue-Worker (z. B. Redis + RQ/Celery) +- Daten: SQLite fuer MVP, spaeter optional PostgreSQL +- Auth: Session-Login mit einem Admin-User +- Publishing: WordPress REST API (Status zunaechst `pending`) -- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren) -- ✍️ **Artikel automatisch umschreiben** mit GPT-4 -- 🏷️ **Tags automatisch generieren** -- 🖼️ **Bilder aus Originalartikeln extrahieren** -- 🪄 **Optionales DALL·E-Bild generieren** -- 🔧 **Bearbeiten von Bildmetadaten** -- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)** -- 📜 **Log-Viewer-Seite integriert** -- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet** -- 📋 Artikeltabelle mit Status-Filter -- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern -- 🪄 Button für KI-Bildgenerierung +Details: `docs/PROJECT_PLAN.md` +## Projektsteuerung +- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1` +- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen. +- Wiki-Struktur liegt unter `docs/wiki/`. 
---- +## Dokumentation +- Projektplan: `docs/PROJECT_PLAN.md` +- ToDo-Liste: `docs/TODO.md` +- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md` +- Wiki Home: `docs/wiki/Home.md` -## 🧱 Projektstruktur - -ss-news/ -├── app.py # Haupt-UI mit Streamlit -├── main.py # Logik für Feed-Import und Verarbeitung -├── utils/ -│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren -│ └── dalle_generator.py # DALL·E-Integration (KI-Bild) -├── pages/ -│ └── log_viewer.py # UI zur Anzeige der Logs -├── data/ -│ └── articles.json # Gespeicherte Artikel -│ └── feeds.json # Gespeicherte Feed-URLs -├── logs/ -│ └── rss_tool.log # Logging der Verarbeitung -├── versioning.py # CLI-Tool zur Versionierung & Release -├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases -├── version.py # Aktuelle Version -└── CHANGELOG.md # Änderungsprotokoll - - ---- - -## ⚙️ Installation +## Lokale Entwicklung (Legacy-Code) +Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden: ```bash -git clone https://github.com/OliverGiertz/rss-news.git -cd rss-news python -m venv .venv source .venv/bin/activate pip install -r requirements.txt -``` - ---- - -## Update -Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca - -```bash -bash update.sh -``` - - -## ▶️ Starten der App - streamlit run app.py +``` ---- +Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt. -## 🔐 Konfiguration (.env) +## Deployment-Zielbild +- Betrieb auf Hetzner +- Reverse Proxy via CloudPanel/Nginx +- Produktive Domain: `news.vanityontour.de` +- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de` -Lege eine `.env` im Projekt an (siehe `.env.example`). Erforderliche Variablen: +## Sicherheit +- Keine Secrets im Repository +- `.env` lokal/auf Server, nie committen +- Auth-Pflicht fuer die neue WebApp +- spaeter optional: Passkeys/WebAuthn -- `WP_BASE_URL`: Basis-URL deiner WordPress-Seite (z. B. 
https://example.com) -- Authentifizierung (eine Option wählen): - - `WP_AUTH_BASE64`: Bevorzugt. Base64 von `username:application_password` - - oder `WP_USERNAME` und `WP_PASSWORD`: Benutzer + Anwendungspasswort -- Optional: `OPENAI_API_KEY` für das Umschreiben von Artikeln +## Rechtlicher Hinweis +Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig. -Hinweis: Der Code liest ausschließlich aus `.env`. Es gibt keine hartkodierten Standard-Credentials. diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..74e9c4b --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,10 @@ +APP_ENV=development +APP_NAME=rss-news-backend +APP_SECRET_KEY=replace-with-a-long-random-secret +APP_DB_PATH=backend/data/rss_news.db + +APP_ADMIN_USERNAME=admin +APP_ADMIN_PASSWORD=change-me + +SESSION_COOKIE_NAME=rss_news_session +SESSION_MAX_AGE_SECONDS=28800 diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..7d64a65 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,82 @@ +# Backend Skeleton (FastAPI) + +Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`. + +## Start (lokal) + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r backend/requirements.txt +uvicorn backend.app.main:app --reload --port 8501 +``` + +## Admin UI +- Login: `http://127.0.0.1:8501/admin/login` +- Dashboard: `http://127.0.0.1:8501/admin/dashboard` + +## Environment +- Datei: `backend/.env` +- Vorlage: `backend/.env.example` + +## Endpoints +- `GET /health` - Healthcheck +- `POST /auth/login` - Login mit Admin-User +- `POST /auth/logout` - Logout +- `GET /auth/me` - Aktiver User +- `GET /api/protected` - Geschuetzter Test-Endpoint +- `GET /api/pipeline/status` - Basisstatus inkl. 
Datensatzzaehler +- `GET /api/sources` - Quellenliste +- `POST /api/sources` - Quelle anlegen +- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle +- `GET /api/feeds` - Feedliste +- `POST /api/feeds` - Feed anlegen +- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed +- `GET /api/runs` - Import-/Job-Runs anzeigen +- `GET /api/runs/{run_id}` - Detailansicht eines Runs +- `POST /api/runs` - Run starten +- `POST /api/runs/{run_id}/finish` - Run abschliessen +- `GET /api/articles` - Artikel anzeigen +- `GET /api/articles/{article_id}` - Artikeldetail +- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren +- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln +- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject) +- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed) + +## Datenbank +- SQLite-Datei unter `backend/data/rss_news.db` +- Tabellen werden beim App-Start initialisiert. +- Tabellen: `sources`, `feeds`, `runs`, `articles` +- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash` + +## Policy-Enforcement +- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist. +- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`. +- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert. + +## Review-Workflow +- Statuskette: `new -> review -> approved -> published` +- Ablehnung im Review setzt auf `rewrite` +- Ungueltige Statuswechsel werden per API blockiert + +## Verifikation +```bash +python -m unittest backend.tests.test_db_repositories +python -m unittest backend.tests.test_ingestion +python -m unittest backend.tests.test_api_auth +``` + +## CI / Online-Auswertung +- GitHub Actions Workflow: `.github/workflows/test.yml` +- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus. 
+ +## Hetzner Smoketest +```bash +BASE_URL="https://news.vanityontour.de" \ +APP_ADMIN_USERNAME="admin" \ +APP_ADMIN_PASSWORD="..." \ +bash scripts/smoke_backend.sh +``` + +## Hinweis +Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen. diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..3623851 --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1 @@ +"""Backend package for rss-news rebuild.""" diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..18b665e --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1 @@ +"""Application package.""" diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py new file mode 100644 index 0000000..9587664 --- /dev/null +++ b/backend/app/admin_ui.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import json +from pathlib import Path +from urllib.parse import urlencode + +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.templating import Jinja2Templates + +from .auth import create_session_token, verify_credentials, verify_session_token +from .config import get_settings +from .ingestion import run_ingestion +from .policy import evaluate_source_policy +from .repositories import ( + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + list_articles, + list_feeds, + list_runs, + list_sources, + update_article_status, +) + +settings = get_settings() +router = APIRouter(tags=["admin-ui"]) +templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates")) +ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = { + "new": ("review", "rewrite", "error"), + "rewrite": ("review", "error"), + "review": ("approved", "rewrite", "error"), + "approved": ("published", "error"), + "published": ("error",), + "error": ("review", "rewrite"), +} + + +def _admin_user(request: Request) -> str | None: + token 
= request.cookies.get(settings.session_cookie_name) + if not token: + return None + return verify_session_token(token) + + +def _to_optional_int(raw: str | None) -> int | None: + if raw is None: + return None + value = raw.strip() + if value == "": + return None + return int(value) + + +def _dashboard_redirect( + *, + msg: str | None = None, + msg_type: str = "success", + status_filter: str | None = None, +) -> RedirectResponse: + query: dict[str, str] = {} + if msg: + query["msg"] = msg + query["type"] = msg_type + if status_filter: + query["status_filter"] = status_filter + suffix = f"?{urlencode(query)}" if query else "" + return RedirectResponse(url=f"/admin/dashboard{suffix}", status_code=303) + + +def _parse_meta_json(raw: str | None) -> dict: + if not raw: + return {} + try: + parsed = json.loads(raw) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + + +@router.get("/admin", response_class=HTMLResponse) +def admin_index(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + return RedirectResponse(url="/admin/dashboard", status_code=303) + + +@router.get("/admin/login", response_class=HTMLResponse) +def admin_login_page(request: Request): + return templates.TemplateResponse( + request, + "admin_login.html", + {"request": request, "title": "Admin Login", "error": request.query_params.get("error")}, + ) + + +@router.post("/admin/login") +def admin_login(request: Request, username: str = Form(...), password: str = Form(...)): + if not verify_credentials(username, password): + return RedirectResponse(url="/admin/login?error=1", status_code=303) + + token = create_session_token(username) + response = RedirectResponse(url="/admin/dashboard", status_code=303) + response.set_cookie( + key=settings.session_cookie_name, + value=token, + max_age=settings.session_max_age_seconds, + httponly=True, + secure=False, + samesite="lax", + ) + return response + + 
+@router.post("/admin/logout") +def admin_logout(): + response = RedirectResponse(url="/admin/login", status_code=303) + response.delete_cookie(settings.session_cookie_name) + return response + + +@router.get("/admin/dashboard", response_class=HTMLResponse) +def admin_dashboard(request: Request): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + sources = list_sources() + source_policy = {s["id"]: evaluate_source_policy(s) for s in sources} + feeds = list_feeds() + runs = list_runs(limit=30) + status_filter = request.query_params.get("status_filter") + if status_filter in {"new", "rewrite", "review", "approved", "published", "error"}: + articles = list_articles(limit=100, status_filter=status_filter) + else: + status_filter = "" + articles = list_articles(limit=100) + for article in articles: + meta = _parse_meta_json(article.get("meta_json")) + extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {} + article["meta"] = meta + article["extracted_images"] = extraction.get("images") if isinstance(extraction.get("images"), list) else [] + article["press_contact"] = extraction.get("press_contact") if isinstance(extraction.get("press_contact"), str) else None + article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None + + return templates.TemplateResponse( + request, + "admin_dashboard.html", + { + "request": request, + "title": "Admin Dashboard", + "user": user, + "sources": sources, + "source_policy": source_policy, + "feeds": feeds, + "runs": runs, + "articles": articles, + "status_options": ["new", "rewrite", "review", "approved", "published", "error"], + "allowed_transitions": ALLOWED_TRANSITIONS, + "status_filter": status_filter, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) + + +@router.post("/admin/sources/create") +def 
admin_create_source( + request: Request, + name: str = Form(...), + base_url: str = Form(""), + terms_url: str = Form(""), + license_name: str = Form(""), + risk_level: str = Form("yellow"), + last_reviewed_at: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_source( + SourceCreate( + name=name, + base_url=base_url or None, + terms_url=terms_url or None, + license_name=license_name or None, + risk_level=risk_level, + is_enabled=True, + notes=None, + last_reviewed_at=last_reviewed_at or None, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Quelle gespeichert") + + +@router.post("/admin/feeds/create") +def admin_create_feed( + request: Request, + name: str = Form(...), + url: str = Form(...), + source_id: str = Form(""), +): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + try: + create_feed( + FeedCreate( + name=name, + url=url, + source_id=_to_optional_int(source_id), + is_enabled=True, + ) + ) + except Exception as exc: + return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error") + return _dashboard_redirect(msg="Feed gespeichert") + + +@router.post("/admin/ingestion/run") +def admin_run_ingestion(request: Request, feed_id: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + try: + stats = run_ingestion(feed_id=_to_optional_int(feed_id)) + except Exception as exc: + return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error") + return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}") + + +@router.post("/admin/articles/{article_id}/review") +def admin_review_article(request: Request, article_id: int, decision: str = 
Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if article and article.get("status") == "review" and decision in {"approve", "reject"}: + target = "approved" if decision == "approve" else "rewrite" + update_article_status(article_id, target, actor=user, note=note or None, decision=decision) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {decision}") + return _dashboard_redirect(msg=f"Review-Aktion ungueltig fuer Artikel #{article_id}", msg_type="error") + + +@router.post("/admin/articles/{article_id}/transition") +def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")): + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if article: + current = article.get("status") + if target_status in ALLOWED_TRANSITIONS.get(current, ()): + update_article_status(article_id, target_status, actor=user, note=note or None) + return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}") + return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") diff --git a/backend/app/auth.py b/backend/app/auth.py new file mode 100644 index 0000000..188397f --- /dev/null +++ b/backend/app/auth.py @@ -0,0 +1,31 @@ +import hmac +from typing import Optional + +from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired + +from .config import get_settings + + +def _serializer() -> URLSafeTimedSerializer: + settings = get_settings() + return URLSafeTimedSerializer(settings.app_secret_key, salt="rss-news-session") + + +def verify_credentials(username: str, password: str) -> bool: + settings = get_settings() + user_ok = hmac.compare_digest(username, settings.app_admin_username) + pw_ok = 
hmac.compare_digest(password, settings.app_admin_password) + return user_ok and pw_ok + + +def create_session_token(username: str) -> str: + return _serializer().dumps({"username": username}) + + +def verify_session_token(token: str) -> Optional[str]: + settings = get_settings() + try: + payload = _serializer().loads(token, max_age=settings.session_max_age_seconds) + except (BadSignature, SignatureExpired): + return None + return payload.get("username") diff --git a/backend/app/config.py b/backend/app/config.py new file mode 100644 index 0000000..f32b8c4 --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,29 @@ +from functools import lru_cache + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + # Prefer backend-specific env file to avoid collisions with legacy root .env + model_config = SettingsConfigDict( + env_file=("backend/.env", ".env"), + env_file_encoding="utf-8", + extra="ignore", + ) + + app_env: str = "development" + app_name: str = "rss-news-backend" + app_secret_key: str = "replace-with-a-long-random-secret" + + app_admin_username: str = "admin" + app_admin_password: str = "change-me" + + session_cookie_name: str = "rss_news_session" + session_max_age_seconds: int = 28800 + + app_db_path: str = "backend/data/rss_news.db" + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + return Settings() diff --git a/backend/app/db.py b/backend/app/db.py new file mode 100644 index 0000000..c914044 --- /dev/null +++ b/backend/app/db.py @@ -0,0 +1,138 @@ +import sqlite3 +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Iterator + +from .config import get_settings + + +def _db_path() -> Path: + settings = get_settings() + path = Path(settings.app_db_path) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + +@contextmanager +def get_conn() -> Iterator[sqlite3.Connection]: + conn = sqlite3.connect(_db_path()) + conn.row_factory = sqlite3.Row + 
conn.execute("PRAGMA foreign_keys=ON;") + try: + yield conn + conn.commit() + finally: + conn.close() + + +def init_db() -> None: + with get_conn() as conn: + conn.executescript( + """ + PRAGMA journal_mode=WAL; + + CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + base_url TEXT, + terms_url TEXT, + license_name TEXT, + risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')), + is_enabled INTEGER NOT NULL DEFAULT 0, + notes TEXT, + last_reviewed_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + + CREATE TABLE IF NOT EXISTS feeds ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER, + name TEXT NOT NULL, + url TEXT NOT NULL UNIQUE, + is_enabled INTEGER NOT NULL DEFAULT 1, + etag TEXT, + last_modified TEXT, + last_checked_at TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL + ); + + CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_type TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')), + started_at TEXT NOT NULL DEFAULT (datetime('now')), + finished_at TEXT, + details TEXT + ); + + CREATE TABLE IF NOT EXISTS articles ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + feed_id INTEGER, + source_article_id TEXT, + source_hash TEXT, + title TEXT NOT NULL, + source_url TEXT NOT NULL, + canonical_url TEXT, + published_at TEXT, + author TEXT, + summary TEXT, + content_raw TEXT, + content_rewritten TEXT, + word_count INTEGER DEFAULT 0, + status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')), + meta_json TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + FOREIGN KEY(feed_id) REFERENCES 
feeds(id) ON DELETE SET NULL, + UNIQUE(source_url) + ); + + CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id); + CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash); + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id + ON articles(feed_id, source_article_id) + WHERE source_article_id IS NOT NULL; + CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash + ON articles(source_hash) + WHERE source_hash IS NOT NULL; + CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status); + CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id); + CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at); + CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at); + + CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at + AFTER UPDATE ON sources + FOR EACH ROW + BEGIN + UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at + AFTER UPDATE ON feeds + FOR EACH ROW + BEGIN + UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id; + END; + + CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at + AFTER UPDATE ON articles + FOR EACH ROW + BEGIN + UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id; + END; + """ + ) + + # Lightweight migration for existing DBs created before source_hash was introduced. 
+ existing_columns = { + row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall() + } + if "source_hash" not in existing_columns: + conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT") + + +def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]: + return [dict(r) for r in rows] diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py new file mode 100644 index 0000000..87e44c2 --- /dev/null +++ b/backend/app/ingestion.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timezone +import hashlib +import json +import time +from typing import Any + +import feedparser + +from .policy import evaluate_source_policy +from .repositories import ( + ArticleUpsert, + RunCreate, + create_run, + finish_run, + get_feed_by_id, + list_enabled_feeds, + update_feed_fetch_state, + upsert_article, +) +from .source_extraction import extract_article, extracted_article_to_meta + + +@dataclass(frozen=True) +class IngestionStats: + run_id: int + feeds_processed: int + entries_seen: int + articles_upserted: int + status: str + message: str + + +MAX_FEED_FETCH_RETRIES = 3 + + +def _entry_published_iso(entry: dict) -> str | None: + published = entry.get("published_parsed") or entry.get("updated_parsed") + if not published: + return None + return datetime(*published[:6], tzinfo=timezone.utc).isoformat() + + +def _entry_text(entry: dict) -> tuple[str, str]: + summary = entry.get("summary", "") or "" + content = "" + if entry.get("content") and isinstance(entry.get("content"), list): + first = entry["content"][0] + content = first.get("value", "") if isinstance(first, dict) else "" + if not content: + content = summary + return summary, content + + +def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str: + source_id = entry.get("id") or entry.get("guid") or "" + published = _entry_published_iso(entry) or "" + fingerprint = 
def _feed_fetch_error(parsed: object) -> str | None:
    """Return an error text when a feedparser result represents a failed fetch.

    feedparser normally reports transport and parse problems on the result
    object (``bozo`` / ``bozo_exception`` / ``status``) instead of raising.
    A result is treated as failed only when it carries no entries and either
    has an HTTP status of 400 or above, or has ``bozo`` set with no integer
    HTTP status at all. Results that still carry entries, or that have a
    status below 400 (for example 304), are accepted.
    """
    if _parsed_get(parsed, "entries", []):
        return None
    http_status = _parsed_get(parsed, "status")
    if isinstance(http_status, int):
        return f"HTTP {http_status}" if http_status >= 400 else None
    if not _parsed_get(parsed, "bozo", False):
        return None
    reason = _parsed_get(parsed, "bozo_exception")
    return str(reason) if reason else "Feed konnte nicht geladen werden"


def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch feeds, extract their articles and upsert them; record the run.

    Args:
        feed_id: Restrict the run to this feed. The feed is skipped when it
            does not exist or is not enabled. ``None`` processes every
            enabled feed.

    Returns:
        An ``IngestionStats`` with the run id and counters. Any exception
        raised inside the loop is caught: the run is marked "failed" and the
        exception text is returned in ``message``.

    Per feed: the source policy is checked first ("blocked" feeds are not
    fetched), then the feed is fetched with up to ``MAX_FEED_FETCH_RETRIES``
    attempts, and each entry with a link is passed through ``extract_article``
    and ``upsert_article``. Per-feed outcomes are stored as JSON in the run
    details.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    feed_results: list[dict[str, object]] = []

    try:
        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1

            source_snapshot = {
                "id": feed.get("source_id"),
                "name": feed.get("source_name"),
                "base_url": feed.get("source_base_url"),
                "terms_url": feed.get("source_terms_url"),
                "license_name": feed.get("source_license_name"),
                "risk_level": feed.get("source_risk_level"),
                "last_reviewed_at": feed.get("source_last_reviewed_at"),
                "is_enabled": feed.get("source_is_enabled"),
            }
            policy_issues = evaluate_source_policy(source_snapshot)
            if policy_issues:
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "blocked",
                        "policy_issues": policy_issues,
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            parsed = None
            feed_error = None
            for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
                try:
                    candidate = feedparser.parse(
                        feed["url"],
                        etag=feed.get("etag"),
                        modified=feed.get("last_modified"),
                    )
                except Exception as exc:
                    feed_error = str(exc)
                else:
                    # A returned result can still describe a failed fetch;
                    # without this check the retry loop would never retry.
                    feed_error = _feed_fetch_error(candidate)
                    if feed_error is None:
                        parsed = candidate
                        break
                if attempt < MAX_FEED_FETCH_RETRIES:
                    time.sleep(0.5 * attempt)

            if parsed is None:
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests. When the
            # response carries no validator, keep the stored one instead of
            # overwriting it with NULL.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            update_feed_fetch_state(
                feed_id=int(feed["id"]),
                etag=parsed_etag if isinstance(parsed_etag, str) else feed.get("etag"),
                last_modified=(
                    parsed_modified if isinstance(parsed_modified, str) else feed.get("last_modified")
                ),
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    # Entries without a link are counted but not stored.
                    continue

                summary, content_raw = _entry_text(entry)
                title = entry.get("title") or "Ohne Titel"
                extracted = extract_article(link)

                # Values extracted from the article page win over feed values.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                final_canonical = extracted.canonical_url or entry.get("link")

                source_hash = _entry_hash(
                    entry,
                    int(feed["id"]),
                    link,
                    final_title,
                    final_summary or "",
                )
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": int(feed["id"]),
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                article_id = upsert_article(
                    ArticleUpsert(
                        feed_id=int(feed["id"]),
                        source_article_id=entry.get("id") or entry.get("guid"),
                        source_hash=source_hash,
                        title=final_title,
                        source_url=link,
                        canonical_url=final_canonical,
                        published_at=_entry_published_iso(entry),
                        author=final_author,
                        summary=final_summary,
                        content_raw=final_content_raw,
                        content_rewritten=None,
                        word_count=len((final_content_raw or "").split()),
                        status="new",
                        meta_json=json.dumps(
                            {"attribution": attribution, "extraction": extraction_meta},
                            ensure_ascii=False,
                        ),
                    )
                )
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": int(feed["id"]),
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Top-level boundary: record the failure on the run and report it to
        # the caller with the counters reached so far.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
class SourceCreateRequest(BaseModel):
    # Request body of POST /api/sources; the create_source endpoint copies
    # every field one-to-one into a SourceCreate record.
    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    # Optional here, but evaluate_source_policy reports an issue when empty.
    terms_url: str | None = None
    # Optional here, but evaluate_source_policy reports an issue when empty.
    license_name: str | None = None
    # Restricted to green/yellow/red; the policy check only accepts "green".
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # Sources are created disabled unless the caller opts in.
    is_enabled: bool = False
    notes: str | None = None
    # Plain string with no format validation here; the policy check only
    # requires it to be non-empty.
    last_reviewed_at: str | None = None
def require_auth(request: Request) -> str:
    """FastAPI dependency that resolves the session cookie to a username.

    Raises:
        HTTPException: 401 when the session cookie is missing, or when
            ``verify_session_token`` does not return a username for it.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    authenticated_user = verify_session_token(session_token) if session_token else None
    if authenticated_user:
        return authenticated_user

    # Distinguish "no cookie at all" from "cookie present but rejected".
    reason = "Session ungueltig oder abgelaufen" if session_token else "Nicht angemeldet"
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail=reason)
@app.get("/api/pipeline/status")
def pipeline_status(username: str = Depends(require_auth)) -> dict:
    """Return how many sources, feeds and articles exist (login required).

    The counts are obtained by loading the rows through the repository list
    functions and taking their length.
    """
    feeds_total = len(repo_list_feeds())
    sources_total = len(repo_list_sources())
    # NOTE(review): list_articles clamps its limit to 500, so this number
    # saturates at 500 and is not a true total once more articles exist.
    articles_total = len(repo_list_articles(limit=500))
    return {
        "ok": True,
        "stage": "skeleton+db",
        "requested_by": username,
        "counts": {
            "sources": sources_total,
            "feeds": feeds_total,
            "articles": articles_total,
        },
    }
@app.get("/api/feeds/{feed_id}/policy-check")
def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for the source joined to a feed.

    Raises:
        HTTPException: 404 when no feed with ``feed_id`` exists.
    """
    feed = get_feed_by_id(feed_id)
    if not feed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")

    # The feed row carries the joined source columns as "source_<field>";
    # rebuild a plain source mapping from them for the policy evaluation.
    joined_fields = (
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "last_reviewed_at",
        "is_enabled",
    )
    source_snapshot = {"id": feed.get("source_id")}
    for field in joined_fields:
        source_snapshot[field] = feed.get(f"source_{field}")

    issues = evaluate_source_policy(source_snapshot)
    return {
        "ok": True,
        "feed_id": feed_id,
        "allowed": not issues,
        "issues": issues,
        "requested_by": username,
    }
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given status and details.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists.
    """
    # finish_run issues an UPDATE ... WHERE id = ?, which matches nothing for
    # an unknown id; without this check the endpoint would report success.
    if not get_run_by_id(run_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    return {"ok": True, "id": run_id, "requested_by": username}
@app.post("/api/articles/{article_id}/transition")
def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
    """Move an article to another workflow status if the transition is allowed.

    Raises:
        HTTPException: 404 when the article does not exist (at lookup or at
            update time); 400 when ``ALLOWED_ARTICLE_TRANSITIONS`` does not
            list the target for the article's current status.
    """
    article = get_article_by_id(article_id)
    if not article:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    source_state = article.get("status")
    target_state = payload.target_status
    if target_state not in ALLOWED_ARTICLE_TRANSITIONS.get(source_state, set()):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Ungueltiger Statuswechsel: {source_state} -> {target_state}",
        )

    if not update_article_status(article_id, target_state, actor=username, note=payload.note):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
    return {"ok": True, "id": article_id, "from_status": source_state, "to_status": target_state}
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
    """List the reasons why a source may not be ingested.

    An empty list means the source passes. A missing or empty mapping yields
    the single issue "Keine Quelle zugeordnet". Otherwise the source must
    have risk level "green" (case-insensitive, surrounding blanks ignored),
    non-blank ``terms_url``, ``license_name`` and ``last_reviewed_at``, and
    ``is_enabled`` equal to 1.
    """
    if not source:
        return ["Keine Quelle zugeordnet"]

    def text_of(key: str) -> str:
        # Missing and None values count as empty text.
        return (source.get(key) or "").strip()

    findings: list[str] = []

    risk = text_of("risk_level").lower()
    if risk != "green":
        findings.append(f"Quelle nicht freigegeben (risk_level={risk or 'unset'})")

    for required in ("terms_url", "license_name", "last_reviewed_at"):
        if not text_of(required):
            findings.append(f"{required} fehlt")

    if int(source.get("is_enabled", 0) or 0) != 1:
        findings.append("Quelle ist deaktiviert")

    return findings
@dataclass(frozen=True)
class ArticleUpsert:
    """Immutable input record for ``upsert_article``.

    ``_resolve_existing_article_id`` looks for an existing row by
    ``source_url`` first, then by ``(feed_id, source_article_id)``, then by
    ``source_hash``.
    """

    # Owning feed; part of the (feed_id, source_article_id) lookup.
    feed_id: int | None
    # Identifier supplied by the feed entry (its id / guid), if any.
    source_article_id: str | None
    # Fingerprint used as the last lookup fallback.
    source_hash: str | None
    title: str
    # First lookup key; stored stripped of surrounding whitespace.
    source_url: str
    canonical_url: str | None
    published_at: str | None
    author: str | None
    summary: str | None
    content_raw: str | None
    content_rewritten: str | None
    word_count: int
    # Workflow status written to the row.
    status: str
    # JSON text; update_article_status appends "review_events" to it.
    meta_json: str | None
def get_source_by_id(source_id: int) -> dict[str, Any] | None:
    """Load a single source row by primary key.

    Returns:
        The row as a plain dict, or ``None`` when no source has this id.
    """
    query = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        WHERE id = ?
    """
    with get_conn() as conn:
        found = conn.execute(query, (source_id,)).fetchone()
        if found:
            return dict(found)
        return None
def create_run(payload: RunCreate) -> int:
    """Insert a row into ``runs`` and return its new id."""
    insert_sql = "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)"
    values = (payload.run_type, payload.status, payload.details)
    with get_conn() as conn:
        new_run_id = conn.execute(insert_sql, values).lastrowid
        return int(new_run_id)
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs, newest id first.

    ``limit`` is clamped to the range 1..500 before it reaches the query.
    """
    row_cap = max(1, min(limit, 500))
    query = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        ORDER BY id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        fetched = conn.execute(query, (row_cap,)).fetchall()
        return rows_to_dicts(fetched)
+ """, + (article_id,), + ).fetchone() + return dict(row) if row else None + + +def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if meta_json: + try: + meta = json.loads(meta_json) + if not isinstance(meta, dict): + meta = {} + except Exception: + meta = {} + + events = meta.get("review_events") + if not isinstance(events, list): + events = [] + events.append(event) + meta["review_events"] = events + return json.dumps(meta, ensure_ascii=False) + + +def update_article_status( + article_id: int, + new_status: str, + *, + actor: str | None = None, + note: str | None = None, + decision: str | None = None, +) -> bool: + article = get_article_by_id(article_id) + if not article: + return False + + event = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "from_status": article.get("status"), + "to_status": new_status, + "actor": actor or "system", + "note": note, + "decision": decision, + } + merged_meta = _merge_review_event(article.get("meta_json"), event) + + with get_conn() as conn: + conn.execute( + "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?", + (new_status, merged_meta, article_id), + ) + return True + + +def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: + with get_conn() as conn: + # 1) strongest key: source_url + row = conn.execute( + "SELECT id FROM articles WHERE source_url = ?", + (payload.source_url.strip(),), + ).fetchone() + if row: + return int(row["id"]) + + # 2) stable feed+guid combo + if payload.feed_id is not None and payload.source_article_id: + row = conn.execute( + "SELECT id FROM articles WHERE feed_id = ? 
def _carry_over_review_events(existing_meta_json: str | None, incoming_meta_json: str | None) -> str | None:
    """Copy the stored ``review_events`` history into the incoming metadata.

    The incoming value is returned unchanged when there is no stored history,
    when the incoming value is not a JSON object, or when it already carries
    its own ``review_events`` key.
    """
    if not existing_meta_json:
        return incoming_meta_json
    try:
        existing = json.loads(existing_meta_json)
    except (TypeError, ValueError):
        return incoming_meta_json
    history = existing.get("review_events") if isinstance(existing, dict) else None
    if not isinstance(history, list) or not history:
        return incoming_meta_json

    if incoming_meta_json is None:
        incoming: Any = {}
    else:
        try:
            incoming = json.loads(incoming_meta_json)
        except (TypeError, ValueError):
            return incoming_meta_json
        if not isinstance(incoming, dict) or "review_events" in incoming:
            return incoming_meta_json

    incoming["review_events"] = history
    return json.dumps(incoming, ensure_ascii=False)


def upsert_article(payload: ArticleUpsert) -> int:
    """Insert a new article or update the row matching ``payload``.

    The existing row is located by ``_resolve_existing_article_id``.

    On update, editorial state is preserved, because ingestion always sends
    ``status="new"``, ``content_rewritten=None`` and freshly built metadata:

    * a payload status of "new" leaves the stored status untouched (the
      workflow defines no transition back to "new"); any other status is
      written as given,
    * ``content_rewritten`` is only replaced by a non-NULL payload value,
    * the stored ``review_events`` history is carried into the new
      ``meta_json`` unless the payload supplies its own.

    Returns:
        The id of the inserted or updated row.
    """
    existing_id = _resolve_existing_article_id(payload)
    with get_conn() as conn:
        if existing_id is None:
            cur = conn.execute(
                """
                INSERT INTO articles (
                    feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
                    summary, content_raw, content_rewritten, word_count, status, meta_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    payload.feed_id,
                    payload.source_article_id,
                    payload.source_hash,
                    payload.title.strip(),
                    payload.source_url.strip(),
                    payload.canonical_url,
                    payload.published_at,
                    payload.author,
                    payload.summary,
                    payload.content_raw,
                    payload.content_rewritten,
                    payload.word_count,
                    payload.status,
                    payload.meta_json,
                ),
            )
            return int(cur.lastrowid)

        stored = conn.execute(
            "SELECT meta_json FROM articles WHERE id = ?",
            (existing_id,),
        ).fetchone()
        merged_meta = _carry_over_review_events(
            stored["meta_json"] if stored else None,
            payload.meta_json,
        )
        conn.execute(
            """
            UPDATE articles
            SET
                feed_id = ?,
                source_article_id = ?,
                source_hash = ?,
                title = ?,
                source_url = ?,
                canonical_url = ?,
                published_at = ?,
                author = ?,
                summary = ?,
                content_raw = ?,
                content_rewritten = COALESCE(?, content_rewritten),
                word_count = ?,
                status = CASE WHEN ? = 'new' THEN status ELSE ? END,
                meta_json = ?
            WHERE id = ?
            """,
            (
                payload.feed_id,
                payload.source_article_id,
                payload.source_hash,
                payload.title.strip(),
                payload.source_url.strip(),
                payload.canonical_url,
                payload.published_at,
                payload.author,
                payload.summary,
                payload.content_raw,
                payload.content_rewritten,
                payload.word_count,
                payload.status,
                payload.status,
                merged_meta,
                existing_id,
            ),
        )
        return int(existing_id)
+ """, + (safe_limit,), + ).fetchall() + return rows_to_dicts(rows) diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py new file mode 100644 index 0000000..7fd65ce --- /dev/null +++ b/backend/app/source_extraction.py @@ -0,0 +1,257 @@ +from __future__ import annotations + +from dataclasses import dataclass +from html import unescape +import re +from typing import Any +from urllib.parse import urljoin +from urllib.request import Request, urlopen + +DEFAULT_TIMEOUT_SECONDS = 10 +DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)" + + +@dataclass(frozen=True) +class ExtractedArticle: + title: str | None + author: str | None + canonical_url: str | None + summary: str | None + content_text: str | None + images: list[str] + press_contact: str | None + extraction_error: str | None = None + + +def _clean_text(raw: str | None) -> str | None: + if not raw: + return None + text = unescape(raw) + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text or None + + +def _strip_noise(html: str) -> str: + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + html = re.sub(r"", " ", html, flags=re.IGNORECASE) + return html + + +def _meta_content(html: str, attr: str, value: str) -> str | None: + pattern = re.compile( + rf"]+{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern.search(html) + if match: + return _clean_text(match.group(1)) + + # handle reversed attribute order + pattern_rev = re.compile( + rf"]+content\s*=\s*[\"']([^\"']+)[\"'][^>]*{attr}\s*=\s*[\"']{re.escape(value)}[\"'][^>]*>", + re.IGNORECASE, + ) + match = pattern_rev.search(html) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_title(html: str) -> str | None: + title = _meta_content(html, "property", "og:title") + if title: + return title + + match = 
re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) + if match: + cleaned = _clean_text(match.group(1)) + if cleaned: + return cleaned + + match = re.search(r"]*>([\s\S]*?)", html, re.IGNORECASE) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_canonical(html: str) -> str | None: + match = re.search( + r"]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + + match = re.search( + r"]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>", + html, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +def _extract_author(html: str) -> str | None: + for attr, value in (("name", "author"), ("property", "article:author"), ("property", "og:article:author")): + author = _meta_content(html, attr, value) + if author: + return author + + for pattern in ( + r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})", + r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + author = _clean_text(match.group(1)) + if author: + return author + return None + + +def _extract_images(html: str, page_url: str) -> list[str]: + images: list[str] = [] + seen: set[str] = set() + + for prop in ("og:image", "twitter:image"): + pattern = re.compile( + rf"]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>", + re.IGNORECASE, + ) + for match in pattern.finditer(html): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + for match in re.finditer(r"]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE): + src = match.group(1).strip() + abs_src = urljoin(page_url, src) + if abs_src not in seen: + seen.add(abs_src) + images.append(abs_src) + + return images + + +def _extract_content_text(html: str) -> str | 
None: + section = None + for pattern in ( + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + r"]*>([\s\S]*?)", + ): + match = re.search(pattern, html, re.IGNORECASE) + if match: + section = match.group(1) + break + + if not section: + section = html + + paragraphs = [] + for match in re.finditer(r"]*>([\s\S]*?)", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and re.search(r"\b(pressekontakt|press contact|kontakt)\b", text, re.IGNORECASE): + paragraphs.append(text) + + for match in re.finditer(r"]*>([\s\S]*?)

", section, re.IGNORECASE): + text = _clean_text(match.group(1)) + if text and len(text) > 2: + paragraphs.append(text) + + if paragraphs: + return "\n".join(paragraphs) + + stripped = _clean_text(section) + return stripped + + +def _extract_press_contact(content_text: str | None) -> str | None: + if not content_text: + return None + + lines = [line.strip() for line in content_text.split("\n") if line.strip()] + marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE) + for idx, line in enumerate(lines): + if marker_re.search(line): + chunk = [line] + for nxt in lines[idx + 1 : idx + 6]: + if re.search(r"\b(original\-content von|ots:|newsroom:)\b", nxt, re.IGNORECASE): + break + chunk.append(nxt) + return _clean_text("\n".join(chunk)) + + match = re.search( + r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)", + content_text, + re.IGNORECASE, + ) + if match: + return _clean_text(match.group(1)) + return None + + +def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle: + try: + req = Request( + url=url, + headers={ + "User-Agent": DEFAULT_USER_AGENT, + "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", + }, + ) + with urlopen(req, timeout=timeout_seconds) as resp: + raw = resp.read() + charset = resp.headers.get_content_charset() or "utf-8" + html = raw.decode(charset, errors="replace") + except Exception as exc: + return ExtractedArticle( + title=None, + author=None, + canonical_url=None, + summary=None, + content_text=None, + images=[], + press_contact=None, + extraction_error=str(exc), + ) + + html = _strip_noise(html) + title = _extract_title(html) + author = _extract_author(html) + canonical_url = _extract_canonical(html) + summary = _meta_content(html, "name", "description") + content_text = _extract_content_text(html) + if not summary and content_text: + summary = _clean_text(content_text[:320]) + images = _extract_images(html, url) + press_contact = 
_extract_press_contact(content_text) + + return ExtractedArticle( + title=title, + author=author, + canonical_url=canonical_url, + summary=summary, + content_text=content_text, + images=images, + press_contact=press_contact, + extraction_error=None, + ) + + +def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]: + return { + "title": article.title, + "author": article.author, + "canonical_url": article.canonical_url, + "summary": article.summary, + "images": article.images, + "press_contact": article.press_contact, + "extraction_error": article.extraction_error, + } diff --git a/backend/data/rss_news.db b/backend/data/rss_news.db new file mode 100644 index 0000000000000000000000000000000000000000..1b1c3900225265e8b460abed889fc85eda838c00 GIT binary patch literal 94208 zcmeHwTWlOxnjV{^NKrD`p7DB!fuiggVrL|d+3dP^RqNuRsg}$UMadFn*=u-+x}55+ zVRsc>RV`X_yjgiXJL}2X39`rl$wT%b1I$|2!1$OsofaD>_ zVu4_SO}6r^BF3p^p{G-X2 zCLWCc>3A^yY~gWX{h%T~KjD6Hdj9#dg-5fV@6b0r=gloMj6J(aqph8`)%2nUb+*hn z4}5y{c5P*|M%Fg2)LtP6CMP#HNS@_Y zh!xL*&hO=0d}S5|&vzBFy9-GM&soZwx>x6+J=#mw-LLJL>G?~T1nC&`P<0m)d1p=?GR`Tr zu_5an?7i^Cw@*&buc5MUbf!RF@WJ@a99d;o$GTz4n!gvj_@nPmP0yb_TlkldPZ-wUWQ7Pzd=*38UK)T%^bhhlE?Z^2*kSn#T zD|gp7i7?o!bfrp)>LA%kf6k}7=g8`dwbko{Eyq|7Q6@Q`gfEeEVbTyJ%Mbm=aed}oN zur!R_ee^P}-n?B~ySBmR!ugEY7s>6~RjjHTtF=2BbMc%-wmw{`t=GUz@6^)eb+Fgz zI-m0*5MJ-%wpgB?nqNIz7)zFt=#3`CqAgHZkUZ~k5;33MC*OH^VtW4BXA3{Pm&|^~ zMA`qTo>|XAj%~)Hu&wyP&e1b5r5!fg_o!?ix_YBc+bQLsWZ(1ajBcW~ZPO@XKV8#n zB7Q#_0(oXq#z!At*Yk6kC@Exc%p6dv*Nl?bp7n3>-MbUh^YinC_hu6C8F@s>pUEEh zlb~b6KU*mGmg~V5`)2FN;mH@hy|GDYbFefo+M+&VEqe?immfN-hB%lFsMQZZv}s21 zRx5Bk*F&NE^MfRf-JtgN-UJ@3U+bOxwrok&BY^ZTz9y>zuWIW>Q2{-9MLA*U@d z`C_}8lWcQxYkX?{x#tS6CWM$R91+Fb@-dInbtHeB=poL&$n%VLmXLJu-9}-0{>+)e zd*{+J6Nx9vx-s5cMjCf)Bt8uNT4qEK;oLC|ThSr$P0yx2O6XKINL!&7ZErQ{ecC5V zbcp`k1KMnY_2k^LD?`FD>mj8+Tf(w*_!Sl#=e<0WGrt;TJ2#~9u{{MiiW&7eKjgop}!F< z=+WuI?3nuKG>q#rV|}e4t7dBldf3Bi45O%w|I*g|I9hJRZ$AIVB|FXat)}VMJ?D~` 
zfYaL=(WuJ8K0&j^zZ{z^{_%M6&x*e+{!Q_ZKk^{+j2r=u07rl$z!BgGa0EC490861 zM}Q;15#R`X!U#MyIdQg-JEomFU6?pi$h<`+Q1i(dxOdHrPs|q*$CdGu6BExCm}A-m zjhOv^eC}_?ia)|1{^1C41ULd50geDifFr;W;0SO8I0762j=&cPfk#in4WRJrkBW)? zU7olXH2hXzdCd{c?z?%uj^Vx6z%FT3Q5^^pq=6t&-7P#dHuLpIr_rL0OTdV&>30I? zt3{{^@;*VQP~mv7Jyv{-Km5ZH;0SO8I0762jsQo1Bft^h2yg^A0vrL3!0{n)b@EZM zFyi)pP}wlPIthirh{g1436k^wr_YSd{qx!1z!(2;1ULd50geDifFr;W;0SO8zN`qm zJ2i1~?AOo8@{i8qNG^?HFYvR=5&H~j``YVYT3QCK|79^q$0FF>YIt1un4u>0n-H0E_==<9WFr-iqdNOxBm zOdq4GgPWz^&St~N0{faZFc@#Y{`NcD<0&D$TR(yjl&ais-DN@h5ds^`zPkG$$skBZ zpc|VnG#N0o3qztU44(35K^*O&C*A9Bvs)}Di~obI@$<>!Sbp>-YOF zjn9u2>W^k0U9Z=lfA!t@AHOwLc;<(mERP??Aa+`1QB_4jE^Cq~D1s;(rebPR#j#~W z)|85F>5eK<(RLKs)kRA&WW}H=mDGyvD7LDZvg&AtVab{+Shjx447VdN3WDoDpL|#c ze`v&HH4LJtw92l>4j+_Q66#k;&GX51+`t_W_sQdsT%qkavKzR)0(a0q`3Wl|q{vR% zzC-TV4P2s0{gL`IQsSJM3(BS_R#gethdd|1T}JSHH~3E9-6Z6EU%+Cg-`85!i&a88 z&z}qq>*b>j;Jm%M{3SBbjwCL}5R$^KthxI9$#ewfVzr1x5NV&Oh_ z8eUou7DyD@drh}ZfS%q|)54w95CoQe8kgfnyJh(ZC`WM+KG=#{X0utAB%QjlrIl<` zG)szMxFyRm^^&9-RH-PUD^shS1GfDAo%#aV^_;kIX+acGF}jPMUo4}~mpdc+3C*j| zCOwg%#%>?dS*MIH?|76nJP`UW#Wn6BBTxn?C<$+4F)_LrBnGpJ8~5K@Fk{H$R-4K9 zkapi%$ZnBl_Y*V8IK8hZ5g^l7PvrL>v&h3%nyxHpM0zLk2zk0CY?dv%`KYVZy<3l2 zd~)j#iw#1|Y#(n|0F`bZRpB6pncSJ&&&+an+#^}piBO*GMwp21wb<=R$sNZmWxkS% z`X;-DiEWFP4ONyZ*@C+8_S<_&l8c%S*g1VrFMy{}#?vSTPm|w&hViue(L7BNRXt&8 zx@2gQEsC;Y8Ja0Mj;4Z3RZOiSS0q8#OzLQkZa5W7v=zrysBJ5<>f)bL(Ii8svMOC` zK6z|!M|FzzvVj+1JZ!hf%UCh}*z>V-s#nRaAd2n4p=3S4J0h^FBUo-@>NR8X5VS{b zc+Dor3M5%;hXK}PvO(KX98$bq;NfAkgJJ3E$|z|&7*{q0qpAR7WfX>%v12;exhlEx z_G@iHg5V#`AwbfKKZcWGr@p>h5ROC@r;Kc%eqG` zxIzEgF71PEJ>G zG|e@sQ4%y)DJin1mMlZnN)<~M@E!@NVhDEmM&O$yr7z9U+h`AlN zaY6sZa$4}k^3LTKIIHbn(>we$moz>urxjI2N>}=QIW6|KUt@2qc$U4{@7LI?{GJ(S z{{N@uzB*QH7OQh#E&l7dD|0`ayI=fo#hJNZAc}uD0vrL307rl$z!BgGa0EC490861 zN8k&KfH+Y<)Hj_ejB&eu1zJ|BYjg9n|Ktcl;n5rcjsQo1Bft^h2yg^A z0vrL307u}Z+xkN=k7ZL#s%ZY&$l7872-TX%Q(D7Dz!LGQPqso?S zIg;tPQl`bXBtq(Qp+MDe9siSh`_2ieRdiig3fQao7n*o($Ql 
zsG6jjrXZN|$+Rt+4ePsQ3yLAhTE)_J6ogt8#Y7VvO*S3Fb)<@I$f7NY4mBK0HZAl{ zumr;vr;}17oWBxWTQ^+Mv24jwsbHYvf>}}2iX*C)u2d}3l?`B_;XlBlGDQ`lW>hqR zP9>$-x@j7YNOcjNHcSIJtD2@;hU8L3s#vN_bw`&hOGe?UWGbqNl3=eFsqL7SbRxyY zz;M_KRaHSmTXY4by3~*>IH6fF6dU=_VOLVb3VLM7rY!*y#lg`abl(&w(;kVM>53IY zq?W2Wf@vU&gTrNxg@e?#2C!YVBAdWgG$b2`rl~6GDA{og#Z(=2A}QtLp0^&q0;-$& zxJTR)`~U3UjTPOZH2ZhOe>Eq~{mVJG_+RG!-`s!x@=avkV~zkvfFr;W;0SO8I0762 zjsQo1Bk)BEw&zcg8(N7mEp@jqfUiO;_{S+I|+rpN1lsD}0W z{SVi$UeEs_^Yr-s50|Ic`yT+%nWM(D=eCG+|Ih9JU(^}S`^6F92yg^A0vrL307rl$ zz!BgG95VumJ(t`6x&8lhYX2|(+p*&R!Oj1_EdFKj7sa0!|D^aIihqbe{^1C41ULd5 z0geDifFr;W;0SO8I0762jsQpC?*V~R6JIZ+Z`V9EJ@HHduT+@HzE5V~)7ke__I)Dz zp3J@{vhVS!iDzbzpV|M%i+?*-{J;3aKO6y$07rl$z!BgGa0EC490861M}Q;15#R`X zfe<(`Q7BA|PfX0ru>Jo;@ejvf`~Nw<_=h9F5#R`L1ULd50geDifFr;W;0SO8I077j z&k_Rj6LLWkB&{S!B~d4WUX`?}s7~w?u2f}d{A&~POkaATD(cDk|G7UOoBQ**|6Tk_ zv0S`S+%CRX{8{lo761CP)Lq^djsQo1Bft^h2yg^A0vrL307rl$z!BgG3?OiJ;(4|= zR!gE-5|vG%QWb=%q&$bef^e!QR^PD;tXHNd$LL7SaIt}Bnz7+-S(5BH=d&eTH>D7P5M~wz63xo441lU8~(D`=>ocx;`scH<9q}tt%|w%?*+j)RX4w&D*55vic&q zee+&V#0#}+Ya2ZwtmLe)tR!-$wn_FHN-h!B=Gbe|^XL3v_uL}6_hRjKjd%{CZmwTh z_M8hnMb?R=CXkfsd@R?Lc3F4*!H8XC`Rg}BKBAusH86uE1m_N-^;i7$}9?=?<&}( z)IrI~cy4)4@;En8a$hP|auRgDlX3C2r%>|Qu9EYfv1Z5ZD1V$CC_InJ684~y=3RXB z?bFlq7tR*mnMn|7zp;mZ)`Er^HS&U<%Br}#vG&s48c9dtfIK}TO?vB2L185?_9aUx zrmJeQc86@-+$0-!*ViB2JvBZ55*oWU&{zgq4tCaBfS$Me^il2JKYL%>x``^u;5kd# ztjbYCorm^luW8-=+Mb!7zl2GUjzJGqcOj8?=F}nMya()ofWLimdVUR+eWNo4@`4Y> zZ|2Aw*u@`xcWQe6?AgKx_1LtUG_O1Vp3Z98TzO%=)|tcPeBV$Iwy3Oa zY}VLXc$~Hf(v;!2|;hCY!Za_9>`qTnPHs)J-F{W+Y+J4aSu ztgT)rY&phyh%yNn5W<(pxiD!8O@H3^(m-=(Ck*aWhlO@9V-GY8gYewq;$S0MG&Z-s z9|Z?>$qs48uMcSG`99vbPaR#Xlo8!vXNM>Q4868?ZG+8)^BJ)(lH0Yb;KLiMwL2Me z!79bphby)98e{{;#@X^Z*z0ti&v_9DuXk}Y4Q{>NE4Q`+eTejmDeqfOf> z<)CEW^XrUmqP7j8En+`i(`zDrKNUsSrxDllbD8LX$>o4jy=Iie_N;%4@7|r5 zo}ZsDyf>48&&VT6{!I43p9CEn{@Fsgw_FcWnQyj^9G-m9+nW?AZ4Q>^MO)NotYwcu z0k!%8h&IhA-f9Jo=Xxk~e}0gpu^ZIhPQ=IqQhgRziOvA^tP^QybbkMp zqL;1~C#U8w%^$Q1B;>RuCSND813JleLAS=I=AV16@M=Pc*}@S~%q<`DC|yVL$B7=| 
z?2A0lXlDsY7vF6ZrsvO`DZFImW!)I>EhCLPHWD9(el0Vihj8wghAnLRGBIF~ zI-8zNeU#9tYA{LYMcZ3VdY|@*5*?yH_kcE=*p}tovMWQvG3z0vK3l@Fbodn>u#yck z$M)gmeh|}iVH#|HGJ(=_`k%cS)JyNPAmFD?{@GaZzZP%J{q@}G+3(F3p8DBS8>j!` z)GucK0ulVf5#R`L1ULd50geDi;B$b$JJVmCK6Byh$seqA86)y-dzUpL56T7VE^|a4 z)s?zOZtAy3uux$O78E!|= z{Sm1Do_tt`mah?$)i8*n(ke5Lg%3(B3H7U_=K170EN}sFpF9r9721v?yU{d#*r1>M zgcTA}WG8LkA$RP?E_JBih~u58S}yPI?k?|yFi_B)AdJoCvO~)|t#WT)MoOHYiW%^n z@q9Pl8gZpOe-N%sJq2YaKT}|KeC>x>#cE_uG01#TtbX=B55*w2hSiM}c5z?@VLSXr9 z(ME^?Yq>z2Sk!Bg+tiB?;lRPCL6(=7FIty7{bo2^EL)e!YZpy|Qrt@mBaO$x_FNiXS`ZdU6xw@Dw@rYaQeF!-DHzL^eHxeJM!RMC2q;H!5I)$7 zT4u9ZmL#3JvZa-5Q#4D8VYnsBG4+z98dRw$qAOFYoCCJ}{hj&(+4Y>bacMymQ8BuU zo?k4Z&zCzR`U%ae&n7*Qp~h}t-Y;BUr;IM|c$73e5c)2K88>7E$^Zo=fyJ8`T?`U~ zS%qc&tpzj2?#OD#Uf?fO$y*DVyvXbmcwV@btTqcvB#+4LA1v}P%I?a7Mx-{85%P4~ zNm#b4rWf9gXjiFwRu2}R*gRNl02e&Zhj#_2bo;0Z2QfqmOh``bGc5PLpy9Uy%WJZ- zlNirIxP&LK#TMbj#=%nNE2)@xRVL>Vmkm{xDw!+d!rO1}CFvYl+$ZRRdI3C*GM+{$ zc$)nFGmNLzkLGDTFE^HKQIr+S&`im3G!L%JtvIeiZCjC5 z7yp!sCK)=FRq0yu$zyvvs#C0&4LHC&Y`4hESTX(B^I-t5SIMm)itWImWIce7NC>jS zavM{x8Iy;gJ#xcqHbGV($yz%MuqKlY+K%Fo!oA?(VYGu`>FLTSX*+=SJ9`et&@y&R z2Rm0KSDriu=WRMc=$5;BLe2+qv`9)|F$#c8$J^Z2w2E(G8WHx z2(qku)S^qo2@sW}eP}kCi3|W^I?(iKqea7Sr`(8T7{uKQu|UTZt0Y6&@vsiG$alfT z$bI+-VZP&$V%Vy1?2Jg=^dFLX7(Dp{O43nYCXz09k4~q)l)W8{%4och?#7=m5C5%}=^#rR_YzbOg%;NfRJZ4T#w zUs9?YnhEyN)Mz~L;?CuZiPUoWX2v%luP5Gdm^l!v4kmmE>yIrkHs&6O1uqo_!BHb( zZpUpH=r5Mjf-jbLF2BH8ZU36y;itKz@o_n=s47x=Bw%Pxo9_R=H+J$L7rys7-28sV zoBY8)ocvm{H-9zRnlJv|?yy^PQN#I_vL@myTUH7b* z?1Wf`JSY|*LwGwo(Gr2LuGx0e9c+q;ZMJr>m4jqQ67j(&9zZIwm5;1NK@)q3Hlmp# zVw2hp-`mC}IbzB0FzF?1V1JRs-h%dmrR*&>+fYO9pr{odw~$>Zxf+BmKp$nh=xCXB z2krFPW`(Rn4T1;Lh0pO4*>0N1vEFvmYEoI`U_eHbMS6$yQx`_k>W@^5GK5mygVb7C!ok=y;`% zgGqfP>7&M_54O?oS;CK9{LoZ&49= z*lcz zmzS&L1sXzo)M2b8a)&wmXItwG9G59PutT^THcjwkCQuvz1!E2KMZG1Hf|E-e`L}E$C&IQQT^X zvv{RD@G%a;C1yWB>+A;fRx-8jXxT;_)imX&*TJDjAwywN<_1e~o6Ik?4 z9N623bK5T)@j(c^&>_GP7;i6G9WUcOdXeA^cyvvB&zgVi3|Iv({3!eX@&7(HGl@U^ z!x8ubA@F8Q>P`+ro1S=5iM}lkXhATQyzE~<1 
z49&3xv!bXKM^r6csaU2f8#WT+zd{8nQ^nLp&4B8L_VB?fxz>)EMFM(Lw%hMWRNH0~ zVhyub(C8skDwN5}m6g?0bCD}9pfZ5<9j4-3oddaO)eO!1CJvdJ{&vu6#YYH|-8~y2 zX{y12#MytV$#4=U>q@m_Y3@jstC{u%)*y&!KBWW-mpU+MknNlmdTSa+QBU3XJ9g;6os1v z^_Q7<3IxzZluhJ#65$9i>Up@ED3g3#lTe4tMYKcFl4dh=ra+O%7@6_IWQ=Zjemkm% zCaW=1+5m&m!i>hoHreT_JlGAj{kX))r z6-$+=?&y+b$>1LDAZ{?M2;2sg_T}-@>B4! zCey*gJK_j#(gW2fX>;Sjz?E6Uv7UR(XYE_$1+VEabw|^zV_9zR;1Y!rxke*^!*uu& zKy?yD15_U-V1>HC-&21ukl08> zh+toq_T>bBXl~XA;PJAvQK!mCtZ#+0KCr&t@jGtbS26NS4o9>9kNwHm$=|`B&-3S> zR!^o({LwcPCSLl04#~t7p&~O=yCzC1Y=Jga9Z8~=u4MWYL#CT0Glqz)PG+FKwMKpLYyE!LH7%l9eNA zyS(`_xktT*pBbTD3Kn#f0imbeM{v}hwE1QZZkoN*9wv{)4kmQB%`s#p3&KtEq;>j?@8^)Y97#Vh@=9`031|g7zK2-wa}FREvn!AwLJ@FsVeq&|*9494mv<6h`f_@&S2>9*v3XsBfDF-` zh9wvk{axB9d9CDUgr80Hm&k0_n#n8b-RpPT}y z?wzJs=TZpsLhR;v$8CiGBOTca@xktN<_7Tf#0{YK{`Fzq05naJl!SZ3_t@2JaB<5P z6ho3V_`d02;Mi;{P{&GwqsgXYxQ$U{L7H>l<*aXLE#x1su zhJ70<#dediJ6O2kvX4U!H=7UOGlsJv4o>|vnb$gu=FDr1u?^-1cIcT9=7rJIMt$4A%mb&usraAOb0$?Y+%bTw++;?VUtt%7U)|Sf9XEJmKrh@IJPxx!5b%t~ z*Lw%=Q)ldH^~cHCE7Yh6vLuvL3m1_qx+0cLoDwZrl?u*iQdd)Cx%_=LVM=j2VYYU= zCljOJ!^?u?Sk*go!^*_X17-{=|jWgbXrT^GD zuYiLfAI1JZw#D}UTc5!8|DXNFsf@9=62@MBACF)EUHNDetjLt=G8nt6i*m)O2#O*> zqb}=?;0lhcSryk+;TDgRdxE7X8r3a9pthldnH%s26k(&qNhH}db=A^tV`u0y@3yND zWZ9CB-|TYV7I+ib1`a480S<_TZqURzN>~HIgV;`;nP|(Mv(HSs3PuUXvO2^|c+8=w z)CU~uVu#>(AXDt#ZG=a0^`6d+l(-`T^uO0)I1%7h80^66*};SD-&5~5a{U^Zf5=Wc zcI=5*jhS+s8Dtp;XUELdgr9fvn=swR;IvH3o}-AcB%XNK4q&s*^xXbF$RiNgzd&*N zmRi8uhy9uvU~gd8o*dv~NBQzY>^@n4@*&jQuxzk1>_~ypyqkzJ>~LUbx1QQ{nPr$M z?4gp!z~GA=0=$VfZsYX4%j_deVW08+{V~a%vVhBoT)$^Vj+e*pPxs6b5i~y% zcfX!qcFT{__j0XcXYe@YmAw&Y;cXZXQq;F_PqH6JIDr4!TMO)sHQ-yVoqf;aE9x?i zh~dynCM7TfL>RQPcdD=~5Yt$zNv>mP6$uJ^$Hp(i#F2i@qHsv2wx;N^uGxYPMSx&e zm{vfxWXq9oF31#B!BBPGPPW3yp7c}M|Fb3^#r!+U0+zVL{QF;9H5~IFLa68a|3lRy zBT522gZJr60!j79DG8_&CXTCHB?Ah!l7d@8;P_v0N|GknuIuPBd|mkd|6}Q)hL;Wc li0gP|gCUlRkt77Z|4(iy>m4(2%;UUiN0W42A94Tx{{iTtj_Uvb literal 0 HcmV?d00001 diff --git a/backend/requirements-test.txt b/backend/requirements-test.txt new file mode 
100644 index 0000000..cf39f84 --- /dev/null +++ b/backend/requirements-test.txt @@ -0,0 +1,3 @@ +pytest==8.3.5 +pytest-cov==6.0.0 +httpx==0.28.1 diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..f4ffe61 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,8 @@ +fastapi==0.116.1 +uvicorn[standard]==0.35.0 +itsdangerous==2.2.0 +pydantic-settings==2.10.1 +python-dotenv==1.1.1 +feedparser==6.0.11 +jinja2==3.1.4 +python-multipart==0.0.20 diff --git a/backend/static/admin.css b/backend/static/admin.css new file mode 100644 index 0000000..348264f --- /dev/null +++ b/backend/static/admin.css @@ -0,0 +1,189 @@ +body { + margin: 0; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + background: #f4f6f8; + color: #1f2937; +} + +.topbar { + display: flex; + justify-content: space-between; + align-items: center; + padding: 20px 28px; + background: #0f172a; + color: #f8fafc; +} + +.container { + padding: 20px 28px 28px 28px; +} + +.login { + max-width: 520px; + margin: 60px auto; +} + +.card { + background: #ffffff; + border-radius: 10px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); + padding: 16px; + margin-bottom: 16px; +} + +.stats { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; + margin-bottom: 16px; +} + +.stat { + background: #ffffff; + border-radius: 10px; + padding: 12px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); +} + +.stat .label { + font-size: 12px; + color: #64748b; +} + +.stat .value { + font-size: 24px; + font-weight: 700; +} + +.grid.two { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 16px; +} + +.stack { + display: grid; + gap: 10px; +} + +.row { + display: flex; + gap: 8px; + align-items: center; +} + +.filter-row { + margin-bottom: 10px; +} + +.inline { + display: flex; + gap: 6px; + align-items: center; +} + +table { + width: 100%; + border-collapse: collapse; +} + +th, td { + text-align: left; + padding: 8px; + border-bottom: 1px solid 
#e5e7eb; + vertical-align: top; +} + +input, select, button { + padding: 8px; + border-radius: 6px; + border: 1px solid #cbd5e1; + font: inherit; +} + +button { + background: #0ea5e9; + border-color: #0ea5e9; + color: white; + cursor: pointer; +} + +button.secondary { + background: #64748b; + border-color: #64748b; +} + +.badge { + display: inline-block; + padding: 2px 8px; + border-radius: 999px; + background: #e2e8f0; + font-size: 12px; +} + +.badge.ok { + background: #dcfce7; + color: #166534; +} + +.badge.bad { + background: #fee2e2; + color: #991b1b; +} + +.alert { + margin-bottom: 12px; + padding: 10px; + border-radius: 8px; + background: #fee2e2; + color: #991b1b; +} + +.flash { + font-weight: 600; +} + +.flash-success { + border-left: 4px solid #10b981; +} + +.flash-error { + border-left: 4px solid #ef4444; +} + +.subtle { + color: #64748b; + font-size: 12px; + margin-top: 4px; +} + +.pre { + white-space: pre-wrap; + line-height: 1.35; + max-height: 220px; + overflow: auto; + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 8px; + margin-top: 6px; +} + +.linkbtn { + display: inline-block; + padding: 8px 10px; + border-radius: 6px; + text-decoration: none; + border: 1px solid #cbd5e1; + color: #334155; + background: #f8fafc; +} + +@media (max-width: 920px) { + .stats { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + .grid.two { + grid-template-columns: 1fr; + } +} diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html new file mode 100644 index 0000000..36b30a7 --- /dev/null +++ b/backend/templates/admin_dashboard.html @@ -0,0 +1,235 @@ + + + + + + {{ title }} + + + +
+
+

rss-news Admin Dashboard

+

Angemeldet als {{ user }}

+
+
+ +
+
+ +
+ {% if flash_msg %} +
+ {{ flash_msg }} +
+ {% endif %} + +
+
+
Quellen
+
{{ sources|length }}
+
+
+
Feeds
+
{{ feeds|length }}
+
+
+
Artikel
+
{{ articles|length }}
+
+
+
Runs
+
{{ runs|length }}
+
+
+ +
+
+

Quelle anlegen

+
+ + + + + + + +
+
+ +
+

Feed anlegen

+
+ + + + + +
+
+
+ +
+

Ingestion starten

+
+ + +
+
+ +
+

Quellen + Policy

+ + + + + + {% for s in sources %} + + + + + + + + + {% endfor %} + +
IDNameRiskLizenzTermsPolicy
{{ s.id }}{{ s.name }}{{ s.risk_level }}{{ s.license_name or "-" }}{{ s.terms_url or "-" }} + {% if source_policy[s.id] %} + BLOCKED ({{ source_policy[s.id]|length }}) +
{{ source_policy[s.id]|join(", ") }}
+ {% else %} + OK + {% endif %} +
+
+ +
+

Artikel (Review)

+
+ + + + Reset +
+ + + + + + {% for a in articles %} + + + + + + + + + {% endfor %} + +
IDArtikelStatusDetailsReviewTransition
{{ a.id }} + {{ a.title }}
+ Autor: {{ a.author or "-" }}
+ Original öffnen + {% if a.canonical_url and a.canonical_url != a.source_url %} +
Canonical öffnen + {% endif %} +
{{ a.status }} + {% if a.summary %} +
Summary: {{ a.summary }}
+ {% endif %} + {% if a.content_raw %} +
+ Volltext anzeigen +
{{ a.content_raw }}
+
+ {% endif %} +
Bilder: {{ a.extracted_images|length }}
+ {% if a.extracted_images %} +
+ Bild-URLs +
    + {% for img in a.extracted_images %} +
  • {{ img }}
  • + {% endfor %} +
+
+ {% endif %} + {% if a.press_contact %} +
+ Pressekontakt +
{{ a.press_contact }}
+
+ {% endif %} + {% if a.extraction_error %} +
Extraktionsfehler: {{ a.extraction_error }}
+ {% endif %} +
+ {% if a.status == "review" %} +
+ + + +
+ {% else %} + - + {% endif %} +
+
+ + {% if allowed_transitions.get(a.status, []) %} + + {% else %} + keine Aktion + {% endif %} +
+
+
+ +
+

Runs

+ + + + + + {% for r in runs %} + + + + + + + + {% endfor %} + +
IDTypStatusStartEnde
{{ r.id }}{{ r.run_type }}{{ r.status }}{{ r.started_at }}{{ r.finished_at or "-" }}
+
+
+ + diff --git a/backend/templates/admin_login.html b/backend/templates/admin_login.html new file mode 100644 index 0000000..10e55e7 --- /dev/null +++ b/backend/templates/admin_login.html @@ -0,0 +1,27 @@ + + + + + + {{ title }} + + + +
+

rss-news Admin

+

Bitte anmelden, um das Tool zu verwalten.

+ {% if error %} +
Login fehlgeschlagen. Bitte pruefen.
+ {% endif %} +
+ + + +
+
+ + diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..46816dd --- /dev/null +++ b/backend/tests/__init__.py @@ -0,0 +1 @@ +"""Tests package.""" diff --git a/backend/tests/test_admin_ui.py b/backend/tests/test_admin_ui.py new file mode 100644 index 0000000..c6b2188 --- /dev/null +++ b/backend/tests/test_admin_ui.py @@ -0,0 +1,65 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestAdminUi(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "admin_ui.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_admin_login_and_dashboard(self) -> None: + login_page = self.client.get("/admin/login") + self.assertEqual(login_page.status_code, 200) + self.assertIn("rss-news Admin", login_page.text) + + login = self.client.post( + "/admin/login", + data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + self.assertEqual(login.status_code, 200) + self.assertIn("Admin Dashboard", login.text) + + def test_dashboard_redirects_if_not_logged_in(self) -> None: + res = self.client.get("/admin/dashboard", follow_redirects=False) + self.assertEqual(res.status_code, 303) + self.assertEqual(res.headers.get("location"), "/admin/login") + + def test_create_feed_with_empty_source_id_does_not_error(self) -> None: + self.client.post( + "/admin/login", + 
data={"username": "admin", "password": "secret"}, + follow_redirects=True, + ) + # empty source_id used to cause validation issues in form parsing + res = self.client.post( + "/admin/feeds/create", + data={"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""}, + follow_redirects=False, + ) + self.assertEqual(res.status_code, 303) + self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard")) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_api_auth.py b/backend/tests/test_api_auth.py new file mode 100644 index 0000000..aa86821 --- /dev/null +++ b/backend/tests/test_api_auth.py @@ -0,0 +1,77 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestApiAuth(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "api.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def test_login_and_protected_endpoint(self) -> None: + r = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(r.status_code, 200) + + p = self.client.get("/api/protected") + self.assertEqual(p.status_code, 200) + self.assertTrue(p.json().get("ok")) + + def test_protected_requires_auth(self) -> None: + r = self.client.get("/api/protected") + self.assertEqual(r.status_code, 401) + + def test_run_detail_endpoint(self) -> None: + login = 
self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post("/api/runs", json={"run_type": "ingestion", "status": "running"}) + self.assertEqual(created.status_code, 200) + run_id = created.json()["id"] + + detail = self.client.get(f"/api/runs/{run_id}") + self.assertEqual(detail.status_code, 200) + self.assertEqual(detail.json()["item"]["id"], run_id) + + def test_source_policy_check_endpoint(self) -> None: + login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"}) + self.assertEqual(login.status_code, 200) + + created = self.client.post( + "/api/sources", + json={ + "name": "Policy Source", + "risk_level": "yellow", + "is_enabled": True, + }, + ) + self.assertEqual(created.status_code, 200) + source_id = created.json()["id"] + + check = self.client.get(f"/api/sources/{source_id}/policy-check") + self.assertEqual(check.status_code, 200) + body = check.json() + self.assertFalse(body["allowed"]) + self.assertGreaterEqual(len(body["issues"]), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_article_workflow.py b/backend/tests/test_article_workflow.py new file mode 100644 index 0000000..28bb1eb --- /dev/null +++ b/backend/tests/test_article_workflow.py @@ -0,0 +1,95 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from fastapi.testclient import TestClient + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.main import app + + +class TestArticleWorkflow(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "workflow.db") + os.environ["APP_ADMIN_USERNAME"] = "admin" + os.environ["APP_ADMIN_PASSWORD"] = "secret" + config_module.get_settings.cache_clear() + init_db() + self.client = TestClient(app) + self.client.post("/auth/login", 
json={"username": "admin", "password": "secret"}) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + os.environ.pop("APP_ADMIN_USERNAME", None) + os.environ.pop("APP_ADMIN_PASSWORD", None) + self.tmp_dir.cleanup() + + def _create_article(self) -> int: + source = self.client.post( + "/api/sources", + json={ + "name": "Workflow Source", + "base_url": "https://example.org", + "terms_url": "https://example.org/terms", + "license_name": "cc-by", + "risk_level": "green", + "is_enabled": True, + "last_reviewed_at": "2026-02-18T00:00:00Z", + }, + ) + source_id = source.json()["id"] + + feed = self.client.post( + "/api/feeds", + json={"name": "Workflow Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True}, + ) + feed_id = feed.json()["id"] + + article = self.client.post( + "/api/articles/upsert", + json={ + "feed_id": feed_id, + "source_article_id": "wf-1", + "source_url": "https://example.org/a1", + "title": "Workflow Artikel", + "summary": "s", + "content_raw": "c", + "status": "new", + }, + ) + return article.json()["id"] + + def test_valid_transition_chain(self) -> None: + article_id = self._create_article() + + t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "review"}) + self.assertEqual(t1.status_code, 200) + + r1 = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve", "note": "ok"}) + self.assertEqual(r1.status_code, 200) + self.assertEqual(r1.json()["to_status"], "approved") + + t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(t2.status_code, 200) + + final = self.client.get(f"/api/articles/{article_id}") + self.assertEqual(final.status_code, 200) + self.assertEqual(final.json()["item"]["status"], "published") + + def test_invalid_transition_rejected(self) -> None: + article_id = self._create_article() + bad = 
self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"}) + self.assertEqual(bad.status_code, 400) + + def test_review_only_allowed_in_review_status(self) -> None: + article_id = self._create_article() + bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"}) + self.assertEqual(bad.status_code, 400) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_db_repositories.py b/backend/tests/test_db_repositories.py new file mode 100644 index 0000000..825ae8d --- /dev/null +++ b/backend/tests/test_db_repositories.py @@ -0,0 +1,119 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + RunCreate, + SourceCreate, + create_feed, + create_run, + create_source, + finish_run, + list_articles, + list_feeds, + list_runs, + list_sources, + upsert_article, +) + + +class TestSQLiteRepositories(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + self.db_path = str(Path(self.tmp_dir.name) / "test.db") + os.environ["APP_DB_PATH"] = self.db_path + config_module.get_settings.cache_clear() + init_db() + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + def test_end_to_end_basic_crud(self) -> None: + source_id = create_source( + SourceCreate( + name="GovData", + base_url="https://data.gov.de", + terms_url="https://www.govdata.de/dl-de/by-2-0", + license_name="dl-de/by-2-0", + risk_level="green", + is_enabled=True, + notes="test source", + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.assertGreater(source_id, 0) + + feed_id = create_feed( + FeedCreate( + name="GovData RSS", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + 
self.assertGreater(feed_id, 0) + + run_id = create_run(RunCreate(run_type="ingest", status="running", details="start")) + self.assertGreater(run_id, 0) + finish_run(run_id=run_id, status="success", details="ok") + + article_id = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung", + content_raw="Originaltext", + content_rewritten="Umschreibung", + word_count=120, + status="review", + meta_json='{"lang":"de"}', + ) + ) + self.assertGreater(article_id, 0) + + # Upsert with same source_url updates same row + article_id_2 = upsert_article( + ArticleUpsert( + feed_id=feed_id, + source_article_id="abc-1", + source_hash="hash-abc-1", + title="Beispielartikel aktualisiert", + source_url="https://example.org/articles/1", + canonical_url="https://example.org/articles/1", + published_at="2026-02-18T00:00:00Z", + author="Max Mustermann", + summary="Kurzfassung 2", + content_raw="Originaltext 2", + content_rewritten="Umschreibung 2", + word_count=140, + status="approved", + meta_json='{"lang":"de","v":2}', + ) + ) + self.assertEqual(article_id, article_id_2) + + self.assertEqual(len(list_sources()), 1) + self.assertEqual(len(list_feeds()), 1) + self.assertEqual(len(list_runs()), 1) + + articles = list_articles() + self.assertEqual(len(articles), 1) + self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert") + self.assertEqual(articles[0]["status"], "approved") + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py new file mode 100644 index 0000000..05b2c2b --- /dev/null +++ b/backend/tests/test_ingestion.py @@ -0,0 +1,122 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import 
patch + +from backend.app import config as config_module +from backend.app.db import init_db +from backend.app.ingestion import run_ingestion +from backend.app.repositories import FeedCreate, SourceCreate, create_feed, create_source, list_articles +from backend.app.source_extraction import ExtractedArticle + + +class TestIngestion(unittest.TestCase): + def setUp(self) -> None: + self.tmp_dir = tempfile.TemporaryDirectory() + os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "ingestion.db") + config_module.get_settings.cache_clear() + init_db() + + source_id = create_source( + SourceCreate( + name="Test Source", + base_url="https://example.org", + terms_url="https://example.org/terms", + license_name="cc-by", + risk_level="green", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + self.feed_id = create_feed( + FeedCreate( + name="Test Feed", + url="https://example.org/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + def tearDown(self) -> None: + config_module.get_settings.cache_clear() + os.environ.pop("APP_DB_PATH", None) + self.tmp_dir.cleanup() + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None: + mock_extract_article.return_value = ExtractedArticle( + title="Artikel 1 original", + author="Autorin A", + canonical_url="https://example.org/article/1", + summary="Original Summary", + content_text="Original Volltext", + images=["https://example.org/a.jpg"], + press_contact="Pressekontakt: Team A", + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-1", + "modified": "Tue, 18 Feb 2026 10:00:00 GMT", + "entries": [ + { + "id": "item-1", + "title": "Artikel 1", + "link": "https://example.org/article/1", + "summary": "A", + }, + { + "id": "item-1", + "title": "Artikel 1 aktualisiert", + "link": "https://example.org/article/1-neu", + 
"summary": "B", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + self.assertEqual(stats.entries_seen, 2) + self.assertEqual(len(list_articles()), 1) + article = list_articles()[0] + self.assertEqual(article["title"], "Artikel 1 original") + self.assertEqual(article["author"], "Autorin A") + self.assertIn("Original Volltext", article["content_raw"] or "") + self.assertIn("Pressekontakt", article["meta_json"] or "") + + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_blocks_non_green_source(self, mock_parse, mock_extract_article) -> None: + # Re-create source/feed with yellow risk to verify enforcement + source_id = create_source( + SourceCreate( + name="Blocked Source", + base_url="https://example.net", + terms_url="https://example.net/terms", + license_name="custom", + risk_level="yellow", + is_enabled=True, + notes=None, + last_reviewed_at="2026-02-18T00:00:00Z", + ) + ) + blocked_feed_id = create_feed( + FeedCreate( + name="Blocked Feed", + url="https://example.net/feed.xml", + source_id=source_id, + is_enabled=True, + ) + ) + + stats = run_ingestion(feed_id=blocked_feed_id) + self.assertEqual(stats.status, "success") + self.assertEqual(stats.articles_upserted, 0) + mock_parse.assert_not_called() + mock_extract_article.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/tests/test_source_extraction.py b/backend/tests/test_source_extraction.py new file mode 100644 index 0000000..f6787ff --- /dev/null +++ b/backend/tests/test_source_extraction.py @@ -0,0 +1,69 @@ +import unittest +from unittest.mock import patch + +from backend.app.source_extraction import extract_article + + +SAMPLE_HTML = """ + + + + + + + + + + + +
+

Dies ist der vollstaendige Inhalt des Artikels.

+

Weitere relevante Informationen fuer die Meldung.

+

Pressekontakt

+

Musterfirma GmbH, Kontakt: presse@example.org

+
+ + +""" + + +class _FakeHeaders: + @staticmethod + def get_content_charset(): + return "utf-8" + + +class _FakeResponse: + headers = _FakeHeaders() + + def __init__(self, body: str): + self._body = body.encode("utf-8") + + def read(self): + return self._body + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + +class TestSourceExtraction(unittest.TestCase): + @patch("backend.app.source_extraction.urlopen") + def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None: + mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML) + + extracted = extract_article("https://www.presseportal.de/pm/118273/6158137") + self.assertEqual(extracted.title, "Demo Meldung von Presseportal") + self.assertEqual(extracted.author, "Max Mustermann") + self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137") + self.assertIn("vollstaendige Inhalt", extracted.content_text or "") + self.assertIn("Kurzbeschreibung", extracted.summary or "") + self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images) + self.assertIn("Pressekontakt", extracted.press_contact or "") + self.assertIsNone(extracted.extraction_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/docs/PROJECT_PLAN.md b/docs/PROJECT_PLAN.md new file mode 100644 index 0000000..c758f5e --- /dev/null +++ b/docs/PROJECT_PLAN.md @@ -0,0 +1,67 @@ +# Projektplan (Neustart) + +## Leitentscheidungen +- Bestehendes Repository wird weiterverwendet. +- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern. +- Hetzner bleibt Laufzeitplattform. +- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation. +- Auth initial nur mit einem User/Password. + +## Zielbild +Eine modulare News-Pipeline mit klaren Stufen: +1. Feed-Ingestion +2. Inhaltsanalyse und Normalisierung +3. Rewrite/Anreicherung +4. Legal- und Qualitaetschecks +5. WordPress-Publikation (`pending`) +6. 
Monitoring/Logging + +## Grobe Zeitplanung (ohne Fixtermine) +- Phase 0: ca. 1 Woche +- Phase 1: ca. 2-4 Wochen +- Phase 2: ca. 2-3 Wochen +- Phase 3: fortlaufend + +## Phasen + +### Phase 0 - Grundlagen (jetzt) +- Doku und Wiki strukturieren +- Source-Policy definieren +- Redirect fuer `news.vanityontour.de` setzen +- GitHub Project als zentrale Planung scharfstellen + +### Phase 1 - MVP Core +- Neues FastAPI-Projektgeruest +- SQLite-Datenmodell (feeds, articles, runs, source_policy) +- Feed-Import mit Duplikaterkennung +- Admin-Login (ein User) +- Manuelle Review vor Publish + +### Phase 2 - Automation +- Job-Queue (asynchron) +- Regelbasierte Scheduler +- Retry/Dead-Letter-Handling +- Robustes Error-Reporting + +### Phase 3 - Compliance und Skalierung +- Source-Whitelisting mit Pflichtfeldern +- Pflicht-Attribution pro Artikel +- Qualitaetsmetriken und Audit-Logs +- Optional: Passkey/WebAuthn + +## Architekturprinzipien +- Idempotente Jobs +- Trennung von UI, API, Worker +- Strikte Validierung bei Quell-/Lizenzdaten +- Expliziter Publish-Schritt, kein blindes Autoposting + +## Risiken +- Lizenz-/Nutzungsbedingungen je Quelle variieren stark +- Feeds aendern Struktur/Verfuegbarkeit +- WordPress-API und Auth koennen regressionsanfaellig sein + +## Erfolgsmetriken +- Zeit von Feed-Eingang bis Review-Ready +- Quote sauber attribuierter Artikel +- Fehlerrate pro Pipeline-Stufe +- Anzahl manueller Eingriffe pro Woche diff --git a/docs/SOURCE_POLICY.md b/docs/SOURCE_POLICY.md new file mode 100644 index 0000000..d1d2e0c --- /dev/null +++ b/docs/SOURCE_POLICY.md @@ -0,0 +1,81 @@ +# Source Policy und Feed-Vorschlaege + +## Grundsatz +Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt. 
+ +## Pflichtdaten pro Quelle +- Quellname +- Feed-URL +- Originalartikel-URL +- Autor/Herausgeber (wenn vorhanden) +- Lizenz/Nutzungsgrundlage +- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung) +- Datum der letzten Pruefung +- Link auf Nutzungsbedingungen + +## Einstufung (Ampel) +- Gruen: Nutzung fuer geplantes Modell klar erlaubt +- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich +- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag + +## Verbindliche Regeln +- Keine neue Quelle ohne Eintrag im Source-Register +- Kein automatischer Publish bei Gelb/Rot +- Bilder separat pruefen (Textrecht != Bildrecht) +- Quartalsweiser Re-Check der Terms + +## Ersteinschaetzung (Stand: 16.02.2026) + +### Rot +1. Reuters / Thomson Reuters +- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung. +- Folge: Nur mit explizitem Vertrag/Lizenz. +- Referenz: + - https://www.thomsonreuters.com/en/terms-of-use + +2. tagesschau.de RSS +- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert). +- Folge: Nicht fuer das geplante Modell geeignet. +- Referenz: + - https://www.tagesschau.de/infoservices/rssfeeds + +### Gelb +1. Presseportal / ots +- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung. +- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte). +- Referenz: + - https://www.presseportal.de/nutzungsbedingungen + - https://www.presseportal.de/feeds/ + +2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz +- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert. +- Folge: Je Behoerde einzeln pruefen und dokumentieren. 
+- Beispiele: + - https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html + - https://bmas.bund.de/EN/Services/RSS/rss.html + +### Gruen (mit korrekter Attribution) +1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0` +- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen). +- Folge: Sehr gut fuer stabile Automatisierung geeignet. +- Referenz: + - https://www.govdata.de/dl-de/by-2-0 + - https://data.gov.de/informationen/lizenzen + - https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0 + +2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel +- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet. +- Folge: Geeignet, wenn Drittinhalte ausgenommen werden. +- Referenz: + - https://commission.europa.eu/legal-notice_en + - https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html + +## Quelle im Register freischalten (Definition of Done) +- Terms-Link hinterlegt +- Lizenzklasse (Gruen/Gelb/Rot) gesetzt +- Pflicht-Attribution dokumentiert +- Bildrechtsregel dokumentiert +- Letzte Pruefung und Verantwortlicher gepflegt + +## Hinweis +Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Pruefung sinnvoll. diff --git a/docs/TODO.md b/docs/TODO.md new file mode 100644 index 0000000..ad9b549 --- /dev/null +++ b/docs/TODO.md @@ -0,0 +1,33 @@ +# ToDo (Ein-Entwickler Setup) + +## Jetzt +- [ ] GitHub Project #3 Felder/Views fuer Neustart vereinheitlichen +- [ ] Alte/obsolet gewordene Issues kennzeichnen (z. B. 
User-Verwaltung) +- [ ] Redirect `news.vanityontour.de -> vanityontour.de` aktiv halten +- [ ] Wiki-Basis fertigstellen und verlinken + +## MVP +- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI) +- [x] Datenmodell in SQLite anlegen +- [x] Feed-Ingestion Service bauen (ETag/Last-Modified) +- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash +- [x] Login mit 1 Admin-Account implementieren +- [ ] Artikel-Review-Maske mit Statusworkflow +- [ ] WordPress-Publisher als separaten Service implementieren + +## Recht/Qualitaet +- [ ] Source-Policy in DB + Admin-UI abbilden +- [ ] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise) +- [ ] Auto-Block bei fehlender Lizenzinfo +- [ ] Pro Artikel Attribution-Block generieren + +## Betrieb +- [ ] Systemd-Service(s) fuer API/Worker erstellen +- [ ] Nginx-Routing fuer neue App einrichten +- [ ] Healthcheck-Endpunkte + Monitoring einrichten +- [ ] Backup/Restore fuer DB dokumentieren + +## Spaeter +- [ ] Passkey/WebAuthn evaluieren und optional einfuehren +- [ ] Migration auf PostgreSQL bewerten +- [ ] Teilautomatische Freigabe-Regeln definieren diff --git a/docs/wiki/Architektur.md b/docs/wiki/Architektur.md new file mode 100644 index 0000000..275b578 --- /dev/null +++ b/docs/wiki/Architektur.md @@ -0,0 +1,29 @@ +# Architektur + +## Zielarchitektur +- API: FastAPI +- Worker: Queue-basierte Hintergrundjobs +- DB: SQLite (MVP), spaeter optional PostgreSQL +- Publisher: WordPress REST API +- Frontend/Admin: schlanke Web-UI mit Login + +## Pipeline +1. Feed Fetch +2. Parse + Normalize +3. Deduplicate +4. Enrichment (Rewrite/Tags) +5. Legal/Policy Check +6. 
Publish (pending) + +## Datenobjekte (MVP) +- `sources` +- `feeds` +- `articles` +- `article_versions` +- `runs` +- `policy_checks` + +## Nichtziele (MVP) +- Multi-User und Rollen +- Vollautomatische Freigabe ohne Review +- Komplexe externe SSO-Integration diff --git a/docs/wiki/Deployment.md b/docs/wiki/Deployment.md new file mode 100644 index 0000000..91388c7 --- /dev/null +++ b/docs/wiki/Deployment.md @@ -0,0 +1,20 @@ +# Deployment (Hetzner + CloudPanel) + +## Umgebung +- Host: Hetzner +- Reverse Proxy: Nginx via CloudPanel +- Ziel-Domain: `news.vanityontour.de` + +## Aktueller Zustand +- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet. + +## Zielzustand +- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`) +- API/Worker laufen als systemd-Services +- TLS bleibt ueber CloudPanel/Nginx + +## Mindest-Checks nach Deployment +- `curl -I https://news.vanityontour.de` +- Login erreichbar +- Feed-Import laeuft +- WordPress-Testpublikation (pending) erfolgreich diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md new file mode 100644 index 0000000..300599a --- /dev/null +++ b/docs/wiki/Home.md @@ -0,0 +1,19 @@ +# Wiki Home + +## Zweck +Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`. + +## Inhalte +- `Architektur.md` +- `Deployment.md` +- `Security-Auth.md` +- `Recht-Quellen.md` +- `Operations-Runbook.md` +- `Roadmap.md` +- `Project-Board.md` + +## Projektsteuerung +- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1 + +## Prinzip +Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert. 
diff --git a/docs/wiki/Operations-Runbook.md b/docs/wiki/Operations-Runbook.md new file mode 100644 index 0000000..32bf5c4 --- /dev/null +++ b/docs/wiki/Operations-Runbook.md @@ -0,0 +1,23 @@ +# Operations Runbook + +## Daily Checks +- App erreichbar +- Queue/Worker aktiv +- Letzte Feed-Laeufe erfolgreich +- Keine auffaelligen Fehler im Log + +## Incident: Feed-Import faellt aus +1. RSS-Quelle erreichbar? +2. Parser-Fehler im Log? +3. Rate Limits oder Blockaden? +4. Retry-Queue pruefen + +## Incident: WordPress Publish faellt aus +1. WP API erreichbar? +2. Credentials gueltig? +3. Payload-Validation/Tag-Fehler? +4. Artikel in `pending` statt `failed` markieren, wenn unklar + +## Backups +- SQLite-Dump taeglich +- Konfiguration und `.env` sicher sichern diff --git a/docs/wiki/Project-Board.md b/docs/wiki/Project-Board.md new file mode 100644 index 0000000..887ac19 --- /dev/null +++ b/docs/wiki/Project-Board.md @@ -0,0 +1,28 @@ +# Project Board Workflow + +## Zentrale Steuerung +- Board: https://github.com/users/OliverGiertz/projects/3/views/1 +- Board ist die einzige Quelle fuer Planungsstatus. + +## Arbeitsmodus (1 Entwickler) +- Neue Arbeit immer als Issue anlegen +- Issue direkt ins Project aufnehmen +- Status nur im Project pflegen +- PR/Commit auf Issue referenzieren + +## Empfohlene Status-Disziplin +- `Todo`: noch nicht begonnen +- `In Progress`: aktiv in Arbeit +- `Done`: umgesetzt und dokumentiert + +## Konventionen fuer Issues +- Prefix fuer Klarheit: + - `[MVP]` + - `[Infra]` + - `[Legal]` + - `[Bug]` +- Definition of Done in jedem Issue notieren + +## Aktueller Backlog-Hinweis +- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User). +- Entsprechende Issues als `deferred` oder `closed` kennzeichnen. 
diff --git a/docs/wiki/Recht-Quellen.md b/docs/wiki/Recht-Quellen.md new file mode 100644 index 0000000..212f0d5 --- /dev/null +++ b/docs/wiki/Recht-Quellen.md @@ -0,0 +1,35 @@ +# Recht und Quellen + +## Grundregeln +- Nur freigegebene Quellen aus Source-Register +- Pflicht-Attribution pro Artikel +- Rechte fuer Bilder separat pruefen +- Kein Autopublish bei unklarer Lizenz + +## Bewertungsmodell +- Gruen: Freie Nachnutzung klar erlaubt +- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung +- Rot: Ohne Zusatzlizenz nicht geeignet + +## Aktuelle Referenzen +- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use +- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen +- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds +- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0 +- GovData Lizenzen: https://data.gov.de/informationen/lizenzen +- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en + +## Review-Checkliste je Quelle +1. Sind Bearbeitung und Veroeffentlichung erlaubt? +2. Ist kommerzielle Nutzung erlaubt? +3. Gibt es gesonderte Bildrechte? +4. Ist die Quellenangabe vorgeschrieben? +5. Gibt es Archivierungs- oder Weitergabebeschraenkungen? + +## Operativer Schutz +- Source-Register als Pflicht vor Feed-Aktivierung +- Auto-Block bei fehlenden Lizenzdaten +- Quartalsweiser Terms-Recheck + +## Hinweis +Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren. 
diff --git a/docs/wiki/Roadmap.md b/docs/wiki/Roadmap.md new file mode 100644 index 0000000..fece89e --- /dev/null +++ b/docs/wiki/Roadmap.md @@ -0,0 +1,19 @@ +# Roadmap + +## Jetzt +- Doku und Projektstruktur bereinigen +- Redirect aktiv +- Backlog auf Neustart ausrichten + +## Naechster Schritt +- FastAPI-MVP implementieren +- Login + Feed-Ingestion + Review + WordPress pending + +## Danach +- Worker/Queue +- Source-Policy Enforcement +- Monitoring/Reporting +- Optional Passkey + +## Steuerung +Alle Arbeitsitems liegen im GitHub Project #3. diff --git a/docs/wiki/Security-Auth.md b/docs/wiki/Security-Auth.md new file mode 100644 index 0000000..a9f830a --- /dev/null +++ b/docs/wiki/Security-Auth.md @@ -0,0 +1,16 @@ +# Security und Auth + +## Mindestanforderungen +- Zugriff auf die WebApp nur mit Login +- Ein aktiver Admin-User (kein Rollenmodell im MVP) +- Passwort nicht im Repo, nur als Secret auf Server + +## Empfohlene Umsetzung +- Session-basierte Auth (HTTP-only Cookies) +- Passwort gehasht (Argon2 oder bcrypt) +- Rate Limiting auf Login-Endpunkt +- CSRF-Schutz fuer Form-Aktionen + +## Spaeter (optional) +- Passkey/WebAuthn als zusaetzlicher Login-Faktor +- IP-Allowlist fuer Admin-Zugang diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c15b448 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = backend/tests +python_files = test_*.py +addopts = -q --maxfail=1 diff --git a/scripts/smoke_backend.sh b/scripts/smoke_backend.sh new file mode 100755 index 0000000..f0000ad --- /dev/null +++ b/scripts/smoke_backend.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ -z "${BASE_URL:-}" ]]; then + echo "BASE_URL fehlt (z. B. 
https://news.vanityontour.de)" + exit 1 +fi + +if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then + echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen" + exit 1 +fi + +cookie_file="$(mktemp)" +trap 'rm -f "$cookie_file"' EXIT + +echo "[1/4] Healthcheck" +curl -fsS "${BASE_URL}/health" | grep -q '"status":"ok"' + +echo "[2/4] Login" +curl -fsS -c "$cookie_file" \ + -H "Content-Type: application/json" \ + -X POST "${BASE_URL}/auth/login" \ + -d "{\"username\":\"${APP_ADMIN_USERNAME}\",\"password\":\"${APP_ADMIN_PASSWORD}\"}" \ + | grep -q '"ok":true' + +echo "[3/4] Protected Endpoint" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected" | grep -q '"ok":true' + +echo "[4/4] Pipeline Status" +curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status" | grep -q '"stage":"skeleton+db"' + +echo "Smoke test erfolgreich."