diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..fd3ded5
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,16 @@
+# Copy to .env and fill in values
+
+# WordPress base URL (required)
+WP_BASE_URL=https://your-site.tld
+
+# Authentication: prefer WP_AUTH_BASE64 OR use USERNAME+PASSWORD (Application Password)
+# Example to generate: base64(username:application_password)
+WP_AUTH_BASE64=
+
+# Alternatively provide username and application password
+WP_USERNAME=
+WP_PASSWORD=
+
+# OpenAI API key (optional, enables rewrite)
+OPENAI_API_KEY=
+
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 5d55808..af3394f 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -19,9 +19,16 @@ jobs:
username: oliver
key: ${{ secrets.HETZNER_SSH_KEY }}
port: 22
+ envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD
script: |
- cd rss-news
+ cd /opt/rss-news
git pull origin main
source .venv/bin/activate
pip install -r requirements.txt
- sudo systemctl restart rss-app
+ pip install -r backend/requirements.txt || true
+ sudo systemctl restart rss-news-api
+ sleep 3
+ BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh
+ env:
+ APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }}
+ APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..1d627db
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,39 @@
+name: Backend Tests
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+jobs:
+ backend-tests:
+ runs-on: ubuntu-latest
+ timeout-minutes: 15
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r backend/requirements.txt
+ pip install -r backend/requirements-test.txt
+
+ - name: Run tests with coverage
+ env:
+ APP_DB_PATH: /tmp/rss_news_test.db
+ run: |
+ pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml
+
+ - name: Upload coverage artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: coverage-xml
+ path: coverage.xml
diff --git a/.gitignore b/.gitignore
index fcbde33..aac3a2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ internal/copy_files.sh
internal/_line.txt
internal/push_commit.txt
internal/git.sh
+CLAUDE.md
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..7251de6
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,40 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+- `app.py`: Streamlit UI (entry point for the app).
+- `main.py`: RSS fetching, rewrite, and WordPress upload logic.
+- `utils/`: Helpers (image/article extraction, WP uploader, UI helpers).
+- `pages/`: Streamlit pages (e.g., `01_feed_manager.py`, `log_viewer.py`).
+- `data/`: JSON state (`articles.json`, `feeds.json`).
+- `logs/`: Runtime logs (`rss_tool.log`).
+- `docs/`: Project notes (e.g., roadmap).
+- `__version__.py`: Version string written by `versioning.py`.
+
+## Build, Test, and Development Commands
+- Create env: `python -m venv .venv && source .venv/bin/activate`
+- Install deps: `pip install -r requirements.txt`
+- Run app: `streamlit run app.py`
+- Version bump: `python versioning.py --level patch --push` (updates `__version__.py`, prepares `CHANGELOG.md`, creates tag; see `--help`).
+
+## Coding Style & Naming Conventions
+- Python 3.10+, PEP 8, 4-space indentation, type hints where practical.
+- Modules and functions: `snake_case`; classes: `PascalCase`.
+- Streamlit pages: numeric prefix for order, e.g., `pages/01_feature.py`.
+- Keep functions small and pure in `utils/`; isolate I/O in app layers.
+- Suggested tools (optional): Black (`black .`) and Ruff (`ruff check .`).
+
+## Testing Guidelines
+- Framework: pytest (recommended). Place tests under `tests/` with `test_*.py`.
+- Unit tests for `utils/*`; light integration checks for `main.py` with temporary files.
+- Run: `pytest -q`. Add coverage if needed (e.g., `pytest --cov=utils`).
+- Test data: avoid mutating files in `data/`; use temp dirs or fixtures.
+
+## Commit & Pull Request Guidelines
+- Commits: imperative mood, concise; examples: `Add feed dedupe`, `Fix WP upload retry`, `Bump version to v1.7.0`.
+- PRs: clear description, linked issue, screenshots/GIFs for UI changes, note env variables touched.
+- Update `CHANGELOG.md` and bump version via `versioning.py` before release PRs.
+
+## Security & Configuration Tips
+- Required env: `WP_BASE_URL` and either `WP_AUTH_BASE64` or `WP_USERNAME` + `WP_PASSWORD`; `OPENAI_API_KEY` is optional and enables rewrite (see `.env.example`).
+- Never commit secrets; `.env` is git-ignored. Avoid hardcoded credentials; prefer `os.getenv`.
+- Logs and data may contain content; do not commit `logs/` or large `data/` snapshots.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c15f26b..66b7237 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,42 @@
+## [1.7.1] - 2025-08-24
+
+### ✨ Security angepasst
+ - alle Credentials in die .env Datei verschoben
+ - beim Start der App werden die Credentials geprüft und beim Fehlen entsprechende Meldungen ausgegeben
+
+---
+
## [1.7.0] - 2025-08-24
-- Beschreibung...
+### Multi-Select & Massenoperationen:
+ - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich
+ - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons
+ - ✅ Massenoperationen für ausgewählte Artikel:
+ - Bulk Status-Änderung für mehrere Artikel gleichzeitig
+ - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung
+ - Bulk WordPress-Upload nur für "Process"-Artikel
+ - Bulk Papierkorb-Funktion
+
+### Schnellaktionen Integration:
+ - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar
+ - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert
+ - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln)
+
+### 🔧 Verbesserungen
+
+ - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration
+ - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig
+ - Feedback: Detaillierte Statusmeldungen bei Massenoperationen
+ - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl
+
+### 🏗️ Technische Änderungen
+
+ - Session State Erweiterung um selected_articles Set
+ - Neue Bulk-Operation-Funktionen in app.py:326-467
+ - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design
+ - Integration bestehender WordPress-Upload und Rewrite-Funktionen
+
+---
## [1.6.3] - 2025-08-18
diff --git a/README.md b/README.md
index 0f3d86c..b3c2b4a 100644
--- a/README.md
+++ b/README.md
@@ -1,76 +1,63 @@
-# 📰 RSS News Bot
+# rss-news (Rebuild)
-Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung.
+`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut.
-
-
-
-
+Aktueller Stand:
+- Alte Streamlit-App wird nicht produktiv genutzt.
+- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet.
+- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt.
----
+## Ziele
+- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln
+- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen)
+- Zuverlaessige Automatisierung auf Hetzner
+- Publikation nach WordPress (IONOS aktuell, spaeter offen)
+- Zugriff nur nach Login (zunaechst User/Password)
-## 🚀 Features
+## Architektur-Richtung (MVP)
+- Backend: `Python + FastAPI`
+- Jobs: Queue-Worker (z. B. Redis + RQ/Celery)
+- Daten: SQLite fuer MVP, spaeter optional PostgreSQL
+- Auth: Session-Login mit einem Admin-User
+- Publishing: WordPress REST API (Status zunaechst `pending`)
-- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren)
-- ✍️ **Artikel automatisch umschreiben** mit GPT-4
-- 🏷️ **Tags automatisch generieren**
-- 🖼️ **Bilder aus Originalartikeln extrahieren**
-- 🪄 **Optionales DALL·E-Bild generieren**
-- 🔧 **Bearbeiten von Bildmetadaten**
-- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)**
-- 📜 **Log-Viewer-Seite integriert**
-- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet**
-- 📋 Artikeltabelle mit Status-Filter
-- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern
-- 🪄 Button für KI-Bildgenerierung
+Details: `docs/PROJECT_PLAN.md`
+## Projektsteuerung
+- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1`
+- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen.
+- Wiki-Struktur liegt unter `docs/wiki/`.
----
+## Dokumentation
+- Projektplan: `docs/PROJECT_PLAN.md`
+- ToDo-Liste: `docs/TODO.md`
+- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md`
+- Wiki Home: `docs/wiki/Home.md`
-## 🧱 Projektstruktur
-
-ss-news/
-├── app.py # Haupt-UI mit Streamlit
-├── main.py # Logik für Feed-Import und Verarbeitung
-├── utils/
-│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren
-│ └── dalle_generator.py # DALL·E-Integration (KI-Bild)
-├── pages/
-│ └── log_viewer.py # UI zur Anzeige der Logs
-├── data/
-│ └── articles.json # Gespeicherte Artikel
-│ └── feeds.json # Gespeicherte Feed-URLs
-├── logs/
-│ └── rss_tool.log # Logging der Verarbeitung
-├── versioning.py # CLI-Tool zur Versionierung & Release
-├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases
-├── version.py # Aktuelle Version
-└── CHANGELOG.md # Änderungsprotokoll
-
-
----
-
-## ⚙️ Installation
+## Lokale Entwicklung (Legacy-Code)
+Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden:
```bash
-git clone https://github.com/OliverGiertz/rss-news.git
-cd rss-news
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
-```
-
----
-
-## Update
-Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca
-
-```bash
-bash update.sh
-```
-
-
-## ▶️ Starten der App
-
streamlit run app.py
+```
+
+Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt.
+
+## Deployment-Zielbild
+- Betrieb auf Hetzner
+- Reverse Proxy via CloudPanel/Nginx
+- Produktive Domain: `news.vanityontour.de`
+- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de`
+
+## Sicherheit
+- Keine Secrets im Repository
+- `.env` lokal/auf Server, nie committen
+- Auth-Pflicht fuer die neue WebApp
+- spaeter optional: Passkeys/WebAuthn
+
+## Rechtlicher Hinweis
+Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig.
diff --git a/__version__.py b/__version__.py
index 32d334c..6a6d0f8 100644
--- a/__version__.py
+++ b/__version__.py
@@ -1 +1 @@
-VERSION = "1.7.0"
+VERSION = "1.7.1"
diff --git a/app.py b/app.py
index a6d2dfa..f161a65 100644
--- a/app.py
+++ b/app.py
@@ -14,6 +14,7 @@ from main import (
from utils.dalle_generator import generate_dalle_image
from utils.wordpress_uploader import WordPressUploader
from utils.css_loader import load_css, apply_dark_theme
+from utils.config import validate_env
import os
from collections import Counter
import time
@@ -29,6 +30,19 @@ st.set_page_config(
load_css()
apply_dark_theme()
+# === Environment-Validierung (.env) ===
+env_check = validate_env()
+if not env_check.get("ok"):
+ st.error("🔒 Sicherheits-/Konfigurationshinweis: Bitte .env korrekt konfigurieren.")
+ for msg in env_check.get("errors", []):
+ st.markdown(f"- ❌ {msg}")
+ for msg in env_check.get("warnings", []):
+ st.markdown(f"- ⚠️ {msg}")
+elif env_check.get("warnings"):
+ st.info("ℹ️ Hinweise zur Konfiguration:")
+ for msg in env_check.get("warnings", []):
+ st.markdown(f"- ⚠️ {msg}")
+
# === Initialize Session State ===
if 'selected_articles' not in st.session_state:
st.session_state.selected_articles = set()
@@ -928,20 +942,7 @@ with tab6:
""", unsafe_allow_html=True)
- # WordPress Auth Debug (nur für Entwicklung)
- if st.checkbox("🔧 Debug-Modus (Auth-Details anzeigen)", value=False):
- st.warning("⚠️ Nur für Entwicklung - zeigt Auth-Details!")
-
- wp_base64 = os.getenv("WP_AUTH_BASE64", "")
- if wp_base64:
- try:
- import base64
- decoded = base64.b64decode(wp_base64).decode('utf-8')
- st.code(f"Base64: {wp_base64}\nDecoded: {decoded}")
- except Exception as e:
- st.error(f"Fehler beim Dekodieren: {e}")
- else:
- st.info("Kein Base64-String konfiguriert")
+ # Sicherheit: Kein Anzeigen sensibler Auth-Details mehr
# Bulk Upload
st.subheader("📦 Massenupload")
@@ -1062,15 +1063,16 @@ with tab6:
with st.expander("📋 .env-Datei Vorlage", expanded=False):
st.code("""
# WordPress-Konfiguration
-WP_BASE_URL=https://vanityontour.de
-WP_USERNAME=ogiertz
-WP_PASSWORD=whNEx9aZCIUXViV89Z3e7Z03
+WP_BASE_URL=https://your-site.tld
-# WordPress Base64-Authentifizierung (bevorzugte Methode)
-WP_AUTH_BASE64=b2dpZXJ0ejp3aE5FeDlhWkNJVVhWaVY4OVozZTdaMDM=
+# Entweder Base64 (empfohlen) ODER Benutzername/Passwort (Application Password)
+WP_AUTH_BASE64=
+# Oder alternativ:
+WP_USERNAME=
+WP_PASSWORD=
-# OpenAI-Konfiguration (für Artikel-Umschreibung)
-OPENAI_API_KEY=sk-...
+# OpenAI-Konfiguration (optional für Umschreibung)
+OPENAI_API_KEY=
""", language="bash")
with st.expander("🔑 Base64-Authentifizierung verstehen", expanded=False):
@@ -1078,21 +1080,10 @@ OPENAI_API_KEY=sk-...
WordPress REST API Authentifizierung:
- Die WordPress REST API erfordert eine Base64-kodierte Authentifizierung im Format:
- Authorization: Basic <base64_encoded_credentials>
-
- Ihr bereitgestellter Base64-String:
- • b2dpZXJ0ejp3aE5FeDlhWkNJVVhWaVY4OVozZTdaMDM=
- • Dekodiert: ogiertz:whNEx9aZCIUXViV89Z3e7Z03
-
- So funktioniert es:
- 1. Benutzername und Anwendungspasswort werden kombiniert: username:password
- 2. Dieser String wird Base64-kodiert
- 3. Im Authorization-Header verwendet: Basic <base64_string>
-
- Fallback-Verhalten:
- • Wenn WP_AUTH_BASE64 gesetzt ist → Direkter Base64-String verwendet
- • Wenn nicht gesetzt → Base64 wird aus WP_USERNAME:WP_PASSWORD generiert
+ Die WordPress REST API nutzt Basic-Auth mit Base64-kodierten Zugangsdaten:
+ Authorization: Basic <base64(username:password)>
+ Empfehlung: In der .env WP_AUTH_BASE64 setzen (aus username:application_password erzeugt).
+ Alternativ können WP_USERNAME und WP_PASSWORD gesetzt werden; dann wird Base64 zur Laufzeit generiert.
""", unsafe_allow_html=True)
@@ -1115,4 +1106,4 @@ OPENAI_API_KEY=sk-...
4. Generiertes Passwort in .env-Datei eintragen
- """, unsafe_allow_html=True)
\ No newline at end of file
+ """, unsafe_allow_html=True)
diff --git a/backend/.env.example b/backend/.env.example
new file mode 100644
index 0000000..c2dd235
--- /dev/null
+++ b/backend/.env.example
@@ -0,0 +1,45 @@
+# ─── App ────────────────────────────────────────────────────────────────────
+APP_ENV=development
+APP_NAME=rss-news-backend
+APP_SECRET_KEY=replace-with-a-long-random-secret
+APP_DB_PATH=backend/data/rss_news.db
+
+APP_ADMIN_USERNAME=admin
+APP_ADMIN_PASSWORD=change-me
+
+SESSION_COOKIE_NAME=rss_news_session
+SESSION_MAX_AGE_SECONDS=28800
+
+# ─── WordPress ──────────────────────────────────────────────────────────────
+WP_BASE_URL=https://your-site.tld
+WP_USERNAME=your-wp-username
+WP_PASSWORD=your-wp-app-password
+# Status für neue Beiträge: draft | future | publish
+WORDPRESS_DEFAULT_STATUS=draft
+
+# ─── OpenAI ─────────────────────────────────────────────────────────────────
+OPENAI_API_KEY=sk-...
+# gpt-4o-mini empfohlen (Kosten/Qualität)
+OPENAI_MODEL=gpt-4o-mini
+
+# ─── Telegram Bot ────────────────────────────────────────────────────────────
+# Bot-Token von @BotFather
+TELEGRAM_BOT_TOKEN=123456789:ABC...
+# Chat-ID deines persönlichen Chats oder einer Gruppe
+TELEGRAM_CHAT_ID=123456789
+# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen)
+TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars
+
+# ─── N8N API-Key ─────────────────────────────────────────────────────────────
+# Wird von N8N im Header X-API-Key mitgeschickt
+N8N_API_KEY=replace-with-strong-random-key
+
+# ─── Pipeline-Einstellungen ──────────────────────────────────────────────────
+# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100)
+PIPELINE_RELEVANCE_AUTO=80
+# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden
+PIPELINE_RELEVANCE_WARN=60
+# Maximale Drafts/Veröffentlichungen pro Tag
+PIPELINE_MAX_DRAFTS_PER_DAY=2
+# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET)
+PIPELINE_PUBLISH_HOURS=9,14
diff --git a/backend/README.md b/backend/README.md
new file mode 100644
index 0000000..7d64a65
--- /dev/null
+++ b/backend/README.md
@@ -0,0 +1,82 @@
+# Backend Skeleton (FastAPI)
+
+Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`.
+
+## Start (lokal)
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -r backend/requirements.txt
+uvicorn backend.app.main:app --reload --port 8501
+```
+
+## Admin UI
+- Login: `http://127.0.0.1:8501/admin/login`
+- Dashboard: `http://127.0.0.1:8501/admin/dashboard`
+
+## Environment
+- Datei: `backend/.env`
+- Vorlage: `backend/.env.example`
+
+## Endpoints
+- `GET /health` - Healthcheck
+- `POST /auth/login` - Login mit Admin-User
+- `POST /auth/logout` - Logout
+- `GET /auth/me` - Aktiver User
+- `GET /api/protected` - Geschuetzter Test-Endpoint
+- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler
+- `GET /api/sources` - Quellenliste
+- `POST /api/sources` - Quelle anlegen
+- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle
+- `GET /api/feeds` - Feedliste
+- `POST /api/feeds` - Feed anlegen
+- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed
+- `GET /api/runs` - Import-/Job-Runs anzeigen
+- `GET /api/runs/{run_id}` - Detailansicht eines Runs
+- `POST /api/runs` - Run starten
+- `POST /api/runs/{run_id}/finish` - Run abschliessen
+- `GET /api/articles` - Artikel anzeigen
+- `GET /api/articles/{article_id}` - Artikeldetail
+- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren
+- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln
+- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject)
+- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed)
+
+## Datenbank
+- SQLite-Datei unter `backend/data/rss_news.db`
+- Tabellen werden beim App-Start initialisiert.
+- Tabellen: `sources`, `feeds`, `runs`, `articles`
+- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash`
+
+## Policy-Enforcement
+- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist.
+- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`.
+- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert.
+
+## Review-Workflow
+- Statuskette: `new -> review -> approved -> published`
+- Ablehnung im Review setzt auf `rewrite`
+- Ungueltige Statuswechsel werden per API blockiert
+
+## Verifikation
+```bash
+python -m unittest backend.tests.test_db_repositories
+python -m unittest backend.tests.test_ingestion
+python -m unittest backend.tests.test_api_auth
+```
+
+## CI / Online-Auswertung
+- GitHub Actions Workflow: `.github/workflows/test.yml`
+- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus.
+
+## Hetzner Smoketest
+```bash
+BASE_URL="https://news.vanityontour.de" \
+APP_ADMIN_USERNAME="admin" \
+APP_ADMIN_PASSWORD="..." \
+bash scripts/smoke_backend.sh
+```
+
+## Hinweis
+Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen.
diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000..3623851
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1 @@
+"""Backend package for rss-news rebuild."""
diff --git a/backend/app/__init__.py b/backend/app/__init__.py
new file mode 100644
index 0000000..18b665e
--- /dev/null
+++ b/backend/app/__init__.py
@@ -0,0 +1 @@
+"""Application package."""
diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py
new file mode 100644
index 0000000..a25199c
--- /dev/null
+++ b/backend/app/admin_ui.py
@@ -0,0 +1,1126 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+import re
+import socket
+import ssl
+import time
+from urllib.parse import urlparse
+from urllib.parse import urlencode
+from urllib.request import Request as UrlRequest, urlopen
+
+from fastapi import APIRouter, Form, Request
+from fastapi.responses import HTMLResponse, RedirectResponse, Response
+from fastapi.templating import Jinja2Templates
+
+from .auth import create_session_token, verify_credentials, verify_session_token
+from .config import get_settings
+from .ingestion import run_ingestion
+from .policy import evaluate_source_policy
+from .publisher import enqueue_publish, run_publisher
+from .relevance import article_age_days, article_relevance
+from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
+from .repositories import (
+ FeedCreate,
+ FeedUpdate,
+ SourceCreate,
+ SourceUpdate,
+ delete_feed,
+ delete_source,
+ create_feed,
+ create_source,
+ get_article_by_id,
+ get_feed_by_id,
+ list_articles,
+ list_articles_page,
+ bulk_update_wp_post_ids,
+ list_feeds,
+ list_publish_jobs,
+ list_runs,
+ list_sources,
+ set_article_image_decision,
+ upsert_article,
+ update_feed,
+ update_source,
+ update_article_status,
+ ArticleUpsert,
+)
+from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
+
+# Module-level objects for the admin UI; all of them are created once at import time.
+settings = get_settings()
+router = APIRouter(tags=["admin-ui"])
+# Templates are loaded from the "templates" directory that sits next to this module's package directory.
+templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
+# Maps a status to the statuses that may follow it.
+# NOTE(review): `.workflow` also exports ALLOWED_UI_TRANSITIONS (imported above) - confirm which mapping is authoritative.
+ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = {
+ "new": ("rewrite", "close"),
+ "rewrite": ("publish", "close"),
+ "publish": ("published", "close"),
+ "published": ("rewrite", "close"),
+ "close": ("rewrite",),
+}
+# User-Agent header for outbound HTTP requests made by this module (sent by the connectivity check).
+IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0"
+# Sentinel default that lets keyword arguments tell "not passed" apart from an explicit None.
+_UNSET = object()
+
+
+def _admin_user(request: Request) -> str | None:
+    """Return the result of verifying the session cookie, or None without one.
+
+    The cookie name comes from ``settings.session_cookie_name``; an empty
+    cookie value is treated the same as a missing cookie.
+    """
+    session_token = request.cookies.get(settings.session_cookie_name)
+    return verify_session_token(session_token) if session_token else None
+
+
+def _to_optional_int(raw: str | None) -> int | None:
+    """Convert an optional text value to an int.
+
+    Returns None for None and for blank or whitespace-only input. Any other
+    text is handed to ``int()``, so non-numeric input raises ValueError.
+    """
+    stripped = "" if raw is None else raw.strip()
+    return int(stripped) if stripped else None
+
+
+def _dashboard_redirect(
+    *,
+    msg: str | None = None,
+    msg_type: str = "success",
+    status_filter: str | None = None,
+) -> RedirectResponse:
+    """Build a 303 redirect to the admin dashboard.
+
+    ``msg`` and ``status_filter`` become query parameters when they are
+    truthy; ``msg_type`` is sent as ``type`` only together with a message.
+    """
+    params: dict[str, str] = {}
+    if msg:
+        params.update(msg=msg, type=msg_type)
+    if status_filter:
+        params["status_filter"] = status_filter
+    target = "/admin/dashboard"
+    if params:
+        target = f"{target}?{urlencode(params)}"
+    return RedirectResponse(url=target, status_code=303)
+
+
+def _parse_meta_json(raw: str | None) -> dict:
+    """Decode a JSON string into a dict, falling back to an empty dict.
+
+    An empty value, undecodable input, and JSON whose top level is not an
+    object all yield ``{}``.
+    """
+    if raw:
+        try:
+            decoded = json.loads(raw)
+        except Exception:
+            # Best effort: undecodable metadata is treated as "no metadata".
+            return {}
+        if isinstance(decoded, dict):
+            return decoded
+    return {}
+
+
+def _read_article_images(article: dict, extraction: dict) -> list[str]:
+    """Collect the article's image URLs as strings, without duplicates.
+
+    URLs are read from the JSON list in ``article["image_urls_json"]``. If
+    that yields nothing (missing, undecodable, not a list, or only falsy
+    items), the list in ``extraction["images"]`` is used instead. Falsy
+    items are dropped and first-seen order is kept.
+    """
+    found: list[str] = []
+    stored_json = article.get("image_urls_json")
+    if stored_json:
+        try:
+            decoded = json.loads(stored_json)
+            if isinstance(decoded, list):
+                found = [str(entry) for entry in decoded if entry]
+        except Exception:
+            found = []
+    if not found:
+        fallback = extraction.get("images")
+        if isinstance(fallback, list):
+            found = [str(entry) for entry in fallback if entry]
+    # dict.fromkeys keeps insertion order, so repeats vanish while the first occurrence stays in place.
+    return list(dict.fromkeys(found))
+
+
+def _is_probably_irrelevant_image(url: str) -> bool:
+    """Heuristically flag URLs that look like logos, icons, ads or tracking pixels.
+
+    The check is a case-insensitive substring match, so it also hits
+    unrelated words that merely contain a marker (e.g. "icon" in "silicon").
+    """
+    haystack = url.lower()
+    for marker in ("logo", "icon", "sprite", "avatar", "favicon", "/ads/", "tracking", "pixel", "banner"):
+        if marker in haystack:
+            return True
+    return False
+
+
+def _is_http_image_url(url: str) -> bool:
+    """Return True when ``url`` parses as an http(s) URL that has a network location."""
+    try:
+        parts = urlparse(url)
+    except Exception:
+        # urlparse rejects some malformed inputs; those are simply "not a usable URL".
+        return False
+    if parts.scheme not in {"http", "https"}:
+        return False
+    return bool(parts.netloc)
+
+
+def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]:
+    """Describe every article image as a dict.
+
+    Each entry carries the original URL, a proxy URL under
+    ``/admin/images/proxy``, whether the image is the selected or an excluded
+    one according to ``meta["image_review"]``, and the irrelevance hint.
+    """
+    review = meta.get("image_review")
+    if not isinstance(review, dict):
+        review = {}
+    chosen = review.get("selected_url")
+    if not isinstance(chosen, str):
+        chosen = None
+    raw_excluded = review.get("excluded_urls")
+    excluded = {str(item) for item in raw_excluded if item} if isinstance(raw_excluded, list) else set()
+    return [
+        {
+            "url": image_url,
+            "proxy_url": f"/admin/images/proxy?{urlencode({'url': image_url})}",
+            "is_selected": chosen == image_url,
+            "is_excluded": image_url in excluded,
+            "is_irrelevant_hint": _is_probably_irrelevant_image(image_url),
+        }
+        for image_url in _read_article_images(article, extraction)
+    ]
+
+
+def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]:
+    """Check whether an article meets the publish preconditions.
+
+    Returns ``(ready, reasons)``: ``reasons`` lists every failed check and
+    ``ready`` is True only when that list is empty. The status check accepts
+    both UI statuses "publish" and "published".
+    """
+    blockers: list[str] = []
+    ui_status = internal_to_ui_status(article.get("status"))
+    if ui_status not in {"publish", "published"}:
+        blockers.append("Status ist nicht 'publish'")
+    review = meta.get("image_review")
+    main_image = review.get("selected_url") if isinstance(review, dict) else None
+    if not (isinstance(main_image, str) and main_image):
+        blockers.append("Hauptbild nicht ausgewählt")
+    return not blockers, blockers
+
+
+def _classify_publish_error(error_message: str | None) -> tuple[str, str]:
+    """Map a publish error message to a ``(category, hint)`` pair.
+
+    Matching is a case-insensitive substring search; the rules are tried in
+    order and the first hit wins. A blank message yields ``("ok", "-")`` and
+    an unmatched one the ``"unknown"`` category.
+    """
+    text = (error_message or "").lower()
+    if not text.strip():
+        return "ok", "-"
+
+    def mentions(*needles: str) -> bool:
+        return any(needle in text for needle in needles)
+
+    if mentions("rechtsfreigabe fehlt", "hauptbild nicht gesetzt", "status ist nicht"):
+        return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Hauptbild)."
+    if mentions("401", "403", "authorization", "forbidden", "unauthorized"):
+        return "auth", "WordPress Nutzer/App-Passwort prüfen."
+    if "404" in text and mentions("media", "posts", "wp-json"):
+        return "api", "WordPress REST-Endpunkt prüfen (`/wp-json/wp/v2`)."
+    if mentions("timed out", "timeout", "nodename nor servname provided", "name or service not known"):
+        return "dns", "DNS/Netzwerk zur WordPress-Domain prüfen."
+    if mentions("media-upload fehlgeschlagen", "liefert kein bild", "featured_media"):
+        return "media", "Bild-URL/Format prüfen oder anderes Hauptbild auswählen."
+    return "unknown", "Fehlerdetails prüfen und bei Bedarf Job erneut starten."
+
+
+def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]:
+    """Build the legal/attribution checklist rows for an article.
+
+    Args:
+        article: Article record. ``article["meta"]`` may hold a metadata dict
+            with ``extraction``, ``attribution`` and ``image_review``
+            sections; a missing, None or non-dict value is treated as empty.
+        feed: Feed record belonging to the article, or None when unknown.
+
+    Returns:
+        One dict per check with ``label``, ``status`` ("ok" or "missing") and
+        a display ``value`` ("-" when nothing is available).
+    """
+    # "meta" can be present but None (or not a dict); fall back to empty instead of crashing.
+    meta = article.get("meta")
+    if not isinstance(meta, dict):
+        meta = {}
+    extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
+    attribution = meta.get("attribution") if isinstance(meta.get("attribution"), dict) else {}
+    image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
+    selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
+
+    extracted_images = extraction.get("images")
+    image_count = len(extracted_images) if isinstance(extracted_images, list) else 0
+    license_name = article.get("source_license_name_snapshot")
+    terms_url = article.get("source_terms_url_snapshot")
+    risk_level = feed.get("source_risk_level") if feed else None
+
+    def row(label: str, ok: object, value: str) -> dict[str, str]:
+        return {"label": label, "status": "ok" if ok else "missing", "value": value}
+
+    return [
+        row("Original-Link vorhanden", article.get("source_url"), article.get("source_url") or "-"),
+        row("Autor vorhanden", article.get("author"), article.get("author") or "-"),
+        # NOTE(review): the status looks at the stored JSON column while the count comes
+        # from the extraction metadata, so the two can disagree - confirm this is intended.
+        row("Bilder extrahiert", article.get("image_urls_json"), str(image_count)),
+        row(
+            "Pressekontakt",
+            article.get("press_contact"),
+            article.get("press_contact") or extraction.get("press_contact") or "-",
+        ),
+        row(
+            "Lizenz/Terms",
+            license_name and terms_url,
+            f"{license_name or attribution.get('source_license_name') or '-'} | {terms_url or attribution.get('source_terms_url') or '-'}",
+        ),
+        # A feed without a risk level must still render a string, never None.
+        row("Risiko-Status Quelle", risk_level == "green", risk_level or "-"),
+        row("Hauptbild ausgewählt", selected_image, selected_image or "-"),
+    ]
+
+
+def _build_connectivity_targets() -> list[dict[str, str]]:
+    """List the hosts and URLs that a connectivity check should probe.
+
+    Always includes the OpenAI API host. Adds the WordPress host and REST base
+    URL when ``settings.wordpress_base_url`` is set, plus the host and URL of
+    every feed returned by ``list_feeds()``. Entries are de-duplicated by
+    kind and case-insensitive value; the first label wins.
+    """
+    collected: list[dict[str, str]] = []
+    known: set[tuple[str, str]] = set()
+
+    def register(label: str, kind: str, value: str) -> None:
+        cleaned = (value or "").strip()
+        identity = (kind, cleaned.lower())
+        if cleaned and identity not in known:
+            known.add(identity)
+            collected.append({"label": label, "kind": kind, "value": cleaned})
+
+    register("OpenAI API", "host", "api.openai.com")
+
+    if settings.wordpress_base_url:
+        wp_host = urlparse(settings.wordpress_base_url).hostname
+        if wp_host:
+            register("WordPress Host", "host", wp_host)
+        register("WordPress REST", "url", f"{settings.wordpress_base_url.rstrip('/')}/wp-json/wp/v2")
+
+    for feed in list_feeds():
+        display_name = (feed.get("name") or "").strip() or f"Feed #{feed.get('id')}"
+        feed_url = str(feed.get("url") or "").strip()
+        if feed_url:
+            feed_host = urlparse(feed_url).hostname
+            if feed_host:
+                register(f"{display_name} (Feed)", "host", feed_host)
+            register(f"{display_name} (Feed URL)", "url", feed_url)
+
+    return collected
+
+
+def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]:
+    """Probe one connectivity target in stages: DNS, TCP connect, then HTTP.
+
+    Args:
+        target: Dict with ``label``, ``kind`` ("host" or "url") and ``value``,
+            e.g. an entry from ``_build_connectivity_targets``.
+
+    Returns:
+        A result row with ``dns_*``, ``tcp_*`` and ``http_*`` flags/messages,
+        the elapsed ``duration_ms`` and an overall ``ok``. The check stops at
+        the first failing stage; later stages keep their defaults
+        (False / "-"). Host targets skip the HTTP request. A malformed URL is
+        reported in ``dns_info`` instead of raising.
+    """
+    kind = target.get("kind", "")
+    value = str(target.get("value") or "")
+    row: dict[str, object] = {
+        "label": target.get("label") or "-",
+        "kind": kind,
+        "target": value,
+        "dns_ok": False,
+        "dns_info": "-",
+        "tcp_ok": False,
+        "tcp_info": "-",
+        "http_ok": False,
+        "http_info": "-",
+        "duration_ms": 0,
+        "ok": False,
+    }
+    started = time.perf_counter()
+    try:
+        hostname = ""
+        port = 443
+        try:
+            if kind == "host":
+                hostname = value
+            else:
+                parsed = urlparse(value)
+                hostname = parsed.hostname or ""
+                if kind == "url":
+                    if parsed.scheme not in {"http", "https"}:
+                        row["http_info"] = f"unsupported scheme: {parsed.scheme or '-'}"
+                        return row
+                    # Probe the port the HTTP request will actually use: an explicit
+                    # port in the URL wins over the scheme default.
+                    port = parsed.port or (443 if parsed.scheme == "https" else 80)
+        except ValueError as exc:
+            # urlparse / .port raise ValueError for malformed URLs (bad IPv6 literal,
+            # non-numeric or out-of-range port); report it as a failed check.
+            row["dns_info"] = str(exc)
+            return row
+        if not hostname:
+            row["dns_info"] = "host fehlt"
+            return row
+
+        try:
+            addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
+            ips = sorted({entry[4][0] for entry in addr_info if entry and len(entry) > 4 and entry[4]})
+            row["dns_ok"] = True
+            row["dns_info"] = ", ".join(ips[:3]) if ips else "resolved"
+        except Exception as exc:
+            row["dns_info"] = str(exc)
+            return row
+
+        try:
+            socket.create_connection((hostname, port), timeout=4).close()
+            row["tcp_ok"] = True
+            row["tcp_info"] = f"port {port} erreichbar"
+        except Exception as exc:
+            row["tcp_info"] = str(exc)
+            return row
+
+        if kind == "host":
+            # Nothing to request for a bare host; DNS + TCP success is the whole check.
+            row["http_ok"] = True
+            row["http_info"] = "n/a (host-only)"
+            row["ok"] = True
+            return row
+
+        try:
+            req = UrlRequest(
+                url=value,
+                headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Accept": "*/*"},
+            )
+            with urlopen(req, timeout=6, context=ssl.create_default_context()) as resp:
+                code = getattr(resp, "status", None) or resp.getcode()
+                row["http_ok"] = True
+                row["http_info"] = f"HTTP {code}"
+        except Exception as exc:
+            row["http_info"] = str(exc)
+            return row
+
+        row["ok"] = bool(row["dns_ok"] and row["tcp_ok"] and row["http_ok"])
+        return row
+    finally:
+        # Runs on every exit path, so early returns still report their elapsed time.
+        row["duration_ms"] = int((time.perf_counter() - started) * 1000)
+
+
+def _upsert_article_from_existing(
+    article: dict,
+    *,
+    content_rewritten: str | None = None,
+    status: str | None = None,
+    wp_post_id: int | None | object = _UNSET,
+    wp_post_url: str | None | object = _UNSET,
+    publish_attempts: int | object = _UNSET,
+    publish_last_error: str | None | object = _UNSET,
+    published_to_wp_at: str | None | object = _UNSET,
+    meta_json: str | None | object = _UNSET,
+) -> None:
+    """Re-save an existing article row, overriding only the given fields.
+
+    All values are copied from ``article`` unless a keyword override is given.
+    ``content_rewritten`` and ``status`` treat ``None`` as "keep the stored
+    value". The remaining overrides use the ``_UNSET`` sentinel, so an explicit
+    ``None`` clears the stored value. ``word_count`` is recomputed from the
+    rewritten text that ends up being stored.
+    """
+
+    def stored_unless(override: object, key: str) -> object:
+        # ``_UNSET`` means "not passed"; anything else (including None) wins.
+        return article.get(key) if override is _UNSET else override
+
+    rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
+    # ``.get(key, 0)`` only covers a missing key; a key holding None (e.g. a NULL
+    # column) would make int() raise TypeError, so fall back to 0 explicitly.
+    legal_checked = bool(int(article.get("legal_checked") or 0))
+    attempts = int(article.get("publish_attempts") or 0) if publish_attempts is _UNSET else publish_attempts
+    upsert_article(
+        ArticleUpsert(
+            feed_id=article.get("feed_id"),
+            source_article_id=article.get("source_article_id"),
+            source_hash=article.get("source_hash"),
+            title=article.get("title"),
+            source_url=article.get("source_url"),
+            canonical_url=article.get("canonical_url"),
+            published_at=article.get("published_at"),
+            author=article.get("author"),
+            summary=article.get("summary"),
+            content_raw=article.get("content_raw"),
+            content_rewritten=rewritten,
+            image_urls_json=article.get("image_urls_json"),
+            press_contact=article.get("press_contact"),
+            source_name_snapshot=article.get("source_name_snapshot"),
+            source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
+            source_license_name_snapshot=article.get("source_license_name_snapshot"),
+            legal_checked=legal_checked,
+            legal_checked_at=article.get("legal_checked_at"),
+            legal_note=article.get("legal_note"),
+            wp_post_id=stored_unless(wp_post_id, "wp_post_id"),
+            wp_post_url=stored_unless(wp_post_url, "wp_post_url"),
+            publish_attempts=attempts,
+            publish_last_error=stored_unless(publish_last_error, "publish_last_error"),
+            published_to_wp_at=stored_unless(published_to_wp_at, "published_to_wp_at"),
+            word_count=len(str(rewritten or "").split()),
+            status=article.get("status") if status is None else status,
+            meta_json=stored_unless(meta_json, "meta_json"),
+        )
+    )
+
+
+@router.get("/admin", response_class=HTMLResponse)
+def admin_index(request: Request):
+    """Entry point for /admin: forward to the dashboard, or to the login form when not signed in."""
+    destination = "/admin/dashboard" if _admin_user(request) else "/admin/login"
+    return RedirectResponse(url=destination, status_code=303)
+
+
+@router.get("/admin/login", response_class=HTMLResponse)
+def admin_login_page(request: Request):
+    """Render the login form; the optional ``error`` query parameter is passed to the template."""
+    context = {
+        "request": request,
+        "title": "Admin Login",
+        "error": request.query_params.get("error"),
+    }
+    return templates.TemplateResponse(request, "admin_login.html", context)
+
+
+@router.post("/admin/login")
+def admin_login(request: Request, username: str = Form(...), password: str = Form(...)):
+    """Check the submitted credentials and start an admin session.
+
+    On failure the browser is sent back to the login form with ``error=1``.
+    On success a signed session token is stored in an HTTP-only cookie and the
+    browser is redirected to the dashboard.
+    """
+    if not verify_credentials(username, password):
+        return RedirectResponse(url="/admin/login?error=1", status_code=303)
+
+    # Mark the cookie Secure whenever the login arrived over HTTPS, either
+    # directly or via a TLS-terminating proxy that sets X-Forwarded-Proto.
+    # Plain-HTTP logins keep working because the flag is only set when the
+    # request really was HTTPS.
+    forwarded_proto = request.headers.get("x-forwarded-proto", "").split(",")[0].strip().lower()
+    is_https = request.url.scheme == "https" or forwarded_proto == "https"
+
+    token = create_session_token(username)
+    response = RedirectResponse(url="/admin/dashboard", status_code=303)
+    response.set_cookie(
+        key=settings.session_cookie_name,
+        value=token,
+        max_age=settings.session_max_age_seconds,
+        httponly=True,
+        secure=is_https,
+        samesite="lax",
+    )
+    return response
+
+
+@router.post("/admin/logout")
+def admin_logout():
+    """Drop the session cookie and send the browser back to the login form."""
+    redirect = RedirectResponse(url="/admin/login", status_code=303)
+    redirect.delete_cookie(settings.session_cookie_name)
+    return redirect
+
+
+@router.get("/admin/dashboard", response_class=HTMLResponse)
+def admin_dashboard(request: Request):
+ """Render the admin dashboard with sources, feeds, runs, publish jobs and articles.
+
+ Unauthenticated requests are redirected to the login form. The optional
+ ``status_filter`` query parameter narrows the article list to one UI status;
+ without a recognised filter, articles whose UI status is "close" are hidden.
+ """
+ user = _admin_user(request)
+ if not user:
+ return RedirectResponse(url="/admin/login", status_code=303)
+
+ sources = list_sources()
+ # Policy evaluation result per source, keyed by source id.
+ source_policy = {s["id"]: evaluate_source_policy(s) for s in sources}
+ feeds = list_feeds()
+ runs = list_runs(limit=30)
+ publish_jobs = list_publish_jobs(limit=30)
+ # Attach an error category and hint, derived from the error message, to every job.
+ for job in publish_jobs:
+ category, hint = _classify_publish_error(job.get("error_message"))
+ job["error_category"] = category
+ job["error_hint"] = hint
+ status_filter = request.query_params.get("status_filter")
+ internal_filter = ui_to_internal_status(status_filter) if status_filter else None
+ if status_filter in set(UI_STATUSES):
+ articles = list_articles(limit=100, status_filter=internal_filter)
+ else:
+ # Unknown or missing filter: fetch up to 250 rows, drop "close" ones, keep the first 100.
+ status_filter = ""
+ articles = [a for a in list_articles(limit=250) if internal_to_ui_status(a.get("status")) != "close"][:100]
+ # Enrich each article dict in place with the derived fields set below.
+ for article in articles:
+ meta = _parse_meta_json(article.get("meta_json"))
+ extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
+ images = _read_article_images(article, extraction)
+ article["meta"] = meta
+ # NOTE(review): readiness is evaluated here before the press_contact fallback below,
+ # whereas admin_article_detail evaluates it after that fallback; confirm this is intended.
+ ready, reasons = _publish_readiness(article, meta)
+ article["publish_ready"] = ready
+ article["publish_blockers"] = reasons
+ article["extracted_images"] = images
+ article["image_entries"] = _build_image_entries(article, extraction, meta)
+ image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
+ article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
+ # The selected image is shown through the local image proxy route.
+ article["selected_image_proxy_url"] = (
+ f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None
+ )
+ # Fall back to the press contact found during extraction when the article has none.
+ if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
+ article["press_contact"] = extraction.get("press_contact")
+ article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None
+ article["days_old"] = article_age_days(article.get("published_at"))
+ article["relevance"] = article_relevance(article.get("published_at"))
+ article["status_ui"] = internal_to_ui_status(article.get("status"))
+ tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
+ # Keep only truthy tags, converted to strings.
+ article["generated_tags"] = [str(t) for t in tags if t]
+
+ return templates.TemplateResponse(
+ request,
+ "admin_dashboard.html",
+ {
+ "request": request,
+ "title": "Admin Dashboard",
+ "user": user,
+ "sources": sources,
+ "source_policy": source_policy,
+ "feeds": feeds,
+ "runs": runs,
+ "publish_jobs": publish_jobs,
+ "articles": articles,
+ "status_options": list(UI_STATUSES),
+ "allowed_transitions": ALLOWED_TRANSITIONS,
+ "status_filter": status_filter,
+ "flash_msg": request.query_params.get("msg", ""),
+ "flash_type": request.query_params.get("type", "success"),
+ },
+ )
+
+
+@router.get("/admin/connectivity", response_class=HTMLResponse)
+def admin_connectivity(request: Request):
+    """Run every configured connectivity check and render the results page."""
+    current_user = _admin_user(request)
+    if not current_user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    results = []
+    for target in _build_connectivity_targets():
+        results.append(_run_connectivity_check(target))
+    passed = sum(1 for result in results if result.get("ok"))
+
+    context = {
+        "request": request,
+        "title": "Connectivity Check",
+        "user": current_user,
+        "checks": results,
+        "ok_count": passed,
+        "error_count": len(results) - passed,
+    }
+    return templates.TemplateResponse(request, "admin_connectivity.html", context)
+
+
+@router.get("/admin/articles/{article_id}", response_class=HTMLResponse)
+def admin_article_detail(request: Request, article_id: int):
+ """Render the detail page for one article.
+
+ Unauthenticated requests are redirected to the login form; an unknown
+ ``article_id`` redirects to the dashboard with an error message. The article
+ dict is enriched in place with the derived fields set below before rendering.
+ """
+ user = _admin_user(request)
+ if not user:
+ return RedirectResponse(url="/admin/login", status_code=303)
+
+ article = get_article_by_id(article_id)
+ if not article:
+ return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
+
+ meta = _parse_meta_json(article.get("meta_json"))
+ article["meta"] = meta
+ extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
+ # Replace the extraction's image list with the one computed by _read_article_images.
+ extraction["images"] = _read_article_images(article, extraction)
+ # Fall back to the press contact found during extraction when the article has none.
+ if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
+ article["press_contact"] = extraction.get("press_contact")
+ article["extraction"] = extraction
+ # Readiness is evaluated after the press_contact fallback and the extraction assignment above.
+ publish_ready, publish_blockers = _publish_readiness(article, meta)
+ article["publish_ready"] = publish_ready
+ article["publish_blockers"] = publish_blockers
+ article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {}
+ article["image_entries"] = _build_image_entries(article, extraction, meta)
+ image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
+ article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
+ # The selected image is shown through the local image proxy route.
+ article["selected_image_proxy_url"] = (
+ f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None
+ )
+ article["days_old"] = article_age_days(article.get("published_at"))
+ article["relevance"] = article_relevance(article.get("published_at"))
+ article["status_ui"] = internal_to_ui_status(article.get("status"))
+ # The feed is looked up only when the article has a truthy feed_id.
+ feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None
+ checklist = _legal_checklist(article, feed)
+
+ return templates.TemplateResponse(
+ request,
+ "admin_article_detail.html",
+ {
+ "request": request,
+ "title": f"Artikel #{article_id}",
+ "user": user,
+ "article": article,
+ "feed": feed,
+ "checklist": checklist,
+ # Only the transitions allowed from the article's current UI status.
+ "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status_ui"), ()),
+ "flash_msg": request.query_params.get("msg", ""),
+ "flash_type": request.query_params.get("type", "success"),
+ },
+ )
+
+
+@router.post("/admin/articles/{article_id}/images/decision")
+def admin_article_image_decision(
+    request: Request,
+    article_id: int,
+    image_url: str = Form(...),
+    action: str = Form(...),
+):
+    """Record an image decision for an article and return to its detail page.
+
+    A failed decision redirects to the dashboard with an error message instead.
+    """
+    actor = _admin_user(request)
+    if not actor:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    stored = set_article_image_decision(
+        article_id=article_id,
+        image_url=image_url,
+        action=action,
+        actor=actor,
+    )
+    if stored:
+        return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303)
+    return _dashboard_redirect(msg=f"Bildaktion fehlgeschlagen fuer Artikel #{article_id}", msg_type="error")
+
+
+@router.post("/admin/articles/{article_id}/publish-enqueue")
+def admin_enqueue_publish(request: Request, article_id: int, max_attempts: str = Form("3")):
+    """Queue a publish job for an article and return to its detail page.
+
+    ``max_attempts`` arrives as form text and is clamped to at least 1. Any
+    failure, including a non-numeric ``max_attempts``, redirects to the
+    dashboard with an error message.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+    try:
+        job_id = enqueue_publish(article_id=article_id, max_attempts=max(1, int(max_attempts)))
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Publish Queue Fehler fuer Artikel #{article_id}: {exc}", msg_type="error")
+    # Encode the query string: a raw "#" would start the URL fragment and cut
+    # off both the rest of the message and the "type" parameter.
+    query = urlencode({"msg": f"Publish-Job #{job_id} erstellt", "type": "success"})
+    return RedirectResponse(url=f"/admin/articles/{article_id}?{query}", status_code=303)
+
+
+@router.post("/admin/publisher/run")
+def admin_run_publisher(request: Request, max_jobs: str = Form("10")):
+    """Run the publisher for at least one job and report its counters on the dashboard."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+    try:
+        job_limit = max(1, int(max_jobs))
+        stats = run_publisher(max_jobs=job_limit)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Publisher Fehler: {exc}", msg_type="error")
+    summary = f"Publisher: processed={stats.processed}, success={stats.success}, failed={stats.failed}, requeued={stats.requeued}"
+    return _dashboard_redirect(msg=summary)
+
+
+@router.get("/admin/images/proxy")
+def admin_image_proxy(request: Request, url: str):
+    """Fetch a remote image server-side and relay it to the admin UI.
+
+    Responses: 401 without a valid admin session, 400 when ``url`` is rejected
+    by ``_is_http_image_url``, 404 when the upstream fetch fails, 415 when the
+    upstream content type is not ``image/*``; otherwise the image bytes with
+    the upstream content type.
+    """
+    # Every other /admin route requires a session. Without this check the
+    # endpoint is an open proxy: anyone could make the server fetch arbitrary URLs.
+    if not _admin_user(request):
+        return Response(status_code=401)
+    if not _is_http_image_url(url):
+        return Response(status_code=400)
+
+    try:
+        referer = request.headers.get("referer", "")
+        req = UrlRequest(
+            url=url,
+            headers={
+                "User-Agent": IMAGE_PROXY_USER_AGENT,
+                "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
+                # Forward the caller's referer, or the image URL itself when none was sent.
+                "Referer": referer or url,
+            },
+        )
+        # NOTE(review): the body is read without a size limit; consider capping it.
+        with urlopen(req, timeout=10) as resp:
+            body = resp.read()
+            content_type = resp.headers.get("Content-Type", "application/octet-stream")
+    except Exception:
+        return Response(status_code=404)
+
+    if not content_type.lower().startswith("image/"):
+        return Response(status_code=415)
+    return Response(content=body, media_type=content_type)
+
+
+@router.post("/admin/sources/create")
+def admin_create_source(
+    request: Request,
+    name: str = Form(...),
+    base_url: str = Form(""),
+    terms_url: str = Form(""),
+    license_name: str = Form(""),
+    risk_level: str = Form("yellow"),
+    last_reviewed_at: str = Form(""),
+):
+    """Create a new, enabled source from the dashboard form.
+
+    Empty optional fields are stored as ``None``. Any exception raised while
+    building or saving the source is reported as an error flash message.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    optional_fields = {
+        "base_url": base_url or None,
+        "terms_url": terms_url or None,
+        "license_name": license_name or None,
+        "last_reviewed_at": last_reviewed_at or None,
+    }
+    try:
+        payload = SourceCreate(
+            name=name,
+            risk_level=risk_level,
+            is_enabled=True,
+            notes=None,
+            **optional_fields,
+        )
+        create_source(payload)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error")
+    return _dashboard_redirect(msg="Quelle gespeichert")
+
+
+@router.post("/admin/sources/{source_id}/update")
+def admin_update_source(
+    request: Request,
+    source_id: int,
+    name: str = Form(...),
+    base_url: str = Form(""),
+    terms_url: str = Form(""),
+    license_name: str = Form(""),
+    risk_level: str = Form("yellow"),
+    is_enabled: str = Form("1"),
+    notes: str = Form(""),
+    last_reviewed_at: str = Form(""),
+):
+    """Update an existing source from the dashboard form.
+
+    Empty optional fields become ``None``; the source is enabled only when
+    ``is_enabled`` is exactly ``"1"``. Exceptions and unknown ids are both
+    reported as error flash messages.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    enabled_flag = is_enabled == "1"
+    try:
+        changes = SourceUpdate(
+            name=name,
+            base_url=base_url or None,
+            terms_url=terms_url or None,
+            license_name=license_name or None,
+            risk_level=risk_level,
+            is_enabled=enabled_flag,
+            notes=notes or None,
+            last_reviewed_at=last_reviewed_at or None,
+        )
+        found = update_source(source_id, changes)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Quelle #{source_id} Update fehlgeschlagen: {exc}", msg_type="error")
+
+    if found:
+        return _dashboard_redirect(msg=f"Quelle #{source_id} aktualisiert")
+    return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error")
+
+
+@router.post("/admin/sources/{source_id}/delete")
+def admin_delete_source(request: Request, source_id: int):
+    """Delete a source and report on the dashboard whether it existed."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+    if delete_source(source_id):
+        return _dashboard_redirect(msg=f"Quelle #{source_id} gelöscht")
+    return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error")
+
+
+@router.post("/admin/feeds/create")
+def admin_create_feed(
+    request: Request,
+    name: str = Form(...),
+    url: str = Form(...),
+    source_id: str = Form(""),
+):
+    """Create a new, enabled feed from the dashboard form.
+
+    ``source_id`` arrives as form text and is converted with
+    ``_to_optional_int``. Any exception raised while converting, building or
+    saving is reported as an error flash message.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    try:
+        linked_source = _to_optional_int(source_id)
+        new_feed = FeedCreate(name=name, url=url, source_id=linked_source, is_enabled=True)
+        create_feed(new_feed)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error")
+    return _dashboard_redirect(msg="Feed gespeichert")
+
+
+@router.post("/admin/feeds/{feed_id}/update")
+def admin_update_feed(
+    request: Request,
+    feed_id: int,
+    name: str = Form(...),
+    url: str = Form(...),
+    source_id: str = Form(""),
+    is_enabled: str = Form("1"),
+):
+    """Update an existing feed from the dashboard form.
+
+    The feed is enabled only when ``is_enabled`` is exactly ``"1"``. Exceptions
+    and unknown ids are both reported as error flash messages.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    try:
+        changes = FeedUpdate(
+            name=name,
+            url=url,
+            source_id=_to_optional_int(source_id),
+            is_enabled=(is_enabled == "1"),
+        )
+        found = update_feed(feed_id, changes)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Feed #{feed_id} Update fehlgeschlagen: {exc}", msg_type="error")
+
+    if found:
+        return _dashboard_redirect(msg=f"Feed #{feed_id} aktualisiert")
+    return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error")
+
+
+@router.post("/admin/feeds/{feed_id}/delete")
+def admin_delete_feed(request: Request, feed_id: int):
+    """Delete a feed and report on the dashboard whether it existed."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+    if delete_feed(feed_id):
+        return _dashboard_redirect(msg=f"Feed #{feed_id} gelöscht")
+    return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error")
+
+
+@router.post("/admin/ingestion/run")
+def admin_run_ingestion(request: Request, feed_id: str = Form("")):
+    """Start an ingestion run, optionally for one feed, and report the result on the dashboard."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+    try:
+        selected_feed = _to_optional_int(feed_id)
+        stats = run_ingestion(feed_id=selected_feed)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error")
+    summary = f"Ingestion: {stats.status}, upserts={stats.articles_upserted}"
+    return _dashboard_redirect(msg=summary)
+
+
+@router.post("/admin/articles/{article_id}/review")
+def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")):
+    """Retired review endpoint: always answers with an error message for signed-in admins.
+
+    The form fields are still accepted but are not used.
+    """
+    if _admin_user(request):
+        return _dashboard_redirect(msg="Review-Aktion wurde durch Rewrite ersetzt", msg_type="error")
+    return RedirectResponse(url="/admin/login", status_code=303)
+
+
+@router.post("/admin/articles/{article_id}/rewrite-run")
+def admin_rewrite_run(request: Request, article_id: int):
+    """Rewrite one article, generate tags for it and store it with status ``approved``.
+
+    Only articles whose UI status is ``new`` or ``rewrite`` are accepted.
+    Errors from rewriting or tag generation are reported as a flash message.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    article = get_article_by_id(article_id)
+    if not article:
+        return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
+
+    ui_status = internal_to_ui_status(article.get("status"))
+    if ui_status not in ("new", "rewrite"):
+        return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
+
+    try:
+        new_text = rewrite_article_text(article)
+        new_tags = generate_article_tags(article, rewritten_text=new_text)
+    except Exception as exc:
+        return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
+
+    # Merging and saving happen outside the try block, so their errors propagate.
+    _upsert_article_from_existing(
+        article,
+        content_rewritten=new_text,
+        status="approved",
+        meta_json=merge_generated_tags(article.get("meta_json"), new_tags),
+    )
+    return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
+
+
+@router.post("/admin/rewrite/run")
+def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
+    """Rewrite a batch of articles that are waiting in status ``rewrite``.
+
+    ``max_jobs`` is clamped to 1..100 and falls back to 10 when it cannot be
+    parsed. A failure on one article is counted and does not stop the batch.
+    """
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    try:
+        batch_size = max(1, min(int(max_jobs), 100))
+    except Exception:
+        batch_size = 10
+
+    processed = 0
+    succeeded = 0
+    for processed, article in enumerate(list_articles(limit=batch_size, status_filter="rewrite"), start=1):
+        try:
+            new_text = rewrite_article_text(article)
+            new_tags = generate_article_tags(article, rewritten_text=new_text)
+            _upsert_article_from_existing(
+                article,
+                content_rewritten=new_text,
+                status="approved",
+                meta_json=merge_generated_tags(article.get("meta_json"), new_tags),
+            )
+        except Exception:
+            # Counted as failed below; the batch carries on with the next article.
+            continue
+        succeeded += 1
+
+    # Every processed article either succeeded or failed.
+    failed = processed - succeeded
+    return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={succeeded}, failed={failed}")
+
+
+@router.post("/admin/articles/{article_id}/rewrite-save")
+def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
+    """Save manually edited rewrite text for an article; blank text is rejected."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    article = get_article_by_id(article_id)
+    if not article:
+        return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
+
+    cleaned = (content_rewritten or "").strip()
+    if cleaned:
+        _upsert_article_from_existing(article, content_rewritten=cleaned)
+        target = f"/admin/articles/{article_id}?msg=Rewrite-Text%20gespeichert&type=success"
+    else:
+        target = f"/admin/articles/{article_id}?msg=Rewrite-Text%20darf%20nicht%20leer%20sein&type=error"
+    return RedirectResponse(url=target, status_code=303)
+
+
+@router.post("/admin/articles/{article_id}/reopen")
+def admin_reopen_article(request: Request, article_id: int):
+    """Put an article back into the rewrite workflow and clear its WordPress publish state."""
+    if not _admin_user(request):
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    article = get_article_by_id(article_id)
+    if not article:
+        return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
+
+    # Explicit None values clear the stored WordPress reference and error state.
+    cleared_publish_state = {
+        "wp_post_id": None,
+        "wp_post_url": None,
+        "publish_attempts": 0,
+        "publish_last_error": None,
+        "published_to_wp_at": None,
+    }
+    _upsert_article_from_existing(article, status="rewrite", **cleared_publish_state)
+
+    detail_url = f"/admin/articles/{article_id}?msg=Artikel%20zurueck%20in%20Rewrite-Workflow%20gesetzt&type=success"
+    return RedirectResponse(url=detail_url, status_code=303)
+
+
+@router.post("/admin/articles/{article_id}/transition")
+def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")):
+    """Move an article to another status if ``ALLOWED_TRANSITIONS`` permits it.
+
+    An unknown article and a disallowed transition both produce the same
+    error message on the dashboard.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    def rejected():
+        return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")
+
+    article = get_article_by_id(article_id)
+    if not article:
+        return rejected()
+
+    current_ui = internal_to_ui_status(article.get("status"))
+    target_internal = ui_to_internal_status(target_status)
+    # Convert the target back to its UI name so the check and the message use UI statuses.
+    target_ui = internal_to_ui_status(target_internal)
+    if target_ui not in ALLOWED_TRANSITIONS.get(current_ui, ()):
+        return rejected()
+
+    update_article_status(article_id, target_internal, actor=user, note=note or None)
+    return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}")
+
+
+# Number of articles requested per page by the paginated article list view.
+_PAGE_SIZE = 50
+
+
+@router.get("/admin/article-list", response_class=HTMLResponse)
+def admin_article_list(request: Request):
+    """Paginated article list with inline WP ID editing.
+
+    Query parameters: ``page`` (1-based; invalid values fall back to 1),
+    ``status_filter`` and ``search``. Each article gets a thumbnail URL, the
+    matching image-proxy URL and a short excerpt before rendering.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    # A hand-edited or garbled ?page= value must not turn into an HTTP 500.
+    try:
+        page = max(1, int(request.query_params.get("page", 1)))
+    except (TypeError, ValueError):
+        page = 1
+    status_filter = request.query_params.get("status_filter", "") or None
+    search = request.query_params.get("search", "").strip() or None
+    offset = (page - 1) * _PAGE_SIZE
+
+    articles, total = list_articles_page(
+        limit=_PAGE_SIZE,
+        offset=offset,
+        status_filter=status_filter,
+        search=search,
+    )
+
+    # Enrich each article with thumbnail URL and excerpt.
+    for a in articles:
+        meta = _parse_meta_json(a.get("meta_json"))
+        image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
+        sel = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
+        if not sel:
+            # Fall back to the extraction's primary image. Check each level
+            # is a dict so malformed metadata cannot crash the whole page.
+            extraction = meta.get("extraction")
+            selection = extraction.get("image_selection") if isinstance(extraction, dict) else None
+            sel = selection.get("primary") if isinstance(selection, dict) else None
+        a["thumb_url"] = sel
+        a["thumb_proxy"] = f"/admin/images/proxy?{urlencode({'url': sel})}" if sel else None
+        raw = (a.get("content_raw") or a.get("summary") or "").strip()
+        a["excerpt"] = raw[:120] + "…" if len(raw) > 120 else raw
+
+    # Ceiling division; an empty result still counts as one page.
+    total_pages = max(1, (total + _PAGE_SIZE - 1) // _PAGE_SIZE)
+
+    return templates.TemplateResponse(
+        request,
+        "admin_article_list.html",
+        {
+            "request": request,
+            "title": "Artikelliste",
+            "user": user,
+            "articles": articles,
+            "page": page,
+            "total_pages": total_pages,
+            "total": total,
+            "page_size": _PAGE_SIZE,
+            "status_filter": status_filter or "",
+            "search": search or "",
+            "flash_msg": request.query_params.get("msg", ""),
+            "flash_type": request.query_params.get("type", "success"),
+        },
+    )
+
+
+@router.post("/admin/article-list/update")
+async def admin_article_list_update(request: Request):
+    """Bulk update WP post IDs from the article list form.
+
+    Each article contributes two fields: ``wp_<id>`` (new value) and
+    ``orig_<id>`` (value when the page was rendered). Only changed values are
+    written; an empty new value clears the WP post ID. Non-numeric values are
+    skipped and reported in the flash message instead of raising.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    form = await request.form()
+    updates: list[tuple[int, int | None]] = []
+    invalid_ids: list[int] = []
+
+    # Form fields: wp_<id> = new value, orig_<id> = original value
+    for key, new_val in form.items():
+        if not key.startswith("wp_"):
+            continue
+        try:
+            article_id = int(key[3:])
+        except ValueError:
+            continue
+        orig_val = str(form.get(f"orig_{article_id}", "")).strip()
+        new_val_s = str(new_val).strip()
+        if new_val_s == orig_val:
+            continue  # unchanged
+        if not new_val_s:
+            updates.append((article_id, None))
+            continue
+        try:
+            updates.append((article_id, int(new_val_s)))
+        except ValueError:
+            # Free-text input such as "abc" must not abort the whole request.
+            invalid_ids.append(article_id)
+
+    if updates:
+        count = bulk_update_wp_post_ids(updates)
+        msg = f"{count} WP-ID(s) aktualisiert. Bitte jetzt WP-Sync ausführen um Slots & URLs zu aktualisieren."
+    else:
+        msg = "Keine Änderungen erkannt."
+    msg_type = "success"
+    if invalid_ids:
+        skipped = ", ".join(f"#{article_id}" for article_id in invalid_ids)
+        msg += f" Ungültige WP-ID ignoriert für Artikel: {skipped}."
+        msg_type = "error"
+
+    # Preserve pagination/filter params submitted with the form
+    page = str(form.get("page", "1"))
+    status_filter = str(form.get("status_filter", ""))
+    search = str(form.get("search", ""))
+    qs: dict[str, str] = {"msg": msg, "type": msg_type, "page": page}
+    if status_filter:
+        qs["status_filter"] = status_filter
+    if search:
+        qs["search"] = search
+    return RedirectResponse(url=f"/admin/article-list?{urlencode(qs)}", status_code=303)
+
+
+@router.post("/admin/wp-sync")
+def admin_wp_sync(request: Request):
+    """Sync scheduled_publish_at and WP references in the DB from WordPress.
+
+    Redirects to the schedule page with a summary of the sync counters, or
+    with the error text if the import or the sync raises.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+    try:
+        # Imported inside the try block so an import failure is reported like a sync failure.
+        from .wordpress import sync_db_from_wordpress
+
+        stats = sync_db_from_wordpress()
+        msg = (
+            f"WP-Sync abgeschlossen: "
+            f"{stats['slot_updated']} Slots aktualisiert, "
+            f"{stats['slot_cleared_draft']} Slots geleert (Draft), "
+            f"{stats['marked_published']} als veröffentlicht markiert, "
+            f"{stats['wp_reference_cleared']} WP-Referenzen gelöscht (Papierkorb), "
+            f"{stats['already_in_sync']} bereits synchron."
+        )
+        params = {"msg": msg, "type": "success"}
+    except Exception as exc:
+        params = {"msg": f"Sync fehlgeschlagen: {exc}", "type": "error"}
+    # Encode the query string: the message has spaces, commas and umlauts, and
+    # exception text may contain "&" or "#", which would corrupt a raw URL.
+    return RedirectResponse(url=f"/admin/schedule?{urlencode(params)}", status_code=303)
+
+
+@router.post("/admin/articles/{article_id}/retry")
+def admin_retry_article(request: Request, article_id: int):
+    """Release the article's publish slot and set its status back to ``new``."""
+    actor = _admin_user(request)
+    if not actor:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    if not get_article_by_id(article_id):
+        return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
+
+    from .scheduler import release_publish_slot
+
+    # Release the slot first, then reset the status.
+    release_publish_slot(article_id)
+    update_article_status(
+        article_id,
+        "new",
+        actor=actor,
+        note="Manuell zurückgesetzt für erneuten Pipeline-Versuch",
+    )
+    confirmation = f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet"
+    return _dashboard_redirect(msg=confirmation, status_filter="close")
+
+
+@router.get("/admin/schedule", response_class=HTMLResponse)
+def admin_schedule(request: Request):
+    """Render the publishing plan as a day-by-day grid of preferred hour slots.
+
+    The grid covers 61 days (today plus the following 60). Each cell shows
+    whether a slot from the schedule overview is booked at that date and hour.
+    """
+    user = _admin_user(request)
+    if not user:
+        return RedirectResponse(url="/admin/login", status_code=303)
+
+    from .scheduler import get_schedule_overview, _preferred_hours, _today_cet
+    from datetime import timedelta
+
+    weekday_names = ("Mo", "Di", "Mi", "Do", "Fr", "Sa", "So")
+
+    slots = get_schedule_overview(lookahead_days=60)
+    today = _today_cet()
+    hours = _preferred_hours()
+
+    # Index booked slots by (ISO date, hour) for constant-time grid lookups.
+    booked_by_slot: dict[tuple[str, int], dict] = {}
+    for entry in slots:
+        booked_by_slot[(entry["date"], entry["hour"])] = entry
+
+    calendar_days = []
+    for day_offset in range(61):
+        day = today + timedelta(days=day_offset)
+        iso_day = day.isoformat()
+        day_slots = [
+            {
+                "hour": hour,
+                "booked": (iso_day, hour) in booked_by_slot,
+                "slot": booked_by_slot.get((iso_day, hour)),
+            }
+            for hour in hours
+        ]
+        calendar_days.append(
+            {
+                "date": iso_day,
+                "date_fmt": day.strftime("%d.%m.%Y"),
+                "weekday": weekday_names[day.weekday()],
+                "slots": day_slots,
+                "any_booked": any(cell["booked"] for cell in day_slots),
+            }
+        )
+
+    context = {
+        "request": request,
+        "title": "Veröffentlichungsplan",
+        "user": user,
+        "slots": slots,
+        "calendar_days": calendar_days,
+        "hours": hours,
+        "flash_msg": request.query_params.get("msg", ""),
+        "flash_type": request.query_params.get("type", "success"),
+    }
+    return templates.TemplateResponse(request, "admin_schedule.html", context)
diff --git a/backend/app/auth.py b/backend/app/auth.py
new file mode 100644
index 0000000..188397f
--- /dev/null
+++ b/backend/app/auth.py
@@ -0,0 +1,31 @@
+import hmac
+from typing import Optional
+
+from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
+
+from .config import get_settings
+
+
+def _serializer() -> URLSafeTimedSerializer:
+    """Build the serializer for session tokens, keyed with the app secret and a fixed salt."""
+    secret = get_settings().app_secret_key
+    return URLSafeTimedSerializer(secret, salt="rss-news-session")
+
+
+def verify_credentials(username: str, password: str) -> bool:
+    """Return True when both username and password match the configured admin account.
+
+    Both values are compared with ``hmac.compare_digest``, and both comparisons
+    always run, whether or not the username matches.
+    """
+    settings = get_settings()
+    # compare_digest() raises TypeError for str arguments that contain
+    # non-ASCII characters (e.g. umlauts), so compare UTF-8 bytes instead.
+    user_ok = hmac.compare_digest(
+        username.encode("utf-8"),
+        settings.app_admin_username.encode("utf-8"),
+    )
+    pw_ok = hmac.compare_digest(
+        password.encode("utf-8"),
+        settings.app_admin_password.encode("utf-8"),
+    )
+    return user_ok and pw_ok
+
+
+def create_session_token(username: str) -> str:
+    """Return a signed, timestamped token that carries the given username."""
+    payload = {"username": username}
+    return _serializer().dumps(payload)
+
+
+def verify_session_token(token: str) -> Optional[str]:
+    """Return the username stored in a session token, or None if it is invalid or expired."""
+    max_age = get_settings().session_max_age_seconds
+    try:
+        data = _serializer().loads(token, max_age=max_age)
+    except (BadSignature, SignatureExpired):
+        # Bad or expired signature: treat the visitor as not signed in.
+        return None
+    return data.get("username")
diff --git a/backend/app/config.py b/backend/app/config.py
new file mode 100644
index 0000000..24c3902
--- /dev/null
+++ b/backend/app/config.py
@@ -0,0 +1,65 @@
+from functools import lru_cache
+from pathlib import Path
+
+from dotenv import load_dotenv
+from pydantic import AliasChoices, Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+ """Application configuration, populated from environment variables and .env files.
+
+ Fields declared with ``AliasChoices`` accept any of the listed variable names.
+ """
+ # Prefer backend-specific env file to avoid collisions with legacy root .env
+ model_config = SettingsConfigDict(
+ env_file=("backend/.env", ".env"),
+ env_file_encoding="utf-8",
+ extra="ignore",
+ )
+
+ app_env: str = "development"
+ app_name: str = "rss-news-backend"
+ # Placeholder default; the session-token serializer is keyed with this value.
+ app_secret_key: str = "replace-with-a-long-random-secret"
+
+ # Admin login credentials; the defaults are placeholders.
+ app_admin_username: str = "admin"
+ app_admin_password: str = "change-me"
+
+ session_cookie_name: str = "rss_news_session"
+ # Session lifetime in seconds (28800 s = 8 hours).
+ session_max_age_seconds: int = 28800
+
+ # Path of the SQLite database file.
+ app_db_path: str = "backend/data/rss_news.db"
+
+ # WordPress connection; each field accepts the WORDPRESS_* name or the legacy WP_* name.
+ wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL"))
+ wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME"))
+ wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD"))
+ wordpress_default_status: str = "draft"
+ openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY"))
+ openai_model: str = "gpt-4o-mini"
+
+ # Telegram Bot
+ telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN"))
+ telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID"))
+ telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET"))
+
+ # N8N API authentication
+ n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY"))
+
+ # Pipeline behaviour
+ pipeline_relevance_auto: int = 80 # >= this: auto-process
+ pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
+ pipeline_max_drafts_per_day: int = 2
+ pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
+ pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
+ pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
+ pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit)
+
+
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    """Load the known .env files into the environment and return the cached Settings.
+
+    Files are loaded in priority order with ``override=False``, so a variable
+    set by an earlier file, or already in the environment, is not replaced.
+    """
+    # Prefer shared legacy env from the original rss-news workspace if present.
+    candidate_paths = (
+        "/Users/oliver/Documents/rss-news/.env",
+        "backend/.env",
+        ".env",
+    )
+    for candidate in map(Path, candidate_paths):
+        if not candidate.exists():
+            continue
+        load_dotenv(candidate, override=False)
+    return Settings()
diff --git a/backend/app/db.py b/backend/app/db.py
new file mode 100644
index 0000000..b6ef898
--- /dev/null
+++ b/backend/app/db.py
@@ -0,0 +1,293 @@
+import sqlite3
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Iterator
+
+from .config import get_settings
+
+
def _db_path() -> Path:
    """Return the configured SQLite file path, creating its parent directory if missing."""
    db_file = Path(get_settings().app_db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
+
+
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a configured SQLite connection and commit when the block succeeds.

    The connection uses ``sqlite3.Row`` as row factory and has foreign-key
    enforcement switched on. If the ``with`` body raises, ``commit()`` is
    skipped and the connection is closed with the transaction still open, so
    the exception propagates to the caller without the changes being committed.

    Yields:
        An open ``sqlite3.Connection`` bound to the configured database file.
    """
    conn = sqlite3.connect(_db_path())
    try:
        # Configure inside the ``try`` so that a failing setup statement
        # cannot leave the freshly opened connection unclosed.
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON;")
        yield conn
        conn.commit()
    finally:
        conn.close()
+
+
def init_db() -> None:
    """Create the schema if needed and migrate older databases in place.

    The work happens in four ordered steps on one connection:

    1. Create all tables that do not exist yet.
    2. Add article columns that older databases are missing.
    3. Rebuild ``articles`` when its ``status`` CHECK constraint predates the
       ``'no_image'`` value (SQLite cannot alter a CHECK constraint in place).
    4. Create indexes and ``updated_at`` triggers.

    Indexes are created last on purpose: several of them reference columns
    (for example ``source_hash``) that only exist on legacy databases after
    step 2, and the rebuild in step 3 drops every index and trigger attached
    to the old ``articles`` table.
    """
    # Single definition of the articles columns, used both for a fresh table
    # and for the rebuilt copy, so the two can never drift apart.
    articles_columns_sql = """
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feed_id INTEGER,
            source_article_id TEXT,
            source_hash TEXT,
            title TEXT NOT NULL,
            source_url TEXT NOT NULL,
            canonical_url TEXT,
            published_at TEXT,
            author TEXT,
            summary TEXT,
            content_raw TEXT,
            content_rewritten TEXT,
            image_urls_json TEXT,
            press_contact TEXT,
            source_name_snapshot TEXT,
            source_terms_url_snapshot TEXT,
            source_license_name_snapshot TEXT,
            legal_checked INTEGER NOT NULL DEFAULT 0,
            legal_checked_at TEXT,
            legal_note TEXT,
            wp_post_id INTEGER,
            wp_post_url TEXT,
            publish_attempts INTEGER NOT NULL DEFAULT 0,
            publish_last_error TEXT,
            published_to_wp_at TEXT,
            word_count INTEGER DEFAULT 0,
            status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
            meta_json TEXT,
            relevance_score INTEGER,
            scheduled_publish_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
            UNIQUE(source_url)
    """
    # Explicit column list for the rebuild copy; naming the columns on both
    # sides makes the copy independent of the physical column order of the
    # old table (ALTER TABLE ADD COLUMN always appends at the end).
    articles_column_names = ", ".join(
        (
            "id", "feed_id", "source_article_id", "source_hash", "title", "source_url",
            "canonical_url", "published_at", "author", "summary", "content_raw",
            "content_rewritten", "image_urls_json", "press_contact",
            "source_name_snapshot", "source_terms_url_snapshot", "source_license_name_snapshot",
            "legal_checked", "legal_checked_at", "legal_note",
            "wp_post_id", "wp_post_url", "publish_attempts", "publish_last_error",
            "published_to_wp_at", "word_count", "status", "meta_json",
            "relevance_score", "scheduled_publish_at", "created_at", "updated_at",
        )
    )

    tables_sql = f"""
        PRAGMA journal_mode=WAL;

        CREATE TABLE IF NOT EXISTS sources (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            base_url TEXT,
            terms_url TEXT,
            license_name TEXT,
            risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
            is_enabled INTEGER NOT NULL DEFAULT 0,
            notes TEXT,
            last_reviewed_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE IF NOT EXISTS feeds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_id INTEGER,
            name TEXT NOT NULL,
            url TEXT NOT NULL UNIQUE,
            is_enabled INTEGER NOT NULL DEFAULT 1,
            etag TEXT,
            last_modified TEXT,
            last_checked_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
        );

        CREATE TABLE IF NOT EXISTS runs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            run_type TEXT NOT NULL,
            status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
            started_at TEXT NOT NULL DEFAULT (datetime('now')),
            finished_at TEXT,
            details TEXT
        );

        CREATE TABLE IF NOT EXISTS publish_jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_id INTEGER NOT NULL,
            status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
            attempts INTEGER NOT NULL DEFAULT 0,
            max_attempts INTEGER NOT NULL DEFAULT 3,
            error_message TEXT,
            wp_post_id INTEGER,
            wp_post_url TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            started_at TEXT,
            finished_at TEXT,
            FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
        );

        CREATE TABLE IF NOT EXISTS articles ({articles_columns_sql});
    """

    # Columns added after the first release; applied only where missing.
    migration_columns = {
        "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER",
        "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT",
        "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
        "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
        "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
        "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
        "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
        "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
        "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
        "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
        "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
        "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER",
        "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT",
        "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0",
        "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT",
        "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT",
    }

    # Table rebuild following SQLite's documented procedure for schema changes
    # that ALTER TABLE cannot express. Foreign keys are disabled outside the
    # transaction (the pragma is a no-op inside one); the copy itself runs in
    # a single transaction so a failure leaves the original table untouched.
    # A leftover articles_v2 from an earlier aborted attempt is discarded.
    rebuild_articles_sql = f"""
        PRAGMA foreign_keys=OFF;
        BEGIN;

        DROP TABLE IF EXISTS articles_v2;
        CREATE TABLE articles_v2 ({articles_columns_sql});

        INSERT INTO articles_v2 ({articles_column_names})
        SELECT {articles_column_names}
        FROM articles;

        DROP TABLE articles;
        ALTER TABLE articles_v2 RENAME TO articles;

        COMMIT;
        PRAGMA foreign_keys=ON;
    """

    indexes_and_triggers_sql = """
        CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
        CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
            ON articles(feed_id, source_article_id)
            WHERE source_article_id IS NOT NULL;
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
            ON articles(source_hash)
            WHERE source_hash IS NOT NULL;
        CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
        CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
        CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
        CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
        CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);

        CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
        AFTER UPDATE ON sources
        FOR EACH ROW
        BEGIN
            UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
        END;

        CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
        AFTER UPDATE ON feeds
        FOR EACH ROW
        BEGIN
            UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
        END;

        CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
        AFTER UPDATE ON articles
        FOR EACH ROW
        BEGIN
            UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
    """

    with get_conn() as conn:
        # Step 1: tables.
        conn.executescript(tables_sql)

        # Step 2: lightweight column migration for databases created earlier.
        existing_columns = {
            row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
        }
        for column, ddl in migration_columns.items():
            if column not in existing_columns:
                conn.execute(ddl)

        # Step 3: rebuild when the stored CHECK constraint lacks 'no_image'.
        table_sql_row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
        ).fetchone()
        if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
            conn.executescript(rebuild_articles_sql)

        # Step 4: indexes and triggers (idempotent; also restores the ones
        # dropped together with the old articles table during a rebuild).
        conn.executescript(indexes_and_triggers_sql)
+
+
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert each ``sqlite3.Row`` into a plain ``dict`` keyed by column name."""
    return list(map(dict, rows))
diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py
new file mode 100644
index 0000000..391af92
--- /dev/null
+++ b/backend/app/ingestion.py
@@ -0,0 +1,486 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+import hashlib
+import json
+import re
+import time
+from typing import Any
+from urllib.parse import unquote, urlencode, urlparse, parse_qs
+import urllib.error
+import urllib.request as _urllib_req
+
+import feedparser
+
+from .repositories import (
+ ArticleUpsert,
+ RunCreate,
+ create_run,
+ find_existing_article_for_upsert,
+ finish_run,
+ get_feed_by_id,
+ list_enabled_feeds,
+ update_feed_fetch_state,
+ upsert_article,
+)
+from .source_extraction import extract_article, extracted_article_to_meta
+
+
@dataclass(frozen=True)
class IngestionStats:
    """Immutable summary returned by ``run_ingestion`` for a single run."""

    run_id: int  # id returned by create_run for this ingestion run
    feeds_processed: int  # feeds iterated, including feeds whose fetch failed
    entries_seen: int  # feed entries encountered, including entries that were skipped
    articles_upserted: int  # upsert_article calls that returned a truthy article id
    status: str  # "success" or "failed"
    message: str  # fixed completion message, or str(exc) when the run failed
+
+
# Maximum number of feedparser.parse attempts per feed before it is recorded as failed.
MAX_FEED_FETCH_RETRIES = 3
+
+
+def _normalize_article_url(url: str) -> str:
+ """Strip AMP and tracking query parameters from article URLs.
+
+ Removes ?outputType=valid_amp and other AMP/tracking params so that
+ AMP and non-AMP versions of the same article are deduplicated.
+ """
+ _AMP_PARAMS = {"outputtype", "amp", "outputformat"}
+ try:
+ from urllib.parse import parse_qs, urlencode
+ parsed = urlparse(url)
+ if not parsed.query:
+ return url
+ params = parse_qs(parsed.query, keep_blank_values=True)
+ filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
+ new_query = urlencode(filtered, doseq=True)
+ return parsed._replace(query=new_query).geturl()
+ except Exception:
+ return url
+
+
+def _resolve_google_redirect(url: str) -> str:
+ """Extract the real article URL from Google redirect URLs.
+
+ Google Alerts feed entries use tracking links like:
+ https://www.google.com/url?rct=j&sa=t&url=&ct=ga&...
+
+ This function returns the decoded real URL if detected, otherwise the
+ original URL unchanged.
+ """
+ try:
+ parsed = urlparse(url)
+ host = (parsed.hostname or "").lower()
+ if host not in ("www.google.com", "google.com"):
+ return url
+ if parsed.path not in ("/url", "/url/"):
+ return url
+ params = parse_qs(parsed.query, keep_blank_values=False)
+ real_urls = params.get("url")
+ if real_urls:
+ return unquote(real_urls[0])
+ except Exception:
+ pass
+ return url
+
+
+def _entry_published_iso(entry: dict) -> str | None:
+ published = entry.get("published_parsed") or entry.get("updated_parsed")
+ if not published:
+ return None
+ return datetime(*published[:6], tzinfo=timezone.utc).isoformat()
+
+
+def _entry_text(entry: dict) -> tuple[str, str]:
+ summary = entry.get("summary", "") or ""
+ content = ""
+ if entry.get("content") and isinstance(entry.get("content"), list):
+ first = entry["content"][0]
+ content = first.get("value", "") if isinstance(first, dict) else ""
+ if not content:
+ content = summary
+ return summary, content
+
+
def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
    """Return a SHA-256 hex digest identifying one feed entry.

    The digest covers the feed id, the entry's ``id``/``guid``, the link, the
    stripped title and summary, and the published timestamp, joined with
    ``|`` in that order.
    """
    parts = (
        feed_id,
        entry.get("id") or entry.get("guid") or "",
        link,
        title.strip(),
        summary.strip(),
        _entry_published_iso(entry) or "",
    )
    fingerprint = "|".join(str(part) for part in parts)
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
+
+
+def _parsed_get(parsed: object, key: str, default: object = None) -> object:
+ if isinstance(parsed, dict):
+ return parsed.get(key, default)
+ return getattr(parsed, key, default)
+
+
+def _normalize_tokens(text: str) -> set[str]:
+ normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
+ return {token for token in normalized.split() if len(token) >= 4}
+
+
+def _probe_image_url(url: str, timeout: int = 5) -> bool:
+ """Return True if URL responds without a 4xx/5xx error (HEAD request).
+
+ Returns True on network/connection errors so that a flaky server does not
+ cause a valid image to be silently dropped.
+ """
+ try:
+ req = _urllib_req.Request(
+ url,
+ method="HEAD",
+ headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
+ )
+ with _urllib_req.urlopen(req, timeout=timeout) as resp:
+ return resp.status < 400
+ except urllib.error.HTTPError as exc:
+ return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
+ except Exception:
+ return True # network error → don't filter, let WP try later
+
+
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
    """Score image URLs for use as article image and return them best first.

    Each result is ``{"url", "score", "reasons"}``. ``data:`` URIs are left
    out entirely. Candidates with equal scores keep their input order.
    """
    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
    # Known placeholder/default images that should never be used as featured image
    placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
    # (path marker, score delta, reason); only the first matching rule applies.
    presseportal_rules = (
        ("/thumbnail/story_big/", 120, "presseportal-story-big"),
        ("/thumbnail/highlight/", 45, "presseportal-highlight"),
        ("/thumbnail/liste/", -40, "presseportal-list"),
    )

    from_presseportal = "presseportal.de" in (urlparse(source_url).hostname or "").lower()
    wanted_tokens = _normalize_tokens(title)

    def _evaluate(image_url: str) -> dict[str, Any]:
        parts = urlparse(image_url)
        image_path = unquote(parts.path.lower())
        haystack = f"{parts.netloc.lower()}{image_path}"
        score = 0
        reasons: list[str] = []

        if any(marker in haystack for marker in placeholder_patterns):
            score -= 300
            reasons.append("placeholder-image")

        if any(marker in haystack for marker in blocked_patterns):
            score -= 150
            reasons.append("blocked-pattern")

        if from_presseportal:
            for marker, delta, reason in presseportal_rules:
                if marker in image_path:
                    score += delta
                    reasons.append(reason)
                    break

        if "crop=" in (parts.query or "").lower():
            score -= 10
            reasons.append("cropped-preview")

        # Reward path words that also occur in the title, capped at +30.
        shared = len(wanted_tokens & _normalize_tokens(image_path.replace("-", " ")))
        if shared > 0:
            score += min(30, shared * 6)
            reasons.append(f"title-match:{shared}")

        return {"url": image_url, "score": score, "reasons": reasons}

    # Inline data: URIs (e.g. base64-encoded SVG placeholders) are never candidates.
    scored = [_evaluate(candidate) for candidate in images if not candidate.startswith("data:")]
    return sorted(scored, key=lambda item: item["score"], reverse=True)
+
+
def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
    """Pick up to ``max_keep`` usable images and the primary one.

    Returns ``(kept_urls, primary_url, ranked)``, where ``ranked`` is the full
    scored list from ``_rank_image_candidates``.
    """
    # Drop empty values and duplicates while keeping first-seen order.
    unique_images = list(dict.fromkeys(image for image in images if image))

    ranked = _rank_image_candidates(source_url, title, unique_images)
    candidates = [item["url"] for item in ranked if item["score"] > -100]

    # Probe at most the four best candidates and keep those the probe accepts.
    kept: list[str] = []
    for candidate in candidates[:4]:
        if not _probe_image_url(candidate):
            continue
        kept.append(candidate)
        if len(kept) >= max_keep:
            break
    primary = kept[0] if kept else None

    # Fallback: when the probe rejected every probed candidate, use the
    # best-ranked candidates anyway rather than ending up with no image.
    if not kept and candidates:
        primary = candidates[0]
        kept = candidates[:max_keep]

    return kept, primary, ranked
+
+
+def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str:
+ meta: dict[str, Any] = {}
+ if existing_meta_json:
+ try:
+ parsed = json.loads(existing_meta_json)
+ if isinstance(parsed, dict):
+ meta = parsed
+ except Exception:
+ meta = {}
+ meta["attribution"] = attribution
+ meta["extraction"] = extraction_meta
+ return json.dumps(meta, ensure_ascii=False)
+
+
def _fetch_feed_with_retries(feed: dict[str, Any]) -> tuple[Any, str | None]:
    """Parse one feed, retrying when ``feedparser.parse`` raises.

    The stored ETag / Last-Modified values are passed along so the server can
    answer with "not modified".

    Returns:
        ``(parsed, last_error)``. ``parsed`` is ``None`` when every one of the
        ``MAX_FEED_FETCH_RETRIES`` attempts raised; ``last_error`` is the text
        of the most recent exception, if any.
    """
    last_error: str | None = None
    for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
        try:
            parsed = feedparser.parse(
                feed["url"],
                etag=feed.get("etag"),
                modified=feed.get("last_modified"),
            )
        except Exception as exc:
            last_error = str(exc)
            if attempt < MAX_FEED_FETCH_RETRIES:
                time.sleep(0.5 * attempt)  # linear back-off between attempts
        else:
            return parsed, last_error
    return None, last_error


def _entry_exceeds_max_age(entry: dict, max_age_days: int) -> bool:
    """Return True when the entry is dated and older than ``max_age_days``.

    ``max_age_days <= 0`` disables the filter. Entries without a usable
    publication date are never treated as too old.
    """
    if max_age_days <= 0:
        return False
    try:
        published_iso = _entry_published_iso(entry)
        if not published_iso:
            return False
        age = datetime.now(timezone.utc) - datetime.fromisoformat(published_iso)
    except (TypeError, ValueError, OverflowError):
        return False  # can't parse date: allow through
    return age > timedelta(days=max_age_days)


def _with_preserved_editorial_state(
    base_payload: ArticleUpsert,
    existing: dict[str, Any],
    attribution: dict[str, Any],
    extraction_meta: dict[str, Any],
) -> ArticleUpsert:
    """Combine freshly ingested source data with the state of an existing article.

    Source-derived fields come from ``base_payload``; rewritten content, legal
    review, WordPress publishing state and status are carried over from
    ``existing`` so a re-ingestion does not reset editorial work.
    """
    return ArticleUpsert(
        feed_id=base_payload.feed_id,
        source_article_id=base_payload.source_article_id,
        source_hash=base_payload.source_hash,
        title=base_payload.title,
        source_url=base_payload.source_url,
        canonical_url=base_payload.canonical_url,
        published_at=base_payload.published_at,
        author=base_payload.author,
        summary=base_payload.summary,
        content_raw=base_payload.content_raw,
        content_rewritten=existing.get("content_rewritten"),
        image_urls_json=base_payload.image_urls_json,
        press_contact=base_payload.press_contact or existing.get("press_contact"),
        source_name_snapshot=base_payload.source_name_snapshot,
        source_terms_url_snapshot=base_payload.source_terms_url_snapshot,
        source_license_name_snapshot=base_payload.source_license_name_snapshot,
        legal_checked=bool(int(existing.get("legal_checked", 0))),
        legal_checked_at=existing.get("legal_checked_at"),
        legal_note=existing.get("legal_note"),
        wp_post_id=existing.get("wp_post_id"),
        wp_post_url=existing.get("wp_post_url"),
        publish_attempts=int(existing.get("publish_attempts", 0)),
        publish_last_error=existing.get("publish_last_error"),
        published_to_wp_at=existing.get("published_to_wp_at"),
        word_count=base_payload.word_count,
        status=existing.get("status") or "new",
        meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta),
    )


def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch enabled feeds, extract their articles and upsert them.

    A run record is created first and finished with either ``success`` (with
    per-feed results as JSON details) or ``failed`` (with the exception text).
    Exceptions are not propagated; they are reported through the returned
    stats.

    Args:
        feed_id: Restrict the run to this feed. A feed that does not exist or
            is disabled results in a successful run with zero feeds. ``None``
            processes every enabled feed.

    Returns:
        Counters and final status of the run.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    feed_results: list[dict[str, object]] = []

    try:
        # Imported here, as in the original code, rather than at module level.
        from .config import get_settings as _get_settings

        # Read once per run; the value does not change between feeds.
        max_age_days = _get_settings().pipeline_max_article_age_days

        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1
            current_feed_id = int(feed["id"])

            parsed, feed_error = _fetch_feed_with_retries(feed)
            if parsed is None:
                feed_results.append(
                    {
                        "feed_id": current_feed_id,
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            etag = parsed_etag if isinstance(parsed_etag, str) else None
            last_modified = parsed_modified if isinstance(parsed_modified, str) else None
            if _parsed_get(parsed, "status") == 304:
                # "Not modified" confirms the stored validators are still
                # current; keep them when the response did not repeat them,
                # otherwise the next fetch would be unconditional again.
                etag = etag or feed.get("etag")
                last_modified = last_modified or feed.get("last_modified")
            update_feed_fetch_state(
                feed_id=current_feed_id,
                etag=etag,
                last_modified=last_modified,
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                raw_link = entry.get("link")
                if not raw_link:
                    continue

                # Age filter: skip articles older than max_age_days (0 = no limit)
                if _entry_exceeds_max_age(entry, max_age_days):
                    continue

                # Resolve Google redirect URLs (google.com/url?...&url=<target>&...),
                # then strip AMP parameters (e.g. ?outputType=valid_amp).
                link = _normalize_article_url(_resolve_google_redirect(raw_link))

                summary, content_raw = _entry_text(entry)
                # Strip HTML tags from the title (Google Alerts wraps matched
                # keywords in markup such as <b>...</b>).
                raw_title = entry.get("title") or "Ohne Titel"
                title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel"
                extracted = extract_article(link)

                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                # Fall back to the resolved link, not the raw feed link, so a
                # Google tracking redirect is never stored as canonical URL.
                final_canonical = extracted.canonical_url or link
                selected_images, primary_image, ranked_images = _select_relevant_images(
                    link,
                    final_title,
                    extracted.images,
                    max_keep=3,
                )

                source_hash = _entry_hash(
                    entry,
                    current_feed_id,
                    link,
                    final_title,
                    final_summary or "",
                )
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": current_feed_id,
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                extraction_meta["image_selection"] = {
                    "primary": primary_image,
                    "selected_count": len(selected_images),
                    "total_candidates": len(extracted.images),
                    "ranked": ranked_images,
                }
                base_payload = ArticleUpsert(
                    feed_id=current_feed_id,
                    source_article_id=entry.get("id") or entry.get("guid"),
                    source_hash=source_hash,
                    title=final_title,
                    source_url=link,
                    canonical_url=final_canonical,
                    published_at=_entry_published_iso(entry),
                    author=final_author,
                    summary=final_summary,
                    content_raw=final_content_raw,
                    content_rewritten=None,
                    image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
                    press_contact=extracted.press_contact,
                    source_name_snapshot=feed.get("source_name"),
                    source_terms_url_snapshot=feed.get("source_terms_url"),
                    source_license_name_snapshot=feed.get("source_license_name"),
                    legal_checked=False,
                    legal_checked_at=None,
                    legal_note=None,
                    wp_post_id=None,
                    wp_post_url=None,
                    publish_attempts=0,
                    publish_last_error=None,
                    published_to_wp_at=None,
                    word_count=len((final_content_raw or "").split()),
                    status="new",
                    meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
                )
                existing = find_existing_article_for_upsert(base_payload)
                if existing and existing.get("status") == "error":
                    # Explicitly closed article: ignore on subsequent ingestion runs.
                    continue

                payload = base_payload
                if existing:
                    payload = _with_preserved_editorial_state(
                        base_payload, existing, attribution, extraction_meta
                    )

                article_id = upsert_article(payload)
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": current_feed_id,
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Top-level boundary: record the failure on the run and report it to
        # the caller instead of raising.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
diff --git a/backend/app/main.py b/backend/app/main.py
new file mode 100644
index 0000000..b4776af
--- /dev/null
+++ b/backend/app/main.py
@@ -0,0 +1,727 @@
+import asyncio
+from contextlib import asynccontextmanager
+import csv
+from datetime import datetime, timezone
+import io
+import json
+import logging
+from pathlib import Path
+
+from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+from fastapi.staticfiles import StaticFiles
+
+from .admin_ui import router as admin_router
+from .auth import create_session_token, verify_credentials, verify_session_token
+from .config import get_settings
+from .db import init_db
+from .ingestion import run_ingestion
+from .pipeline import run_auto_pipeline
+from .policy import evaluate_source_policy, is_source_allowed
+from .publisher import enqueue_publish, run_publisher
+from .relevance import article_age_days, article_relevance
+from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
+from .telegram_bot import handle_update, setup_webhook
+from .repositories import (
+ ArticleUpsert,
+ FeedCreate,
+ RunCreate,
+ SourceCreate,
+ create_feed as repo_create_feed,
+ create_run,
+ create_source as repo_create_source,
+ finish_run,
+ get_article_by_id,
+ get_feed_by_id,
+ get_run_by_id,
+ get_source_by_id,
+ list_publish_jobs,
+ list_articles as repo_list_articles,
+ list_feeds as repo_list_feeds,
+ list_runs,
+ list_sources as repo_list_sources,
+ set_article_legal_review,
+ update_article_status,
+ upsert_article as repo_upsert_article,
+)
+from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
+
# Module-level settings object, resolved once at import time and shared by the
# app construction and the request handlers below.
settings = get_settings()
+
+
@asynccontextmanager
async def app_lifespan(_: FastAPI):
    """Lifespan hook passed to ``FastAPI``: runs ``init_db`` before yielding.

    Nothing is done after the ``yield``, i.e. there is no shutdown work.
    """
    init_db()
    yield
+
+
# Application instance; app_lifespan (which calls init_db) is registered as its lifespan hook.
app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
app.include_router(admin_router)
# Serve the "static" directory that sits next to this module's package
# directory (two levels above this file) under /admin/static.
app.mount(
    "/admin/static",
    StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
    name="admin-static",
)
+
+
class LoginRequest(BaseModel):
    """Request body of ``POST /auth/login``."""

    username: str
    password: str
+
+
class SourceCreateRequest(BaseModel):
    """Request body of ``POST /api/sources``; fields are passed one-to-one to ``SourceCreate``."""

    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    terms_url: str | None = None
    license_name: str | None = None
    # Same three values as the risk_level CHECK constraint on the sources table.
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # New sources are disabled unless the client enables them explicitly.
    is_enabled: bool = False
    notes: str | None = None
    last_reviewed_at: str | None = None
+
+
class FeedCreateRequest(BaseModel):
    """Payload describing a feed to create (presumably for a feed-creation endpoint; confirm against its handler)."""

    name: str = Field(min_length=1, max_length=200)
    url: str = Field(min_length=5, max_length=1000)
    source_id: int | None = None  # optional link to a source
    is_enabled: bool = True
+
+
class RunCreateRequest(BaseModel):
    """Payload describing a run record to create."""

    run_type: str = Field(min_length=2, max_length=100)
    # Same four values as the status CHECK constraint on the runs table.
    status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
    details: str | None = None
+
+
class RunFinishRequest(BaseModel):
    """Payload for finishing a run; only the terminal statuses are accepted."""

    status: str = Field(pattern="^(success|failed)$")
    details: str | None = None
+
+
class ArticleUpsertRequest(BaseModel):
    """Payload describing an article to insert or update.

    The fields mirror the columns of the ``articles`` table, except for the
    columns the database fills itself and the later-added ``relevance_score``
    and ``scheduled_publish_at``.
    """

    feed_id: int | None = None
    source_article_id: str | None = None
    source_hash: str | None = None
    title: str = Field(min_length=1, max_length=500)
    source_url: str = Field(min_length=5, max_length=2000)
    canonical_url: str | None = None
    published_at: str | None = None
    author: str | None = None
    summary: str | None = None
    content_raw: str | None = None
    content_rewritten: str | None = None
    image_urls_json: str | None = None  # JSON text, not a parsed list
    press_contact: str | None = None
    source_name_snapshot: str | None = None
    source_terms_url_snapshot: str | None = None
    source_license_name_snapshot: str | None = None
    legal_checked: bool = False
    legal_checked_at: str | None = None
    legal_note: str | None = None
    wp_post_id: int | None = None
    wp_post_url: str | None = None
    publish_attempts: int = 0
    publish_last_error: str | None = None
    published_to_wp_at: str | None = None
    word_count: int = 0
    # NOTE(review): "publish" and "close" are accepted here but are not part of
    # the status CHECK constraint on the articles table; presumably they are UI
    # statuses translated via ui_to_internal_status before persisting — confirm.
    status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    meta_json: str | None = None  # JSON text, not a parsed object
+
+
class IngestionRunRequest(BaseModel):
    """Payload for triggering an ingestion run."""

    # Optional feed to restrict the run to; run_ingestion treats None as "all enabled feeds".
    feed_id: int | None = None
+
+
class ArticleTransitionRequest(BaseModel):
    """Payload requesting a status change for an article."""

    # Accepts the same status names as ArticleUpsertRequest.status.
    target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    note: str | None = None
+
+
class ArticleReviewRequest(BaseModel):
    """Payload carrying a review decision for an article."""

    decision: str = Field(pattern="^(approve|reject)$")
    note: str | None = None
+
+
class ArticleLegalReviewRequest(BaseModel):
    """Payload carrying the outcome of a legal review for an article."""

    approved: bool
    note: str | None = None
+
+
class PublisherEnqueueRequest(BaseModel):
    """Payload for queueing an article for publishing."""

    article_id: int
    # Same default as publish_jobs.max_attempts in the schema.
    # NOTE(review): no lower bound is enforced, so 0 or negative values pass validation.
    max_attempts: int = 3
+
+
class PublisherRunRequest(BaseModel):
    """Payload for triggering a publisher run."""

    # NOTE(review): unbounded; presumably the number of queued jobs handled per run — confirm.
    max_jobs: int = 10
+
+
# Maps an article status to the set of statuses listed as valid next steps.
# NOTE(review): "review" and "no_image" are valid values of articles.status but
# have no entry here and are not a target of any entry — confirm this is intended.
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
    "new": {"rewrite", "error"},
    "rewrite": {"approved", "error"},
    "approved": {"published", "error"},
    "published": {"error"},
    "error": {"rewrite"},
}
+
+
def require_auth(request: Request) -> str:
    """FastAPI dependency returning the username of the current session.

    Raises:
        HTTPException: 401 when the session cookie is missing, or when
            ``verify_session_token`` does not yield a username for it.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    if session_token:
        username = verify_session_token(session_token)
        if username:
            return username
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")
+
+
@app.get("/health")
def health() -> dict:
    """Unauthenticated liveness endpoint reporting service name and database path."""
    # NOTE(review): db_path is exposed without authentication — confirm that is acceptable.
    report = {"status": "ok"}
    report["service"] = settings.app_name
    report["db_path"] = settings.app_db_path
    return report
+
+
@app.post("/auth/login")
def login(payload: LoginRequest, response: Response) -> dict:
    """Check the credentials and set the session cookie on success.

    Raises:
        HTTPException: 401 when ``verify_credentials`` rejects the credentials.
    """
    credentials_ok = verify_credentials(payload.username, payload.password)
    if not credentials_ok:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")

    session_token = create_session_token(payload.username)
    # NOTE(review): secure=False lets the cookie travel over plain HTTP —
    # confirm whether that is intended for the HTTPS deployment.
    cookie_options = {
        "max_age": settings.session_max_age_seconds,
        "httponly": True,
        "secure": False,
        "samesite": "lax",
    }
    response.set_cookie(key=settings.session_cookie_name, value=session_token, **cookie_options)
    return {"ok": True, "username": payload.username}
+
+
@app.post("/auth/logout")
def logout(response: Response) -> dict:
    """Delete the session cookie; does not require an active session."""
    cookie_name = settings.session_cookie_name
    response.delete_cookie(cookie_name)
    return dict(ok=True)
+
+
@app.get("/auth/me")
def me(username: str = Depends(require_auth)) -> dict:
    """Return the username of the authenticated session."""
    return dict(authenticated=True, username=username)
+
+
+@app.get("/api/protected")
+def protected(username: str = Depends(require_auth)) -> dict:
+    """Sample endpoint that only succeeds for an authenticated session."""
+    return dict(ok=True, message="Protected endpoint", username=username)
+
+
+@app.get("/api/pipeline/status")
+def pipeline_status(username: str = Depends(require_auth)) -> dict:
+    """Report how many sources, feeds and articles the repositories return."""
+    feed_rows = repo_list_feeds()
+    source_rows = repo_list_sources()
+    # NOTE(review): the article list is requested with limit=500, so the
+    # reported article count can never exceed 500.
+    article_rows = repo_list_articles(limit=500)
+    counts = {
+        "sources": len(source_rows),
+        "feeds": len(feed_rows),
+        "articles": len(article_rows),
+    }
+    return {"ok": True, "stage": "skeleton+db", "requested_by": username, "counts": counts}
+
+
+@app.get("/api/sources")
+def list_sources(username: str = Depends(require_auth)) -> dict:
+    """Return everything ``repo_list_sources`` yields."""
+    source_rows = repo_list_sources()
+    return {"ok": True, "items": source_rows, "requested_by": username}
+
+
+@app.get("/api/sources/{source_id}/policy-check")
+def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
+    """Evaluate the source policy for one source.
+
+    Raises:
+        HTTPException: 404 when ``get_source_by_id`` finds nothing.
+    """
+    source = get_source_by_id(source_id)
+    if source:
+        policy_issues = evaluate_source_policy(source)
+        allowed = is_source_allowed(source)
+        return {
+            "ok": True,
+            "source_id": source_id,
+            "allowed": allowed,
+            "issues": policy_issues,
+            "requested_by": username,
+        }
+    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")
+
+
+@app.post("/api/sources")
+def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
+    """Create a source from the request body and return its new id."""
+    copied_fields = (
+        "name",
+        "base_url",
+        "terms_url",
+        "license_name",
+        "risk_level",
+        "is_enabled",
+        "notes",
+        "last_reviewed_at",
+    )
+    # Copy exactly these attributes from the request model onto SourceCreate.
+    new_source = SourceCreate(**{field: getattr(payload, field) for field in copied_fields})
+    created_id = repo_create_source(new_source)
+    return {"ok": True, "id": created_id, "requested_by": username}
+
+
+@app.get("/api/feeds")
+def list_feeds(username: str = Depends(require_auth)) -> dict:
+    """Return everything ``repo_list_feeds`` yields."""
+    feed_rows = repo_list_feeds()
+    return {"ok": True, "items": feed_rows, "requested_by": username}
+
+
+@app.get("/api/feeds/{feed_id}/policy-check")
+def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
+    """Evaluate the source policy using the source columns stored on a feed row.
+
+    Raises:
+        HTTPException: 404 when ``get_feed_by_id`` finds nothing.
+    """
+    feed = get_feed_by_id(feed_id)
+    if not feed:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")
+
+    # Each snapshot key K is read from the feed row's "source_K" entry.
+    snapshot_keys = (
+        "id",
+        "name",
+        "base_url",
+        "terms_url",
+        "license_name",
+        "risk_level",
+        "last_reviewed_at",
+        "is_enabled",
+    )
+    source_snapshot = {key: feed.get(f"source_{key}") for key in snapshot_keys}
+    policy_issues = evaluate_source_policy(source_snapshot)
+    is_allowed = len(policy_issues) == 0
+    return {
+        "ok": True,
+        "feed_id": feed_id,
+        "allowed": is_allowed,
+        "issues": policy_issues,
+        "requested_by": username,
+    }
+
+
+@app.post("/api/feeds")
+def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
+    """Create a feed from the request body and return its new id.
+
+    Raises:
+        HTTPException: 400 when building or storing the feed raises; the
+            original exception text is embedded in ``detail``.
+    """
+    try:
+        feed_spec = FeedCreate(
+            name=payload.name,
+            url=payload.url,
+            source_id=payload.source_id,
+            is_enabled=payload.is_enabled,
+        )
+        created_id = repo_create_feed(feed_spec)
+    except Exception as exc:
+        # NOTE(review): the raw exception text is returned to the client.
+        message = f"Feed konnte nicht angelegt werden: {exc}"
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) from exc
+    else:
+        return {"ok": True, "id": created_id, "requested_by": username}
+
+
+@app.get("/api/runs")
+def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
+    """List runs, passing ``limit`` (default 50) straight to ``list_runs``."""
+    run_rows = list_runs(limit=limit)
+    return {"ok": True, "items": run_rows, "requested_by": username}
+
+
+@app.get("/api/runs/{run_id}")
+def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
+    """Return a single run.
+
+    Raises:
+        HTTPException: 404 when ``get_run_by_id`` finds nothing.
+    """
+    found = get_run_by_id(run_id)
+    if found:
+        return {"ok": True, "item": found, "requested_by": username}
+    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
+
+
+@app.post("/api/runs")
+def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
+    """Create a run record from the request body and return its id."""
+    new_run = RunCreate(
+        run_type=payload.run_type,
+        status=payload.status,
+        details=payload.details,
+    )
+    created_id = create_run(new_run)
+    return {"ok": True, "id": created_id, "requested_by": username}
+
+
+@app.post("/api/runs/{run_id}/finish")
+def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
+    """Finish a run with the status and details taken from the request body."""
+    # NOTE(review): the result of finish_run is not inspected, so unless it
+    # raises, an unknown run_id still produces ok=True here.
+    finish_run(run_id=run_id, status=payload.status, details=payload.details)
+    reply = {"ok": True, "id": run_id, "requested_by": username}
+    return reply
+
+
+@app.get("/api/articles")
+def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
+    """List articles and annotate each with its UI status under ``status_ui``.
+
+    A non-empty ``status_filter`` is translated with ``ui_to_internal_status``
+    before it is handed to the repository.
+    """
+    internal_filter = None
+    if status_filter:
+        internal_filter = ui_to_internal_status(status_filter)
+    article_rows = repo_list_articles(limit=limit, status_filter=internal_filter)
+    for row in article_rows:
+        row["status_ui"] = internal_to_ui_status(row.get("status"))
+    return {"ok": True, "items": article_rows, "requested_by": username}
+
+
+@app.get("/api/articles/export")
+def api_export_articles(
+    format: str = "json",
+    status_filter: str | None = None,
+    username: str = Depends(require_auth),
+):
+    """Export up to 500 articles as a JSON document or a CSV download.
+
+    ``format="csv"`` yields a ``text/csv`` attachment named
+    ``articles_export.csv``; any other value yields the JSON envelope. A
+    non-empty ``status_filter`` is translated with ``ui_to_internal_status``
+    before querying.
+    """
+    # Single source of truth for both the row keys and the CSV header order,
+    # so the two cannot drift apart.
+    fieldnames = [
+        "id",
+        "title",
+        "status",
+        "published_at",
+        "days_old",
+        "relevance",
+        "author",
+        "source_url",
+        "canonical_url",
+        "source_name_snapshot",
+        "source_license_name_snapshot",
+        "source_terms_url_snapshot",
+        "press_contact",
+        "image_urls_json",
+        "selected_image_url",
+        "legal_checked",
+        "legal_checked_at",
+        "legal_note",
+    ]
+
+    internal_filter = ui_to_internal_status(status_filter) if status_filter else None
+    rows = []
+    for article in repo_list_articles(limit=500, status_filter=internal_filter):
+        # meta_json is optional; anything that is not a JSON object is
+        # treated as "no metadata".
+        meta: dict = {}
+        if article.get("meta_json"):
+            try:
+                parsed = json.loads(article["meta_json"])
+            except (TypeError, ValueError):
+                parsed = None
+            if isinstance(parsed, dict):
+                meta = parsed
+        image_review = meta.get("image_review")
+        selected_url = image_review.get("selected_url") if isinstance(image_review, dict) else None
+
+        published_at = article.get("published_at")
+        derived = {
+            "days_old": article_age_days(published_at),
+            "relevance": article_relevance(published_at),
+            "selected_image_url": selected_url if isinstance(selected_url, str) else None,
+            # The key may be present with value None; int(None) would raise
+            # TypeError, so fall back to 0 first.
+            "legal_checked": bool(int(article.get("legal_checked") or 0)),
+        }
+        # Every non-derived column is copied verbatim from the article row.
+        rows.append({name: derived[name] if name in derived else article.get(name) for name in fieldnames})
+
+    if format == "csv":
+        # NOTE(review): cell values are written as-is; if titles/notes can
+        # start with "=", "+", "-" or "@", spreadsheet formula injection
+        # should be considered before this file is opened in Excel.
+        out = io.StringIO()
+        writer = csv.DictWriter(out, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+        return Response(
+            content=out.getvalue(),
+            media_type="text/csv; charset=utf-8",
+            headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'},
+        )
+
+    return JSONResponse(
+        {
+            "ok": True,
+            "count": len(rows),
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "status_filter": status_filter,
+            "items": rows,
+            "requested_by": username,
+        }
+    )
+
+
+@app.get("/api/articles/{article_id}")
+def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
+    """Return one article, annotated with its UI status under ``status_ui``.
+
+    Raises:
+        HTTPException: 404 when ``get_article_by_id`` finds nothing.
+    """
+    found = get_article_by_id(article_id)
+    if found:
+        found["status_ui"] = internal_to_ui_status(found.get("status"))
+        return {"ok": True, "item": found, "requested_by": username}
+    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+
+
+@app.post("/api/articles/upsert")
+def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
+    """Upsert an article from the request body and return its id.
+
+    All fields are copied unchanged except ``status``, which is translated
+    from its UI name with ``ui_to_internal_status``.
+    """
+    passthrough_fields = (
+        "feed_id",
+        "source_article_id",
+        "source_hash",
+        "title",
+        "source_url",
+        "canonical_url",
+        "published_at",
+        "author",
+        "summary",
+        "content_raw",
+        "content_rewritten",
+        "image_urls_json",
+        "press_contact",
+        "source_name_snapshot",
+        "source_terms_url_snapshot",
+        "source_license_name_snapshot",
+        "legal_checked",
+        "legal_checked_at",
+        "legal_note",
+        "wp_post_id",
+        "wp_post_url",
+        "publish_attempts",
+        "publish_last_error",
+        "published_to_wp_at",
+        "word_count",
+        "meta_json",
+    )
+    values = {field: getattr(payload, field) for field in passthrough_fields}
+    values["status"] = ui_to_internal_status(payload.status)
+    stored_id = repo_upsert_article(ArticleUpsert(**values))
+    return {"ok": True, "id": stored_id, "requested_by": username}
+
+
+@app.post("/api/articles/{article_id}/transition")
+def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
+    """Move an article to another status if the UI transition table allows it.
+
+    Both the current and the requested status are normalised to their UI
+    names before the lookup in ``ALLOWED_UI_TRANSITIONS``.
+
+    Raises:
+        HTTPException: 404 when the article is missing or the update reports
+            no change; 400 when the transition is not allowed.
+    """
+    article = get_article_by_id(article_id)
+    if not article:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+
+    from_ui = internal_to_ui_status(article.get("status"))
+    to_internal = ui_to_internal_status(payload.target_status)
+    to_ui = internal_to_ui_status(to_internal)
+
+    # NOTE(review): ALLOWED_UI_TRANSITIONS is not defined in the visible part
+    # of this file — presumably declared or imported further up.
+    permitted = ALLOWED_UI_TRANSITIONS.get(from_ui, set())
+    if not (to_ui in permitted):
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Ungueltiger Statuswechsel: {from_ui} -> {to_ui}",
+        )
+
+    if update_article_status(article_id, to_internal, actor=username, note=payload.note):
+        return {"ok": True, "id": article_id, "from_status": from_ui, "to_status": to_ui}
+    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+
+
+@app.post("/api/articles/{article_id}/rewrite-run")
+def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict:
+    """Rewrite an article's text, generate tags and store it as ``approved``.
+
+    Only articles whose UI status is ``new`` or ``rewrite`` are accepted.
+    Tag generation is best effort: on failure the article is saved with an
+    empty tag list and the error is logged.
+
+    Raises:
+        HTTPException: 404 when the article does not exist; 400 when its
+            status does not permit a rewrite.
+    """
+    import logging
+
+    article = get_article_by_id(article_id)
+    if not article:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+    if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
+
+    rewritten = rewrite_article_text(article)
+    tags: list[str] = []
+    try:
+        tags = generate_article_tags(article, rewritten_text=rewritten)
+    except Exception as exc:
+        # Best effort, but leave a trace instead of failing silently.
+        logging.getLogger(__name__).warning(
+            "Tag-Generierung für Artikel #%s fehlgeschlagen: %s", article_id, exc
+        )
+        tags = []
+    merged_meta = merge_generated_tags(article.get("meta_json"), tags)
+
+    # Columns carried over unchanged from the stored article.
+    carried_fields = (
+        "feed_id",
+        "source_article_id",
+        "source_hash",
+        "title",
+        "source_url",
+        "canonical_url",
+        "published_at",
+        "author",
+        "summary",
+        "content_raw",
+        "image_urls_json",
+        "press_contact",
+        "source_name_snapshot",
+        "source_terms_url_snapshot",
+        "source_license_name_snapshot",
+        "legal_checked_at",
+        "legal_note",
+        "wp_post_id",
+        "wp_post_url",
+        "publish_last_error",
+        "published_to_wp_at",
+    )
+    values = {field: article.get(field) for field in carried_fields}
+    values.update(
+        content_rewritten=rewritten,
+        # These keys can be present with value None; ``or 0`` keeps int()
+        # from raising TypeError in that case.
+        legal_checked=bool(int(article.get("legal_checked") or 0)),
+        publish_attempts=int(article.get("publish_attempts") or 0),
+        word_count=len(rewritten.split()),
+        status="approved",
+        meta_json=merged_meta,
+    )
+    repo_upsert_article(ArticleUpsert(**values))
+    # NOTE(review): the stored status is "approved" while the response says
+    # "publish" — presumably the UI name of that status; confirm.
+    return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
+
+
+@app.post("/api/articles/{article_id}/legal-review")
+def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict:
+    """Record a legal-review decision for an article.
+
+    Raises:
+        HTTPException: 404 when the article is missing or when
+            ``set_article_legal_review`` reports no update.
+    """
+    not_found = HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+    if not get_article_by_id(article_id):
+        raise not_found
+
+    was_updated = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username)
+    if not was_updated:
+        raise not_found
+    return {"ok": True, "id": article_id, "legal_checked": payload.approved}
+
+
+@app.get("/api/publisher/jobs")
+def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict:
+    """List publish jobs, passing ``limit`` straight to ``list_publish_jobs``."""
+    job_rows = list_publish_jobs(limit=limit)
+    return {"ok": True, "items": job_rows, "requested_by": username}
+
+
+@app.post("/api/publisher/enqueue")
+def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict:
+    """Queue an existing article for publishing and return the job id.
+
+    Raises:
+        HTTPException: 404 when the article does not exist.
+    """
+    target_id = payload.article_id
+    if get_article_by_id(target_id):
+        queued_job = enqueue_publish(article_id=target_id, max_attempts=payload.max_attempts)
+        return {"ok": True, "job_id": queued_job, "article_id": target_id, "requested_by": username}
+    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
+
+
+@app.post("/api/publisher/run")
+def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict:
+    """Run the publisher for at most ``max_jobs`` jobs and report its counters."""
+    outcome = run_publisher(max_jobs=payload.max_jobs)
+    counter_names = ("processed", "success", "failed", "requeued")
+    counters = {name: getattr(outcome, name) for name in counter_names}
+    return {"ok": True, "requested_by": username, "stats": counters}
+
+
+@app.post("/api/articles/{article_id}/review")
+def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
+    """Retired endpoint: once auth and body validation pass, always answers 410 Gone."""
+    gone = HTTPException(status_code=status.HTTP_410_GONE, detail="Review-Endpoint ersetzt durch Rewrite-Workflow")
+    raise gone
+
+
+@app.post("/api/ingestion/run")
+def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
+    """Run ingestion for the feed given in the body and report the outcome.
+
+    ``ok`` is true only when the reported status equals ``"success"``.
+    """
+    outcome = run_ingestion(feed_id=payload.feed_id)
+    counters = {
+        name: getattr(outcome, name)
+        for name in ("feeds_processed", "entries_seen", "articles_upserted")
+    }
+    succeeded = outcome.status == "success"
+    return {
+        "ok": succeeded,
+        "run_id": outcome.run_id,
+        "status": outcome.status,
+        "message": outcome.message,
+        "stats": counters,
+        "requested_by": username,
+    }
+
+
+# ---------------------------------------------------------------------------
+# N8N Automation endpoint (API-Key auth, no session cookie required)
+# ---------------------------------------------------------------------------
+
+def _require_api_key(request: Request) -> None:
+    """Reject the request unless it carries the configured N8N API key.
+
+    The key is read from the ``X-API-Key`` header or, failing that, from the
+    ``api_key`` query parameter, and compared in constant time.
+
+    Raises:
+        HTTPException: 501 when ``settings.n8n_api_key`` is not configured;
+            401 when the supplied key is missing or does not match.
+    """
+    import hmac
+
+    expected = settings.n8n_api_key
+    if not expected:
+        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert")
+
+    # NOTE(review): keys passed via the query string tend to end up in access
+    # logs; the header variant is preferable.
+    supplied = request.headers.get("X-API-Key") or request.query_params.get("api_key") or ""
+    # compare_digest avoids leaking the position of the first mismatch via
+    # timing; comparing bytes also permits non-ASCII keys.
+    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
+        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key")
+
+
+_pipeline_lock = asyncio.Lock()
+# Strong references to in-flight pipeline tasks. The asyncio documentation
+# warns that the event loop keeps only weak references to tasks, so a task
+# nobody references may be garbage-collected before it finishes.
+_pipeline_tasks: set = set()
+
+
+@app.post("/api/n8n/pipeline")
+async def api_n8n_pipeline(request: Request) -> dict:
+    """Start the full auto pipeline in the background and return at once.
+
+    Requires a valid API key (see ``_require_api_key``). If a pipeline run
+    already holds the lock, the trigger is ignored and ``ok`` is false.
+    The docstring of the original states that results arrive via Telegram.
+    """
+    _require_api_key(request)
+    log = logging.getLogger(__name__)
+
+    if _pipeline_lock.locked():
+        log.warning("Pipeline bereits aktiv – Trigger ignoriert")
+        return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"}
+
+    # Take the lock here, before the task is scheduled, so a second trigger
+    # arriving before the task starts already sees it as locked. The lock
+    # was just observed free, so this is not expected to wait.
+    await _pipeline_lock.acquire()
+
+    async def _run() -> None:
+        try:
+            loop = asyncio.get_running_loop()
+            # run_auto_pipeline is blocking; keep it off the event loop.
+            await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n"))
+        except Exception as exc:
+            # .exception() keeps the traceback that .error() would drop.
+            log.exception("Background pipeline error: %s", exc)
+        finally:
+            _pipeline_lock.release()
+
+    task = asyncio.create_task(_run())
+    _pipeline_tasks.add(task)
+    task.add_done_callback(_pipeline_tasks.discard)
+    return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"}
+
+
+@app.post("/api/n8n/ingest")
+def api_n8n_ingest(request: Request) -> dict:
+    """API-key protected trigger that runs ingestion only and reports its counters."""
+    _require_api_key(request)
+    outcome = run_ingestion()
+    counters = {
+        name: getattr(outcome, name)
+        for name in ("feeds_processed", "entries_seen", "articles_upserted")
+    }
+    return {"ok": outcome.status == "success", "stats": counters}
+
+
+# ---------------------------------------------------------------------------
+# Telegram Webhook
+# ---------------------------------------------------------------------------
+
+# Strong references to in-flight Telegram update handlers; without them a
+# fire-and-forget task may be garbage-collected before it completes.
+_telegram_tasks: set = set()
+
+
+@app.post("/telegram/webhook")
+async def telegram_webhook(request: Request) -> dict:
+    """Receive an update from the Telegram Bot API.
+
+    A well-formed, authorised update is acknowledged immediately and handed
+    to ``handle_update`` in a background task, so handler failures are only
+    logged and never reach the HTTP response.
+
+    Raises:
+        HTTPException: 403 when a webhook secret is configured and the
+            ``X-Telegram-Bot-Api-Secret-Token`` header does not match; 400
+            when the body is not valid UTF-8 JSON.
+    """
+    import asyncio
+    import hmac
+    import logging
+
+    # Verify secret token.
+    # NOTE(review): when no secret is configured the check is skipped and
+    # the endpoint accepts updates from anyone — confirm that is intended.
+    secret = settings.telegram_webhook_secret
+    if secret:
+        incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "")
+        # Constant-time comparison, so timing does not reveal the secret.
+        if not hmac.compare_digest(incoming.encode("utf-8"), secret.encode("utf-8")):
+            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret")
+
+    body = await request.body()
+    try:
+        update = json.loads(body.decode("utf-8"))
+    except Exception as exc:
+        # Deliberately broad: the body is untrusted input.
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") from exc
+
+    async def _process() -> None:
+        loop = asyncio.get_running_loop()
+        try:
+            # handle_update is blocking; keep it off the event loop.
+            await loop.run_in_executor(None, lambda: handle_update(update))
+        except Exception as exc:
+            logging.getLogger(__name__).exception("Telegram update handler error: %s", exc)
+
+    task = asyncio.create_task(_process())
+    _telegram_tasks.add(task)
+    task.add_done_callback(_telegram_tasks.discard)
+    return {"ok": True}
+
+
+@app.post("/api/telegram/setup-webhook")
+def api_setup_telegram_webhook(request: Request) -> dict:
+    """Register ``<base_url>/telegram/webhook`` as the bot's webhook.
+
+    Authentication is enforced by calling ``require_auth`` directly.
+    """
+    username = require_auth(request)
+    # NOTE(review): the URL is derived from request.base_url; behind a
+    # reverse proxy this may not be the public HTTPS address — confirm.
+    webhook_url = str(request.base_url).rstrip("/") + "/telegram/webhook"
+    telegram_response = setup_webhook(webhook_url)
+    return {
+        "ok": True,
+        "webhook_url": webhook_url,
+        "telegram_response": telegram_response,
+        "requested_by": username,
+    }
diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py
new file mode 100644
index 0000000..93a251b
--- /dev/null
+++ b/backend/app/pipeline.py
@@ -0,0 +1,516 @@
+"""Autonomous RSS-News pipeline.
+
+Full automated flow:
+1. Run RSS ingestion
+2. For each new article:
+ - Auto-select primary image
+ - Score relevance via GPT
+ - < warn threshold: reject (error status) → Telegram rejected summary
+ - warn..auto threshold: Telegram warning with override button
+ - >= auto threshold: rewrite → create WP draft → Telegram notification
+3. Send pipeline summary to Telegram
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+from .config import get_settings
+from .ingestion import run_ingestion
+from .publisher import enqueue_publish, run_publisher
+from .repositories import (
+ ArticleUpsert,
+ get_article_by_id,
+ list_articles,
+ set_article_image_decision,
+ update_article_status,
+ upsert_article as repo_upsert_article,
+)
+from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance
+from .scheduler import reserve_publish_slot
+from .wordpress import publish_article_draft, selected_image_exists
+
+logger = logging.getLogger(__name__)
+
+
+# Counters collected during one run_auto_pipeline() call.
+@dataclass
+class PipelineStats:
+    # Set from the ingestion result's articles_upserted.
+    ingested: int = 0
+    # Articles that reached relevance scoring (articles without an image are
+    # excluded before this is incremented).
+    processed: int = 0
+    # Articles for which _do_rewrite_and_draft returned successfully.
+    drafts_created: int = 0
+    # Articles whose score was below the warn threshold.
+    rejected: int = 0
+    # Articles for which _do_rewrite_and_draft raised ValueError.
+    quality_gate_rejected: int = 0
+    # Articles whose score fell between the warn and auto thresholds.
+    warnings: int = 0
+    # Ingestion failures plus per-article exceptions.
+    errors: int = 0
+    # Articles excluded because no image could be selected.
+    no_image: int = 0
+    # Reloaded article rows of rejected articles, used for the summary message.
+    rejected_articles: list[dict[str, Any]] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+def _auto_select_image(article: dict[str, Any]) -> bool:
+    """Ensure the article has a selected image, choosing one if necessary.
+
+    Lookup order: an already stored ``image_review.selected_url``, then
+    ``extraction.image_selection.primary`` from the metadata, then the first
+    entry of ``image_urls_json``. A newly chosen image is stored through
+    ``set_article_image_decision`` with actor ``"pipeline"``.
+
+    Returns:
+        True when an image is (now) selected; False when none is available
+        or ``meta_json`` is not a JSON object.
+    """
+    try:
+        meta = json.loads(article.get("meta_json") or "{}")
+    except (TypeError, ValueError):
+        return False
+    if not isinstance(meta, dict):
+        # Valid JSON of the wrong shape (list, number, ...) gets the same
+        # treatment as unparseable metadata.
+        return False
+
+    # Already selected?
+    image_review = meta.get("image_review") or {}
+    if isinstance(image_review, dict) and image_review.get("selected_url"):
+        return True
+
+    # Try the primary image recorded in the metadata. Each level is
+    # type-checked so malformed metadata cannot raise AttributeError.
+    extraction = meta.get("extraction")
+    image_selection = extraction.get("image_selection") if isinstance(extraction, dict) else None
+    primary = image_selection.get("primary") if isinstance(image_selection, dict) else None
+
+    if not primary:
+        # Fallback: first URL from image_urls_json.
+        try:
+            urls = json.loads(article.get("image_urls_json") or "[]")
+        except (TypeError, ValueError):
+            urls = None
+        if isinstance(urls, list) and urls:
+            primary = urls[0]
+
+    if not primary:
+        return False
+    set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline")
+    return True
+
+
+def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
+    """Persist a relevance result on the article.
+
+    The whole ``relevance`` dict is stored under ``meta_json["relevance"]``
+    and its ``score`` (0 when absent) is written to the ``relevance_score``
+    column. Does nothing when the article cannot be found.
+    """
+    from .db import get_conn
+
+    article = get_article_by_id(article_id)
+    if not article:
+        return
+    try:
+        meta = json.loads(article.get("meta_json") or "{}")
+    except (TypeError, ValueError):
+        meta = {}
+    if not isinstance(meta, dict):
+        # Item assignment below needs a JSON object; start fresh otherwise.
+        meta = {}
+    meta["relevance"] = relevance
+    new_meta = json.dumps(meta, ensure_ascii=False)
+    with get_conn() as conn:
+        conn.execute(
+            "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?",
+            (new_meta, relevance.get("score", 0), article_id),
+        )
+
+
+def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
+    """Rewrite an article, persist the result and create/update its WP draft.
+
+    Steps, in order: raw-length quality gate, rewrite, rewritten-length
+    quality gate, tag generation (best effort), upsert with status
+    ``approved``, publish-slot reservation when none is set,
+    ``publish_article_draft`` and finally ``mark_article_publish_result``.
+
+    Returns:
+        The ``(wp_post_id, wp_post_url)`` pair from ``publish_article_draft``.
+
+    Raises:
+        ValueError: A quality gate failed; the article status has already
+            been set to ``error`` at that point.
+        RuntimeError: The article could not be reloaded after saving or
+            after reserving a slot.
+    """
+    import re
+
+    from .repositories import mark_article_publish_result
+
+    article_id = int(article["id"])
+    settings = get_settings()
+
+    # ── Quality gate 1: raw content length ──────────────────────────────────
+    # Tags are replaced by spaces so markup does not count as words.
+    raw_text = re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
+    raw_words = len(raw_text.split())
+    if raw_words < settings.pipeline_min_words_raw:
+        note = (
+            f"Zu wenig Rohinhalt: {raw_words} Wörter "
+            f"(Minimum: {settings.pipeline_min_words_raw})"
+        )
+        logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
+        update_article_status(article_id, "error", actor="pipeline", note=note)
+        raise ValueError(note)
+
+    # Rewrite
+    logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
+    rewritten = rewrite_article_text(article)
+
+    # ── Quality gate 2: rewritten content length ─────────────────────────────
+    rewritten_words = len(rewritten.split())
+    if rewritten_words < settings.pipeline_min_words_rewritten:
+        note = (
+            f"Rewrite zu kurz: {rewritten_words} Wörter "
+            f"(Minimum: {settings.pipeline_min_words_rewritten})"
+        )
+        logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
+        update_article_status(article_id, "error", actor="pipeline", note=note)
+        raise ValueError(note)
+    logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, rewritten_words)
+
+    # Tags are optional: a failure must not block the draft, but it is logged.
+    tags: list[str] = []
+    try:
+        tags = generate_article_tags(article, rewritten_text=rewritten)
+    except Exception as exc:
+        logger.warning("_do_rewrite_and_draft #%d: Tag-Generierung fehlgeschlagen: %s", article_id, exc)
+    merged_meta = merge_generated_tags(article.get("meta_json"), tags)
+
+    # Save rewritten content + approved status
+    repo_upsert_article(
+        ArticleUpsert(
+            feed_id=article.get("feed_id"),
+            source_article_id=article.get("source_article_id"),
+            source_hash=article.get("source_hash"),
+            title=article.get("title", ""),
+            source_url=article.get("source_url", ""),
+            canonical_url=article.get("canonical_url"),
+            published_at=article.get("published_at"),
+            author=article.get("author"),
+            summary=article.get("summary"),
+            content_raw=article.get("content_raw"),
+            content_rewritten=rewritten,
+            image_urls_json=article.get("image_urls_json"),
+            press_contact=article.get("press_contact"),
+            source_name_snapshot=article.get("source_name_snapshot"),
+            source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
+            source_license_name_snapshot=article.get("source_license_name_snapshot"),
+            # These keys can be present with value None; ``or 0`` keeps
+            # int() from raising TypeError in that case.
+            legal_checked=bool(int(article.get("legal_checked") or 0)),
+            legal_checked_at=article.get("legal_checked_at"),
+            legal_note=article.get("legal_note"),
+            wp_post_id=article.get("wp_post_id"),
+            wp_post_url=article.get("wp_post_url"),
+            publish_attempts=int(article.get("publish_attempts") or 0),
+            publish_last_error=article.get("publish_last_error"),
+            published_to_wp_at=article.get("published_to_wp_at"),
+            word_count=rewritten_words,
+            status="approved",
+            meta_json=merged_meta,
+        )
+    )
+
+    # Reload after save to get updated meta_json
+    fresh = get_article_by_id(article_id)
+    if not fresh:
+        raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden")
+
+    # Ensure a publish slot is reserved — reserve one now if not yet set
+    if not fresh.get("scheduled_publish_at"):
+        logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id)
+        reserve_publish_slot(article_id)
+        fresh = get_article_by_id(article_id)
+        if not fresh:
+            raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden")
+
+    # Create WP draft
+    logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at"))
+    wp_post_id, wp_post_url = publish_article_draft(fresh)
+    logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id)
+
+    # Update WP info in DB
+    mark_article_publish_result(
+        article_id,
+        wp_post_id=wp_post_id,
+        wp_post_url=wp_post_url,
+        error=None,
+        increment_attempts=True,
+        set_published_status=False,
+    )
+
+    return wp_post_id, wp_post_url
+
+
+# ---------------------------------------------------------------------------
+# Public pipeline functions
+# ---------------------------------------------------------------------------
+
+def _notify_best_effort(send: Any, *args: Any) -> None:
+    """Call a Telegram notifier and log, rather than propagate, any failure."""
+    try:
+        send(*args)
+    except Exception as exc:
+        logger.warning("Telegram-Benachrichtigung fehlgeschlagen: %s", exc)
+
+
+def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
+    """Run ingestion, process all ``new`` articles and return the counters.
+
+    Failures of ingestion or of a single article are counted in ``errors``
+    and reported, but do not stop the run. Telegram notifications are best
+    effort, so a messaging outage cannot abort the pipeline.
+
+    Args:
+        trigger: Label passed to the "pipeline started" notification.
+
+    Returns:
+        A dict with the keys ``ingested``, ``processed``, ``drafts_created``,
+        ``rejected``, ``quality_gate_rejected``, ``no_image``, ``warnings``
+        and ``errors``.
+    """
+    from . import telegram_bot as tg
+
+    settings = get_settings()
+    stats = PipelineStats()
+
+    _notify_best_effort(tg.notify_pipeline_started, trigger)
+
+    # Step 1: Ingestion
+    try:
+        ingest_result = run_ingestion()
+        stats.ingested = ingest_result.articles_upserted
+    except Exception as exc:
+        logger.exception("Ingestion error: %s", exc)
+        _notify_best_effort(tg.notify_error, f"Ingestion fehlgeschlagen: {exc}")
+        stats.errors += 1
+
+    # Step 2: Process new articles (also runs when ingestion failed, so
+    # articles left over from earlier runs are still handled).
+    new_articles = list_articles(limit=100, status_filter="new")
+
+    for article in new_articles:
+        article_id = int(article["id"])
+        try:
+            _process_article(article, stats, settings)
+        except Exception as exc:
+            logger.exception("Fehler bei Artikel #%d: %s", article_id, exc)
+            # The title can be None; slicing None would raise inside this
+            # handler and abort the whole loop.
+            title = (article.get("title") or "?")[:50]
+            _notify_best_effort(tg.notify_error, f"Fehler bei Artikel #{article_id} ({title}): {exc}")
+            stats.errors += 1
+        # Rate limiting between OpenAI calls
+        time.sleep(1)
+
+    # Step 3: Send rejected summary if any
+    if stats.rejected_articles:
+        try:
+            tg.notify_rejected_summary(stats.rejected_articles)
+        except Exception as exc:
+            logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc)
+
+    # Step 4: Summary
+    result = {
+        "ingested": stats.ingested,
+        "processed": stats.processed,
+        "drafts_created": stats.drafts_created,
+        "rejected": stats.rejected,
+        "quality_gate_rejected": stats.quality_gate_rejected,
+        "no_image": stats.no_image,
+        "warnings": stats.warnings,
+        "errors": stats.errors,
+    }
+    _notify_best_effort(tg.notify_pipeline_done, result)
+    return result
+
+
+def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None:
+    """Process a single new article through the pipeline.
+
+    Flow: select an image (the article is excluded with status ``no_image``
+    when none is found), score relevance, then branch on the score:
+
+    * below ``settings.pipeline_relevance_warn``: status ``error``, counted
+      as rejected and collected for the summary;
+    * below ``settings.pipeline_relevance_auto``: status ``review`` plus a
+      Telegram warning;
+    * otherwise: reserve a publish slot, rewrite and create the WP draft.
+
+    ``stats`` is updated in place. Exceptions other than ``ValueError`` from
+    the draft step are re-raised after the article is marked ``error``.
+    """
+    from . import telegram_bot as tg
+
+    article_id = int(article["id"])
+
+    # Auto-select image
+    _auto_select_image(article)
+
+    # Reload to get updated image_review
+    article = get_article_by_id(article_id) or article
+
+    # Exclude articles without a usable image
+    try:
+        meta = json.loads(article.get("meta_json") or "{}")
+    except Exception:
+        meta = {}
+    # NOTE(review): assumes meta is a JSON object; a list or scalar here
+    # would make .get raise.
+    has_image = bool((meta.get("image_review") or {}).get("selected_url"))
+    if not has_image:
+        update_article_status(
+            article_id,
+            "no_image",
+            actor="pipeline",
+            note="Kein Bild vorhanden – Artikel ausgeschlossen",
+        )
+        stats.no_image += 1
+        logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
+        try:
+            tg.send_message(
+                f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n"
+                f"📰 {(article.get('title') or '')[:80]}"
+            )
+        except Exception:
+            # Notification is best effort; the exclusion itself is already stored.
+            pass
+        return
+
+    # Score relevance
+    try:
+        relevance = score_article_relevance(article)
+    except Exception as exc:
+        logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc)
+        # NOTE(review): a scoring failure is turned into score 0, so the
+        # article takes the reject branch below whenever the warn threshold
+        # is positive — confirm that is intended for transient errors.
+        relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []}
+
+    score = relevance.get("score", 0)
+    reason = relevance.get("reason", "")
+    _store_relevance(article_id, relevance)
+
+    stats.processed += 1
+
+    if score < settings.pipeline_relevance_warn:
+        # Reject
+        update_article_status(
+            article_id,
+            "error",
+            actor="pipeline",
+            note=f"Abgelehnt: Score {score}/100 — {reason}",
+        )
+        stats.rejected += 1
+        # Reload for summary (now has relevance in meta)
+        updated = get_article_by_id(article_id)
+        if updated:
+            stats.rejected_articles.append(updated)
+
+    elif score < settings.pipeline_relevance_auto:
+        # Warning zone: set status to "review" so repeated /run calls don't re-warn
+        update_article_status(
+            article_id,
+            "review",
+            actor="pipeline",
+            note=f"Niedrige Relevanz: Score {score}/100 — {reason}",
+        )
+        stats.warnings += 1
+        try:
+            tg.notify_relevance_warning(article, score, reason)
+        except Exception as exc:
+            logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc)
+
+    else:
+        # Auto-process: rewrite + WP draft
+        try:
+            # Reserve publish slot FIRST so it's available when WP draft is created
+            slot = reserve_publish_slot(article_id)
+
+            # Reload article to get updated image_review + scheduled_publish_at
+            fresh = get_article_by_id(article_id)
+            if not fresh:
+                # NOTE(review): returns without releasing the slot reserved above.
+                return
+            wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh)
+            stats.drafts_created += 1
+
+            # Reload for notification
+            final = get_article_by_id(article_id)
+            if final:
+                try:
+                    tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
+                except Exception as exc:
+                    logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
+
+        except ValueError as exc:
+            # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
+            # NOTE(review): any other ValueError raised inside the try body
+            # lands here too, in which case the status was not necessarily set.
+            # Release the reserved slot so it's available for the next article
+            from .scheduler import release_publish_slot
+            release_publish_slot(article_id)
+            # Clean up any stale WP draft from a previous pipeline run
+            stale = get_article_by_id(article_id)
+            if stale and stale.get("wp_post_id"):
+                try:
+                    from .wordpress import delete_wp_post
+                    delete_wp_post(int(stale["wp_post_id"]))
+                    logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
+                except Exception as del_exc:
+                    logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
+            stats.quality_gate_rejected += 1
+            logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
+            # Individual Telegram notification for quality gate rejection
+            try:
+                title = (article.get("title") or "Ohne Titel")[:80]
+                tg.send_message(
+                    f"✂️ Qualitätsprüfung nicht bestanden\n"
+                    f"📰 {title}\n"
+                    f"💯 Score: {score}/100\n"
+                    f"⚠️ {exc}"
+                )
+            except Exception as tg_exc:
+                logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc)
+
+        except Exception as exc:
+            logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
+            update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
+            # Release reserved slot so it's not permanently blocked by a failed article
+            from .scheduler import release_publish_slot
+            release_publish_slot(article_id)
+            # Re-raised so the caller can handle/count the failure.
+            raise
+
+
+# ---------------------------------------------------------------------------
+# Callback actions (called from telegram_bot._handle_callback)
+# ---------------------------------------------------------------------------
+
+def rewrite_and_update_draft(article_id: int) -> None:
+    """Rewrite article and update the existing WP draft.
+
+    The image is (re-)selected first, then the article is reloaded and
+    handed to the rewrite/draft step.
+
+    Raises:
+        RuntimeError: if the article does not exist, or can no longer be
+            loaded after the image selection step.
+    """
+    article = get_article_by_id(article_id)
+    if not article:
+        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
+    _auto_select_image(article)
+    # Reload so the rewrite works on the row as it is stored after image selection.
+    fresh = get_article_by_id(article_id)
+    if not fresh:
+        # Without this guard None would be passed on to _do_rewrite_and_draft.
+        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
+    _do_rewrite_and_draft(fresh)
+
+
+def discard_article(article_id: int) -> None:
+    """Discard a draft: delete WP post if exists, set article to error.
+
+    An unknown article id is ignored. A failed WordPress deletion is only
+    logged; the article status is changed regardless.
+    """
+    article = get_article_by_id(article_id)
+    if not article:
+        return
+
+    wp_post_id = article.get("wp_post_id")
+    if wp_post_id:
+        try:
+            from .wordpress import delete_wp_post
+            delete_wp_post(int(wp_post_id))
+        except Exception as exc:
+            # %s rather than %d: the stored id is not guaranteed to be an int
+            # (hence the int() above), and %d would break log formatting for
+            # exactly the values that make int() fail.
+            logger.warning("WP Post #%s konnte nicht gelöscht werden: %s", wp_post_id, exc)
+
+    update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen")
+
+
+def override_rejected_article(article_id: int) -> None:
+    """Force-process a previously rejected article.
+
+    Resets the article to "new", selects an image, reserves a publish slot,
+    runs the rewrite/draft step and finally announces the draft via Telegram.
+
+    Raises:
+        RuntimeError: if the article does not exist (or vanishes after the
+            publish slot was reserved).
+        Exception: anything raised by the rewrite/draft step; the reserved
+            publish slot is released before the error propagates.
+    """
+    from . import telegram_bot as tg
+
+    article = get_article_by_id(article_id)
+    if not article:
+        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
+
+    # Reset to new so processing is allowed
+    update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram")
+
+    # Reload
+    fresh = get_article_by_id(article_id)
+    if not fresh:
+        return
+
+    _auto_select_image(fresh)
+    fresh = get_article_by_id(article_id)
+    if not fresh:
+        return
+
+    # Reuse the stored relevance score; fall back to 0 when missing or malformed
+    try:
+        meta = json.loads(fresh.get("meta_json") or "{}")
+        score = int((meta.get("relevance") or {}).get("score", 0))
+    except Exception:
+        score = 0
+
+    # Reserve publish slot FIRST so it's in the DB when WP draft is created
+    slot = reserve_publish_slot(article_id)
+    try:
+        fresh = get_article_by_id(article_id)
+        if not fresh:
+            raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
+        _do_rewrite_and_draft(fresh)
+    except Exception:
+        # Release the reserved slot so a failed override does not block it
+        # permanently (same cleanup the automatic pipeline path performs).
+        from .scheduler import release_publish_slot
+        release_publish_slot(article_id)
+        raise
+
+    final = get_article_by_id(article_id)
+    if final:
+        tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
+
+
+# ---------------------------------------------------------------------------
+# Status helpers (used by /status command)
+# ---------------------------------------------------------------------------
+
+def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]:
+    """Return articles rejected in the last N days.
+
+    "Rejected" here means: status 'error' or 'review', a relevance score is
+    present in meta_json, and updated_at falls within the last ``days`` days.
+    At most 20 rows are returned, most recently updated first.
+    """
+    from .db import get_conn
+    from .db import rows_to_dicts
+    # The cutoff is computed by SQLite itself (date('now', '-N days')), so no
+    # Python-side timestamp is needed.
+    with get_conn() as conn:
+        rows = conn.execute(
+            """
+            SELECT id, title, meta_json, source_url, created_at
+            FROM articles
+            WHERE status IN ('error', 'review')
+              AND json_extract(meta_json, '$.relevance.score') IS NOT NULL
+              AND date(updated_at) >= date('now', ?)
+            ORDER BY updated_at DESC
+            LIMIT 20
+            """,
+            (f"-{days} days",),
+        ).fetchall()
+    return rows_to_dicts(rows)
+
+
+def get_pipeline_status_text() -> str:
+    """Return a text summary of current pipeline state.
+
+    The summary lists the number of articles in the statuses "new",
+    "approved", "published" and "error".
+    """
+    from .repositories import list_articles_page
+
+    def _count(status: str) -> int:
+        # list_articles_page returns (rows, total) where total is a COUNT(*),
+        # so the figure is exact instead of being capped by a row limit, and
+        # only a single row has to be loaded.
+        return list_articles_page(limit=1, status_filter=status)[1]
+
+    new_count = _count("new")
+    approved_count = _count("approved")
+    published_count = _count("published")
+    error_count = _count("error")
+
+    return (
+        f"📊 Pipeline-Status\n"
+        f"🆕 Neu / wartend: {new_count}\n"
+        f"✅ Draft / freigegeben: {approved_count}\n"
+        f"📢 Veröffentlicht: {published_count}\n"
+        f"🚫 Fehler / abgelehnt: {error_count}"
+    )
diff --git a/backend/app/policy.py b/backend/app/policy.py
new file mode 100644
index 0000000..af6e65c
--- /dev/null
+++ b/backend/app/policy.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+from typing import Any
+
+
+def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
+    """Collect every policy problem that applies to a source.
+
+    Returns an empty list when the source is usable. A missing/empty source
+    yields exactly one issue. Otherwise the checks are: risk_level must be
+    "green" (case-insensitive), terms_url / license_name / last_reviewed_at
+    must be non-blank, and is_enabled must be 1.
+    """
+    if not source:
+        return ["Keine Quelle zugeordnet"]
+
+    problems: list[str] = []
+
+    risk = (source.get("risk_level") or "").strip().lower()
+    if risk != "green":
+        problems.append(f"Quelle nicht freigegeben (risk_level={risk or 'unset'})")
+
+    # These three fields share the same rule and the same message pattern.
+    for field in ("terms_url", "license_name", "last_reviewed_at"):
+        if not (source.get(field) or "").strip():
+            problems.append(f"{field} fehlt")
+
+    enabled_flag = int(source.get("is_enabled", 0) or 0)
+    if enabled_flag != 1:
+        problems.append("Quelle ist deaktiviert")
+
+    return problems
+
+
+def is_source_allowed(source: dict[str, Any] | None) -> bool:
+    """Tell whether the source passes the policy check without any issue."""
+    return not evaluate_source_policy(source)
diff --git a/backend/app/publisher.py b/backend/app/publisher.py
new file mode 100644
index 0000000..e27bd1b
--- /dev/null
+++ b/backend/app/publisher.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .repositories import (
+ claim_next_publish_job,
+ complete_publish_job,
+ create_publish_job,
+ fail_publish_job,
+ get_article_by_id,
+ mark_article_publish_result,
+ PublishJobCreate,
+)
+from .wordpress import publish_article_draft, selected_image_exists
+
+
+@dataclass(frozen=True)
+class PublisherStats:
+    """Immutable counters describing one run_publisher() call."""
+
+    processed: int  # jobs claimed from the queue during the run
+    success: int  # jobs whose article was published
+    failed: int  # jobs ended as failed (not put back into the queue)
+    requeued: int  # jobs that raised during publishing and were queued again
+
+
+def enqueue_publish(article_id: int, max_attempts: int = 3) -> int:
+    """Queue a publish job for the article and return the job id."""
+    job_request = PublishJobCreate(article_id=article_id, max_attempts=max_attempts)
+    return create_publish_job(job_request)
+
+
+def _can_publish(article: dict) -> tuple[bool, str | None]:
+    """Check the preconditions for publishing an article.
+
+    Returns:
+        ``(True, None)`` when the article may be published, otherwise
+        ``(False, reason)`` with a human-readable reason.
+    """
+    if article.get("status") not in {"approved", "published"}:
+        # The message names the statuses this check actually accepts.
+        return False, "Artikelstatus muss 'approved' oder 'published' sein"
+    if not selected_image_exists(article):
+        return False, "Hauptbild nicht gesetzt"
+    return True, None
+
+
+def run_publisher(max_jobs: int = 10) -> PublisherStats:
+    """Work through the publish queue and report what happened.
+
+    At most ``max_jobs`` jobs (but always at least one) are claimed. A job
+    whose article is missing or does not satisfy _can_publish() is failed
+    without requeue. When publishing raises, the job is requeued as long as
+    its attempts are below max_attempts, otherwise it is failed.
+    """
+    tally = {"processed": 0, "success": 0, "failed": 0, "requeued": 0}
+    budget = max(1, max_jobs)
+
+    while tally["processed"] < budget:
+        job = claim_next_publish_job()
+        if not job:
+            break
+        tally["processed"] += 1
+        job_id, article_id = int(job["id"]), int(job["article_id"])
+
+        article = get_article_by_id(article_id)
+        if not article:
+            fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False)
+            tally["failed"] += 1
+            continue
+
+        allowed, reason = _can_publish(article)
+        if not allowed:
+            # Blocked by a precondition: retrying would not help, so no requeue.
+            fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False)
+            mark_article_publish_result(
+                article_id,
+                wp_post_id=article.get("wp_post_id"),
+                wp_post_url=article.get("wp_post_url"),
+                error=reason or "blocked",
+                increment_attempts=True,
+                set_published_status=False,
+            )
+            tally["failed"] += 1
+            continue
+
+        try:
+            post_id, post_url = publish_article_draft(article)
+            complete_publish_job(job_id, wp_post_id=post_id, wp_post_url=post_url)
+            mark_article_publish_result(
+                article_id,
+                wp_post_id=post_id,
+                wp_post_url=post_url,
+                error=None,
+                increment_attempts=True,
+                set_published_status=True,
+            )
+            tally["success"] += 1
+        except Exception as exc:
+            used = int(job.get("attempts", 1))
+            allowed_attempts = int(job.get("max_attempts", 3))
+            retry = used < allowed_attempts
+            fail_publish_job(job_id, str(exc), requeue=retry)
+            mark_article_publish_result(
+                article_id,
+                wp_post_id=article.get("wp_post_id"),
+                wp_post_url=article.get("wp_post_url"),
+                error=str(exc),
+                increment_attempts=True,
+                set_published_status=False,
+            )
+            tally["requeued" if retry else "failed"] += 1
+
+    return PublisherStats(
+        processed=tally["processed"],
+        success=tally["success"],
+        failed=tally["failed"],
+        requeued=tally["requeued"],
+    )
diff --git a/backend/app/relevance.py b/backend/app/relevance.py
new file mode 100644
index 0000000..8f69693
--- /dev/null
+++ b/backend/app/relevance.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+
+def _parse_iso_datetime(value: str | None) -> datetime | None:
+    """Parse an ISO-8601 timestamp into a timezone-aware datetime.
+
+    Returns None for empty, blank or unparseable input. A trailing "Z" is
+    read as UTC, and a timestamp without any offset is assumed to be UTC.
+    """
+    text = (value or "").strip()
+    if not text:
+        return None
+    iso_text = f"{text[:-1]}+00:00" if text.endswith("Z") else text
+    try:
+        moment = datetime.fromisoformat(iso_text)
+    except ValueError:
+        return None
+    if moment.tzinfo is not None:
+        return moment
+    return moment.replace(tzinfo=timezone.utc)
+
+
+def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None:
+    """Return the age of an article in whole days.
+
+    Args:
+        published_at: ISO-8601 publication timestamp.
+        now: reference time; defaults to the current UTC time. A naive
+            datetime is interpreted as UTC.
+
+    Returns:
+        None when ``published_at`` is missing or cannot be parsed, 0 when the
+        publication time lies in the future, otherwise the number of full days.
+    """
+    published = _parse_iso_datetime(published_at)
+    if not published:
+        return None
+    ref = now if now is not None else datetime.now(timezone.utc)
+    if ref.tzinfo is None:
+        # `published` is always timezone-aware; subtracting it from a naive
+        # datetime raises TypeError. Treat naive input as UTC, the same
+        # assumption made for naive published_at values.
+        ref = ref.replace(tzinfo=timezone.utc)
+    delta = ref - published
+    if delta.total_seconds() < 0:
+        return 0
+    return delta.days
+
+
+def article_relevance(published_at: str | None, now: datetime | None = None) -> str:
+    """Classify an article by age: "hoch" (<= 2 days), "mittel" (<= 7),
+    "niedrig" (<= 30), "alt" (older) or "unbekannt" when the age is unknown.
+    """
+    age = article_age_days(published_at, now=now)
+    if age is None:
+        return "unbekannt"
+    # Ordered thresholds: the first bucket the age fits into wins.
+    for max_days, label in ((2, "hoch"), (7, "mittel"), (30, "niedrig")):
+        if age <= max_days:
+            return label
+    return "alt"
diff --git a/backend/app/repositories.py b/backend/app/repositories.py
new file mode 100644
index 0000000..cf38055
--- /dev/null
+++ b/backend/app/repositories.py
@@ -0,0 +1,855 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+import json
+from datetime import datetime, timezone
+from typing import Any
+
+from .db import get_conn, rows_to_dicts
+
+
+@dataclass(frozen=True)
+class SourceCreate:
+    """Values for inserting a new row into the sources table (see create_source)."""
+
+    name: str  # surrounding whitespace is stripped before it is stored
+    base_url: str | None
+    terms_url: str | None
+    license_name: str | None
+    risk_level: str
+    is_enabled: bool  # persisted as 1/0
+    notes: str | None
+    last_reviewed_at: str | None
+
+
+@dataclass(frozen=True)
+class FeedCreate:
+    """Values for inserting a new row into the feeds table (see create_feed)."""
+
+    name: str  # surrounding whitespace is stripped before it is stored
+    url: str  # surrounding whitespace is stripped before it is stored
+    source_id: int | None  # id of the related sources row, if any
+    is_enabled: bool  # persisted as 1/0
+
+
+@dataclass(frozen=True)
+class SourceUpdate:
+    """New column values for an existing sources row (see update_source).
+
+    Every listed column is overwritten; there is no partial update.
+    """
+
+    name: str  # surrounding whitespace is stripped before it is stored
+    base_url: str | None
+    terms_url: str | None
+    license_name: str | None
+    risk_level: str
+    is_enabled: bool  # persisted as 1/0
+    notes: str | None
+    last_reviewed_at: str | None
+
+
+@dataclass(frozen=True)
+class FeedUpdate:
+    """New column values for an existing feeds row (see update_feed).
+
+    Every listed column is overwritten; there is no partial update.
+    """
+
+    name: str  # surrounding whitespace is stripped before it is stored
+    url: str  # surrounding whitespace is stripped before it is stored
+    source_id: int | None  # id of the related sources row, if any
+    is_enabled: bool  # persisted as 1/0
+
+
+@dataclass(frozen=True)
+class RunCreate:
+    """Values for inserting a new row into the runs table (see create_run)."""
+
+    run_type: str
+    status: str
+    details: str | None = None  # optional free-text details stored with the run
+
+
+@dataclass(frozen=True)
+class ArticleUpsert:
+    """Complete set of column values written by upsert_article().
+
+    The same payload is used for inserting a new article and for overwriting
+    an existing one, so every field is written in both cases.
+    """
+
+    # Fields used to find an already stored article (see
+    # _resolve_existing_article_id): source_url first, then
+    # feed_id + source_article_id, then source_hash.
+    feed_id: int | None
+    source_article_id: str | None
+    source_hash: str | None
+    title: str  # surrounding whitespace is stripped before it is stored
+    source_url: str  # surrounding whitespace is stripped before it is stored
+    canonical_url: str | None
+    published_at: str | None
+    author: str | None
+    summary: str | None
+    content_raw: str | None
+    content_rewritten: str | None
+    image_urls_json: str | None
+    press_contact: str | None
+    # Snapshots of source attributes stored on the article row itself.
+    source_name_snapshot: str | None
+    source_terms_url_snapshot: str | None
+    source_license_name_snapshot: str | None
+    # Legal review state.
+    legal_checked: bool  # persisted as 1/0
+    legal_checked_at: str | None
+    legal_note: str | None
+    # WordPress publishing state.
+    wp_post_id: int | None
+    wp_post_url: str | None
+    publish_attempts: int
+    publish_last_error: str | None
+    published_to_wp_at: str | None
+    word_count: int
+    status: str
+    meta_json: str | None  # JSON text; other helpers in this module expect a JSON object
+
+
+@dataclass(frozen=True)
+class PublishJobCreate:
+    """Request to queue a publish job for one article (see create_publish_job)."""
+
+    article_id: int
+    max_attempts: int = 3  # create_publish_job raises values below 1 to 1
+
+
+def create_source(payload: SourceCreate) -> int:
+    """Insert a new source row and return its id."""
+    with get_conn() as db:
+        values = (
+            payload.name.strip(),
+            payload.base_url,
+            payload.terms_url,
+            payload.license_name,
+            payload.risk_level,
+            1 if payload.is_enabled else 0,
+            payload.notes,
+            payload.last_reviewed_at,
+        )
+        inserted = db.execute(
+            """
+            INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            values,
+        )
+    return int(inserted.lastrowid)
+
+
+def list_sources() -> list[dict[str, Any]]:
+    """Return all sources as dicts, highest id first."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
+            FROM sources
+            ORDER BY id DESC
+            """
+        )
+        records = cursor.fetchall()
+    return rows_to_dicts(records)
+
+
+def get_source_by_id(source_id: int) -> dict[str, Any] | None:
+    """Return the source with the given id as a dict, or None if there is none."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
+            FROM sources
+            WHERE id = ?
+            """,
+            (source_id,),
+        )
+        record = cursor.fetchone()
+    if not record:
+        return None
+    return dict(record)
+
+
+def update_source(source_id: int, payload: SourceUpdate) -> bool:
+    """Overwrite all editable columns of a source; True if a row was changed."""
+    with get_conn() as db:
+        values = (
+            payload.name.strip(),
+            payload.base_url,
+            payload.terms_url,
+            payload.license_name,
+            payload.risk_level,
+            1 if payload.is_enabled else 0,
+            payload.notes,
+            payload.last_reviewed_at,
+            source_id,
+        )
+        outcome = db.execute(
+            """
+            UPDATE sources
+            SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ?
+            WHERE id = ?
+            """,
+            values,
+        )
+    return outcome.rowcount > 0
+
+
+def delete_source(source_id: int) -> bool:
+    """Delete a source by id; True if a row was removed."""
+    with get_conn() as db:
+        outcome = db.execute("DELETE FROM sources WHERE id = ?", (source_id,))
+    removed = outcome.rowcount
+    return removed > 0
+
+
+def create_feed(payload: FeedCreate) -> int:
+    """Insert a new feed row and return its id."""
+    with get_conn() as db:
+        values = (
+            payload.name.strip(),
+            payload.url.strip(),
+            payload.source_id,
+            1 if payload.is_enabled else 0,
+        )
+        inserted = db.execute(
+            "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)",
+            values,
+        )
+    return int(inserted.lastrowid)
+
+
+def list_feeds() -> list[dict[str, Any]]:
+    """Return all feeds (highest id first), each joined with the columns of its source.
+
+    Feeds without a source are included; their source_* values are NULL.
+    """
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
+                   f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
+                   s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
+                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
+            FROM feeds f
+            LEFT JOIN sources s ON s.id = f.source_id
+            ORDER BY f.id DESC
+            """
+        )
+        records = cursor.fetchall()
+    return rows_to_dicts(records)
+
+
+def list_enabled_feeds() -> list[dict[str, Any]]:
+    """Return the feeds with is_enabled = 1 (lowest id first), joined with their source columns."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
+                   s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
+                   s.risk_level AS source_risk_level, s.base_url AS source_base_url,
+                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
+            FROM feeds f
+            LEFT JOIN sources s ON s.id = f.source_id
+            WHERE f.is_enabled = 1
+            ORDER BY f.id ASC
+            """
+        )
+        records = cursor.fetchall()
+    return rows_to_dicts(records)
+
+
+def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
+    """Return one feed joined with its source columns, or None if the id is unknown."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
+                   s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
+                   s.risk_level AS source_risk_level, s.base_url AS source_base_url,
+                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
+            FROM feeds f
+            LEFT JOIN sources s ON s.id = f.source_id
+            WHERE f.id = ?
+            """,
+            (feed_id,),
+        )
+        record = cursor.fetchone()
+    if not record:
+        return None
+    return dict(record)
+
+
+def update_feed(feed_id: int, payload: FeedUpdate) -> bool:
+    """Overwrite name, url, source and enabled flag of a feed; True if a row was changed."""
+    with get_conn() as db:
+        values = (
+            payload.name.strip(),
+            payload.url.strip(),
+            payload.source_id,
+            1 if payload.is_enabled else 0,
+            feed_id,
+        )
+        outcome = db.execute(
+            """
+            UPDATE feeds
+            SET name = ?, url = ?, source_id = ?, is_enabled = ?
+            WHERE id = ?
+            """,
+            values,
+        )
+    return outcome.rowcount > 0
+
+
+def delete_feed(feed_id: int) -> bool:
+    """Delete a feed by id; True if a row was removed."""
+    with get_conn() as db:
+        outcome = db.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
+    removed = outcome.rowcount
+    return removed > 0
+
+
+def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
+    """Store the given etag / last_modified values for a feed and stamp last_checked_at with the current time."""
+    fetch_state = (etag, last_modified, feed_id)
+    with get_conn() as db:
+        db.execute(
+            """
+            UPDATE feeds
+            SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
+            WHERE id = ?
+            """,
+            fetch_state,
+        )
+
+
+def create_run(payload: RunCreate) -> int:
+    """Insert a new run row and return its id."""
+    with get_conn() as db:
+        values = (payload.run_type, payload.status, payload.details)
+        inserted = db.execute(
+            "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)",
+            values,
+        )
+    return int(inserted.lastrowid)
+
+
+def finish_run(run_id: int, status: str, details: str | None = None) -> None:
+    """Set the final status and details of a run and stamp finished_at with the current time."""
+    final_state = (status, details, run_id)
+    with get_conn() as db:
+        db.execute(
+            """
+            UPDATE runs
+            SET status = ?, details = ?, finished_at = datetime('now')
+            WHERE id = ?
+            """,
+            final_state,
+        )
+
+
+def list_runs(limit: int = 50) -> list[dict[str, Any]]:
+    """Return the most recent runs (highest id first); limit is clamped to 1..500."""
+    page_size = max(1, min(limit, 500))
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT id, run_type, status, started_at, finished_at, details
+            FROM runs
+            ORDER BY id DESC
+            LIMIT ?
+            """,
+            (page_size,),
+        )
+        records = cursor.fetchall()
+    return rows_to_dicts(records)
+
+
+def get_run_by_id(run_id: int) -> dict[str, Any] | None:
+    """Return the run with the given id as a dict, or None if there is none."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT id, run_type, status, started_at, finished_at, details
+            FROM runs
+            WHERE id = ?
+            """,
+            (run_id,),
+        )
+        record = cursor.fetchone()
+    if not record:
+        return None
+    return dict(record)
+
+
+def get_article_by_id(article_id: int) -> dict[str, Any] | None:
+    """Return the article row with the given id as a dict, or None if there is none."""
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
+                   a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact,
+                   a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot,
+                   a.legal_checked, a.legal_checked_at, a.legal_note,
+                   a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at,
+                   a.word_count, a.status, a.meta_json, a.created_at, a.updated_at,
+                   a.scheduled_publish_at
+            FROM articles a
+            WHERE a.id = ?
+            """,
+            (article_id,),
+        )
+        record = cursor.fetchone()
+    if not record:
+        return None
+    return dict(record)
+
+
+def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
+    """Append an event to the "review_events" list inside a meta JSON document.
+
+    Missing, unparseable or non-object meta JSON is treated as an empty
+    object, and a "review_events" value that is not a list is replaced by a
+    fresh list. The result is returned as JSON text with non-ASCII characters
+    kept as-is.
+    """
+    document: dict[str, Any] = {}
+    if meta_json:
+        try:
+            decoded = json.loads(meta_json)
+        except Exception:
+            decoded = None
+        if isinstance(decoded, dict):
+            document = decoded
+
+    history = document.get("review_events")
+    history = history if isinstance(history, list) else []
+    history.append(event)
+    document["review_events"] = history
+    return json.dumps(document, ensure_ascii=False)
+
+
+def _load_meta(meta_json: str | None) -> dict[str, Any]:
+    """Decode meta JSON into a dict; anything missing, invalid or not a JSON object yields {}."""
+    if meta_json:
+        try:
+            decoded = json.loads(meta_json)
+        except Exception:
+            return {}
+        if isinstance(decoded, dict):
+            return decoded
+    return {}
+
+
+def update_article_status(
+    article_id: int,
+    new_status: str,
+    *,
+    actor: str | None = None,
+    note: str | None = None,
+    decision: str | None = None,
+) -> bool:
+    """Change an article's status and record the transition in its meta JSON.
+
+    The transition (timestamp, old and new status, actor, note, decision) is
+    appended to the "review_events" list. ``actor`` defaults to "system".
+
+    Returns:
+        False if the article does not exist, True otherwise.
+    """
+    current = get_article_by_id(article_id)
+    if not current:
+        return False
+
+    transition = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "from_status": current.get("status"),
+        "to_status": new_status,
+        "actor": actor or "system",
+        "note": note,
+        "decision": decision,
+    }
+    updated_meta = _merge_review_event(current.get("meta_json"), transition)
+
+    with get_conn() as db:
+        db.execute(
+            "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?",
+            (new_status, updated_meta, article_id),
+        )
+    return True
+
+
+def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool:
+    """Store the outcome of a legal review on an article.
+
+    Writes legal_checked (1/0), legal_checked_at (current time) and
+    legal_note, and appends a "legal_review" entry to the "review_events"
+    list in the meta JSON. ``actor`` defaults to "system".
+
+    Returns:
+        False if the article does not exist, True otherwise.
+    """
+    current = get_article_by_id(article_id)
+    if not current:
+        return False
+
+    review_entry = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "event": "legal_review",
+        "approved": approved,
+        "actor": actor or "system",
+        "note": note,
+    }
+    updated_meta = _merge_review_event(current.get("meta_json"), review_entry)
+    checked_flag = 1 if approved else 0
+    with get_conn() as db:
+        db.execute(
+            """
+            UPDATE articles
+            SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ?
+            WHERE id = ?
+            """,
+            (checked_flag, note, updated_meta, article_id),
+        )
+    return True
+
+
+def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool:
+    """Record an image decision in the article's meta JSON ("image_review").
+
+    Actions:
+        "select"  - make the URL the selected image and lift any exclusion
+        "exclude" - add the URL to the excluded list; clears the selection
+                    if it pointed at this URL
+        "restore" - lift the exclusion of the URL
+
+    Returns:
+        False if the article is unknown, the URL is blank or the action is
+        not one of the above; True once the meta JSON has been written.
+    """
+    article = get_article_by_id(article_id)
+    if not article:
+        return False
+    url = (image_url or "").strip()
+    if not url or action not in {"select", "exclude", "restore"}:
+        return False
+
+    meta = _load_meta(article.get("meta_json"))
+    review = meta.get("image_review")
+    if not isinstance(review, dict):
+        review = {}
+
+    # Tolerate malformed stored data: anything unexpected counts as "nothing stored".
+    stored_excluded = review.get("excluded_urls")
+    if isinstance(stored_excluded, list):
+        excluded_urls = {str(entry) for entry in stored_excluded if entry}
+    else:
+        excluded_urls = set()
+
+    chosen = review.get("selected_url")
+    if not isinstance(chosen, str):
+        chosen = None
+
+    if action == "exclude":
+        excluded_urls.add(url)
+        if chosen == url:
+            chosen = None
+    else:
+        # "select" and "restore" both lift an existing exclusion.
+        excluded_urls.discard(url)
+        if action == "select":
+            chosen = url
+
+    review["selected_url"] = chosen
+    review["excluded_urls"] = sorted(excluded_urls)
+    review["updated_at"] = datetime.now(timezone.utc).isoformat()
+    review["updated_by"] = actor or "system"
+    meta["image_review"] = review
+
+    with get_conn() as db:
+        db.execute(
+            "UPDATE articles SET meta_json = ? WHERE id = ?",
+            (json.dumps(meta, ensure_ascii=False), article_id),
+        )
+    return True
+
+
+def create_publish_job(payload: PublishJobCreate) -> int:
+    """Queue a publish job for an article and return the job id.
+
+    If the article already has a job in status 'queued' or 'running', the id
+    of the newest such job is returned and nothing is inserted. Otherwise a
+    new 'queued' job with 0 attempts is created; max_attempts is at least 1.
+    """
+    with get_conn() as db:
+        pending = db.execute(
+            """
+            SELECT id FROM publish_jobs
+            WHERE article_id = ? AND status IN ('queued', 'running')
+            ORDER BY id DESC
+            LIMIT 1
+            """,
+            (payload.article_id,),
+        ).fetchone()
+        if pending:
+            return int(pending["id"])
+
+        attempts_cap = max(1, payload.max_attempts)
+        inserted = db.execute(
+            """
+            INSERT INTO publish_jobs (article_id, status, attempts, max_attempts)
+            VALUES (?, 'queued', 0, ?)
+            """,
+            (payload.article_id, attempts_cap),
+        )
+    return int(inserted.lastrowid)
+
+
+def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]:
+    """Return the newest publish jobs together with the article title; limit is clamped to 1..500."""
+    page_size = max(1, min(limit, 500))
+    with get_conn() as db:
+        cursor = db.execute(
+            """
+            SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url,
+                   j.created_at, j.started_at, j.finished_at, a.title AS article_title
+            FROM publish_jobs j
+            LEFT JOIN articles a ON a.id = j.article_id
+            ORDER BY j.id DESC
+            LIMIT ?
+            """,
+            (page_size,),
+        )
+        records = cursor.fetchall()
+    return rows_to_dicts(records)
+
+
+def claim_next_publish_job() -> dict[str, Any] | None:
+    """Take the oldest queued job that still has attempts left.
+
+    The job is switched to 'running', its attempts counter is incremented,
+    started_at is set to the current time and finished_at is cleared. The row
+    is then read again so the returned dict reflects those changes.
+
+    Returns:
+        The claimed job as a dict, or None when no job is waiting.
+    """
+    with get_conn() as db:
+        candidate = db.execute(
+            """
+            SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
+            FROM publish_jobs
+            WHERE status = 'queued' AND attempts < max_attempts
+            ORDER BY id ASC
+            LIMIT 1
+            """
+        ).fetchone()
+        if not candidate:
+            return None
+        claimed_id = int(candidate["id"])
+        db.execute(
+            """
+            UPDATE publish_jobs
+            SET status = 'running',
+                attempts = attempts + 1,
+                started_at = datetime('now'),
+                finished_at = NULL
+            WHERE id = ?
+            """,
+            (claimed_id,),
+        )
+        refreshed = db.execute(
+            """
+            SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
+            FROM publish_jobs
+            WHERE id = ?
+            """,
+            (claimed_id,),
+        ).fetchone()
+    if not refreshed:
+        return None
+    return dict(refreshed)
+
+
+def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None:
+    """Mark a publish job as 'success', store the WordPress post id/url, clear the error and stamp finished_at."""
+    result = (wp_post_id, wp_post_url, job_id)
+    with get_conn() as db:
+        db.execute(
+            """
+            UPDATE publish_jobs
+            SET status = 'success',
+                wp_post_id = ?,
+                wp_post_url = ?,
+                error_message = NULL,
+                finished_at = datetime('now')
+            WHERE id = ?
+            """,
+            result,
+        )
+
+
+def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None:
+    """Record a failed publish attempt on a job.
+
+    The job goes back to 'queued' when ``requeue`` is true, otherwise it ends
+    as 'failed'. The stored error message is cut off after 2000 characters.
+    """
+    status_after = "failed"
+    if requeue:
+        status_after = "queued"
+    with get_conn() as db:
+        db.execute(
+            """
+            UPDATE publish_jobs
+            SET status = ?,
+                error_message = ?,
+                finished_at = datetime('now')
+            WHERE id = ?
+            """,
+            (status_after, error_message[:2000], job_id),
+        )
+
+
+def mark_article_publish_result(
+    article_id: int,
+    *,
+    wp_post_id: int | None,
+    wp_post_url: str | None,
+    error: str | None,
+    increment_attempts: bool,
+    set_published_status: bool,
+) -> None:
+    """Write the outcome of a publish attempt onto the article row.
+
+    Args:
+        article_id: id of the article to update.
+        wp_post_id: stored as-is (overwrites the previous value, also with None).
+        wp_post_url: stored as-is (overwrites the previous value, also with None).
+        error: stored in publish_last_error, truncated to 2000 characters;
+            None clears the column.
+        increment_attempts: add 1 to publish_attempts when true.
+        set_published_status: set status to 'published' when true, otherwise
+            leave the status untouched.
+    """
+    with get_conn() as conn:
+        conn.execute(
+            """
+            UPDATE articles
+            SET wp_post_id = ?,
+                wp_post_url = ?,
+                publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END,
+                publish_last_error = ?,
+                published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END,
+                status = CASE WHEN ? THEN 'published' ELSE status END
+            WHERE id = ?
+            """,
+            # Parameters are positional and must stay in the order of the
+            # placeholders above.
+            (
+                wp_post_id,
+                wp_post_url,
+                1 if increment_attempts else 0,
+                error[:2000] if error else None,
+                # NOTE(review): published_to_wp_at is refreshed whenever a
+                # wp_post_id is passed, including failed attempts that pass
+                # the article's existing id - confirm this is intended.
+                wp_post_id,
+                1 if set_published_status else 0,
+                article_id,
+            ),
+        )
+
+
+def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
+    """Find the id of an already stored article that matches the payload.
+
+    Lookups are tried in this order and the first hit wins:
+    1. source_url (whitespace-stripped)
+    2. feed_id + source_article_id, when both are present
+    3. source_hash, when present
+
+    Returns None when nothing matches.
+    """
+    with get_conn() as db:
+        lookups: list[tuple[str, tuple[Any, ...]]] = [
+            # 1) strongest key: source_url
+            ("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)),
+        ]
+        if payload.feed_id is not None and payload.source_article_id:
+            # 2) stable feed+guid combo
+            lookups.append(
+                (
+                    "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
+                    (payload.feed_id, payload.source_article_id),
+                )
+            )
+        if payload.source_hash:
+            # 3) content hash fallback
+            lookups.append(("SELECT id FROM articles WHERE source_hash = ?", (payload.source_hash,)))
+
+        for sql, args in lookups:
+            match = db.execute(sql, args).fetchone()
+            if match:
+                return int(match["id"])
+
+    return None
+
+
+def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None:
+    """Return the stored article an upsert of this payload would overwrite, or None."""
+    matched_id = _resolve_existing_article_id(payload)
+    return None if matched_id is None else get_article_by_id(matched_id)
+
+
+def upsert_article(payload: ArticleUpsert) -> int:
+    """Insert the article, or overwrite every column of the matching stored row.
+
+    Which row counts as "matching" is decided by _resolve_existing_article_id().
+
+    Returns:
+        The id of the row found by the payload's source_url after the write;
+        if no such row is found, the id of the matched existing row, or 0.
+    """
+    existing_id = _resolve_existing_article_id(payload)
+    with get_conn() as db:
+        # One value per column, in the column order used by both statements below.
+        column_values = (
+            payload.feed_id,
+            payload.source_article_id,
+            payload.source_hash,
+            payload.title.strip(),
+            payload.source_url.strip(),
+            payload.canonical_url,
+            payload.published_at,
+            payload.author,
+            payload.summary,
+            payload.content_raw,
+            payload.content_rewritten,
+            payload.image_urls_json,
+            payload.press_contact,
+            payload.source_name_snapshot,
+            payload.source_terms_url_snapshot,
+            payload.source_license_name_snapshot,
+            1 if payload.legal_checked else 0,
+            payload.legal_checked_at,
+            payload.legal_note,
+            payload.wp_post_id,
+            payload.wp_post_url,
+            payload.publish_attempts,
+            payload.publish_last_error,
+            payload.published_to_wp_at,
+            payload.word_count,
+            payload.status,
+            payload.meta_json,
+        )
+        if existing_id is None:
+            db.execute(
+                """
+                INSERT INTO articles (
+                    feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
+                    summary, content_raw, content_rewritten, image_urls_json, press_contact,
+                    source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
+                    legal_checked, legal_checked_at, legal_note,
+                    wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at,
+                    word_count, status, meta_json
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                column_values,
+            )
+        else:
+            db.execute(
+                """
+                UPDATE articles
+                SET
+                    feed_id = ?,
+                    source_article_id = ?,
+                    source_hash = ?,
+                    title = ?,
+                    source_url = ?,
+                    canonical_url = ?,
+                    published_at = ?,
+                    author = ?,
+                    summary = ?,
+                    content_raw = ?,
+                    content_rewritten = ?,
+                    image_urls_json = ?,
+                    press_contact = ?,
+                    source_name_snapshot = ?,
+                    source_terms_url_snapshot = ?,
+                    source_license_name_snapshot = ?,
+                    legal_checked = ?,
+                    legal_checked_at = ?,
+                    legal_note = ?,
+                    wp_post_id = ?,
+                    wp_post_url = ?,
+                    publish_attempts = ?,
+                    publish_last_error = ?,
+                    published_to_wp_at = ?,
+                    word_count = ?,
+                    status = ?,
+                    meta_json = ?
+                WHERE id = ?
+                """,
+                column_values + (existing_id,),
+            )
+        stored = db.execute(
+            "SELECT id FROM articles WHERE source_url = ?",
+            (payload.source_url.strip(),),
+        ).fetchone()
+        if stored:
+            return int(stored["id"])
+    return int(existing_id) if existing_id else 0
+
+
+def list_articles_page(
+    limit: int = 50,
+    offset: int = 0,
+    status_filter: str | None = None,
+    search: str | None = None,
+) -> tuple[list[dict[str, Any]], int]:
+    """Return one page of articles plus the total number of matching articles.
+
+    Args:
+        limit: page size, clamped to 1..200.
+        offset: number of rows to skip; negative values count as 0.
+        status_filter: when given, only articles with exactly this status.
+        search: when given, matches articles whose title contains the text
+            or whose id equals it (if the text is an integer).
+
+    Returns:
+        ``(articles, total_count)``; articles are ordered by id, newest first.
+    """
+    page_size = max(1, min(limit, 200))
+    start = max(0, offset)
+
+    clauses: list[str] = []
+    args: list[Any] = []
+    if status_filter:
+        clauses.append("a.status = ?")
+        args.append(status_filter)
+    if search:
+        clauses.append("(a.title LIKE ? OR a.id = ?)")
+        try:
+            id_match = int(search)
+        except ValueError:
+            # Not a number: -1 keeps the placeholder bound without matching the id.
+            id_match = -1
+        args.extend([f"%{search}%", id_match])
+
+    where = f"WHERE {' AND '.join(clauses)}" if clauses else ""
+    select = """
+        SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw,
+               a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at,
+               a.word_count, f.name AS feed_name
+        FROM articles a
+        LEFT JOIN feeds f ON f.id = a.feed_id
+    """
+    with get_conn() as db:
+        count_row = db.execute(f"SELECT COUNT(*) FROM articles a {where}", args).fetchone()
+        total = count_row[0]
+        records = db.execute(
+            f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?",
+            args + [page_size, start],
+        ).fetchall()
+    return rows_to_dicts(records), total
+
+
+def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int:
+    """Update wp_post_id (and clear stale wp_post_url) for multiple articles.
+
+    Args:
+        updates: ``(article_id, new_wp_post_id)`` pairs; the new id may be None.
+
+    Returns the number of rows actually updated (pairs whose article id does
+    not exist are not counted).
+    Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and
+    scheduled_publish_at from the live WordPress data.
+    """
+    if not updates:
+        return 0
+    updated = 0
+    with get_conn() as conn:
+        for article_id, new_wp_id in updates:
+            cur = conn.execute(
+                "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?",
+                (new_wp_id, article_id),
+            )
+            # Count matched rows, not attempted statements.
+            updated += cur.rowcount
+    return updated
+
+
+def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
+    """Return the newest articles (with feed name), optionally restricted to one status.
+
+    ``limit`` is clamped to 1..500.
+    """
+    page_size = max(1, min(limit, 500))
+    with get_conn() as db:
+        if not status_filter:
+            records = db.execute(
+                """
+                SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
+                       a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
+                       a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
+                       a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
+                       a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
+                FROM articles a
+                LEFT JOIN feeds f ON f.id = a.feed_id
+                ORDER BY a.id DESC
+                LIMIT ?
+                """,
+                (page_size,),
+            ).fetchall()
+        else:
+            records = db.execute(
+                """
+                SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
+                       a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
+                       a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
+                       a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
+                       a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
+                FROM articles a
+                LEFT JOIN feeds f ON f.id = a.feed_id
+                WHERE a.status = ?
+                ORDER BY a.id DESC
+                LIMIT ?
+                """,
+                (status_filter, page_size),
+            ).fetchall()
+    return rows_to_dicts(records)
diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py
new file mode 100644
index 0000000..05937e5
--- /dev/null
+++ b/backend/app/rewrite.py
@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+from urllib.request import Request, urlopen
+
+from .config import get_settings
+
+
+def _sanitize_source_text(text: str) -> str:
+    """Prepare raw source text for use in a prompt.
+
+    Every line is stripped and blank lines are dropped. If more than three
+    non-empty lines remain, the first three are discarded. Everything from the
+    first case-insensitive occurrence of "Pressekontakt" to the end of the
+    text is removed. Empty or whitespace-only input yields "".
+    """
+    stripped = (text or "").strip()
+    if not stripped:
+        return ""
+
+    kept: list[str] = []
+    for candidate in stripped.splitlines():
+        candidate = candidate.strip()
+        if candidate:
+            kept.append(candidate)
+    body = "\n".join(kept[3:] if len(kept) > 3 else kept)
+
+    # Cut the press-contact block: from "Pressekontakt" through the end.
+    contact_tail = re.compile(r"\n?\s*Pressekontakt[\s\S]*$", flags=re.IGNORECASE)
+    return contact_tail.sub("", body).strip()
+
+
+def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
+    """Clean, de-duplicate and cap a list of tag strings.
+
+    Each tag has its whitespace collapsed, leading ``#``/``-``/``•`` markers and
+    trailing ``; , . :`` punctuation removed. Tags shorter than 2 or longer
+    than 40 characters are dropped. Duplicates are detected case-insensitively
+    (``casefold``); the first spelling wins and input order is kept.
+
+    Args:
+        tags: Raw tag candidates; falsy entries are treated as empty.
+        max_tags: Upper bound on the number of returned tags. Values <= 0
+            yield an empty list.
+
+    Returns:
+        At most ``max_tags`` normalised tags.
+    """
+    out: list[str] = []
+    seen: set[str] = set()
+    for raw in tags:
+        # Check the cap before adding; checking only after the append let one
+        # tag through even when max_tags was 0 or negative.
+        if len(out) >= max_tags:
+            break
+        value = re.sub(r"\s+", " ", str(raw or "").strip())
+        value = re.sub(r"^[#\-•\s]+", "", value)
+        value = re.sub(r"[;,.:\s]+$", "", value)
+        if not 2 <= len(value) <= 40:
+            continue
+        key = value.casefold()
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(value)
+    return out
+
+
+def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
+    """Send one system/user message pair to the OpenAI chat completions endpoint.
+
+    The model name and API key are read from the application settings.
+
+    Returns:
+        The stripped text content of the first returned choice.
+
+    Raises:
+        RuntimeError: If no API key is configured, the response has no
+            non-empty ``choices`` list, or the first choice carries no
+            non-blank string content.
+        Errors raised by ``urlopen`` or ``json.loads`` are not caught here.
+    """
+    settings = get_settings()
+    api_key = settings.openai_api_key
+    if not api_key:
+        raise RuntimeError("OPENAI_API_KEY fehlt")
+
+    conversation = [
+        {"role": "system", "content": system},
+        {"role": "user", "content": user},
+    ]
+    body = json.dumps(
+        {
+            "model": settings.openai_model,
+            "temperature": temperature,
+            "messages": conversation,
+        }
+    ).encode("utf-8")
+    request_headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    }
+    request = Request(
+        url="https://api.openai.com/v1/chat/completions",
+        method="POST",
+        data=body,
+        headers=request_headers,
+    )
+    with urlopen(request, timeout=60) as response:
+        response_text = response.read().decode("utf-8", errors="replace")
+
+    data = json.loads(response_text)
+    choices = data.get("choices")
+    if not (isinstance(choices, list) and choices):
+        raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")
+
+    content = choices[0].get("message", {}).get("content")
+    if isinstance(content, str) and content.strip():
+        return content.strip()
+    raise RuntimeError("OpenAI lieferte keinen Inhalt")
+
+
+def rewrite_article_text(article: dict[str, Any]) -> str:
+ """Ask the chat model for a German rewrite of an article in informal "Du" form.
+
+ The source text is the sanitized ``content_raw``; if that is empty the
+ stripped ``summary`` is used instead. The source name placed in the prompt
+ is ``source_name_snapshot``, else ``author``, else the literal "die Quelle".
+
+ Returns the text returned by ``_openai_chat`` (called with temperature 0.4).
+ Raises RuntimeError when neither content nor summary provides any text;
+ anything ``_openai_chat`` raises propagates unchanged.
+ """
+ source_text = _sanitize_source_text(article.get("content_raw") or "")
+ if not source_text:
+ source_text = (article.get("summary") or "").strip()
+ if not source_text:
+ raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
+
+ title = (article.get("title") or "").strip()
+ source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip()
+ prompt = (
+ "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
+ "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
+ "ohne Pressekontakt, ohne Quellenblock. "
+ # NOTE(review): the next three lines look damaged in this patch text — the
+ # string literal is split across lines and the HTML tag names it is meant to
+ # list appear to be missing. Confirm against the real file before applying.
+ "Nutze klare Absätze und Zwischenüberschriften in HTML (,
,
- falls passend). "
+ "Inhaltlich korrekt bleiben, nichts erfinden. "
+ f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. "
+ "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, "
+ f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n"
+ f"Titel: {title}\n\n"
+ f"Originaltext:\n{source_text}"
+ )
+ return _openai_chat(
+ "Du bist ein deutscher News-Redakteur.",
+ prompt,
+ temperature=0.4,
+ )
+
+
+def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
+    """Ask the chat model for up to ``max_tags`` tags describing an article.
+
+    The text sent to the model is ``rewritten_text`` when given, otherwise the
+    sanitized ``content_raw``, otherwise ``summary``; only its first 3500
+    characters are included in the prompt. Returns an empty list when there is
+    no text or when the answer cannot be parsed as a JSON array. Anything
+    ``_openai_chat`` raises propagates.
+    """
+    basis = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
+    basis = str(basis).strip()
+    if not basis:
+        return []
+
+    headline = (article.get("title") or "").strip()
+    prompt = (
+        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
+        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
+        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
+        f"Titel: {headline}\n\n"
+        f"Text:\n{basis[:3500]}"
+    )
+    answer = _openai_chat(
+        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
+        prompt,
+        temperature=0.2,
+    )
+
+    def _tags_from(candidate: str) -> list[str] | None:
+        """Return normalised tags if ``candidate`` is a JSON list, else None."""
+        try:
+            decoded = json.loads(candidate)
+            if isinstance(decoded, list):
+                return _normalize_tags([str(item) for item in decoded], max_tags=max_tags)
+        except Exception:
+            pass
+        return None
+
+    direct = _tags_from(answer)
+    if direct is not None:
+        return direct
+
+    # Fallback: the model may have wrapped the array in extra text, so try
+    # the widest bracketed span in the answer.
+    bracketed = re.search(r"\[[\s\S]*\]", answer)
+    if bracketed is None:
+        return []
+    return _tags_from(bracketed.group(0)) or []
+
+
+def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]:
+ """Score article relevance for a VanLife/Camping/Outdoor blog (0-100).
+
+ The model sees the title plus the first 2000 characters of the sanitized
+ ``content_raw`` (or of ``summary`` when the content is empty).
+
+ Returns {"score": int, "reason": str, "topics": list[str]}. The score is
+ clamped to 0..100. If the answer contains no parseable JSON object, a
+ result with score 0 and a parsing-error reason is returned instead.
+
+ The ``_openai_chat`` call sits outside the try block, so whatever it raises
+ (e.g. RuntimeError) propagates to the caller.
+ """
+ title = (article.get("title") or "").strip()
+ text = _sanitize_source_text(article.get("content_raw") or "")
+ if not text:
+ text = (article.get("summary") or "").strip()
+
+ prompt = (
+ "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
+ "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
+ "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
+ "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
+ "Antworte NUR mit einem JSON-Objekt:\n"
+ # NOTE(review): the empty "reason"/"topics" values below may be angle-bracket
+ # placeholders that were lost in this patch text — confirm against the real file.
+ '{"score": <0-100>, "reason": "", "topics": ["", ""]}\n\n'
+ f"Titel: {title}\n\n"
+ f"Text (Auszug):\n{text[:2000]}"
+ )
+ raw = _openai_chat(
+ "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
+ prompt,
+ temperature=0.1,
+ )
+ try:
+ # Take the widest {...} span so surrounding prose from the model is ignored.
+ match = re.search(r"\{[\s\S]*\}", raw)
+ if match:
+ parsed = json.loads(match.group(0))
+ score = max(0, min(100, int(parsed.get("score", 0))))
+ return {
+ "score": score,
+ "reason": str(parsed.get("reason", "")),
+ "topics": [str(t) for t in (parsed.get("topics") or [])],
+ }
+ except Exception:
+ # Any parse/conversion problem falls through to the default result below.
+ pass
+ return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []}
+
+
+def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
+    """Store normalised ``tags`` under ``generated_tags`` in an article's meta JSON.
+
+    ``meta_json`` is reused only if it decodes to a JSON object; empty input,
+    invalid JSON, or any other JSON type starts from an empty object. The
+    result is serialised with ``ensure_ascii=False``.
+    """
+    decoded: Any = None
+    if meta_json:
+        try:
+            decoded = json.loads(meta_json)
+        except Exception:
+            decoded = None
+    merged: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
+    merged["generated_tags"] = _normalize_tags(tags)
+    return json.dumps(merged, ensure_ascii=False)
diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py
new file mode 100644
index 0000000..d5ea5bf
--- /dev/null
+++ b/backend/app/scheduler.py
@@ -0,0 +1,336 @@
+"""Smart publishing scheduler.
+
+Calculates suggested publish slots for new WordPress drafts.
+Rules:
+- Maximum N drafts per day (configurable, default 2)
+- Preferred slots: configurable hours (default 09:00 and 14:00 CET)
+- New articles take the earliest free preferred slot starting tomorrow, so gaps in the schedule are filled
+- Checks both local DB AND WordPress future posts to avoid double-booking
+"""
+from __future__ import annotations
+
+import base64
+import json
+import threading
+import urllib.request
+from datetime import date, datetime, timedelta, timezone
+from typing import Any
+
+from .config import get_settings
+from .db import get_conn
+
+# Serialises slot reservation so that concurrent pipeline runs (two threads)
+# never assign the same slot. It is a threading.Lock, so it only coordinates
+# threads inside this process.
+_slot_lock = threading.Lock()
+
+
+# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity).
+# Because the offset is fixed, the value is one hour behind local time during
+# summer time; it is used to derive "today" from the UTC clock.
+_CET_OFFSET = timedelta(hours=1)
+
+
+def _today_cet() -> date:
+    """Return the current calendar date at the fixed ``_CET_OFFSET`` from UTC."""
+    shifted_now = datetime.now(timezone.utc) + _CET_OFFSET
+    return shifted_now.date()
+
+
+def _preferred_hours() -> list[int]:
+    """Parse the comma-separated ``pipeline_publish_hours`` setting into ints.
+
+    Blank entries are skipped. If reading or converting the setting fails for
+    any reason, the default ``[9, 14]`` is returned.
+    """
+    settings = get_settings()
+    parsed: list[int] = []
+    try:
+        for part in settings.pipeline_publish_hours.split(","):
+            part = part.strip()
+            if part:
+                parsed.append(int(part))
+    except Exception:
+        return [9, 14]
+    return parsed
+
+
+def _fetch_wp_occupied_slots() -> set[tuple[str, int]]:
+    """Return the (date_iso, hour) pairs of future-scheduled WordPress posts.
+
+    One request is made to the posts endpoint with ``status=future`` and
+    ``per_page=100``, authenticated with HTTP Basic credentials from the
+    settings. Posts whose ``date`` cannot be parsed are skipped. Any other
+    error yields an empty set so the scheduler keeps working on DB data only.
+    """
+    settings = get_settings()
+    try:
+        credentials = f"{settings.wordpress_username}:{settings.wordpress_app_password}"
+        token = base64.b64encode(credentials.encode()).decode()
+        endpoint = (
+            f"{settings.wordpress_base_url}/wp-json/wp/v2/posts"
+            f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date"
+        )
+        request = urllib.request.Request(endpoint, headers={"Authorization": f"Basic {token}"})
+        with urllib.request.urlopen(request, timeout=10) as response:
+            future_posts = json.loads(response.read())
+
+        taken: set[tuple[str, int]] = set()
+        for post in future_posts:
+            try:
+                scheduled = datetime.fromisoformat(post["date"])
+                taken.add((scheduled.date().isoformat(), scheduled.hour))
+            except Exception:
+                continue
+        return taken
+    except Exception:
+        return set()
+
+
+def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None:
+    """Return the latest scheduled date found in the DB or in ``wp_occupied``.
+
+    Only dates from today onwards count. DB rows with status ``error`` or
+    ``no_image`` are ignored. Returns None when neither source has a
+    usable date.
+    """
+    today = _today_cet()
+
+    # Latest slot recorded in the local DB.
+    with get_conn() as conn:
+        newest = conn.execute(
+            """
+            SELECT MAX(scheduled_publish_at) AS last_slot
+            FROM articles
+            WHERE scheduled_publish_at IS NOT NULL
+              AND scheduled_publish_at >= ?
+              AND status NOT IN ('error', 'no_image')
+            """,
+            (today.isoformat() + "T00:00:00",),
+        ).fetchone()
+    latest: list[date] = []
+    if newest and newest["last_slot"]:
+        try:
+            latest.append(datetime.fromisoformat(newest["last_slot"]).date())
+        except Exception:
+            pass
+
+    # Latest date among the WordPress-occupied slots.
+    wp_days: list[date] = []
+    for day_text, _hour in wp_occupied:
+        try:
+            day = date.fromisoformat(day_text)
+        except Exception:
+            continue
+        if day >= today:
+            wp_days.append(day)
+    if wp_days:
+        latest.append(max(wp_days))
+
+    return max(latest) if latest else None
+
+
+def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None:
+    """Return the first preferred hour still free on ``target_date``.
+
+    An hour counts as taken when a DB article (status not ``error`` or
+    ``no_image``) is scheduled in it on that day, or when ``wp_occupied``
+    contains it for that day. Returns None if every preferred hour is taken.
+    """
+    preferred = _preferred_hours()
+    day_key = target_date.isoformat()
+
+    with get_conn() as conn:
+        booked = conn.execute(
+            """
+            SELECT scheduled_publish_at FROM articles
+            WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
+              AND status NOT IN ('error', 'no_image')
+            """,
+            (day_key + "T00:00:00", day_key + "T23:59:59"),
+        ).fetchall()
+
+    # Hours taken in WordPress on that day ...
+    taken = {hour for wp_day, hour in wp_occupied if wp_day == day_key}
+    # ... plus the hours taken in the local DB (unparseable values are ignored).
+    for entry in booked:
+        try:
+            taken.add(datetime.fromisoformat(entry["scheduled_publish_at"] or "").hour)
+        except Exception:
+            pass
+
+    return next((hour for hour in preferred if hour not in taken), None)
+
+
+def _format_slot(d: date, hour: int) -> str:
+    """Render a slot as German text, e.g. ``Mo, 01.01.2024 um 09:00 Uhr``."""
+    weekday_abbrev = ("Mo", "Di", "Mi", "Do", "Fr", "Sa", "So")[d.weekday()]
+    day_text = d.strftime("%d.%m.%Y")
+    return f"{weekday_abbrev}, {day_text} um {hour:02d}:00 Uhr"
+
+
+def _find_next_free_slot(
+    wp_occupied: set[tuple[str, int]], lookahead_days: int = 60
+) -> tuple[date, int] | None:
+    """Return the earliest free (date, hour) slot, starting with tomorrow.
+
+    Days are scanned in order for ``lookahead_days + 1`` days, so gaps in the
+    schedule are filled before anything is appended at the end. If no day in
+    that window has a free preferred hour, tomorrow's first preferred hour
+    (or 9 when none are configured) is returned, so the function always
+    yields a tuple.
+    """
+    first_day = _today_cet() + timedelta(days=1)
+
+    day_offset = 0
+    while day_offset <= lookahead_days:
+        day = first_day + timedelta(days=day_offset)
+        free_hour = _next_free_hour(day, wp_occupied)
+        if free_hour is not None:
+            return day, free_hour
+        day_offset += 1
+
+    fallback_hours = _preferred_hours()
+    if fallback_hours:
+        return first_day, fallback_hours[0]
+    return first_day, 9
+
+
+def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
+    """Return all booked slots (local DB plus WordPress) from today onwards.
+
+    Args:
+        lookahead_days: Kept for interface compatibility. No upper date bound
+            is applied; every slot dated today or later is returned.
+
+    Returns:
+        A list of dicts sorted by (date, hour) with the keys ``date``,
+        ``hour``, ``formatted``, ``source`` ("db" or "wordpress"),
+        ``article_id``, ``article_title``, ``article_status``, ``wp_post_id``
+        and ``wp_post_url``. WordPress slots that are also booked in the DB
+        are reported once, as the DB entry.
+    """
+    today = _today_cet()
+
+    # Slots booked in the local DB.
+    with get_conn() as conn:
+        rows = conn.execute(
+            """
+            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
+            FROM articles
+            WHERE scheduled_publish_at IS NOT NULL
+              AND scheduled_publish_at >= ?
+              AND status NOT IN ('error', 'no_image')
+            ORDER BY scheduled_publish_at
+            """,
+            (today.isoformat() + "T00:00:00",),
+        ).fetchall()
+
+    db_slots: dict[tuple[str, int], dict] = {}
+    for row in rows:
+        try:
+            dt = datetime.fromisoformat(row["scheduled_publish_at"])
+        except (TypeError, ValueError):
+            # Unparseable timestamp: leave the row out of the overview.
+            continue
+        day_iso = dt.date().isoformat()
+        db_slots[(day_iso, dt.hour)] = {
+            "date": day_iso,
+            "hour": dt.hour,
+            "formatted": _format_slot(dt.date(), dt.hour),
+            "source": "db",
+            "article_id": row["id"],
+            "article_title": row["title"],
+            "article_status": row["status"],
+            "wp_post_id": row["wp_post_id"],
+            "wp_post_url": row["wp_post_url"],
+        }
+
+    # Slots occupied in WordPress but not present in the local DB.
+    wp_only: list[dict] = []
+    for d_str, h in sorted(_fetch_wp_occupied_slots()):
+        if (d_str, h) in db_slots:
+            continue
+        try:
+            d = date.fromisoformat(d_str)
+        except (TypeError, ValueError):
+            continue
+        if d < today:
+            continue
+        wp_only.append({
+            "date": d_str,
+            "hour": h,
+            "formatted": _format_slot(d, h),
+            "source": "wordpress",
+            "article_id": None,
+            "article_title": "(WP-Beitrag außerhalb Pipeline)",
+            "article_status": None,
+            "wp_post_id": None,
+            "wp_post_url": None,
+        })
+
+    all_slots = list(db_slots.values()) + wp_only
+    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
+    return all_slots
+
+
+def release_publish_slot(article_id: int) -> None:
+    """Reset ``scheduled_publish_at`` to NULL for the given article.
+
+    Used to free a reserved slot, e.g. when an article is rejected after a
+    slot was assigned.
+    """
+    clear_slot_sql = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
+    with get_conn() as conn:
+        conn.execute(clear_slot_sql, (article_id,))
+
+
+def suggest_publish_slot() -> str:
+    """Return the next free slot as formatted text without reserving it.
+
+    WordPress-occupied slots are fetched first and passed to the slot search.
+    If the search yields nothing, tomorrow's first preferred hour (or 9 when
+    none are configured) is formatted instead.
+    """
+    slot = _find_next_free_slot(_fetch_wp_occupied_slots())
+    if not slot:
+        tomorrow = _today_cet() + timedelta(days=1)
+        fallback_hours = _preferred_hours()
+        return _format_slot(tomorrow, fallback_hours[0] if fallback_hours else 9)
+    day, hour = slot
+    return _format_slot(day, hour)
+
+
+def reserve_publish_slot(article_id: int) -> str:
+ """Reserve a publish slot for an article and persist it in the DB.
+
+ If the article already has a parseable scheduled_publish_at, keep it
+ unchanged and return it formatted. Otherwise the earliest free preferred
+ hour from tomorrow onwards (61 days are scanned) is written to the
+ article as an ISO timestamp "YYYY-MM-DDTHH:00:00".
+
+ Returns the formatted publish datetime string.
+
+ Uses a module-level lock so that concurrent pipeline runs (two threads)
+ cannot read the same "free" slot and assign it twice.
+ """
+ # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow
+ # and must not block other threads unnecessarily.
+ wp_occupied = _fetch_wp_occupied_slots()
+
+ with _slot_lock:
+ # Single DB connection for the entire read-find-write cycle so the
+ # slot we pick is still free when we write it.
+ with get_conn() as conn:
+ row = conn.execute(
+ "SELECT scheduled_publish_at FROM articles WHERE id = ?",
+ (article_id,),
+ ).fetchone()
+ # NOTE(review): when no row exists for article_id, existing_slot is None,
+ # the UPDATE at the end matches nothing, and a slot string is still returned.
+ existing_slot = row["scheduled_publish_at"] if row else None
+ if existing_slot:
+ try:
+ dt = datetime.fromisoformat(existing_slot)
+ return _format_slot(dt.date(), dt.hour)
+ except Exception:
+ pass # invalid — fall through and assign a fresh slot
+
+ # Find the next free (date, hour) slot using THIS connection so we
+ # see all slots written during this lock window.
+ hours = _preferred_hours()
+ today = _today_cet()
+ tomorrow = today + timedelta(days=1)
+ candidate: date | None = None
+ chosen_hour: int | None = None
+
+ # Scan tomorrow plus the following 60 days, earliest day first.
+ for offset in range(0, 61):
+ d = tomorrow + timedelta(days=offset)
+ date_str = d.isoformat()
+
+ # Slots already booked on this day; 'error' and 'no_image' articles do not block a slot.
+ rows = conn.execute(
+ """
+ SELECT scheduled_publish_at FROM articles
+ WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
+ AND status NOT IN ('error', 'no_image')
+ """,
+ (date_str + "T00:00:00", date_str + "T23:59:59"),
+ ).fetchall()
+
+ used_hours: set[int] = set()
+ for r in rows:
+ ts = r["scheduled_publish_at"] or ""
+ try:
+ used_hours.add(datetime.fromisoformat(ts).hour)
+ except Exception:
+ pass
+ # Hours taken on this day by WordPress posts.
+ for d_str, h in wp_occupied:
+ if d_str == date_str:
+ used_hours.add(h)
+
+ # The first preferred hour that is still free wins.
+ for h in hours:
+ if h not in used_hours:
+ candidate = d
+ chosen_hour = h
+ break
+ if candidate is not None:
+ break
+
+ # NOTE(review): reached only when the scan found no free preferred hour on
+ # any day (or no hours are configured); the fallback then picks tomorrow's
+ # first preferred hour without checking whether it is already taken.
+ if candidate is None:
+ candidate = tomorrow
+ chosen_hour = hours[0] if hours else 9
+
+ iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00"
+ conn.execute(
+ "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?",
+ (iso_ts, article_id),
+ )
+ return _format_slot(candidate, chosen_hour)
diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py
new file mode 100644
index 0000000..d3cbed8
--- /dev/null
+++ b/backend/app/source_extraction.py
@@ -0,0 +1,442 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from html import unescape
+import re
+from typing import Any
+from urllib.parse import urljoin
+from urllib.request import Request, urlopen
+
+DEFAULT_TIMEOUT_SECONDS = 10
+DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
+
+
+@dataclass(frozen=True)
+class ExtractedArticle:
+ """Immutable result record of one article extraction.
+
+ Every text field may be None. ``images`` has no default and must be passed
+ explicitly; ``extraction_error`` defaults to None and ``image_metadata``
+ defaults to a fresh empty dict per instance.
+ """
+ title: str | None
+ author: str | None
+ canonical_url: str | None
+ summary: str | None
+ content_text: str | None
+ # presumably image URLs — TODO confirm against the extraction code
+ images: list[str]
+ press_contact: str | None
+ # presumably set when extraction failed — TODO confirm against callers
+ extraction_error: str | None = None
+ # presumably keyed by image URL — TODO confirm; default_factory avoids a shared mutable default
+ image_metadata: dict[str, dict] = field(default_factory=dict)
+
+
+def _clean_text(raw: str | None) -> str | None:
+    """Turn an HTML fragment into single-spaced plain text.
+
+    HTML entities are unescaped first, then anything that looks like a tag is
+    replaced by a space and whitespace runs are collapsed. Returns None for
+    falsy input or when nothing but markup/whitespace remains.
+    """
+    if not raw:
+        return None
+    without_tags = re.sub(r"<[^>]+>", " ", unescape(raw))
+    collapsed = re.sub(r"\s+", " ", without_tags).strip()
+    if collapsed:
+        return collapsed
+    return None
+
+
+def _strip_noise(html: str) -> str:
+ html = re.sub(r"
+