diff --git a/.env.example b/.env.example
deleted file mode 100644
index fd3ded5..0000000
--- a/.env.example
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copy to .env and fill in values
-
-# WordPress base URL (required)
-WP_BASE_URL=https://your-site.tld
-
-# Authentication: prefer WP_AUTH_BASE64 OR use USERNAME+PASSWORD (Application Password)
-# Example to generate: base64(username:application_password)
-WP_AUTH_BASE64=
-
-# Alternatively provide username and application password
-WP_USERNAME=
-WP_PASSWORD=
-
-# OpenAI API key (optional, enables rewrite)
-OPENAI_API_KEY=
-
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index af3394f..5d55808 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -19,16 +19,9 @@ jobs:
username: oliver
key: ${{ secrets.HETZNER_SSH_KEY }}
port: 22
- envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD
script: |
- cd /opt/rss-news
+ cd rss-news
git pull origin main
source .venv/bin/activate
pip install -r requirements.txt
- pip install -r backend/requirements.txt || true
- sudo systemctl restart rss-news-api
- sleep 3
- BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh
- env:
- APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }}
- APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }}
+ sudo systemctl restart rss-app
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
deleted file mode 100644
index 1d627db..0000000
--- a/.github/workflows/test.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-name: Backend Tests
-
-on:
- push:
- branches: [main]
- pull_request:
- branches: [main]
-
-jobs:
- backend-tests:
- runs-on: ubuntu-latest
- timeout-minutes: 15
-
- steps:
- - name: Checkout
- uses: actions/checkout@v4
-
- - name: Set up Python
- uses: actions/setup-python@v5
- with:
- python-version: "3.11"
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r backend/requirements.txt
- pip install -r backend/requirements-test.txt
-
- - name: Run tests with coverage
- env:
- APP_DB_PATH: /tmp/rss_news_test.db
- run: |
- pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml
-
- - name: Upload coverage artifact
- uses: actions/upload-artifact@v4
- with:
- name: coverage-xml
- path: coverage.xml
diff --git a/.gitignore b/.gitignore
index aac3a2f..fcbde33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,3 @@ internal/copy_files.sh
internal/_line.txt
internal/push_commit.txt
internal/git.sh
-CLAUDE.md
diff --git a/AGENTS.md b/AGENTS.md
deleted file mode 100644
index 7251de6..0000000
--- a/AGENTS.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# Repository Guidelines
-
-## Project Structure & Module Organization
-- `app.py`: Streamlit UI (entry point for the app).
-- `main.py`: RSS fetching, rewrite, and WordPress upload logic.
-- `utils/`: Helpers (image/article extraction, WP uploader, UI helpers).
-- `pages/`: Streamlit pages (e.g., `01_feed_manager.py`, `log_viewer.py`).
-- `data/`: JSON state (`articles.json`, `feeds.json`).
-- `logs/`: Runtime logs (`rss_tool.log`).
-- `docs/`: Project notes (e.g., roadmap).
-- `__version__.py`: Version string written by `versioning.py`.
-
-## Build, Test, and Development Commands
-- Create env: `python -m venv .venv && source .venv/bin/activate`
-- Install deps: `pip install -r requirements.txt`
-- Run app: `streamlit run app.py`
-- Version bump: `python versioning.py --level patch --push` (updates `__version__.py`, prepares `CHANGELOG.md`, creates tag; see `--help`).
-
-## Coding Style & Naming Conventions
-- Python 3.10+, PEP 8, 4-space indentation, type hints where practical.
-- Modules and functions: `snake_case`; classes: `PascalCase`.
-- Streamlit pages: numeric prefix for order, e.g., `pages/01_feature.py`.
-- Keep functions small and pure in `utils/`; isolate I/O in app layers.
-- Suggested tools (optional): Black (`black .`) and Ruff (`ruff check .`).
-
-## Testing Guidelines
-- Framework: pytest (recommended). Place tests under `tests/` with `test_*.py`.
-- Unit tests for `utils/*`; light integration checks for `main.py` with temporary files.
-- Run: `pytest -q`. Add coverage if needed (e.g., `pytest --cov=utils`).
-- Test data: avoid mutating files in `data/`; use temp dirs or fixtures.
-
-## Commit & Pull Request Guidelines
-- Commits: imperative mood, concise; examples: `Add feed dedupe`, `Fix WP upload retry`, `Bump version to v1.7.0`.
-- PRs: clear description, linked issue, screenshots/GIFs for UI changes, note env variables touched.
-- Update `CHANGELOG.md` and bump version via `versioning.py` before release PRs.
-
-## Security & Configuration Tips
-- Required env: `OPENAI_API_KEY`, `WP_BASE_URL`, `WP_USERNAME`, `WP_PASSWORD` or `WP_AUTH_BASE64` (see `.env`).
-- Never commit secrets; `.env` is git-ignored. Avoid hardcoded credentials; prefer `os.getenv`.
-- Logs and data may contain content; do not commit `logs/` or large `data/` snapshots.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66b7237..c15f26b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,42 +1,6 @@
-## [1.7.1] - 2025-08-24
-
-### ✨ Security angepasst
- - alle Credentials in die .env Datei verschoben
- - beim Start der App werden die Credentials geprüft und beim fehlen entsprechende Meldungen ausgegeben
-
----
-
## [1.7.0] - 2025-08-24
-### Multi-Select & Massenoperationen:
- - ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich
- - ✅ "Alle auswählen" / "Auswahl aufheben" Buttons
- - ✅ Massenoperationen für ausgewählte Artikel:
- - Bulk Status-Änderung für mehrere Artikel gleichzeitig
- - Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung
- - Bulk WordPress-Upload nur für "Process"-Artikel
- - Bulk Papierkorb-Funktion
-
-### Schnellaktionen Integration:
- - ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar
- - ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert
- - ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln)
-
-### 🔧 Verbesserungen
-
- - UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration
- - Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig
- - Feedback: Detaillierte Statusmeldungen bei Massenoperationen
- - Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl
-
-### 🏗️ Technische Änderungen
-
- - Session State Erweiterung um selected_articles Set
- - Neue Bulk-Operation-Funktionen in app.py:326-467
- - Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design
- - Integration bestehender WordPress-Upload und Rewrite-Funktionen
-
----
+- Beschreibung...
## [1.6.3] - 2025-08-18
diff --git a/README.md b/README.md
index b3c2b4a..0f3d86c 100644
--- a/README.md
+++ b/README.md
@@ -1,63 +1,76 @@
-# rss-news (Rebuild)
+# 📰 RSS News Bot
-`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut.
+Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung.
-Aktueller Stand:
-- Alte Streamlit-App wird nicht produktiv genutzt.
-- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet.
-- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt.
+
+
+
+
-## Ziele
-- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln
-- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen)
-- Zuverlaessige Automatisierung auf Hetzner
-- Publikation nach WordPress (IONOS aktuell, spaeter offen)
-- Zugriff nur nach Login (zunaechst User/Password)
+---
-## Architektur-Richtung (MVP)
-- Backend: `Python + FastAPI`
-- Jobs: Queue-Worker (z. B. Redis + RQ/Celery)
-- Daten: SQLite fuer MVP, spaeter optional PostgreSQL
-- Auth: Session-Login mit einem Admin-User
-- Publishing: WordPress REST API (Status zunaechst `pending`)
+## 🚀 Features
-Details: `docs/PROJECT_PLAN.md`
+- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren)
+- ✍️ **Artikel automatisch umschreiben** mit GPT-4
+- 🏷️ **Tags automatisch generieren**
+- 🖼️ **Bilder aus Originalartikeln extrahieren**
+- 🪄 **Optionales DALL·E-Bild generieren**
+- 🔧 **Bearbeiten von Bildmetadaten**
+- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)**
+- 📜 **Log-Viewer-Seite integriert**
+- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet**
+- 📋 Artikeltabelle mit Status-Filter
+- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern
+- 🪄 Button für KI-Bildgenerierung
-## Projektsteuerung
-- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1`
-- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen.
-- Wiki-Struktur liegt unter `docs/wiki/`.
-## Dokumentation
-- Projektplan: `docs/PROJECT_PLAN.md`
-- ToDo-Liste: `docs/TODO.md`
-- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md`
-- Wiki Home: `docs/wiki/Home.md`
+---
-## Lokale Entwicklung (Legacy-Code)
-Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden:
+## 🧱 Projektstruktur
+
+rss-news/
+├── app.py # Haupt-UI mit Streamlit
+├── main.py # Logik für Feed-Import und Verarbeitung
+├── utils/
+│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren
+│ └── dalle_generator.py # DALL·E-Integration (KI-Bild)
+├── pages/
+│ └── log_viewer.py # UI zur Anzeige der Logs
+├── data/
+│ └── articles.json # Gespeicherte Artikel
+│ └── feeds.json # Gespeicherte Feed-URLs
+├── logs/
+│ └── rss_tool.log # Logging der Verarbeitung
+├── versioning.py # CLI-Tool zur Versionierung & Release
+├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases
+├── __version__.py # Aktuelle Version
+└── CHANGELOG.md # Änderungsprotokoll
+
+
+---
+
+## ⚙️ Installation
```bash
+git clone https://github.com/OliverGiertz/rss-news.git
+cd rss-news
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
-streamlit run app.py
```
-Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt.
+---
-## Deployment-Zielbild
-- Betrieb auf Hetzner
-- Reverse Proxy via CloudPanel/Nginx
-- Produktive Domain: `news.vanityontour.de`
-- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de`
+## Update
+Ein Update-Skript findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca
-## Sicherheit
-- Keine Secrets im Repository
-- `.env` lokal/auf Server, nie committen
-- Auth-Pflicht fuer die neue WebApp
-- spaeter optional: Passkeys/WebAuthn
+```bash
+bash update.sh
+```
-## Rechtlicher Hinweis
-Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig.
+
+## ▶️ Starten der App
+
+streamlit run app.py
diff --git a/__version__.py b/__version__.py
index 6a6d0f8..32d334c 100644
--- a/__version__.py
+++ b/__version__.py
@@ -1 +1 @@
-VERSION = "1.7.1"
+VERSION = "1.7.0"
diff --git a/app.py b/app.py
index f161a65..a6d2dfa 100644
--- a/app.py
+++ b/app.py
@@ -14,7 +14,6 @@ from main import (
from utils.dalle_generator import generate_dalle_image
from utils.wordpress_uploader import WordPressUploader
from utils.css_loader import load_css, apply_dark_theme
-from utils.config import validate_env
import os
from collections import Counter
import time
@@ -30,19 +29,6 @@ st.set_page_config(
load_css()
apply_dark_theme()
-# === Environment-Validierung (.env) ===
-env_check = validate_env()
-if not env_check.get("ok"):
- st.error("🔒 Sicherheits-/Konfigurationshinweis: Bitte .env korrekt konfigurieren.")
- for msg in env_check.get("errors", []):
- st.markdown(f"- ❌ {msg}")
- for msg in env_check.get("warnings", []):
- st.markdown(f"- ⚠️ {msg}")
-elif env_check.get("warnings"):
- st.info("ℹ️ Hinweise zur Konfiguration:")
- for msg in env_check.get("warnings", []):
- st.markdown(f"- ⚠️ {msg}")
-
# === Initialize Session State ===
if 'selected_articles' not in st.session_state:
st.session_state.selected_articles = set()
@@ -942,7 +928,20 @@ with tab6:
""", unsafe_allow_html=True)
- # Sicherheit: Kein Anzeigen sensibler Auth-Details mehr
+ # WordPress Auth Debug (nur für Entwicklung)
+ if st.checkbox("🔧 Debug-Modus (Auth-Details anzeigen)", value=False):
+ st.warning("⚠️ Nur für Entwicklung - zeigt Auth-Details!")
+
+ wp_base64 = os.getenv("WP_AUTH_BASE64", "")
+ if wp_base64:
+ try:
+ import base64
+ decoded = base64.b64decode(wp_base64).decode('utf-8')
+ st.code(f"Base64: {wp_base64}\nDecoded: {decoded}")
+ except Exception as e:
+ st.error(f"Fehler beim Dekodieren: {e}")
+ else:
+ st.info("Kein Base64-String konfiguriert")
# Bulk Upload
st.subheader("📦 Massenupload")
@@ -1063,16 +1062,15 @@ with tab6:
with st.expander("📋 .env-Datei Vorlage", expanded=False):
st.code("""
# WordPress-Konfiguration
-WP_BASE_URL=https://your-site.tld
+WP_BASE_URL=https://vanityontour.de
+WP_USERNAME=your-wp-username
+WP_PASSWORD=your-wp-application-password
-# Entweder Base64 (empfohlen) ODER Benutzername/Passwort (Application Password)
-WP_AUTH_BASE64=
-# Oder alternativ:
-WP_USERNAME=
-WP_PASSWORD=
+# WordPress Base64-Authentifizierung (bevorzugte Methode)
+WP_AUTH_BASE64=dXNlcm5hbWU6YXBwbGljYXRpb25fcGFzc3dvcmQ=
-# OpenAI-Konfiguration (optional für Umschreibung)
-OPENAI_API_KEY=
+# OpenAI-Konfiguration (für Artikel-Umschreibung)
+OPENAI_API_KEY=sk-...
""", language="bash")
with st.expander("🔑 Base64-Authentifizierung verstehen", expanded=False):
@@ -1080,10 +1078,21 @@ OPENAI_API_KEY=
WordPress REST API Authentifizierung:
- Die WordPress REST API nutzt Basic-Auth mit Base64-kodierten Zugangsdaten:
- Authorization: Basic <base64(username:password)>
- Empfehlung: In der .env WP_AUTH_BASE64 setzen (aus username:application_password erzeugt).
- Alternativ können WP_USERNAME und WP_PASSWORD gesetzt werden; dann wird Base64 zur Laufzeit generiert.
+ Die WordPress REST API erfordert eine Base64-kodierte Authentifizierung im Format:
+ Authorization: Basic <base64_encoded_credentials>
+
+ Beispiel für einen Base64-String:
+ • dXNlcm5hbWU6YXBwbGljYXRpb25fcGFzc3dvcmQ=
+ • Dekodiert: username:application_password
+
+ So funktioniert es:
+ 1. Benutzername und Anwendungspasswort werden kombiniert: username:password
+ 2. Dieser String wird Base64-kodiert
+ 3. Im Authorization-Header verwendet: Basic <base64_string>
+
+ Fallback-Verhalten:
+ • Wenn WP_AUTH_BASE64 gesetzt ist → Direkter Base64-String verwendet
+ • Wenn nicht gesetzt → Base64 wird aus WP_USERNAME:WP_PASSWORD generiert
""", unsafe_allow_html=True)
@@ -1106,4 +1115,4 @@ OPENAI_API_KEY=
4. Generiertes Passwort in .env-Datei eintragen
- """, unsafe_allow_html=True)
+ """, unsafe_allow_html=True)
\ No newline at end of file
diff --git a/backend/.env.example b/backend/.env.example
deleted file mode 100644
index c2dd235..0000000
--- a/backend/.env.example
+++ /dev/null
@@ -1,45 +0,0 @@
-# ─── App ────────────────────────────────────────────────────────────────────
-APP_ENV=development
-APP_NAME=rss-news-backend
-APP_SECRET_KEY=replace-with-a-long-random-secret
-APP_DB_PATH=backend/data/rss_news.db
-
-APP_ADMIN_USERNAME=admin
-APP_ADMIN_PASSWORD=change-me
-
-SESSION_COOKIE_NAME=rss_news_session
-SESSION_MAX_AGE_SECONDS=28800
-
-# ─── WordPress ──────────────────────────────────────────────────────────────
-WP_BASE_URL=https://your-site.tld
-WP_USERNAME=your-wp-username
-WP_PASSWORD=your-wp-app-password
-# Status für neue Beiträge: draft | future | publish
-WORDPRESS_DEFAULT_STATUS=draft
-
-# ─── OpenAI ─────────────────────────────────────────────────────────────────
-OPENAI_API_KEY=sk-...
-# gpt-4o-mini empfohlen (Kosten/Qualität)
-OPENAI_MODEL=gpt-4o-mini
-
-# ─── Telegram Bot ────────────────────────────────────────────────────────────
-# Bot-Token von @BotFather
-TELEGRAM_BOT_TOKEN=123456789:ABC...
-# Chat-ID deines persönlichen Chats oder einer Gruppe
-TELEGRAM_CHAT_ID=123456789
-# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen)
-TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars
-
-# ─── N8N API-Key ─────────────────────────────────────────────────────────────
-# Wird von N8N im Header X-API-Key mitgeschickt
-N8N_API_KEY=replace-with-strong-random-key
-
-# ─── Pipeline-Einstellungen ──────────────────────────────────────────────────
-# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100)
-PIPELINE_RELEVANCE_AUTO=80
-# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden
-PIPELINE_RELEVANCE_WARN=60
-# Maximale Drafts/Veröffentlichungen pro Tag
-PIPELINE_MAX_DRAFTS_PER_DAY=2
-# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET)
-PIPELINE_PUBLISH_HOURS=9,14
diff --git a/backend/README.md b/backend/README.md
deleted file mode 100644
index 7d64a65..0000000
--- a/backend/README.md
+++ /dev/null
@@ -1,82 +0,0 @@
-# Backend Skeleton (FastAPI)
-
-Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`.
-
-## Start (lokal)
-
-```bash
-python -m venv .venv
-source .venv/bin/activate
-pip install -r backend/requirements.txt
-uvicorn backend.app.main:app --reload --port 8501
-```
-
-## Admin UI
-- Login: `http://127.0.0.1:8501/admin/login`
-- Dashboard: `http://127.0.0.1:8501/admin/dashboard`
-
-## Environment
-- Datei: `backend/.env`
-- Vorlage: `backend/.env.example`
-
-## Endpoints
-- `GET /health` - Healthcheck
-- `POST /auth/login` - Login mit Admin-User
-- `POST /auth/logout` - Logout
-- `GET /auth/me` - Aktiver User
-- `GET /api/protected` - Geschuetzter Test-Endpoint
-- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler
-- `GET /api/sources` - Quellenliste
-- `POST /api/sources` - Quelle anlegen
-- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle
-- `GET /api/feeds` - Feedliste
-- `POST /api/feeds` - Feed anlegen
-- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed
-- `GET /api/runs` - Import-/Job-Runs anzeigen
-- `GET /api/runs/{run_id}` - Detailansicht eines Runs
-- `POST /api/runs` - Run starten
-- `POST /api/runs/{run_id}/finish` - Run abschliessen
-- `GET /api/articles` - Artikel anzeigen
-- `GET /api/articles/{article_id}` - Artikeldetail
-- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren
-- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln
-- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject)
-- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed)
-
-## Datenbank
-- SQLite-Datei unter `backend/data/rss_news.db`
-- Tabellen werden beim App-Start initialisiert.
-- Tabellen: `sources`, `feeds`, `runs`, `articles`
-- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash`
-
-## Policy-Enforcement
-- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist.
-- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`.
-- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert.
-
-## Review-Workflow
-- Statuskette: `new -> review -> approved -> published`
-- Ablehnung im Review setzt auf `rewrite`
-- Ungueltige Statuswechsel werden per API blockiert
-
-## Verifikation
-```bash
-python -m unittest backend.tests.test_db_repositories
-python -m unittest backend.tests.test_ingestion
-python -m unittest backend.tests.test_api_auth
-```
-
-## CI / Online-Auswertung
-- GitHub Actions Workflow: `.github/workflows/test.yml`
-- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus.
-
-## Hetzner Smoketest
-```bash
-BASE_URL="https://news.vanityontour.de" \
-APP_ADMIN_USERNAME="admin" \
-APP_ADMIN_PASSWORD="..." \
-bash scripts/smoke_backend.sh
-```
-
-## Hinweis
-Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen.
diff --git a/backend/__init__.py b/backend/__init__.py
deleted file mode 100644
index 3623851..0000000
--- a/backend/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Backend package for rss-news rebuild."""
diff --git a/backend/app/__init__.py b/backend/app/__init__.py
deleted file mode 100644
index 18b665e..0000000
--- a/backend/app/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Application package."""
diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py
deleted file mode 100644
index a25199c..0000000
--- a/backend/app/admin_ui.py
+++ /dev/null
@@ -1,1126 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-import re
-import socket
-import ssl
-import time
-from urllib.parse import urlparse
-from urllib.parse import urlencode
-from urllib.request import Request as UrlRequest, urlopen
-
-from fastapi import APIRouter, Form, Request
-from fastapi.responses import HTMLResponse, RedirectResponse, Response
-from fastapi.templating import Jinja2Templates
-
-from .auth import create_session_token, verify_credentials, verify_session_token
-from .config import get_settings
-from .ingestion import run_ingestion
-from .policy import evaluate_source_policy
-from .publisher import enqueue_publish, run_publisher
-from .relevance import article_age_days, article_relevance
-from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
-from .repositories import (
- FeedCreate,
- FeedUpdate,
- SourceCreate,
- SourceUpdate,
- delete_feed,
- delete_source,
- create_feed,
- create_source,
- get_article_by_id,
- get_feed_by_id,
- list_articles,
- list_articles_page,
- bulk_update_wp_post_ids,
- list_feeds,
- list_publish_jobs,
- list_runs,
- list_sources,
- set_article_image_decision,
- upsert_article,
- update_feed,
- update_source,
- update_article_status,
- ArticleUpsert,
-)
-from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
-
-settings = get_settings()
-router = APIRouter(tags=["admin-ui"])
-templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
-ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = {
- "new": ("rewrite", "close"),
- "rewrite": ("publish", "close"),
- "publish": ("published", "close"),
- "published": ("rewrite", "close"),
- "close": ("rewrite",),
-}
-IMAGE_PROXY_USER_AGENT = "rss-news-admin/1.0"
-_UNSET = object()
-
-
-def _admin_user(request: Request) -> str | None:
- token = request.cookies.get(settings.session_cookie_name)
- if not token:
- return None
- return verify_session_token(token)
-
-
-def _to_optional_int(raw: str | None) -> int | None:
- if raw is None:
- return None
- value = raw.strip()
- if value == "":
- return None
- return int(value)
-
-
-def _dashboard_redirect(
- *,
- msg: str | None = None,
- msg_type: str = "success",
- status_filter: str | None = None,
-) -> RedirectResponse:
- query: dict[str, str] = {}
- if msg:
- query["msg"] = msg
- query["type"] = msg_type
- if status_filter:
- query["status_filter"] = status_filter
- suffix = f"?{urlencode(query)}" if query else ""
- return RedirectResponse(url=f"/admin/dashboard{suffix}", status_code=303)
-
-
-def _parse_meta_json(raw: str | None) -> dict:
- if not raw:
- return {}
- try:
- parsed = json.loads(raw)
- return parsed if isinstance(parsed, dict) else {}
- except Exception:
- return {}
-
-
-def _read_article_images(article: dict, extraction: dict) -> list[str]:
- images: list[str] = []
- if article.get("image_urls_json"):
- try:
- parsed_images = json.loads(article["image_urls_json"])
- if isinstance(parsed_images, list):
- images = [str(item) for item in parsed_images if item]
- except Exception:
- images = []
- if not images and isinstance(extraction.get("images"), list):
- images = [str(item) for item in extraction.get("images") if item]
- # deduplicate preserving order
- seen: set[str] = set()
- deduped: list[str] = []
- for image in images:
- if image not in seen:
- seen.add(image)
- deduped.append(image)
- return deduped
-
-
-def _is_probably_irrelevant_image(url: str) -> bool:
- lowered = url.lower()
- patterns = (
- r"logo",
- r"icon",
- r"sprite",
- r"avatar",
- r"favicon",
- r"/ads/",
- r"tracking",
- r"pixel",
- r"banner",
- )
- return any(re.search(pattern, lowered) for pattern in patterns)
-
-
-def _is_http_image_url(url: str) -> bool:
- try:
- parsed = urlparse(url)
- except Exception:
- return False
- return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
-
-
-def _build_image_entries(article: dict, extraction: dict, meta: dict) -> list[dict[str, object]]:
- all_images = _read_article_images(article, extraction)
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- selected_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- excluded_urls = image_review.get("excluded_urls") if isinstance(image_review.get("excluded_urls"), list) else []
- excluded_set = {str(item) for item in excluded_urls if item}
-
- entries: list[dict[str, object]] = []
- for url in all_images:
- entries.append(
- {
- "url": url,
- "proxy_url": f"/admin/images/proxy?{urlencode({'url': url})}",
- "is_selected": selected_url == url,
- "is_excluded": url in excluded_set,
- "is_irrelevant_hint": _is_probably_irrelevant_image(url),
- }
- )
- return entries
-
-
-def _publish_readiness(article: dict, meta: dict) -> tuple[bool, list[str]]:
- reasons: list[str] = []
- if internal_to_ui_status(article.get("status")) not in {"publish", "published"}:
- reasons.append("Status ist nicht 'publish'")
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- if not selected_image:
- reasons.append("Hauptbild nicht ausgewählt")
- return len(reasons) == 0, reasons
-
-
-def _classify_publish_error(error_message: str | None) -> tuple[str, str]:
- text = (error_message or "").lower()
- if not text.strip():
- return "ok", "-"
- if "rechtsfreigabe fehlt" in text or "hauptbild nicht gesetzt" in text or "status ist nicht" in text:
- return "policy", "Artikelvoraussetzungen im UI prüfen (Status/Hauptbild)."
- if "401" in text or "403" in text or "authorization" in text or "forbidden" in text or "unauthorized" in text:
- return "auth", "WordPress Nutzer/App-Passwort prüfen."
- if "404" in text and ("media" in text or "posts" in text or "wp-json" in text):
- return "api", "WordPress REST-Endpunkt prüfen (`/wp-json/wp/v2`)."
- if "timed out" in text or "timeout" in text or "nodename nor servname provided" in text or "name or service not known" in text:
- return "dns", "DNS/Netzwerk zur WordPress-Domain prüfen."
- if "media-upload fehlgeschlagen" in text or "liefert kein bild" in text or "featured_media" in text:
- return "media", "Bild-URL/Format prüfen oder anderes Hauptbild auswählen."
- return "unknown", "Fehlerdetails prüfen und bei Bedarf Job erneut starten."
-
-
-def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]:
- meta = article.get("meta", {})
- extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
- attribution = meta.get("attribution") if isinstance(meta.get("attribution"), dict) else {}
-
- checks: list[dict[str, str]] = []
- checks.append(
- {
- "label": "Original-Link vorhanden",
- "status": "ok" if article.get("source_url") else "missing",
- "value": article.get("source_url") or "-",
- }
- )
- checks.append(
- {
- "label": "Autor vorhanden",
- "status": "ok" if article.get("author") else "missing",
- "value": article.get("author") or "-",
- }
- )
- checks.append(
- {
- "label": "Bilder extrahiert",
- "status": "ok" if article.get("image_urls_json") else "missing",
- "value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0",
- }
- )
- checks.append(
- {
- "label": "Pressekontakt",
- "status": "ok" if article.get("press_contact") else "missing",
- "value": article.get("press_contact") or extraction.get("press_contact") or "-",
- }
- )
- checks.append(
- {
- "label": "Lizenz/Terms",
- "status": "ok" if article.get("source_license_name_snapshot") and article.get("source_terms_url_snapshot") else "missing",
- "value": f"{article.get('source_license_name_snapshot') or attribution.get('source_license_name') or '-'} | {article.get('source_terms_url_snapshot') or attribution.get('source_terms_url') or '-'}",
- }
- )
- checks.append(
- {
- "label": "Risiko-Status Quelle",
- "status": "ok" if (feed and feed.get("source_risk_level") == "green") else "missing",
- "value": feed.get("source_risk_level") if feed else "-",
- }
- )
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- selected_image = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- checks.append(
- {
- "label": "Hauptbild ausgewählt",
- "status": "ok" if selected_image else "missing",
- "value": selected_image or "-",
- }
- )
- return checks
-
-
-def _build_connectivity_targets() -> list[dict[str, str]]:
- targets: list[dict[str, str]] = []
- seen: set[tuple[str, str]] = set()
-
- def add_target(label: str, kind: str, value: str) -> None:
- normalized = (value or "").strip()
- if not normalized:
- return
- key = (kind, normalized.lower())
- if key in seen:
- return
- seen.add(key)
- targets.append({"label": label, "kind": kind, "value": normalized})
-
- add_target("OpenAI API", "host", "api.openai.com")
- if settings.wordpress_base_url:
- parsed = urlparse(settings.wordpress_base_url)
- if parsed.hostname:
- add_target("WordPress Host", "host", parsed.hostname)
- wp_api_url = f"{settings.wordpress_base_url.rstrip('/')}/wp-json/wp/v2"
- add_target("WordPress REST", "url", wp_api_url)
-
- for feed in list_feeds():
- name = (feed.get("name") or "").strip() or f"Feed #{feed.get('id')}"
- feed_url = str(feed.get("url") or "").strip()
- if not feed_url:
- continue
- parsed = urlparse(feed_url)
- if parsed.hostname:
- add_target(f"{name} (Feed)", "host", parsed.hostname)
- add_target(f"{name} (Feed URL)", "url", feed_url)
-
- return targets
-
-
-def _run_connectivity_check(target: dict[str, str]) -> dict[str, object]:
- kind = target.get("kind", "")
- value = str(target.get("value") or "")
- row: dict[str, object] = {
- "label": target.get("label") or "-",
- "kind": kind,
- "target": value,
- "dns_ok": False,
- "dns_info": "-",
- "tcp_ok": False,
- "tcp_info": "-",
- "http_ok": False,
- "http_info": "-",
- "duration_ms": 0,
- "ok": False,
- }
- started = time.perf_counter()
- try:
- hostname = value if kind == "host" else (urlparse(value).hostname or "")
- port = 443
- if kind == "url":
- parsed = urlparse(value)
- if parsed.scheme not in {"http", "https"}:
- row["http_info"] = f"unsupported scheme: {parsed.scheme or '-'}"
- return row
- port = 443 if parsed.scheme == "https" else 80
- if not hostname:
- row["dns_info"] = "host fehlt"
- return row
-
- try:
- addr_info = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
- ips = sorted({entry[4][0] for entry in addr_info if entry and len(entry) > 4 and entry[4]})
- row["dns_ok"] = True
- row["dns_info"] = ", ".join(ips[:3]) if ips else "resolved"
- except Exception as exc:
- row["dns_info"] = str(exc)
- return row
-
- try:
- socket.create_connection((hostname, port), timeout=4).close()
- row["tcp_ok"] = True
- row["tcp_info"] = f"port {port} erreichbar"
- except Exception as exc:
- row["tcp_info"] = str(exc)
- return row
-
- if kind == "host":
- row["http_ok"] = True
- row["http_info"] = "n/a (host-only)"
- row["ok"] = True
- return row
-
- try:
- req = UrlRequest(
- url=value,
- headers={"User-Agent": IMAGE_PROXY_USER_AGENT, "Accept": "*/*"},
- )
- with urlopen(req, timeout=6, context=ssl.create_default_context()) as resp:
- code = getattr(resp, "status", None) or resp.getcode()
- row["http_ok"] = True
- row["http_info"] = f"HTTP {code}"
- except Exception as exc:
- row["http_info"] = str(exc)
- return row
-
- row["ok"] = bool(row["dns_ok"] and row["tcp_ok"] and row["http_ok"])
- return row
- finally:
- row["duration_ms"] = int((time.perf_counter() - started) * 1000)
-
-
-def _upsert_article_from_existing(
- article: dict,
- *,
- content_rewritten: str | None = None,
- status: str | None = None,
- wp_post_id: int | None | object = _UNSET,
- wp_post_url: str | None | object = _UNSET,
- publish_attempts: int | object = _UNSET,
- publish_last_error: str | None | object = _UNSET,
- published_to_wp_at: str | None | object = _UNSET,
- meta_json: str | None | object = _UNSET,
-) -> None:
- rewritten = article.get("content_rewritten") if content_rewritten is None else content_rewritten
- upsert_article(
- ArticleUpsert(
- feed_id=article.get("feed_id"),
- source_article_id=article.get("source_article_id"),
- source_hash=article.get("source_hash"),
- title=article.get("title"),
- source_url=article.get("source_url"),
- canonical_url=article.get("canonical_url"),
- published_at=article.get("published_at"),
- author=article.get("author"),
- summary=article.get("summary"),
- content_raw=article.get("content_raw"),
- content_rewritten=rewritten,
- image_urls_json=article.get("image_urls_json"),
- press_contact=article.get("press_contact"),
- source_name_snapshot=article.get("source_name_snapshot"),
- source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
- source_license_name_snapshot=article.get("source_license_name_snapshot"),
- legal_checked=bool(int(article.get("legal_checked", 0))),
- legal_checked_at=article.get("legal_checked_at"),
- legal_note=article.get("legal_note"),
- wp_post_id=article.get("wp_post_id") if wp_post_id is _UNSET else wp_post_id,
- wp_post_url=article.get("wp_post_url") if wp_post_url is _UNSET else wp_post_url,
- publish_attempts=int(article.get("publish_attempts", 0)) if publish_attempts is _UNSET else publish_attempts,
- publish_last_error=article.get("publish_last_error") if publish_last_error is _UNSET else publish_last_error,
- published_to_wp_at=article.get("published_to_wp_at") if published_to_wp_at is _UNSET else published_to_wp_at,
- word_count=len(str(rewritten or "").split()),
- status=article.get("status") if status is None else status,
- meta_json=article.get("meta_json") if meta_json is _UNSET else meta_json,
- )
- )
-
-
-@router.get("/admin", response_class=HTMLResponse)
-def admin_index(request: Request):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- return RedirectResponse(url="/admin/dashboard", status_code=303)
-
-
-@router.get("/admin/login", response_class=HTMLResponse)
-def admin_login_page(request: Request):
- return templates.TemplateResponse(
- request,
- "admin_login.html",
- {"request": request, "title": "Admin Login", "error": request.query_params.get("error")},
- )
-
-
-@router.post("/admin/login")
-def admin_login(request: Request, username: str = Form(...), password: str = Form(...)):
- if not verify_credentials(username, password):
- return RedirectResponse(url="/admin/login?error=1", status_code=303)
-
- token = create_session_token(username)
- response = RedirectResponse(url="/admin/dashboard", status_code=303)
- response.set_cookie(
- key=settings.session_cookie_name,
- value=token,
- max_age=settings.session_max_age_seconds,
- httponly=True,
- secure=False,
- samesite="lax",
- )
- return response
-
-
-@router.post("/admin/logout")
-def admin_logout():
- response = RedirectResponse(url="/admin/login", status_code=303)
- response.delete_cookie(settings.session_cookie_name)
- return response
-
-
-@router.get("/admin/dashboard", response_class=HTMLResponse)
-def admin_dashboard(request: Request):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- sources = list_sources()
- source_policy = {s["id"]: evaluate_source_policy(s) for s in sources}
- feeds = list_feeds()
- runs = list_runs(limit=30)
- publish_jobs = list_publish_jobs(limit=30)
- for job in publish_jobs:
- category, hint = _classify_publish_error(job.get("error_message"))
- job["error_category"] = category
- job["error_hint"] = hint
- status_filter = request.query_params.get("status_filter")
- internal_filter = ui_to_internal_status(status_filter) if status_filter else None
- if status_filter in set(UI_STATUSES):
- articles = list_articles(limit=100, status_filter=internal_filter)
- else:
- status_filter = ""
- articles = [a for a in list_articles(limit=250) if internal_to_ui_status(a.get("status")) != "close"][:100]
- for article in articles:
- meta = _parse_meta_json(article.get("meta_json"))
- extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
- images = _read_article_images(article, extraction)
- article["meta"] = meta
- ready, reasons = _publish_readiness(article, meta)
- article["publish_ready"] = ready
- article["publish_blockers"] = reasons
- article["extracted_images"] = images
- article["image_entries"] = _build_image_entries(article, extraction, meta)
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- article["selected_image_proxy_url"] = (
- f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None
- )
- if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
- article["press_contact"] = extraction.get("press_contact")
- article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None
- article["days_old"] = article_age_days(article.get("published_at"))
- article["relevance"] = article_relevance(article.get("published_at"))
- article["status_ui"] = internal_to_ui_status(article.get("status"))
- tags = meta.get("generated_tags") if isinstance(meta.get("generated_tags"), list) else []
- article["generated_tags"] = [str(t) for t in tags if t]
-
- return templates.TemplateResponse(
- request,
- "admin_dashboard.html",
- {
- "request": request,
- "title": "Admin Dashboard",
- "user": user,
- "sources": sources,
- "source_policy": source_policy,
- "feeds": feeds,
- "runs": runs,
- "publish_jobs": publish_jobs,
- "articles": articles,
- "status_options": list(UI_STATUSES),
- "allowed_transitions": ALLOWED_TRANSITIONS,
- "status_filter": status_filter,
- "flash_msg": request.query_params.get("msg", ""),
- "flash_type": request.query_params.get("type", "success"),
- },
- )
-
-
-@router.get("/admin/connectivity", response_class=HTMLResponse)
-def admin_connectivity(request: Request):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- checks = [_run_connectivity_check(target) for target in _build_connectivity_targets()]
- ok_count = len([c for c in checks if c.get("ok")])
- error_count = len(checks) - ok_count
- return templates.TemplateResponse(
- request,
- "admin_connectivity.html",
- {
- "request": request,
- "title": "Connectivity Check",
- "user": user,
- "checks": checks,
- "ok_count": ok_count,
- "error_count": error_count,
- },
- )
-
-
-@router.get("/admin/articles/{article_id}", response_class=HTMLResponse)
-def admin_article_detail(request: Request, article_id: int):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- article = get_article_by_id(article_id)
- if not article:
- return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
-
- meta = _parse_meta_json(article.get("meta_json"))
- article["meta"] = meta
- extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
- extraction["images"] = _read_article_images(article, extraction)
- if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
- article["press_contact"] = extraction.get("press_contact")
- article["extraction"] = extraction
- publish_ready, publish_blockers = _publish_readiness(article, meta)
- article["publish_ready"] = publish_ready
- article["publish_blockers"] = publish_blockers
- article["image_selection"] = extraction.get("image_selection") if isinstance(extraction.get("image_selection"), dict) else {}
- article["image_entries"] = _build_image_entries(article, extraction, meta)
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- article["selected_image_url"] = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- article["selected_image_proxy_url"] = (
- f"/admin/images/proxy?{urlencode({'url': article['selected_image_url']})}" if article.get("selected_image_url") else None
- )
- article["days_old"] = article_age_days(article.get("published_at"))
- article["relevance"] = article_relevance(article.get("published_at"))
- article["status_ui"] = internal_to_ui_status(article.get("status"))
- feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None
- checklist = _legal_checklist(article, feed)
-
- return templates.TemplateResponse(
- request,
- "admin_article_detail.html",
- {
- "request": request,
- "title": f"Artikel #{article_id}",
- "user": user,
- "article": article,
- "feed": feed,
- "checklist": checklist,
- "allowed_transitions": ALLOWED_TRANSITIONS.get(article.get("status_ui"), ()),
- "flash_msg": request.query_params.get("msg", ""),
- "flash_type": request.query_params.get("type", "success"),
- },
- )
-
-
-@router.post("/admin/articles/{article_id}/images/decision")
-def admin_article_image_decision(
- request: Request,
- article_id: int,
- image_url: str = Form(...),
- action: str = Form(...),
-):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- ok = set_article_image_decision(article_id=article_id, image_url=image_url, action=action, actor=user)
- if not ok:
- return _dashboard_redirect(msg=f"Bildaktion fehlgeschlagen fuer Artikel #{article_id}", msg_type="error")
- return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303)
-
-
-@router.post("/admin/articles/{article_id}/publish-enqueue")
-def admin_enqueue_publish(request: Request, article_id: int, max_attempts: str = Form("3")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- job_id = enqueue_publish(article_id=article_id, max_attempts=max(1, int(max_attempts)))
- except Exception as exc:
- return _dashboard_redirect(msg=f"Publish Queue Fehler fuer Artikel #{article_id}: {exc}", msg_type="error")
- return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Publish-Job%20#{job_id}%20erstellt&type=success", status_code=303)
-
-
-@router.post("/admin/publisher/run")
-def admin_run_publisher(request: Request, max_jobs: str = Form("10")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- stats = run_publisher(max_jobs=max(1, int(max_jobs)))
- except Exception as exc:
- return _dashboard_redirect(msg=f"Publisher Fehler: {exc}", msg_type="error")
- return _dashboard_redirect(
- msg=f"Publisher: processed={stats.processed}, success={stats.success}, failed={stats.failed}, requeued={stats.requeued}"
- )
-
-
-@router.get("/admin/images/proxy")
-def admin_image_proxy(request: Request, url: str):
- if not _is_http_image_url(url):
- return Response(status_code=400)
-
- try:
- referer = request.headers.get("referer", "")
- req = UrlRequest(
- url=url,
- headers={
- "User-Agent": IMAGE_PROXY_USER_AGENT,
- "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
- "Referer": referer or url,
- },
- )
- with urlopen(req, timeout=10) as resp:
- body = resp.read()
- content_type = resp.headers.get("Content-Type", "application/octet-stream")
- except Exception:
- return Response(status_code=404)
-
- if not content_type.lower().startswith("image/"):
- return Response(status_code=415)
- return Response(content=body, media_type=content_type)
-
-
-@router.post("/admin/sources/create")
-def admin_create_source(
- request: Request,
- name: str = Form(...),
- base_url: str = Form(""),
- terms_url: str = Form(""),
- license_name: str = Form(""),
- risk_level: str = Form("yellow"),
- last_reviewed_at: str = Form(""),
-):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- try:
- create_source(
- SourceCreate(
- name=name,
- base_url=base_url or None,
- terms_url=terms_url or None,
- license_name=license_name or None,
- risk_level=risk_level,
- is_enabled=True,
- notes=None,
- last_reviewed_at=last_reviewed_at or None,
- )
- )
- except Exception as exc:
- return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error")
- return _dashboard_redirect(msg="Quelle gespeichert")
-
-
-@router.post("/admin/sources/{source_id}/update")
-def admin_update_source(
- request: Request,
- source_id: int,
- name: str = Form(...),
- base_url: str = Form(""),
- terms_url: str = Form(""),
- license_name: str = Form(""),
- risk_level: str = Form("yellow"),
- is_enabled: str = Form("1"),
- notes: str = Form(""),
- last_reviewed_at: str = Form(""),
-):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- ok = update_source(
- source_id,
- SourceUpdate(
- name=name,
- base_url=base_url or None,
- terms_url=terms_url or None,
- license_name=license_name or None,
- risk_level=risk_level,
- is_enabled=is_enabled == "1",
- notes=notes or None,
- last_reviewed_at=last_reviewed_at or None,
- ),
- )
- except Exception as exc:
- return _dashboard_redirect(msg=f"Quelle #{source_id} Update fehlgeschlagen: {exc}", msg_type="error")
- if not ok:
- return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error")
- return _dashboard_redirect(msg=f"Quelle #{source_id} aktualisiert")
-
-
-@router.post("/admin/sources/{source_id}/delete")
-def admin_delete_source(request: Request, source_id: int):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- ok = delete_source(source_id)
- if not ok:
- return _dashboard_redirect(msg=f"Quelle #{source_id} nicht gefunden", msg_type="error")
- return _dashboard_redirect(msg=f"Quelle #{source_id} gelöscht")
-
-
-@router.post("/admin/feeds/create")
-def admin_create_feed(
- request: Request,
- name: str = Form(...),
- url: str = Form(...),
- source_id: str = Form(""),
-):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- try:
- create_feed(
- FeedCreate(
- name=name,
- url=url,
- source_id=_to_optional_int(source_id),
- is_enabled=True,
- )
- )
- except Exception as exc:
- return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error")
- return _dashboard_redirect(msg="Feed gespeichert")
-
-
-@router.post("/admin/feeds/{feed_id}/update")
-def admin_update_feed(
- request: Request,
- feed_id: int,
- name: str = Form(...),
- url: str = Form(...),
- source_id: str = Form(""),
- is_enabled: str = Form("1"),
-):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- ok = update_feed(
- feed_id,
- FeedUpdate(
- name=name,
- url=url,
- source_id=_to_optional_int(source_id),
- is_enabled=is_enabled == "1",
- ),
- )
- except Exception as exc:
- return _dashboard_redirect(msg=f"Feed #{feed_id} Update fehlgeschlagen: {exc}", msg_type="error")
- if not ok:
- return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error")
- return _dashboard_redirect(msg=f"Feed #{feed_id} aktualisiert")
-
-
-@router.post("/admin/feeds/{feed_id}/delete")
-def admin_delete_feed(request: Request, feed_id: int):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- ok = delete_feed(feed_id)
- if not ok:
- return _dashboard_redirect(msg=f"Feed #{feed_id} nicht gefunden", msg_type="error")
- return _dashboard_redirect(msg=f"Feed #{feed_id} gelöscht")
-
-
-@router.post("/admin/ingestion/run")
-def admin_run_ingestion(request: Request, feed_id: str = Form("")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- stats = run_ingestion(feed_id=_to_optional_int(feed_id))
- except Exception as exc:
- return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error")
- return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}")
-
-
-@router.post("/admin/articles/{article_id}/review")
-def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- return _dashboard_redirect(msg="Review-Aktion wurde durch Rewrite ersetzt", msg_type="error")
-
-
-@router.post("/admin/articles/{article_id}/rewrite-run")
-def admin_rewrite_run(request: Request, article_id: int):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- article = get_article_by_id(article_id)
- if not article:
- return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
- if internal_to_ui_status(article.get("status")) not in {"new", "rewrite"}:
- return _dashboard_redirect(msg=f"Rewrite nur aus new/rewrite fuer Artikel #{article_id}", msg_type="error")
- try:
- rewritten = rewrite_article_text(article)
- tags = generate_article_tags(article, rewritten_text=rewritten)
- except Exception as exc:
- return _dashboard_redirect(msg=f"Rewrite fehlgeschlagen fuer Artikel #{article_id}: {exc}", msg_type="error")
- merged_meta = merge_generated_tags(article.get("meta_json"), tags)
- _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
- return _dashboard_redirect(msg=f"Rewrite fertig fuer Artikel #{article_id} -> publish")
-
-
-@router.post("/admin/rewrite/run")
-def admin_rewrite_run_batch(request: Request, max_jobs: str = Form("10")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- limit = max(1, min(int(max_jobs), 100))
- except Exception:
- limit = 10
- planned = list_articles(limit=limit, status_filter="rewrite")
- processed = 0
- success = 0
- failed = 0
- for article in planned:
- processed += 1
- try:
- rewritten = rewrite_article_text(article)
- tags = generate_article_tags(article, rewritten_text=rewritten)
- merged_meta = merge_generated_tags(article.get("meta_json"), tags)
- _upsert_article_from_existing(article, content_rewritten=rewritten, status="approved", meta_json=merged_meta)
- success += 1
- except Exception:
- failed += 1
- return _dashboard_redirect(msg=f"Rewrite-Run: processed={processed}, success={success}, failed={failed}")
-
-
-@router.post("/admin/articles/{article_id}/rewrite-save")
-def admin_rewrite_save(request: Request, article_id: int, content_rewritten: str = Form(...)):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- article = get_article_by_id(article_id)
- if not article:
- return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
- text = (content_rewritten or "").strip()
- if not text:
- return RedirectResponse(
- url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20darf%20nicht%20leer%20sein&type=error",
- status_code=303,
- )
- _upsert_article_from_existing(article, content_rewritten=text)
- return RedirectResponse(url=f"/admin/articles/{article_id}?msg=Rewrite-Text%20gespeichert&type=success", status_code=303)
-
-
-@router.post("/admin/articles/{article_id}/reopen")
-def admin_reopen_article(request: Request, article_id: int):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- article = get_article_by_id(article_id)
- if not article:
- return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
- _upsert_article_from_existing(
- article,
- status="rewrite",
- wp_post_id=None,
- wp_post_url=None,
- publish_attempts=0,
- publish_last_error=None,
- published_to_wp_at=None,
- )
- return RedirectResponse(
- url=f"/admin/articles/{article_id}?msg=Artikel%20zurueck%20in%20Rewrite-Workflow%20gesetzt&type=success",
- status_code=303,
- )
-
-
-@router.post("/admin/articles/{article_id}/transition")
-def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")):
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- article = get_article_by_id(article_id)
- if article:
- current_ui = internal_to_ui_status(article.get("status"))
- target_internal = ui_to_internal_status(target_status)
- target_ui = internal_to_ui_status(target_internal)
- if target_ui in ALLOWED_TRANSITIONS.get(current_ui, ()):
- update_article_status(article_id, target_internal, actor=user, note=note or None)
- return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}")
- return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")
-
-
-_PAGE_SIZE = 50
-
-
-@router.get("/admin/article-list", response_class=HTMLResponse)
-def admin_article_list(request: Request):
- """Paginated article list with inline WP ID editing."""
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- page = max(1, int(request.query_params.get("page", 1)))
- status_filter = request.query_params.get("status_filter", "") or None
- search = request.query_params.get("search", "").strip() or None
- offset = (page - 1) * _PAGE_SIZE
-
- articles, total = list_articles_page(
- limit=_PAGE_SIZE, offset=offset,
- status_filter=status_filter, search=search,
- )
-
- # Enrich each article with thumbnail URL
- for a in articles:
- meta = _parse_meta_json(a.get("meta_json"))
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- sel = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
- if not sel:
- sel = (meta.get("extraction") or {}).get("image_selection", {}).get("primary")
- a["thumb_url"] = sel
- a["thumb_proxy"] = f"/admin/images/proxy?{urlencode({'url': sel})}" if sel else None
- raw = (a.get("content_raw") or a.get("summary") or "").strip()
- a["excerpt"] = raw[:120] + "…" if len(raw) > 120 else raw
-
- total_pages = max(1, (total + _PAGE_SIZE - 1) // _PAGE_SIZE)
-
- return templates.TemplateResponse(
- request,
- "admin_article_list.html",
- {
- "request": request,
- "title": "Artikelliste",
- "user": user,
- "articles": articles,
- "page": page,
- "total_pages": total_pages,
- "total": total,
- "page_size": _PAGE_SIZE,
- "status_filter": status_filter or "",
- "search": search or "",
- "flash_msg": request.query_params.get("msg", ""),
- "flash_type": request.query_params.get("type", "success"),
- },
- )
-
-
-@router.post("/admin/article-list/update")
-async def admin_article_list_update(request: Request):
- """Bulk update WP post IDs from the article list form."""
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- form = await request.form()
- updates: list[tuple[int, int | None]] = []
-
- # Form fields: wp_ = new value, orig_ = original value
- for key, new_val in form.items():
- if not key.startswith("wp_"):
- continue
- try:
- article_id = int(key[3:])
- except ValueError:
- continue
- orig_val = str(form.get(f"orig_{article_id}", "")).strip()
- new_val_s = str(new_val).strip()
- if new_val_s == orig_val:
- continue # unchanged
- new_wp_id = int(new_val_s) if new_val_s else None
- updates.append((article_id, new_wp_id))
-
- if updates:
- count = bulk_update_wp_post_ids(updates)
- msg = f"{count} WP-ID(s) aktualisiert. Bitte jetzt WP-Sync ausführen um Slots & URLs zu aktualisieren."
- msg_type = "success"
- else:
- msg = "Keine Änderungen erkannt."
- msg_type = "success"
-
- # Preserve pagination/filter params from referer
- page = form.get("page", "1")
- status_filter = form.get("status_filter", "")
- search = form.get("search", "")
- qs: dict[str, str] = {"msg": msg, "type": msg_type, "page": page}
- if status_filter:
- qs["status_filter"] = status_filter
- if search:
- qs["search"] = search
- return RedirectResponse(url=f"/admin/article-list?{urlencode(qs)}", status_code=303)
-
-
-@router.post("/admin/wp-sync")
-def admin_wp_sync(request: Request):
- """Sync scheduled_publish_at and WP references in the DB from WordPress."""
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
- try:
- from .wordpress import sync_db_from_wordpress
- stats = sync_db_from_wordpress()
- msg = (
- f"WP-Sync abgeschlossen: "
- f"{stats['slot_updated']} Slots aktualisiert, "
- f"{stats['slot_cleared_draft']} Slots geleert (Draft), "
- f"{stats['marked_published']} als veröffentlicht markiert, "
- f"{stats['wp_reference_cleared']} WP-Referenzen gelöscht (Papierkorb), "
- f"{stats['already_in_sync']} bereits synchron."
- )
- return RedirectResponse(url=f"/admin/schedule?msg={msg}&type=success", status_code=303)
- except Exception as exc:
- return RedirectResponse(url=f"/admin/schedule?msg=Sync fehlgeschlagen: {exc}&type=error", status_code=303)
-
-
-@router.post("/admin/articles/{article_id}/retry")
-def admin_retry_article(request: Request, article_id: int):
- """Reset a failed article to 'new' so the pipeline picks it up on next run."""
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- article = get_article_by_id(article_id)
- if not article:
- return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
-
- from .scheduler import release_publish_slot
- release_publish_slot(article_id)
- update_article_status(article_id, "new", actor=user, note="Manuell zurückgesetzt für erneuten Pipeline-Versuch")
- return _dashboard_redirect(
- msg=f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet",
- status_filter="close",
- )
-
-
-@router.get("/admin/schedule", response_class=HTMLResponse)
-def admin_schedule(request: Request):
- """Schedule overview: all booked slots from DB and WordPress."""
- user = _admin_user(request)
- if not user:
- return RedirectResponse(url="/admin/login", status_code=303)
-
- from .scheduler import get_schedule_overview, _preferred_hours, _today_cet
- from datetime import timedelta
-
- slots = get_schedule_overview(lookahead_days=60)
- today = _today_cet()
- hours = _preferred_hours()
-
- # Build a calendar grid: for each day in the next 60 days, show each preferred hour slot
- booked: dict[tuple[str, int], dict] = {(s["date"], s["hour"]): s for s in slots}
- calendar_days = []
- for offset in range(0, 61):
- d = today + timedelta(days=offset)
- d_str = d.isoformat()
- day_slots = []
- for h in hours:
- key = (d_str, h)
- day_slots.append({
- "hour": h,
- "booked": key in booked,
- "slot": booked.get(key),
- })
- calendar_days.append({
- "date": d_str,
- "date_fmt": d.strftime("%d.%m.%Y"),
- "weekday": ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"][d.weekday()],
- "slots": day_slots,
- "any_booked": any(s["booked"] for s in day_slots),
- })
-
- return templates.TemplateResponse(
- request,
- "admin_schedule.html",
- {
- "request": request,
- "title": "Veröffentlichungsplan",
- "user": user,
- "slots": slots,
- "calendar_days": calendar_days,
- "hours": hours,
- "flash_msg": request.query_params.get("msg", ""),
- "flash_type": request.query_params.get("type", "success"),
- },
- )
diff --git a/backend/app/auth.py b/backend/app/auth.py
deleted file mode 100644
index 188397f..0000000
--- a/backend/app/auth.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import hmac
-from typing import Optional
-
-from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
-
-from .config import get_settings
-
-
-def _serializer() -> URLSafeTimedSerializer:
- settings = get_settings()
- return URLSafeTimedSerializer(settings.app_secret_key, salt="rss-news-session")
-
-
-def verify_credentials(username: str, password: str) -> bool:
- settings = get_settings()
- user_ok = hmac.compare_digest(username, settings.app_admin_username)
- pw_ok = hmac.compare_digest(password, settings.app_admin_password)
- return user_ok and pw_ok
-
-
-def create_session_token(username: str) -> str:
- return _serializer().dumps({"username": username})
-
-
-def verify_session_token(token: str) -> Optional[str]:
- settings = get_settings()
- try:
- payload = _serializer().loads(token, max_age=settings.session_max_age_seconds)
- except (BadSignature, SignatureExpired):
- return None
- return payload.get("username")
diff --git a/backend/app/config.py b/backend/app/config.py
deleted file mode 100644
index 24c3902..0000000
--- a/backend/app/config.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from functools import lru_cache
-from pathlib import Path
-
-from dotenv import load_dotenv
-from pydantic import AliasChoices, Field
-from pydantic_settings import BaseSettings, SettingsConfigDict
-
-
-class Settings(BaseSettings):
- # Prefer backend-specific env file to avoid collisions with legacy root .env
- model_config = SettingsConfigDict(
- env_file=("backend/.env", ".env"),
- env_file_encoding="utf-8",
- extra="ignore",
- )
-
- app_env: str = "development"
- app_name: str = "rss-news-backend"
- app_secret_key: str = "replace-with-a-long-random-secret"
-
- app_admin_username: str = "admin"
- app_admin_password: str = "change-me"
-
- session_cookie_name: str = "rss_news_session"
- session_max_age_seconds: int = 28800
-
- app_db_path: str = "backend/data/rss_news.db"
-
- wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL"))
- wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME"))
- wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD"))
- wordpress_default_status: str = "draft"
- openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY"))
- openai_model: str = "gpt-4o-mini"
-
- # Telegram Bot
- telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN"))
- telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID"))
- telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET"))
-
- # N8N API authentication
- n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY"))
-
- # Pipeline behaviour
- pipeline_relevance_auto: int = 80 # >= this: auto-process
- pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
- pipeline_max_drafts_per_day: int = 2
- pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
- pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
- pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
- pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit)
-
-
-@lru_cache(maxsize=1)
-def get_settings() -> Settings:
- # Prefer shared legacy env from the original rss-news workspace if present.
- env_candidates = (
- Path("/Users/oliver/Documents/rss-news/.env"),
- Path("backend/.env"),
- Path(".env"),
- )
- for env_path in env_candidates:
- if env_path.exists():
- load_dotenv(env_path, override=False)
- return Settings()
diff --git a/backend/app/db.py b/backend/app/db.py
deleted file mode 100644
index b6ef898..0000000
--- a/backend/app/db.py
+++ /dev/null
@@ -1,293 +0,0 @@
-import sqlite3
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Any, Iterator
-
-from .config import get_settings
-
-
-def _db_path() -> Path:
- settings = get_settings()
- path = Path(settings.app_db_path)
- path.parent.mkdir(parents=True, exist_ok=True)
- return path
-
-
-@contextmanager
-def get_conn() -> Iterator[sqlite3.Connection]:
- conn = sqlite3.connect(_db_path())
- conn.row_factory = sqlite3.Row
- conn.execute("PRAGMA foreign_keys=ON;")
- try:
- yield conn
- conn.commit()
- finally:
- conn.close()
-
-
-def init_db() -> None:
- with get_conn() as conn:
- conn.executescript(
- """
- PRAGMA journal_mode=WAL;
-
- CREATE TABLE IF NOT EXISTS sources (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- name TEXT NOT NULL,
- base_url TEXT,
- terms_url TEXT,
- license_name TEXT,
- risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
- is_enabled INTEGER NOT NULL DEFAULT 0,
- notes TEXT,
- last_reviewed_at TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- updated_at TEXT NOT NULL DEFAULT (datetime('now'))
- );
-
- CREATE TABLE IF NOT EXISTS feeds (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- source_id INTEGER,
- name TEXT NOT NULL,
- url TEXT NOT NULL UNIQUE,
- is_enabled INTEGER NOT NULL DEFAULT 1,
- etag TEXT,
- last_modified TEXT,
- last_checked_at TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- updated_at TEXT NOT NULL DEFAULT (datetime('now')),
- FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
- );
-
- CREATE TABLE IF NOT EXISTS runs (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- run_type TEXT NOT NULL,
- status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
- started_at TEXT NOT NULL DEFAULT (datetime('now')),
- finished_at TEXT,
- details TEXT
- );
-
- CREATE TABLE IF NOT EXISTS publish_jobs (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- article_id INTEGER NOT NULL,
- status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
- attempts INTEGER NOT NULL DEFAULT 0,
- max_attempts INTEGER NOT NULL DEFAULT 3,
- error_message TEXT,
- wp_post_id INTEGER,
- wp_post_url TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- started_at TEXT,
- finished_at TEXT,
- FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
- );
-
- CREATE TABLE IF NOT EXISTS articles (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- feed_id INTEGER,
- source_article_id TEXT,
- source_hash TEXT,
- title TEXT NOT NULL,
- source_url TEXT NOT NULL,
- canonical_url TEXT,
- published_at TEXT,
- author TEXT,
- summary TEXT,
- content_raw TEXT,
- content_rewritten TEXT,
- image_urls_json TEXT,
- press_contact TEXT,
- source_name_snapshot TEXT,
- source_terms_url_snapshot TEXT,
- source_license_name_snapshot TEXT,
- legal_checked INTEGER NOT NULL DEFAULT 0,
- legal_checked_at TEXT,
- legal_note TEXT,
- wp_post_id INTEGER,
- wp_post_url TEXT,
- publish_attempts INTEGER NOT NULL DEFAULT 0,
- publish_last_error TEXT,
- published_to_wp_at TEXT,
- word_count INTEGER DEFAULT 0,
- status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
- meta_json TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- updated_at TEXT NOT NULL DEFAULT (datetime('now')),
- FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
- UNIQUE(source_url)
- );
-
- CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
- CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
- CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
- ON articles(feed_id, source_article_id)
- WHERE source_article_id IS NOT NULL;
- CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
- ON articles(source_hash)
- WHERE source_hash IS NOT NULL;
- CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
- CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
- CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
- CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
- CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);
-
- CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
- AFTER UPDATE ON sources
- FOR EACH ROW
- BEGIN
- UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
- END;
-
- CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
- AFTER UPDATE ON feeds
- FOR EACH ROW
- BEGIN
- UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
- END;
-
- CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
- AFTER UPDATE ON articles
- FOR EACH ROW
- BEGIN
- UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
- END;
- """
- )
-
- # Lightweight migration for existing DBs created before source_hash was introduced.
- existing_columns = {
- row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
- }
- migration_columns = {
- "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER",
- "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT",
- "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
- "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
- "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
- "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
- "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
- "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
- "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
- "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
- "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
- "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER",
- "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT",
- "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0",
- "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT",
- "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT",
- }
- for column, ddl in migration_columns.items():
- if column not in existing_columns:
- conn.execute(ddl)
-
- # Migration: add 'no_image' to the status CHECK constraint if not present.
- # SQLite cannot modify CHECK constraints in-place, so we recreate the table.
- table_sql_row = conn.execute(
- "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
- ).fetchone()
- if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
- conn.executescript(
- """
- PRAGMA foreign_keys=OFF;
-
- CREATE TABLE articles_v2 (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- feed_id INTEGER,
- source_article_id TEXT,
- source_hash TEXT,
- title TEXT NOT NULL,
- source_url TEXT NOT NULL,
- canonical_url TEXT,
- published_at TEXT,
- author TEXT,
- summary TEXT,
- content_raw TEXT,
- content_rewritten TEXT,
- image_urls_json TEXT,
- press_contact TEXT,
- source_name_snapshot TEXT,
- source_terms_url_snapshot TEXT,
- source_license_name_snapshot TEXT,
- legal_checked INTEGER NOT NULL DEFAULT 0,
- legal_checked_at TEXT,
- legal_note TEXT,
- wp_post_id INTEGER,
- wp_post_url TEXT,
- publish_attempts INTEGER NOT NULL DEFAULT 0,
- publish_last_error TEXT,
- published_to_wp_at TEXT,
- word_count INTEGER DEFAULT 0,
- status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
- meta_json TEXT,
- relevance_score INTEGER,
- scheduled_publish_at TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- updated_at TEXT NOT NULL DEFAULT (datetime('now')),
- FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
- UNIQUE(source_url)
- );
-
- INSERT INTO articles_v2 SELECT
- id, feed_id, source_article_id, source_hash, title, source_url,
- canonical_url, published_at, author, summary, content_raw,
- content_rewritten, image_urls_json, press_contact,
- source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
- legal_checked, legal_checked_at, legal_note,
- wp_post_id, wp_post_url, publish_attempts, publish_last_error,
- published_to_wp_at, word_count, status, meta_json,
- relevance_score, scheduled_publish_at, created_at, updated_at
- FROM articles;
-
- DROP TABLE articles;
- ALTER TABLE articles_v2 RENAME TO articles;
-
- CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
- CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
- CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
- ON articles(feed_id, source_article_id)
- WHERE source_article_id IS NOT NULL;
- CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
- ON articles(source_hash)
- WHERE source_hash IS NOT NULL;
- CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
- CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
-
- CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
- AFTER UPDATE ON articles
- FOR EACH ROW
- BEGIN
- UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
- END;
-
- PRAGMA foreign_keys=ON;
- """
- )
-
- table_rows = conn.execute(
- "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'publish_jobs'"
- ).fetchall()
- if not table_rows:
- conn.executescript(
- """
- CREATE TABLE IF NOT EXISTS publish_jobs (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- article_id INTEGER NOT NULL,
- status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
- attempts INTEGER NOT NULL DEFAULT 0,
- max_attempts INTEGER NOT NULL DEFAULT 3,
- error_message TEXT,
- wp_post_id INTEGER,
- wp_post_url TEXT,
- created_at TEXT NOT NULL DEFAULT (datetime('now')),
- started_at TEXT,
- finished_at TEXT,
- FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
- );
- CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);
- """
- )
-
-
-def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
- return [dict(r) for r in rows]
diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py
deleted file mode 100644
index 391af92..0000000
--- a/backend/app/ingestion.py
+++ /dev/null
@@ -1,486 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-from datetime import datetime, timedelta, timezone
-import hashlib
-import json
-import re
-import time
-from typing import Any
-from urllib.parse import unquote, urlencode, urlparse, parse_qs
-import urllib.error
-import urllib.request as _urllib_req
-
-import feedparser
-
-from .repositories import (
- ArticleUpsert,
- RunCreate,
- create_run,
- find_existing_article_for_upsert,
- finish_run,
- get_feed_by_id,
- list_enabled_feeds,
- update_feed_fetch_state,
- upsert_article,
-)
-from .source_extraction import extract_article, extracted_article_to_meta
-
-
-@dataclass(frozen=True)
-class IngestionStats:
- run_id: int
- feeds_processed: int
- entries_seen: int
- articles_upserted: int
- status: str
- message: str
-
-
-MAX_FEED_FETCH_RETRIES = 3
-
-
-def _normalize_article_url(url: str) -> str:
- """Strip AMP and tracking query parameters from article URLs.
-
- Removes ?outputType=valid_amp and other AMP/tracking params so that
- AMP and non-AMP versions of the same article are deduplicated.
- """
- _AMP_PARAMS = {"outputtype", "amp", "outputformat"}
- try:
- from urllib.parse import parse_qs, urlencode
- parsed = urlparse(url)
- if not parsed.query:
- return url
- params = parse_qs(parsed.query, keep_blank_values=True)
- filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
- new_query = urlencode(filtered, doseq=True)
- return parsed._replace(query=new_query).geturl()
- except Exception:
- return url
-
-
-def _resolve_google_redirect(url: str) -> str:
- """Extract the real article URL from Google redirect URLs.
-
- Google Alerts feed entries use tracking links like:
- https://www.google.com/url?rct=j&sa=t&url=&ct=ga&...
-
- This function returns the decoded real URL if detected, otherwise the
- original URL unchanged.
- """
- try:
- parsed = urlparse(url)
- host = (parsed.hostname or "").lower()
- if host not in ("www.google.com", "google.com"):
- return url
- if parsed.path not in ("/url", "/url/"):
- return url
- params = parse_qs(parsed.query, keep_blank_values=False)
- real_urls = params.get("url")
- if real_urls:
- return unquote(real_urls[0])
- except Exception:
- pass
- return url
-
-
-def _entry_published_iso(entry: dict) -> str | None:
- published = entry.get("published_parsed") or entry.get("updated_parsed")
- if not published:
- return None
- return datetime(*published[:6], tzinfo=timezone.utc).isoformat()
-
-
-def _entry_text(entry: dict) -> tuple[str, str]:
- summary = entry.get("summary", "") or ""
- content = ""
- if entry.get("content") and isinstance(entry.get("content"), list):
- first = entry["content"][0]
- content = first.get("value", "") if isinstance(first, dict) else ""
- if not content:
- content = summary
- return summary, content
-
-
-def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
- source_id = entry.get("id") or entry.get("guid") or ""
- published = _entry_published_iso(entry) or ""
- fingerprint = f"{feed_id}|{source_id}|{link}|{title.strip()}|{summary.strip()}|{published}"
- return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
-
-
-def _parsed_get(parsed: object, key: str, default: object = None) -> object:
- if isinstance(parsed, dict):
- return parsed.get(key, default)
- return getattr(parsed, key, default)
-
-
-def _normalize_tokens(text: str) -> set[str]:
- normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
- return {token for token in normalized.split() if len(token) >= 4}
-
-
-def _probe_image_url(url: str, timeout: int = 5) -> bool:
- """Return True if URL responds without a 4xx/5xx error (HEAD request).
-
- Returns True on network/connection errors so that a flaky server does not
- cause a valid image to be silently dropped.
- """
- try:
- req = _urllib_req.Request(
- url,
- method="HEAD",
- headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
- )
- with _urllib_req.urlopen(req, timeout=timeout) as resp:
- return resp.status < 400
- except urllib.error.HTTPError as exc:
- return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
- except Exception:
- return True # network error → don't filter, let WP try later
-
-
-def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
- source_host = (urlparse(source_url).hostname or "").lower()
- is_presseportal = "presseportal.de" in source_host
- title_tokens = _normalize_tokens(title)
- blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
- # Known placeholder/default images that should never be used as featured image
- placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
-
-
- ranked: list[dict[str, Any]] = []
- for url in images:
- # Skip inline data: URIs (e.g. base64-encoded SVG placeholders)
- if url.startswith("data:"):
- continue
-
- parsed = urlparse(url)
- path = unquote(parsed.path.lower())
- full = f"{parsed.netloc.lower()}{path}"
- score = 0
- reasons: list[str] = []
-
- if any(token in full for token in placeholder_patterns):
- score -= 300
- reasons.append("placeholder-image")
-
- if any(token in full for token in blocked_patterns):
- score -= 150
- reasons.append("blocked-pattern")
-
- if is_presseportal and "/thumbnail/story_big/" in path:
- score += 120
- reasons.append("presseportal-story-big")
- elif is_presseportal and "/thumbnail/highlight/" in path:
- score += 45
- reasons.append("presseportal-highlight")
- elif is_presseportal and "/thumbnail/liste/" in path:
- score -= 40
- reasons.append("presseportal-list")
-
- if "crop=" in (parsed.query or "").lower():
- score -= 10
- reasons.append("cropped-preview")
-
- path_tokens = _normalize_tokens(path.replace("-", " "))
- overlap = len(title_tokens.intersection(path_tokens))
- if overlap > 0:
- score += min(30, overlap * 6)
- reasons.append(f"title-match:{overlap}")
-
- ranked.append({"url": url, "score": score, "reasons": reasons})
-
- ranked.sort(key=lambda item: item["score"], reverse=True)
- return ranked
-
-
-def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
- # dedupe incoming order first
- deduped: list[str] = []
- seen: set[str] = set()
- for image in images:
- if image and image not in seen:
- seen.add(image)
- deduped.append(image)
-
- ranked = _rank_image_candidates(source_url, title, deduped)
- candidates = [item["url"] for item in ranked if item["score"] > -100]
-
- # Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx).
- # Network errors are treated as OK to avoid false negatives on flaky servers.
- primary = None
- kept: list[str] = []
- for url in candidates[:4]:
- if _probe_image_url(url):
- if primary is None:
- primary = url
- kept.append(url)
- if len(kept) >= max_keep:
- break
-
- # Fallback: if all probes failed with network errors, use best candidate anyway
- if not kept and candidates:
- primary = candidates[0]
- kept = candidates[:max_keep]
-
- return kept, primary, ranked
-
-
-def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str:
- meta: dict[str, Any] = {}
- if existing_meta_json:
- try:
- parsed = json.loads(existing_meta_json)
- if isinstance(parsed, dict):
- meta = parsed
- except Exception:
- meta = {}
- meta["attribution"] = attribution
- meta["extraction"] = extraction_meta
- return json.dumps(meta, ensure_ascii=False)
-
-
-def run_ingestion(feed_id: int | None = None) -> IngestionStats:
- run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
- feeds_processed = 0
- entries_seen = 0
- articles_upserted = 0
- feed_results: list[dict[str, object]] = []
-
- try:
- if feed_id is not None:
- feed = get_feed_by_id(feed_id)
- feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
- else:
- feeds = list_enabled_feeds()
-
- for feed in feeds:
- if not feed:
- continue
- feeds_processed += 1
-
- parsed = None
- feed_error = None
- for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
- try:
- parsed = feedparser.parse(
- feed["url"],
- etag=feed.get("etag"),
- modified=feed.get("last_modified"),
- )
- break
- except Exception as exc:
- feed_error = str(exc)
- if attempt < MAX_FEED_FETCH_RETRIES:
- time.sleep(0.5 * attempt)
-
- if parsed is None:
- feed_results.append(
- {
- "feed_id": int(feed["id"]),
- "feed_url": feed["url"],
- "status": "failed",
- "error": feed_error or "unknown",
- "entries_seen": 0,
- "upserts": 0,
- }
- )
- continue
-
- # Persist ETag/Last-Modified for conditional requests.
- parsed_etag = _parsed_get(parsed, "etag")
- parsed_modified = _parsed_get(parsed, "modified")
- if parsed_modified and not isinstance(parsed_modified, str):
- parsed_modified = str(parsed_modified)
- update_feed_fetch_state(
- feed_id=int(feed["id"]),
- etag=parsed_etag if isinstance(parsed_etag, str) else None,
- last_modified=parsed_modified if isinstance(parsed_modified, str) else None,
- )
-
- feed_entries_seen = 0
- feed_upserts = 0
- from .config import get_settings as _get_settings
- _max_age_days = _get_settings().pipeline_max_article_age_days
- for entry in _parsed_get(parsed, "entries", []):
- entries_seen += 1
- feed_entries_seen += 1
- link = entry.get("link")
- if not link:
- continue
-
- # Age filter: skip articles older than max_age_days (0 = no limit)
- if _max_age_days > 0:
- published_iso = _entry_published_iso(entry)
- if published_iso:
- try:
- published_dt = datetime.fromisoformat(published_iso)
- age = datetime.now(timezone.utc) - published_dt
- if age > timedelta(days=_max_age_days):
- continue
- except Exception:
- pass # can't parse date → allow through
-
- # Resolve Google redirect URLs (google.com/url?...&url=&...)
- link = _resolve_google_redirect(link)
- # Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
- link = _normalize_article_url(link)
-
- summary, content_raw = _entry_text(entry)
- # Strip HTML tags from title (Google Alerts wraps matched keywords in )
- raw_title = entry.get("title") or "Ohne Titel"
- title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel"
- extracted = extract_article(link)
-
- final_title = extracted.title or title
- final_author = extracted.author or entry.get("author")
- final_summary = extracted.summary or (summary[:1000] if summary else None)
- final_content_raw = extracted.content_text or content_raw
- final_canonical = extracted.canonical_url or entry.get("link")
- selected_images, primary_image, ranked_images = _select_relevant_images(
- link,
- final_title,
- extracted.images,
- max_keep=3,
- )
-
- source_hash = _entry_hash(
- entry,
- int(feed["id"]),
- link,
- final_title,
- final_summary or "",
- )
- attribution = {
- "source_name": feed.get("source_name"),
- "source_base_url": feed.get("source_base_url"),
- "source_terms_url": feed.get("source_terms_url"),
- "source_license_name": feed.get("source_license_name"),
- "source_risk_level": feed.get("source_risk_level"),
- "original_link": link,
- "feed_name": feed.get("name"),
- "feed_id": int(feed["id"]),
- "imported_at": datetime.now(timezone.utc).isoformat(),
- }
- extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
- extraction_meta["fetched_from"] = link
- extraction_meta["image_selection"] = {
- "primary": primary_image,
- "selected_count": len(selected_images),
- "total_candidates": len(extracted.images),
- "ranked": ranked_images,
- }
- base_payload = ArticleUpsert(
- feed_id=int(feed["id"]),
- source_article_id=entry.get("id") or entry.get("guid"),
- source_hash=source_hash,
- title=final_title,
- source_url=link,
- canonical_url=final_canonical,
- published_at=_entry_published_iso(entry),
- author=final_author,
- summary=final_summary,
- content_raw=final_content_raw,
- content_rewritten=None,
- image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
- press_contact=extracted.press_contact,
- source_name_snapshot=feed.get("source_name"),
- source_terms_url_snapshot=feed.get("source_terms_url"),
- source_license_name_snapshot=feed.get("source_license_name"),
- legal_checked=False,
- legal_checked_at=None,
- legal_note=None,
- wp_post_id=None,
- wp_post_url=None,
- publish_attempts=0,
- publish_last_error=None,
- published_to_wp_at=None,
- word_count=len((final_content_raw or "").split()),
- status="new",
- meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
- )
- existing = find_existing_article_for_upsert(base_payload)
- if existing and existing.get("status") == "error":
- # Explicitly closed article: ignore on subsequent ingestion runs.
- continue
-
- payload = base_payload
- if existing:
- payload = ArticleUpsert(
- feed_id=base_payload.feed_id,
- source_article_id=base_payload.source_article_id,
- source_hash=base_payload.source_hash,
- title=base_payload.title,
- source_url=base_payload.source_url,
- canonical_url=base_payload.canonical_url,
- published_at=base_payload.published_at,
- author=base_payload.author,
- summary=base_payload.summary,
- content_raw=base_payload.content_raw,
- content_rewritten=existing.get("content_rewritten"),
- image_urls_json=base_payload.image_urls_json,
- press_contact=base_payload.press_contact or existing.get("press_contact"),
- source_name_snapshot=base_payload.source_name_snapshot,
- source_terms_url_snapshot=base_payload.source_terms_url_snapshot,
- source_license_name_snapshot=base_payload.source_license_name_snapshot,
- legal_checked=bool(int(existing.get("legal_checked", 0))),
- legal_checked_at=existing.get("legal_checked_at"),
- legal_note=existing.get("legal_note"),
- wp_post_id=existing.get("wp_post_id"),
- wp_post_url=existing.get("wp_post_url"),
- publish_attempts=int(existing.get("publish_attempts", 0)),
- publish_last_error=existing.get("publish_last_error"),
- published_to_wp_at=existing.get("published_to_wp_at"),
- word_count=base_payload.word_count,
- status=existing.get("status") or "new",
- meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta),
- )
-
- article_id = upsert_article(payload)
- if article_id:
- articles_upserted += 1
- feed_upserts += 1
-
- feed_results.append(
- {
- "feed_id": int(feed["id"]),
- "feed_url": feed["url"],
- "status": "success",
- "entries_seen": feed_entries_seen,
- "upserts": feed_upserts,
- }
- )
-
- finish_run(
- run_id=run_id,
- status="success",
- details=json.dumps(
- {
- "feeds_processed": feeds_processed,
- "entries_seen": entries_seen,
- "upserts": articles_upserted,
- "feeds": feed_results,
- },
- ensure_ascii=False,
- ),
- )
- return IngestionStats(
- run_id=run_id,
- feeds_processed=feeds_processed,
- entries_seen=entries_seen,
- articles_upserted=articles_upserted,
- status="success",
- message="Ingestion abgeschlossen",
- )
- except Exception as exc:
- finish_run(run_id=run_id, status="failed", details=str(exc))
- return IngestionStats(
- run_id=run_id,
- feeds_processed=feeds_processed,
- entries_seen=entries_seen,
- articles_upserted=articles_upserted,
- status="failed",
- message=str(exc),
- )
diff --git a/backend/app/main.py b/backend/app/main.py
deleted file mode 100644
index b4776af..0000000
--- a/backend/app/main.py
+++ /dev/null
@@ -1,727 +0,0 @@
-import asyncio
-from contextlib import asynccontextmanager
-import csv
-from datetime import datetime, timezone
-import io
-import json
-import logging
-from pathlib import Path
-
-from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel, Field
-from fastapi.staticfiles import StaticFiles
-
-from .admin_ui import router as admin_router
-from .auth import create_session_token, verify_credentials, verify_session_token
-from .config import get_settings
-from .db import init_db
-from .ingestion import run_ingestion
-from .pipeline import run_auto_pipeline
-from .policy import evaluate_source_policy, is_source_allowed
-from .publisher import enqueue_publish, run_publisher
-from .relevance import article_age_days, article_relevance
-from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
-from .telegram_bot import handle_update, setup_webhook
-from .repositories import (
- ArticleUpsert,
- FeedCreate,
- RunCreate,
- SourceCreate,
- create_feed as repo_create_feed,
- create_run,
- create_source as repo_create_source,
- finish_run,
- get_article_by_id,
- get_feed_by_id,
- get_run_by_id,
- get_source_by_id,
- list_publish_jobs,
- list_articles as repo_list_articles,
- list_feeds as repo_list_feeds,
- list_runs,
- list_sources as repo_list_sources,
- set_article_legal_review,
- update_article_status,
- upsert_article as repo_upsert_article,
-)
-from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
-
-settings = get_settings()
-
-
-@asynccontextmanager
-async def app_lifespan(_: FastAPI):
- init_db()
- yield
-
-
-app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
-app.include_router(admin_router)
-app.mount(
- "/admin/static",
- StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
- name="admin-static",
-)
-
-
-class LoginRequest(BaseModel):
- username: str
- password: str
-
-
-class SourceCreateRequest(BaseModel):
- name: str = Field(min_length=1, max_length=200)
- base_url: str | None = None
- terms_url: str | None = None
- license_name: str | None = None
- risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
- is_enabled: bool = False
- notes: str | None = None
- last_reviewed_at: str | None = None
-
-
-class FeedCreateRequest(BaseModel):
- name: str = Field(min_length=1, max_length=200)
- url: str = Field(min_length=5, max_length=1000)
- source_id: int | None = None
- is_enabled: bool = True
-
-
-class RunCreateRequest(BaseModel):
- run_type: str = Field(min_length=2, max_length=100)
- status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
- details: str | None = None
-
-
-class RunFinishRequest(BaseModel):
- status: str = Field(pattern="^(success|failed)$")
- details: str | None = None
-
-
-class ArticleUpsertRequest(BaseModel):
- feed_id: int | None = None
- source_article_id: str | None = None
- source_hash: str | None = None
- title: str = Field(min_length=1, max_length=500)
- source_url: str = Field(min_length=5, max_length=2000)
- canonical_url: str | None = None
- published_at: str | None = None
- author: str | None = None
- summary: str | None = None
- content_raw: str | None = None
- content_rewritten: str | None = None
- image_urls_json: str | None = None
- press_contact: str | None = None
- source_name_snapshot: str | None = None
- source_terms_url_snapshot: str | None = None
- source_license_name_snapshot: str | None = None
- legal_checked: bool = False
- legal_checked_at: str | None = None
- legal_note: str | None = None
- wp_post_id: int | None = None
- wp_post_url: str | None = None
- publish_attempts: int = 0
- publish_last_error: str | None = None
- published_to_wp_at: str | None = None
- word_count: int = 0
- status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
- meta_json: str | None = None
-
-
-class IngestionRunRequest(BaseModel):
- feed_id: int | None = None
-
-
-class ArticleTransitionRequest(BaseModel):
- target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
- note: str | None = None
-
-
-class ArticleReviewRequest(BaseModel):
- decision: str = Field(pattern="^(approve|reject)$")
- note: str | None = None
-
-
-class ArticleLegalReviewRequest(BaseModel):
- approved: bool
- note: str | None = None
-
-
-class PublisherEnqueueRequest(BaseModel):
- article_id: int
- max_attempts: int = 3
-
-
-class PublisherRunRequest(BaseModel):
- max_jobs: int = 10
-
-
-ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
- "new": {"rewrite", "error"},
- "rewrite": {"approved", "error"},
- "approved": {"published", "error"},
- "published": {"error"},
- "error": {"rewrite"},
-}
-
-
-def require_auth(request: Request) -> str:
- token = request.cookies.get(settings.session_cookie_name)
- if not token:
- raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")
-
- username = verify_session_token(token)
- if not username:
- raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
-
- return username
-
-
-@app.get("/health")
-def health() -> dict:
- return {"status": "ok", "service": settings.app_name, "db_path": settings.app_db_path}
-
-
-@app.post("/auth/login")
-def login(payload: LoginRequest, response: Response) -> dict:
- if not verify_credentials(payload.username, payload.password):
- raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")
-
- token = create_session_token(payload.username)
- response.set_cookie(
- key=settings.session_cookie_name,
- value=token,
- max_age=settings.session_max_age_seconds,
- httponly=True,
- secure=False,
- samesite="lax",
- )
- return {"ok": True, "username": payload.username}
-
-
-@app.post("/auth/logout")
-def logout(response: Response) -> dict:
- response.delete_cookie(settings.session_cookie_name)
- return {"ok": True}
-
-
-@app.get("/auth/me")
-def me(username: str = Depends(require_auth)) -> dict:
- return {"authenticated": True, "username": username}
-
-
-@app.get("/api/protected")
-def protected(username: str = Depends(require_auth)) -> dict:
- return {"ok": True, "message": "Protected endpoint", "username": username}
-
-
-@app.get("/api/pipeline/status")
-def pipeline_status(username: str = Depends(require_auth)) -> dict:
- feeds_total = len(repo_list_feeds())
- sources_total = len(repo_list_sources())
- articles_total = len(repo_list_articles(limit=500))
- return {
- "ok": True,
- "stage": "skeleton+db",
- "requested_by": username,
- "counts": {
- "sources": sources_total,
- "feeds": feeds_total,
- "articles": articles_total,
- },
- }
-
-
-@app.get("/api/sources")
-def list_sources(username: str = Depends(require_auth)) -> dict:
- return {"ok": True, "items": repo_list_sources(), "requested_by": username}
-
-
-@app.get("/api/sources/{source_id}/policy-check")
-def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
- source = get_source_by_id(source_id)
- if not source:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")
- issues = evaluate_source_policy(source)
- return {
- "ok": True,
- "source_id": source_id,
- "allowed": is_source_allowed(source),
- "issues": issues,
- "requested_by": username,
- }
-
-
-@app.post("/api/sources")
-def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
- source_id = repo_create_source(
- SourceCreate(
- name=payload.name,
- base_url=payload.base_url,
- terms_url=payload.terms_url,
- license_name=payload.license_name,
- risk_level=payload.risk_level,
- is_enabled=payload.is_enabled,
- notes=payload.notes,
- last_reviewed_at=payload.last_reviewed_at,
- )
- )
- return {"ok": True, "id": source_id, "requested_by": username}
-
-
-@app.get("/api/feeds")
-def list_feeds(username: str = Depends(require_auth)) -> dict:
- return {"ok": True, "items": repo_list_feeds(), "requested_by": username}
-
-
-@app.get("/api/feeds/{feed_id}/policy-check")
-def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
- feed = get_feed_by_id(feed_id)
- if not feed:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")
-
- source_snapshot = {
- "id": feed.get("source_id"),
- "name": feed.get("source_name"),
- "base_url": feed.get("source_base_url"),
- "terms_url": feed.get("source_terms_url"),
- "license_name": feed.get("source_license_name"),
- "risk_level": feed.get("source_risk_level"),
- "last_reviewed_at": feed.get("source_last_reviewed_at"),
- "is_enabled": feed.get("source_is_enabled"),
- }
- issues = evaluate_source_policy(source_snapshot)
- return {
- "ok": True,
- "feed_id": feed_id,
- "allowed": len(issues) == 0,
- "issues": issues,
- "requested_by": username,
- }
-
-
-@app.post("/api/feeds")
-def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
- try:
- feed_id = repo_create_feed(
- FeedCreate(
- name=payload.name,
- url=payload.url,
- source_id=payload.source_id,
- is_enabled=payload.is_enabled,
- )
- )
- except Exception as exc:
- raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc
-
- return {"ok": True, "id": feed_id, "requested_by": username}
-
-
-@app.get("/api/runs")
-def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
- return {"ok": True, "items": list_runs(limit=limit), "requested_by": username}
-
-
-@app.get("/api/runs/{run_id}")
-def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
- run = get_run_by_id(run_id)
- if not run:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
- return {"ok": True, "item": run, "requested_by": username}
-
-
-@app.post("/api/runs")
-def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
- run_id = create_run(RunCreate(run_type=payload.run_type, status=payload.status, details=payload.details))
- return {"ok": True, "id": run_id, "requested_by": username}
-
-
-@app.post("/api/runs/{run_id}/finish")
-def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
- finish_run(run_id=run_id, status=payload.status, details=payload.details)
- return {"ok": True, "id": run_id, "requested_by": username}
-
-
-@app.get("/api/articles")
-def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
- internal_filter = ui_to_internal_status(status_filter) if status_filter else None
- items = repo_list_articles(limit=limit, status_filter=internal_filter)
- for item in items:
- item["status_ui"] = internal_to_ui_status(item.get("status"))
- return {"ok": True, "items": items, "requested_by": username}
-
-
-@app.get("/api/articles/export")
-def api_export_articles(
- format: str = "json",
- status_filter: str | None = None,
- username: str = Depends(require_auth),
-):
- internal_filter = ui_to_internal_status(status_filter) if status_filter else None
- articles = repo_list_articles(limit=500, status_filter=internal_filter)
- rows = []
- for article in articles:
- meta: dict = {}
- if article.get("meta_json"):
- try:
- parsed = json.loads(article["meta_json"])
- if isinstance(parsed, dict):
- meta = parsed
- except Exception:
- meta = {}
- image_review = meta.get("image_review") if isinstance(meta.get("image_review"), dict) else {}
- selected_image_url = image_review.get("selected_url") if isinstance(image_review.get("selected_url"), str) else None
-
- days_old = article_age_days(article.get("published_at"))
- rows.append(
- {
- "id": article.get("id"),
- "title": article.get("title"),
- "status": article.get("status"),
- "published_at": article.get("published_at"),
- "days_old": days_old,
- "relevance": article_relevance(article.get("published_at")),
- "author": article.get("author"),
- "source_url": article.get("source_url"),
- "canonical_url": article.get("canonical_url"),
- "source_name_snapshot": article.get("source_name_snapshot"),
- "source_license_name_snapshot": article.get("source_license_name_snapshot"),
- "source_terms_url_snapshot": article.get("source_terms_url_snapshot"),
- "press_contact": article.get("press_contact"),
- "image_urls_json": article.get("image_urls_json"),
- "selected_image_url": selected_image_url,
- "legal_checked": bool(int(article.get("legal_checked", 0))),
- "legal_checked_at": article.get("legal_checked_at"),
- "legal_note": article.get("legal_note"),
- }
- )
-
- generated_at = datetime.now(timezone.utc).isoformat()
- if format == "csv":
- out = io.StringIO()
- fieldnames = [
- "id",
- "title",
- "status",
- "published_at",
- "days_old",
- "relevance",
- "author",
- "source_url",
- "canonical_url",
- "source_name_snapshot",
- "source_license_name_snapshot",
- "source_terms_url_snapshot",
- "press_contact",
- "image_urls_json",
- "selected_image_url",
- "legal_checked",
- "legal_checked_at",
- "legal_note",
- ]
- writer = csv.DictWriter(out, fieldnames=fieldnames)
- writer.writeheader()
- writer.writerows(rows)
- return Response(
- content=out.getvalue(),
- media_type="text/csv; charset=utf-8",
- headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'},
- )
-
- return JSONResponse(
- {
- "ok": True,
- "count": len(rows),
- "generated_at": generated_at,
- "status_filter": status_filter,
- "items": rows,
- "requested_by": username,
- }
- )
-
-
-@app.get("/api/articles/{article_id}")
-def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
- article = get_article_by_id(article_id)
- if not article:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
- article["status_ui"] = internal_to_ui_status(article.get("status"))
- return {"ok": True, "item": article, "requested_by": username}
-
-
-@app.post("/api/articles/upsert")
-def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
- article_id = repo_upsert_article(
- ArticleUpsert(
- feed_id=payload.feed_id,
- source_article_id=payload.source_article_id,
- source_hash=payload.source_hash,
- title=payload.title,
- source_url=payload.source_url,
- canonical_url=payload.canonical_url,
- published_at=payload.published_at,
- author=payload.author,
- summary=payload.summary,
- content_raw=payload.content_raw,
- content_rewritten=payload.content_rewritten,
- image_urls_json=payload.image_urls_json,
- press_contact=payload.press_contact,
- source_name_snapshot=payload.source_name_snapshot,
- source_terms_url_snapshot=payload.source_terms_url_snapshot,
- source_license_name_snapshot=payload.source_license_name_snapshot,
- legal_checked=payload.legal_checked,
- legal_checked_at=payload.legal_checked_at,
- legal_note=payload.legal_note,
- wp_post_id=payload.wp_post_id,
- wp_post_url=payload.wp_post_url,
- publish_attempts=payload.publish_attempts,
- publish_last_error=payload.publish_last_error,
- published_to_wp_at=payload.published_to_wp_at,
- word_count=payload.word_count,
- status=ui_to_internal_status(payload.status),
- meta_json=payload.meta_json,
- )
- )
- return {"ok": True, "id": article_id, "requested_by": username}
-
-
-@app.post("/api/articles/{article_id}/transition")
-def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
- article = get_article_by_id(article_id)
- if not article:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
-
- current_status = article.get("status")
- current_ui = internal_to_ui_status(current_status)
- target_internal = ui_to_internal_status(payload.target_status)
- target_ui = internal_to_ui_status(target_internal)
- allowed_targets = ALLOWED_UI_TRANSITIONS.get(current_ui, set())
- if target_ui not in allowed_targets:
- raise HTTPException(
- status_code=status.HTTP_400_BAD_REQUEST,
- detail=f"Ungueltiger Statuswechsel: {current_ui} -> {target_ui}",
- )
-
- updated = update_article_status(article_id, target_internal, actor=username, note=payload.note)
- if not updated:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
- return {"ok": True, "id": article_id, "from_status": current_ui, "to_status": target_ui}
-
-
-@app.post("/api/articles/{article_id}/rewrite-run")
-def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict:
- article = get_article_by_id(article_id)
- if not article:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
- if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}:
- raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")
-
- rewritten = rewrite_article_text(article)
- tags: list[str] = []
- try:
- tags = generate_article_tags(article, rewritten_text=rewritten)
- except Exception:
- tags = []
- merged_meta = merge_generated_tags(article.get("meta_json"), tags)
- # upsert via status update + existing fields by lightweight path:
- repo_upsert_article(
- ArticleUpsert(
- feed_id=article.get("feed_id"),
- source_article_id=article.get("source_article_id"),
- source_hash=article.get("source_hash"),
- title=article.get("title"),
- source_url=article.get("source_url"),
- canonical_url=article.get("canonical_url"),
- published_at=article.get("published_at"),
- author=article.get("author"),
- summary=article.get("summary"),
- content_raw=article.get("content_raw"),
- content_rewritten=rewritten,
- image_urls_json=article.get("image_urls_json"),
- press_contact=article.get("press_contact"),
- source_name_snapshot=article.get("source_name_snapshot"),
- source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
- source_license_name_snapshot=article.get("source_license_name_snapshot"),
- legal_checked=bool(int(article.get("legal_checked", 0))),
- legal_checked_at=article.get("legal_checked_at"),
- legal_note=article.get("legal_note"),
- wp_post_id=article.get("wp_post_id"),
- wp_post_url=article.get("wp_post_url"),
- publish_attempts=int(article.get("publish_attempts", 0)),
- publish_last_error=article.get("publish_last_error"),
- published_to_wp_at=article.get("published_to_wp_at"),
- word_count=len(rewritten.split()),
- status="approved",
- meta_json=merged_meta,
- )
- )
- return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
-
-
-@app.post("/api/articles/{article_id}/legal-review")
-def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict:
- article = get_article_by_id(article_id)
- if not article:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
-
- updated = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username)
- if not updated:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
- return {
- "ok": True,
- "id": article_id,
- "legal_checked": payload.approved,
- }
-
-
-@app.get("/api/publisher/jobs")
-def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict:
- return {"ok": True, "items": list_publish_jobs(limit=limit), "requested_by": username}
-
-
-@app.post("/api/publisher/enqueue")
-def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict:
- article = get_article_by_id(payload.article_id)
- if not article:
- raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
- job_id = enqueue_publish(article_id=payload.article_id, max_attempts=payload.max_attempts)
- return {"ok": True, "job_id": job_id, "article_id": payload.article_id, "requested_by": username}
-
-
-@app.post("/api/publisher/run")
-def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict:
- stats = run_publisher(max_jobs=payload.max_jobs)
- return {
- "ok": True,
- "requested_by": username,
- "stats": {
- "processed": stats.processed,
- "success": stats.success,
- "failed": stats.failed,
- "requeued": stats.requeued,
- },
- }
-
-
-@app.post("/api/articles/{article_id}/review")
-def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
- raise HTTPException(status_code=status.HTTP_410_GONE, detail="Review-Endpoint ersetzt durch Rewrite-Workflow")
-
-
-@app.post("/api/ingestion/run")
-def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
- stats = run_ingestion(feed_id=payload.feed_id)
- return {
- "ok": stats.status == "success",
- "run_id": stats.run_id,
- "status": stats.status,
- "message": stats.message,
- "stats": {
- "feeds_processed": stats.feeds_processed,
- "entries_seen": stats.entries_seen,
- "articles_upserted": stats.articles_upserted,
- },
- "requested_by": username,
- }
-
-
-# ---------------------------------------------------------------------------
-# N8N Automation endpoint (API-Key auth, no session cookie required)
-# ---------------------------------------------------------------------------
-
-def _require_api_key(request: Request) -> None:
- api_key = request.headers.get("X-API-Key") or request.query_params.get("api_key")
- expected = settings.n8n_api_key
- if not expected:
- raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert")
- if api_key != expected:
- raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key")
-
-
-_pipeline_lock = asyncio.Lock()
-
-
-@app.post("/api/n8n/pipeline")
-async def api_n8n_pipeline(request: Request) -> dict:
- """Trigger the full auto pipeline in background. Returns immediately.
- Called by N8N (2x/day or on demand). Results arrive via Telegram."""
- _require_api_key(request)
-
- if _pipeline_lock.locked():
- logging.getLogger(__name__).warning("Pipeline bereits aktiv – Trigger ignoriert")
- return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"}
-
- async def _run():
- async with _pipeline_lock:
- loop = asyncio.get_event_loop()
- try:
- await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n"))
- except Exception as exc:
- logging.getLogger(__name__).error("Background pipeline error: %s", exc)
-
- asyncio.create_task(_run())
- return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"}
-
-
-@app.post("/api/n8n/ingest")
-def api_n8n_ingest(request: Request) -> dict:
- """Run only the ingestion step (no rewrite/publish). For N8N."""
- _require_api_key(request)
- stats = run_ingestion()
- return {
- "ok": stats.status == "success",
- "stats": {
- "feeds_processed": stats.feeds_processed,
- "entries_seen": stats.entries_seen,
- "articles_upserted": stats.articles_upserted,
- },
- }
-
-
-# ---------------------------------------------------------------------------
-# Telegram Webhook
-# ---------------------------------------------------------------------------
-
-@app.post("/telegram/webhook")
-async def telegram_webhook(request: Request) -> dict:
- """Receive updates from Telegram Bot API.
-
- Returns 200 immediately so Telegram never retries the same update.
- Actual processing runs in a background task.
- """
- import asyncio
- import logging
-
- # Verify secret token
- secret = settings.telegram_webhook_secret
- if secret:
- incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "")
- if incoming != secret:
- raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret")
-
- body = await request.body()
- try:
- update = json.loads(body.decode("utf-8"))
- except Exception:
- raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON")
-
- async def _process():
- loop = asyncio.get_event_loop()
- try:
- await loop.run_in_executor(None, lambda: handle_update(update))
- except Exception as exc:
- logging.getLogger(__name__).error("Telegram update handler error: %s", exc)
-
- asyncio.create_task(_process())
- return {"ok": True}
-
-
-@app.post("/api/telegram/setup-webhook")
-def api_setup_telegram_webhook(request: Request) -> dict:
- """Register the Telegram webhook URL. Call once after deployment."""
- username = require_auth(request)
- base_url = str(request.base_url).rstrip("/")
- webhook_url = f"{base_url}/telegram/webhook"
- result = setup_webhook(webhook_url)
- return {"ok": True, "webhook_url": webhook_url, "telegram_response": result, "requested_by": username}
diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py
deleted file mode 100644
index 93a251b..0000000
--- a/backend/app/pipeline.py
+++ /dev/null
@@ -1,516 +0,0 @@
-"""Autonomous RSS-News pipeline.
-
-Full automated flow:
-1. Run RSS ingestion
-2. For each new article:
- - Auto-select primary image
- - Score relevance via GPT
- - < warn threshold: reject (error status) → Telegram rejected summary
- - warn..auto threshold: Telegram warning with override button
- - >= auto threshold: rewrite → create WP draft → Telegram notification
-3. Send pipeline summary to Telegram
-"""
-from __future__ import annotations
-
-import json
-import logging
-import time
-from dataclasses import dataclass, field
-from datetime import datetime, timezone
-from typing import Any
-
-from .config import get_settings
-from .ingestion import run_ingestion
-from .publisher import enqueue_publish, run_publisher
-from .repositories import (
- ArticleUpsert,
- get_article_by_id,
- list_articles,
- set_article_image_decision,
- update_article_status,
- upsert_article as repo_upsert_article,
-)
-from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance
-from .scheduler import reserve_publish_slot
-from .wordpress import publish_article_draft, selected_image_exists
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PipelineStats:
- ingested: int = 0
- processed: int = 0
- drafts_created: int = 0
- rejected: int = 0
- quality_gate_rejected: int = 0
- warnings: int = 0
- errors: int = 0
- no_image: int = 0
- rejected_articles: list[dict[str, Any]] = field(default_factory=list)
-
-
-# ---------------------------------------------------------------------------
-# Internal helpers
-# ---------------------------------------------------------------------------
-
-def _auto_select_image(article: dict[str, Any]) -> bool:
- """Auto-select the primary image from ingestion metadata if not already selected."""
- meta_json = article.get("meta_json") or "{}"
- try:
- meta = json.loads(meta_json)
- except Exception:
- return False
-
- # Already selected?
- image_review = meta.get("image_review") or {}
- if isinstance(image_review, dict) and image_review.get("selected_url"):
- return True
-
- # Try to get primary from ingestion extraction
- extraction = meta.get("extraction") or {}
- image_selection = extraction.get("image_selection") or {}
- primary = image_selection.get("primary")
-
- if not primary:
- # Fallback: use first URL from image_urls_json
- image_urls_json = article.get("image_urls_json") or "[]"
- try:
- urls = json.loads(image_urls_json)
- if urls:
- primary = urls[0]
- except Exception:
- pass
-
- if primary:
- set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline")
- return True
- return False
-
-
-def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
- """Persist relevance score and reason in article meta_json and relevance_score column."""
- article = get_article_by_id(article_id)
- if not article:
- return
- try:
- meta = json.loads(article.get("meta_json") or "{}")
- except Exception:
- meta = {}
- meta["relevance"] = relevance
- new_meta = json.dumps(meta, ensure_ascii=False)
- from .db import get_conn
- with get_conn() as conn:
- conn.execute(
- "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?",
- (new_meta, relevance.get("score", 0), article_id),
- )
-
-
-def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
- """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url)."""
- article_id = int(article["id"])
- settings = get_settings()
-
- # ── Quality gate 1: raw content length ──────────────────────────────────
- import re as _re
- raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
- raw_words = len(raw_text.split())
- if raw_words < settings.pipeline_min_words_raw:
- note = (
- f"Zu wenig Rohinhalt: {raw_words} Wörter "
- f"(Minimum: {settings.pipeline_min_words_raw})"
- )
- logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
- update_article_status(article_id, "error", actor="pipeline", note=note)
- raise ValueError(note)
-
- # Rewrite
- logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
- rewritten = rewrite_article_text(article)
-
- # ── Quality gate 2: rewritten content length ─────────────────────────────
- rewritten_words = len(rewritten.split())
- if rewritten_words < settings.pipeline_min_words_rewritten:
- note = (
- f"Rewrite zu kurz: {rewritten_words} Wörter "
- f"(Minimum: {settings.pipeline_min_words_rewritten})"
- )
- logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
- update_article_status(article_id, "error", actor="pipeline", note=note)
- raise ValueError(note)
- logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
- tags: list[str] = []
- try:
- tags = generate_article_tags(article, rewritten_text=rewritten)
- except Exception:
- pass
- merged_meta = merge_generated_tags(article.get("meta_json"), tags)
-
- # Save rewritten content + approved status
- repo_upsert_article(
- ArticleUpsert(
- feed_id=article.get("feed_id"),
- source_article_id=article.get("source_article_id"),
- source_hash=article.get("source_hash"),
- title=article.get("title", ""),
- source_url=article.get("source_url", ""),
- canonical_url=article.get("canonical_url"),
- published_at=article.get("published_at"),
- author=article.get("author"),
- summary=article.get("summary"),
- content_raw=article.get("content_raw"),
- content_rewritten=rewritten,
- image_urls_json=article.get("image_urls_json"),
- press_contact=article.get("press_contact"),
- source_name_snapshot=article.get("source_name_snapshot"),
- source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
- source_license_name_snapshot=article.get("source_license_name_snapshot"),
- legal_checked=bool(int(article.get("legal_checked", 0))),
- legal_checked_at=article.get("legal_checked_at"),
- legal_note=article.get("legal_note"),
- wp_post_id=article.get("wp_post_id"),
- wp_post_url=article.get("wp_post_url"),
- publish_attempts=int(article.get("publish_attempts", 0)),
- publish_last_error=article.get("publish_last_error"),
- published_to_wp_at=article.get("published_to_wp_at"),
- word_count=len(rewritten.split()),
- status="approved",
- meta_json=merged_meta,
- )
- )
-
- # Reload after save to get updated meta_json
- fresh = get_article_by_id(article_id)
- if not fresh:
- raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden")
-
- # Ensure a publish slot is reserved — reserve one now if not yet set
- if not fresh.get("scheduled_publish_at"):
- from .scheduler import reserve_publish_slot
- logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id)
- reserve_publish_slot(article_id)
- fresh = get_article_by_id(article_id)
- if not fresh:
- raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden")
-
- # Create WP draft
- logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at"))
- wp_post_id, wp_post_url = publish_article_draft(fresh)
- logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id)
-
- # Update WP info in DB
- from .repositories import mark_article_publish_result
- mark_article_publish_result(
- article_id,
- wp_post_id=wp_post_id,
- wp_post_url=wp_post_url,
- error=None,
- increment_attempts=True,
- set_published_status=False,
- )
-
- return wp_post_id, wp_post_url
-
-
-# ---------------------------------------------------------------------------
-# Public pipeline functions
-# ---------------------------------------------------------------------------
-
-def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
- """Run the full automated pipeline and return stats dict."""
- from . import telegram_bot as tg
-
- settings = get_settings()
- stats = PipelineStats()
-
- tg.notify_pipeline_started(trigger)
-
- # Step 1: Ingestion
- try:
- ingest_result = run_ingestion()
- stats.ingested = ingest_result.articles_upserted
- except Exception as exc:
- tg.notify_error(f"Ingestion fehlgeschlagen: {exc}")
- logger.error("Ingestion error: %s", exc)
- stats.errors += 1
-
- # Step 2: Process new articles
- new_articles = list_articles(limit=100, status_filter="new")
-
- for article in new_articles:
- article_id = int(article["id"])
- try:
- _process_article(article, stats, settings)
- except Exception as exc:
- logger.error("Fehler bei Artikel #%d: %s", article_id, exc)
- tg.notify_error(f"Fehler bei Artikel #{article_id} ({article.get('title','?')[:50]}): {exc}")
- stats.errors += 1
- # Rate limiting between OpenAI calls
- time.sleep(1)
-
- # Step 3: Send rejected summary if any
- if stats.rejected_articles:
- try:
- tg.notify_rejected_summary(stats.rejected_articles)
- except Exception as exc:
- logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc)
-
- # Step 4: Summary
- result = {
- "ingested": stats.ingested,
- "processed": stats.processed,
- "drafts_created": stats.drafts_created,
- "rejected": stats.rejected,
- "quality_gate_rejected": stats.quality_gate_rejected,
- "no_image": stats.no_image,
- "warnings": stats.warnings,
- "errors": stats.errors,
- }
- tg.notify_pipeline_done(result)
- return result
-
-
-def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None:
- """Process a single new article through the pipeline."""
- from . import telegram_bot as tg
-
- article_id = int(article["id"])
-
- # Auto-select image
- _auto_select_image(article)
-
- # Reload to get updated image_review
- article = get_article_by_id(article_id) or article
-
- # Exclude articles without a usable image
- try:
- meta = json.loads(article.get("meta_json") or "{}")
- except Exception:
- meta = {}
- has_image = bool((meta.get("image_review") or {}).get("selected_url"))
- if not has_image:
- update_article_status(
- article_id,
- "no_image",
- actor="pipeline",
- note="Kein Bild vorhanden – Artikel ausgeschlossen",
- )
- stats.no_image += 1
- logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
- try:
- tg.send_message(
- f"🖼️ Kein Bild – Artikel #{article_id} ausgeschlossen\n"
- f"📰 {(article.get('title') or '')[:80]}"
- )
- except Exception:
- pass
- return
-
- # Score relevance
- try:
- relevance = score_article_relevance(article)
- except Exception as exc:
- logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc)
- relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []}
-
- score = relevance.get("score", 0)
- reason = relevance.get("reason", "")
- _store_relevance(article_id, relevance)
-
- stats.processed += 1
-
- if score < settings.pipeline_relevance_warn:
- # Reject
- update_article_status(
- article_id,
- "error",
- actor="pipeline",
- note=f"Abgelehnt: Score {score}/100 — {reason}",
- )
- stats.rejected += 1
- # Reload for summary (now has relevance in meta)
- updated = get_article_by_id(article_id)
- if updated:
- stats.rejected_articles.append(updated)
-
- elif score < settings.pipeline_relevance_auto:
- # Warning zone: set status to "review" so repeated /run calls don't re-warn
- update_article_status(
- article_id,
- "review",
- actor="pipeline",
- note=f"Niedrige Relevanz: Score {score}/100 — {reason}",
- )
- stats.warnings += 1
- try:
- tg.notify_relevance_warning(article, score, reason)
- except Exception as exc:
- logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc)
-
- else:
- # Auto-process: rewrite + WP draft
- try:
- # Reserve publish slot FIRST so it's available when WP draft is created
- slot = reserve_publish_slot(article_id)
-
- # Reload article to get updated image_review + scheduled_publish_at
- fresh = get_article_by_id(article_id)
- if not fresh:
- return
- wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh)
- stats.drafts_created += 1
-
- # Reload for notification
- final = get_article_by_id(article_id)
- if final:
- try:
- tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
- except Exception as exc:
- logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
-
- except ValueError as exc:
- # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
- # Release the reserved slot so it's available for the next article
- from .scheduler import release_publish_slot
- release_publish_slot(article_id)
- # Clean up any stale WP draft from a previous pipeline run
- stale = get_article_by_id(article_id)
- if stale and stale.get("wp_post_id"):
- try:
- from .wordpress import delete_wp_post
- delete_wp_post(int(stale["wp_post_id"]))
- logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
- except Exception as del_exc:
- logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
- stats.quality_gate_rejected += 1
- logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
- # Individual Telegram notification for quality gate rejection
- try:
- title = (article.get("title") or "Ohne Titel")[:80]
- tg.send_message(
- f"✂️ Qualitätsprüfung nicht bestanden\n"
- f"📰 {title}\n"
- f"💯 Score: {score}/100\n"
- f"⚠️ {exc}"
- )
- except Exception as tg_exc:
- logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc)
-
- except Exception as exc:
- logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
- update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
- # Release reserved slot so it's not permanently blocked by a failed article
- from .scheduler import release_publish_slot
- release_publish_slot(article_id)
- raise
-
-
-# ---------------------------------------------------------------------------
-# Callback actions (called from telegram_bot._handle_callback)
-# ---------------------------------------------------------------------------
-
-def rewrite_and_update_draft(article_id: int) -> None:
- """Rewrite article and update the existing WP draft."""
- article = get_article_by_id(article_id)
- if not article:
- raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
- _auto_select_image(article)
- fresh = get_article_by_id(article_id)
- _do_rewrite_and_draft(fresh)
-
-
-def discard_article(article_id: int) -> None:
- """Discard a draft: delete WP post if exists, set article to error."""
- article = get_article_by_id(article_id)
- if not article:
- return
-
- wp_post_id = article.get("wp_post_id")
- if wp_post_id:
- try:
- from .wordpress import delete_wp_post
- delete_wp_post(int(wp_post_id))
- except Exception as exc:
- logger.warning("WP Post #%d konnte nicht gelöscht werden: %s", wp_post_id, exc)
-
- update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen")
-
-
-def override_rejected_article(article_id: int) -> None:
- """Force-process a previously rejected article."""
- from . import telegram_bot as tg
-
- article = get_article_by_id(article_id)
- if not article:
- raise RuntimeError(f"Artikel #{article_id} nicht gefunden")
-
- # Reset to new so processing is allowed
- update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram")
-
- # Reload
- fresh = get_article_by_id(article_id)
- if not fresh:
- return
-
- _auto_select_image(fresh)
- fresh = get_article_by_id(article_id)
-
- # Get existing score or re-score
- try:
- meta = json.loads(fresh.get("meta_json") or "{}")
- score = int((meta.get("relevance") or {}).get("score", 0))
- except Exception:
- score = 0
-
- # Reserve publish slot FIRST so it's in the DB when WP draft is created
- slot = reserve_publish_slot(article_id)
- fresh = get_article_by_id(article_id)
-
- wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh)
-
- final = get_article_by_id(article_id)
- if final:
- tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
-
-
-# ---------------------------------------------------------------------------
-# Status helpers (used by /status command)
-# ---------------------------------------------------------------------------
-
-def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]:
- """Return articles rejected in the last N days."""
- from .db import get_conn
- from .db import rows_to_dicts
- cutoff = datetime.now(timezone.utc).isoformat()[:10]
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT id, title, meta_json, source_url, created_at
- FROM articles
- WHERE status IN ('error', 'review')
- AND json_extract(meta_json, '$.relevance.score') IS NOT NULL
- AND date(updated_at) >= date('now', ?)
- ORDER BY updated_at DESC
- LIMIT 20
- """,
- (f"-{days} days",),
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def get_pipeline_status_text() -> str:
- """Return a text summary of current pipeline state."""
- from .repositories import list_articles as _list
- new_count = len(_list(limit=500, status_filter="new"))
- approved_count = len(_list(limit=500, status_filter="approved"))
- published_count = len(_list(limit=500, status_filter="published"))
- error_count = len(_list(limit=500, status_filter="error"))
-
- return (
- f"📊 Pipeline-Status\n"
- f"🆕 Neu / wartend: {new_count}\n"
- f"✅ Draft / freigegeben: {approved_count}\n"
- f"📢 Veröffentlicht: {published_count}\n"
- f"🚫 Fehler / abgelehnt: {error_count}"
- )
diff --git a/backend/app/policy.py b/backend/app/policy.py
deleted file mode 100644
index af6e65c..0000000
--- a/backend/app/policy.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-
-
-def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
- issues: list[str] = []
- if not source:
- issues.append("Keine Quelle zugeordnet")
- return issues
-
- risk_level = (source.get("risk_level") or "").strip().lower()
- if risk_level != "green":
- issues.append(f"Quelle nicht freigegeben (risk_level={risk_level or 'unset'})")
-
- terms_url = (source.get("terms_url") or "").strip()
- if not terms_url:
- issues.append("terms_url fehlt")
-
- license_name = (source.get("license_name") or "").strip()
- if not license_name:
- issues.append("license_name fehlt")
-
- last_reviewed_at = (source.get("last_reviewed_at") or "").strip()
- if not last_reviewed_at:
- issues.append("last_reviewed_at fehlt")
-
- if int(source.get("is_enabled", 0) or 0) != 1:
- issues.append("Quelle ist deaktiviert")
-
- return issues
-
-
-def is_source_allowed(source: dict[str, Any] | None) -> bool:
- return len(evaluate_source_policy(source)) == 0
diff --git a/backend/app/publisher.py b/backend/app/publisher.py
deleted file mode 100644
index e27bd1b..0000000
--- a/backend/app/publisher.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from .repositories import (
- claim_next_publish_job,
- complete_publish_job,
- create_publish_job,
- fail_publish_job,
- get_article_by_id,
- mark_article_publish_result,
- PublishJobCreate,
-)
-from .wordpress import publish_article_draft, selected_image_exists
-
-
-@dataclass(frozen=True)
-class PublisherStats:
- processed: int
- success: int
- failed: int
- requeued: int
-
-
-def enqueue_publish(article_id: int, max_attempts: int = 3) -> int:
- return create_publish_job(PublishJobCreate(article_id=article_id, max_attempts=max_attempts))
-
-
-def _can_publish(article: dict) -> tuple[bool, str | None]:
- if article.get("status") not in {"approved", "published"}:
- return False, "Artikelstatus muss 'publish' sein"
- if not selected_image_exists(article):
- return False, "Hauptbild nicht gesetzt"
- return True, None
-
-
-def run_publisher(max_jobs: int = 10) -> PublisherStats:
- processed = 0
- success = 0
- failed = 0
- requeued = 0
-
- for _ in range(max(1, max_jobs)):
- job = claim_next_publish_job()
- if not job:
- break
- processed += 1
- job_id = int(job["id"])
- article_id = int(job["article_id"])
-
- article = get_article_by_id(article_id)
- if not article:
- fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False)
- failed += 1
- continue
-
- allowed, reason = _can_publish(article)
- if not allowed:
- fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False)
- mark_article_publish_result(
- article_id,
- wp_post_id=article.get("wp_post_id"),
- wp_post_url=article.get("wp_post_url"),
- error=reason or "blocked",
- increment_attempts=True,
- set_published_status=False,
- )
- failed += 1
- continue
-
- try:
- wp_post_id, wp_post_url = publish_article_draft(article)
- complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url)
- mark_article_publish_result(
- article_id,
- wp_post_id=wp_post_id,
- wp_post_url=wp_post_url,
- error=None,
- increment_attempts=True,
- set_published_status=True,
- )
- success += 1
- except Exception as exc:
- attempts = int(job.get("attempts", 1))
- max_attempts = int(job.get("max_attempts", 3))
- should_requeue = attempts < max_attempts
- fail_publish_job(job_id, str(exc), requeue=should_requeue)
- mark_article_publish_result(
- article_id,
- wp_post_id=article.get("wp_post_id"),
- wp_post_url=article.get("wp_post_url"),
- error=str(exc),
- increment_attempts=True,
- set_published_status=False,
- )
- if should_requeue:
- requeued += 1
- else:
- failed += 1
-
- return PublisherStats(processed=processed, success=success, failed=failed, requeued=requeued)
diff --git a/backend/app/relevance.py b/backend/app/relevance.py
deleted file mode 100644
index 8f69693..0000000
--- a/backend/app/relevance.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-
-
-def _parse_iso_datetime(value: str | None) -> datetime | None:
- if not value:
- return None
- raw = value.strip()
- if not raw:
- return None
- if raw.endswith("Z"):
- raw = raw[:-1] + "+00:00"
- try:
- parsed = datetime.fromisoformat(raw)
- except ValueError:
- return None
- if parsed.tzinfo is None:
- parsed = parsed.replace(tzinfo=timezone.utc)
- return parsed
-
-
-def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None:
- published = _parse_iso_datetime(published_at)
- if not published:
- return None
- ref = now or datetime.now(timezone.utc)
- delta = ref - published
- if delta.total_seconds() < 0:
- return 0
- return delta.days
-
-
-def article_relevance(published_at: str | None, now: datetime | None = None) -> str:
- days = article_age_days(published_at, now=now)
- if days is None:
- return "unbekannt"
- if days <= 2:
- return "hoch"
- if days <= 7:
- return "mittel"
- if days <= 30:
- return "niedrig"
- return "alt"
diff --git a/backend/app/repositories.py b/backend/app/repositories.py
deleted file mode 100644
index cf38055..0000000
--- a/backend/app/repositories.py
+++ /dev/null
@@ -1,855 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-import json
-from datetime import datetime, timezone
-from typing import Any
-
-from .db import get_conn, rows_to_dicts
-
-
-@dataclass(frozen=True)
-class SourceCreate:
- name: str
- base_url: str | None
- terms_url: str | None
- license_name: str | None
- risk_level: str
- is_enabled: bool
- notes: str | None
- last_reviewed_at: str | None
-
-
-@dataclass(frozen=True)
-class FeedCreate:
- name: str
- url: str
- source_id: int | None
- is_enabled: bool
-
-
-@dataclass(frozen=True)
-class SourceUpdate:
- name: str
- base_url: str | None
- terms_url: str | None
- license_name: str | None
- risk_level: str
- is_enabled: bool
- notes: str | None
- last_reviewed_at: str | None
-
-
-@dataclass(frozen=True)
-class FeedUpdate:
- name: str
- url: str
- source_id: int | None
- is_enabled: bool
-
-
-@dataclass(frozen=True)
-class RunCreate:
- run_type: str
- status: str
- details: str | None = None
-
-
-@dataclass(frozen=True)
-class ArticleUpsert:
- feed_id: int | None
- source_article_id: str | None
- source_hash: str | None
- title: str
- source_url: str
- canonical_url: str | None
- published_at: str | None
- author: str | None
- summary: str | None
- content_raw: str | None
- content_rewritten: str | None
- image_urls_json: str | None
- press_contact: str | None
- source_name_snapshot: str | None
- source_terms_url_snapshot: str | None
- source_license_name_snapshot: str | None
- legal_checked: bool
- legal_checked_at: str | None
- legal_note: str | None
- wp_post_id: int | None
- wp_post_url: str | None
- publish_attempts: int
- publish_last_error: str | None
- published_to_wp_at: str | None
- word_count: int
- status: str
- meta_json: str | None
-
-
-@dataclass(frozen=True)
-class PublishJobCreate:
- article_id: int
- max_attempts: int = 3
-
-
-def create_source(payload: SourceCreate) -> int:
- with get_conn() as conn:
- cur = conn.execute(
- """
- INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
- """,
- (
- payload.name.strip(),
- payload.base_url,
- payload.terms_url,
- payload.license_name,
- payload.risk_level,
- 1 if payload.is_enabled else 0,
- payload.notes,
- payload.last_reviewed_at,
- ),
- )
- return int(cur.lastrowid)
-
-
-def list_sources() -> list[dict[str, Any]]:
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
- FROM sources
- ORDER BY id DESC
- """
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def get_source_by_id(source_id: int) -> dict[str, Any] | None:
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
- FROM sources
- WHERE id = ?
- """,
- (source_id,),
- ).fetchone()
- return dict(row) if row else None
-
-
-def update_source(source_id: int, payload: SourceUpdate) -> bool:
- with get_conn() as conn:
- cur = conn.execute(
- """
- UPDATE sources
- SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ?
- WHERE id = ?
- """,
- (
- payload.name.strip(),
- payload.base_url,
- payload.terms_url,
- payload.license_name,
- payload.risk_level,
- 1 if payload.is_enabled else 0,
- payload.notes,
- payload.last_reviewed_at,
- source_id,
- ),
- )
- return cur.rowcount > 0
-
-
-def delete_source(source_id: int) -> bool:
- with get_conn() as conn:
- cur = conn.execute("DELETE FROM sources WHERE id = ?", (source_id,))
- return cur.rowcount > 0
-
-
-def create_feed(payload: FeedCreate) -> int:
- with get_conn() as conn:
- cur = conn.execute(
- "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)",
- (payload.name.strip(), payload.url.strip(), payload.source_id, 1 if payload.is_enabled else 0),
- )
- return int(cur.lastrowid)
-
-
-def list_feeds() -> list[dict[str, Any]]:
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
- f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
- s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
- s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
- FROM feeds f
- LEFT JOIN sources s ON s.id = f.source_id
- ORDER BY f.id DESC
- """
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def list_enabled_feeds() -> list[dict[str, Any]]:
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
- s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
- s.risk_level AS source_risk_level, s.base_url AS source_base_url,
- s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
- FROM feeds f
- LEFT JOIN sources s ON s.id = f.source_id
- WHERE f.is_enabled = 1
- ORDER BY f.id ASC
- """
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
- s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
- s.risk_level AS source_risk_level, s.base_url AS source_base_url,
- s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
- FROM feeds f
- LEFT JOIN sources s ON s.id = f.source_id
- WHERE f.id = ?
- """,
- (feed_id,),
- ).fetchone()
- return dict(row) if row else None
-
-
-def update_feed(feed_id: int, payload: FeedUpdate) -> bool:
- with get_conn() as conn:
- cur = conn.execute(
- """
- UPDATE feeds
- SET name = ?, url = ?, source_id = ?, is_enabled = ?
- WHERE id = ?
- """,
- (
- payload.name.strip(),
- payload.url.strip(),
- payload.source_id,
- 1 if payload.is_enabled else 0,
- feed_id,
- ),
- )
- return cur.rowcount > 0
-
-
-def delete_feed(feed_id: int) -> bool:
- with get_conn() as conn:
- cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
- return cur.rowcount > 0
-
-
-def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE feeds
- SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
- WHERE id = ?
- """,
- (etag, last_modified, feed_id),
- )
-
-
-def create_run(payload: RunCreate) -> int:
- with get_conn() as conn:
- cur = conn.execute(
- "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)",
- (payload.run_type, payload.status, payload.details),
- )
- return int(cur.lastrowid)
-
-
-def finish_run(run_id: int, status: str, details: str | None = None) -> None:
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE runs
- SET status = ?, details = ?, finished_at = datetime('now')
- WHERE id = ?
- """,
- (status, details, run_id),
- )
-
-
-def list_runs(limit: int = 50) -> list[dict[str, Any]]:
- safe_limit = max(1, min(limit, 500))
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT id, run_type, status, started_at, finished_at, details
- FROM runs
- ORDER BY id DESC
- LIMIT ?
- """,
- (safe_limit,),
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def get_run_by_id(run_id: int) -> dict[str, Any] | None:
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT id, run_type, status, started_at, finished_at, details
- FROM runs
- WHERE id = ?
- """,
- (run_id,),
- ).fetchone()
- return dict(row) if row else None
-
-
-def get_article_by_id(article_id: int) -> dict[str, Any] | None:
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
- a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact,
- a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot,
- a.legal_checked, a.legal_checked_at, a.legal_note,
- a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at,
- a.word_count, a.status, a.meta_json, a.created_at, a.updated_at,
- a.scheduled_publish_at
- FROM articles a
- WHERE a.id = ?
- """,
- (article_id,),
- ).fetchone()
- return dict(row) if row else None
-
-
-def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
- meta: dict[str, Any] = {}
- if meta_json:
- try:
- meta = json.loads(meta_json)
- if not isinstance(meta, dict):
- meta = {}
- except Exception:
- meta = {}
-
- events = meta.get("review_events")
- if not isinstance(events, list):
- events = []
- events.append(event)
- meta["review_events"] = events
- return json.dumps(meta, ensure_ascii=False)
-
-
-def _load_meta(meta_json: str | None) -> dict[str, Any]:
- if not meta_json:
- return {}
- try:
- parsed = json.loads(meta_json)
- return parsed if isinstance(parsed, dict) else {}
- except Exception:
- return {}
-
-
-def update_article_status(
- article_id: int,
- new_status: str,
- *,
- actor: str | None = None,
- note: str | None = None,
- decision: str | None = None,
-) -> bool:
- article = get_article_by_id(article_id)
- if not article:
- return False
-
- event = {
- "timestamp": datetime.now(timezone.utc).isoformat(),
- "from_status": article.get("status"),
- "to_status": new_status,
- "actor": actor or "system",
- "note": note,
- "decision": decision,
- }
- merged_meta = _merge_review_event(article.get("meta_json"), event)
-
- with get_conn() as conn:
- conn.execute(
- "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?",
- (new_status, merged_meta, article_id),
- )
- return True
-
-
-def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool:
- article = get_article_by_id(article_id)
- if not article:
- return False
-
- event = {
- "timestamp": datetime.now(timezone.utc).isoformat(),
- "event": "legal_review",
- "approved": approved,
- "actor": actor or "system",
- "note": note,
- }
- merged_meta = _merge_review_event(article.get("meta_json"), event)
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE articles
- SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ?
- WHERE id = ?
- """,
- (1 if approved else 0, note, merged_meta, article_id),
- )
- return True
-
-
-def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool:
- article = get_article_by_id(article_id)
- if not article:
- return False
- url = (image_url or "").strip()
- if not url:
- return False
- if action not in {"select", "exclude", "restore"}:
- return False
-
- meta = _load_meta(article.get("meta_json"))
- image_review = meta.get("image_review")
- if not isinstance(image_review, dict):
- image_review = {}
-
- excluded = image_review.get("excluded_urls")
- if not isinstance(excluded, list):
- excluded = []
- excluded_set = {str(item) for item in excluded if item}
-
- selected_url = image_review.get("selected_url")
- if not isinstance(selected_url, str):
- selected_url = None
-
- if action == "select":
- selected_url = url
- excluded_set.discard(url)
- elif action == "exclude":
- excluded_set.add(url)
- if selected_url == url:
- selected_url = None
- elif action == "restore":
- excluded_set.discard(url)
-
- image_review["selected_url"] = selected_url
- image_review["excluded_urls"] = sorted(excluded_set)
- image_review["updated_at"] = datetime.now(timezone.utc).isoformat()
- image_review["updated_by"] = actor or "system"
- meta["image_review"] = image_review
-
- with get_conn() as conn:
- conn.execute(
- "UPDATE articles SET meta_json = ? WHERE id = ?",
- (json.dumps(meta, ensure_ascii=False), article_id),
- )
- return True
-
-
-def create_publish_job(payload: PublishJobCreate) -> int:
- with get_conn() as conn:
- existing = conn.execute(
- """
- SELECT id FROM publish_jobs
- WHERE article_id = ? AND status IN ('queued', 'running')
- ORDER BY id DESC
- LIMIT 1
- """,
- (payload.article_id,),
- ).fetchone()
- if existing:
- return int(existing["id"])
-
- cur = conn.execute(
- """
- INSERT INTO publish_jobs (article_id, status, attempts, max_attempts)
- VALUES (?, 'queued', 0, ?)
- """,
- (payload.article_id, max(1, payload.max_attempts)),
- )
- return int(cur.lastrowid)
-
-
-def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]:
- safe_limit = max(1, min(limit, 500))
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url,
- j.created_at, j.started_at, j.finished_at, a.title AS article_title
- FROM publish_jobs j
- LEFT JOIN articles a ON a.id = j.article_id
- ORDER BY j.id DESC
- LIMIT ?
- """,
- (safe_limit,),
- ).fetchall()
- return rows_to_dicts(rows)
-
-
-def claim_next_publish_job() -> dict[str, Any] | None:
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
- FROM publish_jobs
- WHERE status = 'queued' AND attempts < max_attempts
- ORDER BY id ASC
- LIMIT 1
- """
- ).fetchone()
- if not row:
- return None
- job_id = int(row["id"])
- conn.execute(
- """
- UPDATE publish_jobs
- SET status = 'running',
- attempts = attempts + 1,
- started_at = datetime('now'),
- finished_at = NULL
- WHERE id = ?
- """,
- (job_id,),
- )
- claimed = conn.execute(
- """
- SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
- FROM publish_jobs
- WHERE id = ?
- """,
- (job_id,),
- ).fetchone()
- return dict(claimed) if claimed else None
-
-
-def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None:
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE publish_jobs
- SET status = 'success',
- wp_post_id = ?,
- wp_post_url = ?,
- error_message = NULL,
- finished_at = datetime('now')
- WHERE id = ?
- """,
- (wp_post_id, wp_post_url, job_id),
- )
-
-
-def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None:
- next_status = "queued" if requeue else "failed"
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE publish_jobs
- SET status = ?,
- error_message = ?,
- finished_at = datetime('now')
- WHERE id = ?
- """,
- (next_status, error_message[:2000], job_id),
- )
-
-
-def mark_article_publish_result(
- article_id: int,
- *,
- wp_post_id: int | None,
- wp_post_url: str | None,
- error: str | None,
- increment_attempts: bool,
- set_published_status: bool,
-) -> None:
- with get_conn() as conn:
- conn.execute(
- """
- UPDATE articles
- SET wp_post_id = ?,
- wp_post_url = ?,
- publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END,
- publish_last_error = ?,
- published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END,
- status = CASE WHEN ? THEN 'published' ELSE status END
- WHERE id = ?
- """,
- (
- wp_post_id,
- wp_post_url,
- 1 if increment_attempts else 0,
- error[:2000] if error else None,
- wp_post_id,
- 1 if set_published_status else 0,
- article_id,
- ),
- )
-
-
-def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
- with get_conn() as conn:
- # 1) strongest key: source_url
- row = conn.execute(
- "SELECT id FROM articles WHERE source_url = ?",
- (payload.source_url.strip(),),
- ).fetchone()
- if row:
- return int(row["id"])
-
- # 2) stable feed+guid combo
- if payload.feed_id is not None and payload.source_article_id:
- row = conn.execute(
- "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
- (payload.feed_id, payload.source_article_id),
- ).fetchone()
- if row:
- return int(row["id"])
-
- # 3) content hash fallback
- if payload.source_hash:
- row = conn.execute(
- "SELECT id FROM articles WHERE source_hash = ?",
- (payload.source_hash,),
- ).fetchone()
- if row:
- return int(row["id"])
-
- return None
-
-
-def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None:
- article_id = _resolve_existing_article_id(payload)
- if article_id is None:
- return None
- return get_article_by_id(article_id)
-
-
-def upsert_article(payload: ArticleUpsert) -> int:
- existing_id = _resolve_existing_article_id(payload)
- with get_conn() as conn:
- if existing_id is None:
- conn.execute(
- """
- INSERT INTO articles (
- feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
- summary, content_raw, content_rewritten, image_urls_json, press_contact,
- source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
- legal_checked, legal_checked_at, legal_note,
- wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at,
- word_count, status, meta_json
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
- """,
- (
- payload.feed_id,
- payload.source_article_id,
- payload.source_hash,
- payload.title.strip(),
- payload.source_url.strip(),
- payload.canonical_url,
- payload.published_at,
- payload.author,
- payload.summary,
- payload.content_raw,
- payload.content_rewritten,
- payload.image_urls_json,
- payload.press_contact,
- payload.source_name_snapshot,
- payload.source_terms_url_snapshot,
- payload.source_license_name_snapshot,
- 1 if payload.legal_checked else 0,
- payload.legal_checked_at,
- payload.legal_note,
- payload.wp_post_id,
- payload.wp_post_url,
- payload.publish_attempts,
- payload.publish_last_error,
- payload.published_to_wp_at,
- payload.word_count,
- payload.status,
- payload.meta_json,
- ),
- )
- else:
- conn.execute(
- """
- UPDATE articles
- SET
- feed_id = ?,
- source_article_id = ?,
- source_hash = ?,
- title = ?,
- source_url = ?,
- canonical_url = ?,
- published_at = ?,
- author = ?,
- summary = ?,
- content_raw = ?,
- content_rewritten = ?,
- image_urls_json = ?,
- press_contact = ?,
- source_name_snapshot = ?,
- source_terms_url_snapshot = ?,
- source_license_name_snapshot = ?,
- legal_checked = ?,
- legal_checked_at = ?,
- legal_note = ?,
- wp_post_id = ?,
- wp_post_url = ?,
- publish_attempts = ?,
- publish_last_error = ?,
- published_to_wp_at = ?,
- word_count = ?,
- status = ?,
- meta_json = ?
- WHERE id = ?
- """,
- (
- payload.feed_id,
- payload.source_article_id,
- payload.source_hash,
- payload.title.strip(),
- payload.source_url.strip(),
- payload.canonical_url,
- payload.published_at,
- payload.author,
- payload.summary,
- payload.content_raw,
- payload.content_rewritten,
- payload.image_urls_json,
- payload.press_contact,
- payload.source_name_snapshot,
- payload.source_terms_url_snapshot,
- payload.source_license_name_snapshot,
- 1 if payload.legal_checked else 0,
- payload.legal_checked_at,
- payload.legal_note,
- payload.wp_post_id,
- payload.wp_post_url,
- payload.publish_attempts,
- payload.publish_last_error,
- payload.published_to_wp_at,
- payload.word_count,
- payload.status,
- payload.meta_json,
- existing_id,
- ),
- )
- row = conn.execute("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)).fetchone()
- if row:
- return int(row["id"])
- return int(existing_id) if existing_id else 0
-
-
-def list_articles_page(
- limit: int = 50,
- offset: int = 0,
- status_filter: str | None = None,
- search: str | None = None,
-) -> tuple[list[dict[str, Any]], int]:
- """Return (articles, total_count) with optional status filter and title search."""
- safe_limit = max(1, min(limit, 200))
- safe_offset = max(0, offset)
-
- conditions: list[str] = []
- params: list[Any] = []
- if status_filter:
- conditions.append("a.status = ?")
- params.append(status_filter)
- if search:
- conditions.append("(a.title LIKE ? OR a.id = ?)")
- try:
- params.extend([f"%{search}%", int(search)])
- except ValueError:
- params.extend([f"%{search}%", -1])
-
- where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
- select = """
- SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw,
- a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at,
- a.word_count, f.name AS feed_name
- FROM articles a
- LEFT JOIN feeds f ON f.id = a.feed_id
- """
- with get_conn() as conn:
- total = conn.execute(
- f"SELECT COUNT(*) FROM articles a {where}", params
- ).fetchone()[0]
- rows = conn.execute(
- f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?",
- params + [safe_limit, safe_offset],
- ).fetchall()
- return rows_to_dicts(rows), total
-
-
-def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int:
- """Update wp_post_id (and clear stale wp_post_url) for multiple articles.
-
- Returns the number of rows actually updated.
- Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and
- scheduled_publish_at from the live WordPress data.
- """
- if not updates:
- return 0
- updated = 0
- with get_conn() as conn:
- for article_id, new_wp_id in updates:
- conn.execute(
- "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?",
- (new_wp_id, article_id),
- )
- updated += 1
- return updated
-
-
-def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
- safe_limit = max(1, min(limit, 500))
- with get_conn() as conn:
- if status_filter:
- rows = conn.execute(
- """
- SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
- a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
- a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
- a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
- a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
- FROM articles a
- LEFT JOIN feeds f ON f.id = a.feed_id
- WHERE a.status = ?
- ORDER BY a.id DESC
- LIMIT ?
- """,
- (status_filter, safe_limit),
- ).fetchall()
- else:
- rows = conn.execute(
- """
- SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
- a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
- a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
- a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
- a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
- FROM articles a
- LEFT JOIN feeds f ON f.id = a.feed_id
- ORDER BY a.id DESC
- LIMIT ?
- """,
- (safe_limit,),
- ).fetchall()
- return rows_to_dicts(rows)
diff --git a/backend/app/rewrite.py b/backend/app/rewrite.py
deleted file mode 100644
index 05937e5..0000000
--- a/backend/app/rewrite.py
+++ /dev/null
@@ -1,204 +0,0 @@
-from __future__ import annotations
-
-import json
-import re
-from typing import Any
-from urllib.request import Request, urlopen
-
-from .config import get_settings
-
-
-def _sanitize_source_text(text: str) -> str:
- raw = (text or "").strip()
- if not raw:
- return ""
-
- lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
- if len(lines) > 3:
- lines = lines[3:]
-
- joined = "\n".join(lines)
- # Remove press contact block at end from "Pressekontakt" onward.
- joined = re.sub(
- r"\n?\s*Pressekontakt[\s\S]*$",
- "",
- joined,
- flags=re.IGNORECASE,
- ).strip()
- return joined
-
-
-def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
- out: list[str] = []
- seen: set[str] = set()
- for raw in tags:
- value = re.sub(r"\s+", " ", str(raw or "").strip())
- value = re.sub(r"^[#\-•\s]+", "", value)
- value = re.sub(r"[;,.:\s]+$", "", value)
- if not value:
- continue
- if len(value) < 2 or len(value) > 40:
- continue
- key = value.casefold()
- if key in seen:
- continue
- seen.add(key)
- out.append(value)
- if len(out) >= max_tags:
- break
- return out
-
-
-def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
- settings = get_settings()
- api_key = settings.openai_api_key
- if not api_key:
- raise RuntimeError("OPENAI_API_KEY fehlt")
-
- payload = {
- "model": settings.openai_model,
- "temperature": temperature,
- "messages": [
- {"role": "system", "content": system},
- {"role": "user", "content": user},
- ],
- }
- req = Request(
- url="https://api.openai.com/v1/chat/completions",
- method="POST",
- data=json.dumps(payload).encode("utf-8"),
- headers={
- "Authorization": f"Bearer {api_key}",
- "Content-Type": "application/json",
- "Accept": "application/json",
- },
- )
- with urlopen(req, timeout=60) as resp:
- raw = resp.read().decode("utf-8", errors="replace")
- data = json.loads(raw)
- choices = data.get("choices")
- if not isinstance(choices, list) or not choices:
- raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")
- message = choices[0].get("message", {})
- content = message.get("content")
- if not isinstance(content, str) or not content.strip():
- raise RuntimeError("OpenAI lieferte keinen Inhalt")
- return content.strip()
-
-
-def rewrite_article_text(article: dict[str, Any]) -> str:
- source_text = _sanitize_source_text(article.get("content_raw") or "")
- if not source_text:
- source_text = (article.get("summary") or "").strip()
- if not source_text:
- raise RuntimeError("Kein Quelltext für Rewrite verfügbar")
-
- title = (article.get("title") or "").strip()
- source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip()
- prompt = (
- "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
- "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
- "ohne Pressekontakt, ohne Quellenblock. "
- "Nutze klare Absätze und Zwischenüberschriften in HTML (,
,
- falls passend). "
- "Inhaltlich korrekt bleiben, nichts erfinden. "
- f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. "
- "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, "
- f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n"
- f"Titel: {title}\n\n"
- f"Originaltext:\n{source_text}"
- )
- return _openai_chat(
- "Du bist ein deutscher News-Redakteur.",
- prompt,
- temperature=0.4,
- )
-
-
-def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
- source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
- source_text = str(source_text).strip()
- if not source_text:
- return []
- title = (article.get("title") or "").strip()
- prompt = (
- "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
- f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
- "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
- f"Titel: {title}\n\n"
- f"Text:\n{source_text[:3500]}"
- )
- raw = _openai_chat(
- "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
- prompt,
- temperature=0.2,
- )
- try:
- parsed = json.loads(raw)
- if isinstance(parsed, list):
- return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
- except Exception:
- pass
- # fallback: extract first JSON-like array if model wrapped output
- match = re.search(r"\[[\s\S]*\]", raw)
- if match:
- try:
- parsed = json.loads(match.group(0))
- if isinstance(parsed, list):
- return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
- except Exception:
- return []
- return []
-
-
-def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]:
- """Score article relevance for VanLife/Camping/Outdoor blog (0-100).
-
- Returns {"score": int, "reason": str, "topics": list[str]}.
- Raises RuntimeError on OpenAI failure.
- """
- title = (article.get("title") or "").strip()
- text = _sanitize_source_text(article.get("content_raw") or "")
- if not text:
- text = (article.get("summary") or "").strip()
-
- prompt = (
- "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
- "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
- "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
- "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
- "Antworte NUR mit einem JSON-Objekt:\n"
- '{"score": <0-100>, "reason": "", "topics": ["", ""]}\n\n'
- f"Titel: {title}\n\n"
- f"Text (Auszug):\n{text[:2000]}"
- )
- raw = _openai_chat(
- "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
- prompt,
- temperature=0.1,
- )
- try:
- match = re.search(r"\{[\s\S]*\}", raw)
- if match:
- parsed = json.loads(match.group(0))
- score = max(0, min(100, int(parsed.get("score", 0))))
- return {
- "score": score,
- "reason": str(parsed.get("reason", "")),
- "topics": [str(t) for t in (parsed.get("topics") or [])],
- }
- except Exception:
- pass
- return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []}
-
-
-def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
- meta: dict[str, Any] = {}
- if meta_json:
- try:
- parsed = json.loads(meta_json)
- if isinstance(parsed, dict):
- meta = parsed
- except Exception:
- meta = {}
- meta["generated_tags"] = _normalize_tags(tags)
- return json.dumps(meta, ensure_ascii=False)
diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py
deleted file mode 100644
index d5ea5bf..0000000
--- a/backend/app/scheduler.py
+++ /dev/null
@@ -1,336 +0,0 @@
-"""Smart publishing scheduler.
-
-Calculates suggested publish slots for new WordPress drafts.
-Rules:
-- Maximum N drafts per day (configurable, default 2)
-- Preferred slots: configurable hours (default 09:00 and 14:00 CET)
-- New articles queue up after the last already-scheduled article
-- Checks both local DB AND WordPress future posts to avoid double-booking
-"""
-from __future__ import annotations
-
-import base64
-import json
-import threading
-import urllib.request
-from datetime import date, datetime, timedelta, timezone
-from typing import Any
-
-from .config import get_settings
-from .db import get_conn
-
-# Ensures that concurrent pipeline runs (two threads) never assign the same slot.
-_slot_lock = threading.Lock()
-
-
-# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity)
-_CET_OFFSET = timedelta(hours=1)
-
-
-def _today_cet() -> date:
- return (datetime.now(timezone.utc) + _CET_OFFSET).date()
-
-
-def _preferred_hours() -> list[int]:
- settings = get_settings()
- try:
- return [int(h.strip()) for h in settings.pipeline_publish_hours.split(",") if h.strip()]
- except Exception:
- return [9, 14]
-
-
-def _fetch_wp_occupied_slots() -> set[tuple[str, int]]:
- """Fetch all future-scheduled WordPress posts and return occupied (date_iso, hour) pairs.
-
- This prevents the scheduler from assigning a slot that is already taken
- by a WP post that was not created via this pipeline (e.g. manually or via recovery scripts).
- Returns an empty set on any error so the scheduler degrades gracefully.
- """
- settings = get_settings()
- try:
- auth = base64.b64encode(
- f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode()
- ).decode()
- url = (
- f"{settings.wordpress_base_url}/wp-json/wp/v2/posts"
- f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date"
- )
- req = urllib.request.Request(url, headers={"Authorization": f"Basic {auth}"})
- with urllib.request.urlopen(req, timeout=10) as resp:
- posts = json.loads(resp.read())
- occupied: set[tuple[str, int]] = set()
- for p in posts:
- try:
- dt = datetime.fromisoformat(p["date"])
- occupied.add((dt.date().isoformat(), dt.hour))
- except Exception:
- pass
- return occupied
- except Exception:
- return set()
-
-
-def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None:
- """Return the date of the latest already-scheduled slot (DB + WP)."""
- today = _today_cet()
-
- # Latest from local DB
- with get_conn() as conn:
- row = conn.execute(
- """
- SELECT MAX(scheduled_publish_at) AS last_slot
- FROM articles
- WHERE scheduled_publish_at IS NOT NULL
- AND scheduled_publish_at >= ?
- AND status NOT IN ('error', 'no_image')
- """,
- (today.isoformat() + "T00:00:00",),
- ).fetchone()
- db_last: date | None = None
- if row and row["last_slot"]:
- try:
- db_last = datetime.fromisoformat(row["last_slot"]).date()
- except Exception:
- pass
-
- # Latest from WP
- wp_last: date | None = None
- for d_str, _ in wp_occupied:
- try:
- d = date.fromisoformat(d_str)
- if d >= today and (wp_last is None or d > wp_last):
- wp_last = d
- except Exception:
- pass
-
- if db_last and wp_last:
- return max(db_last, wp_last)
- return db_last or wp_last
-
-
-def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None:
- """Return first preferred hour not yet used on target_date (DB + WP), or None if day is full."""
- hours = _preferred_hours()
- date_str = target_date.isoformat()
-
- # Hours used in local DB
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT scheduled_publish_at FROM articles
- WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
- AND status NOT IN ('error', 'no_image')
- """,
- (date_str + "T00:00:00", date_str + "T23:59:59"),
- ).fetchall()
-
- used_hours: set[int] = set()
- for row in rows:
- ts = row["scheduled_publish_at"] or ""
- try:
- used_hours.add(datetime.fromisoformat(ts).hour)
- except Exception:
- pass
-
- # Hours used in WordPress
- for d_str, h in wp_occupied:
- if d_str == date_str:
- used_hours.add(h)
-
- for h in hours:
- if h not in used_hours:
- return h
- return None
-
-
-def _format_slot(d: date, hour: int) -> str:
- weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"]
- wd = weekday_names[d.weekday()]
- return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr"
-
-
-def _find_next_free_slot(
- wp_occupied: set[tuple[str, int]], lookahead_days: int = 60
-) -> tuple[date, int] | None:
- """Find the next free (date, hour) slot.
-
- Starts from tomorrow and scans forward, filling any gaps in the schedule
- rather than always appending after the last existing post.
- """
- today = _today_cet()
- tomorrow = today + timedelta(days=1)
-
- for offset in range(0, lookahead_days + 1):
- candidate = tomorrow + timedelta(days=offset)
- hour = _next_free_hour(candidate, wp_occupied)
- if hour is not None:
- return candidate, hour
-
- return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
-
-
-def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
- """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date."""
- today = _today_cet()
- hours = _preferred_hours()
-
- # Slots booked in local DB
- with get_conn() as conn:
- rows = conn.execute(
- """
- SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
- FROM articles
- WHERE scheduled_publish_at IS NOT NULL
- AND scheduled_publish_at >= ?
- AND status NOT IN ('error', 'no_image')
- ORDER BY scheduled_publish_at
- """,
- (today.isoformat() + "T00:00:00",),
- ).fetchall()
-
- db_slots: dict[tuple[str, int], dict] = {}
- for row in rows:
- try:
- dt = datetime.fromisoformat(row["scheduled_publish_at"])
- key = (dt.date().isoformat(), dt.hour)
- db_slots[key] = {
- "date": dt.date().isoformat(),
- "hour": dt.hour,
- "formatted": _format_slot(dt.date(), dt.hour),
- "source": "db",
- "article_id": row["id"],
- "article_title": row["title"],
- "article_status": row["status"],
- "wp_post_id": row["wp_post_id"],
- "wp_post_url": row["wp_post_url"],
- }
- except Exception:
- pass
-
- # Slots occupied in WordPress but not in local DB
- wp_occupied = _fetch_wp_occupied_slots()
- wp_only: list[dict] = []
- for d_str, h in sorted(wp_occupied):
- if (d_str, h) in db_slots:
- continue
- try:
- d = date.fromisoformat(d_str)
- if d >= today:
- wp_only.append({
- "date": d_str,
- "hour": h,
- "formatted": _format_slot(d, h),
- "source": "wordpress",
- "article_id": None,
- "article_title": "(WP-Beitrag außerhalb Pipeline)",
- "article_status": None,
- "wp_post_id": None,
- "wp_post_url": None,
- })
- except Exception:
- pass
-
- all_slots = list(db_slots.values()) + wp_only
- all_slots.sort(key=lambda s: (s["date"], s["hour"]))
- return all_slots
-
-
-def release_publish_slot(article_id: int) -> None:
- """Clear a previously reserved slot (e.g. when article is rejected after slot assignment)."""
- with get_conn() as conn:
- conn.execute(
- "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?",
- (article_id,),
- )
-
-
-def suggest_publish_slot() -> str:
- """Return a suggested publish datetime string (CET) for the next free slot."""
- wp_occupied = _fetch_wp_occupied_slots()
- result = _find_next_free_slot(wp_occupied)
- if result:
- d, hour = result
- return _format_slot(d, hour)
- tomorrow = _today_cet() + timedelta(days=1)
- return _format_slot(tomorrow, _preferred_hours()[0] if _preferred_hours() else 9)
-
-
-def reserve_publish_slot(article_id: int) -> str:
- """Reserve a publish slot for an article and persist it in the DB.
-
- If the article already has a scheduled_publish_at, keep it unchanged.
- Returns the formatted publish datetime string.
-
- Uses a module-level lock so that concurrent pipeline runs (two threads)
- cannot read the same "free" slot and assign it twice.
- """
- # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow
- # and must not block other threads unnecessarily.
- wp_occupied = _fetch_wp_occupied_slots()
-
- with _slot_lock:
- # Single DB connection for the entire read-find-write cycle so the
- # slot we pick is still free when we write it.
- with get_conn() as conn:
- row = conn.execute(
- "SELECT scheduled_publish_at FROM articles WHERE id = ?",
- (article_id,),
- ).fetchone()
- existing_slot = row["scheduled_publish_at"] if row else None
- if existing_slot:
- try:
- dt = datetime.fromisoformat(existing_slot)
- return _format_slot(dt.date(), dt.hour)
- except Exception:
- pass # invalid — fall through and assign a fresh slot
-
- # Find the next free (date, hour) slot using THIS connection so we
- # see all slots written during this lock window.
- hours = _preferred_hours()
- today = _today_cet()
- tomorrow = today + timedelta(days=1)
- candidate: date | None = None
- chosen_hour: int | None = None
-
- for offset in range(0, 61):
- d = tomorrow + timedelta(days=offset)
- date_str = d.isoformat()
-
- rows = conn.execute(
- """
- SELECT scheduled_publish_at FROM articles
- WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
- AND status NOT IN ('error', 'no_image')
- """,
- (date_str + "T00:00:00", date_str + "T23:59:59"),
- ).fetchall()
-
- used_hours: set[int] = set()
- for r in rows:
- ts = r["scheduled_publish_at"] or ""
- try:
- used_hours.add(datetime.fromisoformat(ts).hour)
- except Exception:
- pass
- for d_str, h in wp_occupied:
- if d_str == date_str:
- used_hours.add(h)
-
- for h in hours:
- if h not in used_hours:
- candidate = d
- chosen_hour = h
- break
- if candidate is not None:
- break
-
- if candidate is None:
- candidate = tomorrow
- chosen_hour = hours[0] if hours else 9
-
- iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00"
- conn.execute(
- "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?",
- (iso_ts, article_id),
- )
- return _format_slot(candidate, chosen_hour)
diff --git a/backend/app/source_extraction.py b/backend/app/source_extraction.py
deleted file mode 100644
index d3cbed8..0000000
--- a/backend/app/source_extraction.py
+++ /dev/null
@@ -1,442 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from html import unescape
-import re
-from typing import Any
-from urllib.parse import urljoin
-from urllib.request import Request, urlopen
-
-DEFAULT_TIMEOUT_SECONDS = 10
-DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
-
-
-@dataclass(frozen=True)
-class ExtractedArticle:
- title: str | None
- author: str | None
- canonical_url: str | None
- summary: str | None
- content_text: str | None
- images: list[str]
- press_contact: str | None
- extraction_error: str | None = None
- image_metadata: dict[str, dict] = field(default_factory=dict)
-
-
-def _clean_text(raw: str | None) -> str | None:
- if not raw:
- return None
- text = unescape(raw)
- text = re.sub(r"<[^>]+>", " ", text)
- text = re.sub(r"\s+", " ", text).strip()
- return text or None
-
-
-def _strip_noise(html: str) -> str:
- html = re.sub(r"
-