Compare commits
92 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f710141828 | ||
|
|
2456e4aca7 | ||
|
|
1498fa7156 | ||
|
|
cdcf441daf | ||
|
|
2d02b56b65 | ||
|
|
8676ace102 | ||
|
|
cf2d826c8a | ||
|
|
2d1dd14e45 | ||
|
|
09dcf6ce36 | ||
|
|
94bd93a18a | ||
|
|
8fa46312e8 | ||
|
|
764e7bff6a | ||
|
|
426a799371 | ||
|
|
8c6022fead | ||
|
|
1a8d0775c7 | ||
|
|
45c533c674 | ||
|
|
d1cb809852 | ||
|
|
82f2df610d | ||
|
|
8e65485f0c | ||
|
|
0d07a9804d | ||
|
|
aaac5def27 | ||
|
|
1963e32ab4 | ||
|
|
12932bca90 | ||
|
|
013af2ab62 | ||
|
|
a64bf31ff6 | ||
|
|
970f509ad4 | ||
|
|
e9c472b722 | ||
|
|
1020526e76 | ||
|
|
d9ab599466 | ||
|
|
0a9c0b10d6 | ||
|
|
6192f8e527 | ||
| 6332a9a399 | |||
| 93f52f72b9 | |||
| b0f995d5c9 | |||
| da269d08f1 | |||
| 88b2ee1d01 | |||
| 50f737f434 | |||
| 35ccceb260 | |||
| 8d7375c99f | |||
| 24d8e5ad0f | |||
| e68b6a41fd | |||
| ba83b24510 | |||
| fee5e76842 | |||
| 592d699166 | |||
| 1cee56205e | |||
| dcdf4d954a | |||
| 26e3d26b93 | |||
| fb3465fb10 | |||
| 910ca72c81 | |||
| efaf132936 | |||
| 6691db8051 | |||
| 5159a6e3b4 | |||
| c52363f1a7 | |||
| 2c331d683b | |||
| d65c55d315 | |||
| a46d919118 | |||
| 46e0b98928 | |||
| 0bb7d246c1 | |||
| a02f825274 | |||
| 0cfbb6c37f | |||
| 777c770142 | |||
| beac96095e | |||
| ed91864eda | |||
| 759a313f31 | |||
| d6ab09226a | |||
| 808a39dfc9 | |||
| 1c63163f22 | |||
| 56a766596b | |||
| 0c84dd1a1a | |||
| 050e08859c | |||
| 9898964f00 | |||
| 79bdb57e98 | |||
| 1797f687cd | |||
| 56c379cbed | |||
| 4eaef89be8 | |||
| b41901b1c1 | |||
| c49864c4aa | |||
| 3862879a03 | |||
| 6ab8bdaadb | |||
| 1badb77173 | |||
| d2b6648872 | |||
| 2a9fa1e548 | |||
| 91baacd355 | |||
| 4e042edaaf | |||
| 1b3ecbb3db | |||
| 082737a9c4 | |||
| 1d54cf904b | |||
| c0b03ba477 | |||
| 10212f825d | |||
| 025eb5105c | |||
| bae12a31c5 | |||
| 0df9633439 |
93 changed files with 31187 additions and 496 deletions
BIN
.dedupe/index.sqlite
Normal file
BIN
.dedupe/index.sqlite
Normal file
Binary file not shown.
1
.dedupe/report.csv
Normal file
1
.dedupe/report.csv
Normal file
|
|
@ -0,0 +1 @@
|
|||
group_id,canonical_path,dup_path,dup_size_bytes
|
||||
|
16
.env.example
Normal file
16
.env.example
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Copy to .env and fill in values
|
||||
|
||||
# WordPress base URL (required)
|
||||
WP_BASE_URL=https://your-site.tld
|
||||
|
||||
# Authentication: prefer WP_AUTH_BASE64, or alternatively set WP_USERNAME + WP_PASSWORD (Application Password)
|
||||
# Example to generate: base64(username:application_password)
|
||||
WP_AUTH_BASE64=
|
||||
|
||||
# Alternatively provide username and application password
|
||||
WP_USERNAME=
|
||||
WP_PASSWORD=
|
||||
|
||||
# OpenAI API key (optional, enables rewrite)
|
||||
OPENAI_API_KEY=
|
||||
|
||||
30
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
30
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
---
|
||||
name: 🐞 Bug melden
|
||||
about: Etwas funktioniert nicht wie erwartet? Dann bitte hier melden.
|
||||
title: "[Bug]: "
|
||||
labels: bug
|
||||
---
|
||||
|
||||
### 🐛 Fehlerbeschreibung
|
||||
|
||||
Beschreibe klar und prägnant, was nicht funktioniert.
|
||||
|
||||
### 🔁 Schritte zur Reproduktion
|
||||
|
||||
1. ...
|
||||
2. ...
|
||||
3. ...
|
||||
|
||||
### 🤔 Erwartetes Verhalten
|
||||
|
||||
Was hättest du erwartet, was passieren soll?
|
||||
|
||||
### 📷 Screenshot oder Logauszug (optional)
|
||||
|
||||
Wenn hilfreich, Screenshot oder Terminal-Fehlermeldung einfügen.
|
||||
|
||||
### 🧩 Kontext / Systeminfos
|
||||
|
||||
- Version: z. B. `v1.5.0`
|
||||
- Python-Version:
|
||||
- Betriebssystem:
|
||||
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: 💬 Frage oder Feedback
|
||||
url: https://github.com/OliverGiertz/rss-news/discussions
|
||||
about: Nutze GitHub Discussions für allgemeine Fragen oder Feedback.
|
||||
22
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
22
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
---
|
||||
name: ✨ Feature Request
|
||||
about: Du hast eine Idee oder willst eine Funktion vorschlagen? Her damit!
|
||||
title: "[Feature]: "
|
||||
labels: enhancement
|
||||
---
|
||||
|
||||
### 🚀 Beschreibung der Funktion
|
||||
|
||||
Was soll die neue Funktion können?
|
||||
|
||||
### 🧠 Warum ist das nützlich?
|
||||
|
||||
Welches Problem löst das Feature oder wie verbessert es das Tool?
|
||||
|
||||
### 💡 Vorschläge zur Umsetzung (optional)
|
||||
|
||||
Wie könnte man das technisch umsetzen?
|
||||
|
||||
### 🔗 Relevante Verweise (optional)
|
||||
|
||||
Links, Screenshots, Mockups oder Referenzcode.
|
||||
18
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
18
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
## ✅ Änderungen in diesem Pull Request
|
||||
|
||||
- [x] Kurze Beschreibung der Änderung
|
||||
- [ ] ggf. Screenshots oder Codebeispiele
|
||||
|
||||
## 🧪 Tests
|
||||
|
||||
- [ ] Lokal getestet
|
||||
- [ ] Streamlit UI geprüft
|
||||
- [ ] Linter/Formatter ausgeführt
|
||||
|
||||
## 🔗 Zusammenhang mit Issues
|
||||
|
||||
Closes #123 (optional)
|
||||
|
||||
## 📎 Sonstiges
|
||||
|
||||
Falls nötig: Kontext, bekannte Einschränkungen etc.
|
||||
34
.github/workflows/deploy.yml
vendored
Normal file
34
.github/workflows/deploy.yml
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
name: 🚀 Deploy to Hetzner
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Deploy via SSH
|
||||
uses: appleboy/ssh-action@v1.0.0
|
||||
with:
|
||||
host: 88.99.209.207
|
||||
username: oliver
|
||||
key: ${{ secrets.HETZNER_SSH_KEY }}
|
||||
port: 22
|
||||
envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD
|
||||
script: |
|
||||
cd /opt/rss-news
|
||||
git pull origin main
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
pip install -r backend/requirements.txt || true
|
||||
sudo systemctl restart rss-news-api
|
||||
sleep 3
|
||||
BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh
|
||||
env:
|
||||
APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }}
|
||||
APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }}
|
||||
36
.github/workflows/release.yml
vendored
36
.github/workflows/release.yml
vendored
|
|
@ -1,4 +1,4 @@
|
|||
name: 🚀 GitHub Release
|
||||
name: 🚀 GitHub Release (Full Changelog)
|
||||
|
||||
on:
|
||||
push:
|
||||
|
|
@ -7,45 +7,25 @@ on:
|
|||
|
||||
jobs:
|
||||
create_release:
|
||||
name: 📦 GitHub Release erstellen
|
||||
name: 📦 Release mit sauberem Markdown-Body
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: 📥 Repository klonen
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: 📝 Release Notes aus CHANGELOG extrahieren
|
||||
- name: 🔧 Lade gesamten CHANGELOG.md-Inhalt
|
||||
id: changelog
|
||||
run: |
|
||||
VERSION=${GITHUB_REF#refs/tags/}
|
||||
echo "📌 Version: $VERSION"
|
||||
echo "body<<EOF" >> $GITHUB_OUTPUT
|
||||
cat CHANGELOG.md >> $GITHUB_OUTPUT
|
||||
echo "EOF" >> $GITHUB_OUTPUT
|
||||
|
||||
BODY=$(awk -v version="$VERSION" '
|
||||
BEGIN { found = 0 }
|
||||
$0 ~ "## \\[v"version"\\]" { found = 1; next }
|
||||
$0 ~ /^## \[v[0-9]+\.[0-9]+\.[0-9]+\]/ && found { exit }
|
||||
found { print }
|
||||
' CHANGELOG.md)
|
||||
|
||||
echo "$BODY" > extracted_changelog.md
|
||||
|
||||
# Prüfen, ob Text extrahiert wurde
|
||||
if [ ! -s extracted_changelog.md ]; then
|
||||
echo "❌ Kein gültiger CHANGELOG-Eintrag für Version $VERSION gefunden. Abbruch."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
BODY_ESCAPED="${BODY//'%'/'%25'}"
|
||||
BODY_ESCAPED="${BODY_ESCAPED//$'\n'/'%0A'}"
|
||||
BODY_ESCAPED="${BODY_ESCAPED//$'\r'/'%0D'}"
|
||||
|
||||
echo "body=$BODY_ESCAPED" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: 🚀 GitHub Release veröffentlichen
|
||||
- name: 🚀 Release erstellen mit Markdown
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
name: ${{ github.ref_name }}
|
||||
body: ${{ steps.changelog.outputs.body }}
|
||||
files: extracted_changelog.md
|
||||
files: CHANGELOG.md
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
|
|
|||
39
.github/workflows/test.yml
vendored
Normal file
39
.github/workflows/test.yml
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
name: Backend Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
backend-tests:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r backend/requirements.txt
|
||||
pip install -r backend/requirements-test.txt
|
||||
|
||||
- name: Run tests with coverage
|
||||
env:
|
||||
APP_DB_PATH: /tmp/rss_news_test.db
|
||||
run: |
|
||||
pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml
|
||||
|
||||
- name: Upload coverage artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: coverage-xml
|
||||
path: coverage.xml
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -34,4 +34,6 @@ Thumbs.db
|
|||
internal/start.sh
|
||||
internal/copy_files.sh
|
||||
internal/_line.txt
|
||||
|
||||
internal/push_commit.txt
|
||||
internal/git.sh
|
||||
CLAUDE.md
|
||||
|
|
|
|||
40
AGENTS.md
Normal file
40
AGENTS.md
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Repository Guidelines
|
||||
|
||||
## Project Structure & Module Organization
|
||||
- `app.py`: Streamlit UI (entry point for the app).
|
||||
- `main.py`: RSS fetching, rewrite, and WordPress upload logic.
|
||||
- `utils/`: Helpers (image/article extraction, WP uploader, UI helpers).
|
||||
- `pages/`: Streamlit pages (e.g., `01_feed_manager.py`, `log_viewer.py`).
|
||||
- `data/`: JSON state (`articles.json`, `feeds.json`).
|
||||
- `logs/`: Runtime logs (`rss_tool.log`).
|
||||
- `docs/`: Project notes (e.g., roadmap).
|
||||
- `__version__.py`: Version string written by `versioning.py`.
|
||||
|
||||
## Build, Test, and Development Commands
|
||||
- Create env: `python -m venv .venv && source .venv/bin/activate`
|
||||
- Install deps: `pip install -r requirements.txt`
|
||||
- Run app: `streamlit run app.py`
|
||||
- Version bump: `python versioning.py --level patch --push` (updates `__version__.py`, prepares `CHANGELOG.md`, creates tag; see `--help`).
|
||||
|
||||
## Coding Style & Naming Conventions
|
||||
- Python 3.10+, PEP 8, 4-space indentation, type hints where practical.
|
||||
- Modules and functions: `snake_case`; classes: `PascalCase`.
|
||||
- Streamlit pages: numeric prefix for order, e.g., `pages/01_feature.py`.
|
||||
- Keep functions small and pure in `utils/`; isolate I/O in app layers.
|
||||
- Suggested tools (optional): Black (`black .`) and Ruff (`ruff check .`).
|
||||
|
||||
## Testing Guidelines
|
||||
- Framework: pytest (recommended). Place tests under `tests/` with `test_*.py`.
|
||||
- Unit tests for `utils/*`; light integration checks for `main.py` with temporary files.
|
||||
- Run: `pytest -q`. Add coverage if needed (e.g., `pytest --cov=utils`).
|
||||
- Test data: avoid mutating files in `data/`; use temp dirs or fixtures.
|
||||
|
||||
## Commit & Pull Request Guidelines
|
||||
- Commits: imperative mood, concise; examples: `Add feed dedupe`, `Fix WP upload retry`, `Bump version to v1.7.0`.
|
||||
- PRs: clear description, linked issue, screenshots/GIFs for UI changes, note env variables touched.
|
||||
- Update `CHANGELOG.md` and bump version via `versioning.py` before release PRs.
|
||||
|
||||
## Security & Configuration Tips
|
||||
- Required env: `WP_BASE_URL` plus either `WP_USERNAME` + `WP_PASSWORD` or `WP_AUTH_BASE64`; `OPENAI_API_KEY` is optional and enables rewriting (see `.env.example`).
|
||||
- Never commit secrets; `.env` is git-ignored. Avoid hardcoded credentials; prefer `os.getenv`.
|
||||
- Logs and data may contain content; do not commit `logs/` or large `data/` snapshots.
|
||||
1061
CHANGELOG.md
1061
CHANGELOG.md
File diff suppressed because it is too large
Load diff
74
README.md
74
README.md
|
|
@ -1,27 +1,63 @@
|
|||
# 📰 RSS Article Manager
|
||||
# rss-news (Rebuild)
|
||||
|
||||
Ein einfaches, modulares Webtool auf Basis von Streamlit, das RSS-Artikel automatisch einliest, umschreibt, zusammenfasst und mit Tags versieht – bereit zur Veröffentlichung auf WordPress.
|
||||
`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut.
|
||||
|
||||
## ✨ Funktionen
|
||||
Aktueller Stand:
|
||||
- Alte Streamlit-App wird nicht produktiv genutzt.
|
||||
- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet.
|
||||
- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt.
|
||||
|
||||
- 📥 RSS-Feeds direkt über die Oberfläche hinzufügen und verwalten
|
||||
- 📝 Artikel automatisch umschreiben mit Hilfe von ChatGPT
|
||||
- 🏷️ Tags und Zusammenfassungen automatisch generieren
|
||||
- 🗂️ Übersicht in tabellarischer Form mit Filter nach Status
|
||||
- 📋 Kopierbare Inhalte für manuelles Einfügen in WordPress
|
||||
- 📎 Link zum Originalartikel zur einfachen Bildübernahme
|
||||
- 💾 Speicherung in einer lokalen JSON-Datei (später SQLite möglich)
|
||||
- 📦 Versionierung inkl. CHANGELOG und GitHub Releases
|
||||
## Ziele
|
||||
- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln
|
||||
- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen)
|
||||
- Zuverlaessige Automatisierung auf Hetzner
|
||||
- Publikation nach WordPress (IONOS aktuell, spaeter offen)
|
||||
- Zugriff nur nach Login (zunaechst User/Password)
|
||||
|
||||
## 🔐 Voraussetzungen
|
||||
## Architektur-Richtung (MVP)
|
||||
- Backend: `Python + FastAPI`
|
||||
- Jobs: Queue-Worker (z. B. Redis + RQ/Celery)
|
||||
- Daten: SQLite fuer MVP, spaeter optional PostgreSQL
|
||||
- Auth: Session-Login mit einem Admin-User
|
||||
- Publishing: WordPress REST API (Status zunaechst `pending`)
|
||||
|
||||
- Python 3.8+
|
||||
- OpenAI API Key (per `.env` eingebunden)
|
||||
Details: `docs/PROJECT_PLAN.md`
|
||||
|
||||
## 🚀 Loslegen
|
||||
## Projektsteuerung
|
||||
- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1`
|
||||
- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen.
|
||||
- Wiki-Struktur liegt unter `docs/wiki/`.
|
||||
|
||||
## Dokumentation
|
||||
- Projektplan: `docs/PROJECT_PLAN.md`
|
||||
- ToDo-Liste: `docs/TODO.md`
|
||||
- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md`
|
||||
- Wiki Home: `docs/wiki/Home.md`
|
||||
|
||||
## Lokale Entwicklung (Legacy-Code)
|
||||
Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden:
|
||||
|
||||
```bash
|
||||
# Setup
|
||||
git clone https://github.com/dein-benutzername/rss-article-manager.git
|
||||
cd rss-article-manager
|
||||
bash start.sh
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
Hinweis: Diese App ist nur noch von historischem Interesse und wird durch die neue Architektur ersetzt.
|
||||
|
||||
## Deployment-Zielbild
|
||||
- Betrieb auf Hetzner
|
||||
- Reverse Proxy via CloudPanel/Nginx
|
||||
- Produktive Domain: `news.vanityontour.de`
|
||||
- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de`
|
||||
|
||||
## Sicherheit
|
||||
- Keine Secrets im Repository
|
||||
- `.env` lokal/auf Server, nie committen
|
||||
- Auth-Pflicht fuer die neue WebApp
|
||||
- spaeter optional: Passkeys/WebAuthn
|
||||
|
||||
## Rechtlicher Hinweis
|
||||
Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig.
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
VERSION = "1.4.6"
|
||||
VERSION = "1.7.1"
|
||||
|
|
|
|||
45
backend/.env.example
Normal file
45
backend/.env.example
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
# ─── App ────────────────────────────────────────────────────────────────────
|
||||
APP_ENV=development
|
||||
APP_NAME=rss-news-backend
|
||||
APP_SECRET_KEY=replace-with-a-long-random-secret
|
||||
APP_DB_PATH=backend/data/rss_news.db
|
||||
|
||||
APP_ADMIN_USERNAME=admin
|
||||
APP_ADMIN_PASSWORD=change-me
|
||||
|
||||
SESSION_COOKIE_NAME=rss_news_session
|
||||
SESSION_MAX_AGE_SECONDS=28800
|
||||
|
||||
# ─── WordPress ──────────────────────────────────────────────────────────────
|
||||
WP_BASE_URL=https://your-site.tld
|
||||
WP_USERNAME=your-wp-username
|
||||
WP_PASSWORD=your-wp-app-password
|
||||
# Status für neue Beiträge: draft | future | publish
|
||||
WORDPRESS_DEFAULT_STATUS=draft
|
||||
|
||||
# ─── OpenAI ─────────────────────────────────────────────────────────────────
|
||||
OPENAI_API_KEY=sk-...
|
||||
# gpt-4o-mini empfohlen (Kosten/Qualität)
|
||||
OPENAI_MODEL=gpt-4o-mini
|
||||
|
||||
# ─── Telegram Bot ────────────────────────────────────────────────────────────
|
||||
# Bot-Token von @BotFather
|
||||
TELEGRAM_BOT_TOKEN=123456789:ABC...
|
||||
# Chat-ID deines persönlichen Chats oder einer Gruppe
|
||||
TELEGRAM_CHAT_ID=123456789
|
||||
# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen)
|
||||
TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars
|
||||
|
||||
# ─── N8N API-Key ─────────────────────────────────────────────────────────────
|
||||
# Wird von N8N im Header X-API-Key mitgeschickt
|
||||
N8N_API_KEY=replace-with-strong-random-key
|
||||
|
||||
# ─── Pipeline-Einstellungen ──────────────────────────────────────────────────
|
||||
# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100)
|
||||
PIPELINE_RELEVANCE_AUTO=80
|
||||
# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden
|
||||
PIPELINE_RELEVANCE_WARN=60
|
||||
# Maximale Drafts/Veröffentlichungen pro Tag
|
||||
PIPELINE_MAX_DRAFTS_PER_DAY=2
|
||||
# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET)
|
||||
PIPELINE_PUBLISH_HOURS=9,14
|
||||
82
backend/README.md
Normal file
82
backend/README.md
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
# Backend Skeleton (FastAPI)
|
||||
|
||||
Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`.
|
||||
|
||||
## Start (lokal)
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r backend/requirements.txt
|
||||
uvicorn backend.app.main:app --reload --port 8501
|
||||
```
|
||||
|
||||
## Admin UI
|
||||
- Login: `http://127.0.0.1:8501/admin/login`
|
||||
- Dashboard: `http://127.0.0.1:8501/admin/dashboard`
|
||||
|
||||
## Environment
|
||||
- Datei: `backend/.env`
|
||||
- Vorlage: `backend/.env.example`
|
||||
|
||||
## Endpoints
|
||||
- `GET /health` - Healthcheck
|
||||
- `POST /auth/login` - Login mit Admin-User
|
||||
- `POST /auth/logout` - Logout
|
||||
- `GET /auth/me` - Aktiver User
|
||||
- `GET /api/protected` - Geschuetzter Test-Endpoint
|
||||
- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler
|
||||
- `GET /api/sources` - Quellenliste
|
||||
- `POST /api/sources` - Quelle anlegen
|
||||
- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle
|
||||
- `GET /api/feeds` - Feedliste
|
||||
- `POST /api/feeds` - Feed anlegen
|
||||
- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed
|
||||
- `GET /api/runs` - Import-/Job-Runs anzeigen
|
||||
- `GET /api/runs/{run_id}` - Detailansicht eines Runs
|
||||
- `POST /api/runs` - Run starten
|
||||
- `POST /api/runs/{run_id}/finish` - Run abschliessen
|
||||
- `GET /api/articles` - Artikel anzeigen
|
||||
- `GET /api/articles/{article_id}` - Artikeldetail
|
||||
- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren
|
||||
- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln
|
||||
- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject)
|
||||
- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed)
|
||||
|
||||
## Datenbank
|
||||
- SQLite-Datei unter `backend/data/rss_news.db`
|
||||
- Tabellen werden beim App-Start initialisiert.
|
||||
- Tabellen: `sources`, `feeds`, `runs`, `articles`, `publish_jobs`
|
||||
- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash`
|
||||
|
||||
## Policy-Enforcement
|
||||
- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist.
|
||||
- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`.
|
||||
- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert.
|
||||
|
||||
## Review-Workflow
|
||||
- Statuskette: `new -> review -> approved -> published`
|
||||
- Ablehnung im Review setzt auf `rewrite`
|
||||
- Ungueltige Statuswechsel werden per API blockiert
|
||||
|
||||
## Verifikation
|
||||
```bash
|
||||
python -m unittest backend.tests.test_db_repositories
|
||||
python -m unittest backend.tests.test_ingestion
|
||||
python -m unittest backend.tests.test_api_auth
|
||||
```
|
||||
|
||||
## CI / Online-Auswertung
|
||||
- GitHub Actions Workflow: `.github/workflows/test.yml`
|
||||
- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus.
|
||||
|
||||
## Hetzner Smoketest
|
||||
```bash
|
||||
BASE_URL="https://news.vanityontour.de" \
|
||||
APP_ADMIN_USERNAME="admin" \
|
||||
APP_ADMIN_PASSWORD="..." \
|
||||
bash scripts/smoke_backend.sh
|
||||
```
|
||||
|
||||
## Hinweis
|
||||
Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen.
|
||||
1
backend/__init__.py
Normal file
1
backend/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Backend package for rss-news rebuild."""
|
||||
1
backend/app/__init__.py
Normal file
1
backend/app/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Application package."""
|
||||
1126
backend/app/admin_ui.py
Normal file
1126
backend/app/admin_ui.py
Normal file
File diff suppressed because it is too large
Load diff
31
backend/app/auth.py
Normal file
31
backend/app/auth.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import hmac
|
||||
from typing import Optional
|
||||
|
||||
from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _serializer() -> URLSafeTimedSerializer:
    """Return the signer used to create and validate session tokens.

    The signer is keyed with the configured ``app_secret_key`` and uses the
    fixed salt ``"rss-news-session"``.
    """
    secret_key = get_settings().app_secret_key
    return URLSafeTimedSerializer(secret_key, salt="rss-news-session")
|
||||
|
||||
|
||||
def verify_credentials(username: str, password: str) -> bool:
    """Check a username/password pair against the configured admin account.

    Both values are compared with ``hmac.compare_digest`` so the comparison
    time does not depend on where the inputs differ.

    Args:
        username: Login name supplied by the client.
        password: Password supplied by the client.

    Returns:
        ``True`` only if both the username and the password match the
        configured admin credentials.
    """
    settings = get_settings()
    # compare_digest() only accepts ASCII-only str objects and raises
    # TypeError otherwise, so compare the UTF-8 encoded bytes instead. This
    # keeps logins with umlauts or other non-ASCII characters from crashing.
    user_ok = hmac.compare_digest(
        username.encode("utf-8"),
        settings.app_admin_username.encode("utf-8"),
    )
    pw_ok = hmac.compare_digest(
        password.encode("utf-8"),
        settings.app_admin_password.encode("utf-8"),
    )
    # Both comparisons are always evaluated before combining the results.
    return user_ok and pw_ok
|
||||
|
||||
|
||||
def create_session_token(username: str) -> str:
    """Return a signed, timestamped session token carrying ``username``."""
    signer = _serializer()
    payload = {"username": username}
    return signer.dumps(payload)
|
||||
|
||||
|
||||
def verify_session_token(token: str) -> Optional[str]:
    """Validate a session token and return the username stored in it.

    Returns ``None`` when the signature is invalid or the token is older than
    the configured ``session_max_age_seconds``.
    """
    max_age = get_settings().session_max_age_seconds
    try:
        payload = _serializer().loads(token, max_age=max_age)
    except (BadSignature, SignatureExpired):
        # Tampered, malformed or expired token: treat as "not logged in".
        return None
    else:
        return payload.get("username")
|
||||
65
backend/app/config.py
Normal file
65
backend/app/config.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import AliasChoices, Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Typed application configuration loaded from the environment and dotenv files.

    Fields declared with ``AliasChoices`` accept any of the listed environment
    variable names (e.g. ``WORDPRESS_BASE_URL`` or the legacy ``WP_BASE_URL``).
    Unknown variables are ignored (``extra="ignore"``).
    """

    # Prefer the backend-specific env file to avoid collisions with the legacy
    # root .env. pydantic-settings loads the files in order and lets later
    # entries override earlier ones, so backend/.env must be listed last.
    model_config = SettingsConfigDict(
        env_file=(".env", "backend/.env"),
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_env: str = "development"
    app_name: str = "rss-news-backend"
    # NOTE(review): placeholder default — must be overridden outside local development.
    app_secret_key: str = "replace-with-a-long-random-secret"

    # NOTE(review): placeholder credentials — must be overridden outside local development.
    app_admin_username: str = "admin"
    app_admin_password: str = "change-me"

    session_cookie_name: str = "rss_news_session"
    session_max_age_seconds: int = 28800  # 8 hours

    app_db_path: str = "backend/data/rss_news.db"

    # WordPress publishing (new WORDPRESS_* names or legacy WP_* names)
    wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL"))
    wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME"))
    wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD"))
    wordpress_default_status: str = "draft"

    # OpenAI
    openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY"))
    openai_model: str = "gpt-4o-mini"

    # Telegram Bot
    telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN"))
    telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID"))
    telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET"))

    # N8N API authentication
    n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY"))

    # Pipeline behaviour
    pipeline_relevance_auto: int = 80          # >= this: auto-process
    pipeline_relevance_warn: int = 60          # >= this: Telegram warning, else reject
    pipeline_max_drafts_per_day: int = 2
    pipeline_publish_hours: str = "9,14"       # comma-separated preferred publish hours (CET)
    pipeline_min_words_raw: int = 120          # minimum words in raw content before rewrite (else reject)
    pipeline_min_words_rewritten: int = 150    # minimum words in rewritten content (else reject)
    pipeline_max_article_age_days: int = 7     # skip articles older than N days during ingestion (0 = no limit)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Load the available dotenv files and return the cached ``Settings`` instance.

    The result is memoised, so the dotenv files are read only on the first call.
    """
    # Candidate files in priority order: the shared legacy env from the
    # original rss-news workspace first, then the backend file, then the
    # root file. With override=False, variables that are already set are not
    # replaced by files loaded later.
    dotenv_files = [
        Path("/Users/oliver/Documents/rss-news/.env"),
        Path("backend/.env"),
        Path(".env"),
    ]
    for dotenv_file in dotenv_files:
        if not dotenv_file.exists():
            continue
        load_dotenv(dotenv_file, override=False)
    return Settings()
|
||||
293
backend/app/db.py
Normal file
293
backend/app/db.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _db_path() -> Path:
    """Return the configured SQLite file path, creating its parent directory if needed."""
    db_file = Path(get_settings().app_db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
|
||||
|
||||
|
||||
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a SQLite connection that commits on success and always closes.

    Rows are returned as ``sqlite3.Row`` and foreign-key enforcement is
    switched on for the connection. If the ``with`` body raises, the commit
    is skipped and the connection is closed without committing.

    Yields:
        An open ``sqlite3.Connection`` to the configured database file.
    """
    conn = sqlite3.connect(_db_path())
    try:
        # Per-connection setup lives inside the try block so the connection
        # is still closed if the first statement fails (for example when the
        # file is not a valid SQLite database).
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON;")
        yield conn
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """Create the SQLite schema and upgrade older database files in place.

    The work is done in four idempotent steps on one connection:

    1. Create all tables (``IF NOT EXISTS``).
    2. Add any ``articles`` columns that an older database is missing.
    3. Rebuild ``articles`` when its ``status`` CHECK constraint does not yet
       allow ``'no_image'`` (SQLite cannot alter a CHECK constraint).
    4. Create all indexes and triggers (``IF NOT EXISTS``).

    Indexes are deliberately created *after* the column migration: creating
    an index on ``articles(source_hash)`` against an old table that lacks the
    column fails with "no such column" and would prevent the migration from
    ever running.

    Raises:
        sqlite3.Error: if any DDL statement fails. A failed table rebuild is
            rolled back first, so the original ``articles`` table is kept.
    """
    with get_conn() as conn:
        # Step 1: tables only. publish_jobs may reference articles before it
        # exists; SQLite resolves foreign keys lazily.
        conn.executescript(
            """
            PRAGMA journal_mode=WAL;

            CREATE TABLE IF NOT EXISTS sources (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                base_url TEXT,
                terms_url TEXT,
                license_name TEXT,
                risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
                is_enabled INTEGER NOT NULL DEFAULT 0,
                notes TEXT,
                last_reviewed_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now'))
            );

            CREATE TABLE IF NOT EXISTS feeds (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_id INTEGER,
                name TEXT NOT NULL,
                url TEXT NOT NULL UNIQUE,
                is_enabled INTEGER NOT NULL DEFAULT 1,
                etag TEXT,
                last_modified TEXT,
                last_checked_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
            );

            CREATE TABLE IF NOT EXISTS runs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                run_type TEXT NOT NULL,
                status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
                started_at TEXT NOT NULL DEFAULT (datetime('now')),
                finished_at TEXT,
                details TEXT
            );

            CREATE TABLE IF NOT EXISTS publish_jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                article_id INTEGER NOT NULL,
                status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
                attempts INTEGER NOT NULL DEFAULT 0,
                max_attempts INTEGER NOT NULL DEFAULT 3,
                error_message TEXT,
                wp_post_id INTEGER,
                wp_post_url TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                started_at TEXT,
                finished_at TEXT,
                FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
            );

            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                feed_id INTEGER,
                source_article_id TEXT,
                source_hash TEXT,
                title TEXT NOT NULL,
                source_url TEXT NOT NULL,
                canonical_url TEXT,
                published_at TEXT,
                author TEXT,
                summary TEXT,
                content_raw TEXT,
                content_rewritten TEXT,
                image_urls_json TEXT,
                press_contact TEXT,
                source_name_snapshot TEXT,
                source_terms_url_snapshot TEXT,
                source_license_name_snapshot TEXT,
                legal_checked INTEGER NOT NULL DEFAULT 0,
                legal_checked_at TEXT,
                legal_note TEXT,
                wp_post_id INTEGER,
                wp_post_url TEXT,
                publish_attempts INTEGER NOT NULL DEFAULT 0,
                publish_last_error TEXT,
                published_to_wp_at TEXT,
                word_count INTEGER DEFAULT 0,
                status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
                meta_json TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
                UNIQUE(source_url)
            );
            """
        )

        # Step 2: lightweight column migration for databases created before
        # these columns were introduced. On a fresh database this adds
        # relevance_score and scheduled_publish_at, which the base CREATE
        # TABLE above does not declare.
        existing_columns = {
            row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
        }
        migration_columns = {
            "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER",
            "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT",
            "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
            "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
            "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
            "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
            "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
            "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
            "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
            "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
            "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
            "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER",
            "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT",
            "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0",
            "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT",
            "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT",
        }
        for column, ddl in migration_columns.items():
            if column not in existing_columns:
                conn.execute(ddl)

        # Step 3: add 'no_image' to the status CHECK constraint if not present.
        # SQLite cannot modify CHECK constraints in-place, so the table is
        # recreated and the rows copied over.
        table_sql_row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
        ).fetchone()
        if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
            try:
                # foreign_keys must be toggled outside the transaction (the
                # pragma is a no-op inside one). The rebuild itself runs in a
                # single transaction so a failure cannot leave the database
                # without an articles table. A stale articles_v2 from an
                # earlier interrupted attempt is discarded first.
                conn.executescript(
                    """
                    PRAGMA foreign_keys=OFF;

                    BEGIN;

                    DROP TABLE IF EXISTS articles_v2;

                    CREATE TABLE articles_v2 (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        feed_id INTEGER,
                        source_article_id TEXT,
                        source_hash TEXT,
                        title TEXT NOT NULL,
                        source_url TEXT NOT NULL,
                        canonical_url TEXT,
                        published_at TEXT,
                        author TEXT,
                        summary TEXT,
                        content_raw TEXT,
                        content_rewritten TEXT,
                        image_urls_json TEXT,
                        press_contact TEXT,
                        source_name_snapshot TEXT,
                        source_terms_url_snapshot TEXT,
                        source_license_name_snapshot TEXT,
                        legal_checked INTEGER NOT NULL DEFAULT 0,
                        legal_checked_at TEXT,
                        legal_note TEXT,
                        wp_post_id INTEGER,
                        wp_post_url TEXT,
                        publish_attempts INTEGER NOT NULL DEFAULT 0,
                        publish_last_error TEXT,
                        published_to_wp_at TEXT,
                        word_count INTEGER DEFAULT 0,
                        status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
                        meta_json TEXT,
                        relevance_score INTEGER,
                        scheduled_publish_at TEXT,
                        created_at TEXT NOT NULL DEFAULT (datetime('now')),
                        updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                        FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
                        UNIQUE(source_url)
                    );

                    INSERT INTO articles_v2 SELECT
                        id, feed_id, source_article_id, source_hash, title, source_url,
                        canonical_url, published_at, author, summary, content_raw,
                        content_rewritten, image_urls_json, press_contact,
                        source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
                        legal_checked, legal_checked_at, legal_note,
                        wp_post_id, wp_post_url, publish_attempts, publish_last_error,
                        published_to_wp_at, word_count, status, meta_json,
                        relevance_score, scheduled_publish_at, created_at, updated_at
                    FROM articles;

                    DROP TABLE articles;
                    ALTER TABLE articles_v2 RENAME TO articles;

                    COMMIT;

                    PRAGMA foreign_keys=ON;
                    """
                )
            except sqlite3.Error:
                # Undo a half-finished rebuild and restore FK enforcement
                # before the error propagates.
                conn.rollback()
                conn.execute("PRAGMA foreign_keys=ON;")
                raise

        # Step 4: indexes and triggers. Running this last covers fresh
        # databases, migrated ones, and a just-rebuilt articles table (whose
        # indexes and trigger were dropped together with the old table).
        conn.executescript(
            """
            CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
            CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
            CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
                ON articles(feed_id, source_article_id)
                WHERE source_article_id IS NOT NULL;
            CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
                ON articles(source_hash)
                WHERE source_hash IS NOT NULL;
            CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
            CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
            CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
            CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
            CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);

            CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
            AFTER UPDATE ON sources
            FOR EACH ROW
            BEGIN
                UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
            END;

            CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
            AFTER UPDATE ON feeds
            FOR EACH ROW
            BEGIN
                UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
            END;

            CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
            AFTER UPDATE ON articles
            FOR EACH ROW
            BEGIN
                UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
            END;
            """
        )
|
||||
|
||||
|
||||
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert each row into a plain ``dict`` keyed by column name."""
    return list(map(dict, rows))
|
||||
486
backend/app/ingestion.py
Normal file
486
backend/app/ingestion.py
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
from urllib.parse import unquote, urlencode, urlparse, parse_qs
|
||||
import urllib.error
|
||||
import urllib.request as _urllib_req
|
||||
|
||||
import feedparser
|
||||
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
RunCreate,
|
||||
create_run,
|
||||
find_existing_article_for_upsert,
|
||||
finish_run,
|
||||
get_feed_by_id,
|
||||
list_enabled_feeds,
|
||||
update_feed_fetch_state,
|
||||
upsert_article,
|
||||
)
|
||||
from .source_extraction import extract_article, extracted_article_to_meta
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IngestionStats:
    """Immutable summary returned by ``run_ingestion``."""

    # Id of the "ingestion" run record created at the start of the run.
    run_id: int
    # Number of feeds the run attempted, including feeds whose fetch failed.
    feeds_processed: int
    # Number of feed entries looked at, including entries that were skipped.
    entries_seen: int
    # Number of upsert calls that returned an article id.
    articles_upserted: int
    # "success" or "failed".
    status: str
    # Completion text on success, the exception text on failure.
    message: str
|
||||
|
||||
|
||||
# Maximum number of feedparser.parse attempts per feed in run_ingestion.
MAX_FEED_FETCH_RETRIES = 3
|
||||
|
||||
|
||||
def _normalize_article_url(url: str) -> str:
|
||||
"""Strip AMP and tracking query parameters from article URLs.
|
||||
|
||||
Removes ?outputType=valid_amp and other AMP/tracking params so that
|
||||
AMP and non-AMP versions of the same article are deduplicated.
|
||||
"""
|
||||
_AMP_PARAMS = {"outputtype", "amp", "outputformat"}
|
||||
try:
|
||||
from urllib.parse import parse_qs, urlencode
|
||||
parsed = urlparse(url)
|
||||
if not parsed.query:
|
||||
return url
|
||||
params = parse_qs(parsed.query, keep_blank_values=True)
|
||||
filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
|
||||
new_query = urlencode(filtered, doseq=True)
|
||||
return parsed._replace(query=new_query).geturl()
|
||||
except Exception:
|
||||
return url
|
||||
|
||||
|
||||
def _resolve_google_redirect(url: str) -> str:
|
||||
"""Extract the real article URL from Google redirect URLs.
|
||||
|
||||
Google Alerts feed entries use tracking links like:
|
||||
https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
|
||||
|
||||
This function returns the decoded real URL if detected, otherwise the
|
||||
original URL unchanged.
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
host = (parsed.hostname or "").lower()
|
||||
if host not in ("www.google.com", "google.com"):
|
||||
return url
|
||||
if parsed.path not in ("/url", "/url/"):
|
||||
return url
|
||||
params = parse_qs(parsed.query, keep_blank_values=False)
|
||||
real_urls = params.get("url")
|
||||
if real_urls:
|
||||
return unquote(real_urls[0])
|
||||
except Exception:
|
||||
pass
|
||||
return url
|
||||
|
||||
|
||||
def _entry_published_iso(entry: dict) -> str | None:
|
||||
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
if not published:
|
||||
return None
|
||||
return datetime(*published[:6], tzinfo=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _entry_text(entry: dict) -> tuple[str, str]:
|
||||
summary = entry.get("summary", "") or ""
|
||||
content = ""
|
||||
if entry.get("content") and isinstance(entry.get("content"), list):
|
||||
first = entry["content"][0]
|
||||
content = first.get("value", "") if isinstance(first, dict) else ""
|
||||
if not content:
|
||||
content = summary
|
||||
return summary, content
|
||||
|
||||
|
||||
def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
    """Return a SHA-256 hex digest that fingerprints a feed entry.

    The fingerprint joins, with ``|``: the feed id, the entry's ``id`` (or
    ``guid``), the link, the stripped title, the stripped summary and the
    entry's publish timestamp. Missing id and timestamp contribute an empty
    string.
    """
    parts = (
        feed_id,
        entry.get("id") or entry.get("guid") or "",
        link,
        title.strip(),
        summary.strip(),
        _entry_published_iso(entry) or "",
    )
    fingerprint = "|".join(f"{part}" for part in parts)
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _parsed_get(parsed: object, key: str, default: object = None) -> object:
|
||||
if isinstance(parsed, dict):
|
||||
return parsed.get(key, default)
|
||||
return getattr(parsed, key, default)
|
||||
|
||||
|
||||
def _normalize_tokens(text: str) -> set[str]:
|
||||
normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
|
||||
return {token for token in normalized.split() if len(token) >= 4}
|
||||
|
||||
|
||||
def _probe_image_url(url: str, timeout: int = 5) -> bool:
|
||||
"""Return True if URL responds without a 4xx/5xx error (HEAD request).
|
||||
|
||||
Returns True on network/connection errors so that a flaky server does not
|
||||
cause a valid image to be silently dropped.
|
||||
"""
|
||||
try:
|
||||
req = _urllib_req.Request(
|
||||
url,
|
||||
method="HEAD",
|
||||
headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
|
||||
)
|
||||
with _urllib_req.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status < 400
|
||||
except urllib.error.HTTPError as exc:
|
||||
return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
|
||||
except Exception:
|
||||
return True # network error → don't filter, let WP try later
|
||||
|
||||
|
||||
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
    """Score image URLs for use as an article image, best first.

    Each result is ``{"url", "score", "reasons"}``. ``data:`` URIs are left
    out entirely. Scoring:

    * -300 if host+path contains a known placeholder name,
    * -150 if host+path contains a blocked pattern (logos, icons, ...),
    * for presseportal.de sources, the first matching thumbnail folder:
      ``story_big`` +120, ``highlight`` +45, ``liste`` -40,
    * -10 if the query string contains ``crop=``,
    * +6 per title token found in the path, capped at +30.

    Candidates with equal scores keep their input order.
    """
    blocked_markers = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
    # Known placeholder/default images that should never be used as featured image
    placeholder_markers = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
    presseportal_rules = (
        ("/thumbnail/story_big/", 120, "presseportal-story-big"),
        ("/thumbnail/highlight/", 45, "presseportal-highlight"),
        ("/thumbnail/liste/", -40, "presseportal-list"),
    )

    article_host = (urlparse(source_url).hostname or "").lower()
    from_presseportal = "presseportal.de" in article_host
    wanted_tokens = _normalize_tokens(title)

    def _score(image_url: str) -> dict[str, Any]:
        parts = urlparse(image_url)
        decoded_path = unquote(parts.path.lower())
        host_and_path = f"{parts.netloc.lower()}{decoded_path}"
        points = 0
        why: list[str] = []

        if any(marker in host_and_path for marker in placeholder_markers):
            points -= 300
            why.append("placeholder-image")

        if any(marker in host_and_path for marker in blocked_markers):
            points -= 150
            why.append("blocked-pattern")

        if from_presseportal:
            # Only the first matching thumbnail folder counts.
            for folder, delta, label in presseportal_rules:
                if folder in decoded_path:
                    points += delta
                    why.append(label)
                    break

        if "crop=" in (parts.query or "").lower():
            points -= 10
            why.append("cropped-preview")

        shared = len(wanted_tokens & _normalize_tokens(decoded_path.replace("-", " ")))
        if shared > 0:
            points += min(30, shared * 6)
            why.append(f"title-match:{shared}")

        return {"url": image_url, "score": points, "reasons": why}

    # Skip inline data: URIs (e.g. base64-encoded SVG placeholders)
    scored = [_score(candidate) for candidate in images if not candidate.startswith("data:")]
    return sorted(scored, key=lambda item: item["score"], reverse=True)
|
||||
|
||||
|
||||
def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
    """Pick the images to keep for an article.

    Returns ``(kept, primary, ranked)``: the URLs to store, the preferred
    one among them (``None`` when nothing is usable) and the full ranking
    produced by ``_rank_image_candidates``.

    Empty values and duplicates are removed first (first occurrence wins).
    Only candidates scoring above -100 are considered, and at most the four
    best of those are probed; probing stops once ``max_keep`` URLs passed.
    """
    unique_images = list(dict.fromkeys(image for image in images if image))

    ranked = _rank_image_candidates(source_url, title, unique_images)
    usable = [item["url"] for item in ranked if item["score"] > -100]

    kept: list[str] = []
    for candidate in usable[:4]:
        if not _probe_image_url(candidate):
            continue
        kept.append(candidate)
        if len(kept) >= max_keep:
            break

    if kept:
        primary = kept[0]
    elif usable:
        # Fallback: no probed candidate passed, so use the best-ranked ones
        # anyway.
        # NOTE(review): _probe_image_url returns True on network errors, so
        # this branch is reached only when every probed URL answered with an
        # HTTP error status; confirm that keeping them is intended.
        primary = usable[0]
        kept = usable[:max_keep]
    else:
        primary = None

    return kept, primary, ranked
|
||||
|
||||
|
||||
def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str:
|
||||
meta: dict[str, Any] = {}
|
||||
if existing_meta_json:
|
||||
try:
|
||||
parsed = json.loads(existing_meta_json)
|
||||
if isinstance(parsed, dict):
|
||||
meta = parsed
|
||||
except Exception:
|
||||
meta = {}
|
||||
meta["attribution"] = attribution
|
||||
meta["extraction"] = extraction_meta
|
||||
return json.dumps(meta, ensure_ascii=False)
|
||||
|
||||
|
||||
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch enabled feeds and upsert their entries as articles.

    A run record of type ``"ingestion"`` is created first and finished with
    a JSON summary (or the error text) at the end.

    Args:
        feed_id: Restrict the run to this feed. The feed is processed only
            if it exists and is enabled. ``None`` processes every enabled
            feed.

    Returns:
        An ``IngestionStats`` with the counters collected so far. Errors
        raised after the run record was created do not propagate: they are
        reported as ``status="failed"`` with the exception text as message.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    feed_results: list[dict[str, object]] = []

    try:
        # Read once per run instead of once per feed. The import stays local
        # to the function, as in the original code.
        from .config import get_settings as _get_settings

        max_age_days = _get_settings().pipeline_max_article_age_days

        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1

            # Fetch with a short linear back-off between attempts.
            parsed = None
            feed_error = None
            for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
                try:
                    parsed = feedparser.parse(
                        feed["url"],
                        etag=feed.get("etag"),
                        modified=feed.get("last_modified"),
                    )
                    break
                except Exception as exc:
                    feed_error = str(exc)
                    if attempt < MAX_FEED_FETCH_RETRIES:
                        time.sleep(0.5 * attempt)

            if parsed is None:
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            update_feed_fetch_state(
                feed_id=int(feed["id"]),
                etag=parsed_etag if isinstance(parsed_etag, str) else None,
                last_modified=parsed_modified if isinstance(parsed_modified, str) else None,
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    continue

                published_iso = _entry_published_iso(entry)

                # Age filter: skip articles older than max_age_days (0 = no limit).
                # Entries without a date are always let through.
                if max_age_days > 0 and published_iso:
                    try:
                        published_dt = datetime.fromisoformat(published_iso)
                        age = datetime.now(timezone.utc) - published_dt
                        if age > timedelta(days=max_age_days):
                            continue
                    except (TypeError, ValueError):
                        pass  # can't parse date -> allow through

                # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
                link = _resolve_google_redirect(link)
                # Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
                link = _normalize_article_url(link)

                summary, content_raw = _entry_text(entry)
                # Strip HTML tags from title (Google Alerts wraps matched keywords in <b>)
                raw_title = entry.get("title") or "Ohne Titel"
                title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel"
                extracted = extract_article(link)

                # Values extracted from the article page win over feed values.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                # Fall back to the resolved, normalised link. The raw entry
                # link may still be a Google tracking redirect.
                final_canonical = extracted.canonical_url or link
                selected_images, primary_image, ranked_images = _select_relevant_images(
                    link,
                    final_title,
                    extracted.images,
                    max_keep=3,
                )

                source_hash = _entry_hash(
                    entry,
                    int(feed["id"]),
                    link,
                    final_title,
                    final_summary or "",
                )
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": int(feed["id"]),
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                extraction_meta["image_selection"] = {
                    "primary": primary_image,
                    "selected_count": len(selected_images),
                    "total_candidates": len(extracted.images),
                    "ranked": ranked_images,
                }
                base_payload = ArticleUpsert(
                    feed_id=int(feed["id"]),
                    source_article_id=entry.get("id") or entry.get("guid"),
                    source_hash=source_hash,
                    title=final_title,
                    source_url=link,
                    canonical_url=final_canonical,
                    published_at=published_iso,
                    author=final_author,
                    summary=final_summary,
                    content_raw=final_content_raw,
                    content_rewritten=None,
                    image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
                    press_contact=extracted.press_contact,
                    source_name_snapshot=feed.get("source_name"),
                    source_terms_url_snapshot=feed.get("source_terms_url"),
                    source_license_name_snapshot=feed.get("source_license_name"),
                    legal_checked=False,
                    legal_checked_at=None,
                    legal_note=None,
                    wp_post_id=None,
                    wp_post_url=None,
                    publish_attempts=0,
                    publish_last_error=None,
                    published_to_wp_at=None,
                    word_count=len((final_content_raw or "").split()),
                    status="new",
                    meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
                )
                existing = find_existing_article_for_upsert(base_payload)
                if existing and existing.get("status") == "error":
                    # Explicitly closed article: ignore on subsequent ingestion runs.
                    continue

                payload = base_payload
                if existing:
                    # Refresh the source-derived fields but keep everything
                    # that later pipeline stages wrote: rewrite, legal review,
                    # publishing state and workflow status.
                    payload = ArticleUpsert(
                        feed_id=base_payload.feed_id,
                        source_article_id=base_payload.source_article_id,
                        source_hash=base_payload.source_hash,
                        title=base_payload.title,
                        source_url=base_payload.source_url,
                        canonical_url=base_payload.canonical_url,
                        published_at=base_payload.published_at,
                        author=base_payload.author,
                        summary=base_payload.summary,
                        content_raw=base_payload.content_raw,
                        content_rewritten=existing.get("content_rewritten"),
                        image_urls_json=base_payload.image_urls_json,
                        press_contact=base_payload.press_contact or existing.get("press_contact"),
                        source_name_snapshot=base_payload.source_name_snapshot,
                        source_terms_url_snapshot=base_payload.source_terms_url_snapshot,
                        source_license_name_snapshot=base_payload.source_license_name_snapshot,
                        legal_checked=bool(int(existing.get("legal_checked", 0))),
                        legal_checked_at=existing.get("legal_checked_at"),
                        legal_note=existing.get("legal_note"),
                        wp_post_id=existing.get("wp_post_id"),
                        wp_post_url=existing.get("wp_post_url"),
                        publish_attempts=int(existing.get("publish_attempts", 0)),
                        publish_last_error=existing.get("publish_last_error"),
                        published_to_wp_at=existing.get("published_to_wp_at"),
                        word_count=base_payload.word_count,
                        status=existing.get("status") or "new",
                        meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta),
                    )

                article_id = upsert_article(payload)
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": int(feed["id"]),
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Top-level boundary: record the failure on the run and report it to
        # the caller instead of raising.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
|
||||
727
backend/app/main.py
Normal file
727
backend/app/main.py
Normal file
|
|
@ -0,0 +1,727 @@
|
|||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
import csv
|
||||
from datetime import datetime, timezone
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from .admin_ui import router as admin_router
|
||||
from .auth import create_session_token, verify_credentials, verify_session_token
|
||||
from .config import get_settings
|
||||
from .db import init_db
|
||||
from .ingestion import run_ingestion
|
||||
from .pipeline import run_auto_pipeline
|
||||
from .policy import evaluate_source_policy, is_source_allowed
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .telegram_bot import handle_update, setup_webhook
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
RunCreate,
|
||||
SourceCreate,
|
||||
create_feed as repo_create_feed,
|
||||
create_run,
|
||||
create_source as repo_create_source,
|
||||
finish_run,
|
||||
get_article_by_id,
|
||||
get_feed_by_id,
|
||||
get_run_by_id,
|
||||
get_source_by_id,
|
||||
list_publish_jobs,
|
||||
list_articles as repo_list_articles,
|
||||
list_feeds as repo_list_feeds,
|
||||
list_runs,
|
||||
list_sources as repo_list_sources,
|
||||
set_article_legal_review,
|
||||
update_article_status,
|
||||
upsert_article as repo_upsert_article,
|
||||
)
|
||||
from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
|
||||
|
||||
# Application settings, resolved once when this module is imported.
settings = get_settings()
|
||||
|
||||
|
||||
@asynccontextmanager
async def app_lifespan(_: FastAPI):
    """Lifespan hook: run ``init_db()`` before the application starts serving.

    Nothing is executed after the ``yield``, so there is no shutdown work.
    """
    init_db()
    yield
|
||||
|
||||
|
||||
# Application instance; app_lifespan is registered as its lifespan handler.
app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
app.include_router(admin_router)
# Serve the "static" directory located one level above this module's
# directory under /admin/static.
app.mount(
    "/admin/static",
    StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
    name="admin-static",
)
|
||||
|
||||
|
||||
# Request body carrying a username/password pair.
class LoginRequest(BaseModel):
    username: str
    password: str
|
||||
|
||||
|
||||
# Request body for creating a source.
class SourceCreateRequest(BaseModel):
    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    terms_url: str | None = None
    license_name: str | None = None
    # Restricted to green / yellow / red; defaults to yellow.
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # New sources are disabled unless the request says otherwise.
    is_enabled: bool = False
    notes: str | None = None
    last_reviewed_at: str | None = None
|
||||
|
||||
|
||||
# Request body for creating a feed.
class FeedCreateRequest(BaseModel):
    name: str = Field(min_length=1, max_length=200)
    url: str = Field(min_length=5, max_length=1000)
    # Optional; a feed may be created without a source.
    source_id: int | None = None
    # New feeds are enabled unless the request says otherwise.
    is_enabled: bool = True
|
||||
|
||||
|
||||
class RunCreateRequest(BaseModel):
    """Request body of ``POST /api/runs``."""

    run_type: str = Field(min_length=2, max_length=100)
    # One of: queued, running, success, failed.
    status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class RunFinishRequest(BaseModel):
    """Request body of ``POST /api/runs/{run_id}/finish``."""

    # A run can only be finished as "success" or "failed".
    status: str = Field(pattern="^(success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class ArticleUpsertRequest(BaseModel):
    """Request body of ``POST /api/articles/upsert``.

    Every field is forwarded to ``ArticleUpsert``; ``status`` is first passed
    through ``ui_to_internal_status`` by the endpoint.
    """

    # --- identity / origin -------------------------------------------------
    feed_id: int | None = None
    source_article_id: str | None = None
    source_hash: str | None = None
    title: str = Field(min_length=1, max_length=500)
    source_url: str = Field(min_length=5, max_length=2000)
    canonical_url: str | None = None
    published_at: str | None = None
    author: str | None = None

    # --- content -----------------------------------------------------------
    summary: str | None = None
    content_raw: str | None = None
    content_rewritten: str | None = None
    image_urls_json: str | None = None
    press_contact: str | None = None

    # --- source snapshot ---------------------------------------------------
    source_name_snapshot: str | None = None
    source_terms_url_snapshot: str | None = None
    source_license_name_snapshot: str | None = None

    # --- legal review ------------------------------------------------------
    legal_checked: bool = False
    legal_checked_at: str | None = None
    legal_note: str | None = None

    # --- WordPress publishing ----------------------------------------------
    wp_post_id: int | None = None
    wp_post_url: str | None = None
    publish_attempts: int = 0
    publish_last_error: str | None = None
    published_to_wp_at: str | None = None

    # --- bookkeeping -------------------------------------------------------
    word_count: int = 0
    status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    meta_json: str | None = None
|
||||
|
||||
|
||||
class IngestionRunRequest(BaseModel):
    """Request body of ``POST /api/ingestion/run``."""

    # Passed straight to ``run_ingestion(feed_id=...)``; ``None`` is the default.
    feed_id: int | None = None
|
||||
|
||||
|
||||
class ArticleTransitionRequest(BaseModel):
    """Request body of ``POST /api/articles/{article_id}/transition``."""

    # Requested status; the endpoint maps it via ``ui_to_internal_status``.
    target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    note: str | None = None
|
||||
|
||||
|
||||
class ArticleReviewRequest(BaseModel):
    """Request body of the retired ``POST /api/articles/{article_id}/review`` endpoint."""

    # Either "approve" or "reject".
    decision: str = Field(pattern="^(approve|reject)$")
    note: str | None = None
|
||||
|
||||
|
||||
class ArticleLegalReviewRequest(BaseModel):
    """Request body of ``POST /api/articles/{article_id}/legal-review``."""

    # Required: the legal-review verdict to record.
    approved: bool
    note: str | None = None
|
||||
|
||||
|
||||
class PublisherEnqueueRequest(BaseModel):
    """Request body of ``POST /api/publisher/enqueue``."""

    article_id: int
    # A job needs at least one attempt; zero or negative values are rejected
    # at validation time instead of being handed to ``enqueue_publish``.
    max_attempts: int = Field(default=3, ge=1)
|
||||
|
||||
|
||||
class PublisherRunRequest(BaseModel):
    """Request body of ``POST /api/publisher/run``."""

    # Upper bound on jobs handled in one run; must be positive so a zero or
    # negative value never reaches ``run_publisher``.
    max_jobs: int = Field(default=10, ge=1)
|
||||
|
||||
|
||||
# Internal status graph: current status -> statuses it may move to.
# NOTE(review): the transition endpoint visible in this file validates against
# ALLOWED_UI_TRANSITIONS instead; confirm whether this table is still used.
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = dict(
    new={"rewrite", "error"},
    rewrite={"approved", "error"},
    approved={"published", "error"},
    published={"error"},
    error={"rewrite"},
)
|
||||
|
||||
|
||||
def require_auth(request: Request) -> str:
    """Resolve the logged-in username from the session cookie.

    Raises:
        HTTPException: 401 when the cookie is missing or when
            ``verify_session_token`` does not yield a username.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    if not session_token:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")

    user = verify_session_token(session_token)
    if user:
        return user

    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Liveness probe; reports the service name and the configured DB path."""
    # NOTE(review): this route has no auth dependency, so db_path is visible
    # to anyone who can reach the service — confirm that is intended.
    report = {
        "status": "ok",
        "service": settings.app_name,
        "db_path": settings.app_db_path,
    }
    return report
|
||||
|
||||
|
||||
@app.post("/auth/login")
def login(payload: LoginRequest, response: Response) -> dict:
    """Check the credentials and, on success, set the session cookie.

    Raises:
        HTTPException: 401 when ``verify_credentials`` rejects the pair.
    """
    credentials_ok = verify_credentials(payload.username, payload.password)
    if not credentials_ok:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")

    session_token = create_session_token(payload.username)
    # NOTE(review): secure=False lets the cookie travel over plain HTTP —
    # confirm this is acceptable for the production deployment.
    cookie_options = {
        "key": settings.session_cookie_name,
        "value": session_token,
        "max_age": settings.session_max_age_seconds,
        "httponly": True,
        "secure": False,
        "samesite": "lax",
    }
    response.set_cookie(**cookie_options)
    return {"ok": True, "username": payload.username}
|
||||
|
||||
|
||||
@app.post("/auth/logout")
def logout(response: Response) -> dict:
    """Clear the session cookie; no authentication is required to call this."""
    cookie_name = settings.session_cookie_name
    response.delete_cookie(cookie_name)
    return {"ok": True}
|
||||
|
||||
|
||||
@app.get("/auth/me")
def me(username: str = Depends(require_auth)) -> dict:
    """Report the username of the authenticated session."""
    return dict(authenticated=True, username=username)
|
||||
|
||||
|
||||
@app.get("/api/protected")
def protected(username: str = Depends(require_auth)) -> dict:
    """Sample endpoint that only answers for an authenticated session."""
    return dict(ok=True, message="Protected endpoint", username=username)
|
||||
|
||||
|
||||
@app.get("/api/pipeline/status")
def pipeline_status(username: str = Depends(require_auth)) -> dict:
    """Return record counts for sources, feeds and articles.

    The article count comes from a listing limited to 500 rows, so it never
    exceeds 500.
    """
    feed_count = len(repo_list_feeds())
    source_count = len(repo_list_sources())
    article_count = len(repo_list_articles(limit=500))

    counts = {"sources": source_count, "feeds": feed_count, "articles": article_count}
    return {
        "ok": True,
        "stage": "skeleton+db",
        "requested_by": username,
        "counts": counts,
    }
|
||||
|
||||
|
||||
@app.get("/api/sources")
def list_sources(username: str = Depends(require_auth)) -> dict:
    """List all sources returned by the repository layer."""
    sources = repo_list_sources()
    return {"ok": True, "items": sources, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/sources/{source_id}/policy-check")
def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the policy for one source.

    Raises:
        HTTPException: 404 when the source does not exist.
    """
    record = get_source_by_id(source_id)
    if not record:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")

    policy_issues = evaluate_source_policy(record)
    verdict = is_source_allowed(record)
    return {
        "ok": True,
        "source_id": source_id,
        "allowed": verdict,
        "issues": policy_issues,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/sources")
def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a source from the request body and return its new id."""
    copied_fields = (
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "is_enabled",
        "notes",
        "last_reviewed_at",
    )
    new_source = SourceCreate(**{field_name: getattr(payload, field_name) for field_name in copied_fields})
    source_id = repo_create_source(new_source)
    return {"ok": True, "id": source_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds")
def list_feeds(username: str = Depends(require_auth)) -> dict:
    """List all feeds returned by the repository layer."""
    feeds = repo_list_feeds()
    return {"ok": True, "items": feeds, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds/{feed_id}/policy-check")
def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for the source attached to a feed.

    The feed row carries its source columns under a ``source_`` prefix; they
    are re-keyed into the shape ``evaluate_source_policy`` receives. The feed
    counts as allowed when that evaluation reports no issues.

    Raises:
        HTTPException: 404 when the feed does not exist.
    """
    feed = get_feed_by_id(feed_id)
    if not feed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")

    source_keys = (
        "id",
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "last_reviewed_at",
        "is_enabled",
    )
    # "id" <- "source_id", "name" <- "source_name", and so on.
    source_snapshot = {key: feed.get(f"source_{key}") for key in source_keys}

    issues = evaluate_source_policy(source_snapshot)
    return {
        "ok": True,
        "feed_id": feed_id,
        "allowed": len(issues) == 0,
        "issues": issues,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/feeds")
def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a feed from the request body and return its new id.

    Raises:
        HTTPException: 400 carrying the underlying error text when the
            repository layer raises while inserting.
    """
    new_feed = FeedCreate(
        name=payload.name,
        url=payload.url,
        source_id=payload.source_id,
        is_enabled=payload.is_enabled,
    )
    try:
        feed_id = repo_create_feed(new_feed)
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc

    return {"ok": True, "id": feed_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs")
def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
    """List runs, passing ``limit`` through to the repository layer."""
    runs = list_runs(limit=limit)
    return {"ok": True, "items": runs, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs/{run_id}")
def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
    """Fetch a single run.

    Raises:
        HTTPException: 404 when the run does not exist.
    """
    record = get_run_by_id(run_id)
    if record:
        return {"ok": True, "item": record, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
|
||||
|
||||
|
||||
@app.post("/api/runs")
def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a run record from the request body and return its id."""
    new_run = RunCreate(
        run_type=payload.run_type,
        status=payload.status,
        details=payload.details,
    )
    run_id = create_run(new_run)
    return {"ok": True, "id": run_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given status and details.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists.
    """
    # Without this check the endpoint reported success for unknown run ids;
    # the 404 mirrors what ``GET /api/runs/{run_id}`` returns.
    if not get_run_by_id(run_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")

    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    return {"ok": True, "id": run_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/articles")
def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
    """List articles, optionally filtered by status.

    ``status_filter`` is translated with ``ui_to_internal_status`` before the
    query; each returned item gets an extra ``status_ui`` key.
    """
    internal_filter = None
    if status_filter:
        internal_filter = ui_to_internal_status(status_filter)

    articles = repo_list_articles(limit=limit, status_filter=internal_filter)
    for entry in articles:
        entry["status_ui"] = internal_to_ui_status(entry.get("status"))

    return {"ok": True, "items": articles, "requested_by": username}
|
||||
|
||||
|
||||
# Column order shared by the JSON rows and the CSV header, so the two
# representations cannot drift apart.
_ARTICLE_EXPORT_FIELDS = (
    "id",
    "title",
    "status",
    "published_at",
    "days_old",
    "relevance",
    "author",
    "source_url",
    "canonical_url",
    "source_name_snapshot",
    "source_license_name_snapshot",
    "source_terms_url_snapshot",
    "press_contact",
    "image_urls_json",
    "selected_image_url",
    "legal_checked",
    "legal_checked_at",
    "legal_note",
)


def _export_selected_image_url(meta_json) -> str | None:
    """Return ``meta["image_review"]["selected_url"]`` when it is a string, else ``None``."""
    if not meta_json:
        return None
    try:
        meta = json.loads(meta_json)
    except (TypeError, ValueError, RecursionError):
        # Unparseable metadata is treated the same as missing metadata.
        return None
    image_review = meta.get("image_review") if isinstance(meta, dict) else None
    if not isinstance(image_review, dict):
        return None
    selected = image_review.get("selected_url")
    return selected if isinstance(selected, str) else None


def _article_export_row(article: dict) -> dict:
    """Build one export row, keyed in ``_ARTICLE_EXPORT_FIELDS`` order."""
    published_at = article.get("published_at")
    derived = {
        "days_old": article_age_days(published_at),
        "relevance": article_relevance(published_at),
        "selected_image_url": _export_selected_image_url(article.get("meta_json")),
        # ``or 0`` also covers a stored None, which made int() raise before.
        "legal_checked": bool(int(article.get("legal_checked") or 0)),
    }
    return {
        name: derived[name] if name in derived else article.get(name)
        for name in _ARTICLE_EXPORT_FIELDS
    }


@app.get("/api/articles/export")
def api_export_articles(
    format: str = "json",
    status_filter: str | None = None,
    username: str = Depends(require_auth),
):
    """Export up to 500 articles as JSON (default) or as a CSV download.

    Args:
        format: ``"csv"`` selects the CSV attachment; any other value yields JSON.
        status_filter: Optional status, translated with ``ui_to_internal_status``.
        username: Authenticated user, echoed in the JSON response.
    """
    internal_filter = ui_to_internal_status(status_filter) if status_filter else None
    articles = repo_list_articles(limit=500, status_filter=internal_filter)
    rows = [_article_export_row(article) for article in articles]

    if format == "csv":
        out = io.StringIO()
        writer = csv.DictWriter(out, fieldnames=list(_ARTICLE_EXPORT_FIELDS))
        writer.writeheader()
        writer.writerows(rows)
        return Response(
            content=out.getvalue(),
            media_type="text/csv; charset=utf-8",
            headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'},
        )

    return JSONResponse(
        {
            "ok": True,
            "count": len(rows),
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "status_filter": status_filter,
            "items": rows,
            "requested_by": username,
        }
    )
|
||||
|
||||
|
||||
@app.get("/api/articles/{article_id}")
def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Fetch one article and attach its ``status_ui`` value.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    record = get_article_by_id(article_id)
    if not record:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    record["status_ui"] = internal_to_ui_status(record.get("status"))
    return {"ok": True, "item": record, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/articles/upsert")
def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
    """Insert or update an article from the request body and return its id.

    All request fields are copied as-is except ``status``, which is
    translated with ``ui_to_internal_status`` first.
    """
    copied_fields = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "content_rewritten",
        "image_urls_json",
        "press_contact",
        "source_name_snapshot",
        "source_terms_url_snapshot",
        "source_license_name_snapshot",
        "legal_checked",
        "legal_checked_at",
        "legal_note",
        "wp_post_id",
        "wp_post_url",
        "publish_attempts",
        "publish_last_error",
        "published_to_wp_at",
        "word_count",
        "meta_json",
    )
    values = {field_name: getattr(payload, field_name) for field_name in copied_fields}
    values["status"] = ui_to_internal_status(payload.status)

    article_id = repo_upsert_article(ArticleUpsert(**values))
    return {"ok": True, "id": article_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/transition")
def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
    """Move an article to another status if the UI transition table allows it.

    Raises:
        HTTPException: 404 when the article is missing (on lookup or on
            update), 400 when the transition is not in
            ``ALLOWED_UI_TRANSITIONS``.
    """
    record = get_article_by_id(article_id)
    if not record:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    # Both ends are compared in their UI representation.
    current_ui = internal_to_ui_status(record.get("status"))
    target_internal = ui_to_internal_status(payload.target_status)
    target_ui = internal_to_ui_status(target_internal)

    if target_ui not in ALLOWED_UI_TRANSITIONS.get(current_ui, set()):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Ungueltiger Statuswechsel: {current_ui} -> {target_ui}",
        )

    if not update_article_status(article_id, target_internal, actor=username, note=payload.note):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    return {"ok": True, "id": article_id, "from_status": current_ui, "to_status": target_ui}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/rewrite-run")
def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Rewrite an article, generate tags and store it with status ``approved``.

    Tag generation is best-effort: if it fails, the failure is logged and the
    article is saved without new tags.

    Raises:
        HTTPException: 404 when the article does not exist, 400 when its UI
            status is neither ``new`` nor ``rewrite``.
    """
    article = get_article_by_id(article_id)
    if not article:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
    if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")

    rewritten = rewrite_article_text(article)

    tags: list[str] = []
    try:
        tags = generate_article_tags(article, rewritten_text=rewritten)
    except Exception:
        # Keep going without tags, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Tag generation failed for article #%s; continuing without tags",
            article_id,
            exc_info=True,
        )
        tags = []
    merged_meta = merge_generated_tags(article.get("meta_json"), tags)

    # The upsert takes a full record, so every stored field is carried over
    # unchanged and only the rewrite-related ones are replaced below.
    carried_over = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "image_urls_json",
        "press_contact",
        "source_name_snapshot",
        "source_terms_url_snapshot",
        "source_license_name_snapshot",
        "legal_checked_at",
        "legal_note",
        "wp_post_id",
        "wp_post_url",
        "publish_last_error",
        "published_to_wp_at",
    )
    values = {name: article.get(name) for name in carried_over}
    values.update(
        content_rewritten=rewritten,
        # ``or 0`` also covers a stored None, which made int() raise before.
        legal_checked=bool(int(article.get("legal_checked") or 0)),
        publish_attempts=int(article.get("publish_attempts") or 0),
        word_count=len(rewritten.split()),
        status="approved",
        meta_json=merged_meta,
    )
    repo_upsert_article(ArticleUpsert(**values))

    return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/legal-review")
def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Record the legal-review verdict for an article.

    Raises:
        HTTPException: 404 when the article is missing on lookup or when the
            update reports nothing changed.
    """
    if not get_article_by_id(article_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    stored = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username)
    if not stored:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    return {"ok": True, "id": article_id, "legal_checked": payload.approved}
|
||||
|
||||
|
||||
@app.get("/api/publisher/jobs")
def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict:
    """List publish jobs, passing ``limit`` through to the repository layer."""
    jobs = list_publish_jobs(limit=limit)
    return {"ok": True, "items": jobs, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/publisher/enqueue")
def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict:
    """Queue a publish job for an existing article.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    target_id = payload.article_id
    if not get_article_by_id(target_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    job_id = enqueue_publish(article_id=target_id, max_attempts=payload.max_attempts)
    return {"ok": True, "job_id": job_id, "article_id": target_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/publisher/run")
def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict:
    """Run the publisher for up to ``max_jobs`` jobs and report its counters."""
    result = run_publisher(max_jobs=payload.max_jobs)
    counters = {
        "processed": result.processed,
        "success": result.success,
        "failed": result.failed,
        "requeued": result.requeued,
    }
    return {"ok": True, "requested_by": username, "stats": counters}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/review")
def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Retired endpoint: always answers 410 Gone for authenticated callers.

    The parameters are still declared, so authentication and request
    validation run before the 410 is raised.
    """
    gone_detail = "Review-Endpoint ersetzt durch Rewrite-Workflow"
    raise HTTPException(status_code=status.HTTP_410_GONE, detail=gone_detail)
|
||||
|
||||
|
||||
@app.post("/api/ingestion/run")
def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
    """Run ingestion (optionally for one feed) and report the outcome.

    ``ok`` is true only when the run's status equals ``"success"``.
    """
    result = run_ingestion(feed_id=payload.feed_id)
    counters = {
        "feeds_processed": result.feeds_processed,
        "entries_seen": result.entries_seen,
        "articles_upserted": result.articles_upserted,
    }
    return {
        "ok": result.status == "success",
        "run_id": result.run_id,
        "status": result.status,
        "message": result.message,
        "stats": counters,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# N8N Automation endpoint (API-Key auth, no session cookie required)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _require_api_key(request: Request) -> None:
    """Authorise an automation request by its API key.

    The key is read from the ``X-API-Key`` header or, failing that, from the
    ``api_key`` query parameter, and compared with ``settings.n8n_api_key``.

    Raises:
        HTTPException: 501 when no key is configured, 401 when the supplied
            key is missing or does not match.
    """
    import hmac

    expected = settings.n8n_api_key
    if not expected:
        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert")

    # NOTE(review): keys sent as a query parameter can end up in access logs;
    # the header form is preferable. Kept for backward compatibility.
    supplied = request.headers.get("X-API-Key") or request.query_params.get("api_key") or ""

    # Constant-time comparison so response timing does not leak how much of
    # the key matched. Comparing bytes also avoids the ASCII-only restriction
    # compare_digest places on str arguments.
    if not hmac.compare_digest(supplied.encode("utf-8"), str(expected).encode("utf-8")):
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key")
|
||||
|
||||
|
||||
# Held for as long as a background pipeline run is in flight.
_pipeline_lock = asyncio.Lock()

# Strong references to running pipeline tasks: the event loop itself only
# keeps weak references, so an unreferenced task can be garbage-collected
# before it finishes.
_pipeline_tasks: set = set()


@app.post("/api/n8n/pipeline")
async def api_n8n_pipeline(request: Request) -> dict:
    """Trigger the full auto pipeline in background. Returns immediately.

    Called by N8N (2x/day or on demand). Results arrive via Telegram.
    If a run is already in progress, the trigger is ignored and
    ``{"ok": False, ...}`` is returned.
    """
    _require_api_key(request)
    log = logging.getLogger(__name__)

    if _pipeline_lock.locked():
        log.warning("Pipeline bereits aktiv – Trigger ignoriert")
        return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"}

    # Take the lock here rather than inside the task: acquiring an unlocked
    # asyncio.Lock does not suspend, so no second trigger can slip in between
    # the check above and the moment the task would have taken the lock.
    await _pipeline_lock.acquire()

    async def _run() -> None:
        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n"))
        except Exception as exc:
            log.error("Background pipeline error: %s", exc, exc_info=True)

    try:
        task = asyncio.create_task(_run())
    except BaseException:
        _pipeline_lock.release()
        raise

    def _on_done(finished) -> None:
        # Runs on normal completion, failure and cancellation alike, so the
        # lock is always released.
        _pipeline_tasks.discard(finished)
        _pipeline_lock.release()

    _pipeline_tasks.add(task)
    task.add_done_callback(_on_done)

    return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"}
|
||||
|
||||
|
||||
@app.post("/api/n8n/ingest")
def api_n8n_ingest(request: Request) -> dict:
    """Run only the ingestion step (no rewrite/publish). For N8N.

    Authorised by API key instead of a session cookie; ``ok`` is true only
    when the ingestion status equals ``"success"``.
    """
    _require_api_key(request)
    result = run_ingestion()
    counters = {
        "feeds_processed": result.feeds_processed,
        "entries_seen": result.entries_seen,
        "articles_upserted": result.articles_upserted,
    }
    return {"ok": result.status == "success", "stats": counters}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Telegram Webhook
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Strong references to in-flight update handlers: the event loop only keeps
# weak references to tasks, so an unreferenced one may be garbage-collected
# before it finishes.
_telegram_tasks: set = set()


@app.post("/telegram/webhook")
async def telegram_webhook(request: Request) -> dict:
    """Receive updates from Telegram Bot API.

    Returns 200 immediately so Telegram never retries the same update.
    Actual processing runs in a background task.

    Raises:
        HTTPException: 403 when a webhook secret is configured and the
            ``X-Telegram-Bot-Api-Secret-Token`` header does not match it,
            400 when the body is not valid JSON.
    """
    import asyncio
    import hmac
    import logging

    log = logging.getLogger(__name__)

    # Verify secret token (only enforced when one is configured).
    secret = settings.telegram_webhook_secret
    if secret:
        incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "")
        # Constant-time comparison; bytes avoid compare_digest's ASCII-only
        # restriction on str arguments.
        if not hmac.compare_digest(incoming.encode("utf-8"), str(secret).encode("utf-8")):
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret")

    body = await request.body()
    try:
        update = json.loads(body.decode("utf-8"))
    except (ValueError, RecursionError) as exc:
        # ValueError covers both UnicodeDecodeError and JSONDecodeError.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") from exc

    async def _process() -> None:
        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, lambda: handle_update(update))
        except Exception as exc:
            log.error("Telegram update handler error: %s", exc, exc_info=True)

    task = asyncio.create_task(_process())
    _telegram_tasks.add(task)
    task.add_done_callback(_telegram_tasks.discard)
    return {"ok": True}
|
||||
|
||||
|
||||
@app.post("/api/telegram/setup-webhook")
def api_setup_telegram_webhook(request: Request) -> dict:
    """Register the Telegram webhook URL. Call once after deployment.

    The URL is built from the base URL of the incoming request, so it
    reflects whatever host and scheme this request arrived with.
    """
    requested_by = require_auth(request)
    webhook_url = str(request.base_url).rstrip("/") + "/telegram/webhook"
    telegram_response = setup_webhook(webhook_url)
    return {
        "ok": True,
        "webhook_url": webhook_url,
        "telegram_response": telegram_response,
        "requested_by": requested_by,
    }
|
||||
516
backend/app/pipeline.py
Normal file
516
backend/app/pipeline.py
Normal file
|
|
@ -0,0 +1,516 @@
|
|||
"""Autonomous RSS-News pipeline.
|
||||
|
||||
Full automated flow:
|
||||
1. Run RSS ingestion
|
||||
2. For each new article:
|
||||
- Auto-select primary image
|
||||
- Score relevance via GPT
|
||||
- < warn threshold: reject (error status) → Telegram rejected summary
|
||||
- warn..auto threshold: Telegram warning with override button
|
||||
- >= auto threshold: rewrite → create WP draft → Telegram notification
|
||||
3. Send pipeline summary to Telegram
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from .config import get_settings
|
||||
from .ingestion import run_ingestion
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
get_article_by_id,
|
||||
list_articles,
|
||||
set_article_image_decision,
|
||||
update_article_status,
|
||||
upsert_article as repo_upsert_article,
|
||||
)
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance
|
||||
from .scheduler import reserve_publish_slot
|
||||
from .wordpress import publish_article_draft, selected_image_exists
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PipelineStats:
    """Counters collected during one pipeline run; all start at zero."""

    ingested: int = 0  # set from the ingestion result's ``articles_upserted``
    processed: int = 0
    drafts_created: int = 0
    rejected: int = 0
    quality_gate_rejected: int = 0
    warnings: int = 0
    errors: int = 0
    no_image: int = 0
    # A fresh list per instance, never shared between runs.
    rejected_articles: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _auto_select_image(article: dict[str, Any]) -> bool:
|
||||
"""Auto-select the primary image from ingestion metadata if not already selected."""
|
||||
meta_json = article.get("meta_json") or "{}"
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
# Already selected?
|
||||
image_review = meta.get("image_review") or {}
|
||||
if isinstance(image_review, dict) and image_review.get("selected_url"):
|
||||
return True
|
||||
|
||||
# Try to get primary from ingestion extraction
|
||||
extraction = meta.get("extraction") or {}
|
||||
image_selection = extraction.get("image_selection") or {}
|
||||
primary = image_selection.get("primary")
|
||||
|
||||
if not primary:
|
||||
# Fallback: use first URL from image_urls_json
|
||||
image_urls_json = article.get("image_urls_json") or "[]"
|
||||
try:
|
||||
urls = json.loads(image_urls_json)
|
||||
if urls:
|
||||
primary = urls[0]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if primary:
|
||||
set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline")
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
    """Persist relevance score and reason in article meta_json and relevance_score column.

    Does nothing when the article does not exist. Existing metadata that is
    unparseable or not a JSON object is replaced by a fresh object holding
    only the ``relevance`` entry.
    """
    article = get_article_by_id(article_id)
    if not article:
        return

    try:
        meta = json.loads(article.get("meta_json") or "{}")
    except (TypeError, ValueError):
        meta = {}
    # "null", "[]" and similar decode fine but cannot take a key assignment.
    if not isinstance(meta, dict):
        meta = {}

    meta["relevance"] = relevance
    new_meta = json.dumps(meta, ensure_ascii=False)

    from .db import get_conn

    with get_conn() as conn:
        conn.execute(
            "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?",
            (new_meta, relevance.get("score", 0), article_id),
        )
|
||||
|
||||
|
||||
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
    """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url).

    Steps: check the raw word count, rewrite, check the rewritten word count,
    generate tags (best-effort), save the article as ``approved``, make sure
    a publish slot is reserved, create the WordPress draft and record the
    result.

    Raises:
        ValueError: A quality gate failed; the article has already been set
            to status ``error`` with the reason as note.
        RuntimeError: The article could not be reloaded after saving or after
            the slot reservation.
    """
    import re

    article_id = int(article["id"])
    settings = get_settings()

    def _fail_quality_gate(note: str) -> ValueError:
        """Log the rejection, mark the article as error, return the exception to raise."""
        logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
        update_article_status(article_id, "error", actor="pipeline", note=note)
        return ValueError(note)

    # ── Quality gate 1: raw content length (HTML tags stripped first) ───────
    raw_text = re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
    raw_words = len(raw_text.split())
    if raw_words < settings.pipeline_min_words_raw:
        raise _fail_quality_gate(
            f"Zu wenig Rohinhalt: {raw_words} Wörter "
            f"(Minimum: {settings.pipeline_min_words_raw})"
        )

    # Rewrite
    logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
    rewritten = rewrite_article_text(article)

    # ── Quality gate 2: rewritten content length ────────────────────────────
    rewritten_words = len(rewritten.split())
    if rewritten_words < settings.pipeline_min_words_rewritten:
        raise _fail_quality_gate(
            f"Rewrite zu kurz: {rewritten_words} Wörter "
            f"(Minimum: {settings.pipeline_min_words_rewritten})"
        )
    logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, rewritten_words)

    tags: list[str] = []
    try:
        tags = generate_article_tags(article, rewritten_text=rewritten)
    except Exception:
        # Tags are optional; continue without them but leave a trace.
        logger.warning(
            "_do_rewrite_and_draft #%d: Tag-Generierung fehlgeschlagen — fahre ohne Tags fort",
            article_id,
            exc_info=True,
        )
    merged_meta = merge_generated_tags(article.get("meta_json"), tags)

    # Save rewritten content + approved status. The upsert takes a full
    # record, so all stored fields are carried over unchanged.
    carried_over = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "image_urls_json",
        "press_contact",
        "source_name_snapshot",
        "source_terms_url_snapshot",
        "source_license_name_snapshot",
        "legal_checked_at",
        "legal_note",
        "wp_post_id",
        "wp_post_url",
        "publish_last_error",
        "published_to_wp_at",
    )
    values = {name: article.get(name) for name in carried_over}
    values.update(
        title=article.get("title", ""),
        source_url=article.get("source_url", ""),
        content_rewritten=rewritten,
        # ``or 0`` also covers a stored None, which made int() raise before.
        legal_checked=bool(int(article.get("legal_checked") or 0)),
        publish_attempts=int(article.get("publish_attempts") or 0),
        word_count=rewritten_words,
        status="approved",
        meta_json=merged_meta,
    )
    repo_upsert_article(ArticleUpsert(**values))

    # Reload after save to get updated meta_json
    fresh = get_article_by_id(article_id)
    if not fresh:
        raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden")

    # Ensure a publish slot is reserved — reserve one now if not yet set
    if not fresh.get("scheduled_publish_at"):
        logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id)
        reserve_publish_slot(article_id)
        fresh = get_article_by_id(article_id)
        if not fresh:
            raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden")

    # Create WP draft
    logger.info(
        "_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)",
        article_id,
        fresh.get("wp_post_id"),
        fresh.get("scheduled_publish_at"),
    )
    wp_post_id, wp_post_url = publish_article_draft(fresh)
    logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id)

    # Update WP info in DB; the status is deliberately not set to published.
    from .repositories import mark_article_publish_result

    mark_article_publish_result(
        article_id,
        wp_post_id=wp_post_id,
        wp_post_url=wp_post_url,
        error=None,
        increment_attempts=True,
        set_published_status=False,
    )

    return wp_post_id, wp_post_url
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public pipeline functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
    """Run the full automated pipeline and return a stats dict.

    Steps: ingestion, processing of up to 100 articles with status ``new``,
    an optional Telegram summary of rejected articles, and a final
    "pipeline done" notification.

    Args:
        trigger: Label forwarded to the "pipeline started" notification.

    Returns:
        The counters collected in this run's ``PipelineStats``.
    """
    from . import telegram_bot as tg

    def _notify_error_safely(message: str) -> None:
        # Reporting an error must never abort the run that is being reported on.
        try:
            tg.notify_error(message)
        except Exception as notify_exc:
            logger.warning("Telegram Fehlerbenachrichtigung fehlgeschlagen: %s", notify_exc)

    settings = get_settings()
    stats = PipelineStats()

    tg.notify_pipeline_started(trigger)

    # Step 1: Ingestion
    try:
        ingest_result = run_ingestion()
        stats.ingested = ingest_result.articles_upserted
    except Exception as exc:
        logger.error("Ingestion error: %s", exc)
        _notify_error_safely(f"Ingestion fehlgeschlagen: {exc}")
        stats.errors += 1

    # Step 2: Process new articles
    new_articles = list_articles(limit=100, status_filter="new")

    for article in new_articles:
        article_id = int(article["id"])
        try:
            _process_article(article, stats, settings)
        except Exception as exc:
            logger.error("Fehler bei Artikel #%d: %s", article_id, exc)
            # The title may be stored as None; slicing None here would raise
            # inside the handler and stop the remaining articles.
            title = (article.get("title") or "?")[:50]
            _notify_error_safely(f"Fehler bei Artikel #{article_id} ({title}): {exc}")
            stats.errors += 1
        # Rate limiting between OpenAI calls
        time.sleep(1)

    # Step 3: Send rejected summary if any
    if stats.rejected_articles:
        try:
            tg.notify_rejected_summary(stats.rejected_articles)
        except Exception as exc:
            logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc)

    # Step 4: Summary
    result = {
        "ingested": stats.ingested,
        "processed": stats.processed,
        "drafts_created": stats.drafts_created,
        "rejected": stats.rejected,
        "quality_gate_rejected": stats.quality_gate_rejected,
        "no_image": stats.no_image,
        "warnings": stats.warnings,
        "errors": stats.errors,
    }
    tg.notify_pipeline_done(result)
    return result
|
||||
|
||||
|
||||
def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None:
    """Run one ``new`` article through image selection, scoring and drafting.

    Outcomes, all recorded in *stats*:
      * no selected image   -> status ``no_image``
      * score below ``settings.pipeline_relevance_warn`` -> status ``error``
      * score below ``settings.pipeline_relevance_auto`` -> status ``review``
      * otherwise           -> publish slot reserved, rewrite + WP draft

    A ``ValueError`` from the draft step is handled as a quality-gate
    rejection; any other exception marks the article as ``error``, releases
    the slot and is re-raised.
    """
    from . import telegram_bot as tg

    article_id = int(article["id"])

    # Select an image, then reload to see the updated image_review.
    _auto_select_image(article)
    article = get_article_by_id(article_id) or article

    try:
        meta = json.loads(article.get("meta_json") or "{}")
    except Exception:
        meta = {}
    selected_image = (meta.get("image_review") or {}).get("selected_url")

    # --- Exclude articles without a usable image -------------------------
    if not selected_image:
        update_article_status(
            article_id,
            "no_image",
            actor="pipeline",
            note="Kein Bild vorhanden – Artikel ausgeschlossen",
        )
        stats.no_image += 1
        logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
        try:
            tg.send_message(
                f"🖼️ <b>Kein Bild</b> – Artikel #{article_id} ausgeschlossen\n"
                f"📰 {(article.get('title') or '')[:80]}"
            )
        except Exception:
            # Best effort: the notification is optional.
            pass
        return

    # --- Relevance scoring ------------------------------------------------
    try:
        relevance = score_article_relevance(article)
    except Exception as scoring_exc:
        logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, scoring_exc)
        relevance = {"score": 0, "reason": f"Scoring-Fehler: {scoring_exc}", "topics": []}

    score = relevance.get("score", 0)
    reason = relevance.get("reason", "")
    _store_relevance(article_id, relevance)
    stats.processed += 1

    # --- Below the warn threshold: reject ---------------------------------
    if score < settings.pipeline_relevance_warn:
        update_article_status(
            article_id,
            "error",
            actor="pipeline",
            note=f"Abgelehnt: Score {score}/100 — {reason}",
        )
        stats.rejected += 1
        # Reload so the summary entry carries the stored relevance.
        rejected = get_article_by_id(article_id)
        if rejected:
            stats.rejected_articles.append(rejected)
        return

    # --- Warning zone: park in "review" so repeated runs don't re-warn ----
    if score < settings.pipeline_relevance_auto:
        update_article_status(
            article_id,
            "review",
            actor="pipeline",
            note=f"Niedrige Relevanz: Score {score}/100 — {reason}",
        )
        stats.warnings += 1
        try:
            tg.notify_relevance_warning(article, score, reason)
        except Exception as warn_exc:
            logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, warn_exc)
        return

    # --- Auto-process: rewrite + WP draft ---------------------------------
    try:
        # Reserve the publish slot first so it exists when the draft is created.
        slot = reserve_publish_slot(article_id)

        reloaded = get_article_by_id(article_id)
        if not reloaded:
            return
        _wp_post_id, _wp_post_url = _do_rewrite_and_draft(reloaded)
        stats.drafts_created += 1

        drafted = get_article_by_id(article_id)
        if drafted:
            try:
                tg.notify_new_draft(drafted, score=score, suggested_publish_at=slot)
            except Exception as notify_exc:
                logger.warning(
                    "Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s",
                    article_id,
                    notify_exc,
                )

    except ValueError as exc:
        # Quality gate rejection; per the original note the status is already
        # set inside _do_rewrite_and_draft. Free the slot for the next article.
        from .scheduler import release_publish_slot

        release_publish_slot(article_id)

        # Remove a WP draft left over from a previous pipeline run.
        leftover = get_article_by_id(article_id)
        if leftover and leftover.get("wp_post_id"):
            try:
                from .wordpress import delete_wp_post

                delete_wp_post(int(leftover["wp_post_id"]))
                logger.info(
                    "Artikel #%d: veralteten WP-Draft #%s gelöscht",
                    article_id,
                    leftover["wp_post_id"],
                )
            except Exception as del_exc:
                logger.warning(
                    "Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s",
                    article_id,
                    del_exc,
                )

        stats.quality_gate_rejected += 1
        logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)

        # Individual Telegram notification for this rejection.
        try:
            title = (article.get("title") or "Ohne Titel")[:80]
            tg.send_message(
                f"✂️ <b>Qualitätsprüfung nicht bestanden</b>\n"
                f"📰 {title}\n"
                f"💯 Score: {score}/100\n"
                f"⚠️ {exc}"
            )
        except Exception as tg_exc:
            logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc)

    except Exception as exc:
        logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
        update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
        # A failed article must not keep its reserved slot.
        from .scheduler import release_publish_slot

        release_publish_slot(article_id)
        raise
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Callback actions (called from telegram_bot._handle_callback)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def rewrite_and_update_draft(article_id: int) -> None:
    """Rewrite an article and create or update its WP draft.

    Args:
        article_id: Id of the article to rewrite.

    Raises:
        RuntimeError: If the article does not exist, or can no longer be
            loaded after image selection.
    """
    article = get_article_by_id(article_id)
    if not article:
        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")

    _auto_select_image(article)

    # Reload after image selection; the row may be gone by now, and passing
    # None into the rewrite step would fail with an unrelated error.
    fresh = get_article_by_id(article_id)
    if not fresh:
        raise RuntimeError(f"Artikel #{article_id} nach Bildauswahl nicht gefunden")
    _do_rewrite_and_draft(fresh)
|
||||
|
||||
|
||||
def discard_article(article_id: int) -> None:
    """Discard a draft: delete the WP post if one exists, then set status ``error``.

    Does nothing when the article cannot be found. A failed WP deletion is
    logged and does not prevent the status change.
    """
    article = get_article_by_id(article_id)
    if not article:
        return

    wp_post_id = article.get("wp_post_id")
    if wp_post_id:
        try:
            from .wordpress import delete_wp_post

            delete_wp_post(int(wp_post_id))
        except Exception as exc:
            # %s, not %d: if int() above failed, the id is not numeric and
            # %d would make the log call itself fail to format.
            logger.warning("WP Post #%s konnte nicht gelöscht werden: %s", wp_post_id, exc)

    update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen")
|
||||
|
||||
|
||||
def override_rejected_article(article_id: int) -> None:
    """Force-process a previously rejected article.

    Resets the article to ``new``, selects an image, reserves a publish
    slot, runs rewrite + WP draft and sends the new-draft notification.

    Raises:
        RuntimeError: If the article does not exist or disappears while it
            is being processed.
        Exception: Whatever the rewrite/draft step raises; the reserved
            slot is released before re-raising.
    """
    from . import telegram_bot as tg

    article = get_article_by_id(article_id)
    if not article:
        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")

    # Reset to new so processing is allowed
    update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram")

    fresh = get_article_by_id(article_id)
    if not fresh:
        return

    _auto_select_image(fresh)
    fresh = get_article_by_id(article_id)
    if not fresh:
        raise RuntimeError(f"Artikel #{article_id} nach Bildauswahl nicht gefunden")

    # Reuse the stored relevance score; fall back to 0 when absent or invalid.
    try:
        meta = json.loads(fresh.get("meta_json") or "{}")
        score = int((meta.get("relevance") or {}).get("score", 0))
    except Exception:
        score = 0

    # Reserve publish slot FIRST so it's in the DB when WP draft is created
    slot = reserve_publish_slot(article_id)
    try:
        fresh = get_article_by_id(article_id)
        if not fresh:
            raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden")
        _do_rewrite_and_draft(fresh)
    except Exception:
        # Same policy as the automatic pipeline: a failed article must not
        # keep its reserved slot.
        from .scheduler import release_publish_slot

        release_publish_slot(article_id)
        raise

    final = get_article_by_id(article_id)
    if final:
        tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Status helpers (used by /status command)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]:
    """Return recently updated ``error``/``review`` articles that have a relevance score.

    Args:
        days: Look-back window, applied to ``updated_at`` in the query.

    Returns:
        At most 20 rows (id, title, meta_json, source_url, created_at),
        most recently updated first.
    """
    from .db import get_conn, rows_to_dicts

    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, meta_json, source_url, created_at
            FROM articles
            WHERE status IN ('error', 'review')
            AND json_extract(meta_json, '$.relevance.score') IS NOT NULL
            AND date(updated_at) >= date('now', ?)
            ORDER BY updated_at DESC
            LIMIT 20
            """,
            (f"-{days} days",),
        ).fetchall()
        return rows_to_dicts(rows)
|
||||
|
||||
|
||||
def get_pipeline_status_text() -> str:
    """Build the status message: article counts for four statuses.

    NOTE(review): each count is the length of a listing requested with
    ``limit=500``, so it presumably saturates at 500 — confirm against
    ``list_articles``.
    """
    from .repositories import list_articles as _list

    def _count(status: str) -> int:
        return len(_list(limit=500, status_filter=status))

    new_count = _count("new")
    approved_count = _count("approved")
    published_count = _count("published")
    error_count = _count("error")

    lines = [
        "📊 <b>Pipeline-Status</b>",
        f"🆕 Neu / wartend: {new_count}",
        f"✅ Draft / freigegeben: {approved_count}",
        f"📢 Veröffentlicht: {published_count}",
        f"🚫 Fehler / abgelehnt: {error_count}",
    ]
    return "\n".join(lines)
|
||||
35
backend/app/policy.py
Normal file
35
backend/app/policy.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
|
||||
issues: list[str] = []
|
||||
if not source:
|
||||
issues.append("Keine Quelle zugeordnet")
|
||||
return issues
|
||||
|
||||
risk_level = (source.get("risk_level") or "").strip().lower()
|
||||
if risk_level != "green":
|
||||
issues.append(f"Quelle nicht freigegeben (risk_level={risk_level or 'unset'})")
|
||||
|
||||
terms_url = (source.get("terms_url") or "").strip()
|
||||
if not terms_url:
|
||||
issues.append("terms_url fehlt")
|
||||
|
||||
license_name = (source.get("license_name") or "").strip()
|
||||
if not license_name:
|
||||
issues.append("license_name fehlt")
|
||||
|
||||
last_reviewed_at = (source.get("last_reviewed_at") or "").strip()
|
||||
if not last_reviewed_at:
|
||||
issues.append("last_reviewed_at fehlt")
|
||||
|
||||
if int(source.get("is_enabled", 0) or 0) != 1:
|
||||
issues.append("Quelle ist deaktiviert")
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
def is_source_allowed(source: dict[str, Any] | None) -> bool:
    """Return True when ``evaluate_source_policy`` reports no issues for *source*."""
    return not evaluate_source_policy(source)
|
||||
101
backend/app/publisher.py
Normal file
101
backend/app/publisher.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .repositories import (
|
||||
claim_next_publish_job,
|
||||
complete_publish_job,
|
||||
create_publish_job,
|
||||
fail_publish_job,
|
||||
get_article_by_id,
|
||||
mark_article_publish_result,
|
||||
PublishJobCreate,
|
||||
)
|
||||
from .wordpress import publish_article_draft, selected_image_exists
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PublisherStats:
    """Immutable counters returned by ``run_publisher``."""

    # Jobs claimed during the run.
    processed: int
    # Jobs whose publish call completed.
    success: int
    # Jobs that ended without being requeued.
    failed: int
    # Jobs put back into the queue after an error.
    requeued: int
|
||||
|
||||
|
||||
def enqueue_publish(article_id: int, max_attempts: int = 3) -> int:
    """Create a publish job for *article_id* and return the value from ``create_publish_job``."""
    job_request = PublishJobCreate(article_id=article_id, max_attempts=max_attempts)
    return create_publish_job(job_request)
|
||||
|
||||
|
||||
def _can_publish(article: dict) -> tuple[bool, str | None]:
|
||||
if article.get("status") not in {"approved", "published"}:
|
||||
return False, "Artikelstatus muss 'publish' sein"
|
||||
if not selected_image_exists(article):
|
||||
return False, "Hauptbild nicht gesetzt"
|
||||
return True, None
|
||||
|
||||
|
||||
def run_publisher(max_jobs: int = 10) -> PublisherStats:
    """Claim and process publish jobs, at most ``max(1, max_jobs)`` of them.

    Each claimed job is failed without requeue when its article is missing
    or not publishable. Otherwise the article is published; on an exception
    the job is requeued while its ``attempts`` are below ``max_attempts``.

    Returns:
        Counters for processed, successful, failed and requeued jobs.
    """
    processed = success = failed = requeued = 0
    job_budget = max(1, max_jobs)

    def _record_failure(article_id: int, article: dict, message: str) -> None:
        # Store the error on the article while keeping its known WP post data.
        mark_article_publish_result(
            article_id,
            wp_post_id=article.get("wp_post_id"),
            wp_post_url=article.get("wp_post_url"),
            error=message,
            increment_attempts=True,
            set_published_status=False,
        )

    while processed < job_budget:
        job = claim_next_publish_job()
        if not job:
            break
        processed += 1

        job_id = int(job["id"])
        article_id = int(job["article_id"])

        article = get_article_by_id(article_id)
        if not article:
            fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False)
            failed += 1
            continue

        allowed, reason = _can_publish(article)
        if not allowed:
            fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False)
            _record_failure(article_id, article, reason or "blocked")
            failed += 1
            continue

        try:
            wp_post_id, wp_post_url = publish_article_draft(article)
            complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url)
            mark_article_publish_result(
                article_id,
                wp_post_id=wp_post_id,
                wp_post_url=wp_post_url,
                error=None,
                increment_attempts=True,
                set_published_status=True,
            )
            success += 1
        except Exception as exc:
            retry = int(job.get("attempts", 1)) < int(job.get("max_attempts", 3))
            fail_publish_job(job_id, str(exc), requeue=retry)
            _record_failure(article_id, article, str(exc))
            if retry:
                requeued += 1
            else:
                failed += 1

    return PublisherStats(processed=processed, success=success, failed=failed, requeued=requeued)
|
||||
44
backend/app/relevance.py
Normal file
44
backend/app/relevance.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def _parse_iso_datetime(value: str | None) -> datetime | None:
|
||||
if not value:
|
||||
return None
|
||||
raw = value.strip()
|
||||
if not raw:
|
||||
return None
|
||||
if raw.endswith("Z"):
|
||||
raw = raw[:-1] + "+00:00"
|
||||
try:
|
||||
parsed = datetime.fromisoformat(raw)
|
||||
except ValueError:
|
||||
return None
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=timezone.utc)
|
||||
return parsed
|
||||
|
||||
|
||||
def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None:
    """Return the article's age in whole days, or None if the date is unusable.

    Args:
        published_at: ISO-8601 publication timestamp.
        now: Reference time; defaults to the current UTC time. A naive
            value is treated as UTC, matching how naive publication
            timestamps are parsed.

    Returns:
        Whole days between *published_at* and the reference time, 0 for
        dates in the future, None when *published_at* cannot be parsed.
    """
    published = _parse_iso_datetime(published_at)
    if published is None:
        return None

    ref = now or datetime.now(timezone.utc)
    if ref.utcoffset() is None:
        # Subtracting an aware datetime from a naive one raises TypeError.
        ref = ref.replace(tzinfo=timezone.utc)

    delta = ref - published
    if delta.total_seconds() < 0:
        return 0
    return delta.days
|
||||
|
||||
|
||||
def article_relevance(published_at: str | None, now: datetime | None = None) -> str:
    """Map an article's age to a relevance label.

    Returns "unbekannt" when the age cannot be determined, otherwise "hoch"
    (up to 2 days), "mittel" (up to 7), "niedrig" (up to 30) or "alt".
    """
    age = article_age_days(published_at, now=now)
    if age is None:
        return "unbekannt"

    # Upper age bound (inclusive, in days) for each label, youngest first.
    for max_days, label in ((2, "hoch"), (7, "mittel"), (30, "niedrig")):
        if age <= max_days:
            return label
    return "alt"
|
||||
855
backend/app/repositories.py
Normal file
855
backend/app/repositories.py
Normal file
|
|
@ -0,0 +1,855 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from .db import get_conn, rows_to_dicts
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SourceCreate:
|
||||
name: str
|
||||
base_url: str | None
|
||||
terms_url: str | None
|
||||
license_name: str | None
|
||||
risk_level: str
|
||||
is_enabled: bool
|
||||
notes: str | None
|
||||
last_reviewed_at: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FeedCreate:
|
||||
name: str
|
||||
url: str
|
||||
source_id: int | None
|
||||
is_enabled: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SourceUpdate:
|
||||
name: str
|
||||
base_url: str | None
|
||||
terms_url: str | None
|
||||
license_name: str | None
|
||||
risk_level: str
|
||||
is_enabled: bool
|
||||
notes: str | None
|
||||
last_reviewed_at: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FeedUpdate:
|
||||
name: str
|
||||
url: str
|
||||
source_id: int | None
|
||||
is_enabled: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RunCreate:
|
||||
run_type: str
|
||||
status: str
|
||||
details: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ArticleUpsert:
|
||||
feed_id: int | None
|
||||
source_article_id: str | None
|
||||
source_hash: str | None
|
||||
title: str
|
||||
source_url: str
|
||||
canonical_url: str | None
|
||||
published_at: str | None
|
||||
author: str | None
|
||||
summary: str | None
|
||||
content_raw: str | None
|
||||
content_rewritten: str | None
|
||||
image_urls_json: str | None
|
||||
press_contact: str | None
|
||||
source_name_snapshot: str | None
|
||||
source_terms_url_snapshot: str | None
|
||||
source_license_name_snapshot: str | None
|
||||
legal_checked: bool
|
||||
legal_checked_at: str | None
|
||||
legal_note: str | None
|
||||
wp_post_id: int | None
|
||||
wp_post_url: str | None
|
||||
publish_attempts: int
|
||||
publish_last_error: str | None
|
||||
published_to_wp_at: str | None
|
||||
word_count: int
|
||||
status: str
|
||||
meta_json: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PublishJobCreate:
    """Parameters for creating a publish job (consumed by ``create_publish_job``)."""

    article_id: int
    max_attempts: int = 3  # default attempt limit
|
||||
|
||||
|
||||
def create_source(payload: SourceCreate) -> int:
    """Insert a ``sources`` row from *payload* and return its new row id."""
    insert_sql = """
        INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    with get_conn() as conn:
        values = (
            payload.name.strip(),
            payload.base_url,
            payload.terms_url,
            payload.license_name,
            payload.risk_level,
            int(bool(payload.is_enabled)),  # store the flag as 1/0
            payload.notes,
            payload.last_reviewed_at,
        )
        cursor = conn.execute(insert_sql, values)
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def list_sources() -> list[dict[str, Any]]:
    """Return all ``sources`` rows as dicts, highest id first."""
    select_sql = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        ORDER BY id DESC
    """
    with get_conn() as conn:
        found = conn.execute(select_sql).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_source_by_id(source_id: int) -> dict[str, Any] | None:
    """Return the ``sources`` row with *source_id* as a dict, or None if absent."""
    select_sql = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        WHERE id = ?
    """
    with get_conn() as conn:
        record = conn.execute(select_sql, (source_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def update_source(source_id: int, payload: SourceUpdate) -> bool:
    """Overwrite the editable columns of a source; True if a row was updated."""
    update_sql = """
        UPDATE sources
        SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ?
        WHERE id = ?
    """
    with get_conn() as conn:
        values = (
            payload.name.strip(),
            payload.base_url,
            payload.terms_url,
            payload.license_name,
            payload.risk_level,
            int(bool(payload.is_enabled)),  # store the flag as 1/0
            payload.notes,
            payload.last_reviewed_at,
            source_id,
        )
        cursor = conn.execute(update_sql, values)
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def delete_source(source_id: int) -> bool:
    """Delete the source with *source_id*; True if a row was removed."""
    with get_conn() as conn:
        cursor = conn.execute("DELETE FROM sources WHERE id = ?", (source_id,))
        removed = cursor.rowcount
        return removed > 0
|
||||
|
||||
|
||||
def create_feed(payload: FeedCreate) -> int:
    """Insert a ``feeds`` row from *payload* and return its new row id."""
    with get_conn() as conn:
        values = (
            payload.name.strip(),
            payload.url.strip(),
            payload.source_id,
            int(bool(payload.is_enabled)),  # store the flag as 1/0
        )
        cursor = conn.execute(
            "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)",
            values,
        )
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def list_feeds() -> list[dict[str, Any]]:
    """Return all feeds joined with their source columns, highest id first.

    The source is LEFT JOINed, so feeds without a source are still listed.
    """
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
               s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        ORDER BY f.id DESC
    """
    with get_conn() as conn:
        found = conn.execute(select_sql).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def list_enabled_feeds() -> list[dict[str, Any]]:
    """Return feeds with ``is_enabled = 1`` joined with their source columns, lowest id first."""
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
               s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.is_enabled = 1
        ORDER BY f.id ASC
    """
    with get_conn() as conn:
        found = conn.execute(select_sql).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
    """Return one feed joined with its source columns as a dict, or None if absent."""
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
               s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.id = ?
    """
    with get_conn() as conn:
        record = conn.execute(select_sql, (feed_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def update_feed(feed_id: int, payload: FeedUpdate) -> bool:
    """Overwrite name, url, source and enabled flag of a feed; True if a row was updated."""
    update_sql = """
        UPDATE feeds
        SET name = ?, url = ?, source_id = ?, is_enabled = ?
        WHERE id = ?
    """
    with get_conn() as conn:
        values = (
            payload.name.strip(),
            payload.url.strip(),
            payload.source_id,
            int(bool(payload.is_enabled)),  # store the flag as 1/0
            feed_id,
        )
        cursor = conn.execute(update_sql, values)
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def delete_feed(feed_id: int) -> bool:
    """Delete the feed with *feed_id*; True if a row was removed."""
    with get_conn() as conn:
        cursor = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
        removed = cursor.rowcount
        return removed > 0
|
||||
|
||||
|
||||
def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
    """Store a feed's ``etag`` / ``last_modified`` and set ``last_checked_at`` to now."""
    update_sql = """
        UPDATE feeds
        SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(update_sql, (etag, last_modified, feed_id))
|
||||
|
||||
|
||||
def create_run(payload: RunCreate) -> int:
    """Insert a ``runs`` row from *payload* and return its new row id."""
    with get_conn() as conn:
        values = (payload.run_type, payload.status, payload.details)
        cursor = conn.execute(
            "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)",
            values,
        )
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def finish_run(run_id: int, status: str, details: str | None = None) -> None:
    """Set a run's final status and details and stamp ``finished_at`` with now."""
    update_sql = """
        UPDATE runs
        SET status = ?, details = ?, finished_at = datetime('now')
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(update_sql, (status, details, run_id))
|
||||
|
||||
|
||||
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs as dicts, highest id first.

    Args:
        limit: Maximum number of rows; clamped to the range 1..500.
    """
    row_cap = min(max(limit, 1), 500)
    select_sql = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        ORDER BY id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        found = conn.execute(select_sql, (row_cap,)).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_run_by_id(run_id: int) -> dict[str, Any] | None:
    """Return the ``runs`` row with *run_id* as a dict, or None if absent."""
    select_sql = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        WHERE id = ?
    """
    with get_conn() as conn:
        record = conn.execute(select_sql, (run_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def get_article_by_id(article_id: int) -> dict[str, Any] | None:
    """Return the ``articles`` row with *article_id* as a dict, or None if absent.

    The result carries the article columns listed in the query, including
    ``meta_json`` and ``scheduled_publish_at``.
    """
    select_sql = """
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
               a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact,
               a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot,
               a.legal_checked, a.legal_checked_at, a.legal_note,
               a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at,
               a.word_count, a.status, a.meta_json, a.created_at, a.updated_at,
               a.scheduled_publish_at
        FROM articles a
        WHERE a.id = ?
    """
    with get_conn() as conn:
        record = conn.execute(select_sql, (article_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
|
||||
meta: dict[str, Any] = {}
|
||||
if meta_json:
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
if not isinstance(meta, dict):
|
||||
meta = {}
|
||||
except Exception:
|
||||
meta = {}
|
||||
|
||||
events = meta.get("review_events")
|
||||
if not isinstance(events, list):
|
||||
events = []
|
||||
events.append(event)
|
||||
meta["review_events"] = events
|
||||
return json.dumps(meta, ensure_ascii=False)
|
||||
|
||||
|
||||
def _load_meta(meta_json: str | None) -> dict[str, Any]:
|
||||
if not meta_json:
|
||||
return {}
|
||||
try:
|
||||
parsed = json.loads(meta_json)
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def update_article_status(
    article_id: int,
    new_status: str,
    *,
    actor: str | None = None,
    note: str | None = None,
    decision: str | None = None,
) -> bool:
    """Set an article's status and log the transition in its ``meta_json``.

    Args:
        article_id: Article to update.
        new_status: Status value to store.
        actor: Who triggered the change; recorded as "system" when omitted.
        note: Optional free-text note stored with the event.
        decision: Optional decision value stored with the event.

    Returns:
        False when the article does not exist, True after the update.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    transition = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        from_status=current.get("status"),
        to_status=new_status,
        actor=actor or "system",
        note=note,
        decision=decision,
    )
    updated_meta = _merge_review_event(current.get("meta_json"), transition)

    with get_conn() as conn:
        conn.execute(
            "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?",
            (new_status, updated_meta, article_id),
        )
    return True
|
||||
|
||||
|
||||
def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool:
    """Store a legal-review result on the article and log it in ``meta_json``.

    Writes ``legal_checked`` (1/0), ``legal_checked_at`` (now) and
    ``legal_note``, and appends a ``legal_review`` event to the review log.

    Returns:
        False when the article does not exist, True after the update.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    review_event = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        event="legal_review",
        approved=approved,
        actor=actor or "system",
        note=note,
    )
    updated_meta = _merge_review_event(current.get("meta_json"), review_event)

    update_sql = """
        UPDATE articles
        SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ?
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(update_sql, (int(bool(approved)), note, updated_meta, article_id))
    return True
|
||||
|
||||
|
||||
def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool:
    """Record an editor's decision about one image of an article.

    ``action`` is one of:

    * ``"select"``  - mark the URL as the selected image and lift any exclusion
    * ``"exclude"`` - add the URL to the excluded list (and unselect it if it
      was the selected image)
    * ``"restore"`` - remove the URL from the excluded list

    The state lives under ``meta_json["image_review"]`` with the keys
    ``selected_url``, ``excluded_urls`` (sorted), ``updated_at`` and
    ``updated_by``.

    Returns ``False`` for an unknown article, a blank URL or an unknown
    action; ``True`` after the metadata has been written.
    """
    article = get_article_by_id(article_id)
    if not article:
        return False
    target = (image_url or "").strip()
    if not target:
        return False
    if action not in {"select", "exclude", "restore"}:
        return False

    meta = _load_meta(article.get("meta_json"))
    review = meta.get("image_review")
    if not isinstance(review, dict):
        review = {}

    # Tolerate malformed stored state: anything that is not a list counts as empty.
    stored_excluded = review.get("excluded_urls")
    excluded_urls = {str(entry) for entry in stored_excluded if entry} if isinstance(stored_excluded, list) else set()

    chosen = review.get("selected_url")
    if not isinstance(chosen, str):
        chosen = None

    if action == "exclude":
        excluded_urls.add(target)
        if chosen == target:
            chosen = None
    else:
        # "select" and "restore" both lift the exclusion; only "select" picks the URL.
        excluded_urls.discard(target)
        if action == "select":
            chosen = target

    review.update(
        {
            "selected_url": chosen,
            "excluded_urls": sorted(excluded_urls),
            "updated_at": datetime.now(timezone.utc).isoformat(),
            "updated_by": actor or "system",
        }
    )
    meta["image_review"] = review

    serialized = json.dumps(meta, ensure_ascii=False)
    with get_conn() as conn:
        conn.execute("UPDATE articles SET meta_json = ? WHERE id = ?", (serialized, article_id))
    return True
|
||||
|
||||
|
||||
def create_publish_job(payload: PublishJobCreate) -> int:
    """Queue a publish job for an article and return the job id.

    If the article already has a job in state ``queued`` or ``running``, the
    id of the newest such job is returned and nothing is inserted. Otherwise
    a new ``queued`` job with zero attempts is created; ``max_attempts`` is
    clamped to at least 1.
    """
    attempt_budget = max(1, payload.max_attempts)
    with get_conn() as conn:
        active_job = conn.execute(
            """
            SELECT id FROM publish_jobs
            WHERE article_id = ? AND status IN ('queued', 'running')
            ORDER BY id DESC
            LIMIT 1
            """,
            (payload.article_id,),
        ).fetchone()
        if active_job:
            return int(active_job["id"])

        inserted = conn.execute(
            """
            INSERT INTO publish_jobs (article_id, status, attempts, max_attempts)
            VALUES (?, 'queued', 0, ?)
            """,
            (payload.article_id, attempt_budget),
        )
        return int(inserted.lastrowid)
|
||||
|
||||
|
||||
def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]:
    """Return the newest publish jobs, each joined with its article title.

    ``limit`` is clamped to the range 1..500. Rows are ordered by job id,
    newest first, and converted with ``rows_to_dicts``.
    """
    row_cap = min(max(limit, 1), 500)
    query = """
        SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url,
               j.created_at, j.started_at, j.finished_at, a.title AS article_title
        FROM publish_jobs j
        LEFT JOIN articles a ON a.id = j.article_id
        ORDER BY j.id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        jobs = conn.execute(query, (row_cap,)).fetchall()
    return rows_to_dicts(jobs)
|
||||
|
||||
|
||||
def claim_next_publish_job() -> dict[str, Any] | None:
    """Claim the oldest queued publish job that still has attempts left.

    The chosen job is switched to ``running``, its attempt counter is
    incremented, ``started_at`` is set to the database "now" and
    ``finished_at`` is cleared.

    The UPDATE is guarded by ``status = 'queued'`` and its row count is
    checked, so if another worker claimed the same job between the SELECT
    and the UPDATE, this call does not claim it a second time.

    Returns:
        The claimed job as a dict, or ``None`` when no job is available or
        the candidate was taken by someone else in the meantime (the caller
        can simply poll again).
    """
    with get_conn() as conn:
        candidate = conn.execute(
            """
            SELECT id
            FROM publish_jobs
            WHERE status = 'queued' AND attempts < max_attempts
            ORDER BY id ASC
            LIMIT 1
            """
        ).fetchone()
        if not candidate:
            return None
        job_id = int(candidate["id"])

        claim = conn.execute(
            """
            UPDATE publish_jobs
            SET status = 'running',
                attempts = attempts + 1,
                started_at = datetime('now'),
                finished_at = NULL
            WHERE id = ? AND status = 'queued' AND attempts < max_attempts
            """,
            (job_id,),
        )
        if claim.rowcount != 1:
            # Lost the race: the job is no longer queued, so it is not ours.
            return None

        claimed = conn.execute(
            """
            SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
            FROM publish_jobs
            WHERE id = ?
            """,
            (job_id,),
        ).fetchone()
        return dict(claimed) if claimed else None
|
||||
|
||||
|
||||
def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None:
    """Mark a publish job as successful.

    Sets the status to ``success``, stores the given WordPress post id and
    URL, clears ``error_message`` and stamps ``finished_at`` with the
    database "now".
    """
    statement = """
        UPDATE publish_jobs
        SET status = 'success',
            wp_post_id = ?,
            wp_post_url = ?,
            error_message = NULL,
            finished_at = datetime('now')
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(statement, (wp_post_id, wp_post_url, job_id))
|
||||
|
||||
|
||||
def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None:
    """Record a failed publish attempt for a job.

    With ``requeue`` the job goes back to ``queued``, otherwise it becomes
    ``failed``. The error message is truncated to 2000 characters and
    ``finished_at`` is stamped with the database "now".
    """
    if requeue:
        next_status = "queued"
    else:
        next_status = "failed"
    truncated_error = error_message[:2000]
    statement = """
        UPDATE publish_jobs
        SET status = ?,
            error_message = ?,
            finished_at = datetime('now')
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(statement, (next_status, truncated_error, job_id))
|
||||
|
||||
|
||||
def mark_article_publish_result(
    article_id: int,
    *,
    wp_post_id: int | None,
    wp_post_url: str | None,
    error: str | None,
    increment_attempts: bool,
    set_published_status: bool,
) -> None:
    """Write the outcome of a publish attempt onto the article row.

    * ``wp_post_id`` / ``wp_post_url`` are stored as given.
    * ``publish_attempts`` is incremented only when ``increment_attempts``.
    * ``publish_last_error`` receives ``error`` truncated to 2000 characters
      (``NULL`` when ``error`` is empty).
    * ``published_to_wp_at`` is set to the database "now" only when
      ``wp_post_id`` is not ``None``.
    * ``status`` becomes ``'published'`` only when ``set_published_status``.
    """
    # NOTE(review): a failed attempt that passes wp_post_id=None overwrites a
    # previously stored post id/url with NULL - confirm this is intended.
    stored_error = error[:2000] if error else None
    bindings = (
        wp_post_id,
        wp_post_url,
        int(bool(increment_attempts)),
        stored_error,
        wp_post_id,  # bound a second time for the "IS NOT NULL" check
        int(bool(set_published_status)),
        article_id,
    )
    statement = """
        UPDATE articles
        SET wp_post_id = ?,
            wp_post_url = ?,
            publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END,
            publish_last_error = ?,
            published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END,
            status = CASE WHEN ? THEN 'published' ELSE status END
        WHERE id = ?
    """
    with get_conn() as conn:
        conn.execute(statement, bindings)
|
||||
|
||||
|
||||
def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
    """Find the id of an already stored article that matches ``payload``.

    Lookups are tried in order of decreasing strength and the first hit wins:

    1. exact ``source_url`` (stripped)
    2. ``feed_id`` + ``source_article_id`` (only when both are present)
    3. ``source_hash`` (only when present)

    Returns ``None`` when nothing matches.
    """
    lookups: list[tuple[str, tuple[Any, ...]]] = [
        ("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)),
    ]
    if payload.feed_id is not None and payload.source_article_id:
        lookups.append(
            (
                "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
                (payload.feed_id, payload.source_article_id),
            )
        )
    if payload.source_hash:
        lookups.append(("SELECT id FROM articles WHERE source_hash = ?", (payload.source_hash,)))

    with get_conn() as conn:
        for query, args in lookups:
            hit = conn.execute(query, args).fetchone()
            if hit:
                return int(hit["id"])
    return None
|
||||
|
||||
|
||||
def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None:
    """Return the stored article that ``payload`` would overwrite, if any.

    Uses the same matching rules as ``_resolve_existing_article_id`` and
    loads the matched row through ``get_article_by_id``.
    """
    matched_id = _resolve_existing_article_id(payload)
    return None if matched_id is None else get_article_by_id(matched_id)
|
||||
|
||||
|
||||
def upsert_article(payload: ArticleUpsert) -> int:
    """Insert a new article or overwrite the matching existing one.

    An existing row is located with ``_resolve_existing_article_id``. When
    one is found, all 27 payload-backed columns are overwritten; otherwise a
    new row is inserted. Title and source URL are stripped, ``legal_checked``
    is stored as 1/0.

    Returns the id of the row that now carries the payload's ``source_url``.
    If that lookup finds nothing, the previously resolved id is returned, or
    ``0`` when there was none.
    """
    existing_id = _resolve_existing_article_id(payload)
    with get_conn() as conn:
        source_url = payload.source_url.strip()
        # One value tuple shared by INSERT and UPDATE; the order must match
        # the column lists in both statements below.
        column_values = (
            payload.feed_id,
            payload.source_article_id,
            payload.source_hash,
            payload.title.strip(),
            source_url,
            payload.canonical_url,
            payload.published_at,
            payload.author,
            payload.summary,
            payload.content_raw,
            payload.content_rewritten,
            payload.image_urls_json,
            payload.press_contact,
            payload.source_name_snapshot,
            payload.source_terms_url_snapshot,
            payload.source_license_name_snapshot,
            1 if payload.legal_checked else 0,
            payload.legal_checked_at,
            payload.legal_note,
            payload.wp_post_id,
            payload.wp_post_url,
            payload.publish_attempts,
            payload.publish_last_error,
            payload.published_to_wp_at,
            payload.word_count,
            payload.status,
            payload.meta_json,
        )
        if existing_id is None:
            conn.execute(
                """
                INSERT INTO articles (
                    feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
                    summary, content_raw, content_rewritten, image_urls_json, press_contact,
                    source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
                    legal_checked, legal_checked_at, legal_note,
                    wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at,
                    word_count, status, meta_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                column_values,
            )
        else:
            conn.execute(
                """
                UPDATE articles
                SET
                    feed_id = ?,
                    source_article_id = ?,
                    source_hash = ?,
                    title = ?,
                    source_url = ?,
                    canonical_url = ?,
                    published_at = ?,
                    author = ?,
                    summary = ?,
                    content_raw = ?,
                    content_rewritten = ?,
                    image_urls_json = ?,
                    press_contact = ?,
                    source_name_snapshot = ?,
                    source_terms_url_snapshot = ?,
                    source_license_name_snapshot = ?,
                    legal_checked = ?,
                    legal_checked_at = ?,
                    legal_note = ?,
                    wp_post_id = ?,
                    wp_post_url = ?,
                    publish_attempts = ?,
                    publish_last_error = ?,
                    published_to_wp_at = ?,
                    word_count = ?,
                    status = ?,
                    meta_json = ?
                WHERE id = ?
                """,
                column_values + (existing_id,),
            )

        stored = conn.execute("SELECT id FROM articles WHERE source_url = ?", (source_url,)).fetchone()
        if stored:
            return int(stored["id"])
        return int(existing_id) if existing_id else 0
|
||||
|
||||
|
||||
def list_articles_page(
    limit: int = 50,
    offset: int = 0,
    status_filter: str | None = None,
    search: str | None = None,
) -> tuple[list[dict[str, Any]], int]:
    """Return one page of articles plus the total number of matches.

    ``limit`` is clamped to 1..200 and ``offset`` to >= 0. ``status_filter``
    restricts to an exact status. ``search`` matches articles whose title
    contains the text, or whose id equals it when the text parses as an
    integer. Rows are ordered by id, newest first.
    """
    page_size = min(max(limit, 1), 200)
    start = max(0, offset)

    clauses: list[str] = []
    args: list[Any] = []
    if status_filter:
        clauses.append("a.status = ?")
        args.append(status_filter)
    if search:
        clauses.append("(a.title LIKE ? OR a.id = ?)")
        try:
            id_candidate = int(search)
        except ValueError:
            id_candidate = -1  # can never equal a real id, so only the title matches
        args += [f"%{search}%", id_candidate]

    where = f"WHERE {' AND '.join(clauses)}" if clauses else ""
    select = """
        SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw,
               a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at,
               a.word_count, f.name AS feed_name
        FROM articles a
        LEFT JOIN feeds f ON f.id = a.feed_id
    """
    with get_conn() as conn:
        total = conn.execute(f"SELECT COUNT(*) FROM articles a {where}", args).fetchone()[0]
        page = conn.execute(
            f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?",
            args + [page_size, start],
        ).fetchall()
    return rows_to_dicts(page), total
|
||||
|
||||
|
||||
def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int:
    """Update wp_post_id (and clear stale wp_post_url) for multiple articles.

    Args:
        updates: ``(article_id, new_wp_post_id)`` pairs.

    Returns:
        The number of rows actually updated. Pairs whose ``article_id`` does
        not exist are not counted.

    Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and
    scheduled_publish_at from the live WordPress data.
    """
    if not updates:
        return 0
    updated = 0
    with get_conn() as conn:
        for article_id, new_wp_id in updates:
            cur = conn.execute(
                "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?",
                (new_wp_id, article_id),
            )
            # Count matched rows rather than executed statements; rowcount
            # can be -1 when undetermined, which must not lower the total.
            updated += max(cur.rowcount, 0)
    return updated
|
||||
|
||||
|
||||
def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
    """Return the newest articles with all detail columns and the feed name.

    Args:
        limit: Maximum number of rows; clamped to 1..500.
        status_filter: When given, only articles with exactly this status.

    Returns:
        Article rows (newest id first) converted with ``rows_to_dicts``.
    """
    safe_limit = max(1, min(limit, 500))

    # The SELECT is identical with and without a status filter; only the
    # WHERE clause and the bound parameters differ.
    if status_filter:
        status_clause = "WHERE a.status = ?"
        params: tuple[Any, ...] = (status_filter, safe_limit)
    else:
        status_clause = ""
        params = (safe_limit,)

    query = f"""
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
               a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
               a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
               a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
               a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
        FROM articles a
        LEFT JOIN feeds f ON f.id = a.feed_id
        {status_clause}
        ORDER BY a.id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        rows = conn.execute(query, params).fetchall()
    return rows_to_dicts(rows)
|
||||
204
backend/app/rewrite.py
Normal file
204
backend/app/rewrite.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _sanitize_source_text(text: str) -> str:
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
return ""
|
||||
|
||||
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
|
||||
if len(lines) > 3:
|
||||
lines = lines[3:]
|
||||
|
||||
joined = "\n".join(lines)
|
||||
# Remove press contact block at end from "Pressekontakt" onward.
|
||||
joined = re.sub(
|
||||
r"\n?\s*Pressekontakt[\s\S]*$",
|
||||
"",
|
||||
joined,
|
||||
flags=re.IGNORECASE,
|
||||
).strip()
|
||||
return joined
|
||||
|
||||
|
||||
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
|
||||
out: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for raw in tags:
|
||||
value = re.sub(r"\s+", " ", str(raw or "").strip())
|
||||
value = re.sub(r"^[#\-•\s]+", "", value)
|
||||
value = re.sub(r"[;,.:\s]+$", "", value)
|
||||
if not value:
|
||||
continue
|
||||
if len(value) < 2 or len(value) > 40:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(value)
|
||||
if len(out) >= max_tags:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
    """Send one system+user exchange to the OpenAI chat completions API.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        temperature: Sampling temperature passed through to the API.

    Returns:
        The stripped text content of the first choice.

    Raises:
        RuntimeError: If no API key is configured, the response has no
            usable ``choices`` list, or the first choice carries no text
            content. Malformed response shapes (non-object payload, choice
            or message) are reported the same way.
        Exception: Network/HTTP errors from ``urlopen`` and JSON decoding
            errors are not caught here and propagate unchanged.
    """
    settings = get_settings()
    api_key = settings.openai_api_key
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY fehlt")

    payload = {
        "model": settings.openai_model,
        "temperature": temperature,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    }
    req = Request(
        url="https://api.openai.com/v1/chat/completions",
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urlopen(req, timeout=60) as resp:
        raw = resp.read().decode("utf-8", errors="replace")
    data = json.loads(raw)

    choices = data.get("choices") if isinstance(data, dict) else None
    if not isinstance(choices, list) or not choices:
        raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")

    # Guard each level so an unexpected shape (e.g. "message": null) ends in
    # the RuntimeError below rather than an AttributeError.
    first_choice = choices[0]
    message = first_choice.get("message") if isinstance(first_choice, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("OpenAI lieferte keinen Inhalt")
    return content.strip()
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the model to rewrite an article in German, informal "du" form.

    The sanitized ``content_raw`` is used as source; if that is empty the
    stripped ``summary`` is used instead. The prompt names the source
    (``source_name_snapshot``, else ``author``, else "die Quelle") so that
    statements are attributed to it.

    Raises:
        RuntimeError: If neither content nor summary yields any text. Errors
            raised by ``_openai_chat`` propagate unchanged.
    """
    source_text = _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "").strip()
    if not source_text:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    title = (article.get("title") or "").strip()
    source_name = (article.get("source_name_snapshot") or article.get("author") or "die Quelle").strip()

    instructions = (
        "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. "
        "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, "
        "ohne Pressekontakt, ohne Quellenblock. "
        "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). "
        "Inhaltlich korrekt bleiben, nichts erfinden. "
        f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. "
        "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, "
        f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n"
    )
    material = f"Titel: {title}\n\n" f"Originaltext:\n{source_text}"
    return _openai_chat("Du bist ein deutscher News-Redakteur.", instructions + material, temperature=0.4)
|
||||
|
||||
|
||||
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
    """Let the model propose up to ``max_tags`` tags for an article.

    The text basis is ``rewritten_text`` if given, else the sanitized
    ``content_raw``, else ``summary``; only its first 3500 characters are
    sent. The reply is expected to be a JSON array of strings. If the whole
    reply does not parse to a list, the first ``[...]`` span inside it is
    tried. Parsed tags go through ``_normalize_tags``.

    Returns an empty list when there is no text or no list can be parsed.
    Errors raised by ``_openai_chat`` propagate.
    """
    basis = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
    basis = str(basis).strip()
    if not basis:
        return []

    title = (article.get("title") or "").strip()
    prompt = (
        "Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
        f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
        "Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
        f"Titel: {title}\n\n"
        f"Text:\n{basis[:3500]}"
    )
    reply = _openai_chat(
        "Du extrahierst präzise, kurze News-Tags auf Deutsch.",
        prompt,
        temperature=0.2,
    )

    # Try the reply as-is first, then the first JSON-like array inside it
    # (covers replies wrapped in prose or code fences).
    attempts = [reply]
    bracketed = re.search(r"\[[\s\S]*\]", reply)
    if bracketed:
        attempts.append(bracketed.group(0))

    for candidate in attempts:
        try:
            decoded = json.loads(candidate)
            if isinstance(decoded, list):
                return _normalize_tags([str(item) for item in decoded], max_tags=max_tags)
        except Exception:
            continue
    return []
|
||||
|
||||
|
||||
def score_article_relevance(article: dict[str, Any]) -> dict[str, Any]:
    """Score article relevance for VanLife/Camping/Outdoor blog (0-100).

    The title and the first 2000 characters of the sanitized ``content_raw``
    (or, if empty, the ``summary``) are sent to the model, which is asked to
    answer with a JSON object.

    Returns:
        ``{"score": int, "reason": str, "topics": list[str]}``. The score is
        clamped to 0..100. If the reply contains no parsable JSON object, a
        zero score with a parsing-error reason is returned.

    Raises:
        Whatever ``_openai_chat`` raises (not caught here).
    """
    title = (article.get("title") or "").strip()
    text = _sanitize_source_text(article.get("content_raw") or "")
    if not text:
        text = (article.get("summary") or "").strip()

    prompt = (
        "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
        "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
        "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
        "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
        "Antworte NUR mit einem JSON-Objekt:\n"
        '{"score": <0-100>, "reason": "<kurze Begründung auf Deutsch>", "topics": ["<Thema1>", "<Thema2>"]}\n\n'
        f"Titel: {title}\n\n"
        f"Text (Auszug):\n{text[:2000]}"
    )
    raw = _openai_chat(
        "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
        prompt,
        temperature=0.1,
    )
    try:
        match = re.search(r"\{[\s\S]*\}", raw)
        if match:
            parsed = json.loads(match.group(0))
            if isinstance(parsed, dict):
                score = max(0, min(100, int(parsed.get("score", 0))))
                topics = parsed.get("topics") or []
                if isinstance(topics, str):
                    # A single topic given as a bare string must not be
                    # split into one "topic" per character.
                    topics = [topics]
                return {
                    "score": score,
                    "reason": str(parsed.get("reason", "")),
                    "topics": [str(t) for t in topics],
                }
    except Exception:
        # Deliberate best-effort: any malformed reply yields the fallback below.
        pass
    return {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []}
|
||||
|
||||
|
||||
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Return ``meta_json`` with ``generated_tags`` replaced by ``tags``.

    The existing metadata is kept when it decodes to a JSON object; missing,
    unparsable or non-object metadata is replaced by a fresh dict. The tags
    are passed through ``_normalize_tags`` first. The result is serialized
    without ASCII escaping.
    """
    meta: dict[str, Any] = {}
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None  # corrupt metadata: start from an empty dict
        if isinstance(decoded, dict):
            meta = decoded
    meta["generated_tags"] = _normalize_tags(tags)
    return json.dumps(meta, ensure_ascii=False)
|
||||
336
backend/app/scheduler.py
Normal file
336
backend/app/scheduler.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
"""Smart publishing scheduler.
|
||||
|
||||
Calculates suggested publish slots for new WordPress drafts.
|
||||
Rules:
|
||||
- Maximum N drafts per day (configurable, default 2)
|
||||
- Preferred slots: configurable hours (default 09:00 and 14:00 CET)
|
||||
- New articles queue up after the last already-scheduled article
|
||||
- Checks both local DB AND WordPress future posts to avoid double-booking
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import threading
|
||||
import urllib.request
|
||||
from datetime import date, datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
from .config import get_settings
|
||||
from .db import get_conn
|
||||
|
||||
# Ensures that concurrent pipeline runs (two threads) never assign the same slot.
# This is an in-process threading lock.
_slot_lock = threading.Lock()


# Offset added to UTC to obtain "CET" wall-clock time.
# Real Central European time is UTC+1 in winter and UTC+2 in summer; a fixed
# +1 hour is used here for simplicity.
_CET_OFFSET = timedelta(hours=1)
|
||||
|
||||
|
||||
def _today_cet() -> date:
    """Return today's calendar date in the fixed-offset CET approximation."""
    now_utc = datetime.now(timezone.utc)
    return (now_utc + _CET_OFFSET).date()
|
||||
|
||||
|
||||
def _preferred_hours() -> list[int]:
    """Return the configured preferred publish hours.

    ``settings.pipeline_publish_hours`` is read as a comma-separated list of
    integers; blank entries are ignored.

    Returns:
        The parsed hours in configured order. Falls back to ``[9, 14]`` when
        the setting cannot be parsed or contains an hour outside 0..23 (such
        a value would otherwise produce slots that are not a valid time). An
        empty setting yields an empty list, which callers handle themselves.
    """
    fallback = [9, 14]
    settings = get_settings()
    try:
        hours = [int(part.strip()) for part in settings.pipeline_publish_hours.split(",") if part.strip()]
    except Exception:
        return fallback
    if any(not 0 <= hour <= 23 for hour in hours):
        return fallback
    return hours
|
||||
|
||||
|
||||
def _fetch_wp_occupied_slots() -> set[tuple[str, int]]:
    """Fetch all future-scheduled WordPress posts and return occupied (date_iso, hour) pairs.

    This prevents the scheduler from assigning a slot that is already taken
    by a WP post that was not created via this pipeline (e.g. manually or via
    recovery scripts).

    The posts endpoint is paginated (100 posts per request). All pages
    announced by the ``X-WP-TotalPages`` response header are read, up to a
    hard cap, so posts beyond the first 100 are not silently ignored.

    Never raises: on any error the slots collected so far are returned (an
    empty set if the first request already failed), so the scheduler
    degrades gracefully.
    """
    per_page = 100
    max_pages = 10  # hard stop in case the server reports a nonsensical page count
    settings = get_settings()
    occupied: set[tuple[str, int]] = set()
    try:
        auth = base64.b64encode(
            f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode()
        ).decode()
        base_url = (
            f"{settings.wordpress_base_url}/wp-json/wp/v2/posts"
            f"?status=future&per_page={per_page}&orderby=date&order=asc&_fields=id,date"
        )
        total_pages = 1
        page = 1
        while page <= min(total_pages, max_pages):
            req = urllib.request.Request(
                f"{base_url}&page={page}",
                headers={"Authorization": f"Basic {auth}"},
            )
            with urllib.request.urlopen(req, timeout=10) as resp:
                posts = json.loads(resp.read())
                if page == 1:
                    try:
                        total_pages = int(resp.headers.get("X-WP-TotalPages") or 1)
                    except (TypeError, ValueError):
                        total_pages = 1
            for p in posts:
                try:
                    dt = datetime.fromisoformat(p["date"])
                    occupied.add((dt.date().isoformat(), dt.hour))
                except Exception:
                    # Skip posts with a missing or unparsable date.
                    pass
            page += 1
    except Exception:
        # Deliberate best-effort: WordPress being unreachable must not stop
        # the scheduler; whatever was collected so far is still useful.
        pass
    return occupied
|
||||
|
||||
|
||||
def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None:
    """Return the date of the latest already-scheduled slot (DB + WP).

    Only slots from today onward are considered. Articles with status
    ``error`` or ``no_image`` are ignored on the DB side. Returns ``None``
    when neither source has a future slot.
    """
    today = _today_cet()
    latest_days: list[date] = []

    # Latest from local DB
    with get_conn() as conn:
        record = conn.execute(
            """
            SELECT MAX(scheduled_publish_at) AS last_slot
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
              AND scheduled_publish_at >= ?
              AND status NOT IN ('error', 'no_image')
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchone()
    if record and record["last_slot"]:
        try:
            latest_days.append(datetime.fromisoformat(record["last_slot"]).date())
        except Exception:
            pass

    # Latest from WP
    wp_days: list[date] = []
    for day_text, _hour in wp_occupied:
        try:
            day = date.fromisoformat(day_text)
        except Exception:
            continue
        if day >= today:
            wp_days.append(day)
    if wp_days:
        latest_days.append(max(wp_days))

    return max(latest_days) if latest_days else None
|
||||
|
||||
|
||||
def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None:
    """Return first preferred hour not yet used on target_date (DB + WP), or None if day is full.

    An hour counts as used when a non-``error``/``no_image`` article is
    scheduled in it on that date, or when ``wp_occupied`` contains it for
    that date.
    """
    preferred = _preferred_hours()
    day = target_date.isoformat()

    # Hours used in local DB
    with get_conn() as conn:
        booked = conn.execute(
            """
            SELECT scheduled_publish_at FROM articles
            WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
              AND status NOT IN ('error', 'no_image')
            """,
            (day + "T00:00:00", day + "T23:59:59"),
        ).fetchall()

    taken: set[int] = set()
    for record in booked:
        try:
            taken.add(datetime.fromisoformat(record["scheduled_publish_at"] or "").hour)
        except Exception:
            pass  # unparsable timestamps do not block an hour

    # Hours used in WordPress
    taken.update(hour for booked_day, hour in wp_occupied if booked_day == day)

    return next((hour for hour in preferred if hour not in taken), None)
|
||||
|
||||
|
||||
def _format_slot(d: date, hour: int) -> str:
|
||||
weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"]
|
||||
wd = weekday_names[d.weekday()]
|
||||
return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr"
|
||||
|
||||
|
||||
def _find_next_free_slot(
    wp_occupied: set[tuple[str, int]], lookahead_days: int = 60
) -> tuple[date, int] | None:
    """Find the next free (date, hour) slot.

    Starts from tomorrow and scans forward day by day, so gaps in the
    schedule are filled rather than always appending after the last post.

    If no day within the lookahead window has a free preferred hour, the
    result falls back to tomorrow at the first preferred hour (or 9 when no
    hours are configured), without checking whether that slot is taken.
    """
    first_day = _today_cet() + timedelta(days=1)

    for days_ahead in range(lookahead_days + 1):
        day = first_day + timedelta(days=days_ahead)
        free_hour = _next_free_hour(day, wp_occupied)
        if free_hour is not None:
            return day, free_hour

    preferred = _preferred_hours()
    fallback_hour = preferred[0] if preferred else 9
    return first_day, fallback_hour
|
||||
|
||||
|
||||
def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
    """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.

    Args:
        lookahead_days: Size of the window, counted from today (inclusive).
            Slots later than ``today + lookahead_days`` are left out.
            Negative values are treated as 0.

    Returns:
        One dict per slot with the keys ``date``, ``hour``, ``formatted``,
        ``source`` (``"db"`` or ``"wordpress"``), ``article_id``,
        ``article_title``, ``article_status``, ``wp_post_id`` and
        ``wp_post_url``. WordPress slots that also exist in the local DB are
        reported once, as ``"db"``. DB slots are keyed by (date, hour), so if
        several articles share a slot only the last one read is listed.
    """
    today = _today_cet()
    last_day = today + timedelta(days=max(0, lookahead_days))

    # Slots booked in local DB
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
              AND scheduled_publish_at >= ?
              AND status NOT IN ('error', 'no_image')
            ORDER BY scheduled_publish_at
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchall()

    db_slots: dict[tuple[str, int], dict] = {}
    for row in rows:
        try:
            dt = datetime.fromisoformat(row["scheduled_publish_at"])
        except Exception:
            continue  # skip rows with an unparsable timestamp
        slot_day = dt.date()
        if slot_day > last_day:
            continue  # outside the requested window
        db_slots[(slot_day.isoformat(), dt.hour)] = {
            "date": slot_day.isoformat(),
            "hour": dt.hour,
            "formatted": _format_slot(slot_day, dt.hour),
            "source": "db",
            "article_id": row["id"],
            "article_title": row["title"],
            "article_status": row["status"],
            "wp_post_id": row["wp_post_id"],
            "wp_post_url": row["wp_post_url"],
        }

    # Slots occupied in WordPress but not in local DB
    wp_occupied = _fetch_wp_occupied_slots()
    wp_only: list[dict] = []
    for d_str, h in sorted(wp_occupied):
        if (d_str, h) in db_slots:
            continue
        try:
            d = date.fromisoformat(d_str)
        except Exception:
            continue
        if today <= d <= last_day:
            wp_only.append({
                "date": d_str,
                "hour": h,
                "formatted": _format_slot(d, h),
                "source": "wordpress",
                "article_id": None,
                "article_title": "(WP-Beitrag außerhalb Pipeline)",
                "article_status": None,
                "wp_post_id": None,
                "wp_post_url": None,
            })

    all_slots = list(db_slots.values()) + wp_only
    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
    return all_slots
|
||||
|
||||
|
||||
def release_publish_slot(article_id: int) -> None:
    """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).

    Sets ``scheduled_publish_at`` of the article to NULL.
    """
    statement = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
    with get_conn() as conn:
        conn.execute(statement, (article_id,))
|
||||
|
||||
|
||||
def suggest_publish_slot() -> str:
    """Return a suggested publish datetime string (CET) for the next free slot.

    Nothing is reserved; the result is only a formatted suggestion. If no
    slot is found, tomorrow at the first preferred hour (or 9 when none are
    configured) is suggested.
    """
    slot = _find_next_free_slot(_fetch_wp_occupied_slots())
    if not slot:
        preferred = _preferred_hours()
        return _format_slot(_today_cet() + timedelta(days=1), preferred[0] if preferred else 9)
    slot_day, slot_hour = slot
    return _format_slot(slot_day, slot_hour)
|
||||
|
||||
|
||||
def reserve_publish_slot(article_id: int) -> str:
    """Reserve a publish slot for an article and persist it in the DB.

    If the article already has a parseable ``scheduled_publish_at``, it is
    kept unchanged and only re-formatted. Otherwise the first free
    (date, hour) slot is chosen, starting tomorrow, scanning 61 days and
    trying the preferred hours in order, and written to the article row.

    Uses a module-level lock so that concurrent pipeline runs (two threads)
    cannot read the same "free" slot and assign it twice.

    Args:
        article_id: Primary key of the row in ``articles``.

    Returns:
        The formatted publish datetime string produced by ``_format_slot``.
    """
    # Fetch WP-occupied slots BEFORE acquiring the lock — the API call can be slow
    # and must not block other threads unnecessarily.
    wp_occupied = _fetch_wp_occupied_slots()

    with _slot_lock:
        # Single DB connection for the entire read-find-write cycle so the
        # slot we pick is still free when we write it.
        with get_conn() as conn:
            row = conn.execute(
                "SELECT scheduled_publish_at FROM articles WHERE id = ?",
                (article_id,),
            ).fetchone()
            # NOTE(review): when no row exists for article_id a slot is still
            # computed below and the final UPDATE matches nothing — confirm
            # callers never pass unknown ids.
            existing_slot = row["scheduled_publish_at"] if row else None
            if existing_slot:
                try:
                    dt = datetime.fromisoformat(existing_slot)
                    return _format_slot(dt.date(), dt.hour)
                except Exception:
                    pass  # invalid — fall through and assign a fresh slot

            # Find the next free (date, hour) slot using THIS connection so we
            # see all slots written during this lock window.
            hours = _preferred_hours()
            today = _today_cet()
            tomorrow = today + timedelta(days=1)
            candidate: date | None = None
            chosen_hour: int | None = None

            # Scan tomorrow plus the following 60 days.
            for offset in range(0, 61):
                d = tomorrow + timedelta(days=offset)
                date_str = d.isoformat()

                # Slots already taken on this day by articles that are not in
                # an 'error' or 'no_image' state (string comparison on the
                # stored ISO timestamps).
                rows = conn.execute(
                    """
                    SELECT scheduled_publish_at FROM articles
                    WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
                    AND status NOT IN ('error', 'no_image')
                    """,
                    (date_str + "T00:00:00", date_str + "T23:59:59"),
                ).fetchall()

                used_hours: set[int] = set()
                for r in rows:
                    ts = r["scheduled_publish_at"] or ""
                    try:
                        used_hours.add(datetime.fromisoformat(ts).hour)
                    except Exception:
                        # Unparseable timestamps are ignored, i.e. treated as
                        # not occupying any hour.
                        pass
                # Merge in the hours WordPress reports as occupied for this date.
                for d_str, h in wp_occupied:
                    if d_str == date_str:
                        used_hours.add(h)

                # First preferred hour that is still free wins.
                for h in hours:
                    if h not in used_hours:
                        candidate = d
                        chosen_hour = h
                        break
                if candidate is not None:
                    break

            # Nothing free in the whole window: fall back to tomorrow at the
            # first preferred hour (or 9), without re-checking occupancy.
            if candidate is None:
                candidate = tomorrow
                chosen_hour = hours[0] if hours else 9

            iso_ts = f"{candidate.isoformat()}T{chosen_hour:02d}:00:00"
            conn.execute(
                "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?",
                (iso_ts, article_id),
            )
            return _format_slot(candidate, chosen_hour)
|
||||
442
backend/app/source_extraction.py
Normal file
442
backend/app/source_extraction.py
Normal file
|
|
@ -0,0 +1,442 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from html import unescape
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
# Default value of extract_article()'s timeout_seconds parameter, which is
# passed to urlopen().
DEFAULT_TIMEOUT_SECONDS = 10
# User-Agent header value sent by extract_article() with every page request.
DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedArticle:
    """Immutable result of extract_article() for a single source page.

    When fetching or decoding the page fails, all content fields are
    None/empty and ``extraction_error`` carries the exception text.
    """

    # Page title (og:title, then <title>, then first <h1>).
    title: str | None
    # Author taken from meta tags or byline heuristics.
    author: str | None
    # href of <link rel="canonical">, as found in the page.
    canonical_url: str | None
    # Meta description, or the first 320 characters of content_text.
    summary: str | None
    # Newline-joined paragraph text of the article body.
    content_text: str | None
    # Absolute image URLs in discovery order, without duplicates.
    images: list[str]
    # Press-contact block cut out of content_text, if one was detected.
    press_contact: str | None
    # str(exception) when the page could not be fetched/decoded, else None.
    extraction_error: str | None = None
    # Absolute image URL -> {"caption": ..., "credit": ...} (keys optional).
    image_metadata: dict[str, dict] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _clean_text(raw: str | None) -> str | None:
|
||||
if not raw:
|
||||
return None
|
||||
text = unescape(raw)
|
||||
text = re.sub(r"<[^>]+>", " ", text)
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text or None
|
||||
|
||||
|
||||
def _strip_noise(html: str) -> str:
|
||||
html = re.sub(r"<script[\s\S]*?</script>", " ", html, flags=re.IGNORECASE)
|
||||
html = re.sub(r"<style[\s\S]*?</style>", " ", html, flags=re.IGNORECASE)
|
||||
html = re.sub(r"<noscript[\s\S]*?</noscript>", " ", html, flags=re.IGNORECASE)
|
||||
return html
|
||||
|
||||
|
||||
def _meta_content(html: str, attr: str, value: str) -> str | None:
    """Return the cleaned ``content`` of the first ``<meta>`` tag with *attr* == *value*.

    Both attribute orders are supported (``attr`` before ``content`` and the
    reverse). The content value is read up to the quote character that
    opened it, so a double-quoted value may contain apostrophes and vice
    versa.

    Args:
        html: Page markup to search.
        attr: Attribute name identifying the tag, e.g. ``"property"``.
            Inserted into the pattern verbatim.
        value: Required attribute value, e.g. ``"og:title"`` (regex-escaped).

    Returns:
        The cleaned content text, or None when no tag matches or the content
        is empty after cleaning.
    """
    key_part = rf"{attr}\s*=\s*[\"']{re.escape(value)}[\"']"
    # Two alternatives so the closing quote must equal the opening one.
    content_part = r"content\s*=\s*(?:\"([^\"]+)\"|'([^']+)')"
    for pattern in (
        rf"<meta[^>]+{key_part}[^>]*{content_part}[^>]*>",
        # handle reversed attribute order
        rf"<meta[^>]+{content_part}[^>]*{key_part}[^>]*>",
    ):
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            return _clean_text(match.group(1) or match.group(2))
    return None
|
||||
|
||||
|
||||
def _extract_title(html: str) -> str | None:
    """Pick the page title: ``og:title`` first, then ``<title>``, then the first ``<h1>``.

    Returns:
        The first non-empty cleaned candidate, or None if there is none.
    """
    og_title = _meta_content(html, "property", "og:title")
    if og_title:
        return og_title

    for tag_pattern in (
        r"<title[^>]*>([\s\S]*?)</title>",
        r"<h1[^>]*>([\s\S]*?)</h1>",
    ):
        found = re.search(tag_pattern, html, re.IGNORECASE)
        if found is None:
            continue
        candidate = _clean_text(found.group(1))
        if candidate:
            return candidate
    return None
|
||||
|
||||
|
||||
def _extract_canonical(html: str) -> str | None:
    """Return the href of ``<link rel="canonical">``, if present.

    Tries ``rel`` before ``href`` first, then the reversed attribute order.
    The first pattern that matches decides the result, even if its cleaned
    value is empty.
    """
    canonical_patterns = (
        r"<link[^>]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
        r"<link[^>]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>",
    )
    for link_pattern in canonical_patterns:
        found = re.search(link_pattern, html, re.IGNORECASE)
        if found:
            return _clean_text(found.group(1))
    return None
|
||||
|
||||
|
||||
def _extract_author(html: str) -> str | None:
    """Find the author name via meta tags, then via byline heuristics.

    Meta tags checked in order: ``name=author``, ``property=article:author``,
    ``property=og:article:author``. If none yields text, two regex
    heuristics are tried: a "Von:/Autor:/Autorin:" prefix, and the content
    following an element whose class contains "author" or "byline".

    Returns:
        The first non-empty cleaned candidate, or None.
    """
    meta_keys = (
        ("name", "author"),
        ("property", "article:author"),
        ("property", "og:article:author"),
    )
    # Lazy generator: later tags are only looked up if earlier ones are empty.
    meta_hits = (_meta_content(html, attr, value) for attr, value in meta_keys)
    from_meta = next((hit for hit in meta_hits if hit), None)
    if from_meta:
        return from_meta

    byline_patterns = (
        r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})",
        r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<",
    )
    for byline_pattern in byline_patterns:
        found = re.search(byline_pattern, html, re.IGNORECASE)
        if not found:
            continue
        name = _clean_text(found.group(1))
        if name:
            return name
    return None
|
||||
|
||||
|
||||
def _extract_images(html: str, page_url: str) -> list[str]:
|
||||
images: list[str] = []
|
||||
seen: set[str] = set()
|
||||
|
||||
for prop in ("og:image", "twitter:image"):
|
||||
pattern = re.compile(
|
||||
rf"<meta[^>]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for match in pattern.finditer(html):
|
||||
src = match.group(1).strip()
|
||||
abs_src = urljoin(page_url, src)
|
||||
if abs_src not in seen:
|
||||
seen.add(abs_src)
|
||||
images.append(abs_src)
|
||||
|
||||
for match in re.finditer(r"<img[^>]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE):
|
||||
src = match.group(1).strip()
|
||||
abs_src = urljoin(page_url, src)
|
||||
if abs_src not in seen:
|
||||
seen.add(abs_src)
|
||||
images.append(abs_src)
|
||||
|
||||
return images
|
||||
|
||||
|
||||
def _extract_content_text(html: str) -> str | None:
    """Extract the readable article body as newline-separated text blocks.

    The search area is the first ``<article>``, else ``<main>``, else
    ``<body>``, else the whole document. Within it, ``<p>`` paragraphs longer
    than two characters and ``<h2>``–``<h4>`` headings that look like a
    contact heading are collected in document order, so a "Pressekontakt"
    heading stays directly above the contact details that follow it.

    Returns:
        The blocks joined by ``"\\n"``; if no block qualifies, the cleaned
        text of the whole section; None if that is empty too.
    """
    section = None
    for pattern in (
        r"<article[^>]*>([\s\S]*?)</article>",
        r"<main[^>]*>([\s\S]*?)</main>",
        r"<body[^>]*>([\s\S]*?)</body>",
    ):
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            section = match.group(1)
            break

    if not section:
        section = html

    contact_heading_re = re.compile(
        r"\b(pressekontakt|press contact|kontakt|agentur)\b", re.IGNORECASE
    )
    # One pass over headings and paragraphs keeps them in source order.
    # \b stops "<p" from also matching tags such as <pre> or <picture>.
    block_re = re.compile(r"<(h[2-4]|p)\b[^>]*>([\s\S]*?)</\1>", re.IGNORECASE)

    paragraphs = []
    for match in block_re.finditer(section):
        text = _clean_text(match.group(2))
        if not text:
            continue
        if match.group(1).lower() == "p":
            if len(text) > 2:
                paragraphs.append(text)
        elif contact_heading_re.search(text):
            # Headings are only kept when they announce a contact block.
            paragraphs.append(text)

    if paragraphs:
        return "\n".join(paragraphs)

    return _clean_text(section)
|
||||
|
||||
|
||||
def _extract_press_contact(content_text: str | None) -> str | None:
    """Cut the press-contact block out of the extracted article text.

    Line-based pass: the first line containing a contact marker starts the
    block; up to five following lines are added until a footer marker
    ("Original-Content von", "OTS:", ...) appears. If no line matches, a
    regex over the whole text is used as fallback.

    Returns:
        The cleaned contact text, or None when nothing was found.
    """
    if not content_text:
        return None

    stripped_lines = (part.strip() for part in content_text.split("\n"))
    lines = [part for part in stripped_lines if part]
    marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
    footer_pattern = r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b"

    for position, line in enumerate(lines):
        if not marker_re.search(line):
            continue
        block = [line]
        for follower in lines[position + 1 : position + 6]:
            if re.search(footer_pattern, follower, re.IGNORECASE):
                break
            block.append(follower)
        # Only the first marker line is considered.
        return _clean_text("\n".join(block))

    fallback = re.search(
        r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
        content_text,
        re.IGNORECASE,
    )
    return _clean_text(fallback.group(1)) if fallback else None
|
||||
|
||||
|
||||
# CSS class keywords that indicate a copyright/credit element inside a figcaption
|
||||
_CREDIT_CLASS_RE = re.compile(
|
||||
r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Inline text patterns that signal a credit/copyright notice
|
||||
_CREDIT_TEXT_RE = re.compile(
|
||||
r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# data-* attribute names that carry credit/caption information directly on <img>
|
||||
_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
|
||||
_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")
|
||||
|
||||
# Class keywords for adjacent sibling credit spans/divs after an <img>
|
||||
_ADJ_CREDIT_CLASS_RE = re.compile(
|
||||
r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
    """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.

    Uses three progressive strategies:
    1. <figure> with <img> + <figcaption>
    2. data-* attributes on <img> tags not already covered
    3. <img> tags whose immediately following HTML contains a credit element

    An image handled by an earlier strategy is never overwritten by a later
    one. Each entry contains only the keys for which text was found.

    Args:
        html: Page markup to scan.
        page_url: Page URL used to resolve relative image sources.

    Returns:
        The mapping described above. If any exception is raised while
        scanning, an empty dict is returned and partial results are dropped.
    """
    result: dict[str, dict] = {}

    try:
        # ------------------------------------------------------------------
        # Strategy 1: <figure> blocks containing <img> and <figcaption>
        # ------------------------------------------------------------------
        for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
            fig_html = fig_match.group(1)

            # Locate image src (src or lazy-loaded data-src)
            img_match = re.search(
                r"<img[^>]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
                fig_html,
                re.IGNORECASE,
            )
            if not img_match:
                continue
            img_src = urljoin(page_url, img_match.group(1).strip())

            # Locate figcaption; figures without one are skipped entirely
            figcap_match = re.search(
                r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
                fig_html,
                re.IGNORECASE,
            )
            if not figcap_match:
                continue
            figcap_html = figcap_match.group(1)

            # --- Extract credit ---
            credit: str | None = None

            # Try credit via class attribute on an inner element
            credit_elem_match = re.search(
                r"<(?:span|p|div)[^>]*"
                + _CREDIT_CLASS_RE.pattern
                + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
                figcap_html,
                re.IGNORECASE,
            )
            if credit_elem_match:
                credit = _clean_text(credit_elem_match.group(1))

            # Fallback: scan plain text of figcaption for credit patterns
            if not credit:
                figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
                cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
                if cred_text_match:
                    credit = _clean_text(cred_text_match.group(1))

            # --- Extract caption (full figcaption text) ---
            # The caption is the whole figcaption, so it includes the credit text.
            caption = _clean_text(figcap_html)

            # Only store entries that carry at least one piece of metadata
            if caption or credit:
                entry: dict[str, str] = {}
                if caption:
                    entry["caption"] = caption
                if credit:
                    entry["credit"] = credit
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 2: data-* attributes on <img> tags
        # ------------------------------------------------------------------
        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
            img_attrs = img_match.group(1)

            # Resolve image URL (prefer src over data-src)
            src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
            if not src_match:
                src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
            if not src_match:
                continue
            img_src = urljoin(page_url, src_match.group(1).strip())

            # Skip images already handled by Strategy 1
            if img_src in result:
                continue

            credit: str | None = None
            caption: str | None = None

            # First credit attribute present wins, even if its cleaned value is empty
            for attr in _IMG_DATA_CREDIT_ATTRS:
                attr_match = re.search(
                    rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
                    img_attrs,
                    re.IGNORECASE,
                )
                if attr_match:
                    credit = _clean_text(attr_match.group(1))
                    break

            # Same rule for the caption attributes
            for attr in _IMG_DATA_CAPTION_ATTRS:
                attr_match = re.search(
                    rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
                    img_attrs,
                    re.IGNORECASE,
                )
                if attr_match:
                    caption = _clean_text(attr_match.group(1))
                    break

            if caption or credit:
                entry = {}
                if caption:
                    entry["caption"] = caption
                if credit:
                    entry["credit"] = credit
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 3: <img> followed within 200 chars by a credit element
        # ------------------------------------------------------------------
        for img_match in re.finditer(r"<img([^>]+)>", html, re.IGNORECASE):
            img_attrs = img_match.group(1)

            # Same URL resolution as in Strategy 2 (src preferred over data-src)
            src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
            if not src_match:
                src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
            if not src_match:
                continue
            img_src = urljoin(page_url, src_match.group(1).strip())

            # Skip images already handled by earlier strategies
            if img_src in result:
                continue

            # Look at the 200 characters of HTML immediately after the img tag
            after_start = img_match.end()
            after_html = html[after_start : after_start + 200]

            # The credit element must open and close inside that window
            adj_match = re.search(
                r"<(?:span|p|div)[^>]*"
                + _ADJ_CREDIT_CLASS_RE.pattern
                + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
                after_html,
                re.IGNORECASE,
            )
            if adj_match:
                credit = _clean_text(adj_match.group(1))
                if credit:
                    result[img_src] = {"credit": credit}

    except Exception:
        # Best effort: any failure discards everything collected so far.
        return {}

    return result
|
||||
|
||||
|
||||
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
    """Download *url* and extract title, author, text, images and related metadata.

    Args:
        url: Address of the article page.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        An ``ExtractedArticle``. If the request or decoding raises, all
        content fields are None/empty and ``extraction_error`` holds the
        exception text; no exception propagates from the fetch step.
    """
    request_headers = {
        "User-Agent": DEFAULT_USER_AGENT,
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
    }
    try:
        request = Request(url=url, headers=request_headers)
        with urlopen(request, timeout=timeout_seconds) as response:
            body = response.read()
            # Fall back to UTF-8 when the server declares no charset.
            encoding = response.headers.get_content_charset() or "utf-8"
        page = body.decode(encoding, errors="replace")
    except Exception as exc:
        return ExtractedArticle(
            title=None,
            author=None,
            canonical_url=None,
            summary=None,
            content_text=None,
            images=[],
            press_contact=None,
            extraction_error=str(exc),
        )

    page = _strip_noise(page)
    found_title = _extract_title(page)
    found_author = _extract_author(page)
    found_canonical = _extract_canonical(page)
    description = _meta_content(page, "name", "description")
    body_text = _extract_content_text(page)
    if body_text and not description:
        # No meta description: use the start of the body text instead.
        description = _clean_text(body_text[:320])
    found_images = _extract_images(page, url)
    contact = _extract_press_contact(body_text)
    per_image_meta = _extract_image_metadata(page, url)

    return ExtractedArticle(
        title=found_title,
        author=found_author,
        canonical_url=found_canonical,
        summary=description,
        content_text=body_text,
        images=found_images,
        press_contact=contact,
        extraction_error=None,
        image_metadata=per_image_meta,
    )
|
||||
|
||||
|
||||
def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
    """Convert an ``ExtractedArticle`` into a plain dict of its metadata fields.

    ``content_text`` is not part of the returned dict; every other field is
    copied under its own name.
    """
    exported_fields = (
        "title",
        "author",
        "canonical_url",
        "summary",
        "images",
        "press_contact",
        "extraction_error",
        "image_metadata",
    )
    return {name: getattr(article, name) for name in exported_fields}
|
||||
474
backend/app/telegram_bot.py
Normal file
474
backend/app/telegram_bot.py
Normal file
|
|
@ -0,0 +1,474 @@
|
|||
"""Telegram Bot integration for RSS-News pipeline notifications and controls."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urlencode
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# URL template for Bot API calls; _call() fills in the bot token and method name.
_BASE = "https://api.telegram.org/bot{token}/{method}"
# Target of _forward_to_n8n_app_release(), which receives updates this module
# does not handle itself.
_N8N_APP_RELEASE_WEBHOOK = "https://n8n.vanityontour.de/webhook/tg-app-release-bot-v1/webhook"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-level API helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _call(method: str, payload: dict[str, Any]) -> dict[str, Any]:
    """POST *payload* as JSON to the Telegram Bot API *method* and return the decoded reply.

    Args:
        method: Bot API method name, e.g. ``"sendMessage"``.
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON response.

    Raises:
        RuntimeError: If no bot token is configured, the HTTP exchange fails
            (including timeouts while waiting for or reading the response),
            or the response body is not valid JSON.
    """
    settings = get_settings()
    token = settings.telegram_bot_token
    if not token:
        raise RuntimeError("TELEGRAM_BOT_TOKEN nicht konfiguriert")
    url = _BASE.format(token=token, method=method)
    data = json.dumps(payload).encode("utf-8")
    req = Request(
        url=url,
        data=data,
        method="POST",
        headers={"Content-Type": "application/json", "Accept": "application/json"},
    )
    try:
        with urlopen(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
    except (URLError, OSError) as exc:
        # urllib wraps only connect/send failures in URLError; a timeout or
        # reset while waiting for or reading the response surfaces as a plain
        # OSError subclass and must be normalised here as well.
        logger.error("Telegram API Fehler (%s): %s", method, exc)
        raise RuntimeError(f"Telegram API Fehler: {exc}") from exc
    try:
        return json.loads(raw)
    except ValueError as exc:
        # E.g. an HTML error page from a proxy instead of the API's JSON.
        logger.error("Telegram API Fehler (%s): ungültige JSON-Antwort: %s", method, exc)
        raise RuntimeError(f"Telegram API Fehler: ungültige JSON-Antwort: {exc}") from exc
|
||||
|
||||
|
||||
def _chat_id() -> str:
    """Return the configured Telegram chat id.

    Raises:
        RuntimeError: If ``telegram_chat_id`` is empty in the settings.
    """
    configured = get_settings().telegram_chat_id
    if configured:
        return configured
    raise RuntimeError("TELEGRAM_CHAT_ID nicht konfiguriert")
|
||||
|
||||
|
||||
def _inline_keyboard(buttons: list[list[dict[str, str]]]) -> dict:
|
||||
return {"inline_keyboard": buttons}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public send functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def send_message(text: str, reply_markup: dict | None = None, parse_mode: str = "HTML") -> dict:
    """Send a text message to the configured chat via ``sendMessage``.

    Args:
        text: Message body, interpreted according to *parse_mode*.
        reply_markup: Optional keyboard markup; omitted from the request when falsy.
        parse_mode: Telegram parse mode, ``"HTML"`` by default.

    Returns:
        The decoded API response from ``_call``.
    """
    body: dict[str, Any] = dict(
        chat_id=_chat_id(),
        text=text,
        parse_mode=parse_mode,
        disable_web_page_preview=False,
    )
    if reply_markup:
        body.update(reply_markup=reply_markup)
    return _call("sendMessage", body)
|
||||
|
||||
|
||||
def send_photo_message(
    photo_url: str,
    caption: str,
    reply_markup: dict | None = None,
    parse_mode: str = "HTML",
) -> dict:
    """Send a photo with caption via ``sendPhoto``; fall back to a text message.

    If the ``sendPhoto`` call raises for any reason (e.g. the image URL is no
    longer valid), the caption is sent as a plain message with the same
    markup and parse mode instead.

    Returns:
        The decoded API response of whichever call succeeded.
    """
    body: dict[str, Any] = dict(
        chat_id=_chat_id(),
        photo=photo_url,
        caption=caption,
        parse_mode=parse_mode,
    )
    if reply_markup:
        body.update(reply_markup=reply_markup)
    try:
        response = _call("sendPhoto", body)
    except Exception:
        # Photo delivery failed: deliver at least the text.
        response = send_message(caption, reply_markup=reply_markup, parse_mode=parse_mode)
    return response
|
||||
|
||||
|
||||
def answer_callback_query(callback_query_id: str, text: str = "") -> None:
    """Acknowledge a callback query; failures are logged as warnings, not raised."""
    ack = {"callback_query_id": callback_query_id, "text": text}
    try:
        _call("answerCallbackQuery", ack)
    except Exception as exc:
        logger.warning("answerCallbackQuery fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def edit_message_reply_markup(chat_id: str, message_id: int, reply_markup: dict | None = None) -> None:
    """Replace the inline keyboard of an existing message.

    A falsy *reply_markup* sends an empty ``inline_keyboard`` list. Failures
    are logged as warnings and not raised.
    """
    body: dict[str, Any] = {
        "chat_id": chat_id,
        "message_id": message_id,
        "reply_markup": reply_markup or {"inline_keyboard": []},
    }
    try:
        _call("editMessageReplyMarkup", body)
    except Exception as exc:
        logger.warning("editMessageReplyMarkup fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def setup_webhook(webhook_url: str) -> dict:
    """Register *webhook_url* via ``setWebhook`` for message and callback_query updates.

    When ``telegram_webhook_secret`` is set in the settings it is passed as
    ``secret_token``.

    Returns:
        The decoded API response.
    """
    secret = get_settings().telegram_webhook_secret
    body: dict[str, Any] = {
        "url": webhook_url,
        "allowed_updates": ["message", "callback_query"],
    }
    if secret:
        body["secret_token"] = secret
    return _call("setWebhook", body)
|
||||
|
||||
|
||||
def delete_webhook() -> dict:
    """Call ``deleteWebhook`` with an empty payload and return the API response."""
    no_arguments: dict[str, Any] = {}
    return _call("deleteWebhook", no_arguments)
|
||||
|
||||
|
||||
def _forward_to_n8n_app_release(update: dict[str, Any]) -> None:
    """Forward a Telegram update to the N8N App Release webhook.

    The update is POSTed as JSON with a 5 second timeout. The response body
    is ignored; any failure is only logged at debug level.
    """
    try:
        forward_request = Request(
            url=_N8N_APP_RELEASE_WEBHOOK,
            data=json.dumps(update).encode("utf-8"),
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        # Open and immediately close: only delivery matters.
        with urlopen(forward_request, timeout=5):
            pass
    except Exception as exc:
        logger.debug("N8N App-Release-Forward fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Notification helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _format_tags(meta_json: str | None) -> str:
|
||||
if not meta_json:
|
||||
return ""
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
tags = meta.get("generated_tags") or []
|
||||
if tags:
|
||||
return " ".join(f"#{t.replace(' ', '_')}" for t in tags[:6])
|
||||
except Exception:
|
||||
pass
|
||||
return ""
|
||||
|
||||
|
||||
def _score_emoji(score: int) -> str:
|
||||
if score >= 85:
|
||||
return "🟢"
|
||||
if score >= 70:
|
||||
return "🟡"
|
||||
return "🔴"
|
||||
|
||||
|
||||
def notify_new_draft(
    article: dict[str, Any],
    score: int,
    suggested_publish_at: str | None = None,
) -> None:
    """Send Telegram notification for a newly created WP draft.

    The message lists title, relevance score, the optional suggested publish
    time, tags and a link to the WordPress draft, with "rewrite"/"discard"
    buttons. If the article metadata names an image, it is sent as a photo
    with the text as caption; otherwise a plain message is sent.

    All dynamic text is HTML-escaped because the message is sent with
    ``parse_mode="HTML"``, where raw ``<``, ``>`` or ``&`` make Telegram
    reject it.

    Args:
        article: Article row as dict (reads ``title``, ``wp_post_url``,
            ``meta_json``, ``id``).
        score: Relevance score shown as ``score/100``.
        suggested_publish_at: Pre-formatted publish time, if any.
    """
    from html import escape  # local import: only needed for message building

    title = escape((article.get("title") or "Ohne Titel").strip(), quote=False)
    wp_url = article.get("wp_post_url") or ""
    tags_str = _format_tags(article.get("meta_json"))
    art_id = article.get("id")

    text_parts = [
        "✅ <b>Neuer Draft erstellt</b>",
        f"📰 <b>{title}</b>",
        f"{_score_emoji(score)} Relevanz-Score: <b>{score}/100</b>",
    ]
    if suggested_publish_at:
        publish_at = escape(str(suggested_publish_at), quote=False)
        text_parts.append(f"📅 Vorgeschlagene Veröffentlichung: <b>{publish_at}</b>")
    if tags_str:
        text_parts.append(f"🏷 {escape(tags_str, quote=False)}")
    if wp_url:
        # quote=True: the URL sits inside a double-quoted attribute.
        text_parts.append(f'🔗 <a href="{escape(wp_url, quote=True)}">Draft in WordPress öffnen</a>')

    text = "\n".join(text_parts)

    keyboard = _inline_keyboard([
        [
            {"text": "✏️ Neu schreiben", "callback_data": f"rewrite:{art_id}"},
            {"text": "❌ Verwerfen", "callback_data": f"discard:{art_id}"},
        ]
    ])

    # Try with image first
    try:
        meta = json.loads(article.get("meta_json") or "{}")
    except (TypeError, ValueError):
        meta = {}
    if not isinstance(meta, dict):
        # Valid JSON that is not an object carries no usable metadata.
        meta = {}

    image_url = None
    image_review = meta.get("image_review") or {}
    if isinstance(image_review, dict):
        image_url = image_review.get("selected_url")
    if not image_url:
        extraction = meta.get("extraction") or {}
        image_sel = (extraction.get("image_selection") or {}) if isinstance(extraction, dict) else {}
        if isinstance(image_sel, dict):
            image_url = image_sel.get("primary")

    if image_url:
        send_photo_message(image_url, caption=text, reply_markup=keyboard)
    else:
        send_message(text, reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_relevance_warning(article: dict[str, Any], score: int, reason: str) -> None:
    """Send Telegram warning for borderline articles (score between warn and auto thresholds).

    Title, reason and source URL are HTML-escaped because the message is sent
    with ``parse_mode="HTML"``. The "Originalartikel" link is only included
    when the article has a source URL.

    Args:
        article: Article row as dict (reads ``title``, ``id``, ``source_url``).
        score: Relevance score shown as ``score/100``.
        reason: Explanation for the score.
    """
    from html import escape  # local import: only needed for message building

    title = escape((article.get("title") or "Ohne Titel").strip(), quote=False)
    art_id = article.get("id")
    source_url = article.get("source_url") or ""

    text_parts = [
        "⚠️ <b>Artikel mit niedrigem Relevanz-Score</b>",
        f"📰 <b>{title}</b>",
        f"{_score_emoji(score)} Score: <b>{score}/100</b>",
        f"💬 {escape(str(reason), quote=False)}",
    ]
    if source_url:
        # quote=True: the URL sits inside a double-quoted attribute.
        text_parts.append(f'🔗 <a href="{escape(source_url, quote=True)}">Originalartikel</a>')
    text = "\n".join(text_parts)

    keyboard = _inline_keyboard([
        [
            {"text": "➕ Trotzdem verarbeiten", "callback_data": f"override:{art_id}"},
            {"text": "❌ Ablehnen", "callback_data": f"reject:{art_id}"},
        ]
    ])
    send_message(text, reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_rejected_summary(articles: list[dict[str, Any]]) -> None:
    """Send summary of rejected articles for this pipeline run.

    Lists at most ten articles (title cut to 60 characters, score, reason)
    and adds one "override" button for each of the first five. Does nothing
    for an empty list.

    The message is sent with ``parse_mode="HTML"``, so the "<" in the
    headline is written as ``&lt;`` and titles/reasons are HTML-escaped.
    """
    if not articles:
        return

    from html import escape  # local import: only needed for message building

    warn_threshold = get_settings().pipeline_relevance_warn
    lines = [f"🚫 <b>{len(articles)} Artikel abgelehnt (Score &lt; {warn_threshold})</b>\n"]
    for art in articles[:10]:
        # Truncate first, then escape, so an entity is never cut in half.
        title = escape((art.get("title") or "Ohne Titel")[:60], quote=False)
        score = _get_relevance_score(art)
        reason = escape(_get_rejection_reason(art), quote=False)
        lines.append(f"• <b>{title}</b> (Score: {score}) — {reason}")
    if len(articles) > 10:
        lines.append(f"... und {len(articles) - 10} weitere")

    text = "\n".join(lines)

    # Build override buttons for first 5 (button labels are plain text, not HTML)
    rows = []
    for art in articles[:5]:
        art_id = art.get("id")
        title = (art.get("title") or "")[:25]
        rows.append([{"text": f"➕ {title}…", "callback_data": f"override:{art_id}"}])

    keyboard = _inline_keyboard(rows) if rows else None
    send_message(text, reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_error(message: str) -> None:
    """Send error alert to Telegram.

    The message is first sent as given (so callers may use HTML formatting).
    If that fails and the text contains ``<``, ``>`` or ``&`` — typical for
    exception texts such as ``<urlopen error ...>`` — it is retried once
    HTML-escaped. Never raises; the last failure is logged.
    """
    from html import escape  # local import: only needed for the retry

    header = "🔴 <b>Fehler im RSS-Pipeline</b>\n"
    try:
        send_message(f"{header}{message}")
        return
    except Exception as exc:
        failure: Exception = exc

    escaped = escape(message, quote=False)
    if escaped != message:
        # Only retry when escaping changes the text, i.e. when invalid HTML is
        # a plausible cause; this avoids doubling the wait on network errors.
        try:
            send_message(f"{header}{escaped}")
            return
        except Exception as exc:
            failure = exc

    logger.error("Telegram Fehler-Benachrichtigung fehlgeschlagen: %s", failure)
|
||||
|
||||
|
||||
def notify_pipeline_started(trigger: str = "auto") -> None:
    """Announce a pipeline start; send failures are silently ignored.

    Args:
        trigger: What started the run; ``"auto"`` gets the robot icon, any
            other value the person icon.
    """
    if trigger == "auto":
        icon = "🤖"
    else:
        icon = "👤"
    announcement = f"{icon} Pipeline gestartet (Auslöser: {trigger})"
    try:
        send_message(announcement)
    except Exception:
        pass
|
||||
|
||||
|
||||
def notify_pipeline_done(stats: dict[str, Any]) -> None:
    """Send the end-of-run statistics message; send failures are silently ignored.

    Imported, processed and draft counts are always shown. The remaining
    counters (rejected, quality gate, no image, warnings, errors) only appear
    when non-zero. Missing keys count as 0.
    """
    always_shown = (
        ("ingested", "📥 Neue Artikel importiert: "),
        ("processed", "⚙️ Verarbeitet: "),
        ("drafts_created", "📝 Drafts erstellt: "),
    )
    shown_if_nonzero = (
        ("rejected", "🚫 Abgelehnt (Score): "),
        ("quality_gate_rejected", "✂️ Qualitätsprüfung: "),
        ("no_image", "🖼️ Kein Bild: "),
        ("warnings", "⚠️ Warnungen: "),
        ("errors", "🔴 Fehler: "),
    )

    lines = ["📊 <b>Pipeline abgeschlossen</b>"]
    lines.extend(f"{label}{stats.get(key, 0)}" for key, label in always_shown)
    for key, label in shown_if_nonzero:
        count = stats.get(key, 0)
        if count:
            lines.append(f"{label}{count}")

    try:
        send_message("\n".join(lines))
    except Exception:
        pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper to read relevance info from meta_json
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_relevance_score(article: dict[str, Any]) -> int:
|
||||
try:
|
||||
meta = json.loads(article.get("meta_json") or "{}")
|
||||
return int(meta.get("relevance", {}).get("score", 0))
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _get_rejection_reason(article: dict[str, Any]) -> str:
|
||||
try:
|
||||
meta = json.loads(article.get("meta_json") or "{}")
|
||||
return str(meta.get("relevance", {}).get("reason", ""))[:80]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Incoming update handler (called by webhook endpoint)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def handle_update(update: dict[str, Any]) -> None:
    """Process an incoming Telegram update.

    Callback queries go to ``_handle_callback`` and messages to
    ``_handle_message``; a callback query takes precedence if both keys are
    present. Updates of any other kind are ignored.

    Args:
        update: Decoded update object as delivered to the webhook endpoint.
    """
    # No pipeline import here: this function only routes, and each handler
    # imports what it needs itself.
    if "callback_query" in update:
        _handle_callback(update["callback_query"])
    elif "message" in update:
        _handle_message(update["message"])
|
||||
|
||||
|
||||
def _handle_message(message: dict[str, Any]) -> None:
    """Execute a slash command contained in an incoming Telegram message.

    Non-command messages are ignored. Supported commands are ``/run``,
    ``/rejected``, ``/status`` and ``/help`` (case-insensitive, an optional
    ``@botname`` suffix is stripped). Any other command is forwarded to the
    N8N App-Release webhook. Failures inside ``/run``, ``/rejected`` and
    ``/status`` are reported via ``notify_error``.
    """
    from . import pipeline as _pipeline

    raw_text = (message.get("text") or "").strip()
    if not raw_text.startswith("/"):
        return

    # "/Run@my_bot now" -> "run"
    command = raw_text.split()[0].lower().lstrip("/").partition("@")[0]

    if command == "run":
        send_message("🤖 Pipeline wird manuell gestartet …")
        try:
            run_stats = _pipeline.run_auto_pipeline(trigger="manual")
            notify_pipeline_done(run_stats)
        except Exception as exc:
            notify_error(f"/run fehlgeschlagen: {exc}")
        return

    if command == "rejected":
        try:
            rejected = _pipeline.get_recently_rejected(days=3)
            if rejected:
                notify_rejected_summary(rejected)
            else:
                send_message("✅ Keine abgelehnten Artikel in den letzten 3 Tagen.")
        except Exception as exc:
            notify_error(f"/rejected fehlgeschlagen: {exc}")
        return

    if command == "status":
        try:
            send_message(_pipeline.get_pipeline_status_text())
        except Exception as exc:
            notify_error(f"/status fehlgeschlagen: {exc}")
        return

    if command == "help":
        help_text = (
            "📋 <b>Verfügbare Befehle</b>\n"
            "/run — Pipeline manuell starten\n"
            "/rejected — Abgelehnte Artikel der letzten 3 Tage\n"
            "/status — Pipeline-Status\n"
            "/help — Diese Hilfe"
        )
        send_message(help_text)
        return

    # Unknown command -> hand over to the N8N App-Release workflow
    _forward_to_n8n_app_release({"message": message})
|
||||
|
||||
|
||||
def _handle_callback(callback_query: dict[str, Any]) -> None:
    """Execute the action encoded in an inline-button callback.

    The callback ``data`` has the form ``"<action>:<article_id>"``. Supported
    actions are ``rewrite``, ``discard``, ``override`` and ``reject``. The
    callback is answered and the button markup is edited before the action
    runs. A failing action is logged and reported through ``notify_error``
    instead of being raised.

    Args:
        callback_query: Telegram callback-query payload.
    """
    from . import pipeline as _pipeline
    from .repositories import get_article_by_id, update_article_status

    query_id = callback_query.get("id", "")
    data = (callback_query.get("data") or "").strip()
    # Treat an explicit null "message"/"chat" like a missing one instead of
    # failing with AttributeError on None.
    message = callback_query.get("message") or {}
    chat_id = str((message.get("chat") or {}).get("id", ""))
    message_id = int(message.get("message_id") or 0)

    if ":" not in data:
        answer_callback_query(query_id, "Ungültige Aktion")
        return

    action, _, raw_id = data.partition(":")
    try:
        article_id = int(raw_id)
    except ValueError:
        answer_callback_query(query_id, "Ungültige Artikel-ID")
        return

    article = get_article_by_id(article_id)
    if not article:
        answer_callback_query(query_id, "Artikel nicht gefunden")
        return

    # Answer Telegram immediately so the spinning indicator stops
    action_labels = {
        "rewrite": "✏️ Artikel wird neu geschrieben …",
        "discard": "❌ Artikel verworfen",
        "override": "➕ Artikel wird verarbeitet …",
        "reject": "🚫 Abgelehnt",
    }
    answer_callback_query(query_id, action_labels.get(action, ""))
    edit_message_reply_markup(chat_id, message_id)

    logger.info("Callback: action=%s article_id=%s", action, article_id)

    if action == "rewrite":
        try:
            logger.info("Rewrite #%d: starte rewrite_and_update_draft", article_id)
            _pipeline.rewrite_and_update_draft(article_id)
            logger.info("Rewrite #%d: abgeschlossen, sende Benachrichtigung", article_id)
            updated = get_article_by_id(article_id)
            if updated:
                from .scheduler import suggest_publish_slot

                slot = suggest_publish_slot()
                notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot)
        except Exception as exc:
            logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc, exc_info=True)
            notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}")

    elif action == "discard":
        try:
            _pipeline.discard_article(article_id)
        except Exception as exc:
            logger.error("Discard #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}")

    elif action == "override":
        try:
            _pipeline.override_rejected_article(article_id)
        except Exception as exc:
            logger.error("Override #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Override #{article_id} fehlgeschlagen: {exc}")

    elif action == "reject":
        # Same error handling as the other actions: a failing status update
        # is logged and reported rather than escaping the handler.
        try:
            update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram")
        except Exception as exc:
            logger.error("Reject #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Ablehnen #{article_id} fehlgeschlagen: {exc}")

    else:
        logger.warning("Unbekannte Callback-Aktion: %s", action)
|
||||
689
backend/app/wordpress.py
Normal file
689
backend/app/wordpress.py
Normal file
|
|
@ -0,0 +1,689 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from html import escape
|
||||
import logging
|
||||
import json
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from html import unescape as _html_unescape
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _auth_header(username: str, app_password: str) -> str:
|
||||
token = base64.b64encode(f"{username}:{app_password}".encode("utf-8")).decode("ascii")
|
||||
return f"Basic {token}"
|
||||
|
||||
|
||||
def _wp_request(
    *,
    base_url: str,
    auth_header: str,
    method: str,
    endpoint: str,
    payload: dict[str, Any] | None = None,
) -> Any:
    """Send one JSON request to the WordPress REST API (``/wp-json/wp/v2``).

    Args:
        base_url: Site root; a trailing slash is ignored.
        auth_header: Value for the ``Authorization`` header.
        method: HTTP method.
        endpoint: Path below ``wp/v2``; a leading slash is ignored.
        payload: Optional body, sent JSON-encoded as UTF-8.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.
    """
    request_headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json; charset=utf-8",
        "Accept": "application/json",
        "User-Agent": "rss-news-publisher/1.0",
    }
    body = None if payload is None else json.dumps(payload).encode("utf-8")
    target = "/".join((base_url.rstrip("/"), "wp-json/wp/v2", endpoint.lstrip("/")))
    request = Request(url=target, data=body, method=method, headers=request_headers)
    with urlopen(request, timeout=20) as response:
        text = response.read().decode("utf-8", errors="replace")
    if not text:
        return {}
    return json.loads(text)
|
||||
|
||||
|
||||
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||
if not meta_json:
|
||||
return None
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return None
|
||||
if not isinstance(meta, dict):
|
||||
return None
|
||||
image_review = meta.get("image_review")
|
||||
if not isinstance(image_review, dict):
|
||||
return None
|
||||
selected = image_review.get("selected_url")
|
||||
return selected if isinstance(selected, str) and selected.strip() else None
|
||||
|
||||
|
||||
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
|
||||
if not meta_json:
|
||||
return []
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return []
|
||||
if not isinstance(meta, dict):
|
||||
return []
|
||||
raw_tags = meta.get("generated_tags")
|
||||
if not isinstance(raw_tags, list):
|
||||
return []
|
||||
tags: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in raw_tags:
|
||||
value = str(item or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
tags.append(value)
|
||||
if len(tags) >= 12:
|
||||
break
|
||||
return tags
|
||||
|
||||
|
||||
def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
    """Map tag names to WordPress tag IDs, creating tags that do not exist.

    For each name the ``tags`` endpoint is searched. A case-insensitive name
    match is preferred; otherwise the first search hit is used; if the search
    returns nothing usable, the tag is created. A tag whose lookup or creation
    fails is logged and skipped, so one bad tag never aborts the rest.

    Args:
        base_url: WordPress site root.
        auth_header: Value for the ``Authorization`` header.
        tags: Tag names; blank names are ignored.

    Returns:
        Distinct positive tag IDs in the order they were resolved.
    """
    ids: list[int] = []
    seen: set[int] = set()
    for tag in tags:
        name = tag.strip()
        if not name:
            continue
        wanted = name.casefold()
        try:
            endpoint = f"tags?search={quote_plus(name)}&per_page=20"
            result = _wp_request(base_url=base_url, auth_header=auth_header, method="GET", endpoint=endpoint)
            tag_id: int | None = None
            first_hit: int | None = None
            if isinstance(result, list):
                for row in result:
                    if not isinstance(row, dict):
                        continue
                    rid = int(row.get("id", 0) or 0)
                    if rid <= 0:
                        continue
                    if first_hit is None:
                        first_hit = rid
                    # WordPress returns term names with HTML entities
                    # ("Q&amp;A"), so decode before comparing with the plain name.
                    row_name = _html_unescape(str(row.get("name") or ""))
                    if row_name.casefold() == wanted:
                        tag_id = rid
                        break
                if tag_id is None:
                    # NOTE(review): falls back to the first search hit even if
                    # its name differs — confirm this is intended rather than
                    # creating a new tag.
                    tag_id = first_hit
            if tag_id is None:
                created = _wp_request(
                    base_url=base_url,
                    auth_header=auth_header,
                    method="POST",
                    endpoint="tags",
                    payload={"name": name},
                )
                if isinstance(created, dict):
                    rid = int(created.get("id", 0) or 0)
                    if rid > 0:
                        tag_id = rid
        except Exception as exc:
            # Best effort: tagging must not block publishing, but leave a trace.
            _logger.warning("Tag konnte nicht aufgelöst werden: %s — %s", name, exc)
            continue
        if tag_id is not None and tag_id > 0 and tag_id not in seen:
            seen.add(tag_id)
            ids.append(tag_id)
    return ids
|
||||
|
||||
|
||||
# URL path extensions for which _is_usable_image_url returns False.
_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"}
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _sanitize_image_url(url: str) -> str:
|
||||
"""Decode HTML entities (e.g. & → &) in image URLs from RSS feeds."""
|
||||
return _html_unescape(url)
|
||||
|
||||
|
||||
# Substrings of a lower-cased URL path that mark it as a placeholder image.
_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")


def _is_usable_image_url(url: str) -> bool:
    """Tell whether ``url`` is worth trying as a WordPress featured image.

    Rejected are empty URLs, ``data:`` URIs, paths ending in a blocked
    extension and paths containing a placeholder marker. If inspecting the
    URL fails, the URL is accepted.
    """
    if not url or url.startswith("data:"):
        return False
    try:
        url_path = urlparse(url).path.lower()
        # Text after the last dot of the path; empty when there is no dot.
        extension = url_path.rsplit(".", 1)[1] if "." in url_path else ""
        if f".{extension}" in _BLOCKED_IMAGE_EXTS or any(
            marker in url_path for marker in _PLACEHOLDER_PATTERNS
        ):
            return False
    except Exception:
        pass
    return True
|
||||
|
||||
|
||||
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
    """Download an image and return its bytes with the bare MIME type.

    Args:
        url: Image URL; HTML entities are decoded before the request.
        referer: Optional ``Referer`` header value.

    Returns:
        ``(raw_bytes, content_type)`` with parameters such as ``charset``
        removed from the content type.

    Raises:
        RuntimeError: If the response's Content-Type is not ``image/*``.
    """
    url = _sanitize_image_url(url)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
    }
    if referer:
        headers["Referer"] = referer
    req = Request(url=url, headers=headers)
    with urlopen(req, timeout=20) as resp:
        header_value = resp.headers.get("Content-Type") or "application/octet-stream"
        content_type = header_value.split(";")[0].strip() or "application/octet-stream"
        # Check the type before reading, so a non-image response (HTML error
        # page, video, ...) is rejected without downloading its body.
        if not content_type.lower().startswith("image/"):
            raise RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({content_type})")
        raw = resp.read()
    return raw, content_type
|
||||
|
||||
|
||||
def _guess_filename(image_url: str, content_type: str) -> str:
|
||||
parsed = urlparse(_sanitize_image_url(image_url))
|
||||
stem = Path(parsed.path).name or "article-image"
|
||||
if "." not in stem:
|
||||
ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
|
||||
stem = f"{stem}{ext}"
|
||||
# Sanitize to ASCII-safe characters for the HTTP Content-Disposition header
|
||||
stem = stem.encode("ascii", errors="ignore").decode("ascii")
|
||||
stem = re.sub(r"[^\w.\-]", "_", stem) or "article-image.jpg"
|
||||
return stem
|
||||
|
||||
|
||||
def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict:
|
||||
"""Return the caption/credit dict for a specific image URL from extraction metadata."""
|
||||
if not meta_json or not image_url:
|
||||
return {}
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
meta = json.loads(meta_json)
|
||||
image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {}
|
||||
# Exact match first
|
||||
if image_url in image_metadata:
|
||||
return image_metadata[image_url]
|
||||
# Fuzzy match: compare without query string (handles ?w=1200 variants)
|
||||
base_url = urlparse(image_url)._replace(query="").geturl()
|
||||
for key, val in image_metadata.items():
|
||||
key_base = urlparse(key)._replace(query="").geturl()
|
||||
if key_base == base_url:
|
||||
return val
|
||||
return {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def _build_image_caption(image_meta: dict, source_url: str) -> str:
|
||||
"""Build a WP caption string from image metadata and source URL."""
|
||||
# caption from figcaption typically already contains the credit text
|
||||
caption = (image_meta.get("caption") or "").strip()
|
||||
if caption:
|
||||
return caption
|
||||
return f"Quelle: {source_url}"
|
||||
|
||||
|
||||
def _upload_featured_media(
    *,
    base_url: str,
    auth_header: str,
    image_url: str,
    article_title: str,
    source_url: str,
    image_caption: str = "",
) -> int:
    """Download an image, upload it to the WordPress media library, and label it.

    After the upload a second request sets the media item's title, caption
    (``image_caption`` or ``"Quelle: <source_url>"``) and alt text.

    Returns:
        The ID of the created media item.

    Raises:
        RuntimeError: If the upload response contains no positive ``id``.
    """
    payload_bytes, mime_type = _download_image_bytes(image_url, referer=source_url or None)
    upload_name = _guess_filename(image_url, mime_type)

    upload_headers = {
        "Authorization": auth_header,
        "Content-Type": mime_type,
        "Content-Disposition": f'attachment; filename="{upload_name}"',
        "Accept": "application/json",
        "User-Agent": "rss-news-publisher/1.0",
    }
    upload = Request(
        url=base_url.rstrip("/") + "/wp-json/wp/v2/media",
        data=payload_bytes,
        method="POST",
        headers=upload_headers,
    )
    with urlopen(upload, timeout=30) as response:
        response_text = response.read().decode("utf-8", errors="replace")
    uploaded = json.loads(response_text) if response_text else {}

    media_id = int(uploaded.get("id", 0)) if isinstance(uploaded, dict) else 0
    if media_id <= 0:
        raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {uploaded}")

    details = {
        "title": f"{article_title[:120]} - Bild",
        "caption": image_caption or f"Quelle: {source_url}",
        "alt_text": article_title[:200],
    }
    _wp_request(
        base_url=base_url,
        auth_header=auth_header,
        method="POST",
        endpoint=f"media/{media_id}",
        payload=details,
    )
    return media_id
|
||||
|
||||
|
||||
def _as_paragraph_html(text: str) -> str:
|
||||
chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()]
|
||||
if not chunks:
|
||||
return ""
|
||||
lines = []
|
||||
for chunk in chunks:
|
||||
compact = re.sub(r"\s*\n\s*", " ", chunk)
|
||||
lines.append(f"<p>{escape(compact)}</p>")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _as_block_paragraphs(text: str) -> str:
|
||||
chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()]
|
||||
if not chunks:
|
||||
return ""
|
||||
lines = []
|
||||
for chunk in chunks:
|
||||
compact = re.sub(r"\s*\n\s*", " ", chunk)
|
||||
lines.append(f"<!-- wp:paragraph --><p>{escape(compact)}</p><!-- /wp:paragraph -->")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _strip_html_tags(raw: str) -> str:
|
||||
text = re.sub(r"<[^>]+>", " ", raw or "")
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def _html_to_wp_blocks(html: str) -> str:
|
||||
src = (html or "").strip()
|
||||
if not src:
|
||||
return ""
|
||||
pattern = re.compile(
|
||||
r"<h([2-6])[^>]*>[\s\S]*?</h\1>|<p[^>]*>[\s\S]*?</p>|<ul[^>]*>[\s\S]*?</ul>|<ol[^>]*>[\s\S]*?</ol>",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
blocks: list[str] = []
|
||||
for match in pattern.finditer(src):
|
||||
block_html = match.group(0).strip()
|
||||
if not block_html:
|
||||
continue
|
||||
tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE)
|
||||
tag = (tag_match.group(1).lower() if tag_match else "")
|
||||
if tag == "p":
|
||||
blocks.append(f"<!-- wp:paragraph -->{block_html}<!-- /wp:paragraph -->")
|
||||
elif tag in {"ul", "ol"}:
|
||||
ordered = tag == "ol"
|
||||
if ordered:
|
||||
blocks.append(f'<!-- wp:list {{"ordered":true}} -->{block_html}<!-- /wp:list -->')
|
||||
else:
|
||||
blocks.append(f"<!-- wp:list -->{block_html}<!-- /wp:list -->")
|
||||
elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit():
|
||||
level = int(tag[1])
|
||||
blocks.append(f'<!-- wp:heading {{"level":{level}}} -->{block_html}<!-- /wp:heading -->')
|
||||
if blocks:
|
||||
return "\n".join(blocks)
|
||||
return _as_block_paragraphs(_strip_html_tags(src))
|
||||
|
||||
|
||||
def _as_block_heading(level: int, text: str) -> str:
|
||||
safe_level = min(6, max(1, int(level)))
|
||||
return f'<!-- wp:heading {{"level":{safe_level}}} --><h{safe_level}>{escape(text)}</h{safe_level}><!-- /wp:heading -->'
|
||||
|
||||
|
||||
def _as_block_list(items: list[str]) -> str:
|
||||
if not items:
|
||||
return ""
|
||||
content = "".join(f"<li>{item}</li>" for item in items)
|
||||
return f"<!-- wp:list --><ul>{content}</ul><!-- /wp:list -->"
|
||||
|
||||
|
||||
def _sanitize_publish_text(text: str) -> str:
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
return ""
|
||||
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
|
||||
if len(lines) > 3:
|
||||
lines = lines[3:]
|
||||
merged = "\n".join(lines)
|
||||
merged = re.sub(r"\n?\s*Pressekontakt[\s\S]*$", "", merged, flags=re.IGNORECASE).strip()
|
||||
return merged
|
||||
|
||||
|
||||
def _build_attribution_block(article: dict[str, Any]) -> str:
|
||||
"""Build a WP Gutenberg attribution block for the bottom of the article."""
|
||||
from urllib.parse import urlparse
|
||||
source_url = (article.get("canonical_url") or article.get("source_url") or "").strip()
|
||||
source_name = (article.get("source_name_snapshot") or "").strip()
|
||||
author = (article.get("author") or "").strip()
|
||||
|
||||
# If the feed name is "Google Alerts" (or similar generic names), derive the
|
||||
# real source name from the hostname of the canonical URL.
|
||||
if not source_name or source_name.lower() in ("google alerts", "google"):
|
||||
try:
|
||||
hostname = urlparse(source_url).hostname or ""
|
||||
source_name = hostname.removeprefix("www.")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Get image credit from extraction metadata (uses fuzzy URL match)
|
||||
meta_json = article.get("meta_json")
|
||||
credit = ""
|
||||
try:
|
||||
meta = json.loads(meta_json or "{}")
|
||||
selected_url = (meta.get("image_review") or {}).get("selected_url") or ""
|
||||
if selected_url:
|
||||
img_meta = _get_image_meta_for_url(meta_json, selected_url)
|
||||
raw_credit = (img_meta.get("credit") or "").strip()
|
||||
caption_text = (img_meta.get("caption") or "").strip()
|
||||
# If credit is just a bare marker prefix (e.g. "Foto:", "Bild:"),
|
||||
# clear it and extract the full credit from the caption text instead.
|
||||
_BARE_MARKERS = {"foto", "bild", "credit", "fotograf", "fotografie", "photo", "bildnachweis"}
|
||||
if raw_credit.endswith(":") and raw_credit[:-1].strip().lower() in _BARE_MARKERS:
|
||||
raw_credit = ""
|
||||
if raw_credit:
|
||||
credit = raw_credit
|
||||
elif caption_text:
|
||||
# Extract credit markers like "Foto: IMAGO/…", "© Agentur", "Bild: …"
|
||||
import re as _re
|
||||
m = _re.search(
|
||||
r"(©[^\n]{1,120}|(?:Foto|Bild|Credit|Fotograf|Photo)\s*:[^\n]{1,120})",
|
||||
caption_text,
|
||||
)
|
||||
credit = m.group(1).strip() if m else ""
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
parts: list[str] = []
|
||||
if source_url:
|
||||
label = source_name or source_url
|
||||
parts.append(f'Originalartikel: <a href="{source_url}">{escape(label)}</a>')
|
||||
if author:
|
||||
parts.append(f"Autor: {escape(author)}")
|
||||
if credit:
|
||||
parts.append(f"Bildnachweis: {escape(credit)}")
|
||||
|
||||
if not parts:
|
||||
return ""
|
||||
|
||||
inner = " | ".join(parts)
|
||||
return (
|
||||
"\n<!-- wp:separator {\"className\":\"is-style-wide\"} -->"
|
||||
"<hr class=\"wp-block-separator is-style-wide\"/><!-- /wp:separator -->\n"
|
||||
f'<!-- wp:paragraph {{\"className\":\"article-attribution\"}} -->'
|
||||
f'<p class="article-attribution"><em>{inner}</em></p>'
|
||||
"<!-- /wp:paragraph -->"
|
||||
)
|
||||
|
||||
|
||||
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
    """Assemble the WordPress post content for an article.

    The rewritten content (or, failing that, the raw content) is sanitised;
    if nothing remains, the summary is used. Text containing HTML tags is
    converted element by element, plain text becomes paragraph blocks, and
    the attribution block is appended.

    Returns:
        ``(content, excerpt)``; the excerpt is always ``None``.
    """
    fallback_summary = (article.get("summary") or "").strip()
    raw_body = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
    text = _sanitize_publish_text(raw_body) or fallback_summary

    if re.search(r"<[a-zA-Z][^>]*>", text):
        blocks = _html_to_wp_blocks(text)
    else:
        blocks = _as_block_paragraphs(text)
    blocks = blocks or "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"

    return (blocks + _build_attribution_block(article)).strip(), None
|
||||
|
||||
|
||||
def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
    """Create or update the WordPress post for an article.

    Steps, in order: build the post content, upload the first image candidate
    that succeeds as featured media, resolve tag IDs, then POST to
    ``posts/<wp_post_id>`` when the article already has a post, else to
    ``posts``.

    Args:
        article: Article record; reads ``title``, ``source_url``, ``meta_json``,
            ``image_urls_json``, ``scheduled_publish_at``, ``wp_post_id`` and
            ``id``, plus whatever the content builder reads.

    Returns:
        ``(post_id, post_url)``; ``post_url`` is ``None`` when the response has
        no string ``link``.

    Raises:
        RuntimeError: If the WordPress settings are incomplete, or the response
            is not a dict or carries no positive ``id``.
    """
    settings = get_settings()
    if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password:
        raise RuntimeError("WordPress Konfiguration fehlt (base_url, username, app_password)")

    auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password)

    title = (article.get("title") or "Ohne Titel").strip()
    content, excerpt = _build_post_content(article)
    source_url = article.get("source_url") or ""

    featured_media_id = None
    selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))

    # Build candidate list: primary selected URL + fallbacks from image_urls_json
    image_candidates: list[str] = []
    if selected_image_url and _is_usable_image_url(selected_image_url):
        image_candidates.append(selected_image_url)
    try:
        extra_urls = json.loads(article.get("image_urls_json") or "[]")
        for u in extra_urls:
            if u and u not in image_candidates and _is_usable_image_url(u):
                image_candidates.append(u)
    except Exception:
        # Unparsable or unexpectedly shaped fallback list: keep what we have.
        pass

    # Try candidates in order; the first successful upload wins.
    for candidate_url in image_candidates:
        image_meta = _get_image_meta_for_url(article.get("meta_json"), candidate_url)
        image_caption = _build_image_caption(image_meta, source_url)
        try:
            featured_media_id = _upload_featured_media(
                base_url=settings.wordpress_base_url,
                auth_header=auth,
                image_url=candidate_url,
                article_title=title,
                source_url=source_url,
                image_caption=image_caption,
            )
            break  # success — stop trying further candidates
        except Exception as img_exc:
            _logger.warning(
                "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate_url, img_exc
            )

    # The post is still published without a featured image in this case.
    if not featured_media_id and image_candidates:
        _logger.warning(
            "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)",
            len(image_candidates), article.get("id"), title[:60],
        )

    payload = {
        "title": title,
        "content": content,
        "status": settings.wordpress_default_status,
    }
    if excerpt:
        payload["excerpt"] = excerpt
    if featured_media_id:
        payload["featured_media"] = featured_media_id
    scheduled_at = article.get("scheduled_publish_at")
    if scheduled_at:
        payload["date"] = scheduled_at  # e.g. "2026-03-24T09:00:00"
        # Use status "future" so WP schedules auto-publishing at the given date.
        # WP ignores date for drafts and shows "Sofort veröffentlichen" instead.
        try:
            from datetime import datetime as _dt
            # NOTE(review): compares against naive local time; a value with a
            # UTC offset would raise TypeError here and silently keep the
            # default status — confirm stored values are always naive.
            if _dt.fromisoformat(scheduled_at) > _dt.now():
                payload["status"] = "future"
        except Exception:
            pass

    wp_post_id = article.get("wp_post_id")
    tag_ids = _resolve_wp_tag_ids(
        base_url=settings.wordpress_base_url,
        auth_header=auth,
        tags=_selected_tags_from_meta(article.get("meta_json")),
    )
    if tag_ids:
        payload["tags"] = tag_ids

    if wp_post_id:
        # Existing post: update it in place.
        result = _wp_request(
            base_url=settings.wordpress_base_url,
            auth_header=auth,
            method="POST",
            endpoint=f"posts/{int(wp_post_id)}",
            payload=payload,
        )
    else:
        result = _wp_request(
            base_url=settings.wordpress_base_url,
            auth_header=auth,
            method="POST",
            endpoint="posts",
            payload=payload,
        )

    if not isinstance(result, dict):
        raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
    post_id = int(result.get("id", 0))
    if post_id <= 0:
        raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
    post_url = result.get("link")
    return post_id, post_url if isinstance(post_url, str) else None
|
||||
|
||||
|
||||
def selected_image_exists(article: dict[str, Any]) -> bool:
    """Tell whether the article's metadata names a selected image URL."""
    chosen = _selected_image_url_from_meta(article.get("meta_json"))
    return chosen is not None
|
||||
|
||||
|
||||
def delete_wp_post(wp_post_id: int) -> None:
    """Permanently delete a WordPress post (``force=true`` bypasses the trash).

    Raises:
        RuntimeError: If the WordPress settings are incomplete.
    """
    settings = get_settings()
    configured = (
        settings.wordpress_base_url
        and settings.wordpress_username
        and settings.wordpress_app_password
    )
    if not configured:
        raise RuntimeError("WordPress Konfiguration fehlt")
    credentials = _auth_header(settings.wordpress_username, settings.wordpress_app_password)
    _wp_request(
        base_url=settings.wordpress_base_url,
        auth_header=credentials,
        method="DELETE",
        endpoint=f"posts/{wp_post_id}?force=true",
    )
|
||||
|
||||
|
||||
def sync_db_from_wordpress() -> dict[str, Any]:
    """Sync scheduled_publish_at and wp_post_url in the DB from WordPress.

    WordPress is treated as the source of truth for scheduling.
    For each DB article that has a wp_post_id:
    - If WP post exists as 'future': update scheduled_publish_at to WP date.
    - If WP post exists as 'draft': clear scheduled_publish_at (not yet scheduled).
    - If WP post exists as 'publish': mark article as published in DB.
    - If the post is not in the bulk listing, it is looked up individually.
      wp_post_id, wp_post_url and scheduled_publish_at are cleared only when
      WordPress confirms the post is gone (HTTP 404/410) or trashed. If the
      lookup itself fails, the article is left untouched, so an outage or the
      listing cap can never wipe valid references.

    Returns:
        A stats dict with counts of each action taken. ``wp_lookup_failed``
        counts articles whose post could not be verified.

    Raises:
        RuntimeError: If the WordPress settings are incomplete.
    """
    from urllib.error import HTTPError

    from .db import get_conn

    settings = get_settings()
    if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password:
        raise RuntimeError("WordPress Konfiguration fehlt")
    auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password)
    base_url = settings.wordpress_base_url.rstrip("/")
    fields = "id,date,status,link"

    # Fetch future + draft + published WP posts in bulk (up to 300 per status).
    # This listing is only a fast path; anything missing is verified below.
    wp_posts: dict[int, dict] = {}
    for status in ("future", "draft", "publish"):
        for page in range(1, 4):  # max 300 per status
            try:
                result = _wp_request(
                    base_url=base_url,
                    auth_header=auth,
                    method="GET",
                    endpoint=f"posts?status={status}&per_page=100&page={page}&_fields={fields}",
                )
            except Exception as exc:
                # Requesting a page past the last one answers HTTP 400; that
                # is the normal end of pagination, anything else is a failure.
                past_last_page = page > 1 and isinstance(exc, HTTPError) and exc.code == 400
                if not past_last_page:
                    _logger.warning(
                        "WP-Beitragsliste nicht abrufbar (status=%s, Seite %d): %s", status, page, exc
                    )
                break
            if not isinstance(result, list) or not result:
                break
            for post in result:
                try:
                    wp_posts[int(post["id"])] = post
                except (KeyError, TypeError, ValueError):
                    pass  # malformed row: skip it
            if len(result) < 100:
                break

    # Load all DB articles that have a wp_post_id
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, wp_post_id, wp_post_url, scheduled_publish_at, status
            FROM articles
            WHERE wp_post_id IS NOT NULL
            AND status NOT IN ('no_image')
            ORDER BY id
            """
        ).fetchall()

    stats: dict[str, int] = {
        "total_db_articles": len(rows),
        "wp_posts_found": len(wp_posts),
        "slot_updated": 0,
        "slot_cleared_draft": 0,
        "marked_published": 0,
        "wp_reference_cleared": 0,
        "already_in_sync": 0,
        "wp_lookup_failed": 0,
    }

    for row in rows:
        article_id = row["id"]
        wp_post_id = int(row["wp_post_id"])
        wp_post = wp_posts.get(wp_post_id)

        if wp_post is None:
            # Absent from the listing: ask WordPress about this post directly
            # before concluding that it was trashed or deleted.
            confirmed_gone = False
            fetched: Any = None
            try:
                fetched = _wp_request(
                    base_url=base_url,
                    auth_header=auth,
                    method="GET",
                    endpoint=f"posts/{wp_post_id}?context=edit&_fields={fields}",
                )
            except HTTPError as exc:
                if exc.code in (404, 410):
                    confirmed_gone = True
                else:
                    _logger.warning("WP-Beitrag %s nicht prüfbar: %s", wp_post_id, exc)
                    stats["wp_lookup_failed"] += 1
                    continue
            except Exception as exc:
                _logger.warning("WP-Beitrag %s nicht prüfbar: %s", wp_post_id, exc)
                stats["wp_lookup_failed"] += 1
                continue

            if isinstance(fetched, dict) and fetched.get("status") == "trash":
                confirmed_gone = True

            if confirmed_gone:
                # Clear wp reference so article can be re-processed if needed
                with get_conn() as conn:
                    conn.execute(
                        """UPDATE articles
                        SET wp_post_id = NULL, wp_post_url = NULL, scheduled_publish_at = NULL
                        WHERE id = ?""",
                        (article_id,),
                    )
                stats["wp_reference_cleared"] += 1
                continue

            if not isinstance(fetched, dict):
                stats["wp_lookup_failed"] += 1
                continue
            # The post exists; handle it like a listed post. Statuses other
            # than publish/future/draft fall through without any change.
            wp_post = fetched

        wp_status = wp_post.get("status", "")
        wp_date = wp_post.get("date", "")  # local CET datetime, e.g. "2026-05-05T09:00:00"
        wp_link = wp_post.get("link") or row["wp_post_url"]

        if wp_status == "publish":
            # Already published in WP — mark as published in DB if not already
            if row["status"] != "published":
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET status = 'published', wp_post_url = ? WHERE id = ?",
                        (wp_link, article_id),
                    )
                stats["marked_published"] += 1
            else:
                stats["already_in_sync"] += 1

        elif wp_status == "future":
            # Scheduled — sync the date into scheduled_publish_at
            current_slot = row["scheduled_publish_at"] or ""
            # WP returns e.g. "2026-05-05T09:00:00" — compare ignoring seconds
            if current_slot[:16] != wp_date[:16]:
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET scheduled_publish_at = ?, wp_post_url = ? WHERE id = ?",
                        (wp_date, wp_link, article_id),
                    )
                stats["slot_updated"] += 1
            else:
                stats["already_in_sync"] += 1

        elif wp_status == "draft":
            # Draft without a schedule — clear scheduled_publish_at if set
            if row["scheduled_publish_at"]:
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?",
                        (article_id,),
                    )
                stats["slot_cleared_draft"] += 1
            else:
                stats["already_in_sync"] += 1

    return stats
|
||||
39
backend/app/workflow.py
Normal file
39
backend/app/workflow.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
from __future__ import annotations
|
||||
|
||||
# Status names as presented by the UI; internal_to_ui_status and
# ui_to_internal_status translate between these and the stored values.
UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image")
|
||||
|
||||
|
||||
def internal_to_ui_status(status: str | None) -> str:
|
||||
value = (status or "").strip()
|
||||
if value == "approved":
|
||||
return "publish"
|
||||
if value == "error":
|
||||
return "close"
|
||||
if value == "review":
|
||||
return "rewrite"
|
||||
if value in {"new", "rewrite", "published", "no_image"}:
|
||||
return value
|
||||
return value or "new"
|
||||
|
||||
|
||||
def ui_to_internal_status(status: str | None) -> str:
|
||||
value = (status or "").strip()
|
||||
if value == "publish":
|
||||
return "approved"
|
||||
if value == "close":
|
||||
return "error"
|
||||
if value in {"new", "rewrite", "published", "no_image"}:
|
||||
return value
|
||||
if value in {"approved", "error", "review"}:
|
||||
return value
|
||||
return value
|
||||
|
||||
|
||||
# For each UI status (key), the set of UI statuses it may change to.
ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = {
    "new": {"rewrite", "close"},
    "rewrite": {"publish", "close"},
    "publish": {"published", "close"},
    "published": {"rewrite", "close"},
    "close": {"rewrite"},
    "no_image": {"rewrite", "close"},
}
|
||||
BIN
backend/data/rss_news.db
Normal file
BIN
backend/data/rss_news.db
Normal file
Binary file not shown.
3
backend/requirements-test.txt
Normal file
3
backend/requirements-test.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
pytest==8.3.5
|
||||
pytest-cov==6.0.0
|
||||
httpx==0.28.1
|
||||
8
backend/requirements.txt
Normal file
8
backend/requirements.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
fastapi==0.116.1
|
||||
uvicorn[standard]==0.35.0
|
||||
itsdangerous==2.2.0
|
||||
pydantic-settings==2.10.1
|
||||
python-dotenv==1.1.1
|
||||
feedparser==6.0.11
|
||||
jinja2==3.1.4
|
||||
python-multipart==0.0.20
|
||||
303
backend/static/admin.css
Normal file
303
backend/static/admin.css
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
body {
|
||||
margin: 0;
|
||||
font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
|
||||
background: #f4f6f8;
|
||||
color: #1f2937;
|
||||
}
|
||||
|
||||
.topbar {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 20px 28px;
|
||||
background: #0f172a;
|
||||
color: #f8fafc;
|
||||
}
|
||||
|
||||
.container {
|
||||
padding: 20px 28px 28px 28px;
|
||||
}
|
||||
|
||||
.login {
|
||||
max-width: 520px;
|
||||
margin: 60px auto;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: #ffffff;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
|
||||
padding: 16px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
gap: 12px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.stat {
|
||||
background: #ffffff;
|
||||
border-radius: 10px;
|
||||
padding: 12px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
|
||||
}
|
||||
|
||||
.stat .label {
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.stat .value {
|
||||
font-size: 24px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.grid.two {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 16px;
|
||||
}
|
||||
|
||||
.stack {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.filter-row {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.inline {
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
th, td {
|
||||
text-align: left;
|
||||
padding: 8px;
|
||||
border-bottom: 1px solid #e5e7eb;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
input, select, button, textarea {
|
||||
padding: 8px;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #cbd5e1;
|
||||
font: inherit;
|
||||
}
|
||||
|
||||
button {
|
||||
background: #0ea5e9;
|
||||
border-color: #0ea5e9;
|
||||
color: white;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
button.secondary {
|
||||
background: #64748b;
|
||||
border-color: #64748b;
|
||||
}
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 2px 8px;
|
||||
border-radius: 999px;
|
||||
background: #e2e8f0;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.badge.ok {
|
||||
background: #dcfce7;
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.badge.bad {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.badge.errcat {
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.badge.errcat-policy {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.badge.errcat-auth {
|
||||
background: #ffedd5;
|
||||
color: #9a3412;
|
||||
}
|
||||
|
||||
.badge.errcat-dns {
|
||||
background: #dbeafe;
|
||||
color: #1e40af;
|
||||
}
|
||||
|
||||
.badge.errcat-media {
|
||||
background: #fef9c3;
|
||||
color: #854d0e;
|
||||
}
|
||||
|
||||
.badge.errcat-api {
|
||||
background: #ede9fe;
|
||||
color: #5b21b6;
|
||||
}
|
||||
|
||||
.badge.errcat-unknown {
|
||||
background: #e2e8f0;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
.alert {
|
||||
margin-bottom: 12px;
|
||||
padding: 10px;
|
||||
border-radius: 8px;
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.flash {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.flash-success {
|
||||
border-left: 4px solid #10b981;
|
||||
}
|
||||
|
||||
.flash-error {
|
||||
border-left: 4px solid #ef4444;
|
||||
}
|
||||
|
||||
.subtle {
|
||||
color: #64748b;
|
||||
font-size: 12px;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.pre {
|
||||
white-space: pre-wrap;
|
||||
line-height: 1.35;
|
||||
max-height: 220px;
|
||||
overflow: auto;
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
margin-top: 6px;
|
||||
}
|
||||
|
||||
.linkbtn {
|
||||
display: inline-block;
|
||||
padding: 8px 10px;
|
||||
border-radius: 6px;
|
||||
text-decoration: none;
|
||||
border: 1px solid #cbd5e1;
|
||||
color: #334155;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.detail-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
||||
gap: 8px 12px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.detail-item {
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
display: grid;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.detail-item .k {
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.thumb {
|
||||
width: 72px;
|
||||
height: 72px;
|
||||
object-fit: cover;
|
||||
border-radius: 8px;
|
||||
border: 1px solid #cbd5e1;
|
||||
margin-top: 6px;
|
||||
}
|
||||
|
||||
.image-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.image-card {
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
background: #fff;
|
||||
}
|
||||
|
||||
.image-card img {
|
||||
width: 100%;
|
||||
height: 120px;
|
||||
object-fit: cover;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #e2e8f0;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.img-failed {
|
||||
opacity: 0.3;
|
||||
filter: grayscale(1);
|
||||
}
|
||||
|
||||
.image-meta {
|
||||
margin-top: 6px;
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.image-actions {
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.image-selected {
|
||||
border-color: #10b981;
|
||||
box-shadow: 0 0 0 1px rgba(16, 185, 129, 0.25);
|
||||
}
|
||||
|
||||
.image-excluded {
|
||||
opacity: 0.65;
|
||||
}
|
||||
|
||||
@media (max-width: 920px) {
|
||||
.stats {
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
}
|
||||
.grid.two {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
224
backend/templates/admin_article_detail.html
Normal file
224
backend/templates/admin_article_detail.html
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Artikel-Detail #{{ article.id }}</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Zurück</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="card">
|
||||
<h2>{{ article.title }}</h2>
|
||||
<div class="detail-grid">
|
||||
<div class="detail-item"><span class="k">Status</span><span><span class="badge">{{ article.status_ui }}</span></span></div>
|
||||
<div class="detail-item"><span class="k">Artikel-Datum</span><span>{{ article.published_at or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Alter</span><span>{{ article.days_old if article.days_old is not none else "-" }} Tage</span></div>
|
||||
<div class="detail-item"><span class="k">Relevanz</span><span>{{ article.relevance }}</span></div>
|
||||
<div class="detail-item"><span class="k">Autor</span><span>{{ article.author or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Feed</span><span>{{ feed.name if feed else "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Quelle Snapshot</span><span>{{ article.source_name_snapshot or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Lizenz Snapshot</span><span>{{ article.source_license_name_snapshot or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Terms Snapshot</span><span>{{ article.source_terms_url_snapshot or "-" }}</span></div>
|
||||
</div>
|
||||
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
|
||||
{% if article.canonical_url %}<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>{% endif %}
|
||||
{% if article.summary %}
|
||||
<p><strong>Summary:</strong> {{ article.summary }}</p>
|
||||
{% endif %}
|
||||
<p><strong>WordPress Post:</strong>
|
||||
{% if article.wp_post_url %}
|
||||
<a href="{{ article.wp_post_url }}" target="_blank" rel="noopener">#{{ article.wp_post_id }}</a>
|
||||
{% elif article.wp_post_id %}
|
||||
#{{ article.wp_post_id }}
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</p>
|
||||
<p><strong>Publish Attempts:</strong> {{ article.publish_attempts or 0 }} | <strong>Letzter Fehler:</strong> {{ article.publish_last_error or "-" }}</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Checkliste</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Kriterium</th><th>Status</th><th>Wert</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for c in checklist %}
|
||||
<tr>
|
||||
<td>{{ c.label }}</td>
|
||||
<td>
|
||||
{% if c.status == "ok" %}
|
||||
<span class="badge ok">OK</span>
|
||||
{% else %}
|
||||
<span class="badge bad">Fehlt</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ c.value }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Extrahierte Daten</h2>
|
||||
<p><strong>Bilder:</strong> {{ article.image_entries|length if article.image_entries else 0 }}</p>
|
||||
{% if article.selected_image_url %}
|
||||
<p><strong>Ausgewähltes Hauptbild:</strong> <a href="{{ article.selected_image_url }}" target="_blank" rel="noopener">{{ article.selected_image_url }}</a></p>
|
||||
{% if article.selected_image_proxy_url %}
|
||||
<img src="{{ article.selected_image_proxy_url }}" alt="Ausgewähltes Hauptbild" class="thumb" loading="lazy" />
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if article.image_entries %}
|
||||
{% if article.image_selection %}
|
||||
<details>
|
||||
<summary>Automatische Bildauswahl (Score + Gründe)</summary>
|
||||
<div class="subtle">Primärbild (Auto): {{ article.image_selection.primary or "-" }}</div>
|
||||
<div class="subtle">Ausgewählt: {{ article.image_selection.selected_count or 0 }} / Kandidaten: {{ article.image_selection.total_candidates or 0 }}</div>
|
||||
{% if article.image_selection.ranked %}
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Bild</th><th>Score</th><th>Gründe</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for r in article.image_selection.ranked %}
|
||||
<tr>
|
||||
<td><a href="{{ r.url }}" target="_blank" rel="noopener">{{ r.url }}</a></td>
|
||||
<td>{{ r.score }}</td>
|
||||
<td>{{ r.reasons|join(", ") if r.reasons else "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% endif %}
|
||||
</details>
|
||||
{% endif %}
|
||||
<div class="image-grid">
|
||||
{% for image in article.image_entries %}
|
||||
<article class="image-card {{ 'image-selected' if image.is_selected else '' }} {{ 'image-excluded' if image.is_excluded else '' }}">
|
||||
<a href="{{ image.url }}" target="_blank" rel="noopener">
|
||||
<img src="{{ image.proxy_url }}" data-fallback-src="{{ image.url }}" alt="Artikelbild" loading="lazy" onerror="if(!this.dataset.fallbackUsed){this.dataset.fallbackUsed='1';this.src=this.dataset.fallbackSrc;}else{this.classList.add('img-failed');}" />
|
||||
</a>
|
||||
<div class="image-meta">
|
||||
{% if image.is_selected %}<span class="badge ok">Ausgewählt</span>{% endif %}
|
||||
{% if image.is_excluded %}<span class="badge bad">Ausgeblendet</span>{% endif %}
|
||||
{% if image.is_irrelevant_hint %}<span class="badge">evtl. irrelevant</span>{% endif %}
|
||||
</div>
|
||||
<div class="image-actions">
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="select" />
|
||||
<button type="submit">Als Hauptbild</button>
|
||||
</form>
|
||||
{% if not image.is_excluded %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="exclude" />
|
||||
<button type="submit" class="secondary">Ausblenden</button>
|
||||
</form>
|
||||
{% else %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="restore" />
|
||||
<button type="submit" class="secondary">Einblenden</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="subtle"><a href="{{ image.url }}" target="_blank" rel="noopener">{{ image.url }}</a></div>
|
||||
</article>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if article.press_contact or article.extraction.press_contact %}
|
||||
<p><strong>Pressekontakt</strong></p>
|
||||
<div class="pre">{{ article.press_contact or article.extraction.press_contact }}</div>
|
||||
{% endif %}
|
||||
{% if article.extraction.extraction_error %}
|
||||
<p class="subtle">Extraktionsfehler: {{ article.extraction.extraction_error }}</p>
|
||||
{% endif %}
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Volltext</h2>
|
||||
<div class="pre">{{ article.content_raw or "-" }}</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Rewrite-Text (editierbar)</h2>
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/rewrite-save" class="stack">
|
||||
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
|
||||
<button type="submit">Rewrite-Text speichern</button>
|
||||
</form>
|
||||
{% if article.meta.generated_tags %}
|
||||
<p><strong>Generierte Tags:</strong> {{ article.meta.generated_tags|join("; ") }}</p>
|
||||
{% endif %}
|
||||
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Status ändern</h2>
|
||||
{% if article.status_ui in ["new", "rewrite"] %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/rewrite-run" class="row" style="margin-bottom:8px;">
|
||||
<button type="submit">Rewrite ausführen (OpenAI)</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
{% if article.status_ui == "published" %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/reopen" class="row" style="margin-bottom:8px;">
|
||||
<button type="submit">Zurück in Rewrite-Workflow</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/transition" class="row">
|
||||
<select name="target_status">
|
||||
{% for s in allowed_transitions %}
|
||||
<option value="{{ s }}">{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<input name="note" placeholder="Notiz" />
|
||||
<button type="submit" class="secondary">Setzen</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>WordPress Publish Queue</h2>
|
||||
{% if article.publish_ready %}
|
||||
<p><span class="badge ok">Publish bereit</span></p>
|
||||
{% else %}
|
||||
<p><span class="badge bad">Publish blockiert</span></p>
|
||||
{% if article.publish_blockers %}
|
||||
<ul>
|
||||
{% for reason in article.publish_blockers %}
|
||||
<li class="subtle">{{ reason }}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<p class="subtle">Voraussetzungen: Status `publish` und Hauptbild gesetzt.</p>
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/publish-enqueue" class="row">
|
||||
{# Numeric field: let the browser reject non-numeric or non-positive values before submit. #}
<input type="number" name="max_attempts" value="3" min="1" step="1" />
|
||||
<button type="submit" {% if not article.publish_ready %}disabled{% endif %}>In Queue einreihen</button>
|
||||
</form>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
221
backend/templates/admin_article_list.html
Normal file
221
backend/templates/admin_article_list.html
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
<style>
|
||||
.al-table { width: 100%; border-collapse: collapse; }
|
||||
.al-table th, .al-table td { padding: 8px 10px; border-bottom: 1px solid #e5e7eb; vertical-align: middle; }
|
||||
.al-table th { background: #f3f4f6; font-size: 0.85em; text-transform: uppercase; letter-spacing: .04em; }
|
||||
.al-table tr:hover td { background: #fafafa; }
|
||||
.al-thumb { width: 72px; height: 52px; object-fit: cover; border-radius: 4px; display: block; }
|
||||
.al-thumb-placeholder { width: 72px; height: 52px; background: #e5e7eb; border-radius: 4px; display: flex; align-items: center; justify-content: center; color: #9ca3af; font-size: 1.4em; }
|
||||
.al-title { font-weight: 600; font-size: 0.95em; }
|
||||
.al-excerpt { font-size: 0.82em; color: #6b7280; margin-top: 3px; }
|
||||
.wp-id-input { width: 90px; font-family: monospace; font-size: 0.9em; padding: 4px 6px; border: 1px solid #d1d5db; border-radius: 4px; }
|
||||
.wp-id-input.changed { border-color: #f59e0b; background: #fffbeb; font-weight: bold; }
|
||||
.wp-link { font-size: 0.8em; margin-top: 3px; display: block; }
|
||||
.sticky-bar { position: sticky; top: 0; z-index: 100; background: #1e3a5f; color: #fff; padding: 10px 20px; display: flex; align-items: center; gap: 1.5rem; box-shadow: 0 2px 8px rgba(0,0,0,.2); }
|
||||
.sticky-bar button { background: #f59e0b; color: #000; border: none; padding: 8px 20px; border-radius: 6px; font-weight: bold; cursor: pointer; font-size: 0.95em; }
|
||||
.sticky-bar button:disabled { background: #9ca3af; color: #fff; cursor: default; }
|
||||
.change-badge { background: #f59e0b; color: #000; border-radius: 12px; padding: 2px 10px; font-weight: bold; font-size: 0.85em; display: none; }
|
||||
.change-badge.visible { display: inline; }
|
||||
.filter-bar { display: flex; gap: 1rem; align-items: flex-end; flex-wrap: wrap; margin-bottom: 1rem; }
|
||||
.filter-bar label { font-size: 0.85em; color: #6b7280; display: block; margin-bottom: 3px; }
|
||||
.filter-bar input, .filter-bar select { padding: 6px 10px; border: 1px solid #d1d5db; border-radius: 4px; font-size: 0.9em; }
|
||||
.pagination { display: flex; gap: 8px; align-items: center; justify-content: center; margin-top: 1.5rem; flex-wrap: wrap; }
|
||||
.pagination a, .pagination span { padding: 6px 12px; border: 1px solid #d1d5db; border-radius: 4px; font-size: 0.9em; text-decoration: none; color: #374151; }
|
||||
.pagination .current { background: #1e3a5f; color: #fff; border-color: #1e3a5f; font-weight: bold; }
|
||||
.pagination a:hover { background: #f3f4f6; }
|
||||
.badge-sm { padding: 2px 7px; border-radius: 10px; font-size: 0.75em; font-weight: 600; }
|
||||
.badge-new { background: #dbeafe; color: #1e40af; }
|
||||
.badge-approved { background: #d1fae5; color: #065f46; }
|
||||
.badge-error { background: #fee2e2; color: #991b1b; }
|
||||
.badge-published { background: #ede9fe; color: #5b21b6; }
|
||||
.badge-review { background: #fef3c7; color: #92400e; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Artikelliste</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Dashboard</a>
|
||||
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<!-- Filter bar (outside main form so it doesn't submit with bulk save) -->
|
||||
<section class="card" style="padding-bottom: 0.5rem;">
|
||||
<form method="get" action="/admin/article-list">
|
||||
<div class="filter-bar">
|
||||
<div>
|
||||
<label>Suche (Titel / ID)</label>
|
||||
<input type="text" name="search" value="{{ search }}" placeholder="z.B. Camping …" />
|
||||
</div>
|
||||
<div>
|
||||
<label>Status</label>
|
||||
<select name="status_filter">
  <option value="">Alle</option>
  {# Values are stored (internal) status names. "rewrite" is included because
     ui_to_internal_status() in backend/app/workflow.py passes it through
     unchanged, so articles can carry it as their stored status. #}
  {% for s in ["new","review","rewrite","approved","published","error","no_image"] %}
  <option value="{{ s }}" {% if status_filter == s %}selected{% endif %}>{{ s }}</option>
  {% endfor %}
</select>
|
||||
</div>
|
||||
<div style="padding-bottom:1px;">
|
||||
<button type="submit">Filtern</button>
|
||||
<a href="/admin/article-list" class="linkbtn" style="margin-left:4px;">Reset</a>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
<p class="subtle" style="margin: 4px 0 0;">{{ total }} Artikel gesamt · Seite {{ page }} / {{ total_pages }} · {{ page_size }} pro Seite</p>
|
||||
</section>
|
||||
|
||||
<!-- Main form for bulk WP ID editing -->
|
||||
<form method="post" action="/admin/article-list/update" id="bulk-form">
|
||||
<!-- Pass filter/page state so redirect goes back to same view -->
|
||||
<input type="hidden" name="page" value="{{ page }}">
|
||||
<input type="hidden" name="status_filter" value="{{ status_filter }}">
|
||||
<input type="hidden" name="search" value="{{ search }}">
|
||||
|
||||
<div class="sticky-bar">
|
||||
<button type="submit" id="save-btn" disabled>💾 Änderungen speichern</button>
|
||||
<span class="change-badge" id="change-badge">0 Änderungen</span>
|
||||
<span style="font-size:0.85em;opacity:.8;">Nur geänderte WP-IDs werden gespeichert. Danach WP-Sync ausführen.</span>
|
||||
</div>
|
||||
|
||||
<section class="card" style="padding: 0; overflow-x: auto;">
|
||||
<table class="al-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width:80px;">Bild</th>
|
||||
<th>Titel & Kurztext</th>
|
||||
<th style="width:90px;">Status</th>
|
||||
<th style="width:110px;">Datum</th>
|
||||
<th style="width:140px;">WP ID</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for a in articles %}
|
||||
<tr>
|
||||
<td>
|
||||
{% if a.thumb_proxy %}
|
||||
<a href="{{ a.thumb_url }}" target="_blank" rel="noopener">
|
||||
<img src="{{ a.thumb_proxy }}"
|
||||
class="al-thumb"
|
||||
alt="Vorschau"
|
||||
loading="lazy"
|
||||
onerror="this.style.display='none';this.nextElementSibling.style.display='flex';" />
|
||||
<div class="al-thumb-placeholder" style="display:none;">🖼</div>
|
||||
</a>
|
||||
{% else %}
|
||||
<div class="al-thumb-placeholder">🖼</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<div class="al-title">
|
||||
<a href="/admin/articles/{{ a.id }}">#{{ a.id }} {{ a.title }}</a>
|
||||
</div>
|
||||
{% if a.excerpt %}
|
||||
<div class="al-excerpt">{{ a.excerpt }}</div>
|
||||
{% endif %}
|
||||
{% if a.feed_name %}
|
||||
<div class="al-excerpt" style="margin-top:4px;">📡 {{ a.feed_name }}</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<span class="badge-sm badge-{{ a.status }}">{{ a.status }}</span>
|
||||
</td>
|
||||
<td style="font-size:0.82em;">
|
||||
{% if a.scheduled_publish_at %}
|
||||
📅 {{ a.scheduled_publish_at[:16] }}
|
||||
{% elif a.published_at %}
|
||||
{{ a.published_at[:10] }}
|
||||
{% else %}
|
||||
—
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<!-- Hidden original value for change detection -->
|
||||
<input type="hidden" name="orig_{{ a.id }}" value="{{ a.wp_post_id or '' }}">
|
||||
<input
|
||||
type="text"
|
||||
name="wp_{{ a.id }}"
|
||||
value="{{ a.wp_post_id or '' }}"
|
||||
data-orig="{{ a.wp_post_id or '' }}"
|
||||
class="wp-id-input"
|
||||
placeholder="—"
|
||||
inputmode="numeric"
|
||||
pattern="[0-9]*"
|
||||
/>
|
||||
{% if a.wp_post_url %}
|
||||
<a href="{{ a.wp_post_url }}" target="_blank" rel="noopener" class="wp-link">↗ WP öffnen</a>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</form>
|
||||
|
||||
<!-- Pagination (outside form) -->
|
||||
<div class="pagination">
  {# status_filter and search originate from user input. They are percent-encoded
     here so that values containing "&", "#", "+" or spaces cannot break the
     link or inject extra query parameters. #}
  {% if page > 1 %}
  <a href="?page=1&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">«</a>
  <a href="?page={{ page - 1 }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">‹ Zurück</a>
  {% endif %}

  {# Sliding window: up to two pages before and after the current one, clamped to 1..total_pages. #}
  {% for p in range([1, page - 2]|max, [total_pages + 1, page + 3]|min) %}
  {% if p == page %}
  <span class="current">{{ p }}</span>
  {% else %}
  <a href="?page={{ p }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">{{ p }}</a>
  {% endif %}
  {% endfor %}

  {% if page < total_pages %}
  <a href="?page={{ page + 1 }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">Weiter ›</a>
  <a href="?page={{ total_pages }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">»</a>
  {% endif %}
</div>
|
||||
</main>
|
||||
|
||||
<script>
|
||||
(() => {
  // Editable WP-ID fields; each carries its server-rendered value in data-orig.
  const wpIdFields = Array.from(document.querySelectorAll('.wp-id-input'));
  const saveButton = document.getElementById('save-btn');
  const changeBadge = document.getElementById('change-badge');

  // A field counts as edited when its trimmed value differs from the trimmed original.
  const isDirty = (field) => field.value.trim() !== field.dataset.orig.trim();

  // Re-evaluates every field, highlights edited ones and updates the sticky bar.
  const refresh = () => {
    let dirtyCount = 0;
    for (const field of wpIdFields) {
      const dirty = isDirty(field);
      field.classList.toggle('changed', dirty);
      if (dirty) {
        dirtyCount += 1;
      }
    }

    // Saving is only possible once at least one field differs.
    saveButton.disabled = dirtyCount === 0;
    const noun = dirtyCount === 1 ? ' Änderung' : ' Änderungen';
    changeBadge.textContent = dirtyCount + noun;
    changeBadge.classList.toggle('visible', dirtyCount > 0);
  };

  for (const field of wpIdFields) {
    field.addEventListener('input', refresh);
  }

  // Initial pass so the bar reflects the rendered state on page load.
  refresh();
})();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
84
backend/templates/admin_connectivity.html
Normal file
84
backend/templates/admin_connectivity.html
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Connectivity Check</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Zurück</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
<section class="stats">
|
||||
<article class="stat">
|
||||
<div class="label">Checks</div>
|
||||
<div class="value">{{ checks|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">OK</div>
|
||||
<div class="value">{{ ok_count }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Fehler</div>
|
||||
<div class="value">{{ error_count }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Zeitpunkt</div>
|
||||
<div class="value">Live</div>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ziele</h2>
|
||||
<p class="subtle">Geprüft werden DNS-Auflösung, TCP-Erreichbarkeit und bei URLs ein HTTP-Request.</p>
|
||||
<form method="get" action="/admin/connectivity" class="row">
|
||||
<button type="submit">Checks neu ausführen</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ergebnis</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Status</th><th>Name</th><th>Typ</th><th>Ziel</th><th>DNS</th><th>TCP</th><th>HTTP</th><th>Dauer</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for c in checks %}
|
||||
<tr>
|
||||
<td>{% if c.ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">Fehler</span>{% endif %}</td>
|
||||
<td>{{ c.label }}</td>
|
||||
<td>{{ c.kind }}</td>
|
||||
<td><code>{{ c.target }}</code></td>
|
||||
<td>
|
||||
{% if c.dns_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.dns_info }}</div>
|
||||
</td>
|
||||
<td>
|
||||
{% if c.tcp_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.tcp_info }}</div>
|
||||
</td>
|
||||
<td>
|
||||
{% if c.http_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.http_info }}</div>
|
||||
</td>
|
||||
<td>{{ c.duration_ms }} ms</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
405
backend/templates/admin_dashboard.html
Normal file
405
backend/templates/admin_dashboard.html
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>rss-news Admin Dashboard</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/article-list">Artikelliste</a>
|
||||
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
|
||||
<a class="linkbtn" href="/admin/connectivity">Connectivity Check</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="stats">
|
||||
<article class="stat">
|
||||
<div class="label">Quellen</div>
|
||||
<div class="value">{{ sources|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Feeds</div>
|
||||
<div class="value">{{ feeds|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Artikel</div>
|
||||
<div class="value">{{ articles|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Runs</div>
|
||||
<div class="value">{{ runs|length }}</div>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="grid two">
|
||||
<article class="card">
|
||||
<h2>Quelle anlegen</h2>
|
||||
<form method="post" action="/admin/sources/create" class="stack">
|
||||
<input name="name" placeholder="Name" required />
|
||||
<input name="base_url" placeholder="Base URL" />
|
||||
<input name="terms_url" placeholder="Terms URL" />
|
||||
<input name="license_name" placeholder="Lizenzname" />
|
||||
<select name="risk_level">
|
||||
<option value="green">green</option>
|
||||
<option value="yellow" selected>yellow</option>
|
||||
<option value="red">red</option>
|
||||
</select>
|
||||
<input name="last_reviewed_at" placeholder="last_reviewed_at (ISO)" />
|
||||
<button type="submit">Quelle speichern</button>
|
||||
</form>
|
||||
</article>
|
||||
|
||||
<article class="card">
|
||||
<h2>Feed anlegen</h2>
|
||||
<form method="post" action="/admin/feeds/create" class="stack">
|
||||
<input name="name" placeholder="Feed Name" required />
|
||||
<input name="url" placeholder="https://..." required />
|
||||
<label>Quelle</label>
|
||||
<select name="source_id">
|
||||
<option value="">-- keine --</option>
|
||||
{% for s in sources %}
|
||||
<option value="{{ s.id }}">{{ s.name }} (#{{ s.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit">Feed speichern</button>
|
||||
</form>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ingestion starten</h2>
|
||||
<form method="post" action="/admin/ingestion/run" class="row">
|
||||
<select name="feed_id">
|
||||
<option value="">Alle aktivierten Feeds</option>
|
||||
{% for f in feeds %}
|
||||
<option value="{{ f.id }}">{{ f.name }} (#{{ f.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit">Ingestion starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Publisher ausführen</h2>
|
||||
<form method="post" action="/admin/publisher/run" class="row">
|
||||
<input name="max_jobs" value="10" />
|
||||
<button type="submit">Publisher Run starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Rewrite Run (geplante Artikel)</h2>
|
||||
<p class="subtle">Verarbeitet alle Artikel im Status <code>rewrite</code> und setzt sie auf <code>publish</code>.</p>
|
||||
<form method="post" action="/admin/rewrite/run" class="row">
|
||||
<input name="max_jobs" value="10" />
|
||||
<button type="submit">Rewrite Run starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Quellen + Policy</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>Risk</th><th>Lizenz</th><th>Terms</th><th>Policy</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in sources %}
|
||||
<tr>
|
||||
<td>{{ s.id }}</td>
|
||||
<td>{{ s.name }}</td>
|
||||
<td>{{ s.risk_level }}</td>
|
||||
<td>{{ s.license_name or "-" }}</td>
|
||||
<td>{{ s.terms_url or "-" }}</td>
|
||||
<td>
|
||||
{% if source_policy[s.id] %}
|
||||
<span class="badge bad">BLOCKED ({{ source_policy[s.id]|length }})</span>
|
||||
<div class="subtle">{{ source_policy[s.id]|join(", ") }}</div>
|
||||
{% else %}
|
||||
<span class="badge ok">OK</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Quellen verwalten</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>URLs</th><th>Meta</th><th>Aktionen</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in sources %}
|
||||
{% set source_form_id = 'source-update-' ~ s.id %}
|
||||
<tr>
|
||||
<td>#{{ s.id }}</td>
|
||||
<td>
|
||||
<input form="{{ source_form_id }}" name="name" value="{{ s.name }}" required />
|
||||
</td>
|
||||
<td>
|
||||
<input form="{{ source_form_id }}" name="base_url" value="{{ s.base_url or '' }}" placeholder="Base URL" />
|
||||
<input form="{{ source_form_id }}" name="terms_url" value="{{ s.terms_url or '' }}" placeholder="Terms URL" />
|
||||
<input form="{{ source_form_id }}" name="license_name" value="{{ s.license_name or '' }}" placeholder="Lizenz" />
|
||||
</td>
|
||||
<td>
|
||||
<select form="{{ source_form_id }}" name="risk_level">
|
||||
<option value="green" {% if s.risk_level == 'green' %}selected{% endif %}>green</option>
|
||||
<option value="yellow" {% if s.risk_level == 'yellow' %}selected{% endif %}>yellow</option>
|
||||
<option value="red" {% if s.risk_level == 'red' %}selected{% endif %}>red</option>
|
||||
</select>
|
||||
<select form="{{ source_form_id }}" name="is_enabled">
|
||||
<option value="1" {% if s.is_enabled %}selected{% endif %}>aktiv</option>
|
||||
<option value="0" {% if not s.is_enabled %}selected{% endif %}>inaktiv</option>
|
||||
</select>
|
||||
<input form="{{ source_form_id }}" name="last_reviewed_at" value="{{ s.last_reviewed_at or '' }}" placeholder="last_reviewed_at" />
|
||||
<input form="{{ source_form_id }}" name="notes" value="{{ s.notes or '' }}" placeholder="Notiz" />
|
||||
</td>
|
||||
<td>
|
||||
<div class="inline">
|
||||
<form method="post" action="/admin/sources/{{ s.id }}/update" id="{{ source_form_id }}" class="inline">
|
||||
<button type="submit" class="secondary">Speichern</button>
|
||||
</form>
|
||||
<form method="post" action="/admin/sources/{{ s.id }}/delete" class="inline" onsubmit="return confirm('Quelle wirklich löschen?');">
|
||||
<button type="submit">Löschen</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Feeds verwalten</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>URL</th><th>Quelle</th><th>Status</th><th>Aktionen</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for f in feeds %}
|
||||
{% set feed_form_id = 'feed-update-' ~ f.id %}
|
||||
<tr>
|
||||
<td>#{{ f.id }}</td>
|
||||
<td>
|
||||
<input form="{{ feed_form_id }}" name="name" value="{{ f.name }}" required />
|
||||
</td>
|
||||
<td><input form="{{ feed_form_id }}" name="url" value="{{ f.url }}" required /></td>
|
||||
<td>
|
||||
<select form="{{ feed_form_id }}" name="source_id">
|
||||
<option value="">-- keine --</option>
|
||||
{% for s in sources %}
|
||||
<option value="{{ s.id }}" {% if f.source_id == s.id %}selected{% endif %}>{{ s.name }} (#{{ s.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</td>
|
||||
<td>
|
||||
<select form="{{ feed_form_id }}" name="is_enabled">
|
||||
<option value="1" {% if f.is_enabled %}selected{% endif %}>aktiv</option>
|
||||
<option value="0" {% if not f.is_enabled %}selected{% endif %}>inaktiv</option>
|
||||
</select>
|
||||
</td>
|
||||
<td>
|
||||
<div class="inline">
|
||||
<form method="post" action="/admin/feeds/{{ f.id }}/update" id="{{ feed_form_id }}" class="inline">
|
||||
<button type="submit" class="secondary">Speichern</button>
|
||||
</form>
|
||||
<form method="post" action="/admin/feeds/{{ f.id }}/delete" class="inline" onsubmit="return confirm('Feed wirklich löschen?');">
|
||||
<button type="submit">Löschen</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Artikel (Review)</h2>
|
||||
<form method="get" action="/admin/dashboard" class="row filter-row">
|
||||
<label>Status-Filter</label>
|
||||
<select name="status_filter">
|
||||
<option value="" {% if not status_filter %}selected{% endif %}>alle</option>
|
||||
{% for s in status_options %}
|
||||
<option value="{{ s }}" {% if status_filter == s %}selected{% endif %}>{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit" class="secondary">Filtern</button>
|
||||
<a href="/admin/dashboard" class="linkbtn">Reset</a>
|
||||
<a href="/api/articles/export?format=json{% if status_filter %}&amp;status_filter={{ status_filter|urlencode }}{% endif %}" class="linkbtn">Export JSON</a>
|
||||
<a href="/api/articles/export?format=csv{% if status_filter %}&amp;status_filter={{ status_filter|urlencode }}{% endif %}" class="linkbtn">Export CSV</a>
|
||||
</form>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Artikel</th><th>Status</th><th>Details</th><th>Rewrite</th><th>Transition</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for a in articles %}
|
||||
<tr>
|
||||
<td>{{ a.id }}</td>
|
||||
<td>
|
||||
<strong>{{ a.title }}</strong><br />
|
||||
<span class="subtle">Autor: {{ a.author or "-" }}</span><br />
|
||||
<span class="subtle">Datum: {{ a.published_at or "-" }} | Alter: {{ a.days_old if a.days_old is not none else "-" }} Tage | Relevanz: {{ a.relevance }}</span><br />
|
||||
<a href="{{ a.source_url }}" target="_blank" rel="noopener">Original öffnen</a>
|
||||
<br /><a href="/admin/articles/{{ a.id }}">Details anzeigen</a>
|
||||
{% if a.canonical_url and a.canonical_url != a.source_url %}
|
||||
<br /><a href="{{ a.canonical_url }}" target="_blank" rel="noopener">Canonical öffnen</a>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td><span class="badge">{{ a.status_ui }}</span></td>
|
||||
<td>
|
||||
<div class="subtle">Publish: {{ "bereit" if a.publish_ready else "blockiert" }}</div>
|
||||
{% if not a.publish_ready and a.publish_blockers %}
|
||||
<div class="subtle">{{ a.publish_blockers|join(", ") }}</div>
|
||||
{% endif %}
|
||||
{% if a.selected_image_url %}
|
||||
<div class="subtle">Hauptbild gesetzt</div>
|
||||
<a href="{{ a.selected_image_url }}" target="_blank" rel="noopener"><img src="{{ a.selected_image_proxy_url }}" data-fallback-src="{{ a.selected_image_url }}" alt="Hauptbild" class="thumb" loading="lazy" onerror="if(!this.dataset.fallbackUsed){this.dataset.fallbackUsed='1';this.src=this.dataset.fallbackSrc;}else{this.classList.add('img-failed');}" /></a>
|
||||
{% endif %}
|
||||
{% if a.summary %}
|
||||
<div><strong>Summary:</strong> {{ a.summary }}</div>
|
||||
{% endif %}
|
||||
{% if a.generated_tags %}
|
||||
<div><strong>Tags:</strong> {{ a.generated_tags|join("; ") }}</div>
|
||||
{% endif %}
|
||||
{% if a.content_raw %}
|
||||
<details>
|
||||
<summary>Volltext anzeigen</summary>
|
||||
<div class="pre">{{ a.content_raw }}</div>
|
||||
</details>
|
||||
{% endif %}
|
||||
<div class="subtle">Bilder: {{ a.extracted_images|length }}</div>
|
||||
{% if a.extracted_images %}
|
||||
<details>
|
||||
<summary>Bild-URLs</summary>
|
||||
<ul>
|
||||
{% for img in a.extracted_images %}
|
||||
<li><a href="{{ img }}" target="_blank" rel="noopener">{{ img }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</details>
|
||||
{% endif %}
|
||||
{% if a.press_contact %}
|
||||
<details>
|
||||
<summary>Pressekontakt</summary>
|
||||
<div class="pre">{{ a.press_contact }}</div>
|
||||
</details>
|
||||
{% endif %}
|
||||
{% if a.extraction_error %}
|
||||
<div class="subtle">Extraktionsfehler: {{ a.extraction_error }}</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if a.status_ui in ["new", "rewrite"] %}
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/rewrite-run" class="inline">
|
||||
<button type="submit">Rewrite ausführen</button>
|
||||
</form>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/transition" class="inline">
|
||||
<select name="target_status">
|
||||
{% for s in allowed_transitions.get(a.status_ui, []) %}
|
||||
<option value="{{ s }}">{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% if allowed_transitions.get(a.status_ui, []) %}
|
||||
<button type="submit" class="secondary">Setzen</button>
|
||||
{% else %}
|
||||
<span class="subtle">keine Aktion</span>
|
||||
{% endif %}
|
||||
</form>
|
||||
{% if a.status_ui == 'close' %}
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/retry" class="inline" style="margin-top:4px;">
|
||||
<button type="submit" title="Artikel auf 'neu' zurücksetzen – wird beim nächsten Pipeline-Lauf erneut verarbeitet">🔄 Wiederholen</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Runs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Typ</th><th>Status</th><th>Start</th><th>Ende</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for r in runs %}
|
||||
<tr>
|
||||
<td>{{ r.id }}</td>
|
||||
<td>{{ r.run_type }}</td>
|
||||
<td>{{ r.status }}</td>
|
||||
<td>{{ r.started_at }}</td>
|
||||
<td>{{ r.finished_at or "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Publish Jobs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Artikel</th><th>Status</th><th>Attempts</th><th>WP Post</th><th>Fehler</th><th>Hinweis</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for j in publish_jobs %}
|
||||
<tr>
|
||||
<td>{{ j.id }}</td>
|
||||
<td>#{{ j.article_id }} {{ j.article_title or "-" }}</td>
|
||||
<td>{{ j.status }}</td>
|
||||
<td>{{ j.attempts }}/{{ j.max_attempts }}</td>
|
||||
<td>
|
||||
{% if j.wp_post_url %}
|
||||
<a href="{{ j.wp_post_url }}" target="_blank" rel="noopener">#{{ j.wp_post_id }}</a>
|
||||
{% elif j.wp_post_id %}
|
||||
#{{ j.wp_post_id }}
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if j.error_message %}
|
||||
<span class="badge errcat errcat-{{ j.error_category }}">{{ j.error_category }}</span>
|
||||
<div>{{ j.error_message }}</div>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ j.error_hint or "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
27
backend/templates/admin_login.html
Normal file
27
backend/templates/admin_login.html
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<main class="container login">
|
||||
<h1>rss-news Admin</h1>
|
||||
<p>Bitte anmelden, um das Tool zu verwalten.</p>
|
||||
{% if error %}
|
||||
<div class="alert">Login fehlgeschlagen. Bitte prüfen.</div>
|
||||
{% endif %}
|
||||
<form method="post" action="/admin/login" class="card">
|
||||
<label>Benutzername
|
||||
<input type="text" name="username" required />
|
||||
</label>
|
||||
<label>Passwort
|
||||
<input type="password" name="password" required />
|
||||
</label>
|
||||
<button type="submit">Anmelden</button>
|
||||
</form>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
143
backend/templates/admin_schedule.html
Normal file
143
backend/templates/admin_schedule.html
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
<style>
|
||||
.schedule-table td, .schedule-table th { padding: 6px 10px; }
|
||||
.slot-free { color: #aaa; font-style: italic; }
|
||||
.slot-booked-db { color: #1a7a1a; font-weight: bold; }
|
||||
.slot-booked-wp { color: #b35a00; font-weight: bold; }
|
||||
.badge-db { background: #d4edda; color: #155724; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
|
||||
.badge-wp { background: #fff3cd; color: #856404; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
|
||||
.summary-bar { display: flex; gap: 1.5rem; margin-bottom: 1rem; font-size: 0.95em; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>rss-news Veröffentlichungsplan</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Dashboard</a>
|
||||
<a class="linkbtn" href="/admin/connectivity">Connectivity</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="card" style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:1rem;">
|
||||
<div>
|
||||
<h2 style="margin:0;">WordPress → DB Synchronisieren</h2>
|
||||
<p class="subtle" style="margin:4px 0 0;">Liest alle geplanten WP-Beiträge und aktualisiert die Slots in der lokalen DB.<br>Nutze dies nach manuellen Änderungen in WordPress.</p>
|
||||
</div>
|
||||
<form method="post" action="/admin/wp-sync">
|
||||
<button type="submit">🔄 Jetzt synchronisieren</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Slot-Übersicht (nächste 60 Tage)</h2>
|
||||
<div class="summary-bar">
|
||||
<span>📅 Belegte Slots gesamt: <strong>{{ slots|length }}</strong></span>
|
||||
<span>🗄️ Aus Pipeline-DB: <strong>{{ slots|selectattr('source', 'eq', 'db')|list|length }}</strong></span>
|
||||
<span>🌐 Nur in WordPress: <strong>{{ slots|selectattr('source', 'eq', 'wordpress')|list|length }}</strong></span>
|
||||
</div>
|
||||
<table class="schedule-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Tag</th>
|
||||
{% for h in hours %}
|
||||
<th>{{ "%02d:00 Uhr"|format(h) }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for day in calendar_days %}
|
||||
{% if day.any_booked %}
|
||||
<tr>
|
||||
<td><strong>{{ day.weekday }}</strong> {{ day.date_fmt }}</td>
|
||||
{% for s in day.slots %}
|
||||
<td>
|
||||
{% if s.booked %}
|
||||
{% set info = s.slot %}
|
||||
{% if info.source == 'db' %}
|
||||
<span class="slot-booked-db">✅</span>
|
||||
<span class="badge-db">DB</span>
|
||||
<div style="font-size:0.85em;">
|
||||
{% if info.article_id %}
|
||||
<a href="/admin/articles/{{ info.article_id }}">
|
||||
{{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %}
|
||||
</a>
|
||||
{% endif %}
|
||||
<br /><span class="subtle">Status: {{ info.article_status or "-" }}</span>
|
||||
{% if info.wp_post_url %}
|
||||
<br /><a href="{{ info.wp_post_url }}" target="_blank" rel="noopener">WP öffnen</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% else %}
|
||||
<span class="slot-booked-wp">⚠️</span>
|
||||
<span class="badge-wp">WP</span>
|
||||
<div style="font-size:0.85em;">{{ info.article_title or "-" }}</div>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<span class="slot-free">frei</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% if not slots %}
|
||||
<p class="subtle">Keine geplanten Beiträge in den nächsten 60 Tagen.</p>
|
||||
{% endif %}
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Alle belegten Slots (Liste)</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Datum/Zeit</th><th>Quelle</th><th>Artikel</th><th>Status</th><th>WordPress</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in slots %}
|
||||
<tr>
|
||||
<td>{{ s.formatted }}</td>
|
||||
<td>
|
||||
{% if s.source == 'db' %}<span class="badge-db">Pipeline-DB</span>
|
||||
{% else %}<span class="badge-wp">WordPress</span>{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if s.article_id %}
|
||||
<a href="/admin/articles/{{ s.article_id }}">{{ (s.article_title or "Artikel")[:60] }}{% if (s.article_title or "")|length > 60 %}…{% endif %}</a>
|
||||
{% else %}
|
||||
{{ s.article_title or "-" }}
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ s.article_status or "-" }}</td>
|
||||
<td>
|
||||
{% if s.wp_post_url %}
|
||||
<a href="{{ s.wp_post_url }}" target="_blank" rel="noopener">Draft öffnen</a>
|
||||
{% else %}-{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
1
backend/tests/__init__.py
Normal file
1
backend/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Tests package."""
|
||||
419
backend/tests/test_admin_ui.py
Normal file
419
backend/tests/test_admin_ui.py
Normal file
|
|
@ -0,0 +1,419 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_source,
|
||||
get_article_by_id,
|
||||
upsert_article,
|
||||
)
|
||||
|
||||
|
||||
class TestAdminUi(unittest.TestCase):
|
||||
def setUp(self) -> None:
    """Prepare an isolated database file, admin credentials and a test client.

    The APP_* variables are applied with ``patch.dict`` so that the previous
    state of ``os.environ`` is restored automatically after the test, even
    when a later step of this method raises.
    """
    self.tmp_dir = tempfile.TemporaryDirectory()
    env_patcher = patch.dict(
        os.environ,
        {
            "APP_DB_PATH": str(Path(self.tmp_dir.name) / "admin_ui.db"),
            "APP_ADMIN_USERNAME": "admin",
            "APP_ADMIN_PASSWORD": "secret",
        },
    )
    env_patcher.start()
    # Cleanups run after tearDown and also when setUp itself fails, so values
    # that existed before the test are put back instead of being dropped.
    self.addCleanup(env_patcher.stop)
    # Drop any cached settings before the database is initialised
    # (presumably get_settings() reads the variables set above - confirm).
    config_module.get_settings.cache_clear()
    init_db()
    self.client = TestClient(app)
|
||||
|
||||
def tearDown(self) -> None:
    """Clear the settings cache, remove the APP_* variables and delete the temp dir."""
    config_module.get_settings.cache_clear()
    for env_name in ("APP_DB_PATH", "APP_ADMIN_USERNAME", "APP_ADMIN_PASSWORD"):
        # A default of None keeps pop() from raising when the variable is already gone.
        os.environ.pop(env_name, None)
    self.tmp_dir.cleanup()
|
||||
|
||||
def test_admin_login_and_dashboard(self) -> None:
    """The login page renders and valid credentials end up on the dashboard."""
    page = self.client.get("/admin/login")
    self.assertEqual(page.status_code, 200)
    self.assertIn("rss-news Admin", page.text)

    credentials = {"username": "admin", "password": "secret"}
    # Redirects are followed, so the response body is the page reached after login.
    dashboard = self.client.post("/admin/login", data=credentials, follow_redirects=True)
    self.assertEqual(dashboard.status_code, 200)
    self.assertIn("Admin Dashboard", dashboard.text)
|
||||
|
||||
def test_dashboard_redirects_if_not_logged_in(self) -> None:
    """A dashboard request without a prior login is answered with a 303 to the login page."""
    response = self.client.get("/admin/dashboard", follow_redirects=False)
    status = response.status_code
    target = response.headers.get("location")
    self.assertEqual(status, 303)
    self.assertEqual(target, "/admin/login")
|
||||
|
||||
def test_create_feed_with_empty_source_id_does_not_error(self) -> None:
    """Creating a feed with a blank source_id redirects back to the dashboard."""
    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)

    # An empty source_id used to cause validation issues in form parsing.
    feed_form = {"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""}
    response = self.client.post("/admin/feeds/create", data=feed_form, follow_redirects=False)

    self.assertEqual(response.status_code, 303)
    redirect_target = response.headers.get("location", "")
    self.assertTrue(redirect_target.startswith("/admin/dashboard"))
|
||||
|
||||
def test_article_detail_page_renders(self) -> None:
    """Render the article detail page and select one of the article's images.

    Seeds a source, a feed and an article in status "new" with one image,
    checks that the detail page renders, posts a "select" image decision and
    verifies that the stored meta_json then mentions "selected_url".
    """
    source_id = create_source(
        SourceCreate(
            name="Test Source",
            base_url="https://example.org",
            terms_url="https://example.org/terms",
            license_name="cc-by",
            risk_level="green",
            is_enabled=True,
            notes=None,
            last_reviewed_at="2026-02-18T00:00:00Z",
        )
    )
    feed_id = create_feed(
        FeedCreate(
            name="Test Feed",
            url="https://example.org/feed.xml",
            source_id=source_id,
            is_enabled=True,
        )
    )
    article_id = upsert_article(
        ArticleUpsert(
            feed_id=feed_id,
            source_article_id="id-1",
            source_hash="hash-1",
            title="Titel A",
            source_url="https://example.org/a",
            canonical_url="https://example.org/a",
            published_at=None,
            author="Autor A",
            summary="Summary A",
            content_raw="Volltext A",
            content_rewritten=None,
            image_urls_json='["https://example.org/img.jpg"]',
            press_contact="Kontakt",
            source_name_snapshot="Test Source",
            source_terms_url_snapshot="https://example.org/terms",
            source_license_name_snapshot="cc-by",
            legal_checked=False,
            legal_checked_at=None,
            legal_note=None,
            wp_post_id=None,
            wp_post_url=None,
            publish_attempts=0,
            publish_last_error=None,
            published_to_wp_at=None,
            word_count=2,
            status="new",
            meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}',
        )
    )

    self.client.post(
        "/admin/login",
        data={"username": "admin", "password": "secret"},
        follow_redirects=True,
    )
    res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True)
    self.assertEqual(res.status_code, 200)
    self.assertIn("Artikel-Detail", res.text)
    self.assertIn("Checkliste", res.text)

    decision = self.client.post(
        f"/admin/articles/{article_id}/images/decision",
        data={"image_url": "https://example.org/img.jpg", "action": "select"},
        follow_redirects=True,
    )
    self.assertEqual(decision.status_code, 200)
    self.assertIn("Ausgewähltes Hauptbild", decision.text)

    article = get_article_by_id(article_id)
    self.assertIsNotNone(article)
    # get(key, "") still yields None when the key exists with a null value;
    # "or" keeps the check a clean assertion failure instead of a TypeError.
    self.assertIn("selected_url", article.get("meta_json") or "")
|
||||
|
||||
def test_manage_source_and_feed(self) -> None:
    """Update and then delete a feed and its source through the admin forms.

    Each of the four form submissions is expected to answer with a 303.
    """
    new_source = SourceCreate(
        name="Edit Source",
        base_url="https://example.org",
        terms_url="https://example.org/terms",
        license_name="cc-by",
        risk_level="yellow",
        is_enabled=True,
        notes=None,
        last_reviewed_at=None,
    )
    source_id = create_source(new_source)
    new_feed = FeedCreate(
        name="Edit Feed",
        url="https://example.org/feed.xml",
        source_id=source_id,
        is_enabled=True,
    )
    feed_id = create_feed(new_feed)

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)

    source_form = {
        "name": "Edit Source 2",
        "base_url": "https://example.org/new",
        "terms_url": "https://example.org/new-terms",
        "license_name": "cc0",
        "risk_level": "green",
        "is_enabled": "1",
        "notes": "ok",
        "last_reviewed_at": "2026-02-21T12:00:00Z",
    }
    source_update = self.client.post(
        f"/admin/sources/{source_id}/update",
        data=source_form,
        follow_redirects=False,
    )
    self.assertEqual(source_update.status_code, 303)

    feed_form = {
        "name": "Edit Feed 2",
        "url": "https://example.org/feed2.xml",
        "source_id": str(source_id),
        "is_enabled": "0",
    }
    feed_update = self.client.post(
        f"/admin/feeds/{feed_id}/update",
        data=feed_form,
        follow_redirects=False,
    )
    self.assertEqual(feed_update.status_code, 303)

    # The feed is removed before the source it references.
    feed_delete = self.client.post(f"/admin/feeds/{feed_id}/delete", follow_redirects=False)
    self.assertEqual(feed_delete.status_code, 303)
    source_delete = self.client.post(f"/admin/sources/{source_id}/delete", follow_redirects=False)
    self.assertEqual(source_delete.status_code, 303)
|
||||
|
||||
def test_rewrite_save_and_reopen(self) -> None:
    """Save a manual rewrite for a published article and reopen it.

    After both form posts the article is expected to be in status "rewrite",
    to contain the newly saved text and to have no wp_post_id any more.
    """
    seeded_source = SourceCreate(
        name="Test Source",
        base_url="https://example.org",
        terms_url="https://example.org/terms",
        license_name="cc-by",
        risk_level="green",
        is_enabled=True,
        notes=None,
        last_reviewed_at="2026-02-18T00:00:00Z",
    )
    source_id = create_source(seeded_source)
    seeded_feed = FeedCreate(
        name="Test Feed",
        url="https://example.org/feed.xml",
        source_id=source_id,
        is_enabled=True,
    )
    feed_id = create_feed(seeded_feed)
    # The article starts out as already published to WordPress (post #123).
    seeded_article = ArticleUpsert(
        feed_id=feed_id,
        source_article_id="id-published",
        source_hash="hash-published",
        title="Titel Published",
        source_url="https://example.org/published",
        canonical_url="https://example.org/published",
        published_at=None,
        author="Autor A",
        summary="Summary",
        content_raw="Raw",
        content_rewritten="<p>Alt</p>",
        image_urls_json=None,
        press_contact=None,
        source_name_snapshot="Test Source",
        source_terms_url_snapshot="https://example.org/terms",
        source_license_name_snapshot="cc-by",
        legal_checked=True,
        legal_checked_at="2026-02-21T10:00:00Z",
        legal_note=None,
        wp_post_id=123,
        wp_post_url="https://example.org/?p=123",
        publish_attempts=2,
        publish_last_error=None,
        published_to_wp_at="2026-02-21T10:10:00Z",
        word_count=1,
        status="published",
        meta_json="{}",
    )
    article_id = upsert_article(seeded_article)

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)

    rewrite_form = {"content_rewritten": "<h2>Neu</h2><p>Text</p>"}
    saved = self.client.post(
        f"/admin/articles/{article_id}/rewrite-save",
        data=rewrite_form,
        follow_redirects=False,
    )
    self.assertEqual(saved.status_code, 303)

    reopened = self.client.post(f"/admin/articles/{article_id}/reopen", follow_redirects=False)
    self.assertEqual(reopened.status_code, 303)

    article = get_article_by_id(article_id)
    self.assertIsNotNone(article)
    self.assertEqual(article.get("status"), "rewrite")
    self.assertIn("Neu", article.get("content_rewritten") or "")
    self.assertIsNone(article.get("wp_post_id"))
|
||||
|
||||
@patch("backend.app.admin_ui.generate_article_tags")
@patch("backend.app.admin_ui.rewrite_article_text")
def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None:
    """Run the admin batch rewrite over one article stored in status ``rewrite``.

    Both the rewrite and the tag generator are mocked. After posting to
    ``/admin/rewrite/run`` the article must be ``approved`` and its meta JSON
    must mention ``generated_tags``.
    """
    mock_rewrite_text.return_value = "<h2>Neu</h2><p>Text</p>"
    mock_tags.return_value = ["Rheingas", "Monheim"]

    source_id = create_source(
        SourceCreate(
            name="Batch Source",
            base_url="https://example.org",
            terms_url="https://example.org/terms",
            license_name="cc-by",
            risk_level="green",
            is_enabled=True,
            notes=None,
            last_reviewed_at=None,
        )
    )
    feed_id = create_feed(
        FeedCreate(
            name="Batch Feed",
            url="https://example.org/feed.xml",
            source_id=source_id,
            is_enabled=True,
        )
    )
    article_id = upsert_article(
        ArticleUpsert(
            feed_id=feed_id,
            source_article_id="batch-1",
            source_hash="batch-hash-1",
            title="Batch Titel",
            source_url="https://example.org/batch",
            canonical_url="https://example.org/batch",
            published_at=None,
            author="Autor",
            summary="Summary",
            content_raw="Raw",
            content_rewritten=None,
            image_urls_json=None,
            press_contact=None,
            source_name_snapshot="Batch Source",
            source_terms_url_snapshot="https://example.org/terms",
            source_license_name_snapshot="cc-by",
            legal_checked=False,
            legal_checked_at=None,
            legal_note=None,
            wp_post_id=None,
            wp_post_url=None,
            publish_attempts=0,
            publish_last_error=None,
            published_to_wp_at=None,
            word_count=1,
            status="rewrite",
            meta_json="{}",
        )
    )

    self.client.post("/admin/login", data={"username": "admin", "password": "secret"}, follow_redirects=True)
    res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False)
    self.assertEqual(res.status_code, 303)

    article = get_article_by_id(article_id)
    self.assertIsNotNone(article)
    self.assertEqual(article.get("status"), "approved")
    # ``.get(key, "")`` only covers a missing key; a stored None would make
    # ``assertIn`` raise TypeError. ``or ""`` turns that into a normal failure.
    self.assertIn("generated_tags", article.get("meta_json") or "")
|
||||
|
||||
@patch("backend.app.admin_ui.urlopen")
def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
    """Fetch an image through the admin proxy with ``urlopen`` stubbed out.

    The stubbed response works as a context manager, reports ``image/jpeg``
    as its content type and returns four bytes from ``read()``. The proxy
    must answer 200 with an ``image/jpeg`` content type.
    """
    jpeg_stub = b"\xff\xd8\xff\xd9"

    class _StubHeaders:
        def get(self, key: str, default=None):
            # Case-insensitive match on the single header the stub knows.
            return "image/jpeg" if key.lower() == "content-type" else default

    class _StubResponse:
        headers = _StubHeaders()

        def read(self):
            return jpeg_stub

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Returning False lets exceptions from the ``with`` body propagate.
            return False

    mock_urlopen.return_value = _StubResponse()

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)

    res = self.client.get("/admin/images/proxy?url=https%3A%2F%2Fexample.org%2Fimg.jpg")
    self.assertEqual(res.status_code, 200)
    content_type = res.headers.get("content-type", "")
    self.assertIn("image/jpeg", content_type)
|
||||
|
||||
@patch("backend.app.admin_ui._run_connectivity_check")
@patch("backend.app.admin_ui._build_connectivity_targets")
def test_connectivity_page_renders(self, mock_targets, mock_check) -> None:
    """Render ``/admin/connectivity`` with target discovery and checks mocked.

    Two targets are configured. The check mock hands out one passing and one
    failing result, in that order. The page must return 200 and contain the
    heading and both target labels.
    """
    openai_target = {"label": "OpenAI API", "kind": "host", "value": "api.openai.com"}
    wordpress_target = {"label": "WordPress REST", "kind": "url", "value": "https://example.org/wp-json/wp/v2"}
    mock_targets.return_value = [openai_target, wordpress_target]

    passing_result = {
        "label": "OpenAI API",
        "kind": "host",
        "target": "api.openai.com",
        "dns_ok": True,
        "dns_info": "1.2.3.4",
        "tcp_ok": True,
        "tcp_info": "port 443 erreichbar",
        "http_ok": True,
        "http_info": "n/a (host-only)",
        "duration_ms": 12,
        "ok": True,
    }
    failing_result = {
        "label": "WordPress REST",
        "kind": "url",
        "target": "https://example.org/wp-json/wp/v2",
        "dns_ok": False,
        "dns_info": "Name or service not known",
        "tcp_ok": False,
        "tcp_info": "-",
        "http_ok": False,
        "http_info": "-",
        "duration_ms": 10,
        "ok": False,
    }
    # A list side_effect yields one item per call, in order.
    mock_check.side_effect = [passing_result, failing_result]

    credentials = {"username": "admin", "password": "secret"}
    self.client.post("/admin/login", data=credentials, follow_redirects=True)

    res = self.client.get("/admin/connectivity", follow_redirects=True)
    self.assertEqual(res.status_code, 200)
    for expected_fragment in ("Connectivity Check", "OpenAI API", "WordPress REST"):
        self.assertIn(expected_fragment, res.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
144
backend/tests/test_api_auth.py
Normal file
144
backend/tests/test_api_auth.py
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestApiAuth(unittest.TestCase):
    """HTTP-level tests for login and the authenticated JSON API endpoints."""

    def setUp(self) -> None:
        """Point the app at a throwaway SQLite file and build a test client.

        Each resource registers its cleanup via ``addCleanup`` as soon as it
        exists. Unlike ``tearDown``, those cleanups also run when a later
        setUp step raises, so no temp directory or environment override leaks.
        Cleanups run in reverse order: settings cache, environment, temp dir.
        """
        # Local import: this test module has no file-level import of ``patch``.
        from unittest.mock import patch

        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        # patch.dict restores any pre-existing values on stop instead of
        # unconditionally deleting the variables.
        env_patch = patch.dict(
            os.environ,
            {
                "APP_DB_PATH": str(Path(self.tmp_dir.name) / "api.db"),
                "APP_ADMIN_USERNAME": "admin",
                "APP_ADMIN_PASSWORD": "secret",
            },
        )
        env_patch.start()
        self.addCleanup(env_patch.stop)

        # Drop cached settings so the overrides above are picked up, and again
        # after the test so they do not outlive it.
        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)

    def test_login_and_protected_endpoint(self) -> None:
        """After a successful login the protected endpoint answers 200 with ``ok``."""
        r = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        self.assertEqual(r.status_code, 200)

        p = self.client.get("/api/protected")
        self.assertEqual(p.status_code, 200)
        self.assertTrue(p.json().get("ok"))

    def test_protected_requires_auth(self) -> None:
        """Without a prior login the protected endpoint answers 401."""
        r = self.client.get("/api/protected")
        self.assertEqual(r.status_code, 401)

    def test_run_detail_endpoint(self) -> None:
        """A run created via POST can be fetched again by its id."""
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        self.assertEqual(login.status_code, 200)

        created = self.client.post("/api/runs", json={"run_type": "ingestion", "status": "running"})
        self.assertEqual(created.status_code, 200)
        run_id = created.json()["id"]

        detail = self.client.get(f"/api/runs/{run_id}")
        self.assertEqual(detail.status_code, 200)
        self.assertEqual(detail.json()["item"]["id"], run_id)

    def test_source_policy_check_endpoint(self) -> None:
        """The policy check for a sparsely filled source reports not-allowed with issues."""
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        self.assertEqual(login.status_code, 200)

        created = self.client.post(
            "/api/sources",
            json={
                "name": "Policy Source",
                "risk_level": "yellow",
                "is_enabled": True,
            },
        )
        self.assertEqual(created.status_code, 200)
        source_id = created.json()["id"]

        check = self.client.get(f"/api/sources/{source_id}/policy-check")
        self.assertEqual(check.status_code, 200)
        body = check.json()
        self.assertFalse(body["allowed"])
        self.assertGreaterEqual(len(body["issues"]), 1)

    def test_articles_export_json_and_csv_contains_relevance(self) -> None:
        """Both export formats expose ``published_at``, ``days_old`` and ``relevance``."""
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        self.assertEqual(login.status_code, 200)

        source = self.client.post(
            "/api/sources",
            json={
                "name": "Export Source",
                "base_url": "https://example.org",
                "terms_url": "https://example.org/terms",
                "license_name": "cc-by",
                "risk_level": "green",
                "is_enabled": True,
                "last_reviewed_at": "2026-02-18T00:00:00Z",
            },
        )
        self.assertEqual(source.status_code, 200)
        source_id = source.json()["id"]

        feed = self.client.post(
            "/api/feeds",
            json={"name": "Export Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True},
        )
        self.assertEqual(feed.status_code, 200)
        feed_id = feed.json()["id"]

        article = self.client.post(
            "/api/articles/upsert",
            json={
                "feed_id": feed_id,
                "source_article_id": "exp-1",
                "source_hash": "exp-hash-1",
                "title": "Export Artikel",
                "source_url": "https://example.org/article/1",
                "canonical_url": "https://example.org/article/1",
                "published_at": "2026-02-18T00:00:00Z",
                "author": "Autor",
                "summary": "Kurz",
                "content_raw": "Langtext",
                "image_urls_json": "[\"https://example.org/img.jpg\"]",
                "press_contact": "Kontakt",
                "source_name_snapshot": "Export Source",
                "source_terms_url_snapshot": "https://example.org/terms",
                "source_license_name_snapshot": "cc-by",
                "status": "review",
            },
        )
        self.assertEqual(article.status_code, 200)

        export_json = self.client.get("/api/articles/export?format=json")
        self.assertEqual(export_json.status_code, 200)
        body = export_json.json()
        self.assertTrue(body.get("ok"))
        self.assertGreaterEqual(body.get("count", 0), 1)
        first = body["items"][0]
        self.assertIn("published_at", first)
        self.assertIn("days_old", first)
        self.assertIn("relevance", first)

        export_csv = self.client.get("/api/articles/export?format=csv")
        self.assertEqual(export_csv.status_code, 200)
        self.assertIn("text/csv", export_csv.headers.get("content-type", ""))
        csv_text = export_csv.text
        self.assertIn("published_at", csv_text)
        self.assertIn("days_old", csv_text)
        self.assertIn("relevance", csv_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
110
backend/tests/test_article_workflow.py
Normal file
110
backend/tests/test_article_workflow.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestArticleWorkflow(unittest.TestCase):
    """API tests for article status transitions and the single-article rewrite run."""

    def setUp(self) -> None:
        """Create an isolated database, a test client and an authenticated session.

        Cleanups are registered with ``addCleanup`` right after each resource
        is created, so they run even when a later setUp step fails
        (``tearDown`` would be skipped in that case).
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        # patch.dict puts back any pre-existing values when stopped.
        env_patch = patch.dict(
            os.environ,
            {
                "APP_DB_PATH": str(Path(self.tmp_dir.name) / "workflow.db"),
                "APP_ADMIN_USERNAME": "admin",
                "APP_ADMIN_PASSWORD": "secret",
            },
        )
        env_patch.start()
        self.addCleanup(env_patch.stop)

        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        # Fail here with a clear message rather than with 401s in every test.
        self.assertEqual(login.status_code, 200, login.text)

    def _create_article(self) -> int:
        """Create a source, a feed and one article in status ``new``; return the article id."""
        source = self.client.post(
            "/api/sources",
            json={
                "name": "Workflow Source",
                "base_url": "https://example.org",
                "terms_url": "https://example.org/terms",
                "license_name": "cc-by",
                "risk_level": "green",
                "is_enabled": True,
                "last_reviewed_at": "2026-02-18T00:00:00Z",
            },
        )
        # Checking each response reports the failing request itself instead of
        # a KeyError on the ``["id"]`` lookup below.
        self.assertEqual(source.status_code, 200, source.text)
        source_id = source.json()["id"]

        feed = self.client.post(
            "/api/feeds",
            json={"name": "Workflow Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True},
        )
        self.assertEqual(feed.status_code, 200, feed.text)
        feed_id = feed.json()["id"]

        article = self.client.post(
            "/api/articles/upsert",
            json={
                "feed_id": feed_id,
                "source_article_id": "wf-1",
                "source_url": "https://example.org/a1",
                "title": "Workflow Artikel",
                "summary": "s",
                "content_raw": "c",
                "status": "new",
            },
        )
        self.assertEqual(article.status_code, 200, article.text)
        return article.json()["id"]

    def test_valid_transition_chain(self) -> None:
        """new -> rewrite -> publish -> published -> rewrite is accepted step by step."""
        article_id = self._create_article()

        t1 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"})
        self.assertEqual(t1.status_code, 200)

        t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "publish"})
        self.assertEqual(t2.status_code, 200)

        t3 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"})
        self.assertEqual(t3.status_code, 200)

        t4 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"})
        self.assertEqual(t4.status_code, 200)

        final = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(final.status_code, 200)
        self.assertEqual(final.json()["item"]["status"], "rewrite")
        self.assertEqual(final.json()["item"]["status_ui"], "rewrite")

    def test_invalid_transition_rejected(self) -> None:
        """Jumping from ``new`` straight to ``published`` is rejected with 400."""
        article_id = self._create_article()
        bad = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"})
        self.assertEqual(bad.status_code, 400)

    def test_legacy_review_endpoint_is_gone(self) -> None:
        """The old review endpoint answers 410."""
        article_id = self._create_article()
        bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"})
        self.assertEqual(bad.status_code, 410)

    @patch("backend.app.main.rewrite_article_text")
    def test_rewrite_run_sets_publish_status(self, mock_rewrite) -> None:
        """A rewrite run (rewriter mocked) reports ``publish`` and the article shows it as ``status_ui``."""
        mock_rewrite.return_value = "<h2>Neu</h2><p>Umschreibung</p>"
        article_id = self._create_article()

        moved = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "rewrite"})
        # Same transition as the first step of test_valid_transition_chain;
        # verify it so a failure here is not misattributed to the rewrite run.
        self.assertEqual(moved.status_code, 200, moved.text)

        r = self.client.post(f"/api/articles/{article_id}/rewrite-run")
        self.assertEqual(r.status_code, 200)
        self.assertEqual(r.json()["status"], "publish")

        final = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(final.json()["item"]["status_ui"], "publish")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
145
backend/tests/test_db_repositories.py
Normal file
145
backend/tests/test_db_repositories.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
RunCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_run,
|
||||
create_source,
|
||||
finish_run,
|
||||
list_articles,
|
||||
list_feeds,
|
||||
list_runs,
|
||||
list_sources,
|
||||
upsert_article,
|
||||
)
|
||||
|
||||
|
||||
class TestSQLiteRepositories(unittest.TestCase):
    """Repository-level round trip against a temporary SQLite database."""

    def setUp(self) -> None:
        """Initialise a fresh database file inside a temporary directory.

        Cleanup is registered through ``addCleanup`` so it still runs when
        ``init_db()`` raises; ``tearDown`` is skipped after a setUp failure.
        """
        # Local import: this test module has no file-level import of ``patch``.
        from unittest.mock import patch

        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)
        self.db_path = str(Path(self.tmp_dir.name) / "test.db")

        # patch.dict restores a pre-existing APP_DB_PATH instead of deleting it.
        env_patch = patch.dict(os.environ, {"APP_DB_PATH": self.db_path})
        env_patch.start()
        self.addCleanup(env_patch.stop)

        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)
        init_db()

    def test_end_to_end_basic_crud(self) -> None:
        """Create source, feed, run and article; a second upsert must update the same row."""
        source_id = create_source(
            SourceCreate(
                name="GovData",
                base_url="https://data.gov.de",
                terms_url="https://www.govdata.de/dl-de/by-2-0",
                license_name="dl-de/by-2-0",
                risk_level="green",
                is_enabled=True,
                notes="test source",
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        self.assertGreater(source_id, 0)

        feed_id = create_feed(
            FeedCreate(
                name="GovData RSS",
                url="https://example.org/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )
        self.assertGreater(feed_id, 0)

        run_id = create_run(RunCreate(run_type="ingest", status="running", details="start"))
        self.assertGreater(run_id, 0)
        finish_run(run_id=run_id, status="success", details="ok")

        article_id = upsert_article(
            ArticleUpsert(
                feed_id=feed_id,
                source_article_id="abc-1",
                source_hash="hash-abc-1",
                title="Beispielartikel",
                source_url="https://example.org/articles/1",
                canonical_url="https://example.org/articles/1",
                published_at="2026-02-18T00:00:00Z",
                author="Max Mustermann",
                summary="Kurzfassung",
                content_raw="Originaltext",
                content_rewritten="Umschreibung",
                image_urls_json='["https://example.org/img.jpg"]',
                press_contact="Pressekontakt X",
                source_name_snapshot="GovData",
                source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0",
                source_license_name_snapshot="dl-de/by-2-0",
                legal_checked=False,
                legal_checked_at=None,
                legal_note=None,
                wp_post_id=None,
                wp_post_url=None,
                publish_attempts=0,
                publish_last_error=None,
                published_to_wp_at=None,
                word_count=120,
                status="review",
                meta_json='{"lang":"de"}',
            )
        )
        self.assertGreater(article_id, 0)

        # Upsert with same source_url updates same row
        article_id_2 = upsert_article(
            ArticleUpsert(
                feed_id=feed_id,
                source_article_id="abc-1",
                source_hash="hash-abc-1",
                title="Beispielartikel aktualisiert",
                source_url="https://example.org/articles/1",
                canonical_url="https://example.org/articles/1",
                published_at="2026-02-18T00:00:00Z",
                author="Max Mustermann",
                summary="Kurzfassung 2",
                content_raw="Originaltext 2",
                content_rewritten="Umschreibung 2",
                image_urls_json='["https://example.org/img2.jpg"]',
                press_contact="Pressekontakt Y",
                source_name_snapshot="GovData",
                source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0",
                source_license_name_snapshot="dl-de/by-2-0",
                legal_checked=True,
                legal_checked_at="2026-02-18T00:10:00Z",
                legal_note="ok",
                wp_post_id=123,
                wp_post_url="https://example.org/wp/123",
                publish_attempts=1,
                publish_last_error=None,
                published_to_wp_at="2026-02-18T00:12:00Z",
                word_count=140,
                status="approved",
                meta_json='{"lang":"de","v":2}',
            )
        )
        self.assertEqual(article_id, article_id_2)

        # Exactly one row of each kind must exist after the second upsert.
        self.assertEqual(len(list_sources()), 1)
        self.assertEqual(len(list_feeds()), 1)
        self.assertEqual(len(list_runs()), 1)

        articles = list_articles()
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert")
        self.assertEqual(articles[0]["status"], "approved")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
245
backend/tests/test_ingestion.py
Normal file
245
backend/tests/test_ingestion.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.ingestion import run_ingestion
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_source,
|
||||
get_article_by_id,
|
||||
list_articles,
|
||||
upsert_article,
|
||||
)
|
||||
from backend.app.source_extraction import ExtractedArticle
|
||||
|
||||
|
||||
class TestIngestion(unittest.TestCase):
    """Tests for ``run_ingestion`` with feed parsing and article extraction mocked."""

    def setUp(self) -> None:
        """Create an isolated database holding one enabled source and one feed.

        Cleanups go through ``addCleanup`` so they also run when a later setUp
        step raises; ``tearDown`` would be skipped in that case.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        # patch.dict restores a pre-existing APP_DB_PATH instead of deleting it.
        env_patch = patch.dict(
            os.environ,
            {"APP_DB_PATH": str(Path(self.tmp_dir.name) / "ingestion.db")},
        )
        env_patch.start()
        self.addCleanup(env_patch.stop)

        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)
        init_db()

        source_id = create_source(
            SourceCreate(
                name="Test Source",
                base_url="https://example.org",
                terms_url="https://example.org/terms",
                license_name="cc-by",
                risk_level="green",
                is_enabled=True,
                notes=None,
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        self.feed_id = create_feed(
            FeedCreate(
                name="Test Feed",
                url="https://example.org/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None:
        """Two feed entries sharing one id produce a single stored article."""
        mock_extract_article.return_value = ExtractedArticle(
            title="Artikel 1 original",
            author="Autorin A",
            canonical_url="https://example.org/article/1",
            summary="Original Summary",
            content_text="Original Volltext",
            images=["https://example.org/a.jpg"],
            press_contact="Pressekontakt: Team A",
            extraction_error=None,
        )
        mock_parse.return_value = {
            "etag": "etag-1",
            "modified": "Tue, 18 Feb 2026 10:00:00 GMT",
            "entries": [
                {
                    "id": "item-1",
                    "title": "Artikel 1",
                    "link": "https://example.org/article/1",
                    "summary": "A",
                },
                {
                    "id": "item-1",
                    "title": "Artikel 1 aktualisiert",
                    "link": "https://example.org/article/1-neu",
                    "summary": "B",
                },
            ],
        }

        stats = run_ingestion(feed_id=self.feed_id)
        self.assertEqual(stats.status, "success")
        self.assertEqual(stats.entries_seen, 2)

        # Query once and reuse the result for the count and field checks.
        articles = list_articles()
        self.assertEqual(len(articles), 1)
        article = articles[0]
        self.assertEqual(article["title"], "Artikel 1 original")
        self.assertEqual(article["author"], "Autorin A")
        self.assertIn("Original Volltext", article["content_raw"] or "")
        self.assertIn("Pressekontakt", article["meta_json"] or "")
        self.assertIsNotNone(article["image_urls_json"])

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_processes_any_enabled_source(self, mock_parse, mock_extract_article) -> None:
        """A feed whose source has ``risk_level="yellow"`` is still parsed."""
        # Traffic-light/risk-level system removed – all enabled feeds are processed regardless of risk_level
        source_id = create_source(
            SourceCreate(
                name="Any Risk Source",
                base_url="https://example.net",
                terms_url="https://example.net/terms",
                license_name="custom",
                risk_level="yellow",
                is_enabled=True,
                notes=None,
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        feed_id = create_feed(
            FeedCreate(
                name="Any Risk Feed",
                url="https://example.net/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )

        # Throwaway attribute-style stand-ins for an empty parse result and
        # an empty extraction result.
        mock_parse.return_value = type("FP", (), {"entries": [], "etag": None, "modified": None})()
        mock_extract_article.return_value = type("E", (), {
            "title": None, "author": None, "summary": None, "content_text": None,
            "canonical_url": None, "images": [], "press_contact": None,
        })()

        stats = run_ingestion(feed_id=feed_id)
        self.assertEqual(stats.status, "success")
        # Feed was processed (feedparser was called), even with yellow risk_level
        mock_parse.assert_called_once()

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None:
        """Re-ingesting known entries keeps status, rewrite text and WordPress ids intact."""
        existing_closed_id = upsert_article(
            ArticleUpsert(
                feed_id=self.feed_id,
                source_article_id="closed-1",
                source_hash="closed-hash-1",
                title="Alt Closed",
                source_url="https://example.org/closed-article",
                canonical_url="https://example.org/closed-article",
                published_at=None,
                author="Autor",
                summary="Alt",
                content_raw="Alt Raw",
                content_rewritten="<p>Alt Rewrite Closed</p>",
                image_urls_json=None,
                press_contact="Kontakt Alt",
                source_name_snapshot="Test Source",
                source_terms_url_snapshot="https://example.org/terms",
                source_license_name_snapshot="cc-by",
                legal_checked=False,
                legal_checked_at=None,
                legal_note=None,
                wp_post_id=42,
                wp_post_url="https://wp.local/?p=42",
                publish_attempts=2,
                publish_last_error=None,
                published_to_wp_at="2026-02-21T12:00:00Z",
                word_count=3,
                status="error",  # UI: close
                meta_json='{"generated_tags":["AltTag"]}',
            )
        )
        existing_published_id = upsert_article(
            ArticleUpsert(
                feed_id=self.feed_id,
                source_article_id="published-1",
                source_hash="published-hash-1",
                title="Alt Published",
                source_url="https://example.org/published-article",
                canonical_url="https://example.org/published-article",
                published_at=None,
                author="Autor",
                summary="Alt",
                content_raw="Alt Raw",
                content_rewritten="<p>Alt Rewrite Published</p>",
                image_urls_json=None,
                press_contact="Kontakt Alt",
                source_name_snapshot="Test Source",
                source_terms_url_snapshot="https://example.org/terms",
                source_license_name_snapshot="cc-by",
                legal_checked=False,
                legal_checked_at=None,
                legal_note=None,
                wp_post_id=77,
                wp_post_url="https://wp.local/?p=77",
                publish_attempts=3,
                publish_last_error=None,
                published_to_wp_at="2026-02-21T12:10:00Z",
                word_count=3,
                status="published",
                meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}',
            )
        )

        mock_extract_article.return_value = ExtractedArticle(
            title="Neu Titel",
            author="Neu Autor",
            canonical_url=None,
            summary="Neu Summary",
            content_text="Neu Volltext",
            images=["https://example.org/a.jpg"],
            press_contact=None,
            extraction_error=None,
        )
        mock_parse.return_value = {
            "etag": "etag-2",
            "modified": "Tue, 18 Feb 2026 11:00:00 GMT",
            "entries": [
                {
                    "id": "closed-1",
                    "title": "Closed Entry",
                    "link": "https://example.org/closed-article",
                    "summary": "X",
                },
                {
                    "id": "published-1",
                    "title": "Published Entry",
                    "link": "https://example.org/published-article",
                    "summary": "Y",
                },
            ],
        }

        stats = run_ingestion(feed_id=self.feed_id)
        self.assertEqual(stats.status, "success")

        closed_row = get_article_by_id(existing_closed_id)
        # Assert presence explicitly: a vanished row should be a readable
        # failure, not a KeyError from indexing an empty fallback dict.
        self.assertIsNotNone(closed_row)
        self.assertEqual(closed_row["status"], "error")
        self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "")
        self.assertEqual(closed_row.get("wp_post_id"), 42)

        published_row = get_article_by_id(existing_published_id)
        self.assertIsNotNone(published_row)
        self.assertEqual(published_row["status"], "published")
        self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "")
        self.assertEqual(published_row.get("wp_post_id"), 77)
        self.assertIn("generated_tags", published_row.get("meta_json") or "")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
112
backend/tests/test_publisher.py
Normal file
112
backend/tests/test_publisher.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestPublisher(unittest.TestCase):
    """API tests for enqueueing and running WordPress publish jobs."""

    def setUp(self) -> None:
        """Configure app and WordPress settings via env, init the DB and log in.

        Cleanups are registered with ``addCleanup`` immediately after each
        resource is created, so they run even when a later setUp step fails
        (``tearDown`` would not be called then).
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        # patch.dict restores pre-existing values on stop instead of deleting them.
        env_patch = patch.dict(
            os.environ,
            {
                "APP_DB_PATH": str(Path(self.tmp_dir.name) / "publisher.db"),
                "APP_ADMIN_USERNAME": "admin",
                "APP_ADMIN_PASSWORD": "secret",
                "WORDPRESS_BASE_URL": "https://example.org",
                "WORDPRESS_USERNAME": "wp-user",
                "WORDPRESS_APP_PASSWORD": "wp-pass",
            },
        )
        env_patch.start()
        self.addCleanup(env_patch.stop)

        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        # Fail here with a clear message rather than with 401s in the test body.
        self.assertEqual(login.status_code, 200, login.text)

    def _create_publishable_article(self) -> int:
        """Create source, feed and an ``approved``, legally checked article; return its id."""
        source = self.client.post(
            "/api/sources",
            json={
                "name": "WP Source",
                "base_url": "https://example.org",
                "terms_url": "https://example.org/terms",
                "license_name": "cc-by",
                "risk_level": "green",
                "is_enabled": True,
                "last_reviewed_at": "2026-02-18T00:00:00Z",
            },
        )
        # Checking each response reports the failing request itself instead of
        # a KeyError on the ``["id"]`` lookup below.
        self.assertEqual(source.status_code, 200, source.text)
        source_id = source.json()["id"]

        feed = self.client.post(
            "/api/feeds",
            json={"name": "WP Feed", "url": "https://example.org/feed.xml", "source_id": source_id, "is_enabled": True},
        )
        self.assertEqual(feed.status_code, 200, feed.text)
        feed_id = feed.json()["id"]

        article = self.client.post(
            "/api/articles/upsert",
            json={
                "feed_id": feed_id,
                "source_article_id": "pub-1",
                "source_hash": "pub-hash-1",
                "title": "Publish Artikel",
                "source_url": "https://example.org/article/1",
                "canonical_url": "https://example.org/article/1",
                "published_at": "2026-02-18T00:00:00Z",
                "author": "Autor",
                "summary": "Kurz",
                "content_raw": "Langtext",
                "image_urls_json": "[\"https://example.org/img.jpg\"]",
                "press_contact": "Kontakt",
                "source_name_snapshot": "WP Source",
                "source_terms_url_snapshot": "https://example.org/terms",
                "source_license_name_snapshot": "cc-by",
                "legal_checked": True,
                "status": "approved",
                "meta_json": "{\"image_review\":{\"selected_url\":\"https://example.org/img.jpg\"}}",
            },
        )
        self.assertEqual(article.status_code, 200, article.text)
        return article.json()["id"]

    @patch("backend.app.publisher.publish_article_draft")
    def test_enqueue_and_run_publisher(self, mock_publish) -> None:
        """Enqueue one article, run the publisher (WordPress call mocked) and check the result."""
        mock_publish.return_value = (777, "https://example.org/?p=777")
        article_id = self._create_publishable_article()

        enqueue = self.client.post("/api/publisher/enqueue", json={"article_id": article_id, "max_attempts": 3})
        self.assertEqual(enqueue.status_code, 200)

        run = self.client.post("/api/publisher/run", json={"max_jobs": 5})
        self.assertEqual(run.status_code, 200)
        stats = run.json()["stats"]
        self.assertEqual(stats["success"], 1)

        # The article must now carry the id/URL returned by the mocked publish call.
        article = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(article.status_code, 200)
        item = article.json()["item"]
        self.assertEqual(item["status"], "published")
        self.assertEqual(item["wp_post_id"], 777)
        self.assertIn("?p=777", item["wp_post_url"] or "")

        jobs = self.client.get("/api/publisher/jobs")
        self.assertEqual(jobs.status_code, 200)
        self.assertGreaterEqual(len(jobs.json()["items"]), 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
21
backend/tests/test_relevance.py
Normal file
21
backend/tests/test_relevance.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from datetime import datetime, timezone
|
||||
import unittest
|
||||
|
||||
from backend.app.relevance import article_age_days, article_relevance
|
||||
|
||||
|
||||
class TestRelevance(unittest.TestCase):
    """Checks ``article_age_days`` and ``article_relevance`` against a fixed clock."""

    def test_article_age_and_relevance(self) -> None:
        """Verify age in days and relevance label for several publication dates."""
        # A fixed, timezone-aware "now" keeps the expectations deterministic.
        now = datetime(2026, 2, 18, 12, 0, 0, tzinfo=timezone.utc)

        # Published two hours before "now": zero days old, labelled "hoch".
        self.assertEqual(article_age_days("2026-02-18T10:00:00Z", now=now), 0)
        self.assertEqual(article_relevance("2026-02-18T10:00:00Z", now=now), "hoch")

        # Published exactly four days before "now": labelled "mittel".
        self.assertEqual(article_age_days("2026-02-14T12:00:00Z", now=now), 4)
        self.assertEqual(article_relevance("2026-02-14T12:00:00Z", now=now), "mittel")

        # A date months in the past is "alt"; a missing date is "unbekannt".
        self.assertEqual(article_relevance("2025-12-01T00:00:00Z", now=now), "alt")
        self.assertEqual(article_relevance(None, now=now), "unbekannt")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
96
backend/tests/test_source_extraction.py
Normal file
96
backend/tests/test_source_extraction.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app.source_extraction import extract_article
|
||||
|
||||
|
||||
SAMPLE_HTML = """
|
||||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="Demo Meldung von Presseportal" />
|
||||
<meta name="author" content="Max Mustermann" />
|
||||
<meta name="description" content="Kurzbeschreibung aus der Originalseite" />
|
||||
<meta property="og:image" content="/images/demo.jpg" />
|
||||
<link rel="canonical" href="https://www.presseportal.de/pm/118273/6158137" />
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Dies ist der vollstaendige Inhalt des Artikels.</p>
|
||||
<p>Weitere relevante Informationen fuer die Meldung.</p>
|
||||
<h3>Pressekontakt</h3>
|
||||
<p>Musterfirma GmbH, Kontakt: presse@example.org</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
SAMPLE_HTML_AGENTUR = """
|
||||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="Demo Meldung Agentur" />
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Inhalt der Meldung.</p>
|
||||
<h3>Agentur</h3>
|
||||
<p>Agenturname GmbH</p>
|
||||
<p>presse@agentur.example</p>
|
||||
<p>Original-Content von Beispiel</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class _FakeHeaders:
    """Minimal headers stand-in that only answers the charset query."""

    @staticmethod
    def get_content_charset() -> str:
        """Return the charset reported for the canned response body."""
        return "utf-8"
|
||||
|
||||
|
||||
class _FakeResponse:
    """Canned response object: context manager exposing ``headers`` and ``read()``."""

    # Shared by all instances; the fake headers object holds no state.
    headers = _FakeHeaders()

    def __init__(self, body: str):
        """Store ``body`` encoded as UTF-8 bytes."""
        self._body = body.encode("utf-8")

    def read(self) -> bytes:
        """Return the complete encoded body."""
        return self._body

    def __enter__(self):
        """Return the response itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Return False so exceptions raised inside the ``with`` body propagate."""
        return False
|
||||
|
||||
|
||||
class TestSourceExtraction(unittest.TestCase):
    """Tests ``extract_article`` against canned HTML; ``urlopen`` is patched out."""

    @patch("backend.app.source_extraction.urlopen")
    def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None:
        """Title, author, canonical URL, text, summary, image and contact are extracted."""
        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML)

        extracted = extract_article("https://www.presseportal.de/pm/118273/6158137")
        self.assertEqual(extracted.title, "Demo Meldung von Presseportal")
        self.assertEqual(extracted.author, "Max Mustermann")
        self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137")
        self.assertIn("vollstaendige Inhalt", extracted.content_text or "")
        self.assertIn("Kurzbeschreibung", extracted.summary or "")
        # The og:image path in the sample is relative; an absolute URL is expected.
        self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images)
        self.assertIn("Pressekontakt", extracted.press_contact or "")
        self.assertIsNone(extracted.extraction_error)

    @patch("backend.app.source_extraction.urlopen")
    def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None:
        """An "Agentur" section is reported as press contact, including its lines."""
        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR)
        extracted = extract_article("https://www.presseportal.de/pm/155103/6210401")
        self.assertIn("Agentur", extracted.press_contact or "")
        self.assertIn("Agenturname", extracted.press_contact or "")
        self.assertIn("presse@agentur.example", extracted.press_contact or "")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
139
backend/tests/test_wordpress.py
Normal file
139
backend/tests/test_wordpress.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
import os
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.wordpress import publish_article_draft
|
||||
|
||||
|
||||
class TestWordpressPublish(unittest.TestCase):
    """Tests ``publish_article_draft`` with the WordPress request helpers patched."""

    def setUp(self) -> None:
        """Expose WordPress credentials through the environment for one test."""
        # patch.dict restores the previous environment on stop(), so values
        # that existed before the test are not lost.
        env = patch.dict(
            os.environ,
            {
                "WORDPRESS_BASE_URL": "https://example.org",
                "WORDPRESS_USERNAME": "wp-user",
                "WORDPRESS_APP_PASSWORD": "wp-pass",
            },
        )
        env.start()
        # Cleanups run in reverse registration order: the environment is
        # restored first, then the settings cache is cleared.
        self.addCleanup(config_module.get_settings.cache_clear)
        self.addCleanup(env.stop)
        config_module.get_settings.cache_clear()

    @patch("backend.app.wordpress._upload_featured_media")
    @patch("backend.app.wordpress._wp_request")
    def test_publish_sets_featured_media_when_selected_image_exists(self, mock_wp_request, mock_upload_media) -> None:
        """A selected image is uploaded and referenced as ``featured_media``."""
        mock_upload_media.return_value = 456
        mock_wp_request.return_value = {"id": 321, "link": "https://example.org/?p=321"}

        article = {
            "title": "Testartikel",
            "content_raw": "Inhalt",
            "source_url": "https://example.com/source",
            "canonical_url": "https://example.com/source",
            "meta_json": '{"image_review":{"selected_url":"https://example.com/image.jpg"}}',
        }
        post_id, post_url = publish_article_draft(article)

        self.assertEqual(post_id, 321)
        self.assertIn("?p=321", post_url or "")
        mock_upload_media.assert_called()
        payload = mock_wp_request.call_args.kwargs["payload"]
        self.assertEqual(payload.get("featured_media"), 456)
        self.assertIn("<!-- wp:paragraph -->", payload.get("content", ""))
        self.assertIn("<p>Inhalt</p>", payload.get("content", ""))
        self.assertNotIn("excerpt", payload)

    @patch("backend.app.wordpress._upload_featured_media")
    @patch("backend.app.wordpress._wp_request")
    def test_publish_without_selected_image_has_no_featured_media(self, mock_wp_request, mock_upload_media) -> None:
        """Without a selected image nothing is uploaded and no media id is sent."""
        mock_wp_request.return_value = {"id": 654, "link": "https://example.org/?p=654"}

        article = {
            "title": "Testartikel",
            "content_raw": "Inhalt",
            "source_url": "https://example.com/source",
            "canonical_url": "https://example.com/source",
            "meta_json": "{}",
        }
        post_id, _ = publish_article_draft(article)

        self.assertEqual(post_id, 654)
        mock_upload_media.assert_not_called()
        payload = mock_wp_request.call_args.kwargs["payload"]
        self.assertNotIn("featured_media", payload)
        self.assertIn("<p>Inhalt</p>", payload.get("content", ""))

    @patch("backend.app.wordpress._upload_featured_media")
    @patch("backend.app.wordpress._wp_request")
    def test_publish_strips_feed_header_and_press_contact(self, mock_wp_request, mock_upload_media) -> None:
        """Feed header lines and the press-contact line are absent from the post body."""
        mock_wp_request.return_value = {"id": 100, "link": "https://example.org/?p=100"}
        article = {
            "title": "Header Test",
            "content_raw": "21.02.2026 10:00\nFirma GmbH\n(ots)\nDas ist der eigentliche Text.\nPressekontakt: Test Person",
            "source_url": "https://example.com/source",
            "canonical_url": "https://example.com/source",
            "meta_json": "{}",
        }
        publish_article_draft(article)
        payload = mock_wp_request.call_args.kwargs["payload"]
        content = payload.get("content", "")
        self.assertNotIn("Firma GmbH", content)
        self.assertNotIn("Pressekontakt", content)
        self.assertIn("eigentliche Text", content)
        self.assertNotIn("Artikeldetails", content)

    @patch("backend.app.wordpress._upload_featured_media")
    @patch("backend.app.wordpress._wp_request")
    def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None:
        """Generated tag names are resolved to ids (found or created) and sent with the post."""

        def _fake_wp_request(**kwargs):
            """Answer tag search, tag creation and post creation with canned data."""
            endpoint = kwargs.get("endpoint", "")
            method = kwargs.get("method", "")
            if method == "GET" and endpoint.startswith("tags?search="):
                # Only "Rheingas" already exists; every other search is empty.
                if "Rheingas" in endpoint:
                    return [{"id": 11, "name": "Rheingas"}]
                return []
            if method == "POST" and endpoint == "tags":
                name = (kwargs.get("payload") or {}).get("name")
                if name == "Gasflasche":
                    return {"id": 12, "name": "Gasflasche"}
                return {"id": 13, "name": str(name)}
            if method == "POST" and endpoint == "posts":
                return {"id": 900, "link": "https://example.org/?p=900"}
            return {}

        mock_wp_request.side_effect = _fake_wp_request
        article = {
            "title": "Tag Test",
            "content_raw": "Inhalt",
            "source_url": "https://example.com/source",
            "canonical_url": "https://example.com/source",
            "meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}',
        }
        post_id, _ = publish_article_draft(article)
        self.assertEqual(post_id, 900)
        post_calls = [call for call in mock_wp_request.call_args_list if call.kwargs.get("endpoint") == "posts"]
        self.assertEqual(len(post_calls), 1)
        payload = post_calls[0].kwargs.get("payload", {})
        # 11 is the existing tag, 12 the newly created one.
        self.assertEqual(payload.get("tags"), [11, 12])

    @patch("backend.app.wordpress._upload_featured_media")
    @patch("backend.app.wordpress._wp_request")
    def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None:
        """Rewritten HTML becomes heading/paragraph/list blocks, not a raw HTML block."""
        mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"}
        article = {
            "title": "Block Test",
            "content_rewritten": "<h2>Überschrift</h2><p>Absatz 1</p><ul><li>A</li><li>B</li></ul>",
            "source_url": "https://example.com/source",
            "canonical_url": "https://example.com/source",
            "meta_json": "{}",
        }
        publish_article_draft(article)
        payload = mock_wp_request.call_args.kwargs["payload"]
        content = payload.get("content", "")
        self.assertIn("<!-- wp:heading", content)
        self.assertIn("<!-- wp:paragraph -->", content)
        self.assertIn("<!-- wp:list -->", content)
        self.assertNotIn("<!-- wp:html -->", content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
8213
data/articles.json
8213
data/articles.json
File diff suppressed because one or more lines are too long
|
|
@ -1,3 +1,42 @@
|
|||
[
|
||||
"https://www.camping-news.de/rss/"
|
||||
{
|
||||
"url": "https://www.camping-news.de/rss/",
|
||||
"name": "Camping News"
|
||||
},
|
||||
{
|
||||
"url": "https://www.promobil.de/rss/news",
|
||||
"name": "Promobil News"
|
||||
},
|
||||
{
|
||||
"url": "https://www.promobil.de/rss/ratgeber",
|
||||
"name": "Promobil Ratgeber"
|
||||
},
|
||||
{
|
||||
"url": "https://www.presseportal.de/rss/rss2_vts.htx?q=camping&langid=1 ",
|
||||
"name": "Presseportal Camping"
|
||||
},
|
||||
{
|
||||
"url": "https://www.presseportal.de/rss/rss2_vts.htx?q=wohnmobil&langid=1 ",
|
||||
"name": "Presseportal Wohnmobil"
|
||||
},
|
||||
{
|
||||
"url": "https://caravan-news.de/rss/schlagzeilen.php",
|
||||
"name": "Caravan News"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/16793724126187652294",
|
||||
"name": "Google Campingplatz"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/987500860911797305",
|
||||
"name": "Google VanLife"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/4770194054838089856",
|
||||
"name": "Google Camping Termine"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.com/alerts/feeds/03077836356662926441/14685692393152596493",
|
||||
"name": "Google Camping Messe 2025"
|
||||
}
|
||||
]
|
||||
190
docs/AUTOMATION.md
Normal file
190
docs/AUTOMATION.md
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
# Automatischer Pipeline-Betrieb
|
||||
|
||||
## Überblick
|
||||
|
||||
Das System läuft vollautomatisch und benötigt nur noch gelegentliche Telegram-Interaktion.
|
||||
|
||||
```
|
||||
N8N (2× täglich, 08:00 + 16:00 Uhr)
|
||||
└─► POST /api/n8n/pipeline (X-API-Key Header)
|
||||
├── RSS Ingestion (alle aktivierten Feeds)
|
||||
├── Relevanz-Score per GPT (0–100)
|
||||
│ ├── Score ≥ 80 → Rewrite + WP-Draft + Telegram
|
||||
│ ├── Score 60–79 → Telegram-Warnung + manueller Override möglich
|
||||
│ └── Score < 60 → Abgelehnt + tägliche Telegram-Liste
|
||||
└── Pipeline-Zusammenfassung via Telegram
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Einrichtung
|
||||
|
||||
### 1. Umgebungsvariablen setzen
|
||||
|
||||
Kopiere `backend/.env.example` nach `backend/.env` und fülle alle Felder aus:
|
||||
|
||||
```bash
|
||||
cp backend/.env.example backend/.env
|
||||
nano backend/.env
|
||||
```
|
||||
|
||||
Wichtige Variablen:
|
||||
|
||||
| Variable | Beschreibung |
|
||||
|----------|-------------|
|
||||
| `TELEGRAM_BOT_TOKEN` | Bot-Token von @BotFather |
|
||||
| `TELEGRAM_CHAT_ID` | Deine persönliche Chat-ID |
|
||||
| `TELEGRAM_WEBHOOK_SECRET` | Zufälliger String (≥ 20 Zeichen) |
|
||||
| `N8N_API_KEY` | Starker zufälliger API-Key |
|
||||
| `OPENAI_API_KEY` | OpenAI API-Key |
|
||||
| `WORDPRESS_BASE_URL` | WordPress-URL |
|
||||
| `WORDPRESS_USERNAME` | WordPress-Benutzername |
|
||||
| `WORDPRESS_APP_PASSWORD` | WordPress App-Passwort |
|
||||
|
||||
### 2. Telegram-Webhook registrieren
|
||||
|
||||
Nach dem Deployment einmalig aufrufen:
|
||||
|
||||
```bash
|
||||
curl -X POST https://news.vanityontour.de/api/telegram/setup-webhook \
|
||||
-H "Cookie: rss_news_session=<dein-session-token>"
|
||||
```
|
||||
|
||||
Oder über die Admin-UI: Settings → Telegram Webhook einrichten.
|
||||
|
||||
### 3. N8N Workflow einrichten
|
||||
|
||||
In N8N einen neuen Workflow erstellen:
|
||||
|
||||
**Trigger:** Cron
|
||||
- Zeitplan 1: `0 8 * * *` (täglich 08:00)
|
||||
- Zeitplan 2: `0 16 * * *` (täglich 16:00)
|
||||
|
||||
**Aktion:** HTTP Request
|
||||
- Method: `POST`
|
||||
- URL: `https://news.vanityontour.de/api/n8n/pipeline`
|
||||
- Header: `X-API-Key: <dein-n8n-api-key>`
|
||||
|
||||
**Fehlerbehandlung:** Bei HTTP-Fehler → E-Mail/Telegram-Alert
|
||||
|
||||
---
|
||||
|
||||
## Telegram-Befehle
|
||||
|
||||
| Befehl | Funktion |
|
||||
|--------|----------|
|
||||
| `/run` | Pipeline manuell starten |
|
||||
| `/rejected` | Abgelehnte Artikel der letzten 3 Tage anzeigen |
|
||||
| `/status` | Aktuellen Pipeline-Status |
|
||||
| `/help` | Alle Befehle anzeigen |
|
||||
|
||||
---
|
||||
|
||||
## Telegram-Benachrichtigungen
|
||||
|
||||
### Neuer Draft erstellt
|
||||
Wenn ein Artikel erfolgreich verarbeitet wurde:
|
||||
|
||||
```
|
||||
✅ Neuer Draft erstellt
|
||||
📰 [Artikel-Titel]
|
||||
🟢 Relevanz-Score: 87/100
|
||||
📅 Vorgeschlagene Veröffentlichung: Di, 24.03.2026 um 09:00 Uhr
|
||||
🏷 #VanLife #Camping #Wohnmobil
|
||||
🔗 Draft in WordPress öffnen
|
||||
|
||||
[✏️ Neu schreiben] [❌ Verwerfen]
|
||||
```
|
||||
|
||||
### Relevanz-Warnung (Score 60–79)
|
||||
```
|
||||
⚠️ Artikel mit niedrigem Relevanz-Score
|
||||
📰 [Artikel-Titel]
|
||||
🟡 Score: 72/100
|
||||
💬 Artikel behandelt hauptsächlich...
|
||||
🔗 Originalartikel
|
||||
|
||||
[➕ Trotzdem verarbeiten] [❌ Ablehnen]
|
||||
```
|
||||
|
||||
### Abgelehnte Artikel (Ende jedes Runs)
|
||||
Liste aller abgelehnten Artikel mit Override-Buttons für jeden einzelnen.
|
||||
|
||||
---
|
||||
|
||||
## Relevanz-Score
|
||||
|
||||
Der GPT-basierte Score bewertet die Themenrelevanz für den VanLife/Camping-Blog:
|
||||
|
||||
| Score | Aktion |
|
||||
|-------|--------|
|
||||
| 80–100 | Automatisch verarbeiten |
|
||||
| 60–79 | Telegram-Warnung, manueller Override |
|
||||
| 0–59 | Automatisch abgelehnt |
|
||||
|
||||
Themen, die hoch bewertet werden: Campingplätze, Stellplätze, Wohnmobile, Van-Ausbau,
|
||||
Outdoor-Equipment, Wandern, Naturreisen, Roadtrips, Camping-Tipps.
|
||||
|
||||
Schwellwerte sind in `.env` konfigurierbar:
|
||||
```
|
||||
PIPELINE_RELEVANCE_AUTO=80
|
||||
PIPELINE_RELEVANCE_WARN=60
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Veröffentlichungsplan
|
||||
|
||||
- Maximal **2 Beiträge pro Tag**
|
||||
- Bevorzugte Zeiten: **09:00 und 14:00 Uhr** (CET)
|
||||
- Gleichmäßig über die Woche verteilt
|
||||
- Der Vorschlag erscheint in der Telegram-Nachricht
|
||||
- Manuell in WordPress setzen oder über WP Scheduling-Plugin automatisieren
|
||||
|
||||
Einstellbar via:
|
||||
```
|
||||
PIPELINE_MAX_DRAFTS_PER_DAY=2
|
||||
PIPELINE_PUBLISH_HOURS=9,14
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API-Endpunkte (N8N / extern)
|
||||
|
||||
Alle externen Endpunkte benötigen den Header `X-API-Key: <N8N_API_KEY>`.
|
||||
|
||||
| Methode | Endpunkt | Funktion |
|
||||
|---------|----------|----------|
|
||||
| `POST` | `/api/n8n/pipeline` | Komplette Pipeline starten |
|
||||
| `POST` | `/api/n8n/ingest` | Nur RSS-Import (ohne Rewrite) |
|
||||
|
||||
---
|
||||
|
||||
## Deployment (Hetzner via GitHub)
|
||||
|
||||
Das Deployment läuft automatisch über GitHub Actions beim Push auf `main`:
|
||||
|
||||
1. GitHub Action führt Tests aus
|
||||
2. Bei Erfolg: SSH-Deploy auf Hetzner
|
||||
3. `pip install -r requirements.txt`
|
||||
4. Systemd-Dienst `rss-app` neu starten
|
||||
|
||||
Workflow-Dateien: `.github/workflows/test.yml` und `.github/workflows/deploy.yml`
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Pipeline läuft, aber keine Telegram-Nachrichten:**
|
||||
- `TELEGRAM_BOT_TOKEN` und `TELEGRAM_CHAT_ID` prüfen
|
||||
- Webhook-Status prüfen: `GET https://api.telegram.org/bot<TOKEN>/getWebhookInfo`
|
||||
|
||||
**N8N bekommt 401:**
|
||||
- `N8N_API_KEY` in `.env` und N8N-Workflow-Header müssen übereinstimmen
|
||||
|
||||
**Alle Artikel werden abgelehnt:**
|
||||
- `PIPELINE_RELEVANCE_WARN` temporär auf 40 senken zum Testen
|
||||
- Über `/rejected` + Override-Button manuell testen
|
||||
|
||||
**Artikel werden doppelt importiert:**
|
||||
- Deduplication läuft über `source_url` (eindeutig). Bereits verarbeitete Artikel werden nie erneut als Draft angelegt.
|
||||
91
docs/PROJECT_PLAN.md
Normal file
91
docs/PROJECT_PLAN.md
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# Projektplan (Neustart)
|
||||
|
||||
## Leitentscheidungen
|
||||
- Bestehendes Repository wird weiterverwendet.
|
||||
- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern.
|
||||
- Hetzner bleibt Laufzeitplattform.
|
||||
- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation.
|
||||
- Auth initial nur mit einem User/Password.
|
||||
|
||||
## Zielbild
|
||||
Eine modulare News-Pipeline mit klaren Stufen:
|
||||
1. Feed-Ingestion
|
||||
2. Inhaltsanalyse und Normalisierung
|
||||
3. Rewrite/Anreicherung
|
||||
4. Legal- und Qualitaetschecks
|
||||
5. WordPress-Publikation (Draft-first, Queue + Retry)
|
||||
6. Monitoring/Logging
|
||||
|
||||
## Grobe Zeitplanung (ohne Fixtermine)
|
||||
- Phase 0: ca. 1 Woche
|
||||
- Phase 1: ca. 2-4 Wochen
|
||||
- Phase 2: ca. 2-3 Wochen
|
||||
- Phase 3: fortlaufend
|
||||
|
||||
## Phasen
|
||||
|
||||
### Phase 0 - Grundlagen (jetzt)
|
||||
- Doku und Wiki strukturieren
|
||||
- Source-Policy definieren
|
||||
- Redirect fuer `news.vanityontour.de` setzen
|
||||
- GitHub Project als zentrale Planung scharfstellen
|
||||
|
||||
### Phase 1 - MVP Core
|
||||
- Neues FastAPI-Projektgeruest
|
||||
- SQLite-Datenmodell (feeds, articles, runs, source_policy)
|
||||
- Feed-Import mit Duplikaterkennung
|
||||
- Admin-Login (ein User)
|
||||
- Manuelle Review vor Publish
|
||||
- Admin-UI fuer Rechtscheck, Bildauswahl, Relevanzbewertung
|
||||
|
||||
### Phase 2 - Automation
|
||||
- Job-Queue (asynchron)
|
||||
- Regelbasierte Scheduler
|
||||
- Retry/Dead-Letter-Handling
|
||||
- Robustes Error-Reporting
|
||||
- WordPress-Publisher (Draft) mit Mapping `article_id -> wp_post_id`
|
||||
|
||||
### Phase 3 - Compliance und Skalierung
|
||||
- Source-Whitelisting mit Pflichtfeldern
|
||||
- Pflicht-Attribution pro Artikel
|
||||
- Qualitaetsmetriken und Audit-Logs
|
||||
- Optional: Passkey/WebAuthn
|
||||
|
||||
## Aktueller Stand (Snapshot)
|
||||
- Backend/API + Admin-UI lauffaehig
|
||||
- Feed-Ingestion inkl. Originalartikel-Extraktion (Autor, Pressekontakt, Bilder)
|
||||
- Bildkuration:
|
||||
- automatische Scoring-Reduktion (u. a. Presseportal `story_big` priorisiert)
|
||||
- manuelle Auswahl/Ausblendung im UI
|
||||
- Rechts-/Publish-Gates aktiv:
|
||||
- `legal_checked` Pflicht
|
||||
- Hauptbild-Auswahl Pflicht
|
||||
- Status-Workflow bis `published`
|
||||
- WordPress-Publishing:
|
||||
- Queue + Retry + Job-Historie
|
||||
- Draft-Erstellung/Update erfolgreich getestet
|
||||
- Exporte:
|
||||
- JSON/CSV inkl. Datum/Alter/Relevanz + Attribution/Legal-Felder
|
||||
|
||||
## Naechste Iteration (konkret)
|
||||
1. WordPress `featured_media` Upload aus ausgewaehltem Hauptbild
|
||||
2. Publish-HTML je Artikel verfeinern (strukturierter Body + konsistenter Quellenblock)
|
||||
3. Publisher als periodischen Worker (Timer/Cron/Systemd) auf Hetzner betreiben
|
||||
4. Monitoring/Alerting fuer Queue-Fehler + WP-API Fehlercodes
|
||||
|
||||
## Architekturprinzipien
|
||||
- Idempotente Jobs
|
||||
- Trennung von UI, API, Worker
|
||||
- Strikte Validierung bei Quell-/Lizenzdaten
|
||||
- Expliziter Publish-Schritt, kein blindes Autoposting
|
||||
|
||||
## Risiken
|
||||
- Lizenz-/Nutzungsbedingungen je Quelle variieren stark
|
||||
- Feeds aendern Struktur/Verfuegbarkeit
|
||||
- WordPress-API und Auth koennen regressionsanfaellig sein
|
||||
|
||||
## Erfolgsmetriken
|
||||
- Zeit von Feed-Eingang bis Review-Ready
|
||||
- Quote sauber attribuierter Artikel
|
||||
- Fehlerrate pro Pipeline-Stufe
|
||||
- Anzahl manueller Eingriffe pro Woche
|
||||
81
docs/SOURCE_POLICY.md
Normal file
81
docs/SOURCE_POLICY.md
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
# Source Policy und Feed-Vorschlaege
|
||||
|
||||
## Grundsatz
|
||||
Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt.
|
||||
|
||||
## Pflichtdaten pro Quelle
|
||||
- Quellname
|
||||
- Feed-URL
|
||||
- Originalartikel-URL
|
||||
- Autor/Herausgeber (wenn vorhanden)
|
||||
- Lizenz/Nutzungsgrundlage
|
||||
- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung)
|
||||
- Datum der letzten Pruefung
|
||||
- Link auf Nutzungsbedingungen
|
||||
|
||||
## Einstufung (Ampel)
|
||||
- Gruen: Nutzung fuer geplantes Modell klar erlaubt
|
||||
- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich
|
||||
- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag
|
||||
|
||||
## Verbindliche Regeln
|
||||
- Keine neue Quelle ohne Eintrag im Source-Register
|
||||
- Kein automatischer Publish bei Gelb/Rot
|
||||
- Bilder separat pruefen (Textrecht != Bildrecht)
|
||||
- Quartalsweiser Re-Check der Terms
|
||||
|
||||
## Ersteinschaetzung (Stand: 16.02.2026)
|
||||
|
||||
### Rot
|
||||
1. Reuters / Thomson Reuters
|
||||
- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung.
|
||||
- Folge: Nur mit explizitem Vertrag/Lizenz.
|
||||
- Referenz:
|
||||
- https://www.thomsonreuters.com/en/terms-of-use
|
||||
|
||||
2. tagesschau.de RSS
|
||||
- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert).
|
||||
- Folge: Nicht fuer das geplante Modell geeignet.
|
||||
- Referenz:
|
||||
- https://www.tagesschau.de/infoservices/rssfeeds
|
||||
|
||||
### Gelb
|
||||
1. Presseportal / ots
|
||||
- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung.
|
||||
- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte).
|
||||
- Referenz:
|
||||
- https://www.presseportal.de/nutzungsbedingungen
|
||||
- https://www.presseportal.de/feeds/
|
||||
|
||||
2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz
|
||||
- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert.
|
||||
- Folge: Je Behoerde einzeln pruefen und dokumentieren.
|
||||
- Beispiele:
|
||||
- https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html
|
||||
- https://bmas.bund.de/EN/Services/RSS/rss.html
|
||||
|
||||
### Gruen (mit korrekter Attribution)
|
||||
1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0`
|
||||
- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen).
|
||||
- Folge: Sehr gut fuer stabile Automatisierung geeignet.
|
||||
- Referenz:
|
||||
- https://www.govdata.de/dl-de/by-2-0
|
||||
- https://data.gov.de/informationen/lizenzen
|
||||
- https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0
|
||||
|
||||
2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel
|
||||
- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet.
|
||||
- Folge: Geeignet, wenn Drittinhalte ausgenommen werden.
|
||||
- Referenz:
|
||||
- https://commission.europa.eu/legal-notice_en
|
||||
- https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html
|
||||
|
||||
## Quelle im Register freischalten (Definition of Done)
|
||||
- Terms-Link hinterlegt
|
||||
- Lizenzklasse (Gruen/Gelb/Rot) gesetzt
|
||||
- Pflicht-Attribution dokumentiert
|
||||
- Bildrechtsregel dokumentiert
|
||||
- Letzte Pruefung und Verantwortlicher gepflegt
|
||||
|
||||
## Hinweis
|
||||
Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Prüfung sinnvoll.
|
||||
38
docs/TODO.md
Normal file
38
docs/TODO.md
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# ToDo (Ein-Entwickler Setup)
|
||||
|
||||
## Jetzt
|
||||
- [ ] WordPress Beitragsbild-Upload implementieren (`featured_media` aus ausgewaehltem Hauptbild)
|
||||
- [ ] WordPress-HTML-Ausgabe pro Artikel weiter verbessern (sauberes Layout, Quellenblock, Shortcodes falls noetig)
|
||||
- [ ] Publisher Fehlertexte fuer WP-Auth/Media/API in UI klarer darstellen
|
||||
- [ ] End-to-end Publish Smoke-Test dokumentieren (lokal + Hetzner)
|
||||
|
||||
## MVP
|
||||
- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI)
|
||||
- [x] Datenmodell in SQLite anlegen
|
||||
- [x] Feed-Ingestion Service bauen (ETag/Last-Modified)
|
||||
- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash
|
||||
- [x] Login mit 1 Admin-Account implementieren
|
||||
- [x] Artikel-Review-Maske mit Statusworkflow
|
||||
- [x] WordPress-Publisher als separaten Service implementieren (Queue + Retry + Mapping)
|
||||
- [x] Bildvorschau + manuelle Bildauswahl im Admin-UI
|
||||
- [x] Automatische Bildreduktion/Scoring fuer Presseportal-Quellen
|
||||
- [x] Artikel-Datum + Relevanzscore im UI/Export
|
||||
|
||||
## Recht/Qualitaet
|
||||
- [x] Source-Policy in DB + Admin-UI abbilden
|
||||
- [x] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise)
|
||||
- [x] Auto-Block bei fehlender Lizenzinfo
|
||||
- [x] Pro Artikel Attribution-Block generieren
|
||||
- [x] Manuelle Rechtsfreigabe als Publish-Gate
|
||||
|
||||
## Betrieb
|
||||
- [ ] Systemd-Service(s) fuer API/Worker erstellen
|
||||
- [ ] Nginx-Routing fuer neue App einrichten
|
||||
- [ ] Healthcheck-Endpunkte + Monitoring einrichten
|
||||
- [ ] Backup/Restore fuer DB dokumentieren
|
||||
|
||||
## Spaeter
|
||||
- [ ] Passkey/WebAuthn evaluieren und optional einfuehren
|
||||
- [ ] Migration auf PostgreSQL bewerten
|
||||
- [ ] Teilautomatische Freigabe-Regeln definieren
|
||||
- [ ] KI-Rewrite mit Prompt-Versionierung + Qualitaetsmetriken wieder aktivieren
|
||||
37
docs/roadmap-image-dedup.md
Normal file
37
docs/roadmap-image-dedup.md
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# Roadmap: Bild-Deduplizierung & Medien-Hygiene
|
||||
|
||||
## Ziele
|
||||
- Speicherverbrauch reduzieren
|
||||
- Medienbestand konsistent halten
|
||||
- Pipeline stabilisieren (keine Mehrfach-Uploads und -Speicherungen)
|
||||
|
||||
## Vorgehen (sicher und reversibel)
|
||||
1. **Index aufbauen (Read-Only):**
|
||||
- Alle Bilder (`.jpg/.jpeg/.png/.webp/.gif`) in definierten Verzeichnissen scannen
|
||||
- Für jede Datei: `sha256` (Byte-Hash) + `pHash` (perzeptuell) berechnen
|
||||
- Ergebnis als SQLite-Index + CSV-Report speichern
|
||||
|
||||
2. **Kanonisierung & Referenzen prüfen:**
|
||||
- Pro Duplikatgruppe genau **eine** kanonische Datei wählen (größte/neueste)
|
||||
- Alle internen Referenzen (DB/JSON) testweise auf Kanon aktualisieren (Dry-Run)
|
||||
|
||||
3. **Speicher sparen ohne Risiko:**
|
||||
- Nicht-kanonische Dateien durch **Hardlinks** auf den Kanon ersetzen (gleiches FS)
|
||||
- Alternativ: nur löschen, wenn Referenzen **sicher** auf Kanon zeigen
|
||||
|
||||
4. **Prävention für die Zukunft:**
|
||||
- Beim Speichern: **Content-Addressed Storage** (`<sha256>.<ext>`)
|
||||
- In DB ein `content_hash`-Feld mit **Unique-Constraint**
|
||||
- Vor jedem Speichern/Upload: Hash lookup → vorhandene Datei wiederverwenden
|
||||
|
||||
## Akzeptanzkriterien
|
||||
- Report listet alle Duplikatgruppen mit Pfaden und Größenersparnis
|
||||
- Dry-Run zeigt geplante Änderungen ohne Schreibzugriff
|
||||
- Nach „Anwenden“ verweisen alle Referenzen auf die kanonische Datei
|
||||
- Re-Run findet **keine** Duplikate mehr (idempotent)
|
||||
- Rollback möglich via Backup der Reports/Indexdatei
|
||||
|
||||
## Metriken
|
||||
- Anzahl Bilder vorher/nachher
|
||||
- Ersparter Speicher (MB/GB)
|
||||
- Anzahl gruppierter Duplikate
|
||||
29
docs/wiki/Architektur.md
Normal file
29
docs/wiki/Architektur.md
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
# Architektur
|
||||
|
||||
## Zielarchitektur
|
||||
- API: FastAPI
|
||||
- Worker: Queue-basierte Hintergrundjobs
|
||||
- DB: SQLite (MVP), spaeter optional PostgreSQL
|
||||
- Publisher: WordPress REST API
|
||||
- Frontend/Admin: schlanke Web-UI mit Login
|
||||
|
||||
## Pipeline
|
||||
1. Feed Fetch
|
||||
2. Parse + Normalize
|
||||
3. Deduplicate
|
||||
4. Enrichment (Rewrite/Tags)
|
||||
5. Legal/Policy Check
|
||||
6. Publish (pending)
|
||||
|
||||
## Datenobjekte (MVP)
|
||||
- `sources`
|
||||
- `feeds`
|
||||
- `articles`
|
||||
- `article_versions`
|
||||
- `runs`
|
||||
- `policy_checks`
|
||||
|
||||
## Nichtziele (MVP)
|
||||
- Multi-User und Rollen
|
||||
- Vollautomatische Freigabe ohne Review
|
||||
- Komplexe externe SSO-Integration
|
||||
20
docs/wiki/Deployment.md
Normal file
20
docs/wiki/Deployment.md
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Deployment (Hetzner + CloudPanel)
|
||||
|
||||
## Umgebung
|
||||
- Host: Hetzner
|
||||
- Reverse Proxy: Nginx via CloudPanel
|
||||
- Ziel-Domain: `news.vanityontour.de`
|
||||
|
||||
## Aktueller Zustand
|
||||
- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet.
|
||||
|
||||
## Zielzustand
|
||||
- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`)
|
||||
- API/Worker laufen als systemd-Services
|
||||
- TLS bleibt ueber CloudPanel/Nginx
|
||||
|
||||
## Mindest-Checks nach Deployment
|
||||
- `curl -I https://news.vanityontour.de`
|
||||
- Login erreichbar
|
||||
- Feed-Import laeuft
|
||||
- WordPress-Testpublikation (pending) erfolgreich
|
||||
19
docs/wiki/Home.md
Normal file
19
docs/wiki/Home.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Wiki Home
|
||||
|
||||
## Zweck
|
||||
Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`.
|
||||
|
||||
## Inhalte
|
||||
- `Architektur.md`
|
||||
- `Deployment.md`
|
||||
- `Security-Auth.md`
|
||||
- `Recht-Quellen.md`
|
||||
- `Operations-Runbook.md`
|
||||
- `Roadmap.md`
|
||||
- `Project-Board.md`
|
||||
|
||||
## Projektsteuerung
|
||||
- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1
|
||||
|
||||
## Prinzip
|
||||
Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert.
|
||||
43
docs/wiki/Operations-Runbook.md
Normal file
43
docs/wiki/Operations-Runbook.md
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# Operations Runbook
|
||||
|
||||
## Daily Checks
|
||||
- App erreichbar
|
||||
- Queue/Worker aktiv
|
||||
- Letzte Feed-Laeufe erfolgreich
|
||||
- Keine auffaelligen Fehler im Log
|
||||
|
||||
## Incident: Feed-Import faellt aus
|
||||
1. RSS-Quelle erreichbar?
|
||||
2. Parser-Fehler im Log?
|
||||
3. Rate Limits oder Blockaden?
|
||||
4. Retry-Queue pruefen
|
||||
|
||||
## Incident: WordPress Publish faellt aus
|
||||
1. WP API erreichbar?
|
||||
2. Credentials gueltig?
|
||||
3. Payload-Validation/Tag-Fehler?
|
||||
4. Artikel in `pending` statt `failed` markieren, wenn unklar
|
||||
|
||||
## Incident: Telegram-Buttons reagieren nicht / Befehle ignoriert
|
||||
|
||||
**Ursache:** N8N "App Release - Telegram Bot"-Workflow hat den Webhook überschrieben.
|
||||
|
||||
**Prüfen:**
|
||||
```bash
|
||||
# TELEGRAM_BOT_TOKEN vorher aus der Server-.env laden – Token nie im Repo ablegen (bereits veröffentlichte Tokens rotieren)
curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/getWebhookInfo" | python3 -m json.tool
|
||||
```
|
||||
→ `url` muss auf `https://news.vanityontour.de/telegram/webhook` zeigen
|
||||
→ `allowed_updates` muss `["message", "callback_query"]` enthalten
|
||||
|
||||
**Webhook zurücksetzen:**
|
||||
```bash
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url":"https://news.vanityontour.de/telegram/webhook","allowed_updates":["message","callback_query"],"secret_token":"'"${TELEGRAM_WEBHOOK_SECRET}"'"}'
|
||||
```
|
||||
|
||||
Vollständige Dokumentation: `projects/webhook/telegram-webhook-reset.md`
|
||||
|
||||
## Backups
|
||||
- SQLite-Dump taeglich
|
||||
- Konfiguration und `.env` sicher sichern
|
||||
28
docs/wiki/Project-Board.md
Normal file
28
docs/wiki/Project-Board.md
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Project Board Workflow
|
||||
|
||||
## Zentrale Steuerung
|
||||
- Board: https://github.com/users/OliverGiertz/projects/3/views/1
|
||||
- Board ist die einzige Quelle fuer Planungsstatus.
|
||||
|
||||
## Arbeitsmodus (1 Entwickler)
|
||||
- Neue Arbeit immer als Issue anlegen
|
||||
- Issue direkt ins Project aufnehmen
|
||||
- Status nur im Project pflegen
|
||||
- PR/Commit auf Issue referenzieren
|
||||
|
||||
## Empfohlene Status-Disziplin
|
||||
- `Todo`: noch nicht begonnen
|
||||
- `In Progress`: aktiv in Arbeit
|
||||
- `Done`: umgesetzt und dokumentiert
|
||||
|
||||
## Konventionen fuer Issues
|
||||
- Prefix fuer Klarheit:
|
||||
- `[MVP]`
|
||||
- `[Infra]`
|
||||
- `[Legal]`
|
||||
- `[Bug]`
|
||||
- Definition of Done in jedem Issue notieren
|
||||
|
||||
## Aktueller Backlog-Hinweis
|
||||
- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User).
|
||||
- Entsprechende Issues als `deferred` oder `closed` kennzeichnen.
|
||||
35
docs/wiki/Recht-Quellen.md
Normal file
35
docs/wiki/Recht-Quellen.md
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Recht und Quellen
|
||||
|
||||
## Grundregeln
|
||||
- Nur freigegebene Quellen aus Source-Register
|
||||
- Pflicht-Attribution pro Artikel
|
||||
- Rechte fuer Bilder separat pruefen
|
||||
- Kein Autopublish bei unklarer Lizenz
|
||||
|
||||
## Bewertungsmodell
|
||||
- Gruen: Freie Nachnutzung klar erlaubt
|
||||
- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung
|
||||
- Rot: Ohne Zusatzlizenz nicht geeignet
|
||||
|
||||
## Aktuelle Referenzen
|
||||
- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use
|
||||
- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen
|
||||
- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds
|
||||
- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0
|
||||
- GovData Lizenzen: https://data.gov.de/informationen/lizenzen
|
||||
- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en
|
||||
|
||||
## Review-Checkliste je Quelle
|
||||
1. Sind Bearbeitung und Veroeffentlichung erlaubt?
|
||||
2. Ist kommerzielle Nutzung erlaubt?
|
||||
3. Gibt es gesonderte Bildrechte?
|
||||
4. Ist die Quellenangabe vorgeschrieben?
|
||||
5. Gibt es Archivierungs- oder Weitergabebeschraenkungen?
|
||||
|
||||
## Operativer Schutz
|
||||
- Source-Register als Pflicht vor Feed-Aktivierung
|
||||
- Auto-Block bei fehlenden Lizenzdaten
|
||||
- Quartalsweiser Terms-Recheck
|
||||
|
||||
## Hinweis
|
||||
Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren.
|
||||
19
docs/wiki/Roadmap.md
Normal file
19
docs/wiki/Roadmap.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Roadmap
|
||||
|
||||
## Jetzt
|
||||
- Doku und Projektstruktur bereinigen
|
||||
- Redirect aktiv
|
||||
- Backlog auf Neustart ausrichten
|
||||
|
||||
## Naechster Schritt
|
||||
- FastAPI-MVP implementieren
|
||||
- Login + Feed-Ingestion + Review + WordPress pending
|
||||
|
||||
## Danach
|
||||
- Worker/Queue
|
||||
- Source-Policy Enforcement
|
||||
- Monitoring/Reporting
|
||||
- Optional Passkey
|
||||
|
||||
## Steuerung
|
||||
Alle Arbeitsitems liegen im GitHub Project #3.
|
||||
16
docs/wiki/Security-Auth.md
Normal file
16
docs/wiki/Security-Auth.md
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Security und Auth
|
||||
|
||||
## Mindestanforderungen
|
||||
- Zugriff auf die WebApp nur mit Login
|
||||
- Ein aktiver Admin-User (kein Rollenmodell im MVP)
|
||||
- Passwort nicht im Repo, nur als Secret auf Server
|
||||
|
||||
## Empfohlene Umsetzung
|
||||
- Session-basierte Auth (HTTP-only Cookies)
|
||||
- Passwort gehasht (Argon2 oder bcrypt)
|
||||
- Rate Limiting auf Login-Endpunkt
|
||||
- CSRF-Schutz fuer Form-Aktionen
|
||||
|
||||
## Spaeter (optional)
|
||||
- Passkey/WebAuthn als zusaetzlicher Login-Faktor
|
||||
- IP-Allowlist fuer Admin-Zugang
|
||||
20
internal/git.sh
Normal file
20
internal/git.sh
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Aktuellen Stand vom main/master holen
|
||||
git checkout main
|
||||
git pull origin main
|
||||
|
||||
# Neuen Feature-Branch erstellen
|
||||
git checkout -b feature/neue-funktion
|
||||
|
||||
# Entwickeln und committen
|
||||
git add .
|
||||
git commit -m "Neue Funktion implementiert"
|
||||
|
||||
# Branch auf Remote-Repository pushen
|
||||
git push -u origin feature/neue-funktion
|
||||
|
||||
|
||||
# Alle Branches anzeigen
|
||||
git branch -a
|
||||
|
||||
# Aktuellen Branch anzeigen
|
||||
git branch --show-current
|
||||
6694
logs/rss_tool.log
6694
logs/rss_tool.log
File diff suppressed because it is too large
Load diff
535
main.py
535
main.py
|
|
@ -7,8 +7,12 @@ from bs4 import BeautifulSoup
|
|||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
import logging
|
||||
from utils.image_extractor import extract_images_with_metadata
|
||||
import openai
|
||||
from utils.image_extractor import extract_images_with_metadata
|
||||
from utils.article_extractor import extract_full_article
|
||||
from utils.wordpress_uploader import upload_articles_to_wordpress
|
||||
import hashlib
|
||||
import time
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
|
@ -16,161 +20,474 @@ load_dotenv()
|
|||
log_dir = "logs"
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
log_file = os.path.join(log_dir, "rss_tool.log")
|
||||
|
||||
# Logging-Format verbessern
|
||||
logging.basicConfig(
|
||||
filename=log_file,
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(log_file, encoding='utf-8'),
|
||||
logging.StreamHandler() # Auch in Konsole ausgeben
|
||||
]
|
||||
)
|
||||
|
||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
ARTICLES_FILE = "data/articles.json"
|
||||
FEEDS_FILE = "data/feeds.json"
|
||||
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
|
||||
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash", "WordPress Pending"]
|
||||
|
||||
# === Datenordner erstellen ===
|
||||
os.makedirs("data", exist_ok=True)
|
||||
|
||||
def generate_article_id(title, link, date):
    """Return a deterministic hex ID for an article.

    The title, link and date are joined with underscores, encoded as UTF-8
    and hashed with MD5. The hash is used purely as a fingerprint, not for
    any security purpose.

    Args:
        title: Article title.
        link: Article URL.
        date: Publication date as delivered by the feed.

    Returns:
        The 32-character hexadecimal MD5 digest of ``"<title>_<link>_<date>"``.
    """
    fingerprint = "{}_{}_{}".format(title, link, date)
    digest = hashlib.md5()
    digest.update(fingerprint.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
def is_duplicate_article(new_article, existing_articles):
    """Check whether an article is already present in a list of articles.

    An article counts as a duplicate when either
      * its link is non-empty and exactly equals the link of an existing
        article, or
      * its lower-cased title and an existing title have a word-set
        similarity strictly greater than 0.9 (see ``calculate_similarity``).

    Args:
        new_article: Dict describing the candidate article; the "title" and
            "link" keys are read.
        existing_articles: Iterable of article dicts to compare against.

    Returns:
        True if a matching article was found, otherwise False.
    """
    # ``dict.get(key, "")`` still yields None when the key exists with a null
    # value (e.g. loaded from JSON), so normalise with ``or ""`` before
    # calling string methods.
    new_title = (new_article.get("title") or "").lower().strip()
    new_link = (new_article.get("link") or "").strip()

    for existing in existing_articles:
        existing_link = (existing.get("link") or "").strip()

        # Exact URL match
        if new_link and existing_link and new_link == existing_link:
            return True

        # Near-identical titles
        existing_title = (existing.get("title") or "").lower().strip()
        if new_title and existing_title:
            if calculate_similarity(new_title, existing_title) > 0.9:
                return True

    return False
|
||||
|
||||
def calculate_similarity(text1, text2):
    """Return the Jaccard similarity of the word sets of two texts.

    Both texts are split on whitespace; the result is the number of shared
    distinct words divided by the number of distinct words overall.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float between 0.0 and 1.0. Two texts without any words are
        considered identical (1.0); if only one of them has no words the
        result is 0.0.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union cannot be empty and no
    # division-by-zero guard is needed.
    return len(words1 & words2) / len(words1 | words2)
|
||||
|
||||
def load_feeds():
|
||||
if not os.path.exists(FEEDS_FILE):
|
||||
"""Lädt RSS-Feeds aus der JSON-Datei"""
|
||||
try:
|
||||
if not os.path.exists(FEEDS_FILE):
|
||||
logging.info("Feeds-Datei existiert nicht, erstelle leere Liste")
|
||||
return []
|
||||
|
||||
with open(FEEDS_FILE, "r", encoding='utf-8') as f:
|
||||
feeds = json.load(f)
|
||||
logging.info(f"✅ {len(feeds)} Feeds geladen")
|
||||
return feeds
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Laden der Feeds: {e}")
|
||||
return []
|
||||
with open(FEEDS_FILE, "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def save_feeds(feeds):
|
||||
with open(FEEDS_FILE, "w") as f:
|
||||
json.dump(feeds, f, indent=2)
|
||||
|
||||
"""Speichert RSS-Feeds in die JSON-Datei"""
|
||||
try:
|
||||
with open(FEEDS_FILE, "w", encoding='utf-8') as f:
|
||||
json.dump(feeds, f, indent=2, ensure_ascii=False)
|
||||
logging.info(f"✅ {len(feeds)} Feeds gespeichert")
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Speichern der Feeds: {e}")
|
||||
|
||||
def load_articles():
|
||||
if not os.path.exists(ARTICLES_FILE):
|
||||
"""Lädt Artikel aus der JSON-Datei"""
|
||||
try:
|
||||
if not os.path.exists(ARTICLES_FILE):
|
||||
logging.info("Artikel-Datei existiert nicht, erstelle leere Liste")
|
||||
return []
|
||||
|
||||
with open(ARTICLES_FILE, "r", encoding='utf-8') as f:
|
||||
articles = json.load(f)
|
||||
|
||||
# Status-Validierung
|
||||
for article in articles:
|
||||
if article.get("status") not in VALID_STATUSES:
|
||||
article["status"] = "New"
|
||||
logging.warning(f"⚠️ Ungültiger Status für Artikel '{article.get('title', 'Unbekannt')}' korrigiert")
|
||||
|
||||
logging.info(f"✅ {len(articles)} Artikel geladen")
|
||||
return articles
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Laden der Artikel: {e}")
|
||||
return []
|
||||
with open(ARTICLES_FILE, "r") as f:
|
||||
articles = json.load(f)
|
||||
|
||||
for article in articles:
|
||||
if article.get("status") not in VALID_STATUSES:
|
||||
article["status"] = "New"
|
||||
return articles
|
||||
|
||||
|
||||
def save_articles(articles):
|
||||
with open(ARTICLES_FILE, "w") as f:
|
||||
json.dump(articles, f, indent=2)
|
||||
|
||||
|
||||
def fetch_and_process_feed(feed_url, existing_ids):
|
||||
feed = feedparser.parse(feed_url)
|
||||
new_articles = []
|
||||
|
||||
for entry in feed.entries:
|
||||
article_id = entry.get("id") or entry.get("link")
|
||||
if not article_id or article_id in existing_ids:
|
||||
continue
|
||||
|
||||
title = entry.get("title", "Kein Titel")
|
||||
date = entry.get("published", datetime.now().isoformat())
|
||||
summary = entry.get("summary", "")
|
||||
content = entry.get("content", [{}])[0].get("value") or entry.get("description", "")
|
||||
"""Speichert Artikel in die JSON-Datei"""
|
||||
try:
|
||||
# Validierung vor dem Speichern
|
||||
valid_articles = []
|
||||
for article in articles:
|
||||
if "id" in article and "title" in article:
|
||||
valid_articles.append(article)
|
||||
else:
|
||||
logging.warning(f"⚠️ Ungültiger Artikel übersprungen: {article}")
|
||||
|
||||
with open(ARTICLES_FILE, "w", encoding='utf-8') as f:
|
||||
json.dump(valid_articles, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logging.info(f"✅ {len(valid_articles)} Artikel gespeichert")
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Speichern der Artikel: {e}")
|
||||
|
||||
def clean_html_content(content):
    """Strip HTML markup from ``content`` and return whitespace-normalised text.

    ``<script>`` and ``<style>`` elements are removed entirely, the remaining
    text nodes are joined with single spaces and runs of whitespace are
    collapsed.

    Args:
        content: HTML fragment (or plain text). ``None`` and the empty string
            are accepted.

    Returns:
        The cleaned text. An empty string is returned for empty/``None``
        input so that callers can always treat the result as ``str``. If
        parsing fails, the error is logged and the unmodified ``content`` is
        returned (best effort).
    """
    # Nothing to parse; previously ``None`` made the parser raise and the
    # function returned ``None``, which broke callers doing ``.split()``.
    if not content:
        return ""

    try:
        soup = BeautifulSoup(content, "html.parser")

        # Drop script and style elements including their contents
        for element in soup(["script", "style"]):
            element.decompose()

        # Extract the text, then collapse any surplus whitespace
        clean_text = soup.get_text(" ", strip=True)
        return " ".join(clean_text.split())
    except Exception as e:
        logging.error(f"❌ Fehler beim Bereinigen des HTML-Inhalts: {e}")
        return content
|
||||
|
||||
images = extract_images_with_metadata(entry.link)
|
||||
def fetch_and_process_feed(feed_url, existing_articles):
|
||||
"""Lädt und verarbeitet einen einzelnen RSS-Feed"""
|
||||
new_articles = []
|
||||
feed_name = "Unbekannt"
|
||||
|
||||
try:
|
||||
logging.info(f"🔄 Verarbeite Feed: {feed_url}")
|
||||
|
||||
# Feed parsen
|
||||
feed = feedparser.parse(feed_url)
|
||||
|
||||
if hasattr(feed, 'feed') and hasattr(feed.feed, 'title'):
|
||||
feed_name = feed.feed.title
|
||||
logging.info(f"📡 Feed-Name: {feed_name}")
|
||||
|
||||
if not feed.entries:
|
||||
logging.warning(f"⚠️ Keine Einträge in Feed gefunden: {feed_url}")
|
||||
return []
|
||||
|
||||
logging.info(f"📰 {len(feed.entries)} Einträge gefunden")
|
||||
|
||||
for entry in feed.entries:
|
||||
try:
|
||||
# Basis-Informationen extrahieren
|
||||
title = entry.get("title", "Kein Titel")
|
||||
date = entry.get("published", datetime.now().isoformat())
|
||||
link = entry.get("link", "")
|
||||
summary = entry.get("summary", "")
|
||||
|
||||
# Content extrahieren
|
||||
content = ""
|
||||
if hasattr(entry, 'content') and entry.content:
|
||||
content = entry.content[0].get("value", "")
|
||||
elif hasattr(entry, 'description'):
|
||||
content = entry.description
|
||||
else:
|
||||
content = summary
|
||||
|
||||
# HTML bereinigen
|
||||
clean_text = clean_html_content(content)
|
||||
|
||||
# Volltext-Extraktion bei kurzen Artikeln
|
||||
if len(clean_text.split()) < 50 and link:
|
||||
logging.info(f"🔍 Kurzer Artikel erkannt, versuche Volltext-Extraktion: {title}")
|
||||
fetched_text = extract_full_article(link)
|
||||
if len(fetched_text.split()) > len(clean_text.split()):
|
||||
clean_text = fetched_text
|
||||
logging.info(f"✅ Volltext extrahiert: {len(clean_text.split())} Wörter")
|
||||
|
||||
# Artikel-ID generieren
|
||||
article_id = generate_article_id(title, link, date)
|
||||
|
||||
# Neuen Artikel erstellen
|
||||
new_article = {
|
||||
"id": article_id,
|
||||
"title": title,
|
||||
"date": date,
|
||||
"summary": summary[:300] + "..." if len(summary) > 300 else summary,
|
||||
"text": clean_text,
|
||||
"tags": [],
|
||||
"status": "New",
|
||||
"link": link,
|
||||
"images": [],
|
||||
"source": feed_url,
|
||||
"source_name": feed_name,
|
||||
"created_at": datetime.now().isoformat(),
|
||||
"word_count": len(clean_text.split())
|
||||
}
|
||||
|
||||
# Duplikatsprüfung
|
||||
if not is_duplicate_article(new_article, existing_articles):
|
||||
# Bilder extrahieren
|
||||
if link:
|
||||
try:
|
||||
images = extract_images_with_metadata(link)
|
||||
new_article["images"] = images
|
||||
logging.info(f"🖼️ {len(images)} Bilder für '{title}' extrahiert")
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler bei Bildextraktion für '{title}': {e}")
|
||||
|
||||
new_articles.append(new_article)
|
||||
logging.info(f"✅ Neuer Artikel hinzugefügt: {title}")
|
||||
else:
|
||||
logging.info(f"🔄 Duplikat übersprungen: {title}")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Verarbeiten des Eintrags '{entry.get('title', 'Unbekannt')}': {e}")
|
||||
continue
|
||||
|
||||
logging.info(f"✅ Feed verarbeitet: {len(new_articles)} neue Artikel aus {feed_url}")
|
||||
return new_articles
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Kritischer Fehler beim Verarbeiten von {feed_url}: {e}")
|
||||
return []
|
||||
|
||||
new_articles.append({
|
||||
"id": article_id,
|
||||
"title": title,
|
||||
"date": date,
|
||||
"summary": summary,
|
||||
"text": clean_text,
|
||||
"tags": [],
|
||||
"status": "New",
|
||||
"link": entry.get("link", ""),
|
||||
"images": images
|
||||
})
|
||||
|
||||
return new_articles
|
||||
|
||||
|
||||
def process_articles(existing_ids):
|
||||
feeds = load_feeds()
|
||||
all_articles = load_articles()
|
||||
articles_by_id = {article["id"]: article for article in all_articles if "id" in article}
|
||||
new_entries = []
|
||||
|
||||
for feed in feeds:
|
||||
url = feed.get("url") if isinstance(feed, dict) else feed
|
||||
if not url:
|
||||
continue
|
||||
try:
|
||||
logging.info(f"Lade Feed: {url}")
|
||||
entries = fetch_and_process_feed(url, existing_ids)
|
||||
new_entries.extend(entries)
|
||||
logging.info(f"{len(entries)} neue Artikel gefunden in {url}")
|
||||
except Exception as e:
|
||||
logging.exception(f"Fehler beim Verarbeiten von {url}:")
|
||||
|
||||
added = 0
|
||||
for entry in new_entries:
|
||||
if entry["id"] not in articles_by_id:
|
||||
articles_by_id[entry["id"]] = entry
|
||||
added += 1
|
||||
def process_articles(existing_ids=None):
|
||||
"""Verarbeitet alle RSS-Feeds und fügt neue Artikel hinzu"""
|
||||
try:
|
||||
start_time = time.time()
|
||||
logging.info("🚀 Starte Artikel-Verarbeitung")
|
||||
|
||||
feeds = load_feeds()
|
||||
all_articles = load_articles()
|
||||
|
||||
if not feeds:
|
||||
logging.warning("⚠️ Keine RSS-Feeds konfiguriert")
|
||||
return
|
||||
|
||||
# Bestehende Artikel für Duplikatsprüfung
|
||||
existing_articles = all_articles.copy()
|
||||
|
||||
total_new_articles = 0
|
||||
|
||||
for feed in feeds:
|
||||
feed_url = feed.get("url") if isinstance(feed, dict) else feed
|
||||
|
||||
if not feed_url:
|
||||
logging.warning("⚠️ Feed ohne URL übersprungen")
|
||||
continue
|
||||
|
||||
try:
|
||||
new_articles = fetch_and_process_feed(feed_url, existing_articles)
|
||||
|
||||
# Neue Artikel zur Gesamtliste hinzufügen
|
||||
for article in new_articles:
|
||||
all_articles.append(article)
|
||||
existing_articles.append(article) # Für weitere Duplikatsprüfung
|
||||
|
||||
total_new_articles += len(new_articles)
|
||||
|
||||
# Kurze Pause zwischen Feeds
|
||||
time.sleep(1)
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Fehler beim Verarbeiten von Feed {feed_url}: {e}")
|
||||
continue
|
||||
|
||||
# Artikel speichern
|
||||
if total_new_articles > 0:
|
||||
save_articles(all_articles)
|
||||
processing_time = time.time() - start_time
|
||||
logging.info(f"🎉 Verarbeitung abgeschlossen: {total_new_articles} neue Artikel in {processing_time:.2f}s hinzugefügt")
|
||||
else:
|
||||
logging.info(f"Artikel bereits vorhanden, wird übersprungen: {entry['title']}")
|
||||
|
||||
if added > 0:
|
||||
save_articles(list(articles_by_id.values()))
|
||||
logging.info(f"{added} neue Artikel gespeichert.")
|
||||
else:
|
||||
logging.info("Keine neuen Artikel gefunden.")
|
||||
|
||||
logging.info("ℹ️ Keine neuen Artikel gefunden")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Kritischer Fehler bei der Artikel-Verarbeitung: {e}")
|
||||
|
||||
def rewrite_articles():
|
||||
articles = load_articles()
|
||||
changed = False
|
||||
|
||||
for article in articles:
|
||||
if article.get("status") == "Rewrite":
|
||||
"""Schreibt Artikel mit Status 'Rewrite' um"""
|
||||
try:
|
||||
logging.info("✍️ Starte Artikel-Umschreibung")
|
||||
|
||||
articles = load_articles()
|
||||
rewrite_articles_list = [a for a in articles if a.get("status") == "Rewrite"]
|
||||
|
||||
if not rewrite_articles_list:
|
||||
logging.info("ℹ️ Keine Artikel zum Umschreiben gefunden")
|
||||
return
|
||||
|
||||
if not openai.api_key:
|
||||
logging.error("❌ OpenAI API-Key nicht konfiguriert")
|
||||
return
|
||||
|
||||
changed = False
|
||||
|
||||
for article in rewrite_articles_list:
|
||||
try:
|
||||
logging.info(f"✍️ Umschreiben von: {article['title']}")
|
||||
prompt = f"Schreibe folgenden Artikel um und fasse ihn verständlich zusammen:\n\n{article['text']}"
|
||||
|
||||
# Artikel umschreiben
|
||||
prompt = f"""Schreibe den folgenden Artikel um und fasse ihn verständlich zusammen.
|
||||
Behalte die wichtigsten Informationen bei, aber formuliere alles neu:
|
||||
|
||||
{article['text']}"""
|
||||
|
||||
response = openai.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=[
|
||||
{"role": "system", "content": "Du bist ein professioneller Redakteur."},
|
||||
{"role": "system", "content": "Du bist ein professioneller Redakteur, der Artikel umschreibt und verbessert."},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
],
|
||||
max_tokens=1500,
|
||||
temperature=0.7
|
||||
)
|
||||
|
||||
new_text = response.choices[0].message.content.strip()
|
||||
article["text"] = f"{article['title']}\n\n{new_text}"
|
||||
article["status"] = "Process"
|
||||
|
||||
# Tags generieren
|
||||
tag_prompt = f"""Erstelle 3-5 passende, kurze Stichwörter (Tags) für diesen Artikel.
|
||||
Gib nur die Tags zurück, getrennt durch Kommas:
|
||||
|
||||
tag_prompt = f"Erstelle 3 passende, kurze Stichwörter (Tags) für diesen Artikel:\n\n{new_text}"
|
||||
{new_text}"""
|
||||
|
||||
tag_response = openai.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=[
|
||||
{"role": "system", "content": "Du bist ein Blog-Tag-Generator."},
|
||||
{"role": "system", "content": "Du generierst präzise Tags für Blog-Artikel."},
|
||||
{"role": "user", "content": tag_prompt}
|
||||
]
|
||||
],
|
||||
max_tokens=100,
|
||||
temperature=0.5
|
||||
)
|
||||
|
||||
tags_raw = tag_response.choices[0].message.content.strip()
|
||||
tags = [tag.strip(" ,") for tag in tags_raw.replace("\n", ",").split(",") if tag.strip()]
|
||||
tags = [tag.strip().strip(',') for tag in tags_raw.split(",") if tag.strip()]
|
||||
|
||||
# Artikel aktualisieren
|
||||
article["text"] = new_text
|
||||
article["tags"] = tags
|
||||
|
||||
article["status"] = "Process"
|
||||
article["rewritten_at"] = datetime.now().isoformat()
|
||||
article["word_count"] = len(new_text.split())
|
||||
|
||||
# Bildmetadaten vervollständigen falls nötig
|
||||
for img in article.get("images", []):
|
||||
if "caption" not in img:
|
||||
if "caption" not in img or not img["caption"]:
|
||||
img["caption"] = "Kein Bildtitel vorhanden"
|
||||
if "copyright" not in img:
|
||||
if "copyright" not in img or not img["copyright"]:
|
||||
img["copyright"] = "Unbekannt"
|
||||
if "copyright_url" not in img:
|
||||
if "copyright_url" not in img or not img["copyright_url"]:
|
||||
img["copyright_url"] = "#"
|
||||
|
||||
logging.info(f"✅ Artikel umgeschrieben: {article['title']}")
|
||||
|
||||
logging.info(f"✅ Artikel erfolgreich umgeschrieben: {article['title']}")
|
||||
changed = True
|
||||
|
||||
|
||||
# Kurze Pause zwischen API-Calls
|
||||
time.sleep(2)
|
||||
|
||||
except Exception as e:
|
||||
logging.exception(f"❌ Fehler beim Umschreiben von '{article['title']}':")
|
||||
logging.error(f"❌ Fehler beim Umschreiben von '{article['title']}': {e}")
|
||||
continue
|
||||
|
||||
if changed:
|
||||
save_articles(articles)
|
||||
logging.info(f"🎉 {len(rewrite_articles_list)} Artikel erfolgreich umgeschrieben")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Kritischer Fehler beim Umschreiben: {e}")
|
||||
|
||||
if changed:
|
||||
save_articles(articles)
|
||||
logging.info("Alle Artikel mit Status 'Rewrite' wurden verarbeitet.")
|
||||
def upload_articles_to_wp():
    """Upload all articles with status "Process" to WordPress.

    Loads the stored articles, hands those with status "Process" to
    ``upload_articles_to_wordpress`` and, for every upload reported as
    successful, sets the article's status to "WordPress Pending" and records
    the upload timestamp and the WordPress post ID. The article list is
    saved again only if at least one article was changed.

    Returns:
        The result dict from ``upload_articles_to_wordpress``. When there is
        nothing to upload, a dict with zero counts and a "message" is
        returned; on an unexpected error, a dict with ``failed == 1`` and an
        "error" string.
    """
    try:
        logging.info("📤 Starte WordPress-Upload")

        articles = load_articles()
        pending_upload = [a for a in articles if a.get("status") == "Process"]

        if not pending_upload:
            logging.info("ℹ️ Keine Artikel für WordPress-Upload gefunden")
            return {"total": 0, "successful": 0, "failed": 0, "message": "Keine Artikel zum Hochladen gefunden"}

        logging.info(f"📦 {len(pending_upload)} Artikel für WordPress-Upload gefunden")

        # Perform the WordPress upload
        upload_results = upload_articles_to_wordpress(pending_upload)

        # Update the status of every successfully uploaded article
        if upload_results.get('successful', 0) > 0:
            # Index articles by ID once instead of rescanning the whole list
            # per upload result. ``setdefault`` keeps the first article for a
            # given ID; articles without an ID are left out so that a result
            # lacking ``article_id`` can never match them by accident.
            articles_by_id = {}
            for article in articles:
                article_id = article.get('id')
                if article_id is not None:
                    articles_by_id.setdefault(article_id, article)

            changed = False

            for detail in upload_results.get('details', []):
                if not detail.get('success'):
                    continue

                article = articles_by_id.get(detail.get('article_id'))
                if article is None:
                    continue

                article['status'] = "WordPress Pending"
                article['wp_upload_date'] = datetime.now().isoformat()
                article['wp_post_id'] = detail.get('wp_post_id')
                changed = True
                logging.info(f"✅ Status geändert für '{article.get('title')}': Process → WordPress Pending")

            if changed:
                save_articles(articles)
                logging.info("💾 Artikel-Status nach WordPress-Upload aktualisiert")

        return upload_results

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim WordPress-Upload: {e}")
        return {"total": 0, "successful": 0, "failed": 1, "error": str(e)}
|
||||
|
||||
def get_article_stats():
    """Collect summary statistics about the stored articles.

    Returns:
        A dict with the keys
          * "total_articles": number of articles,
          * "status_distribution": article count per status (missing status
            counts as "New"),
          * "word_count_stats": "average" (integer division), "min" and
            "max" over all articles with a truthy "word_count"; empty dict
            if there are none,
          * "source_distribution": article count per "source_name" (missing
            name counts as "Unbekannt"),
          * "images_count": total number of images across all articles.
        An empty dict is returned if an error occurs; the error is logged.
    """
    try:
        articles = load_articles()

        status_distribution = {}
        source_distribution = {}
        word_counts = []
        images_count = 0

        # Gather everything in a single pass over the articles
        for article in articles:
            status = article.get("status", "New")
            status_distribution[status] = status_distribution.get(status, 0) + 1

            source = article.get("source_name", "Unbekannt")
            source_distribution[source] = source_distribution.get(source, 0) + 1

            word_count = article.get("word_count")
            if word_count:
                word_counts.append(word_count)

            # ``images`` may be stored as null; treat that like "no images"
            # instead of letting ``len(None)`` abort the whole statistic.
            images_count += len(article.get("images") or [])

        word_count_stats = {}
        if word_counts:
            word_count_stats = {
                "average": sum(word_counts) // len(word_counts),
                "min": min(word_counts),
                "max": max(word_counts),
            }

        return {
            "total_articles": len(articles),
            "status_distribution": status_distribution,
            "word_count_stats": word_count_stats,
            "source_distribution": source_distribution,
            "images_count": images_count,
        }

    except Exception as e:
        logging.error(f"❌ Fehler beim Erstellen der Statistiken: {e}")
        return {}
|
||||
240
pages/01_feed_manager.py
Normal file
240
pages/01_feed_manager.py
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
# pages/01_feed_manager.py
|
||||
|
||||
import streamlit as st
|
||||
from main import load_feeds, save_feeds, load_articles
|
||||
from utils.css_loader import load_css, apply_dark_theme
|
||||
import logging
|
||||
|
||||
import os  # only needed here, to create the log directory

# === Page configuration ===
# st.set_page_config() is documented as the first Streamlit command of a page,
# so it runs before the CSS/theme helpers (which presumably emit Streamlit
# elements; their implementation is not visible here).
st.set_page_config(page_title="📡 Feed-Verwaltung")

# === Load CSS & theme ===
load_css()
apply_dark_theme()

# === Prepare logging ===
log_dir = "logs"
log_file = f"{log_dir}/rss_tool.log"
# logging.basicConfig() opens the log file immediately and raises if the
# directory does not exist yet.
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
|
||||
|
||||
# Page header banner (raw HTML, styled by the "main-header" CSS class)
header_html = """
<div class="main-header">
<h1>📡 RSS Feed-Verwaltung</h1>
<p>Verwalte deine RSS-Feeds zentral und effizient</p>
</div>
"""
st.markdown(header_html, unsafe_allow_html=True)

# Feeds and articles are loaded once and shared by all sections below.
feeds, articles = load_feeds(), load_articles()
|
||||
|
||||
# === Add a new feed ===
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
st.subheader("➕ Neuen Feed hinzufügen")

with st.form("add_feed_form"):
    col1, col2 = st.columns(2)
    with col1:
        new_url = st.text_input("Feed URL", "", placeholder="https://example.com/feed.xml")
    with col2:
        new_name = st.text_input("Feed Name", "", placeholder="Beispiel News")

    submitted = st.form_submit_button("Feed hinzufügen", use_container_width=True)
    if submitted:
        # Strip surrounding whitespace first: otherwise a blank-only value
        # passes the emptiness check and a padded URL slips past the
        # duplicate check.
        new_url = new_url.strip()
        new_name = new_name.strip()

        if not (new_url and new_name):
            st.error("❌ Bitte gib sowohl URL als auch Name ein.")
        elif any(f.get("url") == new_url for f in feeds):
            st.warning("⚠️ Dieser Feed existiert bereits.")
        else:
            feeds.append({"url": new_url, "name": new_name})
            save_feeds(feeds)
            logging.info(f"🔗 Neuer Feed hinzugefügt: {new_name} ({new_url})")
            st.success(f"Feed '{new_name}' hinzugefügt.")
            st.rerun()

st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Manage existing feeds ===
st.subheader("🛠️ Vorhandene Feeds verwalten")

if not feeds:
    st.info("Noch keine Feeds konfiguriert. Füge oben deinen ersten Feed hinzu!")
else:
    import html
    from collections import Counter

    # Count the articles per source URL once instead of rescanning the whole
    # article list for every feed.
    articles_per_source = Counter(a.get("source") for a in articles)

    for idx, feed in enumerate(feeds):
        feed_url = feed.get("url", "")
        feed_name = feed.get("name", "Unbekannt")
        article_count = articles_per_source[feed_url]

        # Name and URL are free text entered through the forms on this page
        # and are rendered as raw HTML below, so they are escaped first.
        safe_name = html.escape(str(feed_name))
        safe_url = html.escape(str(feed_url))

        # Feed card
        st.markdown(f"""
<div class="article-card">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
<div>
<h3 class="article-title">{safe_name}</h3>
<div class="article-meta">{safe_url}</div>
</div>
<div>
<span class="status-badge status-online">{article_count} Artikel</span>
</div>
</div>
<div class="article-footer">
📰 Verknüpfte Artikel: {article_count}
</div>
</div>
""", unsafe_allow_html=True)

        # Actions
        col1, col2, col3 = st.columns(3)

        with col1:
            if st.button("💾 Bearbeiten", key=f"edit_{idx}", use_container_width=True):
                # Toggle the inline edit form for this feed.
                st.session_state[f"edit_mode_{idx}"] = not st.session_state.get(f"edit_mode_{idx}", False)

        with col2:
            if st.button("🔄 Aktualisieren", key=f"refresh_{idx}", use_container_width=True):
                with st.spinner(f"Aktualisiere Feed '{feed_name}'..."):
                    # NOTE: there is no single-feed update yet; this call is not
                    # given the feed, so it is not restricted to this one.
                    from main import process_articles
                    existing_ids = [a["id"] for a in articles]
                    process_articles(existing_ids)
                    st.success(f"Feed '{feed_name}' aktualisiert!")
                    st.rerun()

        with col3:
            if st.button("🗑️ Löschen", key=f"delete_{idx}", use_container_width=True):
                # Two-step confirmation: the first click arms the flag, the
                # second click performs the deletion.
                if not st.session_state.get(f"confirm_delete_{idx}", False):
                    st.session_state[f"confirm_delete_{idx}"] = True
                    st.warning(f"Klicke erneut um '{feed_name}' wirklich zu löschen!")
                else:
                    deleted_feed = feeds.pop(idx)
                    save_feeds(feeds)
                    logging.info(f"❌ Feed gelöscht: {deleted_feed.get('name')} ({deleted_feed.get('url')})")
                    st.success(f"Feed '{feed_name}' wurde gelöscht.")
                    # Per-feed UI flags are keyed by list index. After a removal
                    # the indices shift, so all of them are dropped rather than
                    # letting a flag carry over to a different feed.
                    for state_key in list(st.session_state.keys()):
                        if str(state_key).startswith(("edit_mode_", "confirm_delete_")):
                            del st.session_state[state_key]
                    st.rerun()

        # Edit form (only while edit mode is switched on for this feed)
        if st.session_state.get(f"edit_mode_{idx}", False):
            st.markdown('<div class="filter-section" style="margin-top: 1rem;">', unsafe_allow_html=True)
            st.write("**Feed bearbeiten:**")

            with st.form(f"edit_form_{idx}"):
                col1, col2 = st.columns(2)
                with col1:
                    edited_url = st.text_input("Feed-URL", value=feed_url, key=f"edit_url_{idx}")
                with col2:
                    edited_name = st.text_input("Feed-Name", value=feed_name, key=f"edit_name_{idx}")

                form_col1, form_col2 = st.columns(2)
                with form_col1:
                    if st.form_submit_button("💾 Änderungen speichern", use_container_width=True):
                        # Keep the old values for the log line before overwriting.
                        old_url, old_name = feed.get("url"), feed.get("name")
                        feeds[idx]["url"] = edited_url
                        feeds[idx]["name"] = edited_name
                        save_feeds(feeds)
                        logging.info(f"✏️ Feed geändert: '{old_name}' ({old_url}) → '{edited_name}' ({edited_url})")
                        st.success("Änderungen gespeichert!")
                        st.session_state[f"edit_mode_{idx}"] = False
                        st.rerun()

                with form_col2:
                    if st.form_submit_button("❌ Abbrechen", use_container_width=True):
                        st.session_state[f"edit_mode_{idx}"] = False
                        st.rerun()

            st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Feed statistics ===
if feeds:
    st.markdown("---")
    st.subheader("📊 Feed-Statistiken")

    # Totals shown in the three cards; the average is an integer division.
    total_articles = len(articles)
    avg_articles = total_articles // len(feeds)

    stats_card_template = """
<div class="stats-card">
<div class="stats-number">{}</div>
<div>{}</div>
</div>
"""
    card_contents = (
        (len(feeds), "Feeds Gesamt"),
        (total_articles, "Artikel Gesamt"),
        (avg_articles, "Ø Artikel pro Feed"),
    )

    # One card per column, left to right.
    for column, (number, caption) in zip(st.columns(3), card_contents):
        with column:
            st.markdown(stats_card_template.format(number, caption), unsafe_allow_html=True)
|
||||
|
||||
# === Bulk actions ===
if feeds:
    st.markdown("---")
    st.subheader("⚡ Bulk-Aktionen")

    col1, col2 = st.columns(2)

    with col1:
        if st.button("🔄 Alle Feeds aktualisieren", use_container_width=True):
            with st.spinner("Aktualisiere alle Feeds..."):
                from main import process_articles
                # IDs of the articles already loaded on this page are handed
                # to the processing step.
                existing_ids = [a["id"] for a in articles]
                process_articles(existing_ids)
                st.success(f"Alle {len(feeds)} Feeds wurden aktualisiert!")
                st.rerun()

    with col2:
        if st.button("📊 Feed-Performance anzeigen", use_container_width=True):
            import html

            st.subheader("📈 Feed-Performance")

            # Statuses that count as "processed" for the success rate.
            processed_statuses = ("Process", "Online", "WordPress Pending")

            # Collect the per-feed figures.
            feed_performance = []
            for feed in feeds:
                feed_url = feed.get("url", "")
                feed_name = feed.get("name", "Unbekannt")
                feed_articles = [a for a in articles if a.get("source") == feed_url]
                statuses = [a.get("status") for a in feed_articles]

                feed_performance.append({
                    "name": feed_name,
                    "url": feed_url,
                    "total_articles": len(feed_articles),
                    "new_articles": statuses.count("New"),
                    "processed_articles": sum(1 for s in statuses if s in processed_statuses),
                })

            # Feeds with the most articles first.
            feed_performance.sort(key=lambda x: x["total_articles"], reverse=True)

            # Render one card per feed.
            for perf in feed_performance:
                # Guard against division by zero for feeds without articles.
                success_rate = (perf["processed_articles"] / perf["total_articles"] * 100) if perf["total_articles"] > 0 else 0
                # The feed name is user-entered text rendered as raw HTML.
                safe_name = html.escape(str(perf["name"]))

                st.markdown(f"""
<div class="article-card">
<h3 class="article-title">{safe_name}</h3>
<div class="article-footer">
📰 {perf["total_articles"]} Artikel |
🆕 {perf["new_articles"]} Neu |
✅ {perf["processed_articles"]} Verarbeitet |
📊 {success_rate:.1f}% Success Rate
</div>
</div>
""", unsafe_allow_html=True)
|
||||
|
|
@ -1,23 +1,297 @@
|
|||
# log_viewer.py
|
||||
# pages/log_viewer.py
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from utils.css_loader import load_css, apply_dark_theme
|
||||
from datetime import datetime
|
||||
|
||||
# === CSS & Theme laden ===
|
||||
load_css()
|
||||
apply_dark_theme()
|
||||
|
||||
st.set_page_config(page_title="🧾 Log Viewer", layout="wide")
|
||||
st.title("🧾 Letzte Logeinträge anzeigen")
|
||||
|
||||
# Header
|
||||
st.markdown("""
|
||||
<div class="main-header">
|
||||
<h1>🧾 Log Viewer</h1>
|
||||
<p>Überwache Systemaktivitäten und Debug-Informationen</p>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
LOG_FILE = "logs/rss_tool.log"
|
||||
MAX_LINES = 500
|
||||
|
||||
if not os.path.exists(LOG_FILE):
|
||||
st.warning("Keine Logdatei gefunden.")
|
||||
else:
|
||||
with open(LOG_FILE, "r") as f:
|
||||
lines = f.readlines()
|
||||
# === Log-Datei Kontrollen ===
|
||||
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
||||
st.subheader("📁 Log-Datei Optionen")
|
||||
|
||||
st.write(f"Letzte {min(len(lines), MAX_LINES)} Zeilen aus `{LOG_FILE}`:")
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
st.code("".join(lines[-MAX_LINES:]), language="text")
|
||||
with col1:
|
||||
lines_to_show = st.selectbox(
|
||||
"Anzahl Zeilen",
|
||||
[50, 100, 200, 500, 1000],
|
||||
index=3, # Default: 500
|
||||
key="lines_select"
|
||||
)
|
||||
|
||||
if st.button("🔄 Neu laden"):
|
||||
with col2:
|
||||
if st.button("🔄 Neu laden", use_container_width=True):
|
||||
st.rerun()
|
||||
|
||||
with col3:
|
||||
log_level_filter = st.selectbox(
|
||||
"Log Level Filter",
|
||||
["Alle", "INFO", "WARNING", "ERROR", "DEBUG"],
|
||||
key="level_filter"
|
||||
)
|
||||
|
||||
with col4:
|
||||
search_term = st.text_input(
|
||||
"Suche in Logs",
|
||||
placeholder="Suchbegriff...",
|
||||
key="log_search"
|
||||
)
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Log file status and content ===
if not os.path.exists(LOG_FILE):
    st.markdown("""
<div class="wp-status">
<strong>⚠️ Keine Log-Datei gefunden</strong><br>
<div class="text-secondary">
Die Log-Datei wurde noch nicht erstellt oder befindet sich an einem anderen Ort.<br>
Erwarteter Pfad: <code>{}</code>
</div>
</div>
""".format(LOG_FILE), unsafe_allow_html=True)
else:
    import html

    # File information
    file_size = os.path.getsize(LOG_FILE)
    file_mtime = datetime.fromtimestamp(os.path.getmtime(LOG_FILE))

    # Read the file once; the same lines feed both the line counter and the
    # filtered view below. Only read/decoding problems are handled here.
    read_error = None
    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except (OSError, UnicodeDecodeError) as e:
        lines = []
        read_error = e
    total_lines = len(lines)

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("""
<div class="stats-card">
<div class="stats-number">{:.1f} KB</div>
<div>Dateigröße</div>
</div>
""".format(file_size / 1024), unsafe_allow_html=True)

    with col2:
        st.markdown("""
<div class="stats-card">
<div class="stats-number" style="font-size: 1.5rem;">{}</div>
<div>Letzte Änderung</div>
</div>
""".format(file_mtime.strftime("%H:%M:%S")), unsafe_allow_html=True)

    with col3:
        st.markdown("""
<div class="stats-card">
<div class="stats-number">{}</div>
<div>Zeilen Gesamt</div>
</div>
""".format(total_lines), unsafe_allow_html=True)

    # === Show log content ===
    if read_error is not None:
        # The exception text is rendered as raw HTML, hence the escaping.
        st.markdown(f"""
<div class="wp-status">
<strong>❌ Fehler beim Lesen der Log-Datei</strong><br>
<div class="text-secondary">
{html.escape(str(read_error))}
</div>
</div>
""", unsafe_allow_html=True)
    else:
        # Apply the level filter and the case-insensitive search filter.
        level_marker = None if log_level_filter == "Alle" else f" - {log_level_filter} - "
        needle = search_term.lower() if search_term else ""
        filtered_lines = [
            line
            for line in lines
            if (level_marker is None or level_marker in line)
            and (not needle or needle in line.lower())
        ]

        # Keep only the most recent matches.
        display_lines = filtered_lines[-lines_to_show:]

        st.subheader(f"📋 Log-Einträge ({len(display_lines)} von {len(filtered_lines)} gefilterten Zeilen)")

        if display_lines:
            log_content = "".join(display_lines)

            # Mark each level with a coloured emoji in the displayed text.
            colored_content = log_content
            for level_name, icon in (("ERROR", "🔴"), ("WARNING", "🟡"), ("INFO", "🔵"), ("DEBUG", "⚪")):
                colored_content = colored_content.replace(f" - {level_name} - ", f" - {icon} {level_name} - ")

            # Summary card; the search term is user input rendered as raw
            # HTML, so it is escaped.
            st.markdown("""
<div class="article-card">
<h3 class="article-title">📄 Log-Ausgabe</h3>
<div class="article-meta">
Letzte {count} Einträge | Filter: {level} | Suche: "{search}"
</div>
</div>
""".format(
                count=len(display_lines),
                level=log_level_filter,
                search=html.escape(search_term) if search_term else "Keine"
            ), unsafe_allow_html=True)

            # Code block with the log lines
            st.code(colored_content, language="text")

            # Download button; offers the currently displayed (filtered)
            # lines without the emoji markers.
            st.download_button(
                label="💾 Log-Datei herunterladen",
                data=log_content,
                file_name=f"rss_tool_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                mime="text/plain",
                use_container_width=True
            )

        else:
            st.markdown("""
<div class="wp-status">
<strong>🔍 Keine Log-Einträge gefunden</strong><br>
<div class="text-secondary">
Mit den aktuellen Filtern wurden keine Log-Einträge gefunden.<br>
Versuche andere Filter-Einstellungen.
</div>
</div>
""", unsafe_allow_html=True)
|
||||
|
||||
# === Log level legend ===
# Static HTML explaining the emoji markers used in the log view.
log_level_legend_html = """
<div class="article-card">
<h3 class="article-title">📖 Log-Level Bedeutung</h3>
<div class="article-summary">
<strong>🔵 INFO:</strong> Normale Programmaktivitäten (Feed-Updates, Artikel verarbeitet)<br>
<strong>🟡 WARNING:</strong> Potentielle Probleme (Duplikate, fehlende Daten)<br>
<strong>🔴 ERROR:</strong> Fehler die das Programm beeinträchtigen<br>
<strong>⚪ DEBUG:</strong> Detaillierte Entwickler-Informationen
</div>
</div>
"""

with st.expander("ℹ️ Log-Level Erklärung", expanded=False):
    st.markdown(log_level_legend_html, unsafe_allow_html=True)
|
||||
|
||||
# === Manage the log file ===
st.markdown("---")
st.subheader("🛠️ Log-Datei Verwaltung")

col1, col2, col3 = st.columns(3)

with col1:
    # A button only reports True during the run triggered by its own click,
    # so a confirmation button nested inside another button's branch can
    # never fire. The pending confirmation is kept in session state instead.
    if st.button("🗑️ Log-Datei leeren", use_container_width=True):
        st.session_state["confirm_clear_pending"] = True

    if st.session_state.get("confirm_clear_pending", False):
        if st.button("⚠️ Wirklich leeren?", key="confirm_clear"):
            st.session_state["confirm_clear_pending"] = False
            try:
                # Opening in "w" mode truncates the file.
                with open(LOG_FILE, "w", encoding="utf-8") as f:
                    f.write("")
            except OSError as e:
                st.error(f"Fehler beim Leeren der Log-Datei: {e}")
            else:
                st.success("Log-Datei wurde geleert!")
                st.rerun()

with col2:
    if st.button("📦 Log archivieren", use_container_width=True):
        archive_name = f"rss_tool_log_archive_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        try:
            with open(LOG_FILE, "r", encoding="utf-8") as f:
                log_data = f.read()
        except (OSError, UnicodeDecodeError) as e:
            st.error(f"Fehler beim Archivieren: {e}")
        else:
            # Offer the complete log as a timestamped download.
            st.download_button(
                label=f"💾 {archive_name}",
                data=log_data,
                file_name=archive_name,
                mime="text/plain",
                key="archive_download"
            )

with col3:
    if st.button("📊 Log-Statistiken", use_container_width=True):
        if os.path.exists(LOG_FILE):
            try:
                with open(LOG_FILE, "r", encoding="utf-8") as f:
                    all_lines = f.readlines()
            except (OSError, UnicodeDecodeError) as e:
                st.error(f"Fehler beim Berechnen der Statistiken: {e}")
            else:
                st.subheader("📈 Log-Statistiken")

                # One card per level, counting lines that contain the
                # " - LEVEL - " marker.
                level_cards = (("INFO", "🔵"), ("WARNING", "🟡"), ("ERROR", "🔴"), ("DEBUG", "⚪"))
                for column, (level_name, icon) in zip(st.columns(4), level_cards):
                    level_count = sum(1 for line in all_lines if f" - {level_name} - " in line)
                    with column:
                        st.markdown("""
<div class="stats-card">
<div class="stats-number">{}</div>
<div>{} {}</div>
</div>
""".format(level_count, icon, level_name), unsafe_allow_html=True)
|
||||
|
||||
# === Auto-refresh option ===
# While the box is ticked, each script run waits 30 seconds and then reruns.
auto_refresh_enabled = st.checkbox("🔄 Auto-Refresh (30s)", key="auto_refresh")
if auto_refresh_enabled:
    import time

    time.sleep(30)
    st.rerun()
|
||||
|
|
@ -1 +0,0 @@
|
|||
[]
|
||||
4
pytest.ini
Normal file
4
pytest.ini
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
[pytest]
|
||||
testpaths = backend/tests
|
||||
python_files = test_*.py
|
||||
addopts = -q --maxfail=1
|
||||
|
|
@ -53,3 +53,6 @@ typing_extensions==4.14.0
|
|||
tzdata==2025.2
|
||||
urllib3==2.5.0
|
||||
typer[all]==0.12.3
|
||||
# Bild- und Hashing-Bibliotheken für Dedupe
|
||||
Pillow>=10.0.0 # Bildverarbeitung (öffnet Bilder für pHash)
|
||||
ImageHash>=4.3 # Perzeptueller Hash (pHash) für Near-Duplicate Erkennung
|
||||
|
|
|
|||
33
scripts/smoke_backend.sh
Executable file
33
scripts/smoke_backend.sh
Executable file
|
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env bash
# Smoke test against a running backend: health check, login, one protected
# endpoint and the pipeline status. Stops with a non-zero exit code at the
# first step that fails.
#
# Required environment variables:
#   BASE_URL             base URL of the deployment
#   APP_ADMIN_USERNAME   user name used for the login step
#   APP_ADMIN_PASSWORD   password used for the login step
set -euo pipefail

if [[ -z "${BASE_URL:-}" ]]; then
  echo "BASE_URL fehlt (z. B. https://news.vanityontour.de)"
  exit 1
fi

if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then
  echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen"
  exit 1
fi

# A trailing slash would otherwise produce "//health" style paths.
BASE_URL="${BASE_URL%/}"

# Escape backslashes and double quotes so a value can be embedded in a JSON
# string literal. Control characters are not handled.
json_escape() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  printf '%s' "$value"
}

# expect_match PATTERN BODY
# Fails unless BODY matches the extended regex PATTERN. The response is
# matched from a variable instead of a `curl | grep -q` pipeline: with
# pipefail, grep exiting early can make curl fail on a closed pipe.
expect_match() {
  local pattern="$1" body="$2"
  if ! grep -Eq -- "$pattern" <<<"$body"; then
    echo "Unerwartete Antwort (erwartet: ${pattern})" >&2
    return 1
  fi
}

cookie_file="$(mktemp)"
trap 'rm -f "$cookie_file"' EXIT

echo "[1/4] Healthcheck"
response="$(curl -fsS "${BASE_URL}/health")"
expect_match '"status"[[:space:]]*:[[:space:]]*"ok"' "$response"

echo "[2/4] Login"
login_payload="{\"username\":\"$(json_escape "$APP_ADMIN_USERNAME")\",\"password\":\"$(json_escape "$APP_ADMIN_PASSWORD")\"}"
response="$(curl -fsS -c "$cookie_file" \
  -H "Content-Type: application/json" \
  -X POST "${BASE_URL}/auth/login" \
  -d "$login_payload")"
expect_match '"ok"[[:space:]]*:[[:space:]]*true' "$response"

echo "[3/4] Protected Endpoint"
response="$(curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected")"
expect_match '"ok"[[:space:]]*:[[:space:]]*true' "$response"

echo "[4/4] Pipeline Status"
response="$(curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status")"
# The "+" is escaped because the pattern is an extended regex.
expect_match '"stage"[[:space:]]*:[[:space:]]*"skeleton\+db"' "$response"

echo "Smoke test erfolgreich."
|
||||
491
static/styles.css
Normal file
491
static/styles.css
Normal file
|
|
@ -0,0 +1,491 @@
|
|||
/* ===============================================
|
||||
RSS Feed Manager - Zentrale CSS-Datei
|
||||
Dark-Mode optimiert mit Fallbacks
|
||||
=============================================== */
|
||||
|
||||
/* === ROOT VARIABLEN === */
|
||||
:root {
|
||||
/* Dark Mode Farbpalette */
|
||||
--bg-primary: #1e1e1e;
|
||||
--bg-secondary: #2d2d30;
|
||||
--bg-card: #2d2d30;
|
||||
--bg-header: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
--bg-filter: #363636;
|
||||
|
||||
/* Text Farben */
|
||||
--text-primary: #ffffff;
|
||||
--text-secondary: #b0b0b0;
|
||||
--text-muted: #888888;
|
||||
--text-accent: #667eea;
|
||||
|
||||
/* Status Farben */
|
||||
--status-new: #2196f3;
|
||||
--status-new-bg: #1565c0;
|
||||
--status-rewrite: #ff9800;
|
||||
--status-rewrite-bg: #ef6c00;
|
||||
--status-process: #9c27b0;
|
||||
--status-process-bg: #6a1b9a;
|
||||
--status-online: #4caf50;
|
||||
--status-online-bg: #2e7d32;
|
||||
--status-hold: #e91e63;
|
||||
--status-hold-bg: #ad1457;
|
||||
--status-trash: #f44336;
|
||||
--status-trash-bg: #c62828;
|
||||
--status-wp-pending: #00bcd4;
|
||||
--status-wp-pending-bg: #0097a7;
|
||||
|
||||
/* Borders & Shadows */
|
||||
--border-color: #404040;
|
||||
--shadow-light: 0 2px 8px rgba(0, 0, 0, 0.3);
|
||||
--shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.4);
|
||||
|
||||
/* Accent Colors */
|
||||
--gradient-primary: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
--gradient-secondary: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
|
||||
}
|
||||
|
||||
/* === LIGHT MODE FALLBACKS === */
|
||||
[data-theme="light"], .stApp[data-theme="light"] {
|
||||
--bg-primary: #ffffff;
|
||||
--bg-secondary: #f8f9fa;
|
||||
--bg-card: #ffffff;
|
||||
--bg-filter: #f0f2f6;
|
||||
|
||||
--text-primary: #212529;
|
||||
--text-secondary: #495057;
|
||||
--text-muted: #6c757d;
|
||||
--text-accent: #667eea;
|
||||
|
||||
--border-color: #dee2e6;
|
||||
--shadow-light: 0 2px 8px rgba(0, 0, 0, 0.1);
|
||||
--shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
/* === GLOBALE RESETS === */
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
/* === HAUPTCONTAINER === */
|
||||
.main-header {
|
||||
background: var(--bg-header);
|
||||
padding: 2rem;
|
||||
border-radius: 12px;
|
||||
margin-bottom: 2rem;
|
||||
color: var(--text-primary);
|
||||
text-align: center;
|
||||
box-shadow: var(--shadow-light);
|
||||
}
|
||||
|
||||
.main-header h1 {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 2.5rem;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.main-header p {
|
||||
color: rgba(255, 255, 255, 0.9) !important;
|
||||
margin: 0;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
/* === ARTIKEL CARDS === */
|
||||
.article-card {
    background: var(--bg-card) !important;
    border-radius: 12px;
    padding: 1.5rem;
    margin-bottom: 1rem;
    box-shadow: var(--shadow-light);
    /* The `border` shorthand resets all four sides, so it must come before
       the accent on the left edge; in the opposite order the accent is
       overwritten and never shows. */
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--text-accent);
    transition: all 0.3s ease;
    color: var(--text-primary) !important;
}
|
||||
|
||||
.article-card:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: var(--shadow-hover);
|
||||
border-color: var(--text-accent);
|
||||
}
|
||||
|
||||
.article-card h3,
|
||||
.article-card .article-title {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 1.2rem;
|
||||
font-weight: 600;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.article-card .article-meta {
|
||||
color: var(--text-secondary) !important;
|
||||
font-size: 0.9rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.article-card .article-summary {
|
||||
color: var(--text-secondary) !important;
|
||||
line-height: 1.5;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.article-card .article-footer {
|
||||
color: var(--text-muted) !important;
|
||||
font-size: 0.85rem;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
/* === STATUS BADGES === */
|
||||
.status-badge {
|
||||
padding: 0.3rem 0.8rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
margin-right: 0.5rem;
|
||||
display: inline-block;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-new {
|
||||
background-color: var(--status-new-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-rewrite {
|
||||
background-color: var(--status-rewrite-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-process {
|
||||
background-color: var(--status-process-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-online {
|
||||
background-color: var(--status-online-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-hold {
|
||||
background-color: var(--status-hold-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-trash {
|
||||
background-color: var(--status-trash-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-wp-pending {
|
||||
background-color: var(--status-wp-pending-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
/* === FILTER SECTION === */
|
||||
.filter-section {
|
||||
background: var(--bg-filter) !important;
|
||||
padding: 1.5rem;
|
||||
border-radius: 12px;
|
||||
margin-bottom: 2rem;
|
||||
border: 1px solid var(--border-color);
|
||||
box-shadow: var(--shadow-light);
|
||||
}
|
||||
|
||||
.filter-section h3 {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 1rem 0;
|
||||
font-size: 1.3rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* === STATS CARDS === */
|
||||
.stats-card {
|
||||
background: var(--bg-card) !important;
|
||||
padding: 1.5rem;
|
||||
border-radius: 12px;
|
||||
text-align: center;
|
||||
box-shadow: var(--shadow-light);
|
||||
border: 1px solid var(--border-color);
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
|
||||
.stats-card:hover {
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2.5rem;
|
||||
font-weight: 700;
|
||||
color: var(--text-accent) !important;
|
||||
margin-bottom: 0.5rem;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.stats-card div:last-child {
|
||||
color: var(--text-secondary) !important;
|
||||
font-weight: 500;
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
/* === WORDPRESS STATUS === */
|
||||
.wp-status {
    background: var(--bg-card) !important;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
    /* `border` first, then the left accent: the shorthand resets every side
       and would otherwise overwrite the accent edge. */
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--status-wp-pending);
    box-shadow: var(--shadow-light);
}
|
||||
|
||||
.wp-status strong {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
.wp-status small {
|
||||
color: var(--text-muted) !important;
|
||||
}
|
||||
|
||||
/* === IMAGE GALLERY === */
|
||||
.image-gallery {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
overflow-x: auto;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.image-item {
|
||||
min-width: 200px;
|
||||
text-align: center;
|
||||
background: var(--bg-card);
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
border: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.image-item img {
|
||||
border-radius: 6px;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.image-item strong,
|
||||
.image-item p {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
.image-item small {
|
||||
color: var(--text-muted) !important;
|
||||
}
|
||||
|
||||
/* === BUTTONS & ACTIONS === */
|
||||
.action-button {
|
||||
margin: 0.25rem;
|
||||
border-radius: 6px;
|
||||
}
|
||||
|
||||
/* Streamlit Button Overrides */
|
||||
.stButton > button {
|
||||
background: var(--gradient-primary) !important;
|
||||
color: white !important;
|
||||
border: none !important;
|
||||
border-radius: 8px !important;
|
||||
font-weight: 600 !important;
|
||||
transition: all 0.2s ease !important;
|
||||
}
|
||||
|
||||
.stButton > button:hover {
|
||||
transform: translateY(-1px) !important;
|
||||
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
|
||||
}
|
||||
|
||||
/* === SELECTBOX & INPUT OVERRIDES === */
|
||||
.stSelectbox > div > div {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
.stTextInput > div > div > input {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
/* === TABS === */
|
||||
.stTabs [data-baseweb="tab-list"] {
|
||||
background-color: var(--bg-secondary) !important;
|
||||
border-radius: 8px;
|
||||
padding: 0.25rem;
|
||||
}
|
||||
|
||||
.stTabs [data-baseweb="tab"] {
|
||||
color: var(--text-secondary) !important;
|
||||
background-color: transparent !important;
|
||||
border-radius: 6px !important;
|
||||
font-weight: 600 !important;
|
||||
}
|
||||
|
||||
.stTabs [aria-selected="true"] {
|
||||
background-color: var(--text-accent) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
/* === EXPANDER === */
|
||||
.streamlit-expanderHeader {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-radius: 8px !important;
|
||||
}
|
||||
|
||||
.streamlit-expanderContent {
|
||||
background-color: var(--bg-card) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-top: none !important;
|
||||
}
|
||||
|
||||
/* === METRICS === */
|
||||
.metric-container {
|
||||
background: var(--bg-card) !important;
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
border: 1px solid var(--border-color);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.metric-container [data-testid="metric-container"] {
|
||||
background: transparent !important;
|
||||
}
|
||||
|
||||
.metric-container [data-testid="metric-container"] > div {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
/* === CODE BLOCKS === */
|
||||
.stCodeBlock {
|
||||
background-color: var(--bg-secondary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
/* === SUCCESS/ERROR/WARNING/INFO === */
|
||||
.stAlert {
|
||||
border-radius: 8px !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
.stSuccess {
|
||||
background-color: rgba(76, 175, 80, 0.1) !important;
|
||||
color: var(--status-online) !important;
|
||||
border-color: var(--status-online) !important;
|
||||
}
|
||||
|
||||
.stError {
|
||||
background-color: rgba(244, 67, 54, 0.1) !important;
|
||||
color: var(--status-trash) !important;
|
||||
border-color: var(--status-trash) !important;
|
||||
}
|
||||
|
||||
.stWarning {
|
||||
background-color: rgba(255, 152, 0, 0.1) !important;
|
||||
color: var(--status-rewrite) !important;
|
||||
border-color: var(--status-rewrite) !important;
|
||||
}
|
||||
|
||||
.stInfo {
|
||||
background-color: rgba(33, 150, 243, 0.1) !important;
|
||||
color: var(--status-new) !important;
|
||||
border-color: var(--status-new) !important;
|
||||
}
|
||||
|
||||
/* === SIDEBAR === */
|
||||
/* `.css-1d391kg` is a build-generated class name and is not stable across
   Streamlit releases. The data-testid selector is added alongside it so the
   sidebar styling does not depend on that hash alone.
   NOTE(review): confirm the test id against the Streamlit version in use. */
[data-testid="stSidebar"],
.css-1d391kg {
    background-color: var(--bg-secondary) !important;
}

[data-testid="stSidebar"] .stMarkdown,
.css-1d391kg .stMarkdown {
    color: var(--text-primary) !important;
}
|
||||
|
||||
/* === RESPONSIVE DESIGN === */
|
||||
@media (max-width: 768px) {
|
||||
.main-header {
|
||||
padding: 1.5rem;
|
||||
}
|
||||
|
||||
.main-header h1 {
|
||||
font-size: 2rem;
|
||||
}
|
||||
|
||||
.article-card {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.stats-card {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* === UTILITY CLASSES === */
|
||||
.text-primary { color: var(--text-primary) !important; }
|
||||
.text-secondary { color: var(--text-secondary) !important; }
|
||||
.text-muted { color: var(--text-muted) !important; }
|
||||
.text-accent { color: var(--text-accent) !important; }
|
||||
|
||||
.bg-card { background-color: var(--bg-card) !important; }
|
||||
.bg-secondary { background-color: var(--bg-secondary) !important; }
|
||||
|
||||
.border-radius { border-radius: 8px; }
|
||||
.shadow-light { box-shadow: var(--shadow-light); }
|
||||
.shadow-hover { box-shadow: var(--shadow-hover); }
|
||||
|
||||
/* === SCROLLBAR STYLING === */
|
||||
::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-track {
|
||||
background: var(--bg-secondary);
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb {
|
||||
background: var(--text-muted);
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb:hover {
|
||||
background: var(--text-secondary);
|
||||
}
|
||||
|
||||
/* === LOADING SPINNER === */
|
||||
@keyframes spin {
|
||||
0% { transform: rotate(0deg); }
|
||||
100% { transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
.loading-spinner {
|
||||
border: 4px solid var(--bg-secondary);
|
||||
border-top: 4px solid var(--text-accent);
|
||||
border-radius: 50%;
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
animation: spin 1s linear infinite;
|
||||
margin: 0 auto 1rem auto;
|
||||
}
|
||||
|
||||
/* === FOCUS STATES === */
|
||||
.stButton > button:focus,
|
||||
.stSelectbox > div > div:focus,
|
||||
.stTextInput > div > div > input:focus {
|
||||
outline: 2px solid var(--text-accent) !important;
|
||||
outline-offset: 2px !important;
|
||||
}
|
||||
35
test_checklist.md
Normal file
35
test_checklist.md
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
## ✅ Artikel-Rewrite prüfen
|
||||
|
||||
| Testschritt | Erwartung | Erfüllt? (✔/✘) |
|
||||
|-----------------------------------------------------|-------------------------------|----------------|
|
||||
| Artikel mit Status "Rewrite" umschreiben | Text wird ersetzt | |
|
||||
| Andere Artikel bleiben unverändert | Kein Datenverlust | |
|
||||
| Tags bei anderen Artikeln bleiben erhalten | Keine versehentliche Änderung | |
|
||||
| Nur bearbeitete Artikel bekommen neue Tags | Zielgenaue Verarbeitung | |
|
||||
| JSON enthält alle Artikel | Kein Verlust nach save() | |
|
||||
|
||||
## ✅ DALL·E-Bildgenerierung prüfen
|
||||
|
||||
| Testschritt | Erwartung | Erfüllt? (✔/✘) |
|
||||
|-----------------------------------------------------|-------------------------------------------|----------------|
|
||||
| KI-Button klickbar | Nur wenn noch kein DALL·E-Bild vorhanden | |
|
||||
| Bild wird korrekt generiert | URL, Metadaten vorhanden | |
|
||||
| Nur ein Bild wird hinzugefügt | Keine Duplikate | |
|
||||
| Bild wird korrekt gespeichert | In `images[]` mit passendem Prompt | |
|
||||
| Andere Artikel bleiben unverändert | Kein Datenverlust | |
|
||||
|
||||
## ✅ Statusänderung prüfen
|
||||
|
||||
| Testschritt | Erwartung | Erfüllt? (✔/✘) |
|
||||
|-----------------------------------------------------|-------------------------------|----------------|
|
||||
| Artikelstatus ändern (z. B. auf Trash) | Wird korrekt übernommen | |
|
||||
| Nur ein Artikel wird verändert | Kein Einfluss auf andere | |
|
||||
| Artikel bleibt in JSON erhalten | Kein versehentliches Löschen | |
|
||||
|
||||
## ✅ Gesamtsystemprüfung
|
||||
|
||||
| Testschritt | Erwartung | Erfüllt? (✔/✘) |
|
||||
|-----------------------------------------------------|-------------------------------------------|----------------|
|
||||
| `articles.json` vollständig | Alle Artikel erhalten | |
|
||||
| Keine Fehlermeldungen im UI oder Log | Logging funktioniert, keine Exceptions | |
|
||||
| Filterfunktion bleibt erhalten nach Aktion | Kein Verlust des Statusfilters | |
|
||||
303
tools/image_deduper.py
Normal file
303
tools/image_deduper.py
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
image_deduper.py — Finde und bereinige Bild-Dubletten sicher & reversibel.
|
||||
|
||||
Funktionen:
|
||||
- Scan: Verzeichnisse rekursiv scannen, sha256 + pHash berechnen
|
||||
- Report: CSV + menschenlesbare Zusammenfassung mit Gruppen
|
||||
- Apply: Duplikate auf kanonische Datei umbiegen (Hardlink/Löschen)
|
||||
- Optional: DB-Referenzen aktualisieren (SQLite/SQLModel kompatibel)
|
||||
|
||||
Nutzung (Beispiele):
|
||||
# 1) Nur scannen + reporten (keine Änderungen):
|
||||
python tools/image_deduper.py scan --roots media,assets/images --out-dir .dedupe --phash
|
||||
|
||||
# 2) Report anzeigen:
|
||||
python tools/image_deduper.py report --index .dedupe/index.sqlite --csv
|
||||
|
||||
# 3) Anwenden (Hardlinks setzen, Dry-Run):
|
||||
python tools/image_deduper.py apply --report .dedupe/report.csv --mode hardlink --dry-run
|
||||
|
||||
# 4) Anwenden (wirklich ändern):
|
||||
python tools/image_deduper.py apply --report .dedupe/report.csv --mode hardlink
|
||||
|
||||
# 5) Referenzen in DB aktualisieren (optional):
|
||||
python tools/image_deduper.py apply --report .dedupe/report.csv --update-db sqlite:///./rssbot.db --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import hashlib
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None
|
||||
|
||||
try:
|
||||
import imagehash # type: ignore
|
||||
except ImportError:
|
||||
imagehash = None
|
||||
|
||||
|
||||
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
|
||||
DEFAULT_INDEX = ".dedupe/index.sqlite"
|
||||
DEFAULT_REPORT = ".dedupe/report.csv"
|
||||
|
||||
|
||||
def sha256_file(path: Path, bufsize: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in chunks of *bufsize* bytes, so it is never loaded
    into memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(bufsize), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def calc_phash(path: Path) -> Optional[str]:
    """Return the perceptual hash of the image at *path* as a string.

    Returns None when Pillow or imagehash could not be imported, or when
    opening, converting or hashing the image raises any exception
    (best-effort: a failing image must not abort a scan).
    """
    if Image is None or imagehash is None:
        return None
    try:
        with Image.open(path) as img:
            rgb = img.convert("RGB")
            # hash_size=16 -> 16x16 -> 256-bit hash
            return str(imagehash.phash(rgb, hash_size=16))
    except Exception:
        return None
|
||||
|
||||
|
||||
def ensure_dir(p: Path):
    """Create directory *p* including missing parents; an existing one is fine."""
    p.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def init_index(db_path: Path):
    """Create the ``files`` table and its lookup indexes if they are missing.

    Safe to call repeatedly: every statement uses ``IF NOT EXISTS``.
    The connection is closed even when a statement raises.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                size INTEGER NOT NULL,
                mtime REAL NOT NULL,
                sha256 TEXT NOT NULL,
                phash TEXT,
                ext TEXT NOT NULL
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_sha256 ON files (sha256);")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_phash ON files (phash);")
        conn.commit()
    finally:
        # Release the database file handle on the error path as well.
        conn.close()
|
||||
|
||||
|
||||
def is_image(path: Path) -> bool:
    """Return True when the lowercased suffix of *path* is in IMAGE_EXTS."""
    extension = path.suffix.lower()
    return extension in IMAGE_EXTS
|
||||
|
||||
|
||||
def walk_images(roots: List[Path]) -> Iterable[Path]:
    """Yield every regular file below each root (recursively) that is_image accepts."""
    for base in roots:
        yield from (
            candidate
            for candidate in base.rglob("*")
            if candidate.is_file() and is_image(candidate)
        )
|
||||
|
||||
|
||||
def upsert_file(db_path: Path, path: Path, sha256: str, phash: Optional[str]):
    """Insert or refresh the index row for *path*.

    Size and mtime are taken from ``path.stat()``; the extension is stored
    lowercased. An existing row with the same path is updated in place.
    The connection is closed even when the statement raises.
    """
    st = path.stat()
    row = (str(path), st.st_size, st.st_mtime, sha256, phash, path.suffix.lower())
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("""
            INSERT INTO files (path, size, mtime, sha256, phash, ext)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(path) DO UPDATE SET
                size=excluded.size,
                mtime=excluded.mtime,
                sha256=excluded.sha256,
                phash=excluded.phash,
                ext=excluded.ext;
        """, row)
        conn.commit()
    finally:
        # Release the database file handle on the error path as well.
        conn.close()
|
||||
|
||||
|
||||
def group_by_sha256(db_path: Path) -> List[List[Tuple[int, str, int]]]:
    """Return groups of indexed files that share a sha256 digest.

    Each group is a list of ``(id, path, size)`` tuples with more than one
    member. Groups are ordered by digest and members by row id, so repeated
    runs over the same index produce the same result.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # One ordered query instead of one query per duplicated digest.
        rows = conn.execute("""
            SELECT sha256, id, path, size
            FROM files
            WHERE sha256 IN (
                SELECT sha256 FROM files GROUP BY sha256 HAVING COUNT(*) > 1
            )
            ORDER BY sha256, id;
        """).fetchall()
    finally:
        conn.close()

    # dict preserves insertion order, so groups stay sorted by digest.
    grouped = {}
    for digest, rid, rpath, rsize in rows:
        grouped.setdefault(digest, []).append((rid, rpath, rsize))
    return list(grouped.values())
|
||||
|
||||
|
||||
def write_csv_report(db_path: Path, csv_path: Path) -> Tuple[int, int]:
    """Write one CSV row per duplicate file and return ``(count, bytes)``.

    For every duplicate group the member with the largest size (the first
    one on ties) is the canonical file; all other members are written as
    duplicates of it. The returned tuple holds the number of duplicate rows
    and the sum of their sizes in bytes.
    """
    duplicate_groups = group_by_sha256(db_path)
    ensure_dir(csv_path.parent)
    dup_count = 0
    reclaimable = 0
    with csv_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["group_id", "canonical_path", "dup_path", "dup_size_bytes"])
        # Empty groups are skipped and do not consume a group id.
        non_empty = (members for members in duplicate_groups if members)
        for group_id, members in enumerate(non_empty):
            canonical_path = max(members, key=lambda member: member[2])[1]
            for _rid, member_path, member_size in members:
                if member_path != canonical_path:
                    dup_count += 1
                    reclaimable += member_size
                    writer.writerow([group_id, canonical_path, member_path, member_size])
    return dup_count, reclaimable
|
||||
|
||||
|
||||
def apply_hardlink(canonical: Path, dup: Path, dry_run: bool) -> None:
    """Replace *dup* with a hard link to *canonical* without risking data loss.

    Both paths must live on the same filesystem. The link is first created
    under a temporary name next to *dup* and then moved over *dup*, so *dup*
    is never removed before its replacement exists. Does nothing when
    *dry_run* is true or when both paths already refer to the same file.

    Raises:
        OSError: if either path is missing or the link cannot be created
            (e.g. cross-device); *dup* is left untouched in that case.
    """
    if dry_run:
        return
    # Also validates that both files exist before anything is modified.
    if os.path.samefile(canonical, dup):
        # Renaming one hard link onto another link of the same file is a
        # no-op on POSIX and would leave the temporary name behind.
        return
    tmp = dup.with_suffix(dup.suffix + ".dedupe.tmp")
    try:
        tmp.unlink()  # remove a stale temp link from an interrupted run
    except FileNotFoundError:
        pass
    os.link(canonical, tmp)  # may fail; dup is still intact at this point
    try:
        tmp.replace(dup)  # atomic swap
    except OSError:
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise
|
||||
|
||||
|
||||
def apply_delete(dup: Path, dry_run: bool) -> None:
    """Delete the file *dup*; do nothing when *dry_run* is true."""
    if not dry_run:
        dup.unlink()
|
||||
|
||||
|
||||
@dataclass
class ApplyStats:
    """Counters collected while applying a duplicate report."""

    # Rows handled without an exception.
    processed: int = 0
    # Rows whose handling raised an exception.
    errors: int = 0
    # Sum of the reported sizes of the processed rows, in bytes.
    saved_bytes: int = 0
|
||||
|
||||
|
||||
def apply_changes(csv_report: Path, mode: str, dry_run: bool) -> ApplyStats:
    """Apply *mode* to every duplicate listed in the CSV report.

    *mode* is ``"hardlink"`` or ``"delete"``. A failure on one row
    (including an unknown mode) is printed to stderr and counted in
    ``errors``; the remaining rows are still handled. Rows that succeed
    increase ``processed`` and ``saved_bytes``.
    """
    result = ApplyStats()
    with csv_report.open("r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            canonical = Path(record["canonical_path"])
            dup = Path(record["dup_path"])
            dup_size = int(record["dup_size_bytes"])
            try:
                if mode == "hardlink":
                    apply_hardlink(canonical, dup, dry_run)
                elif mode == "delete":
                    apply_delete(dup, dry_run)
                else:
                    raise ValueError("mode must be 'hardlink' or 'delete'")
            except Exception as e:
                result.errors += 1
                print(f"[ERROR] {dup}: {e}", file=sys.stderr)
            else:
                result.processed += 1
                result.saved_bytes += dup_size
    return result
|
||||
|
||||
|
||||
def parse_roots(roots_arg: str) -> List[Path]:
    """Split a comma-separated list of root paths into Path objects.

    Surrounding whitespace is stripped and empty entries are dropped.

    Raises:
        FileNotFoundError: for the first listed root that does not exist.
    """
    names = (piece.strip() for piece in roots_arg.split(","))
    roots = [Path(name) for name in names if name]
    missing = next((root for root in roots if not root.exists()), None)
    if missing is not None:
        raise FileNotFoundError(f"Root not found: {missing}")
    return roots
|
||||
|
||||
|
||||
def cmd_scan(args):
    """Handle ``scan``: index all images below the roots and write the report.

    The index and report are placed inside ``--out-dir`` unless ``--index``
    or ``--report`` name an explicit path. With the default ``--out-dir``
    this yields the same paths as DEFAULT_INDEX and DEFAULT_REPORT.
    A file that cannot be hashed or stored is reported on stderr and skipped.
    """
    out_dir = Path(args.out_dir)
    # Honour --out-dir for the defaults instead of always using ".dedupe".
    index = Path(args.index) if args.index else out_dir / Path(DEFAULT_INDEX).name
    report = Path(args.report) if args.report else out_dir / Path(DEFAULT_REPORT).name
    ensure_dir(out_dir)
    ensure_dir(index.parent)
    init_index(index)
    roots = parse_roots(args.roots)

    count = 0
    for path in walk_images(roots):
        try:
            h = sha256_file(path)
            ph = calc_phash(path) if args.phash else None
            upsert_file(index, path, h, ph)
            count += 1
            if count % 500 == 0:
                print(f"... indexed {count} files")
        except Exception as e:
            print(f"[WARN] {path}: {e}", file=sys.stderr)

    dups, savings = write_csv_report(index, report)
    print(f"Indexed {count} images. Found duplicate files: {dups}, potential savings: {savings/1_000_000:.2f} MB")
    print(f"Index: {index}")
    print(f"Report: {args.report or report}")
|
||||
|
||||
|
||||
def cmd_report(args):
    """Handle ``report``: regenerate the CSV from the index and print totals."""
    index_path = Path(args.index or DEFAULT_INDEX)
    report_path = Path(args.report or DEFAULT_REPORT)
    duplicate_count, saved = write_csv_report(index_path, report_path)
    megabytes = saved / 1_000_000
    print(f"Duplicates: {duplicate_count}, potential savings: {megabytes:.2f} MB")
    if not args.csv:
        return
    print(f"CSV written: {report_path}")
|
||||
|
||||
|
||||
def cmd_apply(args):
    """Handle ``apply``: run the chosen mode over the CSV report and print stats.

    Raises:
        FileNotFoundError: when the report file does not exist.
    """
    report_path = Path(args.report or DEFAULT_REPORT)
    if not report_path.exists():
        raise FileNotFoundError(f"Report not found: {report_path}")
    outcome = apply_changes(report_path, args.mode, args.dry_run)
    saved_mb = outcome.saved_bytes / 1_000_000
    print(f"Processed: {outcome.processed}, Errors: {outcome.errors}, Saved: {saved_mb:.2f} MB (mode={args.mode}, dry_run={args.dry_run})")
    if args.update_db:
        # Placeholder: project-specific code could update DB references here
        # (if image paths are stored in a database), e.g. an SQLModel table
        # ImageMeta(content_hash UNIQUE, local_path) pointed at the canonical path.
        print(f"[INFO] DB update requested for: {args.update_db} (implementierung projektspezifisch)")
|
||||
|
||||
|
||||
def main():
    """Build the scan/report/apply command line and dispatch to the handler."""
    parser = argparse.ArgumentParser(description="Bild-Deduplizierung (scan/report/apply)")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # scan
    scan_parser = commands.add_parser("scan", help="Verzeichnisse scannen und Index/Report erstellen")
    scan_parser.add_argument("--roots", required=True, help="Kommagetrennte Wurzelpfade, z.B. 'media,assets/images'")
    scan_parser.add_argument("--out-dir", default=".dedupe", help="Ausgabeverzeichnis für Index/Reports")
    scan_parser.add_argument("--index", help="Pfad zur SQLite-Indexdatei (default .dedupe/index.sqlite)")
    scan_parser.add_argument("--report", help="Pfad zum CSV-Report (default .dedupe/report.csv)")
    scan_parser.add_argument("--phash", action="store_true", help="Perzeptuellen Hash berechnen (für zukünftige Near-Dups)")
    scan_parser.set_defaults(func=cmd_scan)

    # report
    report_parser = commands.add_parser("report", help="Report neu generieren/anzeigen")
    report_parser.add_argument("--index", help="Pfad zur SQLite-Indexdatei")
    report_parser.add_argument("--report", help="Pfad zum CSV-Report")
    report_parser.add_argument("--csv", action="store_true", help="CSV-Pfad ausgeben")
    report_parser.set_defaults(func=cmd_report)

    # apply
    apply_parser = commands.add_parser("apply", help="Änderungen anwenden (Hardlink/Delete)")
    apply_parser.add_argument("--report", help="Pfad zum CSV-Report")
    apply_parser.add_argument("--mode", choices=["hardlink", "delete"], default="hardlink", help="Strategie für Duplikate")
    apply_parser.add_argument("--dry-run", action="store_true", help="Nur anzeigen, nichts ändern")
    apply_parser.add_argument("--update-db", help="Optional: DB-URL für Referenz-Updates (projektspezifisch)")
    apply_parser.set_defaults(func=cmd_apply)

    parsed = parser.parse_args()
    parsed.func(parsed)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
363
utils/article_extractor.py
Normal file
363
utils/article_extractor.py
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
# utils/article_extractor.py
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
# Konfiguration
|
||||
REQUEST_TIMEOUT = 15
|
||||
MAX_RETRIES = 3
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
|
||||
# Website-spezifische Selektoren
|
||||
CONTENT_SELECTORS = {
|
||||
# Promobil & Camping-spezifisch
|
||||
'promobil.de': [
|
||||
{'tag': 'div', 'class': 'article__text'},
|
||||
{'tag': 'div', 'class': 'article-content'},
|
||||
{'tag': 'div', 'class': 'content-text'}
|
||||
],
|
||||
'camping.info': [
|
||||
{'tag': 'div', 'class': 'article-body'},
|
||||
{'tag': 'div', 'class': 'post-content'}
|
||||
],
|
||||
'caravaning.de': [
|
||||
{'tag': 'div', 'class': 'article__content'},
|
||||
{'tag': 'div', 'class': 'entry-content'}
|
||||
],
|
||||
|
||||
# WordPress Standard-Selektoren
|
||||
'wordpress': [
|
||||
{'tag': 'div', 'class': 'entry-content'},
|
||||
{'tag': 'div', 'class': 'post-content'},
|
||||
{'tag': 'div', 'class': 'content'},
|
||||
{'tag': 'main', 'class': 'main-content'},
|
||||
{'tag': 'article', 'class': None}
|
||||
],
|
||||
|
||||
# Allgemeine Fallbacks
|
||||
'generic': [
|
||||
{'tag': 'article', 'class': None},
|
||||
{'tag': 'div', 'class': 'content'},
|
||||
{'tag': 'div', 'class': 'post'},
|
||||
{'tag': 'div', 'class': 'entry'},
|
||||
{'tag': 'main', 'class': None},
|
||||
{'tag': 'div', 'id': 'content'},
|
||||
{'tag': 'div', 'id': 'main'}
|
||||
]
|
||||
}
|
||||
|
||||
def get_domain_from_url(url: str) -> str:
    """
    Return the lowercased network location (host, plus port if present) of *url*.

    Returns an empty string when the URL has no network location or cannot
    be parsed.
    """
    try:
        from urllib.parse import urlparse
        parsed = urlparse(url)
        return parsed.netloc.lower()
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return ""
|
||||
|
||||
def get_selectors_for_domain(domain: str) -> list:
    """
    Return the selector list for the first known domain contained in *domain*.

    The 'wordpress' and 'generic' entries are not domains and are skipped
    during matching; the 'generic' list is returned when nothing matches.
    """
    non_domain_keys = ('wordpress', 'generic')
    for known_domain, selectors in CONTENT_SELECTORS.items():
        if known_domain in non_domain_keys:
            continue
        if known_domain in domain:
            return selectors

    # WordPress sites are detected later from the page markup, not here.
    return CONTENT_SELECTORS['generic']
|
||||
|
||||
def is_wordpress_site(soup: BeautifulSoup) -> bool:
    """
    Return True when the parsed page shows typical WordPress markers.

    Checked in order: a ``generator`` meta tag mentioning WordPress, any
    ``<link>`` whose href contains ``/wp-``, and the REST API discovery
    link (``rel="https://api.w.org/"``). Any error during the checks is
    treated as "not WordPress".
    """
    try:
        # WordPress generator meta tag
        generator = soup.find('meta', attrs={'name': 'generator'})
        if generator and 'wordpress' in generator.get('content', '').lower():
            return True

        # WordPress-specific link tags
        if soup.find_all('link', href=lambda href: href and '/wp-' in href):
            return True

        # WordPress REST API discovery link
        if soup.find('link', attrs={'rel': 'https://api.w.org/'}):
            return True

        return False
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return False
|
||||
|
||||
def clean_extracted_text(text: str) -> str:
    """
    Remove navigation/advertising lines from extracted text.

    The text is split on newlines; a line is dropped when it is shorter
    than 10 characters after stripping, contains one of the skip patterns
    (case-insensitive), or contains more than three of the symbols
    ``|•→←↑↓``. The remaining lines are joined with single spaces and all
    whitespace runs are collapsed. Returns "" for empty input.
    """
    if not text:
        return ""

    # Typical navigation/footer phrases; built once, not once per line.
    skip_patterns = (
        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
        'mehr zum thema', 'ähnliche artikel', 'kommentare',
        'anzeige', 'werbung', 'advertisement',
    )
    nav_symbols = '|•→←↑↓'

    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()

        # Very short lines are most likely navigation or ads.
        if len(line) < 10:
            continue

        lowered = line.lower()
        if any(pattern in lowered for pattern in skip_patterns):
            continue

        # Many separator/arrow symbols indicate a navigation bar.
        if sum(1 for ch in line if ch in nav_symbols) > 3:
            continue

        cleaned_lines.append(line)

    # Join the lines, then collapse repeated whitespace.
    return ' '.join(' '.join(cleaned_lines).split())
|
||||
|
||||
def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
    """
    Try each selector in order and return the cleaned text of the first
    matching element that holds more than 50 words; "" if none qualifies.

    A selector is a dict with a 'tag' and optionally a 'class' or an 'id'.
    Script, style, nav, header, footer and aside tags are removed from a
    matched element (this mutates *soup*) before its text is read.
    A selector that raises is logged at debug level and skipped.
    """
    for selector in selectors:
        try:
            css_class = selector.get('class')
            if css_class:
                element = soup.find(selector['tag'], class_=css_class)
            else:
                element_id = selector.get('id')
                if element_id:
                    element = soup.find(selector['tag'], id=element_id)
                else:
                    element = soup.find(selector['tag'])

            if not element:
                continue

            # Strip non-content tags before reading the text.
            for unwanted in element(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            text = element.get_text(' ', strip=True)

            # Only accept the element when it holds enough text.
            if len(text.split()) > 50:
                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
                return clean_extracted_text(text)

        except Exception as e:
            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
            continue

    return ""
|
||||
|
||||
def extract_from_paragraphs(soup: BeautifulSoup) -> str:
    """
    Fallback: build the article text from all ``<p>`` tags.

    Paragraphs with more than 20 characters are joined with spaces; the
    cleaned result is returned when the joined text has more than 30 words,
    otherwise "". Any error is logged and yields "".
    """
    try:
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return ""

        # Keep only the longer paragraphs.
        stripped = (p.get_text(strip=True) for p in paragraphs)
        combined_text = ' '.join(t for t in stripped if len(t) > 20)

        if len(combined_text.split()) <= 30:
            return ""

        logging.info(f"✅ Fallback-Extraktion aus {len(paragraphs)} Paragraphen")
        return clean_extracted_text(combined_text)

    except Exception as e:
        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
        return ""
|
||||
|
||||
def extract_full_article(url: str) -> str:
    """
    Main entry point: download *url* and return the extracted article text.

    Extraction is attempted in this order: domain/WordPress selectors,
    generic selectors, paragraph tags, whole body text. Network errors are
    retried up to MAX_RETRIES times with exponential backoff; any other
    error aborts immediately. Returns "" when *url* is empty, when nothing
    usable is found, or when the download ultimately fails.
    """
    if not url:
        return ""

    # Request headers do not change between attempts, so build them once.
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    retries = 0

    while retries < MAX_RETRIES:
        try:
            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {retries + 1})")

            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
            response.raise_for_status()

            # response.encoding is None when the server declares no charset;
            # in that case leave decoding to requests instead of crashing.
            declared_encoding = (response.encoding or '').lower()
            if declared_encoding in ['iso-8859-1', 'windows-1252']:
                response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, "html.parser")

            # Determine the domain-specific selectors.
            domain = get_domain_from_url(url)
            selectors = get_selectors_for_domain(domain)

            # Try the WordPress selectors first on WordPress sites.
            if is_wordpress_site(soup):
                logging.info("🔧 WordPress-Site erkannt")
                selectors = CONTENT_SELECTORS['wordpress'] + selectors

            # Attempt 1: domain-specific selectors
            extracted_text = extract_with_selectors(soup, selectors)

            if extracted_text and len(extracted_text.split()) > 50:
                logging.info(f"🎉 Erfolgreiche Extraktion: {len(extracted_text.split())} Wörter")
                return extracted_text

            # NOTE(review): the fallbacks below only run while extracted_text
            # is empty; a short but non-empty result skips them and is
            # returned as-is at the end — confirm this is intended.

            # Attempt 2: generic selectors
            if not extracted_text:
                logging.info("🔄 Fallback auf generische Selektoren")
                extracted_text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])

                if extracted_text and len(extracted_text.split()) > 50:
                    logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # Attempt 3: paragraph extraction
            if not extracted_text:
                logging.info("🔄 Fallback auf Paragraph-Extraktion")
                extracted_text = extract_from_paragraphs(soup)

                if extracted_text and len(extracted_text.split()) > 30:
                    logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # Attempt 4: whole body text
            if not extracted_text:
                logging.info("🔄 Letzter Fallback: Body-Text")
                body = soup.find('body')
                if body:
                    # Remove navigation, header, footer and similar tags.
                    for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
                        element.decompose()

                    body_text = body.get_text(' ', strip=True)
                    if len(body_text.split()) > 100:
                        extracted_text = clean_extracted_text(body_text)
                        logging.info(f"⚠️ Body-Extraktion: {len(extracted_text.split())} Wörter")
                        return extracted_text

            # Nothing usable found
            if not extracted_text:
                logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
                return ""

            return extracted_text

        except requests.RequestException as e:
            retries += 1
            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {retries}): {e}")

            if retries < MAX_RETRIES:
                time.sleep(2 ** retries)  # exponential backoff
                continue
            else:
                logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
                return ""

        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
            return ""

    return ""
|
||||
|
||||
def extract_article_summary(full_text: str, max_length: int = 300) -> str:
    """
    Build a short summary from the leading sentences of *full_text*.

    Only the first five '.'-separated pieces are considered. Pieces shorter
    than 20 characters are skipped; collection stops as soon as the next
    piece would push the collected length over *max_length*. The pieces are
    joined with '. ', a final '.' is added if missing, and the result is cut
    to *max_length* characters. Returns "" for empty input.
    """
    if not full_text:
        return ""

    picked = []
    used = 0
    for raw in full_text.split('.')[:5]:
        candidate = raw.strip()
        if len(candidate) < 20:
            continue  # too short to be a meaningful sentence
        if used + len(candidate) > max_length:
            break
        picked.append(candidate)
        used += len(candidate)

    summary = '. '.join(picked)
    if summary and not summary.endswith('.'):
        summary += '.'

    return summary[:max_length]
|
||||
|
||||
def validate_extracted_content(text: str) -> bool:
    """
    Return True when *text* looks like usable article content.

    The text must have at least 100 characters after stripping and at least
    50 words, no more than 5% of its characters may be the symbols
    ``|•→←↑↓``, and the average word length must be at least 3.
    """
    if not text:
        return False
    if len(text.strip()) < 100:
        return False

    words = text.split()
    word_count = len(words)
    if word_count < 50:
        return False

    # A high share of separator/arrow symbols indicates navigation.
    symbol_count = sum(1 for ch in text if ch in '|•→←↑↓')
    if symbol_count > len(text) * 0.05:
        return False

    # Very short average words also indicate navigation.
    mean_word_length = sum(map(len, words)) / word_count
    return mean_word_length >= 3
|
||||
23
utils/article_utils.py
Normal file
23
utils/article_utils.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
# utils/article_utils.py
|
||||
|
||||
import hashlib
|
||||
|
||||
def clean_text(text: str) -> str:
    """Return *text* without leading and trailing whitespace."""
    stripped = text.strip()
    return stripped
|
||||
|
||||
def generate_id(link: str) -> str:
    """Return the hexadecimal MD5 digest of the UTF-8 encoded *link*."""
    digest = hashlib.md5()
    digest.update(link.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
def categorize_article(text: str) -> str:
    """Return the category for *text*; currently always the dummy "Allgemein"."""
    # Dummy category: the text is not inspected.
    category = "Allgemein"
    return category
|
||||
|
||||
def tag_article(text: str) -> list:
    """Return the tags for *text*; currently always the dummy tags."""
    # Dummy tags: the text is not inspected. A new list is built per call.
    placeholder_tags = ["tag1", "tag2"]
    return placeholder_tags
|
||||
|
||||
def summarize_text(text: str, max_length: int = 200) -> str:
    """Return *text* cut to *max_length* characters.

    "..." is appended only when something was actually cut off, so a text
    that already fits is returned unchanged.
    """
    if len(text) <= max_length:
        return text
    return text[:max_length] + "..."
|
||||
|
||||
def rewrite_text(text: str) -> str:
    """Return *text* unchanged (placeholder, e.g. for a later GPT rewrite)."""
    rewritten = text
    return rewritten
|
||||
51
utils/config.py
Normal file
51
utils/config.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import os
|
||||
from typing import Dict, List
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def validate_env() -> Dict:
    """Validate the security-relevant environment variables.

    Returns a dict with ``ok`` (True when there are no errors), ``errors``
    and ``warnings`` (lists of messages) and ``summary`` (variable name ->
    whether a non-blank value is set).
    """
    def _read(name: str) -> str:
        # Unset and blank values are treated alike.
        return os.getenv(name, "").strip()

    wp_base_url = _read("WP_BASE_URL")
    wp_user = _read("WP_USERNAME")
    wp_pass = _read("WP_PASSWORD")
    wp_b64 = _read("WP_AUTH_BASE64")
    openai_key = _read("OPENAI_API_KEY")

    errors: List[str] = []
    warnings: List[str] = []

    # WP_BASE_URL is mandatory and must be an http(s) URL.
    if not wp_base_url:
        errors.append("WP_BASE_URL fehlt in .env")
    elif not wp_base_url.startswith(("http://", "https://")):
        errors.append("WP_BASE_URL muss mit http:// oder https:// beginnen")

    # Credentials: either Base64 or username + password.
    if not wp_b64:
        if wp_user and wp_pass:
            # Works, but Base64 is the recommended variant.
            warnings.append("WP_AUTH_BASE64 nicht gesetzt – Empfehlung: Base64 nutzen (Application Password)")
        else:
            errors.append("Entweder WP_AUTH_BASE64 oder WP_USERNAME + WP_PASSWORD in .env setzen")

    if not openai_key:
        warnings.append("OPENAI_API_KEY ist nicht gesetzt – Umschreibungsfunktion ist deaktiviert")

    values = (
        ("WP_BASE_URL", wp_base_url),
        ("WP_USERNAME", wp_user),
        ("WP_PASSWORD", wp_pass),
        ("WP_AUTH_BASE64", wp_b64),
        ("OPENAI_API_KEY", openai_key),
    )
    summary = {name: bool(value) for name, value in values}

    return {"ok": not errors, "errors": errors, "warnings": warnings, "summary": summary}
|
||||
|
||||
|
||||
__all__ = ["validate_env"]
|
||||
|
||||
367
utils/css_loader.py
Normal file
367
utils/css_loader.py
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
# utils/css_loader.py
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def load_css():
    """
    Load the central stylesheet and inject it into the Streamlit app.

    The file is expected at ``<parent of this package>/static/styles.css``.
    When it is missing, create_css_file() is called once and the path is
    checked again; there is no recursion, so a failing creation cannot loop.

    Returns True when the CSS was injected. On any error the problem is
    shown via ``st.error`` and False is returned.
    """
    try:
        css_file = Path(__file__).parent.parent / "static" / "styles.css"

        if not css_file.exists():
            # Fallback: create the stylesheet, then verify exactly once.
            create_css_file()
            if not css_file.exists():
                raise FileNotFoundError(f"CSS-Datei konnte nicht erstellt werden: {css_file}")

        css_content = css_file.read_text(encoding="utf-8")

        # Inject the CSS into the page.
        st.markdown(f"<style>\n{css_content}\n</style>", unsafe_allow_html=True)
        return True

    except Exception as e:
        st.error(f"Fehler beim Laden der CSS-Datei: {e}")
        return False
|
||||
|
||||
def create_css_file():
    """Write the default stylesheet to ``<project root>/static/styles.css``.

    The ``static`` directory is created when it does not exist yet and an
    existing ``styles.css`` is overwritten.

    In ``.article-card`` and ``.wp-status`` the ``border`` shorthand is
    declared *before* ``border-left``: the shorthand resets all four sides, so
    declaring it last would wipe out the coloured accent stripe on the left.

    Returns:
        bool: ``True`` when the file was written, ``False`` on any error (the
        error is reported via ``st.error``).
    """
    css_content = """/* ===============================================
   RSS Feed Manager - Zentrale CSS-Datei
   Dark-Mode optimiert mit Fallbacks
   =============================================== */

/* === ROOT VARIABLEN === */
:root {
    /* Dark Mode Farbpalette */
    --bg-primary: #1e1e1e;
    --bg-secondary: #2d2d30;
    --bg-card: #2d2d30;
    --bg-header: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    --bg-filter: #363636;

    /* Text Farben */
    --text-primary: #ffffff;
    --text-secondary: #b0b0b0;
    --text-muted: #888888;
    --text-accent: #667eea;

    /* Status Farben */
    --status-new: #2196f3;
    --status-new-bg: #1565c0;
    --status-rewrite: #ff9800;
    --status-rewrite-bg: #ef6c00;
    --status-process: #9c27b0;
    --status-process-bg: #6a1b9a;
    --status-online: #4caf50;
    --status-online-bg: #2e7d32;
    --status-hold: #e91e63;
    --status-hold-bg: #ad1457;
    --status-trash: #f44336;
    --status-trash-bg: #c62828;
    --status-wp-pending: #00bcd4;
    --status-wp-pending-bg: #0097a7;

    /* Borders & Shadows */
    --border-color: #404040;
    --shadow-light: 0 2px 8px rgba(0, 0, 0, 0.3);
    --shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.4);

    /* Accent Colors */
    --gradient-primary: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    --gradient-secondary: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
}

/* === LIGHT MODE FALLBACKS === */
[data-theme="light"], .stApp[data-theme="light"] {
    --bg-primary: #ffffff;
    --bg-secondary: #f8f9fa;
    --bg-card: #ffffff;
    --bg-filter: #f0f2f6;

    --text-primary: #212529;
    --text-secondary: #495057;
    --text-muted: #6c757d;
    --text-accent: #667eea;

    --border-color: #dee2e6;
    --shadow-light: 0 2px 8px rgba(0, 0, 0, 0.1);
    --shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.15);
}

/* === HAUPTCONTAINER === */
.main-header {
    background: var(--bg-header);
    padding: 2rem;
    border-radius: 12px;
    margin-bottom: 2rem;
    color: var(--text-primary);
    text-align: center;
    box-shadow: var(--shadow-light);
}

.main-header h1 {
    color: var(--text-primary) !important;
    margin: 0 0 0.5rem 0;
    font-size: 2.5rem;
    font-weight: 700;
}

.main-header p {
    color: rgba(255, 255, 255, 0.9) !important;
    margin: 0;
    font-size: 1.1rem;
}

/* === ARTIKEL CARDS === */
.article-card {
    background: var(--bg-card) !important;
    border-radius: 12px;
    padding: 1.5rem;
    margin-bottom: 1rem;
    box-shadow: var(--shadow-light);
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--text-accent);
    transition: all 0.3s ease;
    color: var(--text-primary) !important;
}

.article-card:hover {
    transform: translateY(-2px);
    box-shadow: var(--shadow-hover);
    border-color: var(--text-accent);
}

.article-card h3,
.article-card .article-title {
    color: var(--text-primary) !important;
    margin: 0 0 0.5rem 0;
    font-size: 1.2rem;
    font-weight: 600;
    line-height: 1.4;
}

.article-card .article-meta {
    color: var(--text-secondary) !important;
    font-size: 0.9rem;
    margin-bottom: 1rem;
}

.article-card .article-summary {
    color: var(--text-secondary) !important;
    line-height: 1.5;
    margin-bottom: 1rem;
}

.article-card .article-footer {
    color: var(--text-muted) !important;
    font-size: 0.85rem;
    display: flex;
    justify-content: space-between;
    align-items: center;
}

/* === STATUS BADGES === */
.status-badge {
    padding: 0.3rem 0.8rem;
    border-radius: 20px;
    font-size: 0.8rem;
    font-weight: 600;
    margin-right: 0.5rem;
    display: inline-block;
    color: white !important;
}

.status-new {
    background-color: var(--status-new-bg) !important;
    color: white !important;
}

.status-rewrite {
    background-color: var(--status-rewrite-bg) !important;
    color: white !important;
}

.status-process {
    background-color: var(--status-process-bg) !important;
    color: white !important;
}

.status-online {
    background-color: var(--status-online-bg) !important;
    color: white !important;
}

.status-hold {
    background-color: var(--status-hold-bg) !important;
    color: white !important;
}

.status-trash {
    background-color: var(--status-trash-bg) !important;
    color: white !important;
}

.status-wp-pending {
    background-color: var(--status-wp-pending-bg) !important;
    color: white !important;
}

/* === FILTER SECTION === */
.filter-section {
    background: var(--bg-filter) !important;
    padding: 1.5rem;
    border-radius: 12px;
    margin-bottom: 2rem;
    border: 1px solid var(--border-color);
    box-shadow: var(--shadow-light);
}

.filter-section h3 {
    color: var(--text-primary) !important;
    margin: 0 0 1rem 0;
    font-size: 1.3rem;
    font-weight: 600;
}

/* === STATS CARDS === */
.stats-card {
    background: var(--bg-card) !important;
    padding: 1.5rem;
    border-radius: 12px;
    text-align: center;
    box-shadow: var(--shadow-light);
    border: 1px solid var(--border-color);
    transition: transform 0.2s ease;
}

.stats-card:hover {
    transform: translateY(-2px);
}

.stats-number {
    font-size: 2.5rem;
    font-weight: 700;
    color: var(--text-accent) !important;
    margin-bottom: 0.5rem;
    display: block;
}

.stats-card div:last-child {
    color: var(--text-secondary) !important;
    font-weight: 500;
    font-size: 1rem;
}

/* === WORDPRESS STATUS === */
.wp-status {
    background: var(--bg-card) !important;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--status-wp-pending);
    box-shadow: var(--shadow-light);
}

.wp-status strong {
    color: var(--text-primary) !important;
}

.wp-status small {
    color: var(--text-muted) !important;
}

/* === BUTTONS & ACTIONS === */
.stButton > button {
    background: var(--gradient-primary) !important;
    color: white !important;
    border: none !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: all 0.2s ease !important;
}

.stButton > button:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}

/* === SELECTBOX & INPUT OVERRIDES === */
.stSelectbox > div > div {
    background-color: var(--bg-card) !important;
    color: var(--text-primary) !important;
    border: 1px solid var(--border-color) !important;
}

.stTextInput > div > div > input {
    background-color: var(--bg-card) !important;
    color: var(--text-primary) !important;
    border: 1px solid var(--border-color) !important;
}

/* === RESPONSIVE DESIGN === */
@media (max-width: 768px) {
    .main-header {
        padding: 1.5rem;
    }

    .main-header h1 {
        font-size: 2rem;
    }

    .article-card {
        padding: 1rem;
    }

    .stats-card {
        padding: 1rem;
    }

    .stats-number {
        font-size: 2rem;
    }
}
"""

    try:
        # Create the static directory if it does not exist yet.
        static_dir = Path(__file__).parent.parent / "static"
        static_dir.mkdir(exist_ok=True)

        # Write the stylesheet.
        css_file = static_dir / "styles.css"
        with open(css_file, "w", encoding="utf-8") as f:
            f.write(css_content)

        return True
    except Exception as e:
        st.error(f"Fehler beim Erstellen der CSS-Datei: {e}")
        return False
|
||||
|
||||
def apply_dark_theme():
    """Emit a script that sets ``data-theme`` from the browser colour scheme.

    The snippet sets ``data-theme`` on the document root to ``dark`` or
    ``light`` depending on ``prefers-color-scheme``.

    NOTE(review): browsers normally do not execute ``<script>`` tags inserted
    as markup, so this may have no effect when rendered through
    ``st.markdown`` -- confirm in the running app.
    """
    theme_script = """
    <script>
    // Dark Theme Detection und Anwendung
    const prefersDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    if (prefersDark) {
        document.documentElement.setAttribute('data-theme', 'dark');
    } else {
        document.documentElement.setAttribute('data-theme', 'light');
    }
    </script>
    """
    st.markdown(theme_script, unsafe_allow_html=True)
|
||||
29
utils/dalle_generator.py
Normal file
29
utils/dalle_generator.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
import openai
|
||||
import logging
|
||||
import os
|
||||
|
||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
def generate_dalle_image(prompt: str) -> str:
    """Generate one image with DALL-E 3 for *prompt* and return its URL.

    A single 1024x1024 image in ``standard`` quality is requested.

    Returns:
        The URL of the generated image, or ``None`` when the request fails
        for any reason (the exception is logged with its traceback).
    """
    request_options = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": 1,
        "size": "1024x1024",
        "quality": "standard",
    }
    try:
        logging.info(f"🧠 Generiere DALL·E-Bild für Prompt: {prompt}")
        result = openai.images.generate(**request_options)
        generated_url = result.data[0].url
        logging.info(f"✅ Bild generiert: {generated_url}")
        return generated_url
    except Exception:
        # Best effort: callers receive None instead of an exception.
        logging.exception("❌ Fehler bei der DALL·E-Bildgenerierung")
        return None
|
||||
|
|
@ -2,59 +2,325 @@
|
|||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict
|
||||
|
||||
# Konfiguration
|
||||
MAX_IMAGES = 5
|
||||
MIN_IMAGE_SIZE = 100 # Mindestgröße in Pixeln
|
||||
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
|
||||
REQUEST_TIMEOUT = 10
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
|
||||
def extract_images_with_metadata(article_url):
|
||||
def is_valid_image_url(url: str) -> bool:
    """Return whether *url* looks like a usable content image.

    A URL is accepted when it is absolute (scheme and host present), its path
    ends with one of ``ALLOWED_EXTENSIONS`` and it does not look like an
    avatar, logo, ad or tracking asset.

    The short pattern ``ads`` is matched against the start of URL tokens
    rather than as a raw substring, because a substring match also hits
    ``uploads`` and ``downloads`` and would reject every image stored under
    e.g. ``/wp-content/uploads/``.

    Returns:
        bool: ``True`` for an acceptable image URL, ``False`` otherwise
        (including input that cannot be parsed).
    """
    import re  # local import: `re` is not among the module-level imports visible here

    # Distinctive markers that are safe to match anywhere in the URL.
    substring_patterns = (
        'avatar', 'profile', 'icon', 'logo', 'banner',
        'advertisement', 'tracking', 'pixel', 'social',
    )

    try:
        parsed = urlparse(url)

        # The URL must be absolute.
        if not parsed.scheme or not parsed.netloc:
            return False

        # The path must end with an allowed image extension.
        if not parsed.path.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
            return False

        lowered = url.lower()
        if any(pattern in lowered for pattern in substring_patterns):
            return False

        # 'ads' only counts at the start of a token ("ads", "adserver"),
        # never inside words such as "uploads".
        tokens = re.split(r'[^a-z0-9]+', lowered)
        return not any(token.startswith('ads') for token in tokens)

    except (AttributeError, TypeError, ValueError):
        # Non-string or malformed input is simply not a valid image URL.
        return False
|
||||
|
||||
def get_image_dimensions(img_tag) -> tuple:
    """Read the pixel dimensions from an image tag's HTML attributes.

    Args:
        img_tag: Any object with a mapping-style ``get`` method (the callers
            in this file pass parsed ``<img>`` tags).

    Returns:
        tuple: ``(width, height)`` as integers when both the ``width`` and
        ``height`` attributes are present and numeric (an optional ``px``
        suffix is tolerated); ``(None, None)`` otherwise, e.g. for percentage
        values or missing attributes.
    """
    def _to_pixels(value):
        # Accept "640" as well as "640px"; anything else raises ValueError.
        text = str(value).strip().lower()
        if text.endswith("px"):
            text = text[:-2].strip()
        return int(text)

    try:
        width = img_tag.get('width')
        height = img_tag.get('height')
        if width and height:
            return _to_pixels(width), _to_pixels(height)
    except (AttributeError, TypeError, ValueError):
        # Missing tag or non-numeric attribute: dimensions are unknown.
        pass

    return None, None
|
||||
|
||||
def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """Collect URL, texts and attribution for a single ``<img>`` tag.

    The image source is taken from ``src``, ``data-src`` or ``data-lazy-src``
    (in that order) and resolved against *base_url*. Caption and copyright
    are looked up in the nearest ``figure``/``div``/``span``/``p`` ancestor.

    Returns:
        A dict with the keys ``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``width``, ``height`` and ``title``; or ``None``
        when the tag has no source, the URL is rejected by
        ``is_valid_image_url``, both dimensions are known and one is below
        ``MIN_IMAGE_SIZE``, or any error occurs (logged).
    """
    try:
        source = (
            img_tag.get('src')
            or img_tag.get('data-src')
            or img_tag.get('data-lazy-src')
        )
        if not source:
            return None

        absolute_url = urljoin(base_url, source)
        if not is_valid_image_url(absolute_url):
            return None

        alt = img_tag.get('alt', '').strip()
        title_attr = img_tag.get('title', '').strip()

        width, height = get_image_dimensions(img_tag)
        # Skip very small images (only decidable when both sizes are known).
        too_small = width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE if width and height else False
        if too_small:
            return None

        caption_text = ""
        credit_text = "Unbekannt"
        credit_url = base_url

        container = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if container:
            fig = container.find('figcaption')
            if fig:
                caption_text = fig.get_text(strip=True)
                # A link inside the figcaption is taken as the copyright source.
                credit_link = fig.find('a')
                if credit_link:
                    credit_url = urljoin(base_url, credit_link.get('href', ''))
                    credit_text = credit_link.get_text(strip=True)

            # Otherwise use the first small/emphasised text of plausible length.
            if not caption_text:
                for node in container.find_all(['small', 'em', 'i'], limit=3):
                    candidate = node.get_text(strip=True)
                    if 10 < len(candidate) < 200:
                        caption_text = candidate
                        break

        if not caption_text:
            caption_text = title_attr or alt or "Bild aus Originalartikel"

        return {
            "url": absolute_url,
            "alt": alt,
            "caption": caption_text[:300] if caption_text else "Kein Bildtitel vorhanden",
            "copyright": credit_text or "Unbekannt",
            "copyright_url": credit_url or base_url,
            "width": width,
            "height": height,
            "title": title_attr
        }

    except Exception as e:
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
|
||||
|
||||
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """Download an article page and extract up to ``MAX_IMAGES`` images.

    Every ``<img>`` tag is passed to ``extract_image_metadata``; rejected
    tags and duplicate URLs are skipped. The result is sorted by pixel area,
    largest first. Images without known dimensions count as area 0; the sort
    is stable, so they keep their document order behind the sized ones.

    Args:
        article_url: URL of the article page. An empty value yields ``[]``.

    Returns:
        A list of image metadata dicts; ``[]`` on network or parsing errors
        (logged).
    """
    images = []

    if not article_url:
        return images

    def _pixel_area(image):
        # 'width'/'height' are present but None when the tag carried no size
        # attributes, so a plain .get(key, 0) would still return None.
        return (image.get('width') or 0) * (image.get('height') or 0)

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # Browser-like headers; some sites refuse requests without them.
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates

        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)

                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])

                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")

                    # Stop once the maximum is reached.
                    if len(images) >= MAX_IMAGES:
                        break

            except Exception as e:
                # One broken tag must not abort the whole extraction.
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Largest images first.
        images.sort(key=_pixel_area, reverse=True)

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # enforce the limit once more

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
|
||||
|
||||
def validate_image_url(url: str) -> bool:
    """Check via an HTTP ``HEAD`` request whether *url* serves an image.

    Redirects are followed explicitly: ``requests.head`` does not follow them
    by default, which would report images behind a 301/302 as unreachable.

    Returns:
        bool: ``True`` when the final response has status 200 and a
        ``Content-Type`` containing ``image``; ``False`` otherwise, including
        network errors and invalid URLs.
    """
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
    except (requests.RequestException, TypeError, ValueError):
        # Unreachable host, timeout or malformed URL: not a usable image.
        return False

    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
|
||||
|
||||
def extract_featured_image(article_url: str) -> Dict:
    """Find the main image of an article from its social-media meta tags.

    The page is downloaded and the OpenGraph ``og:image`` tag is checked
    first, then the Twitter card ``twitter:image`` tag. The first tag whose
    resolved URL passes ``is_valid_image_url`` wins.

    Returns:
        A metadata dict with ``type`` set to ``"featured"``, or ``None`` when
        no suitable tag exists or any error occurs (logged).
    """
    # Meta tag attribute filters in priority order.
    meta_filters = (
        {'property': 'og:image'},
        {'name': 'twitter:image'},
    )

    try:
        response = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        for attr_filter in meta_filters:
            meta_tag = soup.find('meta', attrs=attr_filter)
            if not (meta_tag and meta_tag.get('content')):
                continue

            candidate_url = urljoin(article_url, meta_tag['content'])
            if is_valid_image_url(candidate_url):
                return {
                    "url": candidate_url,
                    "alt": "Featured Image",
                    "caption": "Hauptbild des Artikels",
                    "copyright": "Unbekannt",
                    "copyright_url": article_url,
                    "type": "featured"
                }

        return None

    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
|
||||
|
||||
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """Normalise image metadata dicts and drop unusable entries.

    Entries without a URL, or whose URL is rejected by
    ``is_valid_image_url``, are skipped. Text fields are stripped and
    truncated (alt/title 200, caption 300, copyright 100 characters); empty
    caption/copyright fall back to German placeholder texts, and a missing
    copyright URL falls back to the image URL itself.

    Returns:
        A new list of cleaned dicts; entries that raise during cleaning are
        logged and skipped.
    """
    def _stripped(record, key, fallback=""):
        # Treat missing and None values alike before stripping.
        return (record.get(key) or fallback).strip()

    result = []

    for record in images:
        try:
            raw_url = record.get("url")
            if not raw_url or not is_valid_image_url(raw_url):
                continue

            entry = {
                "url": raw_url.strip(),
                "alt": _stripped(record, "alt")[:200],
                "caption": _stripped(record, "caption", "Kein Bildtitel vorhanden")[:300],
                "copyright": _stripped(record, "copyright", "Unbekannt")[:100],
                "copyright_url": _stripped(record, "copyright_url", "#"),
                "width": record.get("width"),
                "height": record.get("height"),
                "title": _stripped(record, "title")[:200]
            }

            # Whitespace-only values end up empty after stripping.
            if not entry["caption"]:
                entry["caption"] = "Kein Bildtitel vorhanden"
            if not entry["copyright"]:
                entry["copyright"] = "Unbekannt"
            if entry["copyright_url"] in ("", "#"):
                entry["copyright_url"] = raw_url  # image URL as fallback

            result.append(entry)

        except Exception as e:
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue

    return result
|
||||
|
||||
# Hauptfunktion für bessere Kompatibilität mit dem bestehenden Code
|
||||
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """Combine featured-image detection with the regular image extraction.

    The featured image (if any) comes first, followed by the images found in
    the page. Duplicate URLs keep their first occurrence, the entries are
    normalised with ``clean_image_metadata`` and the result is limited to
    ``MAX_IMAGES``.
    """
    candidates = []

    # 1. Featured image first so it survives de-duplication.
    featured_image = extract_featured_image(article_url)
    if featured_image:
        candidates.append(featured_image)

    # 2. Regular in-page images.
    candidates.extend(extract_images_with_metadata(article_url))

    # 3. De-duplicate by URL; dicts keep insertion order, so the first
    #    occurrence of each URL wins.
    by_url = {}
    for image in candidates:
        by_url.setdefault(image["url"], image)

    # 4. Normalise the metadata and enforce the limit.
    return clean_image_metadata(list(by_url.values()))[:MAX_IMAGES]
|
||||
236
utils/ui_helpers.py
Normal file
236
utils/ui_helpers.py
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
# utils/ui_helpers.py
|
||||
|
||||
import streamlit as st
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
def show_toast(message, type="success", duration=3):
    """Show *message* with the Streamlit status element matching *type*.

    Supported types are ``success``, ``error``, ``warning`` and ``info``;
    any other value shows nothing. *duration* is accepted but not used.
    """
    # Each supported type maps to the Streamlit function of the same name.
    if type in ("success", "error", "warning", "info"):
        getattr(st, type)(message)
|
||||
|
||||
def format_datetime(date_str):
    """Format a date string as ``DD.MM.YYYY HH:MM`` for display.

    ISO 8601 strings (including a trailing ``Z``) and RFC 2822 dates as used
    in RSS feeds (``Mon, 01 Jan 2024 10:00:00 GMT`` / ``+0100``) are parsed.
    The time is shown as written; no timezone conversion takes place.

    RFC 2822 dates are parsed with ``email.utils.parsedate_to_datetime``
    because ``strptime``'s ``%z`` does not accept the zone name ``GMT``, and
    ISO parsing is attempted first so that offsets such as ``+00:00`` are not
    mistaken for an RFC 2822 date.

    Args:
        date_str: The value to format. Non-string values are returned as
            ``str(date_str)``.

    Returns:
        str: The formatted date. Strings without any date marker, and strings
        that cannot be parsed (a warning is logged), are returned truncated
        to 16 characters.
    """
    if not isinstance(date_str, str):
        return str(date_str)

    # Strings without any date/timezone marker are only shortened.
    if not any(marker in date_str for marker in ("GMT", "+", "T", ",")):
        return date_str[:16]

    from email.utils import parsedate_to_datetime  # stdlib RFC 2822 parser

    parsers = (
        lambda value: datetime.fromisoformat(value.replace("Z", "+00:00")),
        parsedate_to_datetime,
    )

    last_error = None
    for parse in parsers:
        try:
            parsed = parse(date_str)
        except (TypeError, ValueError) as e:
            # Wrong format for this parser; try the next one.
            last_error = e
            continue
        return parsed.strftime("%d.%m.%Y %H:%M")

    logging.warning(f"Datum konnte nicht formatiert werden: {date_str} - {last_error}")
    return date_str[:16]
|
||||
|
||||
def get_status_color(status):
    """Return the hex colour associated with an article status.

    Unknown statuses fall back to the colour used for ``"New"``.
    """
    fallback_color = "#2196f3"
    palette = {
        "New": fallback_color,
        "Rewrite": "#ff9800",
        "Process": "#9c27b0",
        "Online": "#4caf50",
        "On Hold": "#e91e63",
        "Trash": "#f44336",
    }
    try:
        return palette[status]
    except KeyError:
        return fallback_color
|
||||
|
||||
def create_status_badge(status):
    """Return an inline-styled HTML ``<span>`` badge for *status*.

    The badge uses the colour from ``get_status_color``: as text colour, with
    the hex alpha suffix ``20`` for the background and ``40`` for the border.
    """
    badge_template = """
    <span style="
        background-color: {color}20;
        color: {color};
        padding: 0.25rem 0.5rem;
        border-radius: 12px;
        font-size: 0.8rem;
        font-weight: 600;
        border: 1px solid {color}40;
    ">{status}</span>
    """
    return badge_template.format(color=get_status_color(status), status=status)
|
||||
|
||||
def truncate_text(text, max_length=150):
|
||||
"""
|
||||
Kürzt Text auf maximale Länge
|
||||
"""
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
if len(text) <= max_length:
|
||||
return text
|
||||
|
||||
return text[:max_length].rsplit(' ', 1)[0] + "..."
|
||||
|
||||
def calculate_reading_time(text):
    """Estimate the reading time in whole minutes at 200 words per minute.

    Empty input yields 0; any non-empty text yields at least 1 minute.
    Partial minutes are rounded down.
    """
    if not text:
        return 0

    full_minutes = len(text.split()) // 200
    return full_minutes if full_minutes > 1 else 1
|
||||
|
||||
def validate_url(url):
    """Return whether *url* is a syntactically valid http(s) URL.

    Accepted hosts are domain names, ``localhost`` and dotted IPv4 addresses,
    optionally followed by a port and a path or query.

    The top-level domain may be 2 to 63 letters long (the maximum length of a
    DNS label), so long TLDs such as ``.photography`` are accepted.
    Non-string input is reported as invalid instead of raising.
    """
    import re

    if not isinstance(url, str):
        return False

    pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IPv4 address
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return pattern.match(url) is not None
|
||||
|
||||
def create_article_card_html(article, source_name="Unbekannt"):
    """Build the HTML for one article card.

    All text taken from the article or the feed (title, date, summary, tags,
    source name) is HTML-escaped before it is placed into the markup, so
    markup contained in feed content cannot break the card or inject
    elements. ``None`` values for ``text``, ``images`` and ``tags`` are
    treated like missing keys.

    Args:
        article: Dict with optional keys ``title``, ``date``, ``summary``,
            ``text``, ``status``, ``tags`` and ``images``.
        source_name: Display name of the feed the article came from.

    Returns:
        str: The HTML snippet for the card.
    """
    from html import escape  # local import keeps this block self-contained

    images = article.get("images") or []
    tags = article.get("tags") or []
    body_text = article.get("text") or ""
    status = article.get('status', 'New')

    word_count = len(body_text.split())
    reading_time = calculate_reading_time(body_text)

    # An image is incomplete when caption, copyright or copyright URL is
    # missing or empty.
    incomplete_images = any(
        not all(img.get(k) for k in ("caption", "copyright", "copyright_url"))
        for img in images
    )
    warning_icon = " ⚠️" if incomplete_images else ""

    title_html = escape(str(article.get('title', 'Kein Titel')))
    date_html = escape(str(format_datetime(article.get('date', ''))))
    # Truncate first, then escape, so entities are never cut in half.
    summary_html = escape(truncate_text(article.get('summary', ''), 200))
    source_html = escape(str(source_name))
    tags_html = escape(', '.join(str(tag) for tag in tags[:3]))
    tags_suffix = '...' if len(tags) > 3 else ''
    image_info = '• 🖼️ ' + str(len(images)) + ' Bilder' if images else ''

    return f"""
    <div style="
        background: white;
        border-radius: 12px;
        padding: 1.5rem;
        margin-bottom: 1rem;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-left: 4px solid {get_status_color(status)};
        transition: transform 0.2s ease;
    " onmouseover="this.style.transform='translateY(-2px)'" onmouseout="this.style.transform='translateY(0)'">

        <div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
            <div style="flex: 1;">
                <h3 style="margin: 0 0 0.5rem 0; color: #333; font-size: 1.1rem;">
                    {title_html}{warning_icon}
                </h3>
                <div style="font-size: 0.85rem; color: #666; margin-bottom: 0.5rem;">
                    📅 {date_html} •
                    📝 {word_count} Wörter •
                    ⏱️ {reading_time} Min Lesezeit
                    {image_info}
                </div>
            </div>
            <div>
                {create_status_badge(status)}
            </div>
        </div>

        <div style="margin-bottom: 1rem; color: #555; line-height: 1.4;">
            {summary_html}
        </div>

        <div style="display: flex; justify-content: space-between; align-items: center; font-size: 0.8rem; color: #888;">
            <div>
                📡 {source_html}
            </div>
            <div>
                🏷️ {tags_html}{tags_suffix}
            </div>
        </div>
    </div>
    """
|
||||
|
||||
def create_stats_card(title, value, icon="📊", color="#667eea"):
    """Return the HTML for a statistics card.

    The card shows *icon*, then *value* in the accent *color*, then *title*;
    *color* is also used for the top border.
    """
    card_template = """
    <div style="
        background: white;
        border-radius: 12px;
        padding: 1.5rem;
        text-align: center;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-top: 4px solid {color};
    ">
        <div style="font-size: 2rem; margin-bottom: 0.5rem;">{icon}</div>
        <div style="font-size: 2rem; font-weight: bold; color: {color}; margin-bottom: 0.5rem;">{value}</div>
        <div style="color: #666; font-weight: 500;">{title}</div>
    </div>
    """
    return card_template.format(title=title, value=value, icon=icon, color=color)
|
||||
|
||||
def show_loading_spinner(text="Lädt..."):
    """Render a CSS loading spinner with a caption into a new placeholder.

    The literal braces of the ``@keyframes`` rule are doubled because the
    markup is an f-string; single braces there are parsed as replacement
    fields and make the module fail to compile.

    Args:
        text: Caption shown below the spinner.

    Returns:
        Whatever ``st.empty().markdown(...)`` returns.
    """
    return st.empty().markdown(f"""
    <div style="text-align: center; padding: 2rem;">
        <div style="
            border: 4px solid #f3f3f3;
            border-top: 4px solid #667eea;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 0 auto 1rem auto;
        "></div>
        <div style="color: #666;">{text}</div>
    </div>
    <style>
    @keyframes spin {{
        0% {{ transform: rotate(0deg); }}
        100% {{ transform: rotate(360deg); }}
    }}
    </style>
    """, unsafe_allow_html=True)
|
||||
|
||||
def create_filter_section():
    """Return the opening HTML of the styled filter area.

    The returned markup opens a ``<div>`` and adds the heading but does not
    close the ``<div>``.
    """
    filter_header_html = """
    <div style="
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        border-radius: 12px;
        padding: 1.5rem;
        margin-bottom: 2rem;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    ">
        <h3 style="margin: 0 0 1rem 0; color: #333;">🔍 Filter & Suche</h3>
    """
    return filter_header_html
|
||||
|
||||
def get_error_message(error_type, details=""):
    """Return the user-facing message for *error_type* followed by *details*.

    Known types are ``feed_error``, ``save_error``, ``api_error``,
    ``validation_error`` and ``network_error``; any other type yields the
    generic "unknown error" message.
    """
    prefixes = {
        "feed_error": "❌ Fehler beim Laden des Feeds: ",
        "save_error": "❌ Fehler beim Speichern: ",
        "api_error": "❌ API-Fehler: ",
        "validation_error": "⚠️ Validierungsfehler: ",
        "network_error": "🌐 Netzwerkfehler: ",
    }
    prefix = prefixes.get(error_type, "❌ Unbekannter Fehler: ")
    return f"{prefix}{details}"
|
||||
464
utils/wordpress_uploader.py
Normal file
464
utils/wordpress_uploader.py
Normal file
|
|
@ -0,0 +1,464 @@
|
|||
# utils/wordpress_uploader.py
|
||||
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
import base64
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# WordPress API Konfiguration – ausschließlich aus .env
|
||||
WP_BASE_URL = os.getenv("WP_BASE_URL")
|
||||
WP_USERNAME = os.getenv("WP_USERNAME")
|
||||
WP_PASSWORD = os.getenv("WP_PASSWORD")
|
||||
WP_AUTH_BASE64 = os.getenv("WP_AUTH_BASE64")
|
||||
|
||||
# Request-Konfiguration
|
||||
REQUEST_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
USER_AGENT = 'RSS-Feed-Manager/1.7.x'
|
||||
|
||||
class WordPressUploader:
    """
    Upload articles to WordPress through the REST API (``/wp-json/wp/v2``)
    using an HTTP ``Authorization: Basic ...`` header.

    Configuration is read from the module-level ``WP_*`` constants. The
    constructor performs one HTTP request to resolve the default category.
    """

    def __init__(self):
        """
        Validate the configuration, create the HTTP session and resolve the
        default category.

        Raises:
            ValueError: If ``WP_BASE_URL`` is unset, or if neither
                ``WP_AUTH_BASE64`` nor both ``WP_USERNAME`` and
                ``WP_PASSWORD`` are set.
        """
        # Validate the base URL and build the API endpoint
        if not WP_BASE_URL:
            raise ValueError("WP_BASE_URL nicht gesetzt. Bitte .env konfigurieren.")
        self.base_url = WP_BASE_URL.rstrip('/')
        self.api_endpoint = f"{self.base_url}/wp-json/wp/v2"

        # Credentials (from .env)
        self.username = WP_USERNAME
        self.password = WP_PASSWORD
        self.auth_base64 = WP_AUTH_BASE64

        if not self.auth_base64 and not (self.username and self.password):
            raise ValueError("WordPress-Authentifizierung nicht konfiguriert. WP_AUTH_BASE64 oder WP_USERNAME + WP_PASSWORD setzen.")

        # One session is reused for all requests of this uploader
        self.session = requests.Session()

        # A pre-encoded token wins; otherwise encode "username:password"
        if self.auth_base64:
            auth_token = self.auth_base64
            auth_log_message = "✅ WordPress-Authentifizierung: Base64-String verwendet"
        else:
            credentials = f"{self.username}:{self.password}"
            auth_token = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
            auth_log_message = "✅ WordPress-Authentifizierung: Base64 aus Username/Password generiert"

        self.session.headers.update({
            'Authorization': f'Basic {auth_token}',
            'User-Agent': USER_AGENT,
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        })
        logging.info(auth_log_message)

        # Resolve the ID of the default category
        self.default_category_id = self._get_default_category_id()

    def _get_default_category_id(self) -> int:
        """
        Look up the ID of the default category named 'Allgemein'.

        Returns:
            int: The ID of the category whose name equals 'Allgemein'
            (case-insensitive); otherwise the first category returned by the
            search; otherwise ``1``. Any error also yields ``1``.
        """
        try:
            response = self.session.get(
                f"{self.api_endpoint}/categories",
                params={'search': 'Allgemein', 'per_page': 10},
                timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()

            categories = response.json()

            for category in categories:
                if category['name'].lower() == 'allgemein':
                    logging.info(f"✅ Standard-Kategorie 'Allgemein' gefunden: ID {category['id']}")
                    return category['id']

            # Fallback: first search hit, or the fixed ID 1
            if categories:
                logging.warning(f"⚠️ Kategorie 'Allgemein' nicht gefunden, verwende '{categories[0]['name']}' (ID: {categories[0]['id']})")
                return categories[0]['id']

            logging.warning("⚠️ Keine Kategorien gefunden, verwende Standard-ID 1")
            return 1

        except Exception as e:
            # Best effort: a failed lookup must not prevent construction
            logging.error(f"❌ Fehler beim Ermitteln der Standard-Kategorie: {e}")
            return 1

    def _find_tag_id(self, tag_name: str) -> Optional[int]:
        """Return the ID of an existing tag whose name equals *tag_name* (case-insensitive), else None."""
        response = self.session.get(
            f"{self.api_endpoint}/tags",
            params={'search': tag_name, 'per_page': 10},
            timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()

        wanted = tag_name.lower()
        for tag in response.json():
            if tag['name'].lower() == wanted:
                logging.info(f"✅ Existierender Tag gefunden: '{tag_name}' (ID: {tag['id']})")
                return tag['id']
        return None

    def _create_tag(self, tag_name: str) -> Optional[int]:
        """Create *tag_name* and return its ID; return None when creation fails."""
        create_response = self.session.post(
            f"{self.api_endpoint}/tags",
            json={'name': tag_name},
            timeout=REQUEST_TIMEOUT
        )

        if create_response.status_code == 201:
            new_tag = create_response.json()
            logging.info(f"✅ Neuer Tag erstellt: '{tag_name}' (ID: {new_tag['id']})")
            return new_tag['id']

        if create_response.status_code == 400:
            # A 'term_exists' error carries the ID of the already existing
            # term, e.g. when the search above did not return it.
            try:
                error_data = create_response.json()
            except ValueError:
                error_data = None
            if isinstance(error_data, dict) and error_data.get('code') == 'term_exists':
                extra = error_data.get('data')
                term_id = extra.get('term_id') if isinstance(extra, dict) else None
                if term_id:
                    logging.info(f"✅ Existierender Tag gefunden: '{tag_name}' (ID: {term_id})")
                    return term_id

        logging.warning(f"⚠️ Tag '{tag_name}' konnte nicht erstellt werden: {create_response.status_code}")
        return None

    def _get_or_create_tags(self, tag_names: List[str]) -> List[int]:
        """
        Resolve tag names to tag IDs, creating tags that do not exist yet.

        Args:
            tag_names: Tag names. A single comma-separated string is also
                accepted and split on commas.

        Returns:
            List[int]: IDs of all tags that could be found or created. Names
            that fail are logged and skipped; they do not affect other names.
        """
        if not tag_names:
            return []

        # A plain string would otherwise be iterated character by character
        if isinstance(tag_names, str):
            tag_names = tag_names.split(',')

        try:
            candidates = list(tag_names)
        except TypeError as e:
            logging.error(f"❌ Allgemeiner Fehler bei Tag-Verarbeitung: {e}")
            return []

        tag_ids: List[int] = []
        for raw_name in candidates:
            if not isinstance(raw_name, str):
                logging.warning(f"⚠️ Ungültiger Tag übersprungen: {raw_name!r}")
                continue

            tag_name = raw_name.strip()
            if not tag_name:
                continue

            try:
                tag_id = self._find_tag_id(tag_name)
                if tag_id is None:
                    tag_id = self._create_tag(tag_name)
                if tag_id is not None:
                    tag_ids.append(tag_id)
            except Exception as e:
                # Best effort per tag: one failing tag must not drop the rest
                logging.error(f"❌ Fehler beim Verarbeiten von Tag '{tag_name}': {e}")
                continue

        logging.info(f"🏷️ Tags verarbeitet: {len(tag_ids)} Tag-IDs erstellt")
        return tag_ids

    def _prepare_post_data(self, article: Dict) -> Dict:
        """
        Build the JSON payload for creating a WordPress post.

        Args:
            article: Article dict; the keys read are 'title', 'text',
                'summary', 'tags', 'source', 'link' and 'id'.

        Returns:
            Dict: Payload with status 'pending', the default category, an
            excerpt limited to 300 characters, RSS metadata and - only when
            at least one tag ID was resolved - a 'tags' list.
        """
        # WordPress expects tag IDs, not tag names
        tag_names = article.get('tags', [])
        tag_ids = self._get_or_create_tags(tag_names)

        # 'summary' may be present but None; treat that like a missing summary
        summary = article.get('summary') or ''

        post_data = {
            'title': article.get('title', 'Kein Titel'),
            'content': article.get('text', ''),
            'status': 'pending',  # created for review, not published
            'categories': [self.default_category_id],
            'excerpt': summary[:300],
            'meta': {
                'rss_source': article.get('source', ''),
                'rss_original_link': article.get('link', ''),
                'rss_import_date': datetime.now().isoformat(),
                'rss_article_id': article.get('id', '')
            }
        }

        # Add tags only when there are any
        if tag_ids:
            post_data['tags'] = tag_ids

        logging.info(f"📝 Post-Daten vorbereitet: Titel='{post_data['title']}', Tags={len(tag_ids)}, Kategorie={self.default_category_id}")
        return post_data

    @staticmethod
    def _titles_match(rendered_title: str, title: str) -> bool:
        """
        Compare a ``title.rendered`` value from the API with a raw title.

        The rendered title may contain HTML entities (for example ``&amp;``),
        so it is compared both as-is and with entities decoded.
        """
        import html  # local import: not part of the file's import block

        wanted = title.strip()
        rendered = rendered_title.strip()
        return rendered == wanted or html.unescape(rendered).strip() == wanted

    def _check_duplicate(self, article: Dict) -> Optional[int]:
        """
        Check whether a post with the same title already exists.

        Returns:
            Optional[int]: The ID of the first post whose title matches, or
            None when there is no title, no match, or the lookup failed.
        """
        try:
            title = article.get('title', '')
            if not title:
                return None

            response = self.session.get(
                f"{self.api_endpoint}/posts",
                params={
                    'search': title,
                    'per_page': 5,
                    'status': 'any'  # search posts of every status
                },
                timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()

            for post in response.json():
                if self._titles_match(post['title']['rendered'], title):
                    logging.info(f"🔄 Duplikat gefunden: '{title}' (WordPress ID: {post['id']})")
                    return post['id']

            # Only title-based detection is implemented; the RSS article ID
            # stored in the post meta is not consulted.
            return None

        except Exception as e:
            logging.error(f"❌ Fehler bei Duplikatsprüfung für '{article.get('title', 'Unbekannt')}': {e}")
            return None

    def upload_article(self, article: Dict) -> Tuple[bool, str, Optional[int]]:
        """
        Upload a single article to WordPress.

        Returns:
            Tuple[bool, str, Optional[int]]: (success, message, post ID).
            For a detected duplicate the result is ``(False, message,
            existing post ID)``; for every other failure the ID is None.
        """
        title = article.get('title', 'Unbekannt')

        try:
            logging.info(f"📤 Starte WordPress-Upload: {title}")

            # Duplicate check
            existing_post_id = self._check_duplicate(article)
            if existing_post_id:
                return False, f"Artikel '{title}' existiert bereits in WordPress (ID: {existing_post_id})", existing_post_id

            post_data = self._prepare_post_data(article)

            # Upload with retries
            for attempt in range(MAX_RETRIES):
                retries_left = attempt < MAX_RETRIES - 1

                try:
                    response = self.session.post(
                        f"{self.api_endpoint}/posts",
                        json=post_data,
                        timeout=REQUEST_TIMEOUT
                    )
                except requests.exceptions.Timeout:
                    if retries_left:
                        logging.warning(f"⏱️ Timeout bei WordPress-Upload für '{title}' (Versuch {attempt + 1}), versuche erneut...")
                        continue
                    logging.error(f"❌ Timeout bei WordPress-Upload für '{title}' nach {MAX_RETRIES} Versuchen")
                    return False, f"Timeout nach {MAX_RETRIES} Versuchen", None
                except requests.exceptions.ConnectionError as e:
                    if retries_left:
                        logging.warning(f"🌐 Verbindungsfehler bei WordPress-Upload für '{title}' (Versuch {attempt + 1}): {e}")
                        continue
                    logging.error(f"❌ Verbindungsfehler bei WordPress-Upload für '{title}' nach {MAX_RETRIES} Versuchen: {e}")
                    return False, f"Verbindungsfehler nach {MAX_RETRIES} Versuchen", None

                status_code = response.status_code

                if status_code == 201:
                    # Created
                    wp_post = response.json()
                    wp_post_id = wp_post['id']
                    wp_url = wp_post['link']

                    logging.info(f"✅ WordPress-Upload erfolgreich: '{title}' (ID: {wp_post_id})")
                    logging.info(f"🔗 WordPress-URL: {wp_url}")
                    return True, f"Erfolgreich hochgeladen: {wp_url}", wp_post_id

                if status_code == 400:
                    # Client error - not retried. The body is not guaranteed
                    # to be a JSON object, so parse it defensively.
                    try:
                        error_data = response.json()
                    except ValueError:
                        error_data = {}
                    if not isinstance(error_data, dict):
                        error_data = {}
                    error_msg = error_data.get('message', 'Unbekannter Fehler')
                    error_code = error_data.get('code', 'unknown')

                    if 'parameter' in error_msg.lower() and 'tags' in error_msg.lower():
                        logging.error(f"❌ WordPress-Tag-Fehler für '{title}': {error_msg}")
                        logging.error(f"📋 Post-Daten: {json.dumps(post_data, indent=2, ensure_ascii=False)}")
                        return False, f"Tag-Fehler: {error_msg} (Artikel-Tags: {article.get('tags', [])})", None

                    logging.error(f"❌ WordPress-Fehler 400 für '{title}': {error_msg} (Code: {error_code})")
                    logging.error(f"📋 Post-Daten: {json.dumps(post_data, indent=2, ensure_ascii=False)}")
                    return False, f"WordPress-Fehler: {error_msg}", None

                if status_code == 401:
                    # Authentication error
                    logging.error(f"❌ WordPress-Authentifizierungsfehler für '{title}'")
                    return False, "Authentifizierungsfehler - bitte Zugangsdaten prüfen", None

                if status_code == 403:
                    # Permission error
                    logging.error(f"❌ WordPress-Berechtigungsfehler für '{title}'")
                    return False, "Keine Berechtigung zum Erstellen von Posts", None

                # Every other status is retried until the attempts run out
                if retries_left:
                    logging.warning(f"⚠️ WordPress-Upload Versuch {attempt + 1} fehlgeschlagen für '{title}' (Status: {status_code}), versuche erneut...")
                    continue
                logging.error(f"❌ WordPress-Upload nach {MAX_RETRIES} Versuchen fehlgeschlagen für '{title}' (Status: {status_code})")
                return False, f"Upload fehlgeschlagen nach {MAX_RETRIES} Versuchen (HTTP {status_code})", None

            # Reached only when MAX_RETRIES <= 0; keep the documented tuple shape
            return False, f"Upload fehlgeschlagen nach {MAX_RETRIES} Versuchen", None

        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler bei WordPress-Upload für '{title}': {e}")
            return False, f"Unerwarteter Fehler: {str(e)}", None

    def test_connection(self) -> Tuple[bool, str]:
        """
        Test the connection to the WordPress API by requesting one category.

        Returns:
            Tuple[bool, str]: (True, message) for HTTP 200, otherwise
            (False, message) describing the failure.
        """
        # NOTE(review): the categories endpoint may be readable without
        # credentials, in which case HTTP 200 does not prove that the
        # credentials are valid - confirm against the target site.
        try:
            logging.info("🔧 Teste WordPress-API-Verbindung...")

            response = self.session.get(
                f"{self.api_endpoint}/categories",
                params={'per_page': 1},
                timeout=REQUEST_TIMEOUT
            )

            logging.info(f"📡 API-Response Status: {response.status_code}")

            if response.status_code == 200:
                logging.info("✅ WordPress-API-Verbindung erfolgreich")
                return True, "Verbindung zur WordPress API erfolgreich"

            if response.status_code == 401:
                logging.error("❌ WordPress-API-Authentifizierung fehlgeschlagen")
                logging.error(f"Response Body: {response.text}")
                return False, "Authentifizierung fehlgeschlagen - bitte Base64-String oder Zugangsdaten prüfen"

            if response.status_code == 403:
                logging.error("❌ WordPress-API-Berechtigung fehlgeschlagen")
                logging.error(f"Response Body: {response.text}")
                return False, "Keine Berechtigung - bitte Benutzerrechte prüfen"

            logging.error(f"❌ WordPress-API-Test fehlgeschlagen (Status: {response.status_code})")
            logging.error(f"Response Body: {response.text}")
            return False, f"API-Test fehlgeschlagen (HTTP {response.status_code}): {response.text[:100]}"

        except requests.exceptions.ConnectionError as e:
            logging.error(f"❌ Verbindungsfehler zur WordPress API: {e}")
            return False, f"Verbindungsfehler: {str(e)}"
        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler beim WordPress-API-Test: {e}")
            return False, f"Unerwarteter Fehler: {str(e)}"

    def upload_multiple_articles(self, articles: List[Dict]) -> Dict:
        """
        Upload several articles one after another.

        Returns:
            Dict: Counters 'total', 'successful', 'failed', 'duplicates' and
            a 'details' list with one entry per article ('article_id',
            'title', 'success', 'message', 'wp_post_id').
        """
        import time  # local import: not part of the file's import block

        results = {
            'total': len(articles),
            'successful': 0,
            'failed': 0,
            'duplicates': 0,
            'details': []
        }

        logging.info(f"📦 Starte Batch-Upload von {len(articles)} Artikeln zu WordPress")

        for i, article in enumerate(articles, 1):
            title = article.get('title', f'Artikel {i}')
            logging.info(f"📤 Upload {i}/{len(articles)}: {title}")

            success, message, wp_post_id = self.upload_article(article)

            results['details'].append({
                'article_id': article.get('id'),
                'title': title,
                'success': success,
                'message': message,
                'wp_post_id': wp_post_id
            })

            if success:
                results['successful'] += 1
            elif wp_post_id is not None:
                # upload_article returns a post ID together with a failure
                # only for duplicates; matching on the message text would
                # misclassify server errors that contain the same words.
                results['duplicates'] += 1
            else:
                results['failed'] += 1

            # Short pause between uploads (not after the last one)
            if i < len(articles):
                time.sleep(1)

        logging.info(f"📊 Batch-Upload abgeschlossen: {results['successful']} erfolgreich, {results['failed']} fehlgeschlagen, {results['duplicates']} Duplikate")
        return results

    def __del__(self):
        """Close the HTTP session if the constructor got far enough to create it."""
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()
|
||||
|
||||
|
||||
def upload_articles_to_wordpress(articles: List[Dict]) -> Dict:
    """
    Convenience wrapper: create an uploader, test the connection and upload
    all articles.

    Returns:
        Dict: The statistics returned by ``upload_multiple_articles``. When
        the connection test fails, every article is reported as failed and
        the dict additionally carries the connection message under 'error'.
    """
    uploader = WordPressUploader()

    # Test the connection before uploading anything
    connection_ok, connection_msg = uploader.test_connection()
    if connection_ok:
        return uploader.upload_multiple_articles(articles)

    logging.error(f"❌ WordPress-Verbindung fehlgeschlagen: {connection_msg}")
    article_count = len(articles)
    return {
        'total': article_count,
        'successful': 0,
        'failed': article_count,
        'duplicates': 0,
        'error': connection_msg,
        'details': []
    }
|
||||
|
||||
|
||||
def upload_single_article_to_wordpress(article: Dict) -> Tuple[bool, str, Optional[int]]:
    """
    Convenience wrapper: create an uploader, test the connection and upload
    one article.

    Returns:
        Tuple[bool, str, Optional[int]]: The result of ``upload_article``,
        or ``(False, connection message, None)`` when the connection test
        fails.
    """
    uploader = WordPressUploader()

    # Test the connection before uploading
    connection_ok, connection_msg = uploader.test_connection()
    if connection_ok:
        return uploader.upload_article(article)

    return False, connection_msg, None
|
||||
263
versioning.py
263
versioning.py
|
|
@ -1,23 +1,23 @@
|
|||
# versioning.py
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import typer
|
||||
|
||||
app = typer.Typer()
|
||||
import click
|
||||
|
||||
CHANGELOG_FILE = Path("CHANGELOG.md")
|
||||
VERSION_FILE = Path("__version__.py")
|
||||
VERSION_PATTERN = r"## \[v?(\d+\.\d+\.\d+)\]"
|
||||
|
||||
|
||||
def get_latest_version():
    """
    Return the latest version string without a leading "v".

    The most recent Git tag is tried first. If ``git describe`` fails, or
    the ``git`` executable is not available, the first version heading found
    in CHANGELOG.md is used; "0.0.0" is returned when it has none.
    """
    try:
        # Try the Git tag first
        tag = subprocess.check_output(
            ["git", "describe", "--tags", "--abbrev=0"],
            stderr=subprocess.DEVNULL,
        )
        return tag.decode("utf-8").strip().lstrip("v")
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Fallback to CHANGELOG.md. FileNotFoundError covers a missing git
        # executable; a missing CHANGELOG.md still raises to the caller.
        content = CHANGELOG_FILE.read_text(encoding="utf-8")
        matches = re.findall(VERSION_PATTERN, content)
        return matches[0] if matches else "0.0.0"
|
||||
|
||||
def bump_version(version: str, level: str = "patch") -> str:
|
||||
major, minor, patch = map(int, version.split("."))
|
||||
|
|
@ -27,106 +27,169 @@ def bump_version(version: str, level: str = "patch") -> str:
|
|||
return f"{major}.{minor + 1}.0"
|
||||
return f"{major}.{minor}.{patch + 1}"
|
||||
|
||||
|
||||
def write_version_file(version: str):
    """Write ``VERSION = "<version>"`` to VERSION_FILE and print a confirmation."""
    file_body = f'VERSION = "{version}"\n'
    VERSION_FILE.write_text(file_body, encoding="utf-8")
    typer.echo(f"🔢 __version__.py aktualisiert auf {version}")
|
||||
|
||||
def is_ssh_signing_available() -> bool:
    """Return True when the file ``~/.ssh/id_ed25519`` exists for the current user."""
    private_key = Path("~/.ssh/id_ed25519").expanduser()
    return private_key.exists()
|
||||
|
||||
def prepend_changelog(version: str):
    """
    Insert an empty entry for *version* at the very top of CHANGELOG.md.

    The entry consists of a dated heading followed by three sub-sections,
    each with one empty bullet.
    """
    today = datetime.today().strftime("%Y-%m-%d")

    heading = f"\n\n## [v{version}] – {today}\n\n"
    section_titles = (
        "### 💡 Neue Funktionen",
        "### 🔧 Änderungen & Fixes",
        "### 📦 Internes",
    )
    # Each section gets a single empty "- " bullet to be filled in by hand
    sections = "\n\n".join(f"{section_title}\n- " for section_title in section_titles)

    previous_content = CHANGELOG_FILE.read_text(encoding="utf-8")
    CHANGELOG_FILE.write_text(heading + sections + previous_content, encoding="utf-8")
    typer.echo(f"📝 Neuer Eintrag für v{version} zu CHANGELOG.md hinzugefügt")
|
||||
|
||||
|
||||
def validate_changelog(version: str) -> bool:
    """
    Check that the CHANGELOG.md entry for *version* contains real content.

    The entry is the text between the ``## [version]`` heading and the next
    ``## [`` heading (or the end of the file). The rest of the heading line,
    blank lines, ``#`` sub-headings and bullets without text do not count as
    content; any other line does.

    Returns:
        bool: True when content was found. Otherwise a warning is printed
        and False is returned (also when the entry does not exist).
    """
    content = CHANGELOG_FILE.read_text(encoding="utf-8")
    pattern = rf"## \[v?{re.escape(version)}\](.*?)^## \["
    # The appended "\n## [" lets the pattern also match the last entry
    match = re.search(pattern, content + "\n## [", re.DOTALL | re.MULTILINE)
    if match:
        # Drop the remainder of the heading line (e.g. the date); it is
        # always present and says nothing about the entry being filled in.
        _, _, body = match.group(1).partition("\n")
        for line in body.splitlines():
            text = line.strip()
            if not text or text.startswith("#"):
                continue
            if text.lstrip("-").strip():
                return True
    typer.echo("⚠️ CHANGELOG-Eintrag ist noch leer oder unvollständig.")
    return False
|
||||
|
||||
|
||||
def create_git_tag(version: str):
|
||||
def is_gpg_available() -> bool:
|
||||
try:
|
||||
subprocess.run(["git", "add", "."], check=True)
|
||||
subprocess.run(["git", "commit", "-m", f"🔖 Release v{version}"], check=True)
|
||||
subprocess.run(["git", "tag", f"v{version}"], check=True)
|
||||
typer.echo(f"🏷️ Git-Tag 'v{version}' erstellt und commit durchgeführt.")
|
||||
output = subprocess.check_output(["gpg", "--list-secret-keys"], stderr=subprocess.DEVNULL)
|
||||
return bool(output.strip())
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def tag_exists(tag_name: str) -> bool:
    """
    Check whether a Git tag named exactly *tag_name* exists.

    Returns:
        bool: True when ``git tag -l`` lists the tag. False when it does
        not, when the git command fails, or when git is not available.
    """
    try:
        listing = subprocess.check_output(
            ["git", "tag", "-l", tag_name], stderr=subprocess.DEVNULL
        ).decode()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # This is a pure query: a failing lookup means "not found" and
        # prints nothing.
        return False
    # The argument is handed to git as a pattern, so several tags may be
    # listed; require an exact line match.
    return tag_name in (line.strip() for line in listing.splitlines())
|
||||
|
||||
def configure_signing(use_ssh: bool):
    """
    Write the signing settings to the global Git configuration.

    Args:
        use_ssh: When true, set ``gpg.format`` to ``ssh`` and
            ``user.signingkey`` to ``~/.ssh/id_ed25519.pub``. Otherwise set
            ``gpg.format`` to ``openpgp`` and ``commit.gpgsign`` to ``true``.

    Raises:
        subprocess.CalledProcessError: If a ``git config`` call fails.
    """
    if use_ssh:
        settings = [
            ("gpg.format", "ssh"),
            ("user.signingkey", "~/.ssh/id_ed25519.pub"),
        ]
    else:
        settings = [
            ("gpg.format", "openpgp"),
            ("commit.gpgsign", "true"),
        ]

    # Applied in order; the first failing call aborts the rest
    for key, value in settings:
        subprocess.run(["git", "config", "--global", key, value], check=True)
|
||||
|
||||
def push_to_github():
|
||||
try:
|
||||
@click.command()
|
||||
@click.option("--level", default="patch", help="Version bump level: patch, minor, major")
|
||||
@click.option("--version", "specific_version", help="Set specific version (e.g., 2.1.0) instead of auto-bumping")
|
||||
@click.option("--push", is_flag=True, help="Push to GitHub after creating version")
|
||||
@click.option("--no-sign", is_flag=True, help="Skip signing of commits and tags")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be done without executing")
|
||||
@click.option("--force", is_flag=True, help="Force creation even if tag already exists (overwrites existing tag)")
|
||||
def create(level, specific_version, push, no_sign, dry_run, force):
|
||||
"""
|
||||
Erstellt eine neue Version mit optional signiertem Commit & Tag.
|
||||
Optional: --push, --no-sign, --dry-run, --version, --force
|
||||
"""
|
||||
current = get_latest_version()
|
||||
|
||||
# Validierung und Festlegung der neuen Version
|
||||
if specific_version:
|
||||
# Validiere das Format der vorgegebenen Version
|
||||
version_pattern = r"^\d+\.\d+\.\d+$"
|
||||
if not re.match(version_pattern, specific_version):
|
||||
click.secho("❌ Fehler: Version muss im Format X.Y.Z sein (z.B. 2.1.0)", fg="red")
|
||||
return
|
||||
|
||||
# Prüfe, ob der Tag bereits existiert
|
||||
tag_name = f"v{specific_version}"
|
||||
if tag_exists(tag_name) and not force:
|
||||
click.secho(f"❌ Fehler: Tag {tag_name} existiert bereits. Verwende --force zum Überschreiben.", fg="red")
|
||||
return
|
||||
elif tag_exists(tag_name) and force:
|
||||
click.secho(f"⚠️ Tag {tag_name} existiert bereits - wird überschrieben (--force aktiviert)", fg="yellow")
|
||||
|
||||
# Prüfe, ob die vorgegebene Version höher als die aktuelle ist (nur ohne force)
|
||||
if not force:
|
||||
def version_tuple(v):
|
||||
return tuple(map(int, v.split('.')))
|
||||
|
||||
if version_tuple(specific_version) <= version_tuple(current):
|
||||
click.secho(f"❌ Fehler: Neue Version {specific_version} muss höher sein als aktuelle Version {current}", fg="red")
|
||||
click.secho("💡 Tipp: Verwende --force um diese Prüfung zu überspringen", fg="blue")
|
||||
return
|
||||
|
||||
new_version = specific_version
|
||||
click.secho(f"📌 Verwende vorgegebene Version: {new_version}", fg="blue")
|
||||
else:
|
||||
new_version = bump_version(current, level)
|
||||
|
||||
# Prüfe auch bei Auto-Bump, ob Tag existiert
|
||||
tag_name = f"v{new_version}"
|
||||
if tag_exists(tag_name) and not force:
|
||||
click.secho(f"❌ Fehler: Tag {tag_name} existiert bereits. Verwende --force zum Überschreiben.", fg="red")
|
||||
return
|
||||
elif tag_exists(tag_name) and force:
|
||||
click.secho(f"⚠️ Tag {tag_name} existiert bereits - wird überschrieben (--force aktiviert)", fg="yellow")
|
||||
|
||||
click.secho(f"🔄 Auto-Bump ({level}): {current} → {new_version}", fg="green")
|
||||
|
||||
if dry_run:
|
||||
click.secho("🔍 Dry-Run aktiviert – keine Dateien oder Git-Kommandos werden ausgeführt.\n", fg="yellow")
|
||||
click.echo(f"➡️ Aktuelle Version: {current}")
|
||||
click.echo(f"➡️ Neue Version: {new_version}")
|
||||
click.echo(f"➡️ Commit-Level: {level}")
|
||||
click.echo(f"➡️ Push nach GitHub: {'Ja' if push else 'Nein'}")
|
||||
click.echo(f"➡️ Signieren: {'Nein' if no_sign else 'Automatisch (SSH > GPG)'}")
|
||||
click.echo(f"➡️ Force-Modus: {'Ja' if force else 'Nein'}")
|
||||
|
||||
date = datetime.now().strftime("%Y-%m-%d")
|
||||
click.echo("\n📄 Vorschlag für CHANGELOG-Eintrag:")
|
||||
click.echo(f"\n## [{new_version}] - {date}\n\n- Beschreibung...\n")
|
||||
click.secho("🚫 Dry-Run beendet.\n", fg="yellow")
|
||||
return
|
||||
|
||||
# Update version file
|
||||
write_version_file(new_version)
|
||||
|
||||
# Prepare or check changelog entry
|
||||
date = datetime.now().strftime("%Y-%m-%d")
|
||||
new_entry = f"## [{new_version}] - {date}\n\n- Beschreibung...\n\n"
|
||||
content = CHANGELOG_FILE.read_text(encoding="utf-8")
|
||||
|
||||
if f"## [{new_version}]" in content:
|
||||
click.secho(f"ℹ️ Version {new_version} ist bereits im CHANGELOG.md vorhanden. Kein Eintrag hinzugefügt.", fg="blue")
|
||||
else:
|
||||
CHANGELOG_FILE.write_text(new_entry + content, encoding="utf-8")
|
||||
click.secho(f"📄 CHANGELOG.md wurde vorbereitet für Version {new_version}.", fg="magenta")
|
||||
|
||||
click.echo("")
|
||||
click.secho("✏️ Bitte jetzt den Eintrag in CHANGELOG.md überprüfen oder anpassen.", fg="cyan")
|
||||
input("⏸️ Drücke [Enter], um fortzufahren...")
|
||||
|
||||
subprocess.run(["git", "add", "."], check=True)
|
||||
|
||||
use_signing = False
|
||||
signing_method = "none"
|
||||
|
||||
if not no_sign:
|
||||
if is_ssh_signing_available():
|
||||
configure_signing(use_ssh=True)
|
||||
use_signing = True
|
||||
signing_method = "ssh"
|
||||
elif is_gpg_available():
|
||||
configure_signing(use_ssh=False)
|
||||
use_signing = True
|
||||
signing_method = "gpg"
|
||||
|
||||
commit_cmd = ["git", "commit", "-m", f"Bump version to v{new_version}"]
|
||||
if use_signing:
|
||||
commit_cmd.append("-S")
|
||||
subprocess.run(commit_cmd, check=True)
|
||||
|
||||
# Tag erstellen
|
||||
tag_name = f"v{new_version}"
|
||||
if use_signing:
|
||||
if force and tag_exists(tag_name):
|
||||
subprocess.run(["git", "tag", "-d", tag_name], check=True) # Lokalen Tag löschen
|
||||
subprocess.run(["git", "tag", "-s", tag_name, "-m", f"Release {tag_name}"], check=True)
|
||||
else:
|
||||
if force and tag_exists(tag_name):
|
||||
subprocess.run(["git", "tag", "-d", tag_name], check=True) # Lokalen Tag löschen
|
||||
subprocess.run(["git", "tag", "-a", tag_name, "-m", f"Release {tag_name} (unsigned)"], check=True)
|
||||
|
||||
if push:
|
||||
subprocess.run(["git", "push"], check=True)
|
||||
subprocess.run(["git", "push", "--tags"], check=True)
|
||||
typer.echo("🚀 Änderungen und Tags an GitHub gepusht.")
|
||||
except subprocess.CalledProcessError:
|
||||
typer.echo("⚠️ Fehler beim Pushen zu GitHub. Bitte Zugang oder Netzwerk prüfen.")
|
||||
if force and tag_exists(tag_name):
|
||||
# Force push des Tags, falls er bereits auf Remote existiert
|
||||
subprocess.run(["git", "push", "origin", tag_name, "--force"], check=True)
|
||||
else:
|
||||
subprocess.run(["git", "push", "origin", tag_name], check=True)
|
||||
|
||||
|
||||
@app.command()
def list():
    "Listet alle verfügbaren Versionen aus dem CHANGELOG"
    # The docstring above is kept verbatim because the CLI framework may
    # show it as help text. Prints every version heading in CHANGELOG.md.
    typer.echo("\n📚 Verfügbare Versionen im CHANGELOG:")
    changelog_text = CHANGELOG_FILE.read_text(encoding="utf-8")
    for found in re.findall(VERSION_PATTERN, changelog_text):
        typer.echo(f"- v{found}")
|
||||
|
||||
|
||||
@app.command()
|
||||
def rollback():
|
||||
"Letzte Version zurückrollen (Tag löschen + Commit zurücknehmen)"
|
||||
last_version = get_latest_version()
|
||||
if typer.confirm(f"⚠️ Letzte Version 'v{last_version}' wirklich zurücknehmen?"):
|
||||
try:
|
||||
subprocess.run(["git", "tag", "-d", f"v{last_version}"], check=True)
|
||||
subprocess.run(["git", "reset", "--hard", "HEAD~1"], check=True)
|
||||
typer.echo(f"🔙 Version 'v{last_version}' wurde zurückgerollt.")
|
||||
except subprocess.CalledProcessError:
|
||||
typer.echo("❌ Rollback fehlgeschlagen.")
|
||||
if use_signing:
|
||||
if signing_method == "ssh":
|
||||
click.secho(f"✅ Version {new_version} erstellt und signiert mit SSH 🔐", fg="green")
|
||||
elif signing_method == "gpg":
|
||||
click.secho(f"✅ Version {new_version} erstellt und signiert mit GPG 🔏", fg="cyan")
|
||||
else:
|
||||
typer.echo("⛔ Abgebrochen.")
|
||||
|
||||
|
||||
@app.command()
def create(level: str = typer.Option("patch", help="Versionstyp: patch, minor oder major"),
           push: bool = typer.Option(False, help="Änderungen direkt an GitHub pushen")):
    "Neue Version erstellen inkl. CHANGELOG, Git-Tag und optional Push"
    # The docstring above is kept verbatim because the CLI framework may
    # show it as help text.
    current_version = get_latest_version()
    next_version = bump_version(current_version, level)

    typer.echo(f"💡 Aktuelle Version: {current_version}")
    typer.echo(f"🚀 Neue Version: {next_version}")

    # Guard clause: nothing is written unless the user confirms
    if not typer.confirm("Version übernehmen und eintragen?"):
        typer.echo("❌ Abgebrochen.")
        return

    write_version_file(next_version)
    prepend_changelog(next_version)

    typer.echo("\nBitte CHANGELOG.md bearbeiten und danach fortfahren.")
    typer.prompt("Drücke Enter, sobald du den neuen Abschnitt ausgefüllt hast")

    # Abort with exit code 1 while the new entry is still empty
    if not validate_changelog(next_version):
        typer.echo("❌ Release abgebrochen: Bitte fülle den CHANGELOG-Eintrag aus.")
        raise typer.Exit(code=1)

    create_git_tag(next_version)

    if push:
        push_to_github()

    typer.echo(f"✅ Version {next_version} erfolgreich erstellt.")
|
||||
|
||||
click.secho(f"⚠️ Version {new_version} wurde ohne Signatur erstellt", fg="yellow")
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
create()
|
||||
Loading…
Add table
Add a link
Reference in a new issue