Compare commits
64 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f710141828 | ||
|
|
2456e4aca7 | ||
|
|
1498fa7156 | ||
|
|
cdcf441daf | ||
|
|
2d02b56b65 | ||
|
|
8676ace102 | ||
|
|
cf2d826c8a | ||
|
|
2d1dd14e45 | ||
|
|
09dcf6ce36 | ||
|
|
94bd93a18a | ||
|
|
8fa46312e8 | ||
|
|
764e7bff6a | ||
|
|
426a799371 | ||
|
|
8c6022fead | ||
|
|
1a8d0775c7 | ||
|
|
45c533c674 | ||
|
|
d1cb809852 | ||
|
|
82f2df610d | ||
|
|
8e65485f0c | ||
|
|
0d07a9804d | ||
|
|
aaac5def27 | ||
|
|
1963e32ab4 | ||
|
|
12932bca90 | ||
|
|
013af2ab62 | ||
|
|
a64bf31ff6 | ||
|
|
970f509ad4 | ||
|
|
e9c472b722 | ||
|
|
1020526e76 | ||
|
|
d9ab599466 | ||
|
|
0a9c0b10d6 | ||
|
|
6192f8e527 | ||
| 6332a9a399 | |||
| 93f52f72b9 | |||
| b0f995d5c9 | |||
| da269d08f1 | |||
| 88b2ee1d01 | |||
| 50f737f434 | |||
| 35ccceb260 | |||
| 8d7375c99f | |||
| 24d8e5ad0f | |||
| e68b6a41fd | |||
| ba83b24510 | |||
| fee5e76842 | |||
| 592d699166 | |||
| 1cee56205e | |||
| dcdf4d954a | |||
| 26e3d26b93 | |||
| fb3465fb10 | |||
| 910ca72c81 | |||
| efaf132936 | |||
| 6691db8051 | |||
| 5159a6e3b4 | |||
| c52363f1a7 | |||
| 2c331d683b | |||
| d65c55d315 | |||
| a46d919118 | |||
| 46e0b98928 | |||
| 0bb7d246c1 | |||
| a02f825274 | |||
| 0cfbb6c37f | |||
| 777c770142 | |||
| beac96095e | |||
| ed91864eda | |||
| 759a313f31 |
78 changed files with 20955 additions and 346 deletions
BIN
.dedupe/index.sqlite
Normal file
BIN
.dedupe/index.sqlite
Normal file
Binary file not shown.
1
.dedupe/report.csv
Normal file
1
.dedupe/report.csv
Normal file
|
|
@ -0,0 +1 @@
|
|||
group_id,canonical_path,dup_path,dup_size_bytes
|
||||
|
16
.env.example
Normal file
16
.env.example
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Copy to .env and fill in values
|
||||
|
||||
# WordPress base URL (required)
|
||||
WP_BASE_URL=https://your-site.tld
|
||||
|
||||
# Authentication: prefer WP_AUTH_BASE64 OR use USERNAME+PASSWORD (Application Password)
|
||||
# Example to generate: base64(username:application_password)
|
||||
WP_AUTH_BASE64=
|
||||
|
||||
# Alternatively provide username and application password
|
||||
WP_USERNAME=
|
||||
WP_PASSWORD=
|
||||
|
||||
# OpenAI API key (optional, enables rewrite)
|
||||
OPENAI_API_KEY=
|
||||
|
||||
11
.github/workflows/deploy.yml
vendored
11
.github/workflows/deploy.yml
vendored
|
|
@ -19,9 +19,16 @@ jobs:
|
|||
username: oliver
|
||||
key: ${{ secrets.HETZNER_SSH_KEY }}
|
||||
port: 22
|
||||
envs: APP_ADMIN_USERNAME,APP_ADMIN_PASSWORD
|
||||
script: |
|
||||
cd rss-news
|
||||
cd /opt/rss-news
|
||||
git pull origin main
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
sudo systemctl restart rss-app
|
||||
pip install -r backend/requirements.txt || true
|
||||
sudo systemctl restart rss-news-api
|
||||
sleep 3
|
||||
BASE_URL="https://news.vanityontour.de" APP_ADMIN_USERNAME="${APP_ADMIN_USERNAME}" APP_ADMIN_PASSWORD="${APP_ADMIN_PASSWORD}" bash scripts/smoke_backend.sh
|
||||
env:
|
||||
APP_ADMIN_USERNAME: ${{ secrets.NEWS_APP_ADMIN_USERNAME }}
|
||||
APP_ADMIN_PASSWORD: ${{ secrets.NEWS_APP_ADMIN_PASSWORD }}
|
||||
|
|
|
|||
39
.github/workflows/test.yml
vendored
Normal file
39
.github/workflows/test.yml
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
name: Backend Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
backend-tests:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r backend/requirements.txt
|
||||
pip install -r backend/requirements-test.txt
|
||||
|
||||
- name: Run tests with coverage
|
||||
env:
|
||||
APP_DB_PATH: /tmp/rss_news_test.db
|
||||
run: |
|
||||
pytest backend/tests --cov=backend/app --cov-report=term-missing --cov-report=xml
|
||||
|
||||
- name: Upload coverage artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: coverage-xml
|
||||
path: coverage.xml
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -35,3 +35,5 @@ internal/start.sh
|
|||
internal/copy_files.sh
|
||||
internal/_line.txt
|
||||
internal/push_commit.txt
|
||||
internal/git.sh
|
||||
CLAUDE.md
|
||||
|
|
|
|||
40
AGENTS.md
Normal file
40
AGENTS.md
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Repository Guidelines
|
||||
|
||||
## Project Structure & Module Organization
|
||||
- `app.py`: Streamlit UI (entry point for the app).
|
||||
- `main.py`: RSS fetching, rewrite, and WordPress upload logic.
|
||||
- `utils/`: Helpers (image/article extraction, WP uploader, UI helpers).
|
||||
- `pages/`: Streamlit pages (e.g., `01_feed_manager.py`, `log_viewer.py`).
|
||||
- `data/`: JSON state (`articles.json`, `feeds.json`).
|
||||
- `logs/`: Runtime logs (`rss_tool.log`).
|
||||
- `docs/`: Project notes (e.g., roadmap).
|
||||
- `__version__.py`: Version string written by `versioning.py`.
|
||||
|
||||
## Build, Test, and Development Commands
|
||||
- Create env: `python -m venv .venv && source .venv/bin/activate`
|
||||
- Install deps: `pip install -r requirements.txt`
|
||||
- Run app: `streamlit run app.py`
|
||||
- Version bump: `python versioning.py --level patch --push` (updates `__version__.py`, prepares `CHANGELOG.md`, creates tag; see `--help`).
|
||||
|
||||
## Coding Style & Naming Conventions
|
||||
- Python 3.10+, PEP 8, 4-space indentation, type hints where practical.
|
||||
- Modules and functions: `snake_case`; classes: `PascalCase`.
|
||||
- Streamlit pages: numeric prefix for order, e.g., `pages/01_feature.py`.
|
||||
- Keep functions small and pure in `utils/`; isolate I/O in app layers.
|
||||
- Suggested tools (optional): Black (`black .`) and Ruff (`ruff check .`).
|
||||
|
||||
## Testing Guidelines
|
||||
- Framework: pytest (recommended). Place tests under `tests/` with `test_*.py`.
|
||||
- Unit tests for `utils/*`; light integration checks for `main.py` with temporary files.
|
||||
- Run: `pytest -q`. Add coverage if needed (e.g., `pytest --cov=utils`).
|
||||
- Test data: avoid mutating files in `data/`; use temp dirs or fixtures.
|
||||
|
||||
## Commit & Pull Request Guidelines
|
||||
- Commits: imperative mood, concise; examples: `Add feed dedupe`, `Fix WP upload retry`, `Bump version to v1.7.0`.
|
||||
- PRs: clear description, linked issue, screenshots/GIFs for UI changes, note env variables touched.
|
||||
- Update `CHANGELOG.md` and bump version via `versioning.py` before release PRs.
|
||||
|
||||
## Security & Configuration Tips
|
||||
- Required env: `OPENAI_API_KEY`, `WP_BASE_URL`, `WP_USERNAME`, `WP_PASSWORD` or `WP_AUTH_BASE64` (see `.env.example`; copy it to `.env`).
|
||||
- Never commit secrets; `.env` is git-ignored. Avoid hardcoded credentials; prefer `os.getenv`.
|
||||
- Logs and data may contain content; do not commit `logs/` or large `data/` snapshots.
|
||||
49
CHANGELOG.md
49
CHANGELOG.md
|
|
@ -1,3 +1,52 @@
|
|||
## [1.7.1] - 2025-08-24
|
||||
|
||||
### ✨ Security angepasst
|
||||
- alle Credentials in die .env Datei verschoben
|
||||
- beim Start der App werden die Credentials geprüft und bei fehlenden Werten entsprechende Meldungen ausgegeben
|
||||
|
||||
---
|
||||
|
||||
## [1.7.0] - 2025-08-24
|
||||
|
||||
### Multi-Select & Massenoperationen:
|
||||
- ✅ Checkboxes für Artikel-Auswahl im "Artikel verwalten" Bereich
|
||||
- ✅ "Alle auswählen" / "Auswahl aufheben" Buttons
|
||||
- ✅ Massenoperationen für ausgewählte Artikel:
|
||||
- Bulk Status-Änderung für mehrere Artikel gleichzeitig
|
||||
- Bulk Artikel-Umschreibung mit automatischer Status-Verwaltung
|
||||
- Bulk WordPress-Upload nur für "Process"-Artikel
|
||||
- Bulk Papierkorb-Funktion
|
||||
|
||||
### Schnellaktionen Integration:
|
||||
- ✅ Feed-Aktualisierung direkt im Artikel-Tab verfügbar
|
||||
- ✅ Alle Dashboard-Schnellaktionen in Artikel-Verwaltung integriert
|
||||
- ✅ Intelligente Anzeige nur relevanter Operationen (z.B. WordPress-Upload nur bei Process-Artikeln)
|
||||
|
||||
### 🔧 Verbesserungen
|
||||
|
||||
- UI/UX: Verbesserte Artikel-Card-Layouts mit Checkbox-Integration
|
||||
- Workflow: Streamlined Artikel-Management ohne Tab-Wechsel nötig
|
||||
- Feedback: Detaillierte Statusmeldungen bei Massenoperationen
|
||||
- Performance: Optimierte Session-State-Verwaltung für Artikel-Auswahl
|
||||
|
||||
### 🏗️ Technische Änderungen
|
||||
|
||||
- Session State Erweiterung um selected_articles Set
|
||||
- Neue Bulk-Operation-Funktionen in app.py:326-467
|
||||
- Überarbeitetes Artikel-Card-Layout mit 3-Spalten-Design
|
||||
- Integration bestehender WordPress-Upload und Rewrite-Funktionen
|
||||
|
||||
---
|
||||
|
||||
## [1.6.3] - 2025-08-18
|
||||
|
||||
### 🔧 Verbesserungen
|
||||
- **StyleSheet erneut hinzugefügt**
|
||||
- Style wurde bei einem Release leider vergessen
|
||||
- Style auf DarkMode angepasst
|
||||
|
||||
---
|
||||
|
||||
## [v1.6.2] - 2025-08-16
|
||||
|
||||
### 🐛 Kritische Fehlerbehebung
|
||||
|
|
|
|||
109
README.md
109
README.md
|
|
@ -1,76 +1,63 @@
|
|||
# 📰 RSS News Bot
|
||||
# rss-news (Rebuild)
|
||||
|
||||
Ein intelligentes Tool zum Einlesen, Umschreiben und Veröffentlichen von Artikeln aus RSS-Feeds – mit automatischer Tag-Erkennung, KI-unterstütztem Rewrite via GPT-4, Bildextraktion aus Originalartikeln und optionaler DALL·E-Bildgenerierung.
|
||||
`rss-news` wird als bestehendes Repository weitergefuehrt und schrittweise zu einer robusten, rechtssicheren News-Pipeline neu aufgebaut.
|
||||
|
||||

|
||||

|
||||

|
||||

|
||||
Aktueller Stand:
|
||||
- Alte Streamlit-App wird nicht produktiv genutzt.
|
||||
- `news.vanityontour.de` wird bis zum Go-Live der neuen App auf `https://vanityontour.de` umgeleitet.
|
||||
- Planung, Doku und Wiki werden als Grundlage fuer den Neuaufbau gepflegt.
|
||||
|
||||
---
|
||||
## Ziele
|
||||
- RSS-gestuetzte Artikelverarbeitung mit klaren Quellregeln
|
||||
- Rechtssichere Nutzung (Quellen, Attribution, Lizenzinformationen)
|
||||
- Zuverlaessige Automatisierung auf Hetzner
|
||||
- Publikation nach WordPress (IONOS aktuell, spaeter offen)
|
||||
- Zugriff nur nach Login (zunaechst User/Password)
|
||||
|
||||
## 🚀 Features
|
||||
## Architektur-Richtung (MVP)
|
||||
- Backend: `Python + FastAPI`
|
||||
- Jobs: Queue-Worker (z. B. Redis + RQ/Celery)
|
||||
- Daten: SQLite fuer MVP, spaeter optional PostgreSQL
|
||||
- Auth: Session-Login mit einem Admin-User
|
||||
- Publishing: WordPress REST API (Status zunaechst `pending`)
|
||||
|
||||
- 📡 **RSS-Feeds verwalten** (hinzufügen, aktualisieren)
|
||||
- ✍️ **Artikel automatisch umschreiben** mit GPT-4
|
||||
- 🏷️ **Tags automatisch generieren**
|
||||
- 🖼️ **Bilder aus Originalartikeln extrahieren**
|
||||
- 🪄 **Optionales DALL·E-Bild generieren**
|
||||
- 🔧 **Bearbeiten von Bildmetadaten**
|
||||
- 🗂️ **Statusverwaltung der Artikel (New, Rewrite, Process, etc.)**
|
||||
- 📜 **Log-Viewer-Seite integriert**
|
||||
- 📥 **Export zur Veröffentlichung auf WordPress vorbereitet**
|
||||
- 📋 Artikeltabelle mit Status-Filter
|
||||
- 🔍 Artikel-Expander mit Rewrite, Tags & Bildern
|
||||
- 🪄 Button für KI-Bildgenerierung
|
||||
Details: `docs/PROJECT_PLAN.md`
|
||||
|
||||
## Projektsteuerung
|
||||
- GitHub Project: `https://github.com/users/OliverGiertz/projects/3/views/1`
|
||||
- Dieses Board ist die zentrale Steuerung fuer ToDos, Bugs, Verbesserungen.
|
||||
- Wiki-Struktur liegt unter `docs/wiki/`.
|
||||
|
||||
---
|
||||
## Dokumentation
|
||||
- Projektplan: `docs/PROJECT_PLAN.md`
|
||||
- ToDo-Liste: `docs/TODO.md`
|
||||
- Quell- und Lizenzpolicy: `docs/SOURCE_POLICY.md`
|
||||
- Wiki Home: `docs/wiki/Home.md`
|
||||
|
||||
## 🧱 Projektstruktur
|
||||
|
||||
rss-news/
|
||||
├── app.py # Haupt-UI mit Streamlit
|
||||
├── main.py # Logik für Feed-Import und Verarbeitung
|
||||
├── utils/
|
||||
│ └── image_extractor.py # Bilder aus Originalartikeln extrahieren
|
||||
│ └── dalle_generator.py # DALL·E-Integration (KI-Bild)
|
||||
├── pages/
|
||||
│ └── log_viewer.py # UI zur Anzeige der Logs
|
||||
├── data/
|
||||
│ └── articles.json # Gespeicherte Artikel
|
||||
│ └── feeds.json # Gespeicherte Feed-URLs
|
||||
├── logs/
|
||||
│ └── rss_tool.log # Logging der Verarbeitung
|
||||
├── versioning.py # CLI-Tool zur Versionierung & Release
|
||||
├── TEST-CHECKLIST.md # Manuelle Prüfliste für Releases
|
||||
├── version.py # Aktuelle Version
|
||||
└── CHANGELOG.md # Änderungsprotokoll
|
||||
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ Installation
|
||||
## Lokale Entwicklung (Legacy-Code)
|
||||
Der vorhandene Legacy-Stand kann weiterhin lokal gestartet werden:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/OliverGiertz/rss-news.git
|
||||
cd rss-news
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Update
|
||||
Ein Update Script findest du hier: https://gist.github.com/OliverGiertz/ad33ae3de9aa1c1163dad5fe8affb6ca
|
||||
|
||||
```bash
|
||||
bash update.sh
|
||||
```
|
||||
|
||||
|
||||
## ▶️ Starten der App
|
||||
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
Hinweis: Diese App ist funktional historisch und wird durch die neue Architektur ersetzt.
|
||||
|
||||
## Deployment-Zielbild
|
||||
- Betrieb auf Hetzner
|
||||
- Reverse Proxy via CloudPanel/Nginx
|
||||
- Produktive Domain: `news.vanityontour.de`
|
||||
- Bis zur Fertigstellung: Redirect auf `https://vanityontour.de`
|
||||
|
||||
## Sicherheit
|
||||
- Keine Secrets im Repository
|
||||
- `.env` lokal/auf Server, nie committen
|
||||
- Auth-Pflicht fuer die neue WebApp
|
||||
- spaeter optional: Passkeys/WebAuthn
|
||||
|
||||
## Rechtlicher Hinweis
|
||||
Dieses Projekt verarbeitet nur Quellen mit dokumentierter Nutzungsgrundlage. Vor produktiver Nutzung ist eine finale rechtliche Pruefung der ausgewaehlten Feeds notwendig.
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
VERSION = "1.6.2"
|
||||
VERSION = "1.7.1"
|
||||
|
|
|
|||
516
app.py
516
app.py
|
|
@ -13,6 +13,8 @@ from main import (
|
|||
)
|
||||
from utils.dalle_generator import generate_dalle_image
|
||||
from utils.wordpress_uploader import WordPressUploader
|
||||
from utils.css_loader import load_css, apply_dark_theme
|
||||
from utils.config import validate_env
|
||||
import os
|
||||
from collections import Counter
|
||||
import time
|
||||
|
|
@ -24,103 +26,22 @@ st.set_page_config(
|
|||
initial_sidebar_state="collapsed"
|
||||
)
|
||||
|
||||
# === Custom CSS für modernes Design ===
|
||||
st.markdown("""
|
||||
<style>
|
||||
/* Hauptcontainer */
|
||||
.main-header {
|
||||
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
|
||||
padding: 2rem;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 2rem;
|
||||
color: white;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Artikel Cards */
|
||||
.article-card {
|
||||
background: white;
|
||||
border-radius: 12px;
|
||||
padding: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
||||
border-left: 4px solid #667eea;
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
|
||||
.article-card:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
/* Status Badges */
|
||||
.status-badge {
|
||||
padding: 0.3rem 0.8rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.8rem;
|
||||
font-weight: bold;
|
||||
margin-right: 0.5rem;
|
||||
}
|
||||
|
||||
.status-new { background-color: #e3f2fd; color: #1976d2; }
|
||||
.status-rewrite { background-color: #fff3e0; color: #f57c00; }
|
||||
.status-process { background-color: #f3e5f5; color: #7b1fa2; }
|
||||
.status-online { background-color: #e8f5e8; color: #388e3c; }
|
||||
.status-hold { background-color: #fce4ec; color: #c2185b; }
|
||||
.status-trash { background-color: #ffebee; color: #d32f2f; }
|
||||
.status-wp-pending { background-color: #e1f5fe; color: #0277bd; }
|
||||
|
||||
/* Filter Section */
|
||||
.filter-section {
|
||||
background: #f8f9fa;
|
||||
padding: 1.5rem;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
/* Stats Cards */
|
||||
.stats-card {
|
||||
background: white;
|
||||
padding: 1.5rem;
|
||||
border-radius: 10px;
|
||||
text-align: center;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2rem;
|
||||
font-weight: bold;
|
||||
color: #667eea;
|
||||
}
|
||||
|
||||
/* Action Buttons */
|
||||
.action-button {
|
||||
margin: 0.25rem;
|
||||
}
|
||||
|
||||
/* Image Gallery */
|
||||
.image-gallery {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
overflow-x: auto;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.image-item {
|
||||
min-width: 200px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* WordPress Upload Status */
|
||||
.wp-status {
|
||||
background: #e3f2fd;
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
margin: 1rem 0;
|
||||
border-left: 4px solid #2196f3;
|
||||
}
|
||||
</style>
|
||||
""", unsafe_allow_html=True)
|
||||
# === CSS & Theme laden ===
|
||||
load_css()
|
||||
apply_dark_theme()
|
||||
|
||||
# === Environment-Validierung (.env) ===
|
||||
env_check = validate_env()
|
||||
if not env_check.get("ok"):
|
||||
st.error("🔒 Sicherheits-/Konfigurationshinweis: Bitte .env korrekt konfigurieren.")
|
||||
for msg in env_check.get("errors", []):
|
||||
st.markdown(f"- ❌ {msg}")
|
||||
for msg in env_check.get("warnings", []):
|
||||
st.markdown(f"- ⚠️ {msg}")
|
||||
elif env_check.get("warnings"):
|
||||
st.info("ℹ️ Hinweise zur Konfiguration:")
|
||||
for msg in env_check.get("warnings", []):
|
||||
st.markdown(f"- ⚠️ {msg}")
|
||||
|
||||
# === Initialize Session State ===
|
||||
if 'selected_articles' not in st.session_state:
|
||||
|
|
@ -340,9 +261,8 @@ with tab1:
|
|||
<div class="article-card">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<div>
|
||||
<strong>{article.get('title', 'Kein Titel')}</strong>
|
||||
<br>
|
||||
<small>{format_date(article.get('date', ''))}</small>
|
||||
<h3 class="article-title">{article.get('title', 'Kein Titel')}</h3>
|
||||
<div class="article-meta">{format_date(article.get('date', ''))}</div>
|
||||
</div>
|
||||
<div>
|
||||
{get_status_badge(article.get('status', 'New'))}
|
||||
|
|
@ -417,8 +337,148 @@ with tab2:
|
|||
or any(query in tag.lower() for tag in a.get("tags", []))
|
||||
]
|
||||
|
||||
# Ergebnisse anzeigen
|
||||
st.write(f"**{len(filtered_articles)} Artikel gefunden**")
|
||||
# Ergebnisse und Massenoperationen
|
||||
col1, col2 = st.columns([2, 1])
|
||||
|
||||
with col1:
|
||||
st.write(f"**{len(filtered_articles)} Artikel gefunden**")
|
||||
|
||||
with col2:
|
||||
# Select All / None Buttons
|
||||
if filtered_articles:
|
||||
col_select_1, col_select_2 = st.columns(2)
|
||||
with col_select_1:
|
||||
if st.button("✓ Alle auswählen", key="select_all"):
|
||||
for article in filtered_articles:
|
||||
st.session_state.selected_articles.add(article['id'])
|
||||
st.rerun()
|
||||
|
||||
with col_select_2:
|
||||
if st.button("✗ Auswahl aufheben", key="select_none"):
|
||||
st.session_state.selected_articles.clear()
|
||||
st.rerun()
|
||||
|
||||
# Bulk Operations Section
|
||||
selected_count = len(st.session_state.selected_articles)
|
||||
if selected_count > 0:
|
||||
st.markdown(f"""
|
||||
<div class="filter-section" style="margin-top: 10px;">
|
||||
<h4>⚡ Massenoperationen ({selected_count} Artikel ausgewählt)</h4>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# Quick Actions für ausgewählte Artikel
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
|
||||
with col1:
|
||||
if st.button("🔄 Feeds aktualisieren", use_container_width=True, key="bulk_update_feeds"):
|
||||
with st.spinner("Feeds werden aktualisiert..."):
|
||||
existing_ids = [a["id"] for a in all_articles]
|
||||
process_articles(existing_ids)
|
||||
show_notification("Feeds erfolgreich aktualisiert!")
|
||||
time.sleep(1)
|
||||
st.rerun()
|
||||
|
||||
with col2:
|
||||
# Bulk Status Change
|
||||
bulk_status = st.selectbox(
|
||||
"Status ändern",
|
||||
["--Auswählen--"] + ["New", "Rewrite", "Process", "Online", "On Hold", "Trash", "WordPress Pending"],
|
||||
key="bulk_status"
|
||||
)
|
||||
|
||||
if bulk_status != "--Auswählen--" and st.button("Status anwenden", key="apply_bulk_status"):
|
||||
changed_count = 0
|
||||
for article in all_articles:
|
||||
if article["id"] in st.session_state.selected_articles:
|
||||
article["status"] = bulk_status
|
||||
changed_count += 1
|
||||
|
||||
if changed_count > 0:
|
||||
save_articles(all_articles)
|
||||
show_notification(f"{changed_count} Artikel auf '{bulk_status}' gesetzt!")
|
||||
st.session_state.selected_articles.clear()
|
||||
st.rerun()
|
||||
|
||||
with col3:
|
||||
# Bulk Rewrite
|
||||
rewrite_selected_count = len([a for a in all_articles if a["id"] in st.session_state.selected_articles and a.get("status") != "Rewrite"])
|
||||
if st.button(f"✍️ Artikel umschreiben ({rewrite_selected_count})", use_container_width=True, key="bulk_rewrite"):
|
||||
# Ausgewählte Artikel auf "Rewrite" setzen
|
||||
for article in all_articles:
|
||||
if article["id"] in st.session_state.selected_articles:
|
||||
article["status"] = "Rewrite"
|
||||
|
||||
save_articles(all_articles)
|
||||
|
||||
# Umschreiben starten
|
||||
with st.spinner(f"{rewrite_selected_count} Artikel werden umgeschrieben..."):
|
||||
rewrite_articles()
|
||||
show_notification(f"{rewrite_selected_count} Artikel erfolgreich umgeschrieben!")
|
||||
st.session_state.selected_articles.clear()
|
||||
time.sleep(1)
|
||||
st.rerun()
|
||||
|
||||
with col4:
|
||||
# Bulk WordPress Upload
|
||||
wp_ready_selected = len([a for a in all_articles if a["id"] in st.session_state.selected_articles and a.get("status") == "Process"])
|
||||
if wp_ready_selected > 0:
|
||||
if st.button(f"📤 WordPress Upload ({wp_ready_selected})", use_container_width=True, key="bulk_wp_upload"):
|
||||
with st.spinner(f"{wp_ready_selected} Artikel werden zu WordPress hochgeladen..."):
|
||||
# Nur die ausgewählten "Process" Artikel hochladen
|
||||
selected_process_articles = [a for a in all_articles if a["id"] in st.session_state.selected_articles and a.get("status") == "Process"]
|
||||
|
||||
if selected_process_articles:
|
||||
from utils.wordpress_uploader import upload_articles_to_wordpress
|
||||
upload_results = upload_articles_to_wordpress(selected_process_articles)
|
||||
|
||||
if upload_results.get('error'):
|
||||
show_notification(f"Fehler beim WordPress-Upload: {upload_results['error']}", "error")
|
||||
else:
|
||||
successful = upload_results.get('successful', 0)
|
||||
failed = upload_results.get('failed', 0)
|
||||
duplicates = upload_results.get('duplicates', 0)
|
||||
|
||||
# Status der erfolgreich hochgeladenen Artikel ändern
|
||||
if successful > 0:
|
||||
for detail in upload_results.get('details', []):
|
||||
if detail.get('success'):
|
||||
article_id = detail.get('article_id')
|
||||
for article in all_articles:
|
||||
if article.get('id') == article_id:
|
||||
article['status'] = "WordPress Pending"
|
||||
article['wp_upload_date'] = datetime.now().isoformat()
|
||||
article['wp_post_id'] = detail.get('wp_post_id')
|
||||
break
|
||||
save_articles(all_articles)
|
||||
|
||||
if successful > 0:
|
||||
show_notification(f"✅ {successful} Artikel erfolgreich zu WordPress hochgeladen!")
|
||||
if failed > 0:
|
||||
show_notification(f"⚠️ {failed} Artikel konnten nicht hochgeladen werden.", "warning")
|
||||
if duplicates > 0:
|
||||
show_notification(f"ℹ️ {duplicates} Duplikate übersprungen.", "info")
|
||||
|
||||
st.session_state.selected_articles.clear()
|
||||
time.sleep(2)
|
||||
st.rerun()
|
||||
else:
|
||||
st.markdown("*Keine Process-Artikel ausgewählt*")
|
||||
|
||||
with col5:
|
||||
# Bulk Delete/Trash
|
||||
if st.button("🗑️ In Papierkorb", use_container_width=True, key="bulk_trash"):
|
||||
trash_count = 0
|
||||
for article in all_articles:
|
||||
if article["id"] in st.session_state.selected_articles:
|
||||
article["status"] = "Trash"
|
||||
trash_count += 1
|
||||
|
||||
if trash_count > 0:
|
||||
save_articles(all_articles)
|
||||
show_notification(f"{trash_count} Artikel in Papierkorb verschoben!")
|
||||
st.session_state.selected_articles.clear()
|
||||
st.rerun()
|
||||
|
||||
# Artikel Cards
|
||||
for article in filtered_articles:
|
||||
|
|
@ -430,40 +490,48 @@ with tab2:
|
|||
# Article Card
|
||||
st.markdown('<div class="article-card">', unsafe_allow_html=True)
|
||||
|
||||
# Header
|
||||
col1, col2 = st.columns([3, 1])
|
||||
# Header with Checkbox
|
||||
col_check, col_content, col_status = st.columns([0.3, 2.7, 1])
|
||||
|
||||
with col1:
|
||||
with col_check:
|
||||
# Checkbox für Artikel-Auswahl
|
||||
is_selected = article["id"] in st.session_state.selected_articles
|
||||
if st.checkbox("", value=is_selected, key=f"check_{article['id']}"):
|
||||
st.session_state.selected_articles.add(article['id'])
|
||||
else:
|
||||
st.session_state.selected_articles.discard(article['id'])
|
||||
|
||||
with col_content:
|
||||
title = article.get("title", "Kein Titel")
|
||||
if has_incomplete_images:
|
||||
title += " ⚠️"
|
||||
st.markdown(f"**{title}**")
|
||||
st.markdown(f"📅 {format_date(article.get('date', ''))}")
|
||||
st.markdown(f'<h3 class="article-title">{title}</h3>', unsafe_allow_html=True)
|
||||
st.markdown(f'<div class="article-meta">📅 {format_date(article.get("date", ""))}</div>', unsafe_allow_html=True)
|
||||
|
||||
# WordPress-Info anzeigen falls vorhanden
|
||||
if article.get("wp_post_id"):
|
||||
st.markdown(f"🔗 WordPress ID: {article.get('wp_post_id')} | Upload: {format_date(article.get('wp_upload_date', ''))}")
|
||||
st.markdown(f'<div class="article-meta">🔗 WordPress ID: {article.get("wp_post_id")} | Upload: {format_date(article.get("wp_upload_date", ""))}</div>', unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
with col_status:
|
||||
st.markdown(get_status_badge(article.get("status", "New")), unsafe_allow_html=True)
|
||||
|
||||
# Content Preview
|
||||
summary = article.get("summary", "")[:200]
|
||||
if len(summary) == 200:
|
||||
summary += "..."
|
||||
st.markdown(summary)
|
||||
st.markdown(f'<div class="article-summary">{summary}</div>', unsafe_allow_html=True)
|
||||
|
||||
# Meta Info
|
||||
col1, col2, col3 = st.columns(3)
|
||||
with col1:
|
||||
st.markdown(f"📝 **{get_word_count(article.get('text', ''))} Wörter**")
|
||||
st.markdown(f'<div class="article-footer">📝 **{get_word_count(article.get("text", ""))} Wörter**</div>', unsafe_allow_html=True)
|
||||
with col2:
|
||||
tags = article.get("tags", [])
|
||||
if tags:
|
||||
st.markdown(f"🏷️ {', '.join(tags[:3])}{'...' if len(tags) > 3 else ''}")
|
||||
st.markdown(f'<div class="article-footer">🏷️ {", ".join(tags[:3])}{"..." if len(tags) > 3 else ""}</div>', unsafe_allow_html=True)
|
||||
with col3:
|
||||
source_name = source_to_name.get(article.get("source", ""), "Unbekannt")
|
||||
st.markdown(f"📡 {source_name}")
|
||||
st.markdown(f'<div class="article-footer">📡 {source_name}</div>', unsafe_allow_html=True)
|
||||
|
||||
# Actions
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
|
|
@ -642,11 +710,9 @@ with tab3:
|
|||
<div class="article-card">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<div>
|
||||
<strong>{feed_name}</strong>
|
||||
<br>
|
||||
<small>{feed_url}</small>
|
||||
<br>
|
||||
<span style="color: #667eea;">📰 {article_count} Artikel</span>
|
||||
<h3 class="article-title">{feed_name}</h3>
|
||||
<div class="article-meta">{feed_url}</div>
|
||||
<div class="article-footer">📰 {article_count} Artikel</div>
|
||||
</div>
|
||||
<div>
|
||||
<span class="status-badge status-online">{article_count} Artikel</span>
|
||||
|
|
@ -712,13 +778,15 @@ with tab4:
|
|||
cols = st.columns(3)
|
||||
for idx, img in enumerate(all_images):
|
||||
with cols[idx % 3]:
|
||||
st.markdown('<div class="image-item">', unsafe_allow_html=True)
|
||||
st.image(img["url"], use_column_width=True)
|
||||
st.markdown(f"**{img.get('caption', 'Kein Titel')}**")
|
||||
st.markdown(f"📰 {img['article_title']}")
|
||||
st.markdown(f"©️ {img.get('copyright', 'Unbekannt')}")
|
||||
st.markdown(f'<strong class="text-primary">{img.get("caption", "Kein Titel")}</strong>', unsafe_allow_html=True)
|
||||
st.markdown(f'<div class="text-secondary">📰 {img["article_title"]}</div>', unsafe_allow_html=True)
|
||||
st.markdown(f'<div class="text-muted">©️ {img.get("copyright", "Unbekannt")}</div>', unsafe_allow_html=True)
|
||||
|
||||
if img.get("copyright_url") and img["copyright_url"] != "#":
|
||||
st.markdown(f"[🔗 Quelle]({img['copyright_url']})")
|
||||
st.markdown(f'<a href="{img["copyright_url"]}" target="_blank" class="text-accent">🔗 Quelle</a>', unsafe_allow_html=True)
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
else:
|
||||
st.info("Keine Bilder gefunden.")
|
||||
|
||||
|
|
@ -735,13 +803,13 @@ with tab5:
|
|||
st.subheader("📈 Status Verteilung")
|
||||
for status, count in status_counts.items():
|
||||
percentage = (count / len(all_articles) * 100) if all_articles else 0
|
||||
st.markdown(f"{get_status_badge(status)} {count} ({percentage:.1f}%)", unsafe_allow_html=True)
|
||||
st.markdown(f"{get_status_badge(status)} <span class='text-primary'>{count} ({percentage:.1f}%)</span>", unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
st.subheader("📡 Artikel pro Feed")
|
||||
feed_counts = Counter([source_to_name.get(a.get("source", ""), "Unbekannt") for a in all_articles])
|
||||
for feed_name, count in feed_counts.most_common():
|
||||
st.markdown(f"**{feed_name}:** {count} Artikel")
|
||||
st.markdown(f'<div class="text-primary"><strong>{feed_name}:</strong> {count} Artikel</div>', unsafe_allow_html=True)
|
||||
|
||||
# WordPress-Statistiken
|
||||
st.subheader("🔗 WordPress-Statistiken")
|
||||
|
|
@ -751,15 +819,30 @@ with tab5:
|
|||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("WordPress Artikel", len(wp_articles))
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>WordPress Artikel</div>
|
||||
</div>
|
||||
""".format(len(wp_articles)), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
pending_count = len([a for a in wp_articles if a.get("status") == "WordPress Pending"])
|
||||
st.metric("Ausstehend", pending_count)
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Ausstehend</div>
|
||||
</div>
|
||||
""".format(pending_count), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
online_wp_count = len([a for a in wp_articles if a.get("status") == "Online"])
|
||||
st.metric("Online", online_wp_count)
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Online</div>
|
||||
</div>
|
||||
""".format(online_wp_count), unsafe_allow_html=True)
|
||||
|
||||
# Neueste WordPress-Uploads
|
||||
recent_wp = sorted([a for a in wp_articles if a.get("wp_upload_date")],
|
||||
|
|
@ -769,9 +852,11 @@ with tab5:
|
|||
st.subheader("🕒 Neueste WordPress-Uploads")
|
||||
for article in recent_wp:
|
||||
st.markdown(f"""
|
||||
**{article.get('title', 'Kein Titel')}** {get_status_badge(article.get('status', 'Unknown'))}
|
||||
|
||||
WP ID: {article.get('wp_post_id')} | Upload: {format_date(article.get('wp_upload_date', ''))}
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">{article.get('title', 'Kein Titel')}</h3>
|
||||
{get_status_badge(article.get('status', 'Unknown'))}
|
||||
<div class="article-meta">WP ID: {article.get('wp_post_id')} | Upload: {format_date(article.get('wp_upload_date', ''))}</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
else:
|
||||
st.info("Noch keine Artikel zu WordPress hochgeladen.")
|
||||
|
|
@ -784,13 +869,28 @@ with tab5:
|
|||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("Durchschnittliche Wortanzahl", f"{sum(word_counts) // len(word_counts)}")
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Durchschnittliche Wortanzahl</div>
|
||||
</div>
|
||||
""".format(sum(word_counts) // len(word_counts)), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
st.metric("Längster Artikel", f"{max(word_counts)} Wörter")
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Längster Artikel (Wörter)</div>
|
||||
</div>
|
||||
""".format(max(word_counts)), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
st.metric("Kürzester Artikel", f"{min(word_counts)} Wörter")
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Kürzester Artikel (Wörter)</div>
|
||||
</div>
|
||||
""".format(min(word_counts)), unsafe_allow_html=True)
|
||||
|
||||
# Tag Cloud Simulation
|
||||
st.subheader("🏷️ Häufigste Tags")
|
||||
|
|
@ -801,7 +901,7 @@ with tab5:
|
|||
if all_tags:
|
||||
tag_counts = Counter(all_tags)
|
||||
for tag, count in tag_counts.most_common(10):
|
||||
st.markdown(f"**{tag}:** {count}x verwendet")
|
||||
st.markdown(f'<div class="text-primary"><strong>{tag}:</strong> {count}x verwendet</div>', unsafe_allow_html=True)
|
||||
else:
|
||||
st.info("Keine Tags gefunden.")
|
||||
|
||||
|
|
@ -830,28 +930,19 @@ with tab6:
|
|||
wp_user = os.getenv("WP_USERNAME", "Nicht konfiguriert")
|
||||
wp_base64 = os.getenv("WP_AUTH_BASE64", "")
|
||||
|
||||
st.info(f"""
|
||||
**WordPress-Konfiguration:**
|
||||
- URL: {wp_url}
|
||||
- Benutzer: {wp_user}
|
||||
- Passwort: {'✅ Konfiguriert' if os.getenv("WP_PASSWORD") else '❌ Nicht konfiguriert'}
|
||||
- Base64 Auth: {'✅ Konfiguriert' if wp_base64 else '❌ Nicht konfiguriert'}
|
||||
""")
|
||||
st.markdown(f"""
|
||||
<div class="wp-status">
|
||||
<strong>WordPress-Konfiguration:</strong><br>
|
||||
<div class="text-secondary">
|
||||
URL: {wp_url}<br>
|
||||
Benutzer: {wp_user}<br>
|
||||
Passwort: {'✅ Konfiguriert' if os.getenv("WP_PASSWORD") else '❌ Nicht konfiguriert'}<br>
|
||||
Base64 Auth: {'✅ Konfiguriert' if wp_base64 else '❌ Nicht konfiguriert'}
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# WordPress Auth Debug (nur für Entwicklung)
|
||||
if st.checkbox("🔧 Debug-Modus (Auth-Details anzeigen)", value=False):
|
||||
st.warning("⚠️ Nur für Entwicklung - zeigt Auth-Details!")
|
||||
|
||||
wp_base64 = os.getenv("WP_AUTH_BASE64", "")
|
||||
if wp_base64:
|
||||
try:
|
||||
import base64
|
||||
decoded = base64.b64decode(wp_base64).decode('utf-8')
|
||||
st.code(f"Base64: {wp_base64}\nDecoded: {decoded}")
|
||||
except Exception as e:
|
||||
st.error(f"Fehler beim Dekodieren: {e}")
|
||||
else:
|
||||
st.info("Kein Base64-String konfiguriert")
|
||||
# Sicherheit: Kein Anzeigen sensibler Auth-Details mehr
|
||||
|
||||
# Bulk Upload
|
||||
st.subheader("📦 Massenupload")
|
||||
|
|
@ -863,10 +954,10 @@ with tab6:
|
|||
|
||||
# Artikel-Vorschau
|
||||
for article in process_articles_list[:5]: # Nur die ersten 5 anzeigen
|
||||
st.markdown(f"• **{article.get('title', 'Kein Titel')}** ({get_word_count(article.get('text', ''))} Wörter)")
|
||||
st.markdown(f'<div class="text-primary">• <strong>{article.get("title", "Kein Titel")}</strong> ({get_word_count(article.get("text", ""))} Wörter)</div>', unsafe_allow_html=True)
|
||||
|
||||
if len(process_articles_list) > 5:
|
||||
st.markdown(f"... und {len(process_articles_list) - 5} weitere")
|
||||
st.markdown(f'<div class="text-muted">... und {len(process_articles_list) - 5} weitere</div>', unsafe_allow_html=True)
|
||||
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
|
|
@ -884,24 +975,46 @@ with tab6:
|
|||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("Erfolgreich", upload_results.get('successful', 0))
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Erfolgreich</div>
|
||||
</div>
|
||||
""".format(upload_results.get('successful', 0)), unsafe_allow_html=True)
|
||||
with col2:
|
||||
st.metric("Fehlgeschlagen", upload_results.get('failed', 0))
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Fehlgeschlagen</div>
|
||||
</div>
|
||||
""".format(upload_results.get('failed', 0)), unsafe_allow_html=True)
|
||||
with col3:
|
||||
st.metric("Duplikate", upload_results.get('duplicates', 0))
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Duplikate</div>
|
||||
</div>
|
||||
""".format(upload_results.get('duplicates', 0)), unsafe_allow_html=True)
|
||||
|
||||
# Details anzeigen
|
||||
if upload_results.get('details'):
|
||||
st.subheader("📋 Upload-Details")
|
||||
for detail in upload_results['details']:
|
||||
status_icon = "✅" if detail['success'] else "❌"
|
||||
st.markdown(f"{status_icon} **{detail['title']}**: {detail['message']}")
|
||||
st.markdown(f'<div class="text-primary">{status_icon} <strong>{detail["title"]}:</strong> {detail["message"]}</div>', unsafe_allow_html=True)
|
||||
|
||||
time.sleep(2)
|
||||
st.rerun()
|
||||
|
||||
with col2:
|
||||
st.info("💡 Artikel erhalten den Status 'WordPress Pending' nach erfolgreichem Upload.")
|
||||
st.markdown("""
|
||||
<div class="wp-status">
|
||||
<strong>💡 Info:</strong><br>
|
||||
<div class="text-secondary">
|
||||
Artikel erhalten den Status 'WordPress Pending' nach erfolgreichem Upload.
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
else:
|
||||
st.info("Keine Artikel mit Status 'Process' gefunden. Artikel müssen zuerst umgeschrieben werden.")
|
||||
|
|
@ -931,8 +1044,7 @@ with tab6:
|
|||
<div class="wp-status">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<div>
|
||||
<strong>{article.get('title', 'Kein Titel')}</strong>
|
||||
<br>
|
||||
<strong>{article.get('title', 'Kein Titel')}</strong><br>
|
||||
<small>WP ID: {article.get('wp_post_id')} | Upload: {format_date(article.get('wp_upload_date', ''))}</small>
|
||||
</div>
|
||||
<div>
|
||||
|
|
@ -951,53 +1063,47 @@ with tab6:
|
|||
with st.expander("📋 .env-Datei Vorlage", expanded=False):
|
||||
st.code("""
|
||||
# WordPress-Konfiguration
|
||||
WP_BASE_URL=https://vanityontour.de
|
||||
WP_USERNAME=ogiertz
|
||||
WP_PASSWORD=whNEx9aZCIUXViV89Z3e7Z03
|
||||
WP_BASE_URL=https://your-site.tld
|
||||
|
||||
# WordPress Base64-Authentifizierung (bevorzugte Methode)
|
||||
WP_AUTH_BASE64=b2dpZXJ0ejp3aE5FeDlhWkNJVVhWaVY4OVozZTdaMDM=
|
||||
# Entweder Base64 (empfohlen) ODER Benutzername/Passwort (Application Password)
|
||||
WP_AUTH_BASE64=
|
||||
# Oder alternativ:
|
||||
WP_USERNAME=
|
||||
WP_PASSWORD=
|
||||
|
||||
# OpenAI-Konfiguration (für Artikel-Umschreibung)
|
||||
OPENAI_API_KEY=sk-...
|
||||
# OpenAI-Konfiguration (optional für Umschreibung)
|
||||
OPENAI_API_KEY=
|
||||
""", language="bash")
|
||||
|
||||
with st.expander("🔑 Base64-Authentifizierung verstehen", expanded=False):
|
||||
st.markdown("""
|
||||
**WordPress REST API Authentifizierung:**
|
||||
|
||||
Die WordPress REST API erfordert eine Base64-kodierte Authentifizierung im Format:
|
||||
```
|
||||
Authorization: Basic <base64_encoded_credentials>
|
||||
```
|
||||
|
||||
**Ihr bereitgestellter Base64-String:**
|
||||
- `b2dpZXJ0ejp3aE5FeDlhWkNJVVhWaVY4OVozZTdaMDM=`
|
||||
- Dekodiert: `ogiertz:whNEx9aZCIUXViV89Z3e7Z03`
|
||||
|
||||
**So funktioniert es:**
|
||||
1. Benutzername und Anwendungspasswort werden kombiniert: `username:password`
|
||||
2. Dieser String wird Base64-kodiert
|
||||
3. Im Authorization-Header verwendet: `Basic <base64_string>`
|
||||
|
||||
**Fallback-Verhalten:**
|
||||
- Wenn `WP_AUTH_BASE64` gesetzt ist → Direkter Base64-String verwendet
|
||||
- Wenn nicht gesetzt → Base64 wird aus `WP_USERNAME:WP_PASSWORD` generiert
|
||||
""")
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">WordPress REST API Authentifizierung:</h3>
|
||||
<div class="article-summary">
|
||||
Die WordPress REST API nutzt <code>Basic</code>-Auth mit Base64-kodierten Zugangsdaten:<br>
|
||||
<code>Authorization: Basic <base64(username:password)></code><br><br>
|
||||
Empfehlung: In der .env <code>WP_AUTH_BASE64</code> setzen (aus <code>username:application_password</code> erzeugt).<br>
|
||||
Alternativ können <code>WP_USERNAME</code> und <code>WP_PASSWORD</code> gesetzt werden; dann wird Base64 zur Laufzeit generiert.
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
with st.expander("📖 WordPress-API Berechtigungen", expanded=False):
|
||||
st.markdown("""
|
||||
**Erforderliche Berechtigungen für den WordPress-Benutzer:**
|
||||
|
||||
- `edit_posts` - Beiträge erstellen und bearbeiten
|
||||
- `publish_posts` - Beiträge veröffentlichen (für Status-Änderungen)
|
||||
- `upload_files` - Dateien hochladen (für spätere Bild-Uploads)
|
||||
- `edit_categories` - Kategorien verwalten
|
||||
- `edit_tags` - Tags verwalten
|
||||
|
||||
**Anwendungspasswort erstellen:**
|
||||
1. WordPress Admin → Benutzer → Profil
|
||||
2. Unter "Anwendungspasswörter" neues Passwort erstellen
|
||||
3. Name: "RSS Feed Manager"
|
||||
4. Generiertes Passwort in .env-Datei eintragen
|
||||
""")
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">Erforderliche Berechtigungen für den WordPress-Benutzer:</h3>
|
||||
<div class="article-summary">
|
||||
• <code>edit_posts</code> - Beiträge erstellen und bearbeiten<br>
|
||||
• <code>publish_posts</code> - Beiträge veröffentlichen (für Status-Änderungen)<br>
|
||||
• <code>upload_files</code> - Dateien hochladen (für spätere Bild-Uploads)<br>
|
||||
• <code>edit_categories</code> - Kategorien verwalten<br>
|
||||
• <code>edit_tags</code> - Tags verwalten
|
||||
<br><br>
|
||||
<strong>Anwendungspasswort erstellen:</strong><br>
|
||||
1. WordPress Admin → Benutzer → Profil<br>
|
||||
2. Unter "Anwendungspasswörter" neues Passwort erstellen<br>
|
||||
3. Name: "RSS Feed Manager"<br>
|
||||
4. Generiertes Passwort in .env-Datei eintragen
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
|
|
|||
45
backend/.env.example
Normal file
45
backend/.env.example
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
# ─── App ────────────────────────────────────────────────────────────────────
|
||||
APP_ENV=development
|
||||
APP_NAME=rss-news-backend
|
||||
APP_SECRET_KEY=replace-with-a-long-random-secret
|
||||
APP_DB_PATH=backend/data/rss_news.db
|
||||
|
||||
APP_ADMIN_USERNAME=admin
|
||||
APP_ADMIN_PASSWORD=change-me
|
||||
|
||||
SESSION_COOKIE_NAME=rss_news_session
|
||||
SESSION_MAX_AGE_SECONDS=28800
|
||||
|
||||
# ─── WordPress ──────────────────────────────────────────────────────────────
|
||||
WP_BASE_URL=https://your-site.tld
|
||||
WP_USERNAME=your-wp-username
|
||||
WP_PASSWORD=your-wp-app-password
|
||||
# Status für neue Beiträge: draft | future | publish
|
||||
WORDPRESS_DEFAULT_STATUS=draft
|
||||
|
||||
# ─── OpenAI ─────────────────────────────────────────────────────────────────
|
||||
OPENAI_API_KEY=sk-...
|
||||
# gpt-4o-mini empfohlen (Kosten/Qualität)
|
||||
OPENAI_MODEL=gpt-4o-mini
|
||||
|
||||
# ─── Telegram Bot ────────────────────────────────────────────────────────────
|
||||
# Bot-Token von @BotFather
|
||||
TELEGRAM_BOT_TOKEN=123456789:ABC...
|
||||
# Chat-ID deines persönlichen Chats oder einer Gruppe
|
||||
TELEGRAM_CHAT_ID=123456789
|
||||
# Zufälliger Secret-Token zur Webhook-Absicherung (mindestens 20 Zeichen)
|
||||
TELEGRAM_WEBHOOK_SECRET=replace-with-random-secret-min-20-chars
|
||||
|
||||
# ─── N8N API-Key ─────────────────────────────────────────────────────────────
|
||||
# Wird von N8N im Header X-API-Key mitgeschickt
|
||||
N8N_API_KEY=replace-with-strong-random-key
|
||||
|
||||
# ─── Pipeline-Einstellungen ──────────────────────────────────────────────────
|
||||
# Relevanz-Score >= dieser Wert: automatisch verarbeiten (0-100)
|
||||
PIPELINE_RELEVANCE_AUTO=80
|
||||
# Relevanz-Score >= dieser Wert, aber < AUTO: Telegram-Warnung senden
|
||||
PIPELINE_RELEVANCE_WARN=60
|
||||
# Maximale Drafts/Veröffentlichungen pro Tag
|
||||
PIPELINE_MAX_DRAFTS_PER_DAY=2
|
||||
# Bevorzugte Veröffentlichungszeiten (Stunden, kommagetrennt, CET)
|
||||
PIPELINE_PUBLISH_HOURS=9,14
|
||||
82
backend/README.md
Normal file
82
backend/README.md
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
# Backend Skeleton (FastAPI)
|
||||
|
||||
Dieses Verzeichnis enthaelt das technische Grundgeruest fuer den Rebuild von `rss-news`.
|
||||
|
||||
## Start (lokal)
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r backend/requirements.txt
|
||||
uvicorn backend.app.main:app --reload --port 8501
|
||||
```
|
||||
|
||||
## Admin UI
|
||||
- Login: `http://127.0.0.1:8501/admin/login`
|
||||
- Dashboard: `http://127.0.0.1:8501/admin/dashboard`
|
||||
|
||||
## Environment
|
||||
- Datei: `backend/.env`
|
||||
- Vorlage: `backend/.env.example`
|
||||
|
||||
## Endpoints
|
||||
- `GET /health` - Healthcheck
|
||||
- `POST /auth/login` - Login mit Admin-User
|
||||
- `POST /auth/logout` - Logout
|
||||
- `GET /auth/me` - Aktiver User
|
||||
- `GET /api/protected` - Geschuetzter Test-Endpoint
|
||||
- `GET /api/pipeline/status` - Basisstatus inkl. Datensatzzaehler
|
||||
- `GET /api/sources` - Quellenliste
|
||||
- `POST /api/sources` - Quelle anlegen
|
||||
- `GET /api/sources/{source_id}/policy-check` - Policy-Pruefung fuer Quelle
|
||||
- `GET /api/feeds` - Feedliste
|
||||
- `POST /api/feeds` - Feed anlegen
|
||||
- `GET /api/feeds/{feed_id}/policy-check` - Policy-Pruefung fuer Feed
|
||||
- `GET /api/runs` - Import-/Job-Runs anzeigen
|
||||
- `GET /api/runs/{run_id}` - Detailansicht eines Runs
|
||||
- `POST /api/runs` - Run starten
|
||||
- `POST /api/runs/{run_id}/finish` - Run abschliessen
|
||||
- `GET /api/articles` - Artikel anzeigen
|
||||
- `GET /api/articles/{article_id}` - Artikeldetail
|
||||
- `POST /api/articles/upsert` - Artikel idempotent anlegen/aktualisieren
|
||||
- `POST /api/articles/{article_id}/transition` - Statuswechsel nach Workflow-Regeln
|
||||
- `POST /api/articles/{article_id}/review` - Review-Entscheidung (approve/reject)
|
||||
- `POST /api/ingestion/run` - Feed-Ingestion starten (optional pro Feed)
|
||||
|
||||
## Datenbank
|
||||
- SQLite-Datei unter `backend/data/rss_news.db`
|
||||
- Tabellen werden beim App-Start initialisiert.
|
||||
- Tabellen: `sources`, `feeds`, `runs`, `articles`, `publish_jobs`
|
||||
- Dedupe-Strategie Artikel: `source_url` -> `(feed_id, source_article_id)` -> `source_hash`
|
||||
|
||||
## Policy-Enforcement
|
||||
- Ingestion blockiert Feeds automatisch, wenn die zugeordnete Quelle nicht policy-konform ist.
|
||||
- Mindestanforderungen: `risk_level=green`, `terms_url`, `license_name`, `last_reviewed_at`, `is_enabled=1`.
|
||||
- Pro importiertem Artikel wird ein `attribution`-Block in `meta_json` gespeichert.
|
||||
|
||||
## Review-Workflow
|
||||
- Statuskette: `new -> review -> approved -> published`
|
||||
- Ablehnung im Review setzt auf `rewrite`
|
||||
- Ungueltige Statuswechsel werden per API blockiert
|
||||
|
||||
## Verifikation
|
||||
```bash
|
||||
python -m unittest backend.tests.test_db_repositories
|
||||
python -m unittest backend.tests.test_ingestion
|
||||
python -m unittest backend.tests.test_api_auth
|
||||
```
|
||||
|
||||
## CI / Online-Auswertung
|
||||
- GitHub Actions Workflow: `.github/workflows/test.yml`
|
||||
- Fuehrt Tests inkl. Coverage auf Push/PR gegen `main` aus.
|
||||
|
||||
## Hetzner Smoketest
|
||||
```bash
|
||||
BASE_URL="https://news.vanityontour.de" \
|
||||
APP_ADMIN_USERNAME="admin" \
|
||||
APP_ADMIN_PASSWORD="..." \
|
||||
bash scripts/smoke_backend.sh
|
||||
```
|
||||
|
||||
## Hinweis
|
||||
Passwort-Hashing und CSRF/Rate-Limit sind als naechste Ausbaustufe vorgesehen.
|
||||
1
backend/__init__.py
Normal file
1
backend/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Backend package for rss-news rebuild."""
|
||||
1
backend/app/__init__.py
Normal file
1
backend/app/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Application package."""
|
||||
1126
backend/app/admin_ui.py
Normal file
1126
backend/app/admin_ui.py
Normal file
File diff suppressed because it is too large
Load diff
31
backend/app/auth.py
Normal file
31
backend/app/auth.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import hmac
|
||||
from typing import Optional
|
||||
|
||||
from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _serializer() -> URLSafeTimedSerializer:
    """Build the signer used for session tokens.

    The serializer is keyed with the configured ``app_secret_key`` and uses
    the fixed salt ``"rss-news-session"``.
    """
    secret_key = get_settings().app_secret_key
    return URLSafeTimedSerializer(secret_key, salt="rss-news-session")
|
||||
|
||||
|
||||
def verify_credentials(username: str, password: str) -> bool:
    """Check a username/password pair against the configured admin account.

    Args:
        username: Username submitted by the client.
        password: Password submitted by the client.

    Returns:
        True only if both values equal ``app_admin_username`` and
        ``app_admin_password`` from the settings.
    """
    settings = get_settings()
    # hmac.compare_digest() accepts str only when it is pure ASCII and raises
    # TypeError otherwise. Comparing UTF-8 bytes keeps the constant-time
    # comparison while letting non-ASCII input (e.g. umlauts) be rejected or
    # accepted normally instead of crashing the login request.
    user_ok = hmac.compare_digest(
        username.encode("utf-8"),
        settings.app_admin_username.encode("utf-8"),
    )
    pw_ok = hmac.compare_digest(
        password.encode("utf-8"),
        settings.app_admin_password.encode("utf-8"),
    )
    # Both comparisons are always evaluated before combining the results.
    return user_ok and pw_ok
|
||||
|
||||
|
||||
def create_session_token(username: str) -> str:
    """Return a signed session token whose payload carries ``username``."""
    signer = _serializer()
    payload = {"username": username}
    return signer.dumps(payload)
|
||||
|
||||
|
||||
def verify_session_token(token: str) -> Optional[str]:
    """Validate a session token and return the username stored in it.

    Returns:
        The ``"username"`` entry of the token payload, or ``None`` when the
        signature is invalid or the token is older than the configured
        ``session_max_age_seconds``.
    """
    max_age = get_settings().session_max_age_seconds
    try:
        payload = _serializer().loads(token, max_age=max_age)
    except (BadSignature, SignatureExpired):
        return None
    else:
        return payload.get("username")
|
||||
65
backend/app/config.py
Normal file
65
backend/app/config.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import AliasChoices, Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration read from environment variables and dotenv files.

    Field names map to environment variables of the same name; the WordPress
    fields additionally accept the legacy ``WP_*`` names via ``AliasChoices``.
    Unknown variables are ignored (``extra="ignore"``).
    """

    # Prefer backend-specific env file to avoid collisions with legacy root .env.
    # pydantic-settings gives LATER entries of ``env_file`` priority over
    # earlier ones, so the preferred file must be listed last.
    model_config = SettingsConfigDict(
        env_file=(".env", "backend/.env"),
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_env: str = "development"
    app_name: str = "rss-news-backend"
    # NOTE(review): placeholder default; must be overridden outside development.
    app_secret_key: str = "replace-with-a-long-random-secret"

    app_admin_username: str = "admin"
    # NOTE(review): placeholder default; must be overridden outside development.
    app_admin_password: str = "change-me"

    session_cookie_name: str = "rss_news_session"
    session_max_age_seconds: int = 28800

    app_db_path: str = "backend/data/rss_news.db"

    wordpress_base_url: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_BASE_URL", "WP_BASE_URL"))
    wordpress_username: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_USERNAME", "WP_USERNAME"))
    wordpress_app_password: str | None = Field(default=None, validation_alias=AliasChoices("WORDPRESS_APP_PASSWORD", "WP_PASSWORD"))
    wordpress_default_status: str = "draft"
    openai_api_key: str | None = Field(default=None, validation_alias=AliasChoices("OPENAI_API_KEY"))
    openai_model: str = "gpt-4o-mini"

    # Telegram Bot
    telegram_bot_token: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_BOT_TOKEN"))
    telegram_chat_id: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_CHAT_ID"))
    telegram_webhook_secret: str | None = Field(default=None, validation_alias=AliasChoices("TELEGRAM_WEBHOOK_SECRET"))

    # N8N API authentication
    n8n_api_key: str | None = Field(default=None, validation_alias=AliasChoices("N8N_API_KEY"))

    # Pipeline behaviour
    pipeline_relevance_auto: int = 80  # >= this: auto-process
    pipeline_relevance_warn: int = 60  # >= this: Telegram warning, else reject
    pipeline_max_drafts_per_day: int = 2
    pipeline_publish_hours: str = "9,14"  # comma-separated preferred publish hours (CET)
    pipeline_min_words_raw: int = 120  # minimum words in raw content before rewrite (else reject)
    pipeline_min_words_rewritten: int = 150  # minimum words in rewritten content (else reject)
    pipeline_max_article_age_days: int = 7  # skip articles older than N days during ingestion (0 = no limit)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Load dotenv files into the process environment and build ``Settings``.

    The result is cached, so the files are read once per process. Files are
    loaded in the order listed with ``override=False``: a variable that is
    already set (by the real environment or an earlier file) is kept.
    """
    # Prefer shared legacy env from the original rss-news workspace if present.
    # NOTE(review): the first candidate is an absolute path on one developer
    # machine; it is skipped wherever that file does not exist.
    dotenv_candidates = (
        Path("/Users/oliver/Documents/rss-news/.env"),
        Path("backend/.env"),
        Path(".env"),
    )
    for dotenv_file in filter(Path.exists, dotenv_candidates):
        load_dotenv(dotenv_file, override=False)
    return Settings()
|
||||
293
backend/app/db.py
Normal file
293
backend/app/db.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _db_path() -> Path:
    """Return the configured SQLite file path, creating its parent directory."""
    db_file = Path(get_settings().app_db_path)
    # Make sure the directory exists so sqlite3 can create the file on first use.
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
|
||||
|
||||
|
||||
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a SQLite connection that is committed on success and always closed.

    Rows are returned as ``sqlite3.Row`` and foreign-key enforcement is
    switched on for the connection. If the ``with`` body raises, the commit is
    skipped and the connection is closed.
    """
    connection = sqlite3.connect(_db_path())
    connection.row_factory = sqlite3.Row
    connection.execute("PRAGMA foreign_keys=ON;")
    try:
        yield connection
        # Only reached when the caller's block finished without an exception.
        connection.commit()
    finally:
        connection.close()
|
||||
|
||||
|
||||
def _ensure_article_indexes(conn: sqlite3.Connection) -> None:
    """Create the indexes and ``updated_at`` trigger on ``articles`` (idempotent)."""
    conn.executescript(
        """
        CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
        CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
            ON articles(feed_id, source_article_id)
            WHERE source_article_id IS NOT NULL;
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
            ON articles(source_hash)
            WHERE source_hash IS NOT NULL;
        CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
        CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);

        CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
        AFTER UPDATE ON articles
        FOR EACH ROW
        BEGIN
            UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
        """
    )


def init_db() -> None:
    """Create the schema if missing and migrate older databases in place.

    Steps, all on one connection from ``get_conn``:

    1. Create the tables, the non-article indexes and the ``updated_at``
       triggers for ``sources`` and ``feeds`` (all ``IF NOT EXISTS``).
    2. Add any ``articles`` columns an older database lacks.
    3. Rebuild ``articles`` when its status CHECK constraint does not yet
       allow ``'no_image'``.
    4. Create the ``articles`` indexes and trigger.

    The article indexes are created last on purpose: they reference columns
    (``source_hash``) that step 2 may still have to add, and the rebuild in
    step 3 drops them together with the old table.
    """
    with get_conn() as conn:
        conn.executescript(
            """
            PRAGMA journal_mode=WAL;

            CREATE TABLE IF NOT EXISTS sources (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                base_url TEXT,
                terms_url TEXT,
                license_name TEXT,
                risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
                is_enabled INTEGER NOT NULL DEFAULT 0,
                notes TEXT,
                last_reviewed_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now'))
            );

            CREATE TABLE IF NOT EXISTS feeds (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_id INTEGER,
                name TEXT NOT NULL,
                url TEXT NOT NULL UNIQUE,
                is_enabled INTEGER NOT NULL DEFAULT 1,
                etag TEXT,
                last_modified TEXT,
                last_checked_at TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
            );

            CREATE TABLE IF NOT EXISTS runs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                run_type TEXT NOT NULL,
                status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
                started_at TEXT NOT NULL DEFAULT (datetime('now')),
                finished_at TEXT,
                details TEXT
            );

            CREATE TABLE IF NOT EXISTS publish_jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                article_id INTEGER NOT NULL,
                status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
                attempts INTEGER NOT NULL DEFAULT 0,
                max_attempts INTEGER NOT NULL DEFAULT 3,
                error_message TEXT,
                wp_post_id INTEGER,
                wp_post_url TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                started_at TEXT,
                finished_at TEXT,
                FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
            );

            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                feed_id INTEGER,
                source_article_id TEXT,
                source_hash TEXT,
                title TEXT NOT NULL,
                source_url TEXT NOT NULL,
                canonical_url TEXT,
                published_at TEXT,
                author TEXT,
                summary TEXT,
                content_raw TEXT,
                content_rewritten TEXT,
                image_urls_json TEXT,
                press_contact TEXT,
                source_name_snapshot TEXT,
                source_terms_url_snapshot TEXT,
                source_license_name_snapshot TEXT,
                legal_checked INTEGER NOT NULL DEFAULT 0,
                legal_checked_at TEXT,
                legal_note TEXT,
                wp_post_id INTEGER,
                wp_post_url TEXT,
                publish_attempts INTEGER NOT NULL DEFAULT 0,
                publish_last_error TEXT,
                published_to_wp_at TEXT,
                word_count INTEGER DEFAULT 0,
                status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
                meta_json TEXT,
                created_at TEXT NOT NULL DEFAULT (datetime('now')),
                updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
                UNIQUE(source_url)
            );

            CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
            CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
            CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);

            CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
            AFTER UPDATE ON sources
            FOR EACH ROW
            BEGIN
                UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
            END;

            CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
            AFTER UPDATE ON feeds
            FOR EACH ROW
            BEGIN
                UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
            END;
            """
        )

        # Lightweight migration: add columns that databases created by an
        # older schema version do not have yet.
        existing_columns = {
            row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
        }
        migration_columns = {
            "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER",
            "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT",
            "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
            "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
            "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
            "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
            "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
            "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
            "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
            "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
            "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
            "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER",
            "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT",
            "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0",
            "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT",
            "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT",
            # The table rebuild below selects word_count, so it must exist too.
            "word_count": "ALTER TABLE articles ADD COLUMN word_count INTEGER DEFAULT 0",
        }
        for column, ddl in migration_columns.items():
            if column not in existing_columns:
                conn.execute(ddl)

        # Migration: add 'no_image' to the status CHECK constraint if not present.
        # SQLite cannot modify CHECK constraints in-place, so we recreate the table.
        table_sql_row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
        ).fetchone()
        if table_sql_row and "'no_image'" not in (table_sql_row["sql"] or ""):
            # foreign_keys cannot be toggled inside a transaction, so the
            # PRAGMAs stay outside BEGIN/COMMIT. The copy itself runs in one
            # transaction: if any statement fails, the connection is closed
            # without commit and the original table is left untouched.
            conn.executescript(
                """
                PRAGMA foreign_keys=OFF;

                BEGIN;

                DROP TABLE IF EXISTS articles_v2;

                CREATE TABLE articles_v2 (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    feed_id INTEGER,
                    source_article_id TEXT,
                    source_hash TEXT,
                    title TEXT NOT NULL,
                    source_url TEXT NOT NULL,
                    canonical_url TEXT,
                    published_at TEXT,
                    author TEXT,
                    summary TEXT,
                    content_raw TEXT,
                    content_rewritten TEXT,
                    image_urls_json TEXT,
                    press_contact TEXT,
                    source_name_snapshot TEXT,
                    source_terms_url_snapshot TEXT,
                    source_license_name_snapshot TEXT,
                    legal_checked INTEGER NOT NULL DEFAULT 0,
                    legal_checked_at TEXT,
                    legal_note TEXT,
                    wp_post_id INTEGER,
                    wp_post_url TEXT,
                    publish_attempts INTEGER NOT NULL DEFAULT 0,
                    publish_last_error TEXT,
                    published_to_wp_at TEXT,
                    word_count INTEGER DEFAULT 0,
                    status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
                    meta_json TEXT,
                    relevance_score INTEGER,
                    scheduled_publish_at TEXT,
                    created_at TEXT NOT NULL DEFAULT (datetime('now')),
                    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
                    FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
                    UNIQUE(source_url)
                );

                INSERT INTO articles_v2 SELECT
                    id, feed_id, source_article_id, source_hash, title, source_url,
                    canonical_url, published_at, author, summary, content_raw,
                    content_rewritten, image_urls_json, press_contact,
                    source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
                    legal_checked, legal_checked_at, legal_note,
                    wp_post_id, wp_post_url, publish_attempts, publish_last_error,
                    published_to_wp_at, word_count, status, meta_json,
                    relevance_score, scheduled_publish_at, created_at, updated_at
                FROM articles;

                DROP TABLE articles;
                ALTER TABLE articles_v2 RENAME TO articles;

                COMMIT;

                PRAGMA foreign_keys=ON;
                """
            )

        # Runs last so that every indexed column exists and so that indexes
        # dropped together with the old table during a rebuild are restored.
        _ensure_article_indexes(conn)
|
||||
|
||||
|
||||
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert each row in *rows* into a plain ``dict``, preserving order."""
    return list(map(dict, rows))
|
||||
486
backend/app/ingestion.py
Normal file
486
backend/app/ingestion.py
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
from urllib.parse import unquote, urlencode, urlparse, parse_qs
|
||||
import urllib.error
|
||||
import urllib.request as _urllib_req
|
||||
|
||||
import feedparser
|
||||
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
RunCreate,
|
||||
create_run,
|
||||
find_existing_article_for_upsert,
|
||||
finish_run,
|
||||
get_feed_by_id,
|
||||
list_enabled_feeds,
|
||||
update_feed_fetch_state,
|
||||
upsert_article,
|
||||
)
|
||||
from .source_extraction import extract_article, extracted_article_to_meta
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IngestionStats:
    """Immutable summary returned by ``run_ingestion``."""

    # Identifier returned by ``create_run`` for this ingestion run.
    run_id: int
    # Number of feeds the run iterated over.
    feeds_processed: int
    # Number of feed entries encountered, including ones skipped later.
    entries_seen: int
    # Number of ``upsert_article`` calls that returned a truthy article id.
    articles_upserted: int
    # "success" or "failed", as set by ``run_ingestion``.
    status: str
    # Human-readable outcome; the exception text when the run failed.
    message: str
|
||||
|
||||
|
||||
# Upper bound on feedparser.parse attempts per feed within one ingestion run.
MAX_FEED_FETCH_RETRIES = 3
|
||||
|
||||
|
||||
def _normalize_article_url(url: str) -> str:
|
||||
"""Strip AMP and tracking query parameters from article URLs.
|
||||
|
||||
Removes ?outputType=valid_amp and other AMP/tracking params so that
|
||||
AMP and non-AMP versions of the same article are deduplicated.
|
||||
"""
|
||||
_AMP_PARAMS = {"outputtype", "amp", "outputformat"}
|
||||
try:
|
||||
from urllib.parse import parse_qs, urlencode
|
||||
parsed = urlparse(url)
|
||||
if not parsed.query:
|
||||
return url
|
||||
params = parse_qs(parsed.query, keep_blank_values=True)
|
||||
filtered = {k: v for k, v in params.items() if k.lower() not in _AMP_PARAMS}
|
||||
new_query = urlencode(filtered, doseq=True)
|
||||
return parsed._replace(query=new_query).geturl()
|
||||
except Exception:
|
||||
return url
|
||||
|
||||
|
||||
def _resolve_google_redirect(url: str) -> str:
|
||||
"""Extract the real article URL from Google redirect URLs.
|
||||
|
||||
Google Alerts feed entries use tracking links like:
|
||||
https://www.google.com/url?rct=j&sa=t&url=<encoded_real_url>&ct=ga&...
|
||||
|
||||
This function returns the decoded real URL if detected, otherwise the
|
||||
original URL unchanged.
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
host = (parsed.hostname or "").lower()
|
||||
if host not in ("www.google.com", "google.com"):
|
||||
return url
|
||||
if parsed.path not in ("/url", "/url/"):
|
||||
return url
|
||||
params = parse_qs(parsed.query, keep_blank_values=False)
|
||||
real_urls = params.get("url")
|
||||
if real_urls:
|
||||
return unquote(real_urls[0])
|
||||
except Exception:
|
||||
pass
|
||||
return url
|
||||
|
||||
|
||||
def _entry_published_iso(entry: dict) -> str | None:
|
||||
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
if not published:
|
||||
return None
|
||||
return datetime(*published[:6], tzinfo=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _entry_text(entry: dict) -> tuple[str, str]:
|
||||
summary = entry.get("summary", "") or ""
|
||||
content = ""
|
||||
if entry.get("content") and isinstance(entry.get("content"), list):
|
||||
first = entry["content"][0]
|
||||
content = first.get("value", "") if isinstance(first, dict) else ""
|
||||
if not content:
|
||||
content = summary
|
||||
return summary, content
|
||||
|
||||
|
||||
def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
    """Return a SHA-256 hex digest identifying a feed entry.

    The fingerprint joins, with ``|``: the feed id, the entry's ``id``/``guid``
    (empty when absent), the link, the stripped title, the stripped summary
    and the entry's publish timestamp (empty when absent).
    """
    parts = (
        str(feed_id),
        str(entry.get("id") or entry.get("guid") or ""),
        str(link),
        title.strip(),
        summary.strip(),
        _entry_published_iso(entry) or "",
    )
    digest = hashlib.sha256("|".join(parts).encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _parsed_get(parsed: object, key: str, default: object = None) -> object:
|
||||
if isinstance(parsed, dict):
|
||||
return parsed.get(key, default)
|
||||
return getattr(parsed, key, default)
|
||||
|
||||
|
||||
def _normalize_tokens(text: str) -> set[str]:
|
||||
normalized = re.sub(r"[^a-z0-9]+", " ", text.lower())
|
||||
return {token for token in normalized.split() if len(token) >= 4}
|
||||
|
||||
|
||||
def _probe_image_url(url: str, timeout: int = 5) -> bool:
|
||||
"""Return True if URL responds without a 4xx/5xx error (HEAD request).
|
||||
|
||||
Returns True on network/connection errors so that a flaky server does not
|
||||
cause a valid image to be silently dropped.
|
||||
"""
|
||||
try:
|
||||
req = _urllib_req.Request(
|
||||
url,
|
||||
method="HEAD",
|
||||
headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
|
||||
)
|
||||
with _urllib_req.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status < 400
|
||||
except urllib.error.HTTPError as exc:
|
||||
return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
|
||||
except Exception:
|
||||
return True # network error → don't filter, let WP try later
|
||||
|
||||
|
||||
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
    """Score image URLs for use as an article image, best first.

    ``data:`` URIs are dropped. Every other URL gets a score built from
    penalties (placeholder and blocked patterns, ``crop=`` in the query),
    presseportal.de thumbnail rules (only when *source_url* is on that host)
    and a bonus for title words appearing in the image path.

    Returns:
        Dicts with ``url``, ``score`` and ``reasons``, sorted by descending
        score; equal scores keep their input order.
    """
    host = (urlparse(source_url).hostname or "").lower()
    from_presseportal = "presseportal.de" in host
    wanted_tokens = _normalize_tokens(title)
    blocked_patterns = ("logo", "badge", "app-store", "google-play", "na-logo", "sprite", "icon", "favicon", "tracking", "pixel", ".svg", ".ico", ".gif")
    # Known placeholder/default images that should never be used as featured image
    placeholder_patterns = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
    # (path fragment, score delta, reason) -- only the first matching rule applies.
    presseportal_rules = (
        ("/thumbnail/story_big/", 120, "presseportal-story-big"),
        ("/thumbnail/highlight/", 45, "presseportal-highlight"),
        ("/thumbnail/liste/", -40, "presseportal-list"),
    )

    def _assess(candidate: str) -> dict[str, Any]:
        parts = urlparse(candidate)
        path = unquote(parts.path.lower())
        haystack = f"{parts.netloc.lower()}{path}"
        adjustments: list[tuple[int, str]] = []

        if any(pattern in haystack for pattern in placeholder_patterns):
            adjustments.append((-300, "placeholder-image"))
        if any(pattern in haystack for pattern in blocked_patterns):
            adjustments.append((-150, "blocked-pattern"))

        if from_presseportal:
            for fragment, delta, reason in presseportal_rules:
                if fragment in path:
                    adjustments.append((delta, reason))
                    break

        if "crop=" in (parts.query or "").lower():
            adjustments.append((-10, "cropped-preview"))

        shared = len(wanted_tokens & _normalize_tokens(path.replace("-", " ")))
        if shared > 0:
            adjustments.append((min(30, shared * 6), f"title-match:{shared}"))

        return {
            "url": candidate,
            "score": sum(delta for delta, _ in adjustments),
            "reasons": [reason for _, reason in adjustments],
        }

    # Skip inline data: URIs (e.g. base64-encoded SVG placeholders)
    assessed = [_assess(candidate) for candidate in images if not candidate.startswith("data:")]
    return sorted(assessed, key=lambda item: item["score"], reverse=True)
|
||||
|
||||
|
||||
def _select_relevant_images(source_url: str, title: str, images: list[str], max_keep: int = 3) -> tuple[list[str], str | None, list[dict[str, Any]]]:
    """Pick the best reachable images for an article.

    Args:
        source_url: URL of the article the images belong to.
        title: Article title, used for ranking.
        images: Candidate image URLs; empty values and duplicates are ignored.
        max_keep: Stop collecting once this many images have been kept.

    Returns:
        ``(kept, primary, ranked)``: the kept URLs in rank order, the first
        kept URL (``None`` when nothing was kept) and the full ranking from
        ``_rank_image_candidates``.
    """
    # Order-preserving dedupe of the non-empty inputs.
    deduped = list(dict.fromkeys(image for image in images if image))

    ranked = _rank_image_candidates(source_url, title, deduped)
    candidates = [item["url"] for item in ranked if item["score"] > -100]

    # Probe only the top candidates to bound the number of HTTP requests.
    probe_window = 4
    kept: list[str] = []
    for url in candidates[:probe_window]:
        if _probe_image_url(url):
            kept.append(url)
            if len(kept) >= max_keep:
                break

    # _probe_image_url already returns True for network errors, so reaching
    # this point with nothing kept means every probed URL answered with a
    # definite HTTP error. Those URLs are not resurrected; only candidates
    # that were never probed are used as a fallback.
    if not kept:
        kept = candidates[probe_window:][:max_keep]

    primary = kept[0] if kept else None
    return kept, primary, ranked
|
||||
|
||||
|
||||
def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str:
|
||||
meta: dict[str, Any] = {}
|
||||
if existing_meta_json:
|
||||
try:
|
||||
parsed = json.loads(existing_meta_json)
|
||||
if isinstance(parsed, dict):
|
||||
meta = parsed
|
||||
except Exception:
|
||||
meta = {}
|
||||
meta["attribution"] = attribution
|
||||
meta["extraction"] = extraction_meta
|
||||
return json.dumps(meta, ensure_ascii=False)
|
||||
|
||||
|
||||
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch feeds, extract their entries and upsert them as articles.

    A run record is created first and finished with a JSON summary on success
    or with the exception text on failure.

    Args:
        feed_id: When given, only that feed is processed, and only if it is
            enabled. Otherwise all enabled feeds are processed.

    Returns:
        An ``IngestionStats`` with the counters collected so far and a status
        of ``"success"`` or ``"failed"``. Exceptions are not propagated.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    feed_results: list[dict[str, object]] = []

    try:
        # Kept as a call-time import, but resolved once per run instead of
        # once per feed.
        from .config import get_settings as _get_settings

        max_age_days = _get_settings().pipeline_max_article_age_days

        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled") or 0) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1
            current_feed_id = int(feed["id"])

            parsed = None
            feed_error = None
            for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
                try:
                    parsed = feedparser.parse(
                        feed["url"],
                        etag=feed.get("etag"),
                        modified=feed.get("last_modified"),
                    )
                    break
                except Exception as exc:
                    feed_error = str(exc)
                    if attempt < MAX_FEED_FETCH_RETRIES:
                        # Linear back-off between attempts.
                        time.sleep(0.5 * attempt)

            if parsed is None:
                feed_results.append(
                    {
                        "feed_id": current_feed_id,
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            update_feed_fetch_state(
                feed_id=current_feed_id,
                etag=parsed_etag if isinstance(parsed_etag, str) else None,
                last_modified=parsed_modified if isinstance(parsed_modified, str) else None,
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    continue

                published_iso = _entry_published_iso(entry)

                # Age filter: skip articles older than max_age_days (0 = no limit)
                if max_age_days > 0 and published_iso:
                    try:
                        published_dt = datetime.fromisoformat(published_iso)
                        age = datetime.now(timezone.utc) - published_dt
                        if age > timedelta(days=max_age_days):
                            continue
                    except Exception:
                        pass  # can't evaluate the date -> allow through

                # Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
                link = _resolve_google_redirect(link)
                # Normalize AMP params (e.g. ?outputType=valid_amp)
                link = _normalize_article_url(link)

                summary, content_raw = _entry_text(entry)
                # Strip HTML tags from title (Google Alerts wraps matched keywords in <b>)
                raw_title = entry.get("title") or "Ohne Titel"
                title = re.sub(r"<[^>]+>", "", raw_title).strip() or "Ohne Titel"
                extracted = extract_article(link)

                # Extracted page data wins over what the feed entry provides.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                final_canonical = extracted.canonical_url or entry.get("link")
                selected_images, primary_image, ranked_images = _select_relevant_images(
                    link,
                    final_title,
                    extracted.images,
                    max_keep=3,
                )

                source_hash = _entry_hash(
                    entry,
                    current_feed_id,
                    link,
                    final_title,
                    final_summary or "",
                )
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": current_feed_id,
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                extraction_meta["image_selection"] = {
                    "primary": primary_image,
                    "selected_count": len(selected_images),
                    "total_candidates": len(extracted.images),
                    "ranked": ranked_images,
                }
                base_payload = ArticleUpsert(
                    feed_id=current_feed_id,
                    source_article_id=entry.get("id") or entry.get("guid"),
                    source_hash=source_hash,
                    title=final_title,
                    source_url=link,
                    canonical_url=final_canonical,
                    published_at=published_iso,
                    author=final_author,
                    summary=final_summary,
                    content_raw=final_content_raw,
                    content_rewritten=None,
                    image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None,
                    press_contact=extracted.press_contact,
                    source_name_snapshot=feed.get("source_name"),
                    source_terms_url_snapshot=feed.get("source_terms_url"),
                    source_license_name_snapshot=feed.get("source_license_name"),
                    legal_checked=False,
                    legal_checked_at=None,
                    legal_note=None,
                    wp_post_id=None,
                    wp_post_url=None,
                    publish_attempts=0,
                    publish_last_error=None,
                    published_to_wp_at=None,
                    word_count=len((final_content_raw or "").split()),
                    status="new",
                    meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
                )
                existing = find_existing_article_for_upsert(base_payload)
                if existing and existing.get("status") == "error":
                    # Explicitly closed article: ignore on subsequent ingestion runs.
                    continue

                payload = base_payload
                if existing:
                    # Refresh source-derived fields but keep everything the
                    # editorial/publishing workflow has written so far.
                    payload = ArticleUpsert(
                        feed_id=base_payload.feed_id,
                        source_article_id=base_payload.source_article_id,
                        source_hash=base_payload.source_hash,
                        title=base_payload.title,
                        source_url=base_payload.source_url,
                        canonical_url=base_payload.canonical_url,
                        published_at=base_payload.published_at,
                        author=base_payload.author,
                        summary=base_payload.summary,
                        content_raw=base_payload.content_raw,
                        content_rewritten=existing.get("content_rewritten"),
                        image_urls_json=base_payload.image_urls_json,
                        press_contact=base_payload.press_contact or existing.get("press_contact"),
                        source_name_snapshot=base_payload.source_name_snapshot,
                        source_terms_url_snapshot=base_payload.source_terms_url_snapshot,
                        source_license_name_snapshot=base_payload.source_license_name_snapshot,
                        # "or 0" guards against NULL columns, which would
                        # otherwise raise in int() and fail the whole run.
                        legal_checked=bool(int(existing.get("legal_checked") or 0)),
                        legal_checked_at=existing.get("legal_checked_at"),
                        legal_note=existing.get("legal_note"),
                        wp_post_id=existing.get("wp_post_id"),
                        wp_post_url=existing.get("wp_post_url"),
                        publish_attempts=int(existing.get("publish_attempts") or 0),
                        publish_last_error=existing.get("publish_last_error"),
                        published_to_wp_at=existing.get("published_to_wp_at"),
                        word_count=base_payload.word_count,
                        status=existing.get("status") or "new",
                        meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta),
                    )

                article_id = upsert_article(payload)
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": current_feed_id,
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Top-level boundary: record the failure on the run and report it.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
|
||||
727
backend/app/main.py
Normal file
727
backend/app/main.py
Normal file
|
|
@ -0,0 +1,727 @@
|
|||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
import csv
|
||||
from datetime import datetime, timezone
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from .admin_ui import router as admin_router
|
||||
from .auth import create_session_token, verify_credentials, verify_session_token
|
||||
from .config import get_settings
|
||||
from .db import init_db
|
||||
from .ingestion import run_ingestion
|
||||
from .pipeline import run_auto_pipeline
|
||||
from .policy import evaluate_source_policy, is_source_allowed
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .relevance import article_age_days, article_relevance
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text
|
||||
from .telegram_bot import handle_update, setup_webhook
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
RunCreate,
|
||||
SourceCreate,
|
||||
create_feed as repo_create_feed,
|
||||
create_run,
|
||||
create_source as repo_create_source,
|
||||
finish_run,
|
||||
get_article_by_id,
|
||||
get_feed_by_id,
|
||||
get_run_by_id,
|
||||
get_source_by_id,
|
||||
list_publish_jobs,
|
||||
list_articles as repo_list_articles,
|
||||
list_feeds as repo_list_feeds,
|
||||
list_runs,
|
||||
list_sources as repo_list_sources,
|
||||
set_article_legal_review,
|
||||
update_article_status,
|
||||
upsert_article as repo_upsert_article,
|
||||
)
|
||||
from .workflow import ALLOWED_UI_TRANSITIONS, UI_STATUSES, internal_to_ui_status, ui_to_internal_status
|
||||
|
||||
# Settings are read once at import time; the app title, session cookie
# handling and the /health endpoint below all read from this object.
settings = get_settings()
|
||||
|
||||
|
||||
@asynccontextmanager
async def app_lifespan(_: FastAPI):
    """Lifespan handler: run ``init_db()`` before the application starts serving."""
    init_db()
    # No teardown work is performed after the yield.
    yield
|
||||
|
||||
|
||||
# Application object: admin UI routes plus static assets served from the
# "static" directory that sits one level above this package.
app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
app.include_router(admin_router)
app.mount(
    "/admin/static",
    StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
    name="admin-static",
)
|
||||
|
||||
|
||||
class LoginRequest(BaseModel):
    """Request body of ``POST /auth/login``."""

    username: str
    password: str
|
||||
|
||||
|
||||
class SourceCreateRequest(BaseModel):
    """Request body of ``POST /api/sources``; mirrors the fields of ``SourceCreate``."""

    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    terms_url: str | None = None
    license_name: str | None = None
    # One of "green", "yellow" or "red".
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # New sources are disabled unless the client enables them explicitly.
    is_enabled: bool = False
    notes: str | None = None
    last_reviewed_at: str | None = None
|
||||
|
||||
|
||||
class FeedCreateRequest(BaseModel):
    """Request body of ``POST /api/feeds``."""

    name: str = Field(min_length=1, max_length=200)
    url: str = Field(min_length=5, max_length=1000)
    # Optional link to a source record.
    source_id: int | None = None
    is_enabled: bool = True
|
||||
|
||||
|
||||
class RunCreateRequest(BaseModel):
    """Request body of ``POST /api/runs``."""

    run_type: str = Field(min_length=2, max_length=100)
    # One of "queued", "running", "success" or "failed".
    status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class RunFinishRequest(BaseModel):
    """Request body of ``POST /api/runs/{run_id}/finish``."""

    # Final state of the run: "success" or "failed".
    status: str = Field(pattern="^(success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class ArticleUpsertRequest(BaseModel):
    """Request body describing an article to insert or update.

    NOTE(review): the endpoint consuming this model is outside the visible
    part of the file; the field names match the keyword arguments passed to
    ``ArticleUpsert`` in the ingestion module -- confirm the mapping there.
    """

    feed_id: int | None = None
    source_article_id: str | None = None
    source_hash: str | None = None
    title: str = Field(min_length=1, max_length=500)
    source_url: str = Field(min_length=5, max_length=2000)
    canonical_url: str | None = None
    published_at: str | None = None
    author: str | None = None
    summary: str | None = None
    content_raw: str | None = None
    content_rewritten: str | None = None
    # JSON-encoded string, not a parsed list.
    image_urls_json: str | None = None
    press_contact: str | None = None
    source_name_snapshot: str | None = None
    source_terms_url_snapshot: str | None = None
    source_license_name_snapshot: str | None = None
    legal_checked: bool = False
    legal_checked_at: str | None = None
    legal_note: str | None = None
    wp_post_id: int | None = None
    wp_post_url: str | None = None
    publish_attempts: int = 0
    publish_last_error: str | None = None
    published_to_wp_at: str | None = None
    word_count: int = 0
    # Restricted to the status names listed in the pattern.
    status: str = Field(default="new", pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    # JSON-encoded string, not a parsed object.
    meta_json: str | None = None
|
||||
|
||||
|
||||
class IngestionRunRequest(BaseModel):
    """Request body for triggering an ingestion run.

    NOTE(review): the consuming endpoint is not visible here; ``feed_id``
    presumably maps to ``run_ingestion(feed_id=...)`` -- confirm.
    """

    feed_id: int | None = None
|
||||
|
||||
|
||||
class ArticleTransitionRequest(BaseModel):
    """Request body naming the status an article should move to.

    NOTE(review): the consuming endpoint is not visible in this part of the file.
    """

    # Restricted to the status names listed in the pattern.
    target_status: str = Field(pattern="^(new|rewrite|publish|published|close|review|approved|error|no_image)$")
    note: str | None = None
|
||||
|
||||
|
||||
class ArticleReviewRequest(BaseModel):
    """Request body carrying a review decision for an article.

    NOTE(review): the consuming endpoint is not visible in this part of the file.
    """

    # Either "approve" or "reject".
    decision: str = Field(pattern="^(approve|reject)$")
    note: str | None = None
|
||||
|
||||
|
||||
class ArticleLegalReviewRequest(BaseModel):
    """Request body carrying the outcome of a legal review.

    NOTE(review): presumably consumed together with ``set_article_legal_review``;
    the endpoint is not visible in this part of the file -- confirm.
    """

    approved: bool
    note: str | None = None
|
||||
|
||||
|
||||
class PublisherEnqueueRequest(BaseModel):
    """Request body for queueing an article for publishing.

    NOTE(review): presumably forwarded to ``enqueue_publish``; the endpoint is
    not visible in this part of the file. ``max_attempts`` is unbounded here.
    """

    article_id: int
    max_attempts: int = 3
|
||||
|
||||
|
||||
class PublisherRunRequest(BaseModel):
    """Request body for running the publisher.

    NOTE(review): presumably forwarded to ``run_publisher``; the endpoint is
    not visible in this part of the file. ``max_jobs`` is unbounded here.
    """

    max_jobs: int = 10
|
||||
|
||||
|
||||
# Maps an article status to the set of statuses that may follow it.
# NOTE(review): not referenced in the visible part of this module, which also
# imports ALLOWED_UI_TRANSITIONS from .workflow -- confirm which table is
# authoritative.
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
    "new": {"rewrite", "error"},
    "rewrite": {"approved", "error"},
    "approved": {"published", "error"},
    "published": {"error"},
    "error": {"rewrite"},
}
|
||||
|
||||
|
||||
def require_auth(request: Request) -> str:
    """FastAPI dependency returning the username behind the session cookie.

    Raises:
        HTTPException: 401 when the cookie is missing or when
            ``verify_session_token`` does not yield a username.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    if session_token:
        user = verify_session_token(session_token)
        if user:
            return user
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Unauthenticated liveness probe reporting service name and DB path."""
    # NOTE(review): db_path is exposed without authentication.
    report = {"status": "ok", "service": settings.app_name, "db_path": settings.app_db_path}
    return report
|
||||
|
||||
|
||||
@app.post("/auth/login")
def login(payload: LoginRequest, response: Response) -> dict:
    """Check the credentials and set the session cookie on success.

    Raises:
        HTTPException: 401 when ``verify_credentials`` rejects the login.
    """
    credentials_ok = verify_credentials(payload.username, payload.password)
    if not credentials_ok:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")

    session_token = create_session_token(payload.username)
    # NOTE(review): secure=False lets the cookie travel over plain HTTP.
    cookie_options = {
        "key": settings.session_cookie_name,
        "value": session_token,
        "max_age": settings.session_max_age_seconds,
        "httponly": True,
        "secure": False,
        "samesite": "lax",
    }
    response.set_cookie(**cookie_options)
    return {"ok": True, "username": payload.username}
|
||||
|
||||
|
||||
@app.post("/auth/logout")
def logout(response: Response) -> dict:
    """Delete the session cookie; no authentication is required."""
    cookie_name = settings.session_cookie_name
    response.delete_cookie(cookie_name)
    return {"ok": True}
|
||||
|
||||
|
||||
@app.get("/auth/me")
def me(username: str = Depends(require_auth)) -> dict:
    """Return the username of the authenticated session."""
    identity = {"authenticated": True, "username": username}
    return identity
|
||||
|
||||
|
||||
@app.get("/api/protected")
def protected(username: str = Depends(require_auth)) -> dict:
    """Sample endpoint that only answers for authenticated sessions."""
    body = {"ok": True, "message": "Protected endpoint", "username": username}
    return body
|
||||
|
||||
|
||||
@app.get("/api/pipeline/status")
def pipeline_status(username: str = Depends(require_auth)) -> dict:
    """Report how many sources, feeds and articles the repositories return."""
    feed_count = len(repo_list_feeds())
    source_count = len(repo_list_sources())
    # NOTE(review): articles are fetched with limit=500, so this count cannot
    # exceed 500 even when more articles exist.
    article_count = len(repo_list_articles(limit=500))
    counts = {
        "sources": source_count,
        "feeds": feed_count,
        "articles": article_count,
    }
    return {
        "ok": True,
        "stage": "skeleton+db",
        "requested_by": username,
        "counts": counts,
    }
|
||||
|
||||
|
||||
@app.get("/api/sources")
def list_sources(username: str = Depends(require_auth)) -> dict:
    """Return all sources from the repository."""
    sources = repo_list_sources()
    return {"ok": True, "items": sources, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/sources/{source_id}/policy-check")
def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for one source.

    Raises:
        HTTPException: 404 when the source does not exist.
    """
    source = get_source_by_id(source_id)
    if not source:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")

    policy_issues = evaluate_source_policy(source)
    permitted = is_source_allowed(source)
    return {
        "ok": True,
        "source_id": source_id,
        "allowed": permitted,
        "issues": policy_issues,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/sources")
def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a source from the request body and return its id."""
    new_source = SourceCreate(
        name=payload.name,
        base_url=payload.base_url,
        terms_url=payload.terms_url,
        license_name=payload.license_name,
        risk_level=payload.risk_level,
        is_enabled=payload.is_enabled,
        notes=payload.notes,
        last_reviewed_at=payload.last_reviewed_at,
    )
    created_id = repo_create_source(new_source)
    return {"ok": True, "id": created_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds")
def list_feeds(username: str = Depends(require_auth)) -> dict:
    """Return all feeds from the repository."""
    feeds = repo_list_feeds()
    return {"ok": True, "items": feeds, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds/{feed_id}/policy-check")
def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy against the source data joined onto a feed.

    The feed is allowed exactly when the policy evaluation reports no issues.

    Raises:
        HTTPException: 404 when the feed does not exist.
    """
    feed = get_feed_by_id(feed_id)
    if not feed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")

    # Policy field name -> column name carried on the feed record.
    snapshot_fields = {
        "id": "source_id",
        "name": "source_name",
        "base_url": "source_base_url",
        "terms_url": "source_terms_url",
        "license_name": "source_license_name",
        "risk_level": "source_risk_level",
        "last_reviewed_at": "source_last_reviewed_at",
        "is_enabled": "source_is_enabled",
    }
    source_snapshot = {field: feed.get(column) for field, column in snapshot_fields.items()}
    policy_issues = evaluate_source_policy(source_snapshot)
    return {
        "ok": True,
        "feed_id": feed_id,
        "allowed": len(policy_issues) == 0,
        "issues": policy_issues,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/feeds")
def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a feed from the request body and return its id.

    Raises:
        HTTPException: 400 when the repository raises while creating the feed.
    """
    new_feed = FeedCreate(
        name=payload.name,
        url=payload.url,
        source_id=payload.source_id,
        is_enabled=payload.is_enabled,
    )
    try:
        created_id = repo_create_feed(new_feed)
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc
    else:
        return {"ok": True, "id": created_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs")
def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
    """Return up to *limit* runs from the repository."""
    runs = list_runs(limit=limit)
    return {"ok": True, "items": runs, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs/{run_id}")
def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
    """Return a single run.

    Raises:
        HTTPException: 404 when the run does not exist.
    """
    found = get_run_by_id(run_id)
    if found:
        return {"ok": True, "item": found, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
|
||||
|
||||
|
||||
@app.post("/api/runs")
def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a run record from the request body and return its id."""
    new_run = RunCreate(run_type=payload.run_type, status=payload.status, details=payload.details)
    created_id = create_run(new_run)
    return {"ok": True, "id": created_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given status and details.

    Raises:
        HTTPException: 404 when the run does not exist, matching the
            behaviour of ``GET /api/runs/{run_id}``.
    """
    # Without this check the endpoint reported success for unknown run ids.
    if not get_run_by_id(run_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    return {"ok": True, "id": run_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/articles")
def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
    """Return up to *limit* articles, optionally filtered by UI status.

    Each item additionally carries ``status_ui``, the UI name derived from its
    stored status.
    """
    internal_filter = None
    if status_filter:
        internal_filter = ui_to_internal_status(status_filter)

    articles = repo_list_articles(limit=limit, status_filter=internal_filter)
    for article in articles:
        article.update(status_ui=internal_to_ui_status(article.get("status")))
    return {"ok": True, "items": articles, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/articles/export")
def api_export_articles(
    format: str = "json",
    status_filter: str | None = None,
    username: str = Depends(require_auth),
):
    """Export up to 500 articles as JSON (default) or as a CSV attachment.

    Args:
        format: ``"csv"`` produces a CSV download; any other value yields JSON.
        status_filter: Optional UI status name, translated to the internal status.
        username: Authenticated user, echoed in the JSON response.

    Returns:
        A ``Response`` with ``text/csv`` content for ``format == "csv"``,
        otherwise a ``JSONResponse`` containing the rows and some metadata.
    """
    # Single source of truth for the exported columns and their order; used both
    # to build each row and as the CSV header so the two can never drift apart.
    export_fields = (
        "id",
        "title",
        "status",
        "published_at",
        "days_old",
        "relevance",
        "author",
        "source_url",
        "canonical_url",
        "source_name_snapshot",
        "source_license_name_snapshot",
        "source_terms_url_snapshot",
        "press_contact",
        "image_urls_json",
        "selected_image_url",
        "legal_checked",
        "legal_checked_at",
        "legal_note",
    )

    internal_filter = ui_to_internal_status(status_filter) if status_filter else None
    articles = repo_list_articles(limit=500, status_filter=internal_filter)

    rows = []
    for article in articles:
        # meta_json is optional and may be malformed; fall back to an empty dict.
        meta: dict = {}
        raw_meta = article.get("meta_json")
        if raw_meta:
            try:
                parsed = json.loads(raw_meta)
            except Exception:
                parsed = None
            if isinstance(parsed, dict):
                meta = parsed

        image_review = meta.get("image_review")
        if not isinstance(image_review, dict):
            image_review = {}
        selected = image_review.get("selected_url")

        published_at = article.get("published_at")
        # Columns that are computed rather than copied straight from the article.
        computed = {
            "days_old": article_age_days(published_at),
            "relevance": article_relevance(published_at),
            "selected_image_url": selected if isinstance(selected, str) else None,
            # "or 0" guards against a stored None, which int() would reject.
            "legal_checked": bool(int(article.get("legal_checked") or 0)),
        }
        rows.append(
            {
                name: computed[name] if name in computed else article.get(name)
                for name in export_fields
            }
        )

    generated_at = datetime.now(timezone.utc).isoformat()

    if format == "csv":
        out = io.StringIO()
        writer = csv.DictWriter(out, fieldnames=list(export_fields))
        writer.writeheader()
        writer.writerows(rows)
        return Response(
            content=out.getvalue(),
            media_type="text/csv; charset=utf-8",
            headers={"Content-Disposition": 'attachment; filename="articles_export.csv"'},
        )

    return JSONResponse(
        {
            "ok": True,
            "count": len(rows),
            "generated_at": generated_at,
            "status_filter": status_filter,
            "items": rows,
            "requested_by": username,
        }
    )
|
||||
|
||||
|
||||
@app.get("/api/articles/{article_id}")
def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Return one article, annotated with its UI status under ``status_ui``.

    Raises:
        HTTPException: 404 when no article is found for ``article_id``.
    """
    record = get_article_by_id(article_id)
    if not record:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    ui_status = internal_to_ui_status(record.get("status"))
    record["status_ui"] = ui_status
    return {"ok": True, "item": record, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/articles/upsert")
def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
    """Upsert an article from the request payload and return its id.

    Every payload field is forwarded to ``ArticleUpsert`` under the same name,
    except ``status``, which is translated from the UI name to the internal one.
    """
    # Fields handed over unchanged (payload attribute name == keyword name).
    verbatim_fields = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "content_rewritten",
        "image_urls_json",
        "press_contact",
        "source_name_snapshot",
        "source_terms_url_snapshot",
        "source_license_name_snapshot",
        "legal_checked",
        "legal_checked_at",
        "legal_note",
        "wp_post_id",
        "wp_post_url",
        "publish_attempts",
        "publish_last_error",
        "published_to_wp_at",
        "word_count",
        "meta_json",
    )
    upsert_kwargs = {name: getattr(payload, name) for name in verbatim_fields}
    upsert_kwargs["status"] = ui_to_internal_status(payload.status)

    stored_id = repo_upsert_article(ArticleUpsert(**upsert_kwargs))
    return {"ok": True, "id": stored_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/transition")
def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
    """Move an article to another status if ``ALLOWED_UI_TRANSITIONS`` permits it.

    Raises:
        HTTPException: 404 when the article is missing (before or during the
            update), 400 when the requested transition is not allowed.
    """
    record = get_article_by_id(article_id)
    if not record:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    # Compare in UI terms: normalise both the current and the requested status.
    source_ui = internal_to_ui_status(record.get("status"))
    destination_internal = ui_to_internal_status(payload.target_status)
    destination_ui = internal_to_ui_status(destination_internal)

    if destination_ui not in ALLOWED_UI_TRANSITIONS.get(source_ui, set()):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Ungueltiger Statuswechsel: {source_ui} -> {destination_ui}",
        )

    if not update_article_status(article_id, destination_internal, actor=username, note=payload.note):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    return {"ok": True, "id": article_id, "from_status": source_ui, "to_status": destination_ui}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/rewrite-run")
def api_article_rewrite_run(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Rewrite an article, generate tags and store it with internal status ``approved``.

    Only articles whose UI status is ``new`` or ``rewrite`` are accepted. Tag
    generation is best-effort: on failure the article is saved with no tags.

    Raises:
        HTTPException: 404 when the article is missing, 400 when its status
            does not allow a rewrite.
    """
    import logging

    article = get_article_by_id(article_id)
    if not article:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
    if internal_to_ui_status(article.get("status")) not in {"rewrite", "new"}:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Rewrite nur aus Status 'new' oder 'rewrite'")

    rewritten = rewrite_article_text(article)

    tags: list[str] = []
    try:
        tags = generate_article_tags(article, rewritten_text=rewritten)
    except Exception as exc:
        # Best-effort step: keep going without tags, but leave a trace in the log.
        logging.getLogger(__name__).warning(
            "Tag-Generierung für Artikel #%d fehlgeschlagen: %s", article_id, exc
        )
        tags = []
    merged_meta = merge_generated_tags(article.get("meta_json"), tags)

    # Re-submit all existing fields through the upsert path, replacing only the
    # rewritten content, word count, status and merged metadata.
    repo_upsert_article(
        ArticleUpsert(
            feed_id=article.get("feed_id"),
            source_article_id=article.get("source_article_id"),
            source_hash=article.get("source_hash"),
            title=article.get("title"),
            source_url=article.get("source_url"),
            canonical_url=article.get("canonical_url"),
            published_at=article.get("published_at"),
            author=article.get("author"),
            summary=article.get("summary"),
            content_raw=article.get("content_raw"),
            content_rewritten=rewritten,
            image_urls_json=article.get("image_urls_json"),
            press_contact=article.get("press_contact"),
            source_name_snapshot=article.get("source_name_snapshot"),
            source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
            source_license_name_snapshot=article.get("source_license_name_snapshot"),
            # "or 0" guards against a stored None, which int() would reject.
            legal_checked=bool(int(article.get("legal_checked") or 0)),
            legal_checked_at=article.get("legal_checked_at"),
            legal_note=article.get("legal_note"),
            wp_post_id=article.get("wp_post_id"),
            wp_post_url=article.get("wp_post_url"),
            publish_attempts=int(article.get("publish_attempts") or 0),
            publish_last_error=article.get("publish_last_error"),
            published_to_wp_at=article.get("published_to_wp_at"),
            word_count=len(rewritten.split()),
            status="approved",
            meta_json=merged_meta,
        )
    )
    # NOTE(review): the stored status is "approved" while the response reports
    # "publish" — presumably the UI name for the same state; confirm.
    return {"ok": True, "id": article_id, "status": "publish", "tags": tags}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/legal-review")
def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Record a legal-review decision (approved flag plus note) for an article.

    Raises:
        HTTPException: 404 when the article is missing before or during the update.
    """
    not_found = "Artikel nicht gefunden"

    if not get_article_by_id(article_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=not_found)

    changed = set_article_legal_review(
        article_id,
        approved=payload.approved,
        note=payload.note,
        actor=username,
    )
    if not changed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=not_found)

    return {"ok": True, "id": article_id, "legal_checked": payload.approved}
|
||||
|
||||
|
||||
@app.get("/api/publisher/jobs")
def api_publisher_jobs(limit: int = 100, username: str = Depends(require_auth)) -> dict:
    """List publish jobs via ``list_publish_jobs``, passing ``limit`` through."""
    jobs = list_publish_jobs(limit=limit)
    response = {"ok": True, "items": jobs, "requested_by": username}
    return response
|
||||
|
||||
|
||||
@app.post("/api/publisher/enqueue")
def api_publisher_enqueue(payload: PublisherEnqueueRequest, username: str = Depends(require_auth)) -> dict:
    """Queue a publish job for an existing article.

    Raises:
        HTTPException: 404 when the referenced article does not exist.
    """
    target_id = payload.article_id
    if not get_article_by_id(target_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    queued_job = enqueue_publish(article_id=payload.article_id, max_attempts=payload.max_attempts)
    return {
        "ok": True,
        "job_id": queued_job,
        "article_id": payload.article_id,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/publisher/run")
def api_publisher_run(payload: PublisherRunRequest, username: str = Depends(require_auth)) -> dict:
    """Run the publisher for at most ``payload.max_jobs`` jobs and report its counters."""
    outcome = run_publisher(max_jobs=payload.max_jobs)
    counters = {
        "processed": outcome.processed,
        "success": outcome.success,
        "failed": outcome.failed,
        "requeued": outcome.requeued,
    }
    return {"ok": True, "requested_by": username, "stats": counters}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/review")
def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Retired endpoint: always answers 410 Gone, whatever the input.

    The route is kept so callers get an explicit message instead of a 404.
    """
    gone = HTTPException(
        status_code=status.HTTP_410_GONE,
        detail="Review-Endpoint ersetzt durch Rewrite-Workflow",
    )
    raise gone
|
||||
|
||||
|
||||
@app.post("/api/ingestion/run")
def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
    """Run ingestion for ``payload.feed_id`` and report the outcome.

    ``ok`` is true only when the ingestion status equals ``"success"``.
    """
    outcome = run_ingestion(feed_id=payload.feed_id)
    succeeded = outcome.status == "success"
    counters = {
        "feeds_processed": outcome.feeds_processed,
        "entries_seen": outcome.entries_seen,
        "articles_upserted": outcome.articles_upserted,
    }
    return {
        "ok": succeeded,
        "run_id": outcome.run_id,
        "status": outcome.status,
        "message": outcome.message,
        "stats": counters,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# N8N Automation endpoint (API-Key auth, no session cookie required)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _require_api_key(request: Request) -> None:
    """Authenticate an automation request against ``settings.n8n_api_key``.

    The key is read from the ``X-API-Key`` header, falling back to the
    ``api_key`` query parameter.

    Raises:
        HTTPException: 501 when no key is configured on the server, 401 when
            the supplied key is missing or does not match.
    """
    import hmac

    expected = settings.n8n_api_key
    if not expected:
        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="N8N_API_KEY nicht konfiguriert")

    # NOTE: the query-parameter fallback is kept for existing callers; keys sent
    # that way can end up in access logs, so the header is the safer channel.
    supplied = request.headers.get("X-API-Key") or request.query_params.get("api_key") or ""

    # Constant-time comparison so response timing does not leak how much of the
    # key matched. Encoding to bytes lets compare_digest accept non-ASCII input.
    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungültiger API-Key")
|
||||
|
||||
|
||||
_pipeline_lock = asyncio.Lock()

# Strong references to in-flight pipeline tasks. The event loop only keeps weak
# references, so an otherwise unreferenced task may be garbage-collected mid-run.
_pipeline_tasks: set = set()


@app.post("/api/n8n/pipeline")
async def api_n8n_pipeline(request: Request) -> dict:
    """Trigger the full auto pipeline in background. Returns immediately.

    Called by N8N (2x/day or on demand). Results arrive via Telegram.

    Only one pipeline run is active at a time: while ``_pipeline_lock`` is held,
    further triggers are answered with ``ok: False`` and ignored.

    Raises:
        HTTPException: propagated from ``_require_api_key`` on a bad or missing key.
    """
    import logging

    _require_api_key(request)
    log = logging.getLogger(__name__)

    if _pipeline_lock.locked():
        log.warning("Pipeline bereits aktiv – Trigger ignoriert")
        return {"ok": False, "message": "Pipeline läuft bereits – Trigger ignoriert"}

    # Take the lock here instead of inside the task: acquiring an uncontended
    # asyncio.Lock does not suspend, so the check above and this acquire cannot
    # be interleaved with a second trigger.
    await _pipeline_lock.acquire()

    async def _run() -> None:
        loop = asyncio.get_running_loop()
        try:
            # The pipeline is blocking code; run it in the default executor.
            await loop.run_in_executor(None, lambda: run_auto_pipeline(trigger="n8n"))
        except Exception as exc:
            log.exception("Background pipeline error: %s", exc)

    def _finished(done_task) -> None:
        # Runs on completion, failure or cancellation, so the lock is always freed.
        _pipeline_tasks.discard(done_task)
        _pipeline_lock.release()

    try:
        task = asyncio.create_task(_run())
    except BaseException:
        _pipeline_lock.release()
        raise
    _pipeline_tasks.add(task)
    task.add_done_callback(_finished)

    return {"ok": True, "message": "Pipeline gestartet – Ergebnisse kommen per Telegram"}
|
||||
|
||||
|
||||
@app.post("/api/n8n/ingest")
def api_n8n_ingest(request: Request) -> dict:
    """Run only the ingestion step (no rewrite/publish). For N8N.

    Authenticated via API key rather than a session; ``ok`` is true only when
    the ingestion status equals ``"success"``.
    """
    _require_api_key(request)

    outcome = run_ingestion()
    counters = {
        "feeds_processed": outcome.feeds_processed,
        "entries_seen": outcome.entries_seen,
        "articles_upserted": outcome.articles_upserted,
    }
    return {"ok": outcome.status == "success", "stats": counters}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Telegram Webhook
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Strong references to in-flight update handlers. The event loop only keeps weak
# references, so an otherwise unreferenced task may be garbage-collected mid-run.
_telegram_tasks: set = set()


@app.post("/telegram/webhook")
async def telegram_webhook(request: Request) -> dict:
    """Receive updates from Telegram Bot API.

    Returns 200 immediately so Telegram never retries the same update.
    Actual processing runs in a background task.

    Raises:
        HTTPException: 403 when a webhook secret is configured and the
            ``X-Telegram-Bot-Api-Secret-Token`` header does not match it,
            400 when the request body is not valid JSON.
    """
    import asyncio
    import hmac
    import logging

    # Verify secret token (skipped entirely when no secret is configured).
    secret = settings.telegram_webhook_secret
    if secret:
        incoming = request.headers.get("X-Telegram-Bot-Api-Secret-Token", "")
        # Constant-time comparison so response timing does not leak the secret.
        if not hmac.compare_digest(incoming.encode("utf-8"), secret.encode("utf-8")):
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid secret")

    body = await request.body()
    try:
        update = json.loads(body.decode("utf-8"))
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") from exc

    async def _process() -> None:
        loop = asyncio.get_running_loop()
        try:
            # handle_update is blocking code; run it in the default executor.
            await loop.run_in_executor(None, lambda: handle_update(update))
        except Exception as exc:
            logging.getLogger(__name__).exception("Telegram update handler error: %s", exc)

    task = asyncio.create_task(_process())
    _telegram_tasks.add(task)
    task.add_done_callback(_telegram_tasks.discard)

    return {"ok": True}
|
||||
|
||||
|
||||
@app.post("/api/telegram/setup-webhook")
def api_setup_telegram_webhook(request: Request) -> dict:
    """Register the Telegram webhook URL. Call once after deployment.

    The URL is derived from the incoming request's base URL plus
    ``/telegram/webhook``; authentication is done by calling ``require_auth``
    directly on the request.
    """
    requester = require_auth(request)

    root = str(request.base_url).rstrip("/")
    target_url = f"{root}/telegram/webhook"
    telegram_reply = setup_webhook(target_url)

    return {
        "ok": True,
        "webhook_url": target_url,
        "telegram_response": telegram_reply,
        "requested_by": requester,
    }
|
||||
516
backend/app/pipeline.py
Normal file
516
backend/app/pipeline.py
Normal file
|
|
@ -0,0 +1,516 @@
|
|||
"""Autonomous RSS-News pipeline.
|
||||
|
||||
Full automated flow:
|
||||
1. Run RSS ingestion
|
||||
2. For each new article:
|
||||
- Auto-select primary image
|
||||
- Score relevance via GPT
|
||||
- < warn threshold: reject (error status) → Telegram rejected summary
|
||||
- warn..auto threshold: Telegram warning with override button
|
||||
- >= auto threshold: rewrite → create WP draft → Telegram notification
|
||||
3. Send pipeline summary to Telegram
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from .config import get_settings
|
||||
from .ingestion import run_ingestion
|
||||
from .publisher import enqueue_publish, run_publisher
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
get_article_by_id,
|
||||
list_articles,
|
||||
set_article_image_decision,
|
||||
update_article_status,
|
||||
upsert_article as repo_upsert_article,
|
||||
)
|
||||
from .rewrite import generate_article_tags, merge_generated_tags, rewrite_article_text, score_article_relevance
|
||||
from .scheduler import reserve_publish_slot
|
||||
from .wordpress import publish_article_draft, selected_image_exists
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PipelineStats:
    """Counters collected during one run of the automated pipeline."""

    # Number of articles upserted by the ingestion step.
    ingested: int = 0
    # Articles that reached relevance scoring (articles without an image are not counted).
    processed: int = 0
    # Articles for which rewrite + WordPress draft creation succeeded.
    drafts_created: int = 0
    # Articles rejected because their score was below the warn threshold.
    rejected: int = 0
    # Articles rejected by a quality gate (signalled by ValueError during rewrite/draft).
    quality_gate_rejected: int = 0
    # Articles scored between the warn and auto thresholds (moved to "review").
    warnings: int = 0
    # Failures: ingestion errors and per-article exceptions.
    errors: int = 0
    # Articles excluded because no image could be selected.
    no_image: int = 0
    # Article records of score-rejected articles, used for the rejected summary.
    rejected_articles: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _auto_select_image(article: dict[str, Any]) -> bool:
|
||||
"""Auto-select the primary image from ingestion metadata if not already selected."""
|
||||
meta_json = article.get("meta_json") or "{}"
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
# Already selected?
|
||||
image_review = meta.get("image_review") or {}
|
||||
if isinstance(image_review, dict) and image_review.get("selected_url"):
|
||||
return True
|
||||
|
||||
# Try to get primary from ingestion extraction
|
||||
extraction = meta.get("extraction") or {}
|
||||
image_selection = extraction.get("image_selection") or {}
|
||||
primary = image_selection.get("primary")
|
||||
|
||||
if not primary:
|
||||
# Fallback: use first URL from image_urls_json
|
||||
image_urls_json = article.get("image_urls_json") or "[]"
|
||||
try:
|
||||
urls = json.loads(image_urls_json)
|
||||
if urls:
|
||||
primary = urls[0]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if primary:
|
||||
set_article_image_decision(int(article["id"]), primary, "select", actor="pipeline")
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
    """Write the relevance result into the article's ``meta_json`` and ``relevance_score``.

    Does nothing when the article cannot be loaded. Unparseable existing
    metadata is replaced by a fresh dict holding only the ``relevance`` entry.
    """
    row = get_article_by_id(article_id)
    if not row:
        return

    try:
        meta_doc = json.loads(row.get("meta_json") or "{}")
    except Exception:
        meta_doc = {}
    meta_doc["relevance"] = relevance
    serialized = json.dumps(meta_doc, ensure_ascii=False)

    from .db import get_conn

    update_sql = "UPDATE articles SET meta_json = ?, relevance_score = ? WHERE id = ?"
    with get_conn() as conn:
        conn.execute(update_sql, (serialized, relevance.get("score", 0), article_id))
|
||||
|
||||
|
||||
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
    """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url).

    Steps, in order: check the raw word count, rewrite the text, check the
    rewritten word count, generate tags (best-effort), store the article with
    status "approved", make sure a publish slot is reserved, create or update
    the WordPress draft and record the result.

    Raises:
        ValueError: when either quality gate fails; the article status has
            already been set to "error" at that point.
        RuntimeError: when the article cannot be reloaded after saving or
            after reserving a slot.
    """
    article_id = int(article["id"])
    settings = get_settings()

    # ── Quality gate 1: raw content length ──────────────────────────────────
    import re as _re
    # Replace anything that looks like an HTML tag with a space before counting words.
    raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
    raw_words = len(raw_text.split())
    if raw_words < settings.pipeline_min_words_raw:
        note = (
            f"Zu wenig Rohinhalt: {raw_words} Wörter "
            f"(Minimum: {settings.pipeline_min_words_raw})"
        )
        logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
        update_article_status(article_id, "error", actor="pipeline", note=note)
        raise ValueError(note)

    # Rewrite
    logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
    rewritten = rewrite_article_text(article)

    # ── Quality gate 2: rewritten content length ─────────────────────────────
    rewritten_words = len(rewritten.split())
    if rewritten_words < settings.pipeline_min_words_rewritten:
        note = (
            f"Rewrite zu kurz: {rewritten_words} Wörter "
            f"(Minimum: {settings.pipeline_min_words_rewritten})"
        )
        logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
        update_article_status(article_id, "error", actor="pipeline", note=note)
        raise ValueError(note)
    logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
    # Tag generation is best-effort: any failure leaves the tag list empty.
    tags: list[str] = []
    try:
        tags = generate_article_tags(article, rewritten_text=rewritten)
    except Exception:
        pass
    merged_meta = merge_generated_tags(article.get("meta_json"), tags)

    # Save rewritten content + approved status
    # All other fields are passed back unchanged from the loaded article.
    repo_upsert_article(
        ArticleUpsert(
            feed_id=article.get("feed_id"),
            source_article_id=article.get("source_article_id"),
            source_hash=article.get("source_hash"),
            title=article.get("title", ""),
            source_url=article.get("source_url", ""),
            canonical_url=article.get("canonical_url"),
            published_at=article.get("published_at"),
            author=article.get("author"),
            summary=article.get("summary"),
            content_raw=article.get("content_raw"),
            content_rewritten=rewritten,
            image_urls_json=article.get("image_urls_json"),
            press_contact=article.get("press_contact"),
            source_name_snapshot=article.get("source_name_snapshot"),
            source_terms_url_snapshot=article.get("source_terms_url_snapshot"),
            source_license_name_snapshot=article.get("source_license_name_snapshot"),
            # NOTE(review): int() raises if the stored value is None — confirm the column is never NULL.
            legal_checked=bool(int(article.get("legal_checked", 0))),
            legal_checked_at=article.get("legal_checked_at"),
            legal_note=article.get("legal_note"),
            wp_post_id=article.get("wp_post_id"),
            wp_post_url=article.get("wp_post_url"),
            publish_attempts=int(article.get("publish_attempts", 0)),
            publish_last_error=article.get("publish_last_error"),
            published_to_wp_at=article.get("published_to_wp_at"),
            word_count=len(rewritten.split()),
            status="approved",
            meta_json=merged_meta,
        )
    )

    # Reload after save to get updated meta_json
    fresh = get_article_by_id(article_id)
    if not fresh:
        raise RuntimeError(f"Artikel #{article_id} nach Rewrite nicht gefunden")

    # Ensure a publish slot is reserved — reserve one now if not yet set
    if not fresh.get("scheduled_publish_at"):
        # NOTE(review): reserve_publish_slot is also imported at module level; this local import looks redundant.
        from .scheduler import reserve_publish_slot
        logger.info("_do_rewrite_and_draft #%d: kein Slot gesetzt, reserviere jetzt", article_id)
        reserve_publish_slot(article_id)
        fresh = get_article_by_id(article_id)
        if not fresh:
            raise RuntimeError(f"Artikel #{article_id} nach Slot-Reservierung nicht gefunden")

    # Create WP draft
    logger.info("_do_rewrite_and_draft #%d: erstelle/aktualisiere WP Draft (wp_post_id=%s, sched=%s)", article_id, fresh.get("wp_post_id"), fresh.get("scheduled_publish_at"))
    wp_post_id, wp_post_url = publish_article_draft(fresh)
    logger.info("_do_rewrite_and_draft #%d: WP Draft fertig (post_id=%s)", article_id, wp_post_id)

    # Update WP info in DB
    from .repositories import mark_article_publish_result
    # Record the draft without flagging the article as published.
    mark_article_publish_result(
        article_id,
        wp_post_id=wp_post_id,
        wp_post_url=wp_post_url,
        error=None,
        increment_attempts=True,
        set_published_status=False,
    )

    return wp_post_id, wp_post_url
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public pipeline functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_auto_pipeline(trigger: str = "auto") -> dict[str, Any]:
    """Run the full automated pipeline and return stats dict.

    Steps: ingestion, processing of up to 100 articles with status "new",
    an optional summary of rejected articles, and a final summary. Telegram
    notifications are best-effort: a failing notification is logged and never
    aborts the run or discards the result.

    Args:
        trigger: Label for what started the run; passed to the start notification.

    Returns:
        Dict with the counters ``ingested``, ``processed``, ``drafts_created``,
        ``rejected``, ``quality_gate_rejected``, ``no_image``, ``warnings``
        and ``errors``.
    """
    from . import telegram_bot as tg

    def _notify(send: Any, *args: Any) -> None:
        """Call a Telegram notifier, logging (not raising) on failure."""
        try:
            send(*args)
        except Exception as exc:
            logger.warning("Telegram-Benachrichtigung fehlgeschlagen: %s", exc)

    settings = get_settings()
    stats = PipelineStats()

    _notify(tg.notify_pipeline_started, trigger)

    # Step 1: Ingestion
    # A failed ingestion is counted as an error; articles already stored with
    # status "new" are still processed below.
    try:
        ingest_result = run_ingestion()
        stats.ingested = ingest_result.articles_upserted
    except Exception as exc:
        logger.error("Ingestion error: %s", exc)
        _notify(tg.notify_error, f"Ingestion fehlgeschlagen: {exc}")
        stats.errors += 1

    # Step 2: Process new articles
    new_articles = list_articles(limit=100, status_filter="new")

    for article in new_articles:
        article_id = int(article["id"])
        try:
            _process_article(article, stats, settings)
        except Exception as exc:
            logger.error("Fehler bei Artikel #%d: %s", article_id, exc)
            # The title may be stored as None; fall back before slicing.
            title = (article.get("title") or "?")[:50]
            _notify(tg.notify_error, f"Fehler bei Artikel #{article_id} ({title}): {exc}")
            stats.errors += 1
        # Rate limiting between OpenAI calls
        time.sleep(1)

    # Step 3: Send rejected summary if any
    if stats.rejected_articles:
        try:
            tg.notify_rejected_summary(stats.rejected_articles)
        except Exception as exc:
            logger.warning("Telegram rejected summary fehlgeschlagen: %s", exc)

    # Step 4: Summary
    result = {
        "ingested": stats.ingested,
        "processed": stats.processed,
        "drafts_created": stats.drafts_created,
        "rejected": stats.rejected,
        "quality_gate_rejected": stats.quality_gate_rejected,
        "no_image": stats.no_image,
        "warnings": stats.warnings,
        "errors": stats.errors,
    }
    _notify(tg.notify_pipeline_done, result)
    return result
|
||||
|
||||
|
||||
def _process_article(article: dict[str, Any], stats: PipelineStats, settings: Any) -> None:
    """Process a single new article through the pipeline.

    Flow: select an image (articles without one are set to "no_image" and
    skipped), score relevance, then branch on the score:
    below ``settings.pipeline_relevance_warn`` the article is rejected,
    below ``settings.pipeline_relevance_auto`` it is moved to "review" with a
    Telegram warning, otherwise it is rewritten and drafted in WordPress.
    ``stats`` is updated in place.

    Raises:
        Exception: any non-ValueError failure during rewrite/draft is re-raised
            after the article is set to "error" and its slot is released.
    """
    from . import telegram_bot as tg

    article_id = int(article["id"])

    # Auto-select image
    _auto_select_image(article)

    # Reload to get updated image_review
    article = get_article_by_id(article_id) or article

    # Exclude articles without a usable image
    try:
        meta = json.loads(article.get("meta_json") or "{}")
    except Exception:
        meta = {}
    has_image = bool((meta.get("image_review") or {}).get("selected_url"))
    if not has_image:
        update_article_status(
            article_id,
            "no_image",
            actor="pipeline",
            note="Kein Bild vorhanden – Artikel ausgeschlossen",
        )
        stats.no_image += 1
        logger.info("Artikel #%d ausgeschlossen: kein Bild gefunden", article_id)
        # Notification is best-effort; a Telegram failure is ignored here.
        try:
            tg.send_message(
                f"🖼️ <b>Kein Bild</b> – Artikel #{article_id} ausgeschlossen\n"
                f"📰 {(article.get('title') or '')[:80]}"
            )
        except Exception:
            pass
        return

    # Score relevance
    # A scoring failure is treated as score 0, which leads to rejection below
    # (assuming the warn threshold is positive — TODO confirm).
    try:
        relevance = score_article_relevance(article)
    except Exception as exc:
        logger.warning("Relevanz-Scoring für #%d fehlgeschlagen: %s", article_id, exc)
        relevance = {"score": 0, "reason": f"Scoring-Fehler: {exc}", "topics": []}

    score = relevance.get("score", 0)
    reason = relevance.get("reason", "")
    _store_relevance(article_id, relevance)

    stats.processed += 1

    if score < settings.pipeline_relevance_warn:
        # Reject
        update_article_status(
            article_id,
            "error",
            actor="pipeline",
            note=f"Abgelehnt: Score {score}/100 — {reason}",
        )
        stats.rejected += 1
        # Reload for summary (now has relevance in meta)
        updated = get_article_by_id(article_id)
        if updated:
            stats.rejected_articles.append(updated)

    elif score < settings.pipeline_relevance_auto:
        # Warning zone: set status to "review" so repeated /run calls don't re-warn
        update_article_status(
            article_id,
            "review",
            actor="pipeline",
            note=f"Niedrige Relevanz: Score {score}/100 — {reason}",
        )
        stats.warnings += 1
        try:
            tg.notify_relevance_warning(article, score, reason)
        except Exception as exc:
            logger.warning("Telegram warning für #%d fehlgeschlagen: %s", article_id, exc)

    else:
        # Auto-process: rewrite + WP draft
        try:
            # Reserve publish slot FIRST so it's available when WP draft is created
            slot = reserve_publish_slot(article_id)

            # Reload article to get updated image_review + scheduled_publish_at
            fresh = get_article_by_id(article_id)
            if not fresh:
                # NOTE(review): the slot reserved above is not released on this path.
                return
            wp_post_id, wp_post_url = _do_rewrite_and_draft(fresh)
            stats.drafts_created += 1

            # Reload for notification
            final = get_article_by_id(article_id)
            if final:
                try:
                    tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
                except Exception as exc:
                    logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)

        except ValueError as exc:
            # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
            # NOTE(review): any other ValueError raised in the try block lands here as well.
            # Release the reserved slot so it's available for the next article
            from .scheduler import release_publish_slot
            release_publish_slot(article_id)
            # Clean up any stale WP draft from a previous pipeline run
            stale = get_article_by_id(article_id)
            if stale and stale.get("wp_post_id"):
                try:
                    from .wordpress import delete_wp_post
                    delete_wp_post(int(stale["wp_post_id"]))
                    logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
                except Exception as del_exc:
                    logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
            stats.quality_gate_rejected += 1
            logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
            # Individual Telegram notification for quality gate rejection
            try:
                title = (article.get("title") or "Ohne Titel")[:80]
                tg.send_message(
                    f"✂️ <b>Qualitätsprüfung nicht bestanden</b>\n"
                    f"📰 {title}\n"
                    f"💯 Score: {score}/100\n"
                    f"⚠️ {exc}"
                )
            except Exception as tg_exc:
                logger.warning("Telegram QG-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, tg_exc)

        except Exception as exc:
            logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
            update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
            # Release reserved slot so it's not permanently blocked by a failed article
            from .scheduler import release_publish_slot
            release_publish_slot(article_id)
            # Re-raise so the caller can count and report the failure.
            raise
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Callback actions (called from telegram_bot._handle_callback)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def rewrite_and_update_draft(article_id: int) -> None:
    """Re-run image selection and the rewrite/draft step for one article.

    Raises:
        RuntimeError: if no article with ``article_id`` can be loaded.
    """
    current = get_article_by_id(article_id)
    if not current:
        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")

    _auto_select_image(current)
    # Reload after image selection so the rewrite step sees the stored row.
    _do_rewrite_and_draft(get_article_by_id(article_id))
|
||||
|
||||
|
||||
def discard_article(article_id: int) -> None:
    """Discard an article: delete its WordPress post (if any) and mark it as error.

    Does nothing when the article cannot be loaded. Deleting the WordPress
    post is best-effort: a failure is logged and the status change still
    happens.
    """
    article = get_article_by_id(article_id)
    if not article:
        return

    wp_post_id = article.get("wp_post_id")
    if wp_post_id:
        try:
            from .wordpress import delete_wp_post

            delete_wp_post(int(wp_post_id))
        except Exception as exc:
            # %s instead of %d: if the int() cast above is what failed, the
            # stored id is not numeric and %d would make the log call itself
            # fail, hiding the real error.
            logger.warning("WP Post #%s konnte nicht gelöscht werden: %s", wp_post_id, exc)

    update_article_status(article_id, "error", actor="telegram", note="Via Telegram verworfen")
|
||||
|
||||
|
||||
def override_rejected_article(article_id: int) -> None:
    """Force-process a previously rejected article.

    Resets the article to ``new``, re-runs image selection, reserves a publish
    slot, rewrites/drafts the article and finally sends the new-draft
    notification via Telegram.

    Raises:
        RuntimeError: if the article does not exist.
        Exception: whatever the rewrite/draft step raises; the reserved
            publish slot is released before the exception is re-raised.
    """
    from . import telegram_bot as tg

    article = get_article_by_id(article_id)
    if not article:
        raise RuntimeError(f"Artikel #{article_id} nicht gefunden")

    # Reset to new so processing is allowed
    update_article_status(article_id, "new", actor="telegram", note="Manuell übernommen via Telegram")

    # Reload
    fresh = get_article_by_id(article_id)
    if not fresh:
        return

    _auto_select_image(fresh)
    fresh = get_article_by_id(article_id)

    # Reuse the stored relevance score; fall back to 0 when it is missing
    # or the metadata cannot be parsed.
    try:
        meta = json.loads(fresh.get("meta_json") or "{}")
        score = int((meta.get("relevance") or {}).get("score", 0))
    except Exception:
        score = 0

    # Reserve publish slot FIRST so it's in the DB when WP draft is created
    slot = reserve_publish_slot(article_id)
    fresh = get_article_by_id(article_id)

    try:
        _do_rewrite_and_draft(fresh)
    except Exception:
        # Release the reserved slot so a failed override does not block it
        # permanently (same handling as the regular pipeline path).
        from .scheduler import release_publish_slot

        release_publish_slot(article_id)
        raise

    final = get_article_by_id(article_id)
    if final:
        tg.notify_new_draft(final, score=score, suggested_publish_at=slot)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Status helpers (used by /status command)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_recently_rejected(days: int = 3) -> list[dict[str, Any]]:
    """Return up to 20 scored articles in status ``error``/``review`` updated in the last N days.

    Args:
        days: size of the look-back window in days, applied by SQLite as
            ``date('now', '-<days> days')``.

    Returns:
        Rows (id, title, meta_json, source_url, created_at) as dicts, most
        recently updated first. Only articles whose ``meta_json`` carries a
        ``relevance.score`` are included.
    """
    from .db import get_conn, rows_to_dicts

    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, meta_json, source_url, created_at
            FROM articles
            WHERE status IN ('error', 'review')
              AND json_extract(meta_json, '$.relevance.score') IS NOT NULL
              AND date(updated_at) >= date('now', ?)
            ORDER BY updated_at DESC
            LIMIT 20
            """,
            (f"-{days} days",),
        ).fetchall()
        return rows_to_dicts(rows)
|
||||
|
||||
|
||||
def get_pipeline_status_text() -> str:
    """Return an HTML-formatted text summary of article counts per status.

    Counts are taken from the total reported by ``list_articles_page`` rather
    than from the length of a row list fetched with a limit, so they are not
    capped and no full article rows are loaded just to be counted.
    """
    from .repositories import list_articles_page

    def _count(status: str) -> int:
        # limit=1 keeps the row fetch minimal; only the total is used.
        _rows, total = list_articles_page(limit=1, status_filter=status)
        return int(total)

    new_count = _count("new")
    approved_count = _count("approved")
    published_count = _count("published")
    error_count = _count("error")

    return (
        f"📊 <b>Pipeline-Status</b>\n"
        f"🆕 Neu / wartend: {new_count}\n"
        f"✅ Draft / freigegeben: {approved_count}\n"
        f"📢 Veröffentlicht: {published_count}\n"
        f"🚫 Fehler / abgelehnt: {error_count}"
    )
|
||||
35
backend/app/policy.py
Normal file
35
backend/app/policy.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
    """Collect every policy violation of a source record.

    Args:
        source: source row as a dict, or ``None``/empty when the article has
            no source attached.

    Returns:
        Human-readable issue strings in check order; an empty list means the
        source passes all checks.
    """
    if not source:
        return ["Keine Quelle zugeordnet"]

    def _text(key: str) -> str:
        # Missing and NULL values are treated like an empty string.
        return (source.get(key) or "").strip()

    problems: list[str] = []

    risk = _text("risk_level").lower()
    if risk != "green":
        problems.append(f"Quelle nicht freigegeben (risk_level={risk or 'unset'})")

    # These fields only need to be present and non-blank.
    for required in ("terms_url", "license_name", "last_reviewed_at"):
        if not _text(required):
            problems.append(f"{required} fehlt")

    if int(source.get("is_enabled", 0) or 0) != 1:
        problems.append("Quelle ist deaktiviert")

    return problems
|
||||
|
||||
|
||||
def is_source_allowed(source: dict[str, Any] | None) -> bool:
    """Return True when ``evaluate_source_policy`` reports no issues for the source."""
    return not evaluate_source_policy(source)
|
||||
101
backend/app/publisher.py
Normal file
101
backend/app/publisher.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .repositories import (
|
||||
claim_next_publish_job,
|
||||
complete_publish_job,
|
||||
create_publish_job,
|
||||
fail_publish_job,
|
||||
get_article_by_id,
|
||||
mark_article_publish_result,
|
||||
PublishJobCreate,
|
||||
)
|
||||
from .wordpress import publish_article_draft, selected_image_exists
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PublisherStats:
    """Immutable counters returned by one ``run_publisher`` invocation."""

    # Jobs claimed during the run.
    processed: int
    # Jobs whose article was published successfully.
    success: int
    # Jobs that ended in the failed state.
    failed: int
    # Jobs that raised during publishing and were queued again.
    requeued: int
|
||||
|
||||
|
||||
def enqueue_publish(article_id: int, max_attempts: int = 3) -> int:
    """Create a publish job for the article and return the job id."""
    job_request = PublishJobCreate(article_id=article_id, max_attempts=max_attempts)
    return create_publish_job(job_request)
|
||||
|
||||
|
||||
def _can_publish(article: dict) -> tuple[bool, str | None]:
|
||||
if article.get("status") not in {"approved", "published"}:
|
||||
return False, "Artikelstatus muss 'publish' sein"
|
||||
if not selected_image_exists(article):
|
||||
return False, "Hauptbild nicht gesetzt"
|
||||
return True, None
|
||||
|
||||
|
||||
def run_publisher(max_jobs: int = 10) -> PublisherStats:
    """Claim and process queued publish jobs, at most ``max(1, max_jobs)`` of them.

    For each claimed job the article is loaded, checked with ``_can_publish``
    and handed to ``publish_article_draft``. The outcome is written to both
    the job and the article. A job that raises during publishing is queued
    again while its attempt counter is below ``max_attempts``.

    Returns:
        Counters for processed, successful, failed and requeued jobs.
    """
    n_processed = n_success = n_failed = n_requeued = 0
    budget = max(1, max_jobs)

    while n_processed < budget:
        job = claim_next_publish_job()
        if not job:
            break
        n_processed += 1

        job_id = int(job["id"])
        article_id = int(job["article_id"])

        article = get_article_by_id(article_id)
        if not article:
            fail_publish_job(job_id, "Artikel nicht gefunden", requeue=False)
            n_failed += 1
            continue

        allowed, reason = _can_publish(article)
        if not allowed:
            # Precondition failures are final: the job is not queued again.
            fail_publish_job(job_id, reason or "Publish-Bedingungen nicht erfüllt", requeue=False)
            mark_article_publish_result(
                article_id,
                wp_post_id=article.get("wp_post_id"),
                wp_post_url=article.get("wp_post_url"),
                error=reason or "blocked",
                increment_attempts=True,
                set_published_status=False,
            )
            n_failed += 1
            continue

        try:
            wp_post_id, wp_post_url = publish_article_draft(article)
            complete_publish_job(job_id, wp_post_id=wp_post_id, wp_post_url=wp_post_url)
            mark_article_publish_result(
                article_id,
                wp_post_id=wp_post_id,
                wp_post_url=wp_post_url,
                error=None,
                increment_attempts=True,
                set_published_status=True,
            )
            n_success += 1
        except Exception as exc:
            retry = int(job.get("attempts", 1)) < int(job.get("max_attempts", 3))
            fail_publish_job(job_id, str(exc), requeue=retry)
            mark_article_publish_result(
                article_id,
                wp_post_id=article.get("wp_post_id"),
                wp_post_url=article.get("wp_post_url"),
                error=str(exc),
                increment_attempts=True,
                set_published_status=False,
            )
            if retry:
                n_requeued += 1
            else:
                n_failed += 1

    return PublisherStats(
        processed=n_processed,
        success=n_success,
        failed=n_failed,
        requeued=n_requeued,
    )
|
||||
44
backend/app/relevance.py
Normal file
44
backend/app/relevance.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def _parse_iso_datetime(value: str | None) -> datetime | None:
|
||||
if not value:
|
||||
return None
|
||||
raw = value.strip()
|
||||
if not raw:
|
||||
return None
|
||||
if raw.endswith("Z"):
|
||||
raw = raw[:-1] + "+00:00"
|
||||
try:
|
||||
parsed = datetime.fromisoformat(raw)
|
||||
except ValueError:
|
||||
return None
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=timezone.utc)
|
||||
return parsed
|
||||
|
||||
|
||||
def article_age_days(published_at: str | None, now: datetime | None = None) -> int | None:
    """Return the article age in whole days, or ``None`` if the date is unusable.

    Args:
        published_at: ISO-8601 publication timestamp.
        now: reference time; defaults to the current UTC time.

    Returns:
        Whole days between publication and ``now``; 0 when the publication
        date lies in the future.
    """
    published = _parse_iso_datetime(published_at)
    if not published:
        return None

    reference = now or datetime.now(timezone.utc)
    elapsed = reference - published
    # Future-dated articles count as brand new.
    return 0 if elapsed.total_seconds() < 0 else elapsed.days
|
||||
|
||||
|
||||
def article_relevance(published_at: str | None, now: datetime | None = None) -> str:
    """Map the article age to a relevance label.

    Returns ``"unbekannt"`` when the age cannot be determined, otherwise
    ``"hoch"`` (up to 2 days), ``"mittel"`` (up to 7), ``"niedrig"`` (up to
    30) or ``"alt"``.
    """
    age = article_age_days(published_at, now=now)
    if age is None:
        return "unbekannt"

    # Inclusive upper bounds in days, checked in ascending order.
    for max_days, label in ((2, "hoch"), (7, "mittel"), (30, "niedrig")):
        if age <= max_days:
            return label
    return "alt"
|
||||
855
backend/app/repositories.py
Normal file
855
backend/app/repositories.py
Normal file
|
|
@ -0,0 +1,855 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from .db import get_conn, rows_to_dicts
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceCreate:
    """Immutable payload holding the column values for a new ``sources`` row."""

    name: str
    base_url: str | None
    terms_url: str | None
    license_name: str | None
    risk_level: str
    # Stored as 1/0 in the database.
    is_enabled: bool
    notes: str | None
    last_reviewed_at: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FeedCreate:
    """Immutable payload holding the column values for a new ``feeds`` row."""

    name: str
    url: str
    # Id of the owning source; may be None.
    source_id: int | None
    # Stored as 1/0 in the database.
    is_enabled: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceUpdate:
    """Immutable payload with the full set of editable ``sources`` columns."""

    name: str
    base_url: str | None
    terms_url: str | None
    license_name: str | None
    risk_level: str
    # Stored as 1/0 in the database.
    is_enabled: bool
    notes: str | None
    last_reviewed_at: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FeedUpdate:
    """Immutable payload with the full set of editable ``feeds`` columns."""

    name: str
    url: str
    # Id of the owning source; may be None.
    source_id: int | None
    # Stored as 1/0 in the database.
    is_enabled: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RunCreate:
    """Immutable payload holding the column values for a new ``runs`` row."""

    run_type: str
    status: str
    # Optional free-text details stored with the run.
    details: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ArticleUpsert:
    """Immutable payload with every ``articles`` column written by ``upsert_article``."""

    # Identity of the article at its source.
    feed_id: int | None
    source_article_id: str | None
    source_hash: str | None

    # Content and metadata.
    title: str
    source_url: str
    canonical_url: str | None
    published_at: str | None
    author: str | None
    summary: str | None
    content_raw: str | None
    content_rewritten: str | None
    image_urls_json: str | None
    press_contact: str | None

    # Snapshot of the source's attributes.
    source_name_snapshot: str | None
    source_terms_url_snapshot: str | None
    source_license_name_snapshot: str | None

    # Legal review state.
    legal_checked: bool
    legal_checked_at: str | None
    legal_note: str | None

    # WordPress publishing state.
    wp_post_id: int | None
    wp_post_url: str | None
    publish_attempts: int
    publish_last_error: str | None
    published_to_wp_at: str | None

    word_count: int
    status: str
    # JSON-encoded metadata document.
    meta_json: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PublishJobCreate:
    """Immutable payload for creating a publish job."""

    article_id: int
    # Upper bound for publish attempts of this job.
    max_attempts: int = 3
|
||||
|
||||
|
||||
def create_source(payload: SourceCreate) -> int:
    """Insert a new source row and return its id. The name is stored stripped."""
    insert_sql = """
            INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """
    with get_conn() as db:
        values = (
            payload.name.strip(),
            payload.base_url,
            payload.terms_url,
            payload.license_name,
            payload.risk_level,
            1 if payload.is_enabled else 0,
            payload.notes,
            payload.last_reviewed_at,
        )
        cursor = db.execute(insert_sql, values)
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def list_sources() -> list[dict[str, Any]]:
    """Return all sources as dicts, newest id first."""
    query = """
            SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
            FROM sources
            ORDER BY id DESC
            """
    with get_conn() as db:
        found = db.execute(query).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_source_by_id(source_id: int) -> dict[str, Any] | None:
    """Return the source with the given id as a dict, or ``None`` if absent."""
    query = """
            SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
            FROM sources
            WHERE id = ?
            """
    with get_conn() as db:
        record = db.execute(query, (source_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def update_source(source_id: int, payload: SourceUpdate) -> bool:
    """Overwrite the editable columns of a source.

    Returns:
        True when the UPDATE affected at least one row.
    """
    update_sql = """
            UPDATE sources
            SET name = ?, base_url = ?, terms_url = ?, license_name = ?, risk_level = ?, is_enabled = ?, notes = ?, last_reviewed_at = ?
            WHERE id = ?
            """
    with get_conn() as db:
        values = (
            payload.name.strip(),
            payload.base_url,
            payload.terms_url,
            payload.license_name,
            payload.risk_level,
            1 if payload.is_enabled else 0,
            payload.notes,
            payload.last_reviewed_at,
            source_id,
        )
        cursor = db.execute(update_sql, values)
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def delete_source(source_id: int) -> bool:
    """Delete a source by id; return True when a row was removed."""
    with get_conn() as db:
        cursor = db.execute("DELETE FROM sources WHERE id = ?", (source_id,))
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def create_feed(payload: FeedCreate) -> int:
    """Insert a new feed row and return its id. Name and URL are stored stripped."""
    with get_conn() as db:
        values = (
            payload.name.strip(),
            payload.url.strip(),
            payload.source_id,
            1 if payload.is_enabled else 0,
        )
        cursor = db.execute(
            "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)",
            values,
        )
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def list_feeds() -> list[dict[str, Any]]:
    """Return all feeds joined with their source's attributes, newest id first.

    Source columns are exposed with a ``source_`` prefix and are NULL for
    feeds without a matching source (LEFT JOIN).
    """
    query = """
            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
                   f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
                   s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
            FROM feeds f
            LEFT JOIN sources s ON s.id = f.source_id
            ORDER BY f.id DESC
            """
    with get_conn() as db:
        found = db.execute(query).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def list_enabled_feeds() -> list[dict[str, Any]]:
    """Return feeds with ``is_enabled = 1`` joined with source attributes, oldest id first.

    Only the feed's own flag is filtered; the source's flag is returned as
    ``source_is_enabled`` for the caller to evaluate.
    """
    query = """
            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
                   s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
                   s.risk_level AS source_risk_level, s.base_url AS source_base_url,
                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
            FROM feeds f
            LEFT JOIN sources s ON s.id = f.source_id
            WHERE f.is_enabled = 1
            ORDER BY f.id ASC
            """
    with get_conn() as db:
        found = db.execute(query).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
    """Return one feed joined with its source attributes, or ``None`` if absent."""
    query = """
            SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
                   s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
                   s.risk_level AS source_risk_level, s.base_url AS source_base_url,
                   s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
            FROM feeds f
            LEFT JOIN sources s ON s.id = f.source_id
            WHERE f.id = ?
            """
    with get_conn() as db:
        record = db.execute(query, (feed_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def update_feed(feed_id: int, payload: FeedUpdate) -> bool:
    """Overwrite name, URL, source and enabled flag of a feed.

    Returns:
        True when the UPDATE affected at least one row.
    """
    update_sql = """
            UPDATE feeds
            SET name = ?, url = ?, source_id = ?, is_enabled = ?
            WHERE id = ?
            """
    with get_conn() as db:
        values = (
            payload.name.strip(),
            payload.url.strip(),
            payload.source_id,
            1 if payload.is_enabled else 0,
            feed_id,
        )
        cursor = db.execute(update_sql, values)
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def delete_feed(feed_id: int) -> bool:
    """Delete a feed by id; return True when a row was removed."""
    with get_conn() as db:
        cursor = db.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
    """Store the given ETag / Last-Modified values and stamp ``last_checked_at`` with now."""
    update_sql = """
            UPDATE feeds
            SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
            WHERE id = ?
            """
    with get_conn() as db:
        db.execute(update_sql, (etag, last_modified, feed_id))
|
||||
|
||||
|
||||
def create_run(payload: RunCreate) -> int:
    """Insert a new run row and return its id."""
    with get_conn() as db:
        values = (payload.run_type, payload.status, payload.details)
        cursor = db.execute(
            "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)",
            values,
        )
        return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def finish_run(run_id: int, status: str, details: str | None = None) -> None:
    """Set the final status and details of a run and stamp ``finished_at`` with now."""
    update_sql = """
            UPDATE runs
            SET status = ?, details = ?, finished_at = datetime('now')
            WHERE id = ?
            """
    with get_conn() as db:
        db.execute(update_sql, (status, details, run_id))
|
||||
|
||||
|
||||
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs, newest first.

    Args:
        limit: maximum number of rows; clamped to the range 1..500.
    """
    row_cap = max(1, min(limit, 500))
    query = """
            SELECT id, run_type, status, started_at, finished_at, details
            FROM runs
            ORDER BY id DESC
            LIMIT ?
            """
    with get_conn() as db:
        found = db.execute(query, (row_cap,)).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def get_run_by_id(run_id: int) -> dict[str, Any] | None:
    """Return the run with the given id as a dict, or ``None`` if absent."""
    query = """
            SELECT id, run_type, status, started_at, finished_at, details
            FROM runs
            WHERE id = ?
            """
    with get_conn() as db:
        record = db.execute(query, (run_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def get_article_by_id(article_id: int) -> dict[str, Any] | None:
    """Return the article row for the given id as a dict, or ``None`` if absent."""
    query = """
            SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
                   a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact,
                   a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot,
                   a.legal_checked, a.legal_checked_at, a.legal_note,
                   a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at,
                   a.word_count, a.status, a.meta_json, a.created_at, a.updated_at,
                   a.scheduled_publish_at
            FROM articles a
            WHERE a.id = ?
            """
    with get_conn() as db:
        record = db.execute(query, (article_id,)).fetchone()
        if not record:
            return None
        return dict(record)
|
||||
|
||||
|
||||
def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
|
||||
meta: dict[str, Any] = {}
|
||||
if meta_json:
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
if not isinstance(meta, dict):
|
||||
meta = {}
|
||||
except Exception:
|
||||
meta = {}
|
||||
|
||||
events = meta.get("review_events")
|
||||
if not isinstance(events, list):
|
||||
events = []
|
||||
events.append(event)
|
||||
meta["review_events"] = events
|
||||
return json.dumps(meta, ensure_ascii=False)
|
||||
|
||||
|
||||
def _load_meta(meta_json: str | None) -> dict[str, Any]:
|
||||
if not meta_json:
|
||||
return {}
|
||||
try:
|
||||
parsed = json.loads(meta_json)
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def update_article_status(
    article_id: int,
    new_status: str,
    *,
    actor: str | None = None,
    note: str | None = None,
    decision: str | None = None,
) -> bool:
    """Set an article's status and record the transition in its metadata.

    The transition (timestamp, old/new status, actor, note, decision) is
    appended to ``review_events`` inside ``meta_json``.

    Returns:
        False when the article does not exist, True otherwise.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    updated_meta = _merge_review_event(
        current.get("meta_json"),
        {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "from_status": current.get("status"),
            "to_status": new_status,
            "actor": actor or "system",
            "note": note,
            "decision": decision,
        },
    )

    with get_conn() as db:
        db.execute(
            "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?",
            (new_status, updated_meta, article_id),
        )
    return True
|
||||
|
||||
|
||||
def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool:
    """Store the outcome of a legal review on an article.

    Updates ``legal_checked``, ``legal_checked_at`` and ``legal_note`` and
    appends a ``legal_review`` event to the article's metadata.

    Returns:
        False when the article does not exist, True otherwise.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    updated_meta = _merge_review_event(
        current.get("meta_json"),
        {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "event": "legal_review",
            "approved": approved,
            "actor": actor or "system",
            "note": note,
        },
    )
    update_sql = """
            UPDATE articles
            SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ?
            WHERE id = ?
            """
    with get_conn() as db:
        db.execute(update_sql, (1 if approved else 0, note, updated_meta, article_id))
    return True
|
||||
|
||||
|
||||
def set_article_image_decision(article_id: int, image_url: str, action: str, actor: str | None = None) -> bool:
    """Record an image review decision in the article's ``image_review`` metadata.

    Args:
        article_id: id of the article to update.
        image_url: URL the decision applies to (surrounding whitespace ignored).
        action: ``"select"`` makes the URL the selected image and un-excludes
            it, ``"exclude"`` adds it to the excluded list (clearing the
            selection if it pointed to this URL), ``"restore"`` removes it
            from the excluded list.
        actor: who made the decision; stored as ``"system"`` when omitted.

    Returns:
        False when the article is missing, the URL is blank or the action is
        unknown; True after the metadata has been written.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    target = (image_url or "").strip()
    if not target or action not in {"select", "exclude", "restore"}:
        return False

    meta = _load_meta(current.get("meta_json"))
    review = meta.get("image_review")
    if not isinstance(review, dict):
        review = {}

    # Normalise stored state; malformed values are treated as "nothing stored".
    stored_excluded = review.get("excluded_urls")
    if isinstance(stored_excluded, list):
        excluded = {str(entry) for entry in stored_excluded if entry}
    else:
        excluded = set()

    chosen = review.get("selected_url")
    if not isinstance(chosen, str):
        chosen = None

    if action == "exclude":
        excluded.add(target)
        if chosen == target:
            chosen = None
    else:
        # Both "select" and "restore" lift an exclusion.
        excluded.discard(target)
        if action == "select":
            chosen = target

    review["selected_url"] = chosen
    review["excluded_urls"] = sorted(excluded)
    review["updated_at"] = datetime.now(timezone.utc).isoformat()
    review["updated_by"] = actor or "system"
    meta["image_review"] = review

    with get_conn() as db:
        db.execute(
            "UPDATE articles SET meta_json = ? WHERE id = ?",
            (json.dumps(meta, ensure_ascii=False), article_id),
        )
    return True
|
||||
|
||||
|
||||
def create_publish_job(payload: PublishJobCreate) -> int:
    """Queue a publish job for an article and return the job id.

    If the article already has a job in status ``queued`` or ``running``, the
    id of the newest such job is returned and no new job is created.
    ``max_attempts`` is stored with a minimum of 1.
    """
    pending_sql = """
            SELECT id FROM publish_jobs
            WHERE article_id = ? AND status IN ('queued', 'running')
            ORDER BY id DESC
            LIMIT 1
            """
    insert_sql = """
            INSERT INTO publish_jobs (article_id, status, attempts, max_attempts)
            VALUES (?, 'queued', 0, ?)
            """
    with get_conn() as db:
        pending = db.execute(pending_sql, (payload.article_id,)).fetchone()
        if pending:
            return int(pending["id"])

        inserted = db.execute(insert_sql, (payload.article_id, max(1, payload.max_attempts)))
        return int(inserted.lastrowid)
|
||||
|
||||
|
||||
def list_publish_jobs(limit: int = 100) -> list[dict[str, Any]]:
    """Return the most recent publish jobs with their article title, newest first.

    Args:
        limit: maximum number of rows; clamped to the range 1..500.
    """
    row_cap = max(1, min(limit, 500))
    query = """
            SELECT j.id, j.article_id, j.status, j.attempts, j.max_attempts, j.error_message, j.wp_post_id, j.wp_post_url,
                   j.created_at, j.started_at, j.finished_at, a.title AS article_title
            FROM publish_jobs j
            LEFT JOIN articles a ON a.id = j.article_id
            ORDER BY j.id DESC
            LIMIT ?
            """
    with get_conn() as db:
        found = db.execute(query, (row_cap,)).fetchall()
        return rows_to_dicts(found)
|
||||
|
||||
|
||||
def claim_next_publish_job() -> dict[str, Any] | None:
    """Claim the oldest queued publish job that still has attempts left.

    The job is switched to ``running``, its attempt counter is incremented and
    ``started_at`` is stamped. The returned dict reflects the row after that
    update.

    Returns:
        The claimed job, or ``None`` when no eligible job is queued.
    """
    # NOTE(review): the candidate is selected and then updated in two separate
    # statements; whether concurrent workers are excluded depends on the
    # transaction handling of get_conn() — confirm before running several.
    candidate_sql = """
            SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
            FROM publish_jobs
            WHERE status = 'queued' AND attempts < max_attempts
            ORDER BY id ASC
            LIMIT 1
            """
    claim_sql = """
            UPDATE publish_jobs
            SET status = 'running',
                attempts = attempts + 1,
                started_at = datetime('now'),
                finished_at = NULL
            WHERE id = ?
            """
    reload_sql = """
            SELECT id, article_id, status, attempts, max_attempts, error_message, wp_post_id, wp_post_url
            FROM publish_jobs
            WHERE id = ?
            """
    with get_conn() as db:
        candidate = db.execute(candidate_sql).fetchone()
        if not candidate:
            return None

        job_id = int(candidate["id"])
        db.execute(claim_sql, (job_id,))
        # Re-read so the caller sees the incremented attempt counter.
        refreshed = db.execute(reload_sql, (job_id,)).fetchone()
        if not refreshed:
            return None
        return dict(refreshed)
|
||||
|
||||
|
||||
def complete_publish_job(job_id: int, wp_post_id: int | None, wp_post_url: str | None) -> None:
    """Mark a publish job as ``success``, store the WordPress post id/URL and clear its error."""
    update_sql = """
            UPDATE publish_jobs
            SET status = 'success',
                wp_post_id = ?,
                wp_post_url = ?,
                error_message = NULL,
                finished_at = datetime('now')
            WHERE id = ?
            """
    with get_conn() as db:
        db.execute(update_sql, (wp_post_id, wp_post_url, job_id))
|
||||
|
||||
|
||||
def fail_publish_job(job_id: int, error_message: str, requeue: bool) -> None:
    """Record a failed publish attempt on a job.

    Args:
        job_id: id of the job to update.
        error_message: failure description; stored truncated to 2000 characters.
        requeue: when True the job goes back to ``queued``, otherwise it
            becomes ``failed``.
    """
    update_sql = """
            UPDATE publish_jobs
            SET status = ?,
                error_message = ?,
                finished_at = datetime('now')
            WHERE id = ?
            """
    resulting_status = "queued" if requeue else "failed"
    with get_conn() as db:
        db.execute(update_sql, (resulting_status, error_message[:2000], job_id))
|
||||
|
||||
|
||||
def mark_article_publish_result(
    article_id: int,
    *,
    wp_post_id: int | None,
    wp_post_url: str | None,
    error: str | None,
    increment_attempts: bool,
    set_published_status: bool,
) -> None:
    """Write the outcome of a publish attempt onto the article row.

    Args:
        article_id: id of the article to update.
        wp_post_id: WordPress post id to store; whenever it is not None,
            ``published_to_wp_at`` is also set to the current time.
        wp_post_url: WordPress post URL to store.
        error: failure description (stored truncated to 2000 characters), or
            None/empty to clear ``publish_last_error``.
        increment_attempts: when True, ``publish_attempts`` is increased by one.
        set_published_status: when True, the status becomes ``published``;
            otherwise the status is left unchanged.
    """
    update_sql = """
            UPDATE articles
            SET wp_post_id = ?,
                wp_post_url = ?,
                publish_attempts = CASE WHEN ? THEN publish_attempts + 1 ELSE publish_attempts END,
                publish_last_error = ?,
                published_to_wp_at = CASE WHEN ? IS NOT NULL THEN datetime('now') ELSE published_to_wp_at END,
                status = CASE WHEN ? THEN 'published' ELSE status END
            WHERE id = ?
            """
    stored_error = error[:2000] if error else None
    bump_flag = 1 if increment_attempts else 0
    published_flag = 1 if set_published_status else 0

    with get_conn() as db:
        db.execute(
            update_sql,
            (
                wp_post_id,
                wp_post_url,
                bump_flag,
                stored_error,
                wp_post_id,  # second use: drives the published_to_wp_at CASE
                published_flag,
                article_id,
            ),
        )
|
||||
|
||||
|
||||
def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
    """Find the id of an already stored article that matches the payload.

    Lookups are tried in this order and the first hit wins:
      1. exact ``source_url`` (stripped),
      2. ``feed_id`` + ``source_article_id`` (only when both are set),
      3. ``source_hash`` (only when set).

    Returns:
        The matching article id, or ``None`` when nothing matches.
    """
    lookups: list[tuple[str, tuple[Any, ...]]] = [
        ("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)),
    ]
    if payload.feed_id is not None and payload.source_article_id:
        lookups.append(
            (
                "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
                (payload.feed_id, payload.source_article_id),
            )
        )
    if payload.source_hash:
        lookups.append(("SELECT id FROM articles WHERE source_hash = ?", (payload.source_hash,)))

    with get_conn() as db:
        for sql, params in lookups:
            hit = db.execute(sql, params).fetchone()
            if hit:
                return int(hit["id"])
    return None
|
||||
|
||||
|
||||
def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None:
    """Return the stored article that an upsert of ``payload`` would update, or ``None``."""
    matched_id = _resolve_existing_article_id(payload)
    return None if matched_id is None else get_article_by_id(matched_id)
|
||||
|
||||
|
||||
def upsert_article(payload: ArticleUpsert) -> int:
    """Insert the article, or overwrite the matching stored one, and return its id.

    The existing row is located with ``_resolve_existing_article_id``. On
    update every column carried by the payload is overwritten. Title and
    source URL are stored stripped.

    Returns:
        The id of the row whose ``source_url`` equals the payload's; if that
        lookup finds nothing, the id of the updated row, or 0.
    """
    insert_sql = """
                INSERT INTO articles (
                    feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
                    summary, content_raw, content_rewritten, image_urls_json, press_contact,
                    source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
                    legal_checked, legal_checked_at, legal_note,
                    wp_post_id, wp_post_url, publish_attempts, publish_last_error, published_to_wp_at,
                    word_count, status, meta_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """
    update_sql = """
                UPDATE articles
                SET
                    feed_id = ?,
                    source_article_id = ?,
                    source_hash = ?,
                    title = ?,
                    source_url = ?,
                    canonical_url = ?,
                    published_at = ?,
                    author = ?,
                    summary = ?,
                    content_raw = ?,
                    content_rewritten = ?,
                    image_urls_json = ?,
                    press_contact = ?,
                    source_name_snapshot = ?,
                    source_terms_url_snapshot = ?,
                    source_license_name_snapshot = ?,
                    legal_checked = ?,
                    legal_checked_at = ?,
                    legal_note = ?,
                    wp_post_id = ?,
                    wp_post_url = ?,
                    publish_attempts = ?,
                    publish_last_error = ?,
                    published_to_wp_at = ?,
                    word_count = ?,
                    status = ?,
                    meta_json = ?
                WHERE id = ?
                """

    matched_id = _resolve_existing_article_id(payload)
    with get_conn() as db:
        # Same column order for INSERT and UPDATE, so one tuple serves both.
        column_values = (
            payload.feed_id,
            payload.source_article_id,
            payload.source_hash,
            payload.title.strip(),
            payload.source_url.strip(),
            payload.canonical_url,
            payload.published_at,
            payload.author,
            payload.summary,
            payload.content_raw,
            payload.content_rewritten,
            payload.image_urls_json,
            payload.press_contact,
            payload.source_name_snapshot,
            payload.source_terms_url_snapshot,
            payload.source_license_name_snapshot,
            1 if payload.legal_checked else 0,
            payload.legal_checked_at,
            payload.legal_note,
            payload.wp_post_id,
            payload.wp_post_url,
            payload.publish_attempts,
            payload.publish_last_error,
            payload.published_to_wp_at,
            payload.word_count,
            payload.status,
            payload.meta_json,
        )
        if matched_id is None:
            db.execute(insert_sql, column_values)
        else:
            db.execute(update_sql, column_values + (matched_id,))

        # Resolve the final id via source_url, which both branches just wrote.
        stored = db.execute(
            "SELECT id FROM articles WHERE source_url = ?",
            (payload.source_url.strip(),),
        ).fetchone()
        if stored:
            return int(stored["id"])
        return int(matched_id) if matched_id else 0
|
||||
|
||||
|
||||
def list_articles_page(
    limit: int = 50,
    offset: int = 0,
    status_filter: str | None = None,
    search: str | None = None,
) -> tuple[list[dict[str, Any]], int]:
    """Return one page of articles plus the total number of matching rows.

    ``limit`` is clamped to 1..200 and ``offset`` to >= 0. ``status_filter``
    restricts to an exact status; ``search`` matches a title substring or,
    when it parses as an integer, the article id.
    """
    page_size = min(max(limit, 1), 200)
    skip = max(offset, 0)

    clauses: list[str] = []
    bind: list[Any] = []

    if status_filter:
        clauses.append("a.status = ?")
        bind.append(status_filter)

    if search:
        # A non-numeric search term can never equal an id, so bind -1 instead.
        try:
            id_candidate = int(search)
        except ValueError:
            id_candidate = -1
        clauses.append("(a.title LIKE ? OR a.id = ?)")
        bind.append(f"%{search}%")
        bind.append(id_candidate)

    where = ""
    if clauses:
        where = f"WHERE {' AND '.join(clauses)}"

    select = """
        SELECT a.id, a.title, a.status, a.published_at, a.summary, a.content_raw,
               a.meta_json, a.wp_post_id, a.wp_post_url, a.scheduled_publish_at,
               a.word_count, f.name AS feed_name
        FROM articles a
        LEFT JOIN feeds f ON f.id = a.feed_id
    """
    count_sql = f"SELECT COUNT(*) FROM articles a {where}"
    page_sql = f"{select} {where} ORDER BY a.id DESC LIMIT ? OFFSET ?"

    with get_conn() as conn:
        total = conn.execute(count_sql, bind).fetchone()[0]
        rows = conn.execute(page_sql, bind + [page_size, skip]).fetchall()
    return rows_to_dicts(rows), total
|
||||
|
||||
|
||||
def bulk_update_wp_post_ids(updates: list[tuple[int, int | None]]) -> int:
    """Update wp_post_id (and clear stale wp_post_url) for multiple articles.

    Args:
        updates: ``(article_id, new_wp_post_id)`` pairs; the new id may be None.

    Returns:
        The number of rows actually updated. Pairs whose article id matches
        no row do not count.

    Call sync_db_from_wordpress() afterwards to repopulate wp_post_url and
    scheduled_publish_at from the live WordPress data.
    """
    if not updates:
        return 0
    updated = 0
    with get_conn() as conn:
        for article_id, new_wp_id in updates:
            cursor = conn.execute(
                "UPDATE articles SET wp_post_id = ?, wp_post_url = NULL WHERE id = ?",
                (new_wp_id, article_id),
            )
            # rowcount is 0 when the id does not exist; only count real changes.
            if cursor.rowcount > 0:
                updated += cursor.rowcount
    return updated
|
||||
|
||||
|
||||
def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
    """Return the newest articles (highest id first) joined with their feed name.

    Args:
        limit: Maximum number of rows; clamped to 1..500.
        status_filter: When truthy, only articles with exactly this status.

    Returns:
        One dict per article row, as produced by ``rows_to_dicts``.
    """
    safe_limit = max(1, min(limit, 500))

    # One query for both cases: only the WHERE clause is optional. The clause
    # is a fixed fragment; the status value itself is always bound as a parameter.
    where = "WHERE a.status = ?" if status_filter else ""
    params: list[Any] = [status_filter] if status_filter else []
    params.append(safe_limit)

    query = f"""
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
               a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
               a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
               a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note,
               a.wp_post_id, a.wp_post_url, a.publish_attempts, a.publish_last_error, a.published_to_wp_at
        FROM articles a
        LEFT JOIN feeds f ON f.id = a.feed_id
        {where}
        ORDER BY a.id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        rows = conn.execute(query, params).fetchall()
    return rows_to_dicts(rows)
|
||||
204
backend/app/rewrite.py
Normal file
204
backend/app/rewrite.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _sanitize_source_text(text: str) -> str:
|
||||
raw = (text or "").strip()
|
||||
if not raw:
|
||||
return ""
|
||||
|
||||
lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
|
||||
if len(lines) > 3:
|
||||
lines = lines[3:]
|
||||
|
||||
joined = "\n".join(lines)
|
||||
# Remove press contact block at end from "Pressekontakt" onward.
|
||||
joined = re.sub(
|
||||
r"\n?\s*Pressekontakt[\s\S]*$",
|
||||
"",
|
||||
joined,
|
||||
flags=re.IGNORECASE,
|
||||
).strip()
|
||||
return joined
|
||||
|
||||
|
||||
def _normalize_tags(tags: list[str], max_tags: int = 8) -> list[str]:
|
||||
out: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for raw in tags:
|
||||
value = re.sub(r"\s+", " ", str(raw or "").strip())
|
||||
value = re.sub(r"^[#\-•\s]+", "", value)
|
||||
value = re.sub(r"[;,.:\s]+$", "", value)
|
||||
if not value:
|
||||
continue
|
||||
if len(value) < 2 or len(value) > 40:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(value)
|
||||
if len(out) >= max_tags:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
def _openai_chat(system: str, user: str, temperature: float = 0.4) -> str:
    """Send one system+user message pair to the OpenAI chat completions API.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        temperature: Sampling temperature forwarded in the request payload.

    Returns:
        The stripped text content of the first returned choice.

    Raises:
        RuntimeError: If no API key is configured, or the decoded response
            does not contain a usable first choice with non-empty content.
        Errors raised by ``urlopen`` and ``json.loads`` propagate unchanged.
    """
    settings = get_settings()
    api_key = settings.openai_api_key
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY fehlt")

    payload = {
        "model": settings.openai_model,
        "temperature": temperature,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    }
    req = Request(
        url="https://api.openai.com/v1/chat/completions",
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
    )
    with urlopen(req, timeout=60) as resp:
        raw = resp.read().decode("utf-8", errors="replace")
    data = json.loads(raw)

    # Validate the shape step by step so a malformed body surfaces as the
    # RuntimeError below instead of an AttributeError on .get().
    choices = data.get("choices") if isinstance(data, dict) else None
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        raise RuntimeError(f"Ungültige OpenAI-Antwort: {data}")
    message = choices[0].get("message")
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("OpenAI lieferte keinen Inhalt")
    return content.strip()
|
||||
|
||||
|
||||
def rewrite_article_text(article: dict[str, Any]) -> str:
    """Ask the chat model to rewrite an article in German as HTML.

    The sanitized ``content_raw`` is used as the source text, with the
    stripped ``summary`` as fallback.

    Raises:
        RuntimeError: If neither field yields any text.
    """
    original = _sanitize_source_text(article.get("content_raw") or "")
    if not original:
        original = (article.get("summary") or "").strip()
    if not original:
        raise RuntimeError("Kein Quelltext für Rewrite verfügbar")

    headline = (article.get("title") or "").strip()
    # Attribution name: source snapshot first, then author, then a generic label.
    source_name = article.get("source_name_snapshot") or article.get("author") or "die Quelle"
    source_name = source_name.strip()

    prompt_parts = [
        "Schreibe den folgenden News-Text neu auf Deutsch in persönlicher Du-Form. ",
        "Stil: ausführlich, gut lesbar, ohne Einleitung mit Datum/Uhrzeit/Firma/Ort, ",
        "ohne Pressekontakt, ohne Quellenblock. ",
        "Nutze klare Absätze und Zwischenüberschriften in HTML (<h2>, <p>, <ul><li> falls passend). ",
        "Inhaltlich korrekt bleiben, nichts erfinden. ",
        f"Wichtig: Der Artikel wurde von '{source_name}' veröffentlicht. ",
        "Verwende NIEMALS 'wir' oder 'ich' aus Sicht der Quelle – beziehe Aussagen stets auf die Quelle, ",
        f"z.B. 'laut {source_name}', '{source_name} hat ermittelt', 'die Auswertung zeigt'.\n\n",
        f"Titel: {headline}\n\n",
        f"Originaltext:\n{original}",
    ]
    return _openai_chat(
        "Du bist ein deutscher News-Redakteur.",
        "".join(prompt_parts),
        temperature=0.4,
    )
|
||||
|
||||
|
||||
def generate_article_tags(article: dict[str, Any], rewritten_text: str | None = None, max_tags: int = 8) -> list[str]:
|
||||
source_text = rewritten_text or _sanitize_source_text(article.get("content_raw") or "") or (article.get("summary") or "")
|
||||
source_text = str(source_text).strip()
|
||||
if not source_text:
|
||||
return []
|
||||
title = (article.get("title") or "").strip()
|
||||
prompt = (
|
||||
"Erzeuge präzise Schlagwörter für einen deutschen News-Artikel. "
|
||||
f"Maximal {max_tags} Tags. Nur relevante Begriffe, keine allgemeinen Wörter wie News/Artikel. "
|
||||
"Gib ausschließlich ein JSON-Array mit Strings zurück, ohne Erklärung.\n\n"
|
||||
f"Titel: {title}\n\n"
|
||||
f"Text:\n{source_text[:3500]}"
|
||||
)
|
||||
raw = _openai_chat(
|
||||
"Du extrahierst präzise, kurze News-Tags auf Deutsch.",
|
||||
prompt,
|
||||
temperature=0.2,
|
||||
)
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
if isinstance(parsed, list):
|
||||
return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
|
||||
except Exception:
|
||||
pass
|
||||
# fallback: extract first JSON-like array if model wrapped output
|
||||
match = re.search(r"\[[\s\S]*\]", raw)
|
||||
if match:
|
||||
try:
|
||||
parsed = json.loads(match.group(0))
|
||||
if isinstance(parsed, list):
|
||||
return _normalize_tags([str(x) for x in parsed], max_tags=max_tags)
|
||||
except Exception:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
def score_article_relevance(article: dict[Any, Any]) -> dict[str, Any]:
    """Score article relevance for VanLife/Camping/Outdoor blog (0-100).

    The model sees the title and the first 2000 characters of the sanitized
    ``content_raw`` (or of ``summary`` when that is empty).

    Returns:
        ``{"score": int, "reason": str, "topics": list[str]}``. The score is
        clamped to 0..100. When the reply cannot be parsed, a score of 0 with
        a parsing-error reason and no topics is returned.

    Raises:
        Whatever ``_openai_chat`` raises (RuntimeError on a missing key or
        unusable response).
    """
    title = (article.get("title") or "").strip()
    text = _sanitize_source_text(article.get("content_raw") or "")
    if not text:
        text = (article.get("summary") or "").strip()

    prompt = (
        "Bewerte die Relevanz des folgenden Artikels für einen deutschen VanLife-, Camping- und Outdoor-Blog. "
        "Relevante Themen: Campingplätze, Stellplätze, Wohnmobil, Camper, Van, Roadtrip, "
        "Outdoor-Ausrüstung, Wandern, Naturreisen, Reise-Tipps für Campende. "
        "Nicht relevant: allgemeine Nachrichten, Politik, Wirtschaft, Sport (außer Outdoor), Unterhaltung.\n\n"
        "Antworte NUR mit einem JSON-Objekt:\n"
        '{"score": <0-100>, "reason": "<kurze Begründung auf Deutsch>", "topics": ["<Thema1>", "<Thema2>"]}\n\n'
        f"Titel: {title}\n\n"
        f"Text (Auszug):\n{text[:2000]}"
    )
    raw = _openai_chat(
        "Du bist ein Redakteur für einen VanLife- und Camping-Blog und bewertest Artikelrelevanz.",
        prompt,
        temperature=0.1,
    )

    fallback: dict[str, Any] = {"score": 0, "reason": "Parsing-Fehler bei Relevanz-Score", "topics": []}
    match = re.search(r"\{[\s\S]*\}", raw)
    if not match:
        return fallback
    try:
        parsed = json.loads(match.group(0))
        score = max(0, min(100, int(parsed.get("score", 0))))
    except (ValueError, TypeError, AttributeError, OverflowError):
        return fallback

    topics = parsed.get("topics") or []
    if isinstance(topics, str):
        # A bare string must stay one topic, not be iterated per character.
        topics = [topics]
    elif not isinstance(topics, (list, tuple)):
        # An unexpected shape should not discard an otherwise valid score.
        topics = []
    return {
        "score": score,
        "reason": str(parsed.get("reason", "")),
        "topics": [str(t) for t in topics],
    }
|
||||
|
||||
|
||||
def merge_generated_tags(meta_json: str | None, tags: list[str]) -> str:
    """Return ``meta_json`` with its ``generated_tags`` key set to the normalized tags.

    Existing metadata is kept only when it decodes to a JSON object; anything
    else (missing, invalid JSON, non-object) starts from an empty object.
    """
    merged: dict[str, Any] = {}
    if meta_json:
        try:
            decoded = json.loads(meta_json)
        except Exception:
            decoded = None
        if isinstance(decoded, dict):
            merged = decoded
    merged["generated_tags"] = _normalize_tags(tags)
    return json.dumps(merged, ensure_ascii=False)
|
||||
336
backend/app/scheduler.py
Normal file
336
backend/app/scheduler.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
"""Smart publishing scheduler.
|
||||
|
||||
Calculates suggested publish slots for new WordPress drafts.
|
||||
Rules:
|
||||
- Maximum N drafts per day (configurable, default 2)
|
||||
- Preferred slots: configurable hours (default 09:00 and 14:00 CET)
|
||||
- New articles take the earliest free slot starting tomorrow (gaps are filled before appending)
|
||||
- Checks both local DB AND WordPress future posts to avoid double-booking
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import threading
|
||||
import urllib.request
|
||||
from datetime import date, datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
from .config import get_settings
|
||||
from .db import get_conn
|
||||
|
||||
# Ensures that concurrent pipeline runs (two threads) never assign the same slot.
# Held by reserve_publish_slot() around its read-find-write cycle.
_slot_lock = threading.Lock()


# CET offset (UTC+1 winter / UTC+2 summer – fixed +1 for simplicity).
# NOTE(review): because daylight saving time is ignored, values derived from
# this offset lag Central European Summer Time by one hour.
_CET_OFFSET = timedelta(hours=1)
|
||||
|
||||
|
||||
def _today_cet() -> date:
    """Return the current calendar date at the fixed ``_CET_OFFSET`` from UTC."""
    shifted_now = datetime.now(timezone.utc) + _CET_OFFSET
    return shifted_now.date()
|
||||
|
||||
|
||||
def _preferred_hours() -> list[int]:
    """Return the configured publish hours, or ``[9, 14]`` when unusable.

    The ``pipeline_publish_hours`` setting is read as a comma-separated list
    of integers. The defaults are used when the setting is missing, empty,
    not numeric, or contains an hour outside 0..23 (which would otherwise
    produce an invalid timestamp such as ``T25:00:00``).
    """
    default_hours = [9, 14]
    settings = get_settings()
    try:
        hours = [int(h.strip()) for h in settings.pipeline_publish_hours.split(",") if h.strip()]
    except (AttributeError, TypeError, ValueError):
        return default_hours
    if not hours or any(not 0 <= h <= 23 for h in hours):
        return default_hours
    return hours
|
||||
|
||||
|
||||
def _fetch_wp_occupied_slots() -> set[tuple[str, int]]:
    """Return the ``(date_iso, hour)`` pairs of WordPress posts with status ``future``.

    Queries the WordPress REST API (first 100 posts, ascending by date) using
    HTTP Basic auth built from the configured username and app password.
    This lets the scheduler avoid slots taken by posts that were not created
    through this pipeline.

    Any error yields an empty set so that scheduling degrades gracefully;
    posts whose ``date`` cannot be parsed are skipped individually.
    """
    settings = get_settings()
    try:
        credentials = f"{settings.wordpress_username}:{settings.wordpress_app_password}".encode()
        auth = base64.b64encode(credentials).decode()
        url = (
            f"{settings.wordpress_base_url}/wp-json/wp/v2/posts"
            f"?status=future&per_page=100&orderby=date&order=asc&_fields=id,date"
        )
        request = urllib.request.Request(url, headers={"Authorization": f"Basic {auth}"})
        with urllib.request.urlopen(request, timeout=10) as response:
            posts = json.loads(response.read())

        taken: set[tuple[str, int]] = set()
        for post in posts:
            try:
                when = datetime.fromisoformat(post["date"])
            except Exception:
                continue
            taken.add((when.date().isoformat(), when.hour))
        return taken
    except Exception:
        return set()
|
||||
|
||||
|
||||
def _get_last_future_scheduled_date(wp_occupied: set[tuple[str, int]]) -> date | None:
    """Return the latest scheduled date from today onward (DB and WordPress), or None.

    DB rows with status ``error`` or ``no_image`` are ignored, as are
    timestamps and dates that fail to parse.
    """
    today = _today_cet()

    # Latest slot recorded in the local DB
    with get_conn() as conn:
        latest = conn.execute(
            """
            SELECT MAX(scheduled_publish_at) AS last_slot
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
              AND scheduled_publish_at >= ?
              AND status NOT IN ('error', 'no_image')
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchone()

    found: list[date] = []
    if latest and latest["last_slot"]:
        try:
            found.append(datetime.fromisoformat(latest["last_slot"]).date())
        except Exception:
            pass

    # Latest slot known from WordPress
    wp_days: list[date] = []
    for day_str, _hour in wp_occupied:
        try:
            day = date.fromisoformat(day_str)
        except Exception:
            continue
        if day >= today:
            wp_days.append(day)
    if wp_days:
        found.append(max(wp_days))

    return max(found) if found else None
|
||||
|
||||
|
||||
def _next_free_hour(target_date: date, wp_occupied: set[tuple[str, int]]) -> int | None:
    """Return the first preferred hour still free on ``target_date``, or None.

    An hour counts as used when a DB article (status other than ``error`` /
    ``no_image``) or a WordPress entry in ``wp_occupied`` falls in it.
    """
    day_iso = target_date.isoformat()

    # Hours already booked in the local DB
    with get_conn() as conn:
        booked = conn.execute(
            """
            SELECT scheduled_publish_at FROM articles
            WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
              AND status NOT IN ('error', 'no_image')
            """,
            (day_iso + "T00:00:00", day_iso + "T23:59:59"),
        ).fetchall()

    busy: set[int] = set()
    for entry in booked:
        stamp = entry["scheduled_publish_at"] or ""
        try:
            busy.add(datetime.fromisoformat(stamp).hour)
        except Exception:
            pass

    # Hours already booked in WordPress
    busy.update(hour for wp_day, hour in wp_occupied if wp_day == day_iso)

    free = [hour for hour in _preferred_hours() if hour not in busy]
    return free[0] if free else None
|
||||
|
||||
|
||||
def _format_slot(d: date, hour: int) -> str:
|
||||
weekday_names = ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"]
|
||||
wd = weekday_names[d.weekday()]
|
||||
return f"{wd}, {d.strftime('%d.%m.%Y')} um {hour:02d}:00 Uhr"
|
||||
|
||||
|
||||
def _find_next_free_slot(
    wp_occupied: set[tuple[str, int]], lookahead_days: int = 60
) -> tuple[date, int] | None:
    """Find the next free (date, hour) slot.

    Scans day by day from tomorrow through ``lookahead_days`` days later and
    returns the first day that still has a free preferred hour, so gaps in
    the schedule are filled before appending after the last post.

    When no day in the window has a free hour, tomorrow with the first
    preferred hour (or 9 when none are configured) is returned.
    """
    first_day = _today_cet() + timedelta(days=1)

    for step in range(lookahead_days + 1):
        day = first_day + timedelta(days=step)
        free_hour = _next_free_hour(day, wp_occupied)
        if free_hour is not None:
            return day, free_hour

    preferred = _preferred_hours()
    return first_day, (preferred[0] if preferred else 9)
|
||||
|
||||
|
||||
def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
    """Return all booked scheduling slots (DB + WP) inside the look-ahead window.

    The window runs from today through ``lookahead_days`` days after
    tomorrow, the same range the slot finder scans. Negative values are
    treated as 0.

    Returns:
        Slot dicts sorted by (date, hour) with keys ``date``, ``hour``,
        ``formatted``, ``source`` ("db" or "wordpress"), ``article_id``,
        ``article_title``, ``article_status``, ``wp_post_id`` and
        ``wp_post_url``. A WordPress slot is listed separately only when no
        DB article occupies the same (date, hour).
    """
    today = _today_cet()
    # Exclusive upper bound: the day after the last day the slot finder can book.
    window_end = today + timedelta(days=max(0, lookahead_days) + 2)

    # Slots booked in the local DB
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
              AND scheduled_publish_at >= ?
              AND scheduled_publish_at < ?
              AND status NOT IN ('error', 'no_image')
            ORDER BY scheduled_publish_at
            """,
            (today.isoformat() + "T00:00:00", window_end.isoformat() + "T00:00:00"),
        ).fetchall()

    db_slots: dict[tuple[str, int], dict] = {}
    for row in rows:
        try:
            dt = datetime.fromisoformat(row["scheduled_publish_at"])
        except Exception:
            continue  # unparsable timestamp: leave it out of the overview
        key = (dt.date().isoformat(), dt.hour)
        db_slots[key] = {
            "date": dt.date().isoformat(),
            "hour": dt.hour,
            "formatted": _format_slot(dt.date(), dt.hour),
            "source": "db",
            "article_id": row["id"],
            "article_title": row["title"],
            "article_status": row["status"],
            "wp_post_id": row["wp_post_id"],
            "wp_post_url": row["wp_post_url"],
        }

    # Slots occupied in WordPress but not in local DB
    wp_occupied = _fetch_wp_occupied_slots()
    wp_only: list[dict] = []
    for d_str, h in sorted(wp_occupied):
        if (d_str, h) in db_slots:
            continue
        try:
            d = date.fromisoformat(d_str)
        except Exception:
            continue
        if today <= d < window_end:
            wp_only.append({
                "date": d_str,
                "hour": h,
                "formatted": _format_slot(d, h),
                "source": "wordpress",
                "article_id": None,
                "article_title": "(WP-Beitrag außerhalb Pipeline)",
                "article_status": None,
                "wp_post_id": None,
                "wp_post_url": None,
            })

    all_slots = list(db_slots.values()) + wp_only
    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
    return all_slots
|
||||
|
||||
|
||||
def release_publish_slot(article_id: int) -> None:
    """Free the article's reserved slot by setting ``scheduled_publish_at`` to NULL.

    Intended for articles that are rejected after a slot was assigned.
    """
    clear_slot_sql = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
    with get_conn() as conn:
        conn.execute(clear_slot_sql, (article_id,))
|
||||
|
||||
|
||||
def suggest_publish_slot() -> str:
    """Return the formatted next free publish slot without reserving it.

    Falls back to tomorrow at the first preferred hour (or 9) when the slot
    finder returns nothing.
    """
    slot = _find_next_free_slot(_fetch_wp_occupied_slots())
    if not slot:
        fallback_day = _today_cet() + timedelta(days=1)
        preferred = _preferred_hours()
        return _format_slot(fallback_day, preferred[0] if preferred else 9)
    day, hour = slot
    return _format_slot(day, hour)
|
||||
|
||||
|
||||
def reserve_publish_slot(article_id: int) -> str:
    """Reserve a publish slot for an article and persist it in the DB.

    An article that already has a parsable ``scheduled_publish_at`` keeps it.
    Otherwise the first free preferred hour within 61 days from tomorrow is
    stored. When every day in that range is full, tomorrow at the first
    preferred hour (or 9) is stored instead.

    Returns:
        The formatted publish datetime string.

    The lookup and the write happen under the module-level ``_slot_lock``
    and on one DB connection, so two threads cannot pick the same slot.
    """
    # Query WordPress before taking the lock: the HTTP call can be slow and
    # must not block other threads.
    wp_taken = _fetch_wp_occupied_slots()

    with _slot_lock, get_conn() as conn:
        current = conn.execute(
            "SELECT scheduled_publish_at FROM articles WHERE id = ?",
            (article_id,),
        ).fetchone()
        stored = current["scheduled_publish_at"] if current else None
        if stored:
            try:
                parsed = datetime.fromisoformat(stored)
                return _format_slot(parsed.date(), parsed.hour)
            except Exception:
                pass  # invalid value: fall through and assign a fresh slot

        preferred = _preferred_hours()
        first_day = _today_cet() + timedelta(days=1)
        slot_day: date | None = None
        slot_hour: int | None = None

        for day_offset in range(61):
            day = first_day + timedelta(days=day_offset)
            day_iso = day.isoformat()

            # Read through this connection so slots written in this lock
            # window are visible.
            booked = conn.execute(
                """
                SELECT scheduled_publish_at FROM articles
                WHERE scheduled_publish_at >= ? AND scheduled_publish_at < ?
                  AND status NOT IN ('error', 'no_image')
                """,
                (day_iso + "T00:00:00", day_iso + "T23:59:59"),
            ).fetchall()

            busy: set[int] = set()
            for entry in booked:
                stamp = entry["scheduled_publish_at"] or ""
                try:
                    busy.add(datetime.fromisoformat(stamp).hour)
                except Exception:
                    pass
            busy.update(hour for wp_day, hour in wp_taken if wp_day == day_iso)

            free = [hour for hour in preferred if hour not in busy]
            if free:
                slot_day, slot_hour = day, free[0]
                break

        if slot_day is None:
            # Nothing free in the whole window: fall back to tomorrow.
            slot_day = first_day
            slot_hour = preferred[0] if preferred else 9

        conn.execute(
            "UPDATE articles SET scheduled_publish_at = ? WHERE id = ?",
            (f"{slot_day.isoformat()}T{slot_hour:02d}:00:00", article_id),
        )
    return _format_slot(slot_day, slot_hour)
|
||||
442
backend/app/source_extraction.py
Normal file
442
backend/app/source_extraction.py
Normal file
|
|
@ -0,0 +1,442 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from html import unescape
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
# Default timeout in seconds.
# NOTE(review): presumably the HTTP fetch timeout; usage is outside this section, confirm.
DEFAULT_TIMEOUT_SECONDS = 10
# Default User-Agent string identifying this bot.
# NOTE(review): presumably sent on page requests; usage is outside this section, confirm.
DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedArticle:
    """Immutable record of what was extracted from one article page.

    Every text field is None when nothing was found. NOTE(review): the code
    that builds this record is outside this section; the field notes below
    follow the helper functions of the same names and should be confirmed.
    """

    title: str | None  # page title
    author: str | None  # author / byline
    canonical_url: str | None  # canonical link of the page
    summary: str | None  # short description
    content_text: str | None  # article body as plain text
    images: list[str]  # image URLs
    press_contact: str | None  # press contact block
    extraction_error: str | None = None  # presumably an error message when extraction failed
    # Per-image details; presumably keyed by image URL with "caption"/"credit" entries.
    image_metadata: dict[str, dict] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _clean_text(raw: str | None) -> str | None:
|
||||
if not raw:
|
||||
return None
|
||||
text = unescape(raw)
|
||||
text = re.sub(r"<[^>]+>", " ", text)
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text or None
|
||||
|
||||
|
||||
def _strip_noise(html: str) -> str:
|
||||
html = re.sub(r"<script[\s\S]*?</script>", " ", html, flags=re.IGNORECASE)
|
||||
html = re.sub(r"<style[\s\S]*?</style>", " ", html, flags=re.IGNORECASE)
|
||||
html = re.sub(r"<noscript[\s\S]*?</noscript>", " ", html, flags=re.IGNORECASE)
|
||||
return html
|
||||
|
||||
|
||||
def _meta_content(html: str, attr: str, value: str) -> str | None:
    """Return the cleaned ``content`` of the first matching ``<meta>`` tag, or None.

    Args:
        html: Page markup to search.
        attr: Attribute naming the meta entry, e.g. ``"name"`` or ``"property"``.
        value: Required value of that attribute, e.g. ``"og:title"``.

    Tags with the key before ``content`` are tried first, then the reversed
    order. The content value is matched per quote style, so a double-quoted
    value may contain apostrophes (and vice versa) without being truncated.
    """
    key = rf"{attr}\s*=\s*[\"']{re.escape(value)}[\"']"
    content = r"content\s*=\s*(?:\"([^\"]+)\"|'([^']+)')"
    for pattern in (
        rf"<meta[^>]+{key}[^>]*{content}[^>]*>",
        # handle reversed attribute order
        rf"<meta[^>]+{content}[^>]*{key}[^>]*>",
    ):
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            # Exactly one of the two quote-specific groups is set.
            return _clean_text(match.group(1) or match.group(2))
    return None
|
||||
|
||||
|
||||
def _extract_title(html: str) -> str | None:
    """Return the page title from og:title, then ``<title>``, then the first ``<h1>``.

    An empty ``<title>`` falls through to the ``<h1>``. Returns None when no
    source yields text.
    """
    og_title = _meta_content(html, "property", "og:title")
    if og_title:
        return og_title

    title_tag = re.search(r"<title[^>]*>([\s\S]*?)</title>", html, re.IGNORECASE)
    document_title = _clean_text(title_tag.group(1)) if title_tag else None
    if document_title:
        return document_title

    heading = re.search(r"<h1[^>]*>([\s\S]*?)</h1>", html, re.IGNORECASE)
    return _clean_text(heading.group(1)) if heading else None
|
||||
|
||||
|
||||
def _extract_canonical(html: str) -> str | None:
    """Return the cleaned href of the first ``<link rel="canonical">``, or None.

    Both attribute orders are accepted; ``rel`` before ``href`` is tried first.
    """
    link_patterns = (
        r"<link[^>]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
        r"<link[^>]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>",
    )
    for link_pattern in link_patterns:
        found = re.search(link_pattern, html, re.IGNORECASE)
        if found:
            return _clean_text(found.group(1))
    return None
|
||||
|
||||
|
||||
def _extract_author(html: str) -> str | None:
    """Return the author from meta tags or, failing that, from byline markup.

    Meta entries are checked in the order author, article:author,
    og:article:author. Then a "Von:" / "Autor:" / "Autorin:" text pattern is
    tried, then an element whose class contains "author" or "byline".
    """
    meta_keys = (
        ("name", "author"),
        ("property", "article:author"),
        ("property", "og:article:author"),
    )
    for attr, value in meta_keys:
        from_meta = _meta_content(html, attr, value)
        if from_meta:
            return from_meta

    byline_patterns = (
        r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})",
        r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<",
    )
    for byline_pattern in byline_patterns:
        found = re.search(byline_pattern, html, re.IGNORECASE)
        if not found:
            continue
        from_markup = _clean_text(found.group(1))
        if from_markup:
            return from_markup
    return None
|
||||
|
||||
|
||||
def _extract_images(html: str, page_url: str) -> list[str]:
|
||||
images: list[str] = []
|
||||
seen: set[str] = set()
|
||||
|
||||
for prop in ("og:image", "twitter:image"):
|
||||
pattern = re.compile(
|
||||
rf"<meta[^>]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for match in pattern.finditer(html):
|
||||
src = match.group(1).strip()
|
||||
abs_src = urljoin(page_url, src)
|
||||
if abs_src not in seen:
|
||||
seen.add(abs_src)
|
||||
images.append(abs_src)
|
||||
|
||||
for match in re.finditer(r"<img[^>]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE):
|
||||
src = match.group(1).strip()
|
||||
abs_src = urljoin(page_url, src)
|
||||
if abs_src not in seen:
|
||||
seen.add(abs_src)
|
||||
images.append(abs_src)
|
||||
|
||||
return images
|
||||
|
||||
|
||||
def _extract_content_text(html: str) -> str | None:
    """Return the article body as newline-separated plain-text paragraphs.

    The first non-empty ``<article>``, ``<main>`` or ``<body>`` content is
    used, else the whole markup. h2-h4 headings that mention a contact
    keyword come first, followed by every ``<p>`` longer than two
    characters. Without any such pieces the whole container is returned as
    cleaned text.
    """
    container = None
    for tag in ("article", "main", "body"):
        found = re.search(rf"<{tag}[^>]*>([\s\S]*?)</{tag}>", html, re.IGNORECASE)
        if found:
            container = found.group(1)
            break
    container = container or html

    contact_keywords = r"\b(pressekontakt|press contact|kontakt|agentur)\b"
    pieces = []
    for heading in re.finditer(r"<h[2-4][^>]*>([\s\S]*?)</h[2-4]>", container, re.IGNORECASE):
        label = _clean_text(heading.group(1))
        if label and re.search(contact_keywords, label, re.IGNORECASE):
            pieces.append(label)

    for paragraph in re.finditer(r"<p[^>]*>([\s\S]*?)</p>", container, re.IGNORECASE):
        body = _clean_text(paragraph.group(1))
        if body and len(body) > 2:
            pieces.append(body)

    if not pieces:
        return _clean_text(container)
    return "\n".join(pieces)
|
||||
|
||||
|
||||
def _extract_press_contact(content_text: str | None) -> str | None:
    """Return the press contact block from extracted article text, or None.

    The first non-empty line containing a contact marker starts the block.
    Up to five following lines are appended, stopping early at a newsroom /
    "Original-Content" footer line. If no line matches, a looser search over
    the whole text is used as fallback.
    """
    if not content_text:
        return None

    marker = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt|agentur)\b", re.IGNORECASE)
    footer = r"\b(original\-content von|ots:|newsroom:|alle meldungen|zum newsroom)\b"

    non_empty = [part.strip() for part in content_text.split("\n") if part.strip()]
    for position, current in enumerate(non_empty):
        if not marker.search(current):
            continue
        block = [current]
        for follower in non_empty[position + 1 : position + 6]:
            if re.search(footer, follower, re.IGNORECASE):
                break
            block.append(follower)
        return _clean_text("\n".join(block))

    loose = re.search(
        r"((?:Pressekontakt|Agentur)[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|Alle Meldungen|Zum Newsroom|$)",
        content_text,
        re.IGNORECASE,
    )
    return _clean_text(loose.group(1)) if loose else None
|
||||
|
||||
|
||||
# CSS class keywords that indicate a copyright/credit element inside a figcaption.
# Matches a whole class="..." attribute whose value contains one of the keywords.
_CREDIT_CLASS_RE = re.compile(
    r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
    re.IGNORECASE,
)

# Inline text patterns that signal a credit/copyright notice:
# "©" followed by up to 100 characters, or a label such as "Foto:" / "Bild:" /
# "Credit:" followed by up to 100 characters on the same line.
_CREDIT_TEXT_RE = re.compile(
    r"(©[^<\n\r]{1,100}|(?:Foto|Bild|Credit|Fotograf|Fotografie)\s*:[^<\n\r]{1,100})",
    re.IGNORECASE,
)

# data-* attribute names that carry credit/caption information directly on <img>
_IMG_DATA_CREDIT_ATTRS = ("data-credit", "data-photographer", "data-copyright")
_IMG_DATA_CAPTION_ATTRS = ("data-caption", "data-description")

# Class keywords for adjacent sibling credit spans/divs after an <img>.
# NOTE(review): this pattern is currently identical to _CREDIT_CLASS_RE.
_ADJ_CREDIT_CLASS_RE = re.compile(
    r"class\s*=\s*[\"'][^\"']*(?:copyright|credit|photographer|photo-credit|image-credit|bildrechte|fotocredit)[^\"']*[\"']",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _imgmeta_resolve_src(img_attrs: str, page_url: str) -> str | None:
    """Return the absolute image URL from an <img> attribute string, or None.

    ``src`` is preferred over the lazy-loading ``data-src``.  A URL that
    ``urljoin`` rejects (``ValueError``) yields None so that a single bad image
    does not abort the whole extraction.
    """
    src_match = re.search(r'(?:^|\s)src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        src_match = re.search(r'data-src\s*=\s*["\']([^"\']+)["\']', img_attrs, re.IGNORECASE)
    if not src_match:
        return None
    try:
        return urljoin(page_url, src_match.group(1).strip())
    except ValueError:
        return None


def _imgmeta_credit_element(fragment: str, class_re: re.Pattern[str]) -> str | None:
    """Return the cleaned text of the first span/p/div in *fragment* whose class matches *class_re*."""
    match = re.search(
        r"<(?:span|p|div)[^>]*" + class_re.pattern + r"[^>]*>([\s\S]*?)</(?:span|p|div)>",
        fragment,
        re.IGNORECASE,
    )
    return _clean_text(match.group(1)) if match else None


def _imgmeta_first_attr(img_attrs: str, attr_names: tuple[str, ...]) -> str | None:
    """Return the cleaned value of the first attribute from *attr_names* present in *img_attrs*."""
    for attr in attr_names:
        attr_match = re.search(
            rf'{re.escape(attr)}\s*=\s*["\']([^"\']+)["\']',
            img_attrs,
            re.IGNORECASE,
        )
        if attr_match:
            return _clean_text(attr_match.group(1))
    return None


def _imgmeta_entry(caption: str | None, credit: str | None) -> dict[str, str]:
    """Build a metadata entry containing only the non-empty fields."""
    entry: dict[str, str] = {}
    if caption:
        entry["caption"] = caption
    if credit:
        entry["credit"] = credit
    return entry


def _extract_image_metadata(html: str, page_url: str) -> dict[str, dict]:
    """Return a mapping of absolute image URL → {"caption": ..., "credit": ...}.

    Uses three progressive strategies; an image handled by an earlier strategy
    is not touched by a later one:
      1. <figure> with <img> + <figcaption>
      2. data-* attributes on <img> tags not already covered
      3. <img> tags whose following 200 characters contain a credit element

    Args:
        html: Page markup to scan.
        page_url: Base URL used to resolve relative image sources.

    Returns:
        The collected metadata; entries only exist for images with at least a
        caption or a credit.  Images with malformed URLs are skipped.  Any
        other unexpected error yields an empty dict (best effort).
    """
    result: dict[str, dict] = {}

    try:
        # ------------------------------------------------------------------
        # Strategy 1: <figure> blocks containing <img> and <figcaption>
        # ------------------------------------------------------------------
        for fig_match in re.finditer(r"<figure[^>]*>([\s\S]*?)</figure>", html, re.IGNORECASE):
            fig_html = fig_match.group(1)

            # Image source (src or lazy-loaded data-src) and the figcaption.
            img_match = re.search(
                r"<img[^>]+(?:src|data-src)\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
                fig_html,
                re.IGNORECASE,
            )
            if not img_match:
                continue
            figcap_match = re.search(
                r"<figcaption[^>]*>([\s\S]*?)</figcaption>",
                fig_html,
                re.IGNORECASE,
            )
            if not figcap_match:
                continue
            try:
                img_src = urljoin(page_url, img_match.group(1).strip())
            except ValueError:
                # Malformed URL: skip this figure, keep everything else.
                continue
            figcap_html = figcap_match.group(1)

            # Credit: dedicated element first, then a text pattern in the caption.
            credit = _imgmeta_credit_element(figcap_html, _CREDIT_CLASS_RE)
            if not credit:
                figcap_text = unescape(re.sub(r"<[^>]+>", " ", figcap_html))
                cred_text_match = _CREDIT_TEXT_RE.search(figcap_text)
                if cred_text_match:
                    credit = _clean_text(cred_text_match.group(1))

            # Caption is the full figcaption text.
            entry = _imgmeta_entry(_clean_text(figcap_html), credit)
            if entry:
                result[img_src] = entry

        img_tags = list(re.finditer(r"<img([^>]+)>", html, re.IGNORECASE))

        # ------------------------------------------------------------------
        # Strategy 2: data-* attributes on <img> tags
        # ------------------------------------------------------------------
        for img_match in img_tags:
            img_attrs = img_match.group(1)
            img_src = _imgmeta_resolve_src(img_attrs, page_url)
            if img_src is None or img_src in result:
                continue
            entry = _imgmeta_entry(
                _imgmeta_first_attr(img_attrs, _IMG_DATA_CAPTION_ATTRS),
                _imgmeta_first_attr(img_attrs, _IMG_DATA_CREDIT_ATTRS),
            )
            if entry:
                result[img_src] = entry

        # ------------------------------------------------------------------
        # Strategy 3: <img> followed within 200 chars by a credit element
        # ------------------------------------------------------------------
        for img_match in img_tags:
            img_src = _imgmeta_resolve_src(img_match.group(1), page_url)
            if img_src is None or img_src in result:
                continue
            after_start = img_match.end()
            credit = _imgmeta_credit_element(html[after_start : after_start + 200], _ADJ_CREDIT_CLASS_RE)
            if credit:
                result[img_src] = {"credit": credit}

    except Exception:
        # Metadata is optional; never let it break article extraction.
        return {}

    return result
|
||||
|
||||
|
||||
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
    """Download *url* and extract title, author, text, images and related metadata.

    Args:
        url: Article URL to fetch.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        An ``ExtractedArticle``.  If the download fails, all content fields are
        empty and ``extraction_error`` carries the exception text; otherwise
        ``extraction_error`` is ``None``.
    """
    try:
        req = Request(
            url=url,
            headers={
                "User-Agent": DEFAULT_USER_AGENT,
                "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
            },
        )
        with urlopen(req, timeout=timeout_seconds) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset() or "utf-8"
        try:
            html = raw.decode(charset, errors="replace")
        except LookupError:
            # Unknown charset label in the Content-Type header: decode as
            # UTF-8 instead of failing the whole extraction.
            html = raw.decode("utf-8", errors="replace")
    except Exception as exc:
        return ExtractedArticle(
            title=None,
            author=None,
            canonical_url=None,
            summary=None,
            content_text=None,
            images=[],
            press_contact=None,
            extraction_error=str(exc),
        )

    html = _strip_noise(html)
    title = _extract_title(html)
    author = _extract_author(html)
    canonical_url = _extract_canonical(html)
    summary = _meta_content(html, "name", "description")
    content_text = _extract_content_text(html)
    if not summary and content_text:
        # No meta description: use the first 320 characters of the text.
        summary = _clean_text(content_text[:320])
    images = _extract_images(html, url)
    press_contact = _extract_press_contact(content_text)
    image_metadata = _extract_image_metadata(html, url)

    return ExtractedArticle(
        title=title,
        author=author,
        canonical_url=canonical_url,
        summary=summary,
        content_text=content_text,
        images=images,
        press_contact=press_contact,
        extraction_error=None,
        image_metadata=image_metadata,
    )
|
||||
|
||||
|
||||
def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
    """Return the article's metadata fields as a plain dict.

    ``content_text`` is not part of the result.
    """
    exported_fields = (
        "title",
        "author",
        "canonical_url",
        "summary",
        "images",
        "press_contact",
        "extraction_error",
        "image_metadata",
    )
    return {field: getattr(article, field) for field in exported_fields}
|
||||
474
backend/app/telegram_bot.py
Normal file
474
backend/app/telegram_bot.py
Normal file
|
|
@ -0,0 +1,474 @@
|
|||
"""Telegram Bot integration for RSS-News pipeline notifications and controls."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urlencode
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_BASE = "https://api.telegram.org/bot{token}/{method}"
|
||||
_N8N_APP_RELEASE_WEBHOOK = "https://n8n.vanityontour.de/webhook/tg-app-release-bot-v1/webhook"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-level API helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _call(method: str, payload: dict[str, Any]) -> dict[str, Any]:
    """POST *payload* as JSON to the Telegram Bot API method *method*.

    Args:
        method: Bot API method name, e.g. ``"sendMessage"``.
        payload: JSON-serialisable request parameters.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: If no bot token is configured, the request fails
            (HTTP error, network error, timeout) or the response is not
            valid JSON.
    """
    from urllib.error import HTTPError  # local: only needed for error details

    token = get_settings().telegram_bot_token
    if not token:
        raise RuntimeError("TELEGRAM_BOT_TOKEN nicht konfiguriert")

    req = Request(
        url=_BASE.format(token=token, method=method),
        data=json.dumps(payload).encode("utf-8"),
        method="POST",
        headers={"Content-Type": "application/json", "Accept": "application/json"},
    )
    try:
        with urlopen(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
        return json.loads(raw)
    except HTTPError as exc:
        # Keep the response body: it usually explains why the request was rejected.
        try:
            detail = exc.read().decode("utf-8", errors="replace")[:500]
        except Exception:
            detail = ""
        logger.error("Telegram API Fehler (%s): %s %s", method, exc, detail)
        raise RuntimeError(f"Telegram API Fehler: {exc} {detail}".strip()) from exc
    except (OSError, ValueError) as exc:
        # OSError covers URLError and socket timeouts; ValueError covers
        # json.JSONDecodeError for a non-JSON response.
        logger.error("Telegram API Fehler (%s): %s", method, exc)
        raise RuntimeError(f"Telegram API Fehler: {exc}") from exc
|
||||
|
||||
|
||||
def _chat_id() -> str:
    """Return the configured Telegram chat id.

    Raises:
        RuntimeError: If no chat id is configured.
    """
    configured = get_settings().telegram_chat_id
    if configured:
        return configured
    raise RuntimeError("TELEGRAM_CHAT_ID nicht konfiguriert")
|
||||
|
||||
|
||||
def _inline_keyboard(buttons: list[list[dict[str, str]]]) -> dict:
|
||||
return {"inline_keyboard": buttons}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public send functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def send_message(text: str, reply_markup: dict | None = None, parse_mode: str = "HTML") -> dict:
    """Send a text message to the configured chat via ``sendMessage``.

    Args:
        text: Message body, interpreted according to *parse_mode*.
        reply_markup: Optional reply markup; omitted from the request when falsy.
        parse_mode: Parse mode forwarded to the API.

    Returns:
        The response dict returned by ``_call``.
    """
    payload: dict[str, Any] = dict(
        chat_id=_chat_id(),
        text=text,
        parse_mode=parse_mode,
        disable_web_page_preview=False,
    )
    if reply_markup:
        payload["reply_markup"] = reply_markup
    return _call("sendMessage", payload)
|
||||
|
||||
|
||||
def send_photo_message(
    photo_url: str,
    caption: str,
    reply_markup: dict | None = None,
    parse_mode: str = "HTML",
) -> dict:
    """Send a photo with caption to the configured chat, falling back to text.

    Args:
        photo_url: URL of the photo passed to ``sendPhoto``.
        caption: Caption text; also used as the message text in the fallback.
        reply_markup: Optional reply markup; omitted when falsy.
        parse_mode: Parse mode forwarded to the API.

    Returns:
        The ``sendPhoto`` response, or the ``sendMessage`` response when the
        photo could not be sent.
    """
    payload: dict[str, Any] = {
        "chat_id": _chat_id(),
        "photo": photo_url,
        "caption": caption,
        "parse_mode": parse_mode,
    }
    if reply_markup:
        payload["reply_markup"] = reply_markup
    try:
        return _call("sendPhoto", payload)
    except Exception as exc:
        # Fall back to a text message if the photo fails (e.g. the image URL is
        # no longer valid), but leave a trace so that it can be diagnosed.
        logger.warning("sendPhoto fehlgeschlagen, sende Textnachricht: %s", exc)
        return send_message(caption, reply_markup=reply_markup, parse_mode=parse_mode)
|
||||
|
||||
|
||||
def answer_callback_query(callback_query_id: str, text: str = "") -> None:
    """Answer a callback query; failures are logged as warnings and not raised."""
    request = {"callback_query_id": callback_query_id, "text": text}
    try:
        _call("answerCallbackQuery", request)
    except Exception as exc:
        logger.warning("answerCallbackQuery fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def edit_message_reply_markup(chat_id: str, message_id: int, reply_markup: dict | None = None) -> None:
    """Replace the reply markup of a message; a falsy *reply_markup* sends an empty keyboard.

    Failures are logged as warnings and not raised.
    """
    markup = reply_markup if reply_markup else {"inline_keyboard": []}
    payload: dict[str, Any] = {
        "chat_id": chat_id,
        "message_id": message_id,
        "reply_markup": markup,
    }
    try:
        _call("editMessageReplyMarkup", payload)
    except Exception as exc:
        logger.warning("editMessageReplyMarkup fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def setup_webhook(webhook_url: str) -> dict:
    """Call ``setWebhook`` for *webhook_url*, limited to message and callback_query updates.

    The configured webhook secret, if any, is sent as ``secret_token``.
    """
    secret = get_settings().telegram_webhook_secret
    payload: dict[str, Any] = {
        "url": webhook_url,
        "allowed_updates": ["message", "callback_query"],
    }
    if secret:
        payload["secret_token"] = secret
    return _call("setWebhook", payload)
|
||||
|
||||
|
||||
def delete_webhook() -> dict:
    """Call ``deleteWebhook`` with an empty payload and return the response."""
    no_params: dict[str, Any] = {}
    return _call("deleteWebhook", no_params)
|
||||
|
||||
|
||||
def _forward_to_n8n_app_release(update: dict[str, Any]) -> None:
    """POST a Telegram update as JSON to the N8N App Release webhook.

    Best effort: any failure is logged at debug level and not raised.
    """
    try:
        forward_request = Request(
            url=_N8N_APP_RELEASE_WEBHOOK,
            data=json.dumps(update).encode("utf-8"),
            method="POST",
            headers={"Content-Type": "application/json"},
        )
        # The response body is not used; the context manager closes the connection.
        with urlopen(forward_request, timeout=5):
            pass
    except Exception as exc:
        logger.debug("N8N App-Release-Forward fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Notification helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _format_tags(meta_json: str | None) -> str:
|
||||
if not meta_json:
|
||||
return ""
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
tags = meta.get("generated_tags") or []
|
||||
if tags:
|
||||
return " ".join(f"#{t.replace(' ', '_')}" for t in tags[:6])
|
||||
except Exception:
|
||||
pass
|
||||
return ""
|
||||
|
||||
|
||||
def _score_emoji(score: int) -> str:
|
||||
if score >= 85:
|
||||
return "🟢"
|
||||
if score >= 70:
|
||||
return "🟡"
|
||||
return "🔴"
|
||||
|
||||
|
||||
def notify_new_draft(
    article: dict[str, Any],
    score: int,
    suggested_publish_at: str | None = None,
) -> None:
    """Send a Telegram notification for a newly created WP draft.

    The message carries title, relevance score, optional publish suggestion,
    tags and a link to the draft, plus "rewrite"/"discard" buttons.  If the
    article metadata names an image, the message is sent as a photo caption.

    Args:
        article: Article record; reads ``title``, ``wp_post_url``,
            ``meta_json`` and ``id``.
        score: Relevance score shown in the message.
        suggested_publish_at: Optional publish-time suggestion to display.
    """
    from html import escape  # local import: only the notification builders need it

    try:
        meta = json.loads(article.get("meta_json") or "{}")
    except (TypeError, ValueError):
        meta = {}
    if not isinstance(meta, dict):
        meta = {}

    # The message is sent with parse_mode=HTML, so all dynamic text is escaped.
    title = escape((article.get("title") or "Ohne Titel").strip(), quote=False)
    wp_url = article.get("wp_post_url") or ""
    tags_str = _format_tags(article.get("meta_json"))
    art_id = article.get("id")

    text_parts = [
        "✅ <b>Neuer Draft erstellt</b>",
        f"📰 <b>{title}</b>",
        f"{_score_emoji(score)} Relevanz-Score: <b>{score}/100</b>",
    ]
    if suggested_publish_at:
        publish_at = escape(str(suggested_publish_at), quote=False)
        text_parts.append(f"📅 Vorgeschlagene Veröffentlichung: <b>{publish_at}</b>")
    if tags_str:
        text_parts.append(f"🏷 {escape(tags_str, quote=False)}")
    if wp_url:
        text_parts.append(f'🔗 <a href="{escape(wp_url, quote=True)}">Draft in WordPress öffnen</a>')
    text = "\n".join(text_parts)

    keyboard = _inline_keyboard([
        [
            {"text": "✏️ Neu schreiben", "callback_data": f"rewrite:{art_id}"},
            {"text": "❌ Verwerfen", "callback_data": f"discard:{art_id}"},
        ]
    ])

    # Prefer the reviewed image, then the extraction's primary image.
    image_review = meta.get("image_review")
    image_url = image_review.get("selected_url") if isinstance(image_review, dict) else None
    if not image_url:
        extraction = meta.get("extraction")
        image_sel = extraction.get("image_selection") if isinstance(extraction, dict) else None
        image_url = image_sel.get("primary") if isinstance(image_sel, dict) else None

    if image_url:
        send_photo_message(image_url, caption=text, reply_markup=keyboard)
    else:
        send_message(text, reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_relevance_warning(article: dict[str, Any], score: int, reason: str) -> None:
    """Send a Telegram warning for a borderline article with override/reject buttons.

    Args:
        article: Article record; reads ``title``, ``id`` and ``source_url``.
        score: Relevance score shown in the message.
        reason: Explanation shown in the message.
    """
    from html import escape  # local import: only the notification builders need it

    # The message is sent with parse_mode=HTML, so all dynamic text is escaped.
    title = escape((article.get("title") or "Ohne Titel").strip(), quote=False)
    art_id = article.get("id")
    source_url = article.get("source_url") or ""

    lines = [
        "⚠️ <b>Artikel mit niedrigem Relevanz-Score</b>",
        f"📰 <b>{title}</b>",
        f"{_score_emoji(score)} Score: <b>{score}/100</b>",
        f"💬 {escape(str(reason), quote=False)}",
    ]
    if source_url:
        # As in notify_new_draft, the link is only emitted when a URL is known.
        lines.append(f'🔗 <a href="{escape(source_url, quote=True)}">Originalartikel</a>')

    keyboard = _inline_keyboard([
        [
            {"text": "➕ Trotzdem verarbeiten", "callback_data": f"override:{art_id}"},
            {"text": "❌ Ablehnen", "callback_data": f"reject:{art_id}"},
        ]
    ])
    send_message("\n".join(lines), reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_rejected_summary(articles: list[dict[str, Any]]) -> None:
    """Send a summary of rejected articles.

    Lists the first ten articles with score and reason and offers an
    "override" button for the first five.  Does nothing for an empty list.

    Args:
        articles: Rejected article records; reads ``title``, ``id`` and
            ``meta_json`` of each.
    """
    if not articles:
        return

    from html import escape  # local import: only the notification builders need it

    threshold = get_settings().pipeline_relevance_warn
    # The message is sent with parse_mode=HTML: the "<" of the threshold is
    # written as an entity and all dynamic text is escaped.
    lines = [f"🚫 <b>{len(articles)} Artikel abgelehnt (Score &lt; {threshold})</b>\n"]
    for art in articles[:10]:
        # Truncate before escaping so that no entity is cut in half.
        title = escape((art.get("title") or "Ohne Titel")[:60], quote=False)
        reason = escape(_get_rejection_reason(art), quote=False)
        lines.append(f"• <b>{title}</b> (Score: {_get_relevance_score(art)}) — {reason}")
    if len(articles) > 10:
        lines.append(f"... und {len(articles) - 10} weitere")

    # Override buttons for the first five; button labels are plain text.
    rows = [
        [{"text": f"➕ {(art.get('title') or '')[:25]}…", "callback_data": f"override:{art.get('id')}"}]
        for art in articles[:5]
    ]
    keyboard = _inline_keyboard(rows) if rows else None
    send_message("\n".join(lines), reply_markup=keyboard)
|
||||
|
||||
|
||||
def notify_error(message: str) -> None:
    """Send an error alert to Telegram; a failing send is logged, not raised.

    *message* is treated as plain text and HTML-escaped, because exception
    texts such as ``<urlopen error ...>`` are not valid markup for a message
    sent with parse_mode=HTML.
    """
    from html import escape  # local import: only the notification builders need it

    try:
        send_message(f"🔴 <b>Fehler im RSS-Pipeline</b>\n{escape(str(message), quote=False)}")
    except Exception as exc:
        logger.error("Telegram Fehler-Benachrichtigung fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def notify_pipeline_started(trigger: str = "auto") -> None:
    """Announce a pipeline start; a failing send is logged, not raised.

    Args:
        trigger: What started the run; ``"auto"`` gets the robot icon, any
            other value the person icon.
    """
    icon = "🤖" if trigger == "auto" else "👤"
    try:
        send_message(f"{icon} Pipeline gestartet (Auslöser: {trigger})")
    except Exception as exc:
        # A missing notification must not stop the pipeline, but leave a trace.
        logger.warning("Telegram Start-Benachrichtigung fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
def notify_pipeline_done(stats: dict[str, Any]) -> None:
    """Send the end-of-run statistics; a failing send is logged, not raised.

    Imported, processed and draft counts are always shown; the remaining
    counters only when they are non-zero.

    Args:
        stats: Counters of the run; missing keys count as 0.
    """
    lines = [
        "📊 <b>Pipeline abgeschlossen</b>",
        f"📥 Neue Artikel importiert: {stats.get('ingested', 0)}",
        f"⚙️ Verarbeitet: {stats.get('processed', 0)}",
        f"📝 Drafts erstellt: {stats.get('drafts_created', 0)}",
    ]
    optional_counters = (
        ("rejected", "🚫 Abgelehnt (Score)"),
        ("quality_gate_rejected", "✂️ Qualitätsprüfung"),
        ("no_image", "🖼️ Kein Bild"),
        ("warnings", "⚠️ Warnungen"),
        ("errors", "🔴 Fehler"),
    )
    for key, label in optional_counters:
        count = stats.get(key, 0)
        if count:
            lines.append(f"{label}: {count}")

    try:
        send_message("\n".join(lines))
    except Exception as exc:
        # A missing notification must not fail the run, but leave a trace.
        logger.warning("Telegram Abschluss-Benachrichtigung fehlgeschlagen: %s", exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper to read relevance info from meta_json
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_relevance_score(article: dict[str, Any]) -> int:
|
||||
try:
|
||||
meta = json.loads(article.get("meta_json") or "{}")
|
||||
return int(meta.get("relevance", {}).get("score", 0))
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _get_rejection_reason(article: dict[str, Any]) -> str:
|
||||
try:
|
||||
meta = json.loads(article.get("meta_json") or "{}")
|
||||
return str(meta.get("relevance", {}).get("reason", ""))[:80]
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Incoming update handler (called by webhook endpoint)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def handle_update(update: dict[str, Any]) -> None:
    """Dispatch an incoming Telegram update.

    Callback queries go to ``_handle_callback``, messages to
    ``_handle_message``; every other update type is ignored.  The handlers
    import the pipeline module themselves, so nothing is imported here.
    """
    if "callback_query" in update:
        _handle_callback(update["callback_query"])
    elif "message" in update:
        _handle_message(update["message"])
|
||||
|
||||
|
||||
def _handle_message(message: dict[str, Any]) -> None:
    """Execute a bot command contained in an incoming message.

    Handles ``/run``, ``/rejected``, ``/status`` and ``/help``; a ``@botname``
    suffix on the command is ignored.  Text that does not start with "/" is
    ignored; any other command is forwarded to the N8N App Release webhook.
    """
    # NOTE(review): the sender/chat of the message is not checked here —
    # confirm that the webhook endpoint restricts who may issue commands.
    from . import pipeline as _pipeline

    text = (message.get("text") or "").strip()
    if not text.startswith("/"):
        return

    # "/Run@my_bot arg" -> "run"
    cmd = text.split()[0].lower().lstrip("/").split("@")[0]

    if cmd == "run":
        send_message("🤖 Pipeline wird manuell gestartet …")
        try:
            notify_pipeline_done(_pipeline.run_auto_pipeline(trigger="manual"))
        except Exception as exc:
            notify_error(f"/run fehlgeschlagen: {exc}")
        return

    if cmd == "rejected":
        try:
            rejected = _pipeline.get_recently_rejected(days=3)
            if rejected:
                notify_rejected_summary(rejected)
            else:
                send_message("✅ Keine abgelehnten Artikel in den letzten 3 Tagen.")
        except Exception as exc:
            notify_error(f"/rejected fehlgeschlagen: {exc}")
        return

    if cmd == "status":
        try:
            send_message(_pipeline.get_pipeline_status_text())
        except Exception as exc:
            notify_error(f"/status fehlgeschlagen: {exc}")
        return

    if cmd == "help":
        send_message(
            "📋 <b>Verfügbare Befehle</b>\n"
            "/run — Pipeline manuell starten\n"
            "/rejected — Abgelehnte Artikel der letzten 3 Tage\n"
            "/status — Pipeline-Status\n"
            "/help — Diese Hilfe"
        )
        return

    # Unknown command: hand it over to the N8N App Release workflow.
    _forward_to_n8n_app_release({"message": message})
|
||||
|
||||
|
||||
def _handle_callback(callback_query: dict[str, Any]) -> None:
    """Execute the action behind an inline-keyboard button press.

    ``data`` has the form ``"<action>:<article_id>"`` with the actions
    ``rewrite``, ``discard``, ``override`` and ``reject``.  The query is
    answered and the keyboard removed before the action runs; a failing
    action is logged and reported via ``notify_error``.
    """
    from . import pipeline as _pipeline
    from .repositories import get_article_by_id, update_article_status

    query_id = callback_query.get("id", "")
    data = (callback_query.get("data") or "").strip()
    # Treat a missing or null "message"/"chat" as empty instead of crashing.
    message = callback_query.get("message") or {}
    chat_id = str((message.get("chat") or {}).get("id", ""))
    message_id = int(message.get("message_id") or 0)

    if ":" not in data:
        answer_callback_query(query_id, "Ungültige Aktion")
        return

    action, _, raw_id = data.partition(":")
    try:
        article_id = int(raw_id)
    except ValueError:
        answer_callback_query(query_id, "Ungültige Artikel-ID")
        return

    article = get_article_by_id(article_id)
    if not article:
        answer_callback_query(query_id, "Artikel nicht gefunden")
        return

    # Answer Telegram immediately so the spinning indicator stops
    action_labels = {
        "rewrite": "✏️ Artikel wird neu geschrieben …",
        "discard": "❌ Artikel verworfen",
        "override": "➕ Artikel wird verarbeitet …",
        "reject": "🚫 Abgelehnt",
    }
    answer_callback_query(query_id, action_labels.get(action, ""))
    edit_message_reply_markup(chat_id, message_id)

    logger.info("Callback: action=%s article_id=%s", action, article_id)

    if action == "rewrite":
        try:
            logger.info("Rewrite #%d: starte rewrite_and_update_draft", article_id)
            _pipeline.rewrite_and_update_draft(article_id)
            logger.info("Rewrite #%d: abgeschlossen, sende Benachrichtigung", article_id)
            updated = get_article_by_id(article_id)
            if updated:
                from .scheduler import suggest_publish_slot

                slot = suggest_publish_slot()
                notify_new_draft(updated, score=_get_relevance_score(updated), suggested_publish_at=slot)
        except Exception as exc:
            logger.error("Rewrite #%d fehlgeschlagen: %s", article_id, exc, exc_info=True)
            notify_error(f"Rewrite #{article_id} fehlgeschlagen: {exc}")

    elif action == "discard":
        try:
            _pipeline.discard_article(article_id)
        except Exception as exc:
            logger.error("Discard #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Verwerfen #{article_id} fehlgeschlagen: {exc}")

    elif action == "override":
        try:
            _pipeline.override_rejected_article(article_id)
        except Exception as exc:
            logger.error("Override #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Override #{article_id} fehlgeschlagen: {exc}")

    elif action == "reject":
        # Same error handling as the other actions: log and report instead of
        # letting the exception escape from the update handler.
        try:
            update_article_status(article_id, "error", actor="telegram", note="Manuell abgelehnt via Telegram")
        except Exception as exc:
            logger.error("Reject #%d fehlgeschlagen: %s", article_id, exc)
            notify_error(f"Ablehnen #{article_id} fehlgeschlagen: {exc}")

    else:
        logger.warning("Unbekannte Callback-Aktion: %s", action)
|
||||
689
backend/app/wordpress.py
Normal file
689
backend/app/wordpress.py
Normal file
|
|
@ -0,0 +1,689 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from html import escape
|
||||
import logging
|
||||
import json
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from html import unescape as _html_unescape
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _auth_header(username: str, app_password: str) -> str:
|
||||
token = base64.b64encode(f"{username}:{app_password}".encode("utf-8")).decode("ascii")
|
||||
return f"Basic {token}"
|
||||
|
||||
|
||||
def _wp_request(
    *,
    base_url: str,
    auth_header: str,
    method: str,
    endpoint: str,
    payload: dict[str, Any] | None = None,
) -> Any:
    """Send a JSON request to ``<base_url>/wp-json/wp/v2/<endpoint>``.

    Args:
        base_url: Site base URL; trailing slashes are ignored.
        auth_header: Value for the ``Authorization`` header.
        method: HTTP method.
        endpoint: Path below ``wp/v2``; leading slashes are ignored.
        payload: Optional JSON body; no body is sent when ``None``.

    Returns:
        The decoded JSON response, or ``{}`` for an empty response body.
    """
    target = "/".join((base_url.rstrip("/"), "wp-json/wp/v2", endpoint.lstrip("/")))
    body = None if payload is None else json.dumps(payload).encode("utf-8")
    request_headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json; charset=utf-8",
        "Accept": "application/json",
        "User-Agent": "rss-news-publisher/1.0",
    }
    request = Request(url=target, data=body, method=method, headers=request_headers)
    with urlopen(request, timeout=20) as resp:
        text = resp.read().decode("utf-8", errors="replace")
    if not text:
        return {}
    return json.loads(text)
|
||||
|
||||
|
||||
def _selected_image_url_from_meta(meta_json: str | None) -> str | None:
|
||||
if not meta_json:
|
||||
return None
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return None
|
||||
if not isinstance(meta, dict):
|
||||
return None
|
||||
image_review = meta.get("image_review")
|
||||
if not isinstance(image_review, dict):
|
||||
return None
|
||||
selected = image_review.get("selected_url")
|
||||
return selected if isinstance(selected, str) and selected.strip() else None
|
||||
|
||||
|
||||
def _selected_tags_from_meta(meta_json: str | None) -> list[str]:
|
||||
if not meta_json:
|
||||
return []
|
||||
try:
|
||||
meta = json.loads(meta_json)
|
||||
except Exception:
|
||||
return []
|
||||
if not isinstance(meta, dict):
|
||||
return []
|
||||
raw_tags = meta.get("generated_tags")
|
||||
if not isinstance(raw_tags, list):
|
||||
return []
|
||||
tags: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for item in raw_tags:
|
||||
value = str(item or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
key = value.casefold()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
tags.append(value)
|
||||
if len(tags) >= 12:
|
||||
break
|
||||
return tags
|
||||
|
||||
|
||||
def _wp_pick_tag_id(rows: Any, name: str) -> int | None:
    """Pick the id for *name* from a tag-search response.

    A case-insensitive name match wins; otherwise the first row with a
    positive id is used.  Row names are entity-decoded before comparing,
    because the REST API returns e.g. "R&amp;D" for the tag "R&D".
    """
    if not isinstance(rows, list):
        return None
    wanted = name.casefold()
    first_valid: int | None = None
    for row in rows:
        if not isinstance(row, dict):
            continue
        rid = int(row.get("id", 0) or 0)
        if rid <= 0:
            continue
        if _html_unescape(str(row.get("name") or "")).casefold() == wanted:
            return rid
        if first_valid is None:
            first_valid = rid
    return first_valid


def _resolve_wp_tag_ids(*, base_url: str, auth_header: str, tags: list[str]) -> list[int]:
    """Map tag names to WordPress tag ids, creating tags that the search does not find.

    Args:
        base_url: Site base URL.
        auth_header: Value for the ``Authorization`` header.
        tags: Tag names; blank names are skipped.

    Returns:
        Distinct tag ids in input order.  A tag whose lookup or creation
        fails is skipped and the failure logged.
    """
    ids: list[int] = []
    seen: set[int] = set()
    for tag in tags:
        name = tag.strip()
        if not name:
            continue
        try:
            found = _wp_request(
                base_url=base_url,
                auth_header=auth_header,
                method="GET",
                endpoint=f"tags?search={quote_plus(name)}&per_page=20",
            )
            tag_id = _wp_pick_tag_id(found, name)
            if tag_id is None:
                created = _wp_request(
                    base_url=base_url,
                    auth_header=auth_header,
                    method="POST",
                    endpoint="tags",
                    payload={"name": name},
                )
                if isinstance(created, dict):
                    rid = int(created.get("id", 0) or 0)
                    if rid > 0:
                        tag_id = rid
        except Exception as exc:
            # Tags are optional for publishing: skip this one, but leave a trace.
            _logger.warning("WordPress-Tag '%s' konnte nicht aufgelöst werden: %s", name, exc)
            continue
        if tag_id is not None and tag_id > 0 and tag_id not in seen:
            seen.add(tag_id)
            ids.append(tag_id)
    return ids
|
||||
|
||||
|
||||
_BLOCKED_IMAGE_EXTS = {".svg", ".gif", ".ico", ".webp"}
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _sanitize_image_url(url: str) -> str:
|
||||
"""Decode HTML entities (e.g. & → &) in image URLs from RSS feeds."""
|
||||
return _html_unescape(url)
|
||||
|
||||
|
||||
_PLACEHOLDER_PATTERNS = ("some-default.jpg", "default-image", "placeholder", "no-image", "noimage")
|
||||
|
||||
def _is_usable_image_url(url: str) -> bool:
|
||||
"""Return False for URLs that are unlikely to work as WP featured images."""
|
||||
if not url or url.startswith("data:"):
|
||||
return False
|
||||
try:
|
||||
path = urlparse(url).path.lower()
|
||||
_, ext = path.rsplit(".", 1) if "." in path else ("", "")
|
||||
if f".{ext}" in _BLOCKED_IMAGE_EXTS:
|
||||
return False
|
||||
if any(p in path for p in _PLACEHOLDER_PATTERNS):
|
||||
return False
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
def _download_image_bytes(url: str, referer: str | None = None) -> tuple[bytes, str]:
    """Download an image and return ``(bytes, content_type)``.

    Args:
        url: Image URL; HTML entities in it are decoded first.
        referer: Optional value for the ``Referer`` header.

    Returns:
        The response body and its content type without parameters.

    Raises:
        RuntimeError: If the response's content type is not ``image/*``.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; rss-news-publisher/1.0)",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
    }
    if referer:
        request_headers["Referer"] = referer

    request = Request(url=_sanitize_image_url(url), headers=request_headers)
    with urlopen(request, timeout=20) as resp:
        body = resp.read()
        declared = resp.headers.get("Content-Type", "application/octet-stream")

    # Drop parameters such as "; charset=..." from the header value.
    mime = declared.split(";")[0].strip() if declared else "application/octet-stream"
    if mime.lower().startswith("image/"):
        return body, mime
    raise RuntimeError(f"Ausgewählte Bild-URL liefert kein Bild ({mime})")
|
||||
|
||||
|
||||
def _guess_filename(image_url: str, content_type: str) -> str:
    """Derive an ASCII-safe upload filename from an image URL.

    Args:
        image_url: Image URL; the last path segment becomes the filename.
        content_type: MIME type used to choose an extension when the path
            segment has none (default ``.jpg``).

    Returns:
        A filename consisting of word characters, dots and hyphens.  When
        nothing usable remains, ``article-image`` is used as the name.
    """
    parsed = urlparse(_sanitize_image_url(image_url))
    stem = Path(parsed.path).name or "article-image"
    if "." not in stem:
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
        stem = f"{stem}{ext}"
    # Sanitize to ASCII-safe characters for the HTTP Content-Disposition header
    stem = stem.encode("ascii", errors="ignore").decode("ascii")
    stem = re.sub(r"[^\w.\-]", "_", stem)
    if not stem:
        return "article-image.jpg"
    if stem.startswith("."):
        # A purely non-ASCII basename collapses to just ".jpg"; give it a name.
        stem = f"article-image{stem}"
    return stem
|
||||
|
||||
|
||||
def _get_image_meta_for_url(meta_json: str | None, image_url: str) -> dict:
|
||||
"""Return the caption/credit dict for a specific image URL from extraction metadata."""
|
||||
if not meta_json or not image_url:
|
||||
return {}
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
meta = json.loads(meta_json)
|
||||
image_metadata = (meta.get("extraction") or {}).get("image_metadata") or {}
|
||||
# Exact match first
|
||||
if image_url in image_metadata:
|
||||
return image_metadata[image_url]
|
||||
# Fuzzy match: compare without query string (handles ?w=1200 variants)
|
||||
base_url = urlparse(image_url)._replace(query="").geturl()
|
||||
for key, val in image_metadata.items():
|
||||
key_base = urlparse(key)._replace(query="").geturl()
|
||||
if key_base == base_url:
|
||||
return val
|
||||
return {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def _build_image_caption(image_meta: dict, source_url: str) -> str:
    """Return the caption text to store on the uploaded WordPress media item.

    The stripped ``caption`` entry of *image_meta* is used when non-empty
    (a figcaption typically already contains the credit text); otherwise
    the caption falls back to ``"Quelle: <source_url>"``.
    """
    text = (image_meta.get("caption") or "").strip()
    return text if text else f"Quelle: {source_url}"
|
||||
|
||||
|
||||
def _upload_featured_media(
    *,
    base_url: str,
    auth_header: str,
    image_url: str,
    article_title: str,
    source_url: str,
    image_caption: str = "",
) -> int:
    """Download an image and upload it to the WordPress media library.

    The image is fetched with ``_download_image_bytes`` (sending *source_url*
    as referer when set), POSTed as a raw body to ``/wp-json/wp/v2/media``,
    and the created media item is then updated with a title, caption and
    alt text derived from the article.

    Returns:
        The id of the created media item.

    Raises:
        RuntimeError: if the upload response does not contain a positive id.
    """
    payload_bytes, mime = _download_image_bytes(image_url, referer=source_url or None)
    upload_name = _guess_filename(image_url, mime)

    upload_headers = {
        "Authorization": auth_header,
        "Content-Type": mime,
        "Content-Disposition": f'attachment; filename="{upload_name}"',
        "Accept": "application/json",
        "User-Agent": "rss-news-publisher/1.0",
    }
    upload_request = Request(
        url=f"{base_url.rstrip('/')}/wp-json/wp/v2/media",
        data=payload_bytes,
        method="POST",
        headers=upload_headers,
    )
    with urlopen(upload_request, timeout=30) as response:
        body = response.read().decode("utf-8", errors="replace")

    created = json.loads(body) if body else {}
    if isinstance(created, dict):
        media_id = int(created.get("id", 0))
    else:
        media_id = 0
    if media_id <= 0:
        raise RuntimeError(f"WordPress Media-Upload fehlgeschlagen: {created}")

    # Second request: attach human-readable metadata to the new media item.
    media_fields = {
        "title": f"{article_title[:120]} - Bild",
        "caption": image_caption or f"Quelle: {source_url}",
        "alt_text": article_title[:200],
    }
    _wp_request(
        base_url=base_url,
        auth_header=auth_header,
        method="POST",
        endpoint=f"media/{media_id}",
        payload=media_fields,
    )
    return media_id
|
||||
|
||||
|
||||
def _as_paragraph_html(text: str) -> str:
    """Render plain text as a sequence of HTML ``<p>`` elements.

    Paragraphs are separated by two or more newlines. Within a paragraph,
    single line breaks (and the whitespace around them) become one space.
    Paragraph text is passed through ``escape``. Returns ``""`` when there
    is no non-blank paragraph.
    """
    rendered = []
    for piece in re.split(r"\n{2,}", text.strip()):
        piece = piece.strip()
        if not piece:
            continue
        flat = re.sub(r"\s*\n\s*", " ", piece)
        rendered.append(f"<p>{escape(flat)}</p>")
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _as_block_paragraphs(text: str) -> str:
    """Render plain text as Gutenberg ``wp:paragraph`` blocks.

    Paragraphs are separated by two or more newlines; single line breaks
    inside a paragraph are folded into one space and the text is passed
    through ``escape``. Blocks are joined with ``"\\n"``; an input without
    any non-blank paragraph yields ``""``.
    """
    rendered = []
    for piece in re.split(r"\n{2,}", text.strip()):
        piece = piece.strip()
        if not piece:
            continue
        flat = re.sub(r"\s*\n\s*", " ", piece)
        rendered.append(f"<!-- wp:paragraph --><p>{escape(flat)}</p><!-- /wp:paragraph -->")
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _strip_html_tags(raw: str) -> str:
    """Replace every ``<...>`` tag with a space and collapse whitespace.

    A falsy *raw* is treated as the empty string. The result has no
    leading or trailing whitespace.
    """
    untagged = re.sub(r"<[^>]+>", " ", raw if raw else "")
    collapsed = re.sub(r"\s+", " ", untagged)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _html_to_wp_blocks(html: str) -> str:
    """Wrap supported HTML elements in Gutenberg block comments.

    Scans *html* for ``<h2>``–``<h6>``, ``<p>``, ``<ul>`` and ``<ol>``
    elements and wraps each one, unchanged, in the matching
    ``wp:heading`` / ``wp:paragraph`` / ``wp:list`` delimiters. Markup
    outside those elements is not carried over. When no supported element
    is found, all tags are stripped and the remaining text is rendered via
    ``_as_block_paragraphs``.

    Returns:
        The blocks joined by ``"\\n"``, or ``""`` for empty input.
    """
    src = (html or "").strip()
    if not src:
        return ""

    # The lookahead after each tag name requires whitespace or ">" so that
    # "<p" does not match "<pre>", "<picture>", ... — such a false match
    # would run up to the next "</p>" and swallow a real paragraph.
    pattern = re.compile(
        r"<h([2-6])(?=[\s>])[^>]*>[\s\S]*?</h\1>"
        r"|<p(?=[\s>])[^>]*>[\s\S]*?</p>"
        r"|<ul(?=[\s>])[^>]*>[\s\S]*?</ul>"
        r"|<ol(?=[\s>])[^>]*>[\s\S]*?</ol>",
        re.IGNORECASE,
    )

    blocks: list[str] = []
    for match in pattern.finditer(src):
        block_html = match.group(0)
        heading_level = match.group(1)  # set only for the heading alternative
        if heading_level is not None:
            level = int(heading_level)
            blocks.append(f'<!-- wp:heading {{"level":{level}}} -->{block_html}<!-- /wp:heading -->')
            continue
        tag = block_html[1:3].lower()
        if tag == "ol":
            blocks.append(f'<!-- wp:list {{"ordered":true}} -->{block_html}<!-- /wp:list -->')
        elif tag == "ul":
            blocks.append(f"<!-- wp:list -->{block_html}<!-- /wp:list -->")
        else:
            # Only the "<p" alternative remains.
            blocks.append(f"<!-- wp:paragraph -->{block_html}<!-- /wp:paragraph -->")

    if blocks:
        return "\n".join(blocks)
    return _as_block_paragraphs(_strip_html_tags(src))
|
||||
|
||||
|
||||
def _as_block_heading(level: int, text: str) -> str:
    """Build a Gutenberg ``wp:heading`` block.

    *level* is converted with ``int`` and clamped to the range 1–6; *text*
    is passed through ``escape`` before being placed inside the heading tag.
    """
    clamped = int(level)
    if clamped < 1:
        clamped = 1
    elif clamped > 6:
        clamped = 6
    return f'<!-- wp:heading {{"level":{clamped}}} --><h{clamped}>{escape(text)}</h{clamped}><!-- /wp:heading -->'
|
||||
|
||||
|
||||
def _as_block_list(items: list[str]) -> str:
    """Build an unordered Gutenberg ``wp:list`` block from *items*.

    Each item is inserted verbatim inside ``<li>`` — no escaping is applied
    here. An empty *items* yields ``""``.
    """
    if not items:
        return ""
    entries = []
    for item in items:
        entries.append(f"<li>{item}</li>")
    content = "".join(entries)
    return f"<!-- wp:list --><ul>{content}</ul><!-- /wp:list -->"
|
||||
|
||||
|
||||
def _sanitize_publish_text(text: str) -> str:
    """Trim boilerplate from article text before publishing.

    Steps, in order:
      1. Strip the text and drop blank lines; each kept line is stripped.
      2. If more than three lines remain, discard the first three
         (presumably a source-specific header — confirm against the feeds).
      3. Re-join with single newlines and cut everything from the first
         case-insensitive occurrence of "Pressekontakt" to the end.

    NOTE(review): because blank lines are removed, the result contains no
    paragraph breaks (no double newlines) — verify this is intended for
    callers that split paragraphs on blank lines.
    """
    stripped = (text or "").strip()
    if not stripped:
        return ""
    kept = [ln for ln in (candidate.strip() for candidate in stripped.splitlines()) if ln]
    if len(kept) > 3:
        del kept[:3]
    body = "\n".join(kept)
    without_contact = re.sub(r"\n?\s*Pressekontakt[\s\S]*$", "", body, flags=re.IGNORECASE)
    return without_contact.strip()
|
||||
|
||||
|
||||
def _build_attribution_block(article: dict[str, Any]) -> str:
    """Build a WP Gutenberg attribution block for the bottom of the article.

    The block is a wide separator followed by one italic paragraph holding,
    as available and separated by `` | ``:

    * a link to the original article (``canonical_url``, else ``source_url``),
      labelled with the source name;
    * the author;
    * the image credit of the reviewer-selected image.

    Args:
        article: Article row as a mapping. Reads ``canonical_url``,
            ``source_url``, ``source_name_snapshot``, ``author`` and
            ``meta_json``.

    Returns:
        The block markup (starting with a newline), or ``""`` when none of
        the three parts is available.
    """
    # Imported locally so the href below is escaped with quote handling
    # regardless of which ``escape`` the module itself imports.
    from html import escape
    from urllib.parse import urlparse

    source_url = (article.get("canonical_url") or article.get("source_url") or "").strip()
    source_name = (article.get("source_name_snapshot") or "").strip()
    author = (article.get("author") or "").strip()

    # If the feed name is "Google Alerts" (or similar generic names), derive
    # the real source name from the hostname of the canonical URL.
    if not source_name or source_name.lower() in ("google alerts", "google"):
        try:
            hostname = urlparse(source_url).hostname or ""
            source_name = hostname.removeprefix("www.")
        except ValueError:
            # urlparse rejects some malformed netlocs; keep the name as is.
            pass

    # A credit that is only one of these words plus ":" carries no information.
    bare_markers = {"foto", "bild", "credit", "fotograf", "fotografie", "photo", "bildnachweis"}

    # Get image credit from extraction metadata (uses fuzzy URL match).
    meta_json = article.get("meta_json")
    credit = ""
    try:
        meta = json.loads(meta_json or "{}")
        selected_url = (meta.get("image_review") or {}).get("selected_url") or ""
        if selected_url:
            img_meta = _get_image_meta_for_url(meta_json, selected_url)
            raw_credit = (img_meta.get("credit") or "").strip()
            caption_text = (img_meta.get("caption") or "").strip()
            # If credit is just a bare marker prefix (e.g. "Foto:", "Bild:"),
            # clear it and extract the full credit from the caption instead.
            if raw_credit.endswith(":") and raw_credit[:-1].strip().lower() in bare_markers:
                raw_credit = ""
            if raw_credit:
                credit = raw_credit
            elif caption_text:
                # Extract credit markers like "Foto: IMAGO/…", "© Agentur", "Bild: …"
                m = re.search(
                    r"(©[^\n]{1,120}|(?:Foto|Bild|Credit|Fotograf|Photo)\s*:[^\n]{1,120})",
                    caption_text,
                )
                credit = m.group(1).strip() if m else ""
    except Exception:
        # Deliberately best-effort: malformed metadata must never block
        # publishing, the attribution simply goes out without a credit.
        pass

    parts: list[str] = []
    if source_url:
        label = source_name or source_url
        # The URL comes from feed data; escape it (including quotes) so it
        # cannot break out of the href attribute.
        href = escape(source_url, quote=True)
        parts.append(f'Originalartikel: <a href="{href}">{escape(label)}</a>')
    if author:
        parts.append(f"Autor: {escape(author)}")
    if credit:
        parts.append(f"Bildnachweis: {escape(credit)}")

    if not parts:
        return ""

    inner = " | ".join(parts)
    return (
        '\n<!-- wp:separator {"className":"is-style-wide"} -->'
        '<hr class="wp-block-separator is-style-wide"/><!-- /wp:separator -->\n'
        '<!-- wp:paragraph {"className":"article-attribution"} -->'
        f'<p class="article-attribution"><em>{inner}</em></p>'
        "<!-- /wp:paragraph -->"
    )
|
||||
|
||||
|
||||
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
    """Assemble the Gutenberg post body for *article*.

    The rewritten content (else the raw content) is sanitized; if nothing
    is left, the summary is used instead. Text that looks like HTML is
    converted with ``_html_to_wp_blocks``, plain text with
    ``_as_block_paragraphs``. A placeholder paragraph is used when the
    result is empty, and the attribution block is appended.

    Returns:
        ``(content, None)`` — the second element (excerpt) is always ``None``.
    """
    fallback_summary = (article.get("summary") or "").strip()
    raw_body = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
    text = _sanitize_publish_text(raw_body) or fallback_summary

    if re.search(r"<[a-zA-Z][^>]*>", text):
        blocks = _html_to_wp_blocks(text)
    else:
        blocks = _as_block_paragraphs(text)
    if not blocks:
        blocks = "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"

    return (blocks + _build_attribution_block(article)).strip(), None
|
||||
|
||||
|
||||
def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
    """Create or update the WordPress post for *article*.

    Workflow:
      1. Validate the WordPress settings and build the auth header.
      2. Build the post content.
      3. Try to upload a featured image: the selected image first, then the
         URLs from ``image_urls_json``; the first successful upload wins and
         failures are only logged.
      4. Assemble the payload (status, optional excerpt, featured media,
         schedule date, tags) and POST it — to ``posts/<id>`` when the
         article already has a ``wp_post_id``, otherwise to ``posts``.

    Returns:
        ``(post_id, link)`` where ``link`` is ``None`` if WordPress did not
        return a string.

    Raises:
        RuntimeError: on missing configuration or an unusable response.
    """
    cfg = get_settings()
    if not (cfg.wordpress_base_url and cfg.wordpress_username and cfg.wordpress_app_password):
        raise RuntimeError("WordPress Konfiguration fehlt (base_url, username, app_password)")

    auth = _auth_header(cfg.wordpress_username, cfg.wordpress_app_password)

    title = (article.get("title") or "Ohne Titel").strip()
    content, excerpt = _build_post_content(article)
    source_url = article.get("source_url") or ""

    # Candidate list: the selected URL first, then fallbacks from image_urls_json.
    primary_url = _selected_image_url_from_meta(article.get("meta_json"))
    candidates: list[str] = []
    if primary_url and _is_usable_image_url(primary_url):
        candidates.append(primary_url)
    try:
        for extra in json.loads(article.get("image_urls_json") or "[]"):
            if extra and extra not in candidates and _is_usable_image_url(extra):
                candidates.append(extra)
    except Exception:
        pass

    featured_media_id = None
    for candidate in candidates:
        candidate_meta = _get_image_meta_for_url(article.get("meta_json"), candidate)
        caption = _build_image_caption(candidate_meta, source_url)
        try:
            featured_media_id = _upload_featured_media(
                base_url=cfg.wordpress_base_url,
                auth_header=auth,
                image_url=candidate,
                article_title=title,
                source_url=source_url,
                image_caption=caption,
            )
        except Exception as img_exc:
            _logger.warning(
                "Bild-Upload fehlgeschlagen, versuche nächste URL: %s — %s", candidate, img_exc
            )
            continue
        break  # success — stop trying further candidates

    if candidates and not featured_media_id:
        _logger.warning(
            "Alle %d Bild-Kandidaten fehlgeschlagen für Artikel #%s (%s)",
            len(candidates), article.get("id"), title[:60],
        )

    payload = {
        "title": title,
        "content": content,
        "status": cfg.wordpress_default_status,
    }
    if excerpt:
        payload["excerpt"] = excerpt
    if featured_media_id:
        payload["featured_media"] = featured_media_id

    scheduled_at = article.get("scheduled_publish_at")
    if scheduled_at:
        payload["date"] = scheduled_at  # e.g. "2026-03-24T09:00:00"
        # Use status "future" so WP schedules auto-publishing at the given date.
        # WP ignores date for drafts and shows "Sofort veröffentlichen" instead.
        try:
            from datetime import datetime as _dt

            if _dt.fromisoformat(scheduled_at) > _dt.now():
                payload["status"] = "future"
        except Exception:
            pass

    wp_post_id = article.get("wp_post_id")
    tag_ids = _resolve_wp_tag_ids(
        base_url=cfg.wordpress_base_url,
        auth_header=auth,
        tags=_selected_tags_from_meta(article.get("meta_json")),
    )
    if tag_ids:
        payload["tags"] = tag_ids

    # Update the existing post when one is known, otherwise create a new one.
    endpoint = f"posts/{int(wp_post_id)}" if wp_post_id else "posts"
    result = _wp_request(
        base_url=cfg.wordpress_base_url,
        auth_header=auth,
        method="POST",
        endpoint=endpoint,
        payload=payload,
    )

    if not isinstance(result, dict):
        raise RuntimeError(f"WordPress Antwort im unerwarteten Format: {result}")
    post_id = int(result.get("id", 0))
    if post_id <= 0:
        raise RuntimeError(f"WordPress Antwort ohne Post-ID: {result}")
    link = result.get("link")
    return post_id, (link if isinstance(link, str) else None)
|
||||
|
||||
|
||||
def selected_image_exists(article: dict[str, Any]) -> bool:
    """Report whether the article's ``meta_json`` yields a selected image URL."""
    chosen = _selected_image_url_from_meta(article.get("meta_json"))
    return chosen is not None
|
||||
|
||||
|
||||
def delete_wp_post(wp_post_id: int) -> None:
    """Permanently delete a WordPress post.

    Sends ``DELETE posts/<id>?force=true``; ``force=true`` skips the trash.

    Raises:
        RuntimeError: if base URL, username or app password is not configured.
    """
    cfg = get_settings()
    if not (cfg.wordpress_base_url and cfg.wordpress_username and cfg.wordpress_app_password):
        raise RuntimeError("WordPress Konfiguration fehlt")
    credentials = _auth_header(cfg.wordpress_username, cfg.wordpress_app_password)
    _wp_request(
        base_url=cfg.wordpress_base_url,
        auth_header=credentials,
        method="DELETE",
        endpoint=f"posts/{wp_post_id}?force=true",
    )
|
||||
|
||||
|
||||
def sync_db_from_wordpress() -> dict[str, Any]:
    """Sync scheduled_publish_at and wp_post_url in the DB from WordPress.

    WordPress is treated as the source of truth for scheduling.
    For each DB article that has a wp_post_id (except status 'no_image'):

    - If WP post exists as 'future': update scheduled_publish_at to WP date.
    - If WP post exists as 'draft': clear scheduled_publish_at (not yet scheduled).
    - If WP post exists as 'publish': mark article as published in DB.
    - If the post is absent from the future/draft/publish listings: clear
      wp_post_id, wp_post_url and scheduled_publish_at so the article can be
      re-processed — but ONLY when the listings were fetched completely.
      If any listing request failed, returned an unexpected shape, or hit
      the page cap, an absent post may simply not have been fetched, so the
      article is left untouched and counted in
      ``skipped_incomplete_listing``.

    Returns:
        A stats dict with counts of each action taken.

    Raises:
        RuntimeError: if the WordPress configuration is missing.
    """
    from .db import get_conn

    settings = get_settings()
    if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password:
        raise RuntimeError("WordPress Konfiguration fehlt")
    auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password)
    base_url = settings.wordpress_base_url.rstrip("/")

    per_page = 100
    max_pages = 3  # at most 300 posts per status

    # Fetch future + draft + published WP posts, remembering whether the
    # listing can be trusted as complete.
    wp_posts: dict[int, dict] = {}
    listing_complete = True
    for status in ("future", "draft", "publish"):
        for page in range(1, max_pages + 1):
            try:
                result = _wp_request(
                    base_url=base_url,
                    auth_header=auth,
                    method="GET",
                    endpoint=f"posts?status={status}&per_page={per_page}&page={page}&_fields=id,date,status,link",
                )
            except Exception as exc:
                # Could be a network/auth failure or WordPress rejecting a
                # page past the end; the two are indistinguishable here, so
                # stay conservative and do not trust the listing.
                _logger.warning(
                    "WordPress-Sync: Abruf fehlgeschlagen (status=%s, page=%s): %s", status, page, exc
                )
                listing_complete = False
                break
            if not isinstance(result, list):
                listing_complete = False
                break
            if not result:
                break
            for post in result:
                try:
                    wp_posts[int(post["id"])] = post
                except (KeyError, TypeError, ValueError):
                    # Malformed entry: that post is effectively unknown.
                    listing_complete = False
            if len(result) < per_page:
                break
        else:
            # Every allowed page was full — more posts may exist beyond the cap.
            listing_complete = False

    # Load all DB articles that have a wp_post_id.
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, wp_post_id, wp_post_url, scheduled_publish_at, status
            FROM articles
            WHERE wp_post_id IS NOT NULL
            AND status NOT IN ('no_image')
            ORDER BY id
            """
        ).fetchall()

    stats: dict[str, int] = {
        "total_db_articles": len(rows),
        "wp_posts_found": len(wp_posts),
        "slot_updated": 0,
        "slot_cleared_draft": 0,
        "marked_published": 0,
        "wp_reference_cleared": 0,
        "already_in_sync": 0,
        "skipped_incomplete_listing": 0,
    }

    for row in rows:
        article_id = row["id"]
        wp_post_id = int(row["wp_post_id"])
        wp_post = wp_posts.get(wp_post_id)

        if wp_post is None:
            if not listing_complete:
                # Absence proves nothing when the listing is partial; never
                # destroy the WP reference on incomplete information.
                stats["skipped_incomplete_listing"] += 1
                continue
            # Post not found in future/draft/publish — likely trashed or deleted.
            # Clear wp reference so article can be re-processed if needed.
            with get_conn() as conn:
                conn.execute(
                    """UPDATE articles
                    SET wp_post_id = NULL, wp_post_url = NULL, scheduled_publish_at = NULL
                    WHERE id = ?""",
                    (article_id,),
                )
            stats["wp_reference_cleared"] += 1
            continue

        wp_status = wp_post.get("status", "")
        wp_date = wp_post.get("date", "")  # e.g. "2026-05-05T09:00:00"
        wp_link = wp_post.get("link") or row["wp_post_url"]

        if wp_status == "publish":
            # Already published in WP — mark as published in DB if not already.
            if row["status"] != "published":
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET status = 'published', wp_post_url = ? WHERE id = ?",
                        (wp_link, article_id),
                    )
                stats["marked_published"] += 1
            else:
                stats["already_in_sync"] += 1

        elif wp_status == "future":
            # Scheduled — sync the date into scheduled_publish_at.
            current_slot = row["scheduled_publish_at"] or ""
            # Compare "YYYY-MM-DDTHH:MM" only, i.e. ignoring seconds.
            if current_slot[:16] != wp_date[:16]:
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET scheduled_publish_at = ?, wp_post_url = ? WHERE id = ?",
                        (wp_date, wp_link, article_id),
                    )
                stats["slot_updated"] += 1
            else:
                stats["already_in_sync"] += 1

        elif wp_status == "draft":
            # Draft without a schedule — clear scheduled_publish_at if set.
            if row["scheduled_publish_at"]:
                with get_conn() as conn:
                    conn.execute(
                        "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?",
                        (article_id,),
                    )
                stats["slot_cleared_draft"] += 1
            else:
                stats["already_in_sync"] += 1

    return stats
|
||||
39
backend/app/workflow.py
Normal file
39
backend/app/workflow.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
from __future__ import annotations
|
||||
|
||||
# Status names used on the UI side of the workflow; the two functions below
# translate between these and the internal status names.
UI_STATUSES = ("new", "rewrite", "publish", "published", "close", "no_image")
|
||||
|
||||
|
||||
def internal_to_ui_status(status: str | None) -> str:
|
||||
value = (status or "").strip()
|
||||
if value == "approved":
|
||||
return "publish"
|
||||
if value == "error":
|
||||
return "close"
|
||||
if value == "review":
|
||||
return "rewrite"
|
||||
if value in {"new", "rewrite", "published", "no_image"}:
|
||||
return value
|
||||
return value or "new"
|
||||
|
||||
|
||||
def ui_to_internal_status(status: str | None) -> str:
|
||||
value = (status or "").strip()
|
||||
if value == "publish":
|
||||
return "approved"
|
||||
if value == "close":
|
||||
return "error"
|
||||
if value in {"new", "rewrite", "published", "no_image"}:
|
||||
return value
|
||||
if value in {"approved", "error", "review"}:
|
||||
return value
|
||||
return value
|
||||
|
||||
|
||||
# For each UI status (key), the set of UI statuses it may move to.
# NOTE(review): enforcement happens outside this module — confirm at the call sites.
ALLOWED_UI_TRANSITIONS: dict[str, set[str]] = {
    "new": {"rewrite", "close"},
    "rewrite": {"publish", "close"},
    "publish": {"published", "close"},
    "published": {"rewrite", "close"},
    "close": {"rewrite"},
    "no_image": {"rewrite", "close"},
}
|
||||
BIN
backend/data/rss_news.db
Normal file
BIN
backend/data/rss_news.db
Normal file
Binary file not shown.
3
backend/requirements-test.txt
Normal file
3
backend/requirements-test.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
pytest==8.3.5
|
||||
pytest-cov==6.0.0
|
||||
httpx==0.28.1
|
||||
8
backend/requirements.txt
Normal file
8
backend/requirements.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
fastapi==0.116.1
|
||||
uvicorn[standard]==0.35.0
|
||||
itsdangerous==2.2.0
|
||||
pydantic-settings==2.10.1
|
||||
python-dotenv==1.1.1
|
||||
feedparser==6.0.11
|
||||
jinja2==3.1.4
|
||||
python-multipart==0.0.20
|
||||
303
backend/static/admin.css
Normal file
303
backend/static/admin.css
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
body {
|
||||
margin: 0;
|
||||
font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
|
||||
background: #f4f6f8;
|
||||
color: #1f2937;
|
||||
}
|
||||
|
||||
.topbar {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 20px 28px;
|
||||
background: #0f172a;
|
||||
color: #f8fafc;
|
||||
}
|
||||
|
||||
.container {
|
||||
padding: 20px 28px 28px 28px;
|
||||
}
|
||||
|
||||
.login {
|
||||
max-width: 520px;
|
||||
margin: 60px auto;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: #ffffff;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
|
||||
padding: 16px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
gap: 12px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.stat {
|
||||
background: #ffffff;
|
||||
border-radius: 10px;
|
||||
padding: 12px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
|
||||
}
|
||||
|
||||
.stat .label {
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.stat .value {
|
||||
font-size: 24px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.grid.two {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 16px;
|
||||
}
|
||||
|
||||
.stack {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.filter-row {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.inline {
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
th, td {
|
||||
text-align: left;
|
||||
padding: 8px;
|
||||
border-bottom: 1px solid #e5e7eb;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
input, select, button, textarea {
|
||||
padding: 8px;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #cbd5e1;
|
||||
font: inherit;
|
||||
}
|
||||
|
||||
button {
|
||||
background: #0ea5e9;
|
||||
border-color: #0ea5e9;
|
||||
color: white;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
button.secondary {
|
||||
background: #64748b;
|
||||
border-color: #64748b;
|
||||
}
|
||||
|
||||
.badge {
|
||||
display: inline-block;
|
||||
padding: 2px 8px;
|
||||
border-radius: 999px;
|
||||
background: #e2e8f0;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.badge.ok {
|
||||
background: #dcfce7;
|
||||
color: #166534;
|
||||
}
|
||||
|
||||
.badge.bad {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.badge.errcat {
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.badge.errcat-policy {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.badge.errcat-auth {
|
||||
background: #ffedd5;
|
||||
color: #9a3412;
|
||||
}
|
||||
|
||||
.badge.errcat-dns {
|
||||
background: #dbeafe;
|
||||
color: #1e40af;
|
||||
}
|
||||
|
||||
.badge.errcat-media {
|
||||
background: #fef9c3;
|
||||
color: #854d0e;
|
||||
}
|
||||
|
||||
.badge.errcat-api {
|
||||
background: #ede9fe;
|
||||
color: #5b21b6;
|
||||
}
|
||||
|
||||
.badge.errcat-unknown {
|
||||
background: #e2e8f0;
|
||||
color: #334155;
|
||||
}
|
||||
|
||||
.alert {
|
||||
margin-bottom: 12px;
|
||||
padding: 10px;
|
||||
border-radius: 8px;
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}
|
||||
|
||||
.flash {
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.flash-success {
|
||||
border-left: 4px solid #10b981;
|
||||
}
|
||||
|
||||
.flash-error {
|
||||
border-left: 4px solid #ef4444;
|
||||
}
|
||||
|
||||
.subtle {
|
||||
color: #64748b;
|
||||
font-size: 12px;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.pre {
|
||||
white-space: pre-wrap;
|
||||
line-height: 1.35;
|
||||
max-height: 220px;
|
||||
overflow: auto;
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
margin-top: 6px;
|
||||
}
|
||||
|
||||
.linkbtn {
|
||||
display: inline-block;
|
||||
padding: 8px 10px;
|
||||
border-radius: 6px;
|
||||
text-decoration: none;
|
||||
border: 1px solid #cbd5e1;
|
||||
color: #334155;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.detail-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
||||
gap: 8px 12px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.detail-item {
|
||||
background: #f8fafc;
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
display: grid;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.detail-item .k {
|
||||
font-size: 12px;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.thumb {
|
||||
width: 72px;
|
||||
height: 72px;
|
||||
object-fit: cover;
|
||||
border-radius: 8px;
|
||||
border: 1px solid #cbd5e1;
|
||||
margin-top: 6px;
|
||||
}
|
||||
|
||||
.image-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.image-card {
|
||||
border: 1px solid #e2e8f0;
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
background: #fff;
|
||||
}
|
||||
|
||||
.image-card img {
|
||||
width: 100%;
|
||||
height: 120px;
|
||||
object-fit: cover;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #e2e8f0;
|
||||
background: #f8fafc;
|
||||
}
|
||||
|
||||
.img-failed {
|
||||
opacity: 0.3;
|
||||
filter: grayscale(1);
|
||||
}
|
||||
|
||||
.image-meta {
|
||||
margin-top: 6px;
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.image-actions {
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
gap: 6px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.image-selected {
|
||||
border-color: #10b981;
|
||||
box-shadow: 0 0 0 1px rgba(16, 185, 129, 0.25);
|
||||
}
|
||||
|
||||
.image-excluded {
|
||||
opacity: 0.65;
|
||||
}
|
||||
|
||||
@media (max-width: 920px) {
|
||||
.stats {
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
}
|
||||
.grid.two {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
224
backend/templates/admin_article_detail.html
Normal file
224
backend/templates/admin_article_detail.html
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Artikel-Detail #{{ article.id }}</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Zurück</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="card">
|
||||
<h2>{{ article.title }}</h2>
|
||||
<div class="detail-grid">
|
||||
<div class="detail-item"><span class="k">Status</span><span><span class="badge">{{ article.status_ui }}</span></span></div>
|
||||
<div class="detail-item"><span class="k">Artikel-Datum</span><span>{{ article.published_at or "-" }}</span></div>
|
||||
{# Append the unit only when an age is known; otherwise show a bare dash instead of "- Tage". #}
<div class="detail-item"><span class="k">Alter</span><span>{% if article.days_old is not none %}{{ article.days_old }} Tage{% else %}-{% endif %}</span></div>
|
||||
<div class="detail-item"><span class="k">Relevanz</span><span>{{ article.relevance }}</span></div>
|
||||
<div class="detail-item"><span class="k">Autor</span><span>{{ article.author or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Feed</span><span>{{ feed.name if feed else "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Quelle Snapshot</span><span>{{ article.source_name_snapshot or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Lizenz Snapshot</span><span>{{ article.source_license_name_snapshot or "-" }}</span></div>
|
||||
<div class="detail-item"><span class="k">Terms Snapshot</span><span>{{ article.source_terms_url_snapshot or "-" }}</span></div>
|
||||
</div>
|
||||
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
|
||||
{% if article.canonical_url %}<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>{% endif %}
|
||||
{% if article.summary %}
|
||||
<p><strong>Summary:</strong> {{ article.summary }}</p>
|
||||
{% endif %}
|
||||
<p><strong>WordPress Post:</strong>
|
||||
{% if article.wp_post_url %}
|
||||
<a href="{{ article.wp_post_url }}" target="_blank" rel="noopener">#{{ article.wp_post_id }}</a>
|
||||
{% elif article.wp_post_id %}
|
||||
#{{ article.wp_post_id }}
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</p>
|
||||
<p><strong>Publish Attempts:</strong> {{ article.publish_attempts or 0 }} | <strong>Letzter Fehler:</strong> {{ article.publish_last_error or "-" }}</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Checkliste</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Kriterium</th><th>Status</th><th>Wert</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for c in checklist %}
|
||||
<tr>
|
||||
<td>{{ c.label }}</td>
|
||||
<td>
|
||||
{% if c.status == "ok" %}
|
||||
<span class="badge ok">OK</span>
|
||||
{% else %}
|
||||
<span class="badge bad">Fehlt</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ c.value }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Extrahierte Daten</h2>
|
||||
<p><strong>Bilder:</strong> {{ article.image_entries|length if article.image_entries else 0 }}</p>
|
||||
{% if article.selected_image_url %}
|
||||
<p><strong>Ausgewähltes Hauptbild:</strong> <a href="{{ article.selected_image_url }}" target="_blank" rel="noopener">{{ article.selected_image_url }}</a></p>
|
||||
{% if article.selected_image_proxy_url %}
|
||||
<img src="{{ article.selected_image_proxy_url }}" alt="Ausgewähltes Hauptbild" class="thumb" loading="lazy" />
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if article.image_entries %}
|
||||
{% if article.image_selection %}
|
||||
<details>
|
||||
<summary>Automatische Bildauswahl (Score + Gründe)</summary>
|
||||
<div class="subtle">Primärbild (Auto): {{ article.image_selection.primary or "-" }}</div>
|
||||
<div class="subtle">Ausgewählt: {{ article.image_selection.selected_count or 0 }} / Kandidaten: {{ article.image_selection.total_candidates or 0 }}</div>
|
||||
{% if article.image_selection.ranked %}
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Bild</th><th>Score</th><th>Gründe</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for r in article.image_selection.ranked %}
|
||||
<tr>
|
||||
<td><a href="{{ r.url }}" target="_blank" rel="noopener">{{ r.url }}</a></td>
|
||||
<td>{{ r.score }}</td>
|
||||
<td>{{ r.reasons|join(", ") if r.reasons else "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% endif %}
|
||||
</details>
|
||||
{% endif %}
|
||||
<div class="image-grid">
|
||||
{% for image in article.image_entries %}
|
||||
<article class="image-card {{ 'image-selected' if image.is_selected else '' }} {{ 'image-excluded' if image.is_excluded else '' }}">
|
||||
<a href="{{ image.url }}" target="_blank" rel="noopener">
|
||||
<img src="{{ image.proxy_url }}" data-fallback-src="{{ image.url }}" alt="Artikelbild" loading="lazy" onerror="if(!this.dataset.fallbackUsed){this.dataset.fallbackUsed='1';this.src=this.dataset.fallbackSrc;}else{this.classList.add('img-failed');}" />
|
||||
</a>
|
||||
<div class="image-meta">
|
||||
{% if image.is_selected %}<span class="badge ok">Ausgewählt</span>{% endif %}
|
||||
{% if image.is_excluded %}<span class="badge bad">Ausgeblendet</span>{% endif %}
|
||||
{% if image.is_irrelevant_hint %}<span class="badge">evtl. irrelevant</span>{% endif %}
|
||||
</div>
|
||||
<div class="image-actions">
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="select" />
|
||||
<button type="submit">Als Hauptbild</button>
|
||||
</form>
|
||||
{% if not image.is_excluded %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="exclude" />
|
||||
<button type="submit" class="secondary">Ausblenden</button>
|
||||
</form>
|
||||
{% else %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/images/decision">
|
||||
<input type="hidden" name="image_url" value="{{ image.url }}" />
|
||||
<input type="hidden" name="action" value="restore" />
|
||||
<button type="submit" class="secondary">Einblenden</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="subtle"><a href="{{ image.url }}" target="_blank" rel="noopener">{{ image.url }}</a></div>
|
||||
</article>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if article.press_contact or article.extraction.press_contact %}
|
||||
<p><strong>Pressekontakt</strong></p>
|
||||
<div class="pre">{{ article.press_contact or article.extraction.press_contact }}</div>
|
||||
{% endif %}
|
||||
{% if article.extraction.extraction_error %}
|
||||
<p class="subtle">Extraktionsfehler: {{ article.extraction.extraction_error }}</p>
|
||||
{% endif %}
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Volltext</h2>
|
||||
<div class="pre">{{ article.content_raw or "-" }}</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Rewrite-Text (editierbar)</h2>
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/rewrite-save" class="stack">
|
||||
<textarea name="content_rewritten" rows="14" style="width:100%;">{{ article.content_rewritten or "" }}</textarea>
|
||||
<button type="submit">Rewrite-Text speichern</button>
|
||||
</form>
|
||||
{% if article.meta.generated_tags %}
|
||||
<p><strong>Generierte Tags:</strong> {{ article.meta.generated_tags|join("; ") }}</p>
|
||||
{% endif %}
|
||||
<p class="subtle">Dieser Text wird für den WordPress-Entwurf verwendet, falls vorhanden.</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Status ändern</h2>
|
||||
{% if article.status_ui in ["new", "rewrite"] %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/rewrite-run" class="row" style="margin-bottom:8px;">
|
||||
<button type="submit">Rewrite ausführen (OpenAI)</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
{% if article.status_ui == "published" %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/reopen" class="row" style="margin-bottom:8px;">
|
||||
<button type="submit">Zurück in Rewrite-Workflow</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/transition" class="row">
|
||||
<select name="target_status">
|
||||
{% for s in allowed_transitions %}
|
||||
<option value="{{ s }}">{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<input name="note" placeholder="Notiz" />
|
||||
<button type="submit" class="secondary">Setzen</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>WordPress Publish Queue</h2>
|
||||
{% if article.publish_ready %}
|
||||
<p><span class="badge ok">Publish bereit</span></p>
|
||||
{% else %}
|
||||
<p><span class="badge bad">Publish blockiert</span></p>
|
||||
{% if article.publish_blockers %}
|
||||
<ul>
|
||||
{% for reason in article.publish_blockers %}
|
||||
<li class="subtle">{{ reason }}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<p class="subtle">Voraussetzungen: Status `publish` und Hauptbild gesetzt.</p>
|
||||
<form method="post" action="/admin/articles/{{ article.id }}/publish-enqueue" class="row">
|
||||
<input name="max_attempts" value="3" />
|
||||
<button type="submit" {% if not article.publish_ready %}disabled{% endif %}>In Queue einreihen</button>
|
||||
</form>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
221
backend/templates/admin_article_list.html
Normal file
221
backend/templates/admin_article_list.html
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
<style>
|
||||
.al-table { width: 100%; border-collapse: collapse; }
|
||||
.al-table th, .al-table td { padding: 8px 10px; border-bottom: 1px solid #e5e7eb; vertical-align: middle; }
|
||||
.al-table th { background: #f3f4f6; font-size: 0.85em; text-transform: uppercase; letter-spacing: .04em; }
|
||||
.al-table tr:hover td { background: #fafafa; }
|
||||
.al-thumb { width: 72px; height: 52px; object-fit: cover; border-radius: 4px; display: block; }
|
||||
.al-thumb-placeholder { width: 72px; height: 52px; background: #e5e7eb; border-radius: 4px; display: flex; align-items: center; justify-content: center; color: #9ca3af; font-size: 1.4em; }
|
||||
.al-title { font-weight: 600; font-size: 0.95em; }
|
||||
.al-excerpt { font-size: 0.82em; color: #6b7280; margin-top: 3px; }
|
||||
.wp-id-input { width: 90px; font-family: monospace; font-size: 0.9em; padding: 4px 6px; border: 1px solid #d1d5db; border-radius: 4px; }
|
||||
.wp-id-input.changed { border-color: #f59e0b; background: #fffbeb; font-weight: bold; }
|
||||
.wp-link { font-size: 0.8em; margin-top: 3px; display: block; }
|
||||
.sticky-bar { position: sticky; top: 0; z-index: 100; background: #1e3a5f; color: #fff; padding: 10px 20px; display: flex; align-items: center; gap: 1.5rem; box-shadow: 0 2px 8px rgba(0,0,0,.2); }
|
||||
.sticky-bar button { background: #f59e0b; color: #000; border: none; padding: 8px 20px; border-radius: 6px; font-weight: bold; cursor: pointer; font-size: 0.95em; }
|
||||
.sticky-bar button:disabled { background: #9ca3af; color: #fff; cursor: default; }
|
||||
.change-badge { background: #f59e0b; color: #000; border-radius: 12px; padding: 2px 10px; font-weight: bold; font-size: 0.85em; display: none; }
|
||||
.change-badge.visible { display: inline; }
|
||||
.filter-bar { display: flex; gap: 1rem; align-items: flex-end; flex-wrap: wrap; margin-bottom: 1rem; }
|
||||
.filter-bar label { font-size: 0.85em; color: #6b7280; display: block; margin-bottom: 3px; }
|
||||
.filter-bar input, .filter-bar select { padding: 6px 10px; border: 1px solid #d1d5db; border-radius: 4px; font-size: 0.9em; }
|
||||
.pagination { display: flex; gap: 8px; align-items: center; justify-content: center; margin-top: 1.5rem; flex-wrap: wrap; }
|
||||
.pagination a, .pagination span { padding: 6px 12px; border: 1px solid #d1d5db; border-radius: 4px; font-size: 0.9em; text-decoration: none; color: #374151; }
|
||||
.pagination .current { background: #1e3a5f; color: #fff; border-color: #1e3a5f; font-weight: bold; }
|
||||
.pagination a:hover { background: #f3f4f6; }
|
||||
.badge-sm { padding: 2px 7px; border-radius: 10px; font-size: 0.75em; font-weight: 600; }
|
||||
.badge-new { background: #dbeafe; color: #1e40af; }
|
||||
.badge-approved { background: #d1fae5; color: #065f46; }
|
||||
.badge-error { background: #fee2e2; color: #991b1b; }
|
||||
.badge-published { background: #ede9fe; color: #5b21b6; }
|
||||
.badge-review { background: #fef3c7; color: #92400e; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Artikelliste</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Dashboard</a>
|
||||
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<!-- Filter bar (outside main form so it doesn't submit with bulk save) -->
|
||||
<section class="card" style="padding-bottom: 0.5rem;">
|
||||
<form method="get" action="/admin/article-list">
|
||||
<div class="filter-bar">
|
||||
<div>
|
||||
<label>Suche (Titel / ID)</label>
|
||||
<input type="text" name="search" value="{{ search }}" placeholder="z.B. Camping …" />
|
||||
</div>
|
||||
<div>
|
||||
<label>Status</label>
|
||||
<select name="status_filter">
|
||||
<option value="">Alle</option>
|
||||
{% for s in ["new","review","approved","published","error","no_image"] %}
|
||||
<option value="{{ s }}" {% if status_filter == s %}selected{% endif %}>{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
<div style="padding-bottom:1px;">
|
||||
<button type="submit">Filtern</button>
|
||||
<a href="/admin/article-list" class="linkbtn" style="margin-left:4px;">Reset</a>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
<p class="subtle" style="margin: 4px 0 0;">{{ total }} Artikel gesamt · Seite {{ page }} / {{ total_pages }} · {{ page_size }} pro Seite</p>
|
||||
</section>
|
||||
|
||||
<!-- Main form for bulk WP ID editing -->
|
||||
<form method="post" action="/admin/article-list/update" id="bulk-form">
|
||||
<!-- Pass filter/page state so redirect goes back to same view -->
|
||||
<input type="hidden" name="page" value="{{ page }}">
|
||||
<input type="hidden" name="status_filter" value="{{ status_filter }}">
|
||||
<input type="hidden" name="search" value="{{ search }}">
|
||||
|
||||
<div class="sticky-bar">
|
||||
<button type="submit" id="save-btn" disabled>💾 Änderungen speichern</button>
|
||||
<span class="change-badge" id="change-badge">0 Änderungen</span>
|
||||
<span style="font-size:0.85em;opacity:.8;">Nur geänderte WP-IDs werden gespeichert. Danach WP-Sync ausführen.</span>
|
||||
</div>
|
||||
|
||||
<section class="card" style="padding: 0; overflow-x: auto;">
|
||||
<table class="al-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width:80px;">Bild</th>
|
||||
<th>Titel & Kurztext</th>
|
||||
<th style="width:90px;">Status</th>
|
||||
<th style="width:110px;">Datum</th>
|
||||
<th style="width:140px;">WP ID</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for a in articles %}
|
||||
<tr>
|
||||
<td>
|
||||
{% if a.thumb_proxy %}
|
||||
<a href="{{ a.thumb_url }}" target="_blank" rel="noopener">
|
||||
<img src="{{ a.thumb_proxy }}"
|
||||
class="al-thumb"
|
||||
alt="Vorschau"
|
||||
loading="lazy"
|
||||
onerror="this.style.display='none';this.nextElementSibling.style.display='flex';" />
|
||||
<div class="al-thumb-placeholder" style="display:none;">🖼</div>
|
||||
</a>
|
||||
{% else %}
|
||||
<div class="al-thumb-placeholder">🖼</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<div class="al-title">
|
||||
<a href="/admin/articles/{{ a.id }}">#{{ a.id }} {{ a.title }}</a>
|
||||
</div>
|
||||
{% if a.excerpt %}
|
||||
<div class="al-excerpt">{{ a.excerpt }}</div>
|
||||
{% endif %}
|
||||
{% if a.feed_name %}
|
||||
<div class="al-excerpt" style="margin-top:4px;">📡 {{ a.feed_name }}</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<span class="badge-sm badge-{{ a.status }}">{{ a.status }}</span>
|
||||
</td>
|
||||
<td style="font-size:0.82em;">
|
||||
{% if a.scheduled_publish_at %}
|
||||
📅 {{ a.scheduled_publish_at[:16] }}
|
||||
{% elif a.published_at %}
|
||||
{{ a.published_at[:10] }}
|
||||
{% else %}
|
||||
—
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<!-- Hidden original value for change detection -->
|
||||
<input type="hidden" name="orig_{{ a.id }}" value="{{ a.wp_post_id or '' }}">
|
||||
<input
|
||||
type="text"
|
||||
name="wp_{{ a.id }}"
|
||||
value="{{ a.wp_post_id or '' }}"
|
||||
data-orig="{{ a.wp_post_id or '' }}"
|
||||
class="wp-id-input"
|
||||
placeholder="—"
|
||||
inputmode="numeric"
|
||||
pattern="[0-9]*"
|
||||
/>
|
||||
{% if a.wp_post_url %}
|
||||
<a href="{{ a.wp_post_url }}" target="_blank" rel="noopener" class="wp-link">↗ WP öffnen</a>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</form>
|
||||
|
||||
<!-- Pagination (outside form) -->
|
||||
<div class="pagination">
  {# Pagination links carry the active filter so paging keeps the current view.
     status_filter and search are request-supplied values: they are passed through
     urlencode so characters such as '&', '#', '+', '%' or spaces cannot truncate
     or alter the query string (HTML autoescaping alone does not URL-encode). #}
  {% if page > 1 %}
  <a href="?page=1&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">«</a>
  <a href="?page={{ page - 1 }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">‹ Zurück</a>
  {% endif %}

  {# Window of page numbers: up to two before and two after the current page,
     clamped to the range 1..total_pages. #}
  {% for p in range([1, page - 2]|max, [total_pages + 1, page + 3]|min) %}
  {% if p == page %}
  <span class="current">{{ p }}</span>
  {% else %}
  <a href="?page={{ p }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">{{ p }}</a>
  {% endif %}
  {% endfor %}

  {% if page < total_pages %}
  <a href="?page={{ page + 1 }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">Weiter ›</a>
  <a href="?page={{ total_pages }}&status_filter={{ status_filter|urlencode }}&search={{ search|urlencode }}">»</a>
  {% endif %}
</div>
|
||||
</main>
|
||||
|
||||
<script>
|
||||
(function () {
  // Elements of the bulk WP-ID editor: the submit button, the change counter
  // badge, and every editable WP-ID field in the table.
  const saveButton = document.getElementById('save-btn');
  const changeBadge = document.getElementById('change-badge');
  const wpIdFields = document.querySelectorAll('.wp-id-input');

  // A field counts as changed when its trimmed value differs from the trimmed
  // original value stored in its data-orig attribute.
  function differsFromOriginal(field) {
    return field.value.trim() !== field.dataset.orig.trim();
  }

  // Re-evaluates every field: marks changed ones with the "changed" class,
  // enables the save button only when at least one field changed, and updates
  // the badge text and visibility with the number of changes.
  function countChanges() {
    let changedCount = 0;
    for (const field of wpIdFields) {
      const dirty = differsFromOriginal(field);
      field.classList.toggle('changed', dirty);
      if (dirty) {
        changedCount += 1;
      }
    }

    saveButton.disabled = changedCount === 0;

    const noun = changedCount === 1 ? ' Änderung' : ' Änderungen';
    changeBadge.textContent = changedCount + noun;
    changeBadge.classList.toggle('visible', changedCount > 0);
  }

  for (const field of wpIdFields) {
    field.addEventListener('input', countChanges);
  }

  // Initial pass so the UI state matches the field values present at load time.
  countChanges();
})();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
84
backend/templates/admin_connectivity.html
Normal file
84
backend/templates/admin_connectivity.html
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Connectivity Check</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Zurück</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
<section class="stats">
|
||||
<article class="stat">
|
||||
<div class="label">Checks</div>
|
||||
<div class="value">{{ checks|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">OK</div>
|
||||
<div class="value">{{ ok_count }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Fehler</div>
|
||||
<div class="value">{{ error_count }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Zeitpunkt</div>
|
||||
<div class="value">Live</div>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ziele</h2>
|
||||
<p class="subtle">Geprüft werden DNS-Auflösung, TCP-Erreichbarkeit und bei URLs ein HTTP-Request.</p>
|
||||
<form method="get" action="/admin/connectivity" class="row">
|
||||
<button type="submit">Checks neu ausführen</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ergebnis</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Status</th><th>Name</th><th>Typ</th><th>Ziel</th><th>DNS</th><th>TCP</th><th>HTTP</th><th>Dauer</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for c in checks %}
|
||||
<tr>
|
||||
<td>{% if c.ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">Fehler</span>{% endif %}</td>
|
||||
<td>{{ c.label }}</td>
|
||||
<td>{{ c.kind }}</td>
|
||||
<td><code>{{ c.target }}</code></td>
|
||||
<td>
|
||||
{% if c.dns_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.dns_info }}</div>
|
||||
</td>
|
||||
<td>
|
||||
{% if c.tcp_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.tcp_info }}</div>
|
||||
</td>
|
||||
<td>
|
||||
{% if c.http_ok %}<span class="badge ok">OK</span>{% else %}<span class="badge bad">FAIL</span>{% endif %}
|
||||
<div class="subtle">{{ c.http_info }}</div>
|
||||
</td>
|
||||
<td>{{ c.duration_ms }} ms</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
405
backend/templates/admin_dashboard.html
Normal file
405
backend/templates/admin_dashboard.html
Normal file
|
|
@ -0,0 +1,405 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>rss-news Admin Dashboard</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/article-list">Artikelliste</a>
|
||||
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
|
||||
<a class="linkbtn" href="/admin/connectivity">Connectivity Check</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="stats">
|
||||
<article class="stat">
|
||||
<div class="label">Quellen</div>
|
||||
<div class="value">{{ sources|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Feeds</div>
|
||||
<div class="value">{{ feeds|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Artikel</div>
|
||||
<div class="value">{{ articles|length }}</div>
|
||||
</article>
|
||||
<article class="stat">
|
||||
<div class="label">Runs</div>
|
||||
<div class="value">{{ runs|length }}</div>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="grid two">
|
||||
<article class="card">
|
||||
<h2>Quelle anlegen</h2>
|
||||
<form method="post" action="/admin/sources/create" class="stack">
|
||||
<input name="name" placeholder="Name" required />
|
||||
<input name="base_url" placeholder="Base URL" />
|
||||
<input name="terms_url" placeholder="Terms URL" />
|
||||
<input name="license_name" placeholder="Lizenzname" />
|
||||
<select name="risk_level">
|
||||
<option value="green">green</option>
|
||||
<option value="yellow" selected>yellow</option>
|
||||
<option value="red">red</option>
|
||||
</select>
|
||||
<input name="last_reviewed_at" placeholder="last_reviewed_at (ISO)" />
|
||||
<button type="submit">Quelle speichern</button>
|
||||
</form>
|
||||
</article>
|
||||
|
||||
<article class="card">
|
||||
<h2>Feed anlegen</h2>
|
||||
<form method="post" action="/admin/feeds/create" class="stack">
|
||||
<input name="name" placeholder="Feed Name" required />
|
||||
<input name="url" placeholder="https://..." required />
|
||||
<label>Quelle</label>
|
||||
<select name="source_id">
|
||||
<option value="">-- keine --</option>
|
||||
{% for s in sources %}
|
||||
<option value="{{ s.id }}">{{ s.name }} (#{{ s.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit">Feed speichern</button>
|
||||
</form>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Ingestion starten</h2>
|
||||
<form method="post" action="/admin/ingestion/run" class="row">
|
||||
<select name="feed_id">
|
||||
<option value="">Alle aktivierten Feeds</option>
|
||||
{% for f in feeds %}
|
||||
<option value="{{ f.id }}">{{ f.name }} (#{{ f.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit">Ingestion starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Publisher ausführen</h2>
|
||||
<form method="post" action="/admin/publisher/run" class="row">
|
||||
<input name="max_jobs" value="10" />
|
||||
<button type="submit">Publisher Run starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Rewrite Run (geplante Artikel)</h2>
|
||||
<p class="subtle">Verarbeitet alle Artikel im Status <code>rewrite</code> und setzt sie auf <code>publish</code>.</p>
|
||||
<form method="post" action="/admin/rewrite/run" class="row">
|
||||
<input name="max_jobs" value="10" />
|
||||
<button type="submit">Rewrite Run starten</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Quellen + Policy</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>Risk</th><th>Lizenz</th><th>Terms</th><th>Policy</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in sources %}
|
||||
<tr>
|
||||
<td>{{ s.id }}</td>
|
||||
<td>{{ s.name }}</td>
|
||||
<td>{{ s.risk_level }}</td>
|
||||
<td>{{ s.license_name or "-" }}</td>
|
||||
<td>{{ s.terms_url or "-" }}</td>
|
||||
<td>
|
||||
{% if source_policy[s.id] %}
|
||||
<span class="badge bad">BLOCKED ({{ source_policy[s.id]|length }})</span>
|
||||
<div class="subtle">{{ source_policy[s.id]|join(", ") }}</div>
|
||||
{% else %}
|
||||
<span class="badge ok">OK</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Quellen verwalten</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>URLs</th><th>Meta</th><th>Aktionen</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in sources %}
|
||||
{% set source_form_id = 'source-update-' ~ s.id %}
|
||||
<tr>
|
||||
<td>#{{ s.id }}</td>
|
||||
<td>
|
||||
<input form="{{ source_form_id }}" name="name" value="{{ s.name }}" required />
|
||||
</td>
|
||||
<td>
|
||||
<input form="{{ source_form_id }}" name="base_url" value="{{ s.base_url or '' }}" placeholder="Base URL" />
|
||||
<input form="{{ source_form_id }}" name="terms_url" value="{{ s.terms_url or '' }}" placeholder="Terms URL" />
|
||||
<input form="{{ source_form_id }}" name="license_name" value="{{ s.license_name or '' }}" placeholder="Lizenz" />
|
||||
</td>
|
||||
<td>
|
||||
<select form="{{ source_form_id }}" name="risk_level">
|
||||
<option value="green" {% if s.risk_level == 'green' %}selected{% endif %}>green</option>
|
||||
<option value="yellow" {% if s.risk_level == 'yellow' %}selected{% endif %}>yellow</option>
|
||||
<option value="red" {% if s.risk_level == 'red' %}selected{% endif %}>red</option>
|
||||
</select>
|
||||
<select form="{{ source_form_id }}" name="is_enabled">
|
||||
<option value="1" {% if s.is_enabled %}selected{% endif %}>aktiv</option>
|
||||
<option value="0" {% if not s.is_enabled %}selected{% endif %}>inaktiv</option>
|
||||
</select>
|
||||
<input form="{{ source_form_id }}" name="last_reviewed_at" value="{{ s.last_reviewed_at or '' }}" placeholder="last_reviewed_at" />
|
||||
<input form="{{ source_form_id }}" name="notes" value="{{ s.notes or '' }}" placeholder="Notiz" />
|
||||
</td>
|
||||
<td>
|
||||
<div class="inline">
|
||||
<form method="post" action="/admin/sources/{{ s.id }}/update" id="{{ source_form_id }}" class="inline">
|
||||
<button type="submit" class="secondary">Speichern</button>
|
||||
</form>
|
||||
<form method="post" action="/admin/sources/{{ s.id }}/delete" class="inline" onsubmit="return confirm('Quelle wirklich löschen?');">
|
||||
<button type="submit">Löschen</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Feeds verwalten</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Name</th><th>URL</th><th>Quelle</th><th>Status</th><th>Aktionen</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for f in feeds %}
|
||||
{% set feed_form_id = 'feed-update-' ~ f.id %}
|
||||
<tr>
|
||||
<td>#{{ f.id }}</td>
|
||||
<td>
|
||||
<input form="{{ feed_form_id }}" name="name" value="{{ f.name }}" required />
|
||||
</td>
|
||||
<td><input form="{{ feed_form_id }}" name="url" value="{{ f.url }}" required /></td>
|
||||
<td>
|
||||
<select form="{{ feed_form_id }}" name="source_id">
|
||||
<option value="">-- keine --</option>
|
||||
{% for s in sources %}
|
||||
<option value="{{ s.id }}" {% if f.source_id == s.id %}selected{% endif %}>{{ s.name }} (#{{ s.id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</td>
|
||||
<td>
|
||||
<select form="{{ feed_form_id }}" name="is_enabled">
|
||||
<option value="1" {% if f.is_enabled %}selected{% endif %}>aktiv</option>
|
||||
<option value="0" {% if not f.is_enabled %}selected{% endif %}>inaktiv</option>
|
||||
</select>
|
||||
</td>
|
||||
<td>
|
||||
<div class="inline">
|
||||
<form method="post" action="/admin/feeds/{{ f.id }}/update" id="{{ feed_form_id }}" class="inline">
|
||||
<button type="submit" class="secondary">Speichern</button>
|
||||
</form>
|
||||
<form method="post" action="/admin/feeds/{{ f.id }}/delete" class="inline" onsubmit="return confirm('Feed wirklich löschen?');">
|
||||
<button type="submit">Löschen</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Artikel (Review)</h2>
|
||||
<form method="get" action="/admin/dashboard" class="row filter-row">
|
||||
<label>Status-Filter</label>
|
||||
<select name="status_filter">
|
||||
<option value="" {% if not status_filter %}selected{% endif %}>alle</option>
|
||||
{% for s in status_options %}
|
||||
<option value="{{ s }}" {% if status_filter == s %}selected{% endif %}>{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<button type="submit" class="secondary">Filtern</button>
|
||||
<a href="/admin/dashboard" class="linkbtn">Reset</a>
|
||||
<a href="/api/articles/export?format=json{% if status_filter %}&status_filter={{ status_filter }}{% endif %}" class="linkbtn">Export JSON</a>
|
||||
<a href="/api/articles/export?format=csv{% if status_filter %}&status_filter={{ status_filter }}{% endif %}" class="linkbtn">Export CSV</a>
|
||||
</form>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Artikel</th><th>Status</th><th>Details</th><th>Rewrite</th><th>Transition</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for a in articles %}
|
||||
<tr>
|
||||
<td>{{ a.id }}</td>
|
||||
<td>
|
||||
<strong>{{ a.title }}</strong><br />
|
||||
<span class="subtle">Autor: {{ a.author or "-" }}</span><br />
|
||||
<span class="subtle">Datum: {{ a.published_at or "-" }} | Alter: {{ a.days_old if a.days_old is not none else "-" }} Tage | Relevanz: {{ a.relevance }}</span><br />
|
||||
<a href="{{ a.source_url }}" target="_blank" rel="noopener">Original öffnen</a>
|
||||
<br /><a href="/admin/articles/{{ a.id }}">Details anzeigen</a>
|
||||
{% if a.canonical_url and a.canonical_url != a.source_url %}
|
||||
<br /><a href="{{ a.canonical_url }}" target="_blank" rel="noopener">Canonical öffnen</a>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td><span class="badge">{{ a.status_ui }}</span></td>
|
||||
<td>
|
||||
<div class="subtle">Publish: {{ "bereit" if a.publish_ready else "blockiert" }}</div>
|
||||
{% if not a.publish_ready and a.publish_blockers %}
|
||||
<div class="subtle">{{ a.publish_blockers|join(", ") }}</div>
|
||||
{% endif %}
|
||||
{% if a.selected_image_url %}
|
||||
<div class="subtle">Hauptbild gesetzt</div>
|
||||
<a href="{{ a.selected_image_url }}" target="_blank" rel="noopener"><img src="{{ a.selected_image_proxy_url }}" data-fallback-src="{{ a.selected_image_url }}" alt="Hauptbild" class="thumb" loading="lazy" onerror="if(!this.dataset.fallbackUsed){this.dataset.fallbackUsed='1';this.src=this.dataset.fallbackSrc;}else{this.classList.add('img-failed');}" /></a>
|
||||
{% endif %}
|
||||
{% if a.summary %}
|
||||
<div><strong>Summary:</strong> {{ a.summary }}</div>
|
||||
{% endif %}
|
||||
{% if a.generated_tags %}
|
||||
<div><strong>Tags:</strong> {{ a.generated_tags|join("; ") }}</div>
|
||||
{% endif %}
|
||||
{% if a.content_raw %}
|
||||
<details>
|
||||
<summary>Volltext anzeigen</summary>
|
||||
<div class="pre">{{ a.content_raw }}</div>
|
||||
</details>
|
||||
{% endif %}
|
||||
<div class="subtle">Bilder: {{ a.extracted_images|length }}</div>
|
||||
{% if a.extracted_images %}
|
||||
<details>
|
||||
<summary>Bild-URLs</summary>
|
||||
<ul>
|
||||
{% for img in a.extracted_images %}
|
||||
<li><a href="{{ img }}" target="_blank" rel="noopener">{{ img }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</details>
|
||||
{% endif %}
|
||||
{% if a.press_contact %}
|
||||
<details>
|
||||
<summary>Pressekontakt</summary>
|
||||
<div class="pre">{{ a.press_contact }}</div>
|
||||
</details>
|
||||
{% endif %}
|
||||
{% if a.extraction_error %}
|
||||
<div class="subtle">Extraktionsfehler: {{ a.extraction_error }}</div>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if a.status_ui in ["new", "rewrite"] %}
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/rewrite-run" class="inline">
|
||||
<button type="submit">Rewrite ausführen</button>
|
||||
</form>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/transition" class="inline">
|
||||
<select name="target_status">
|
||||
{% for s in allowed_transitions.get(a.status_ui, []) %}
|
||||
<option value="{{ s }}">{{ s }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% if allowed_transitions.get(a.status_ui, []) %}
|
||||
<button type="submit" class="secondary">Setzen</button>
|
||||
{% else %}
|
||||
<span class="subtle">keine Aktion</span>
|
||||
{% endif %}
|
||||
</form>
|
||||
{% if a.status_ui == 'close' %}
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/retry" class="inline" style="margin-top:4px;">
|
||||
<button type="submit" title="Artikel auf 'neu' zurücksetzen – wird beim nächsten Pipeline-Lauf erneut verarbeitet">🔄 Wiederholen</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Runs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Typ</th><th>Status</th><th>Start</th><th>Ende</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for r in runs %}
|
||||
<tr>
|
||||
<td>{{ r.id }}</td>
|
||||
<td>{{ r.run_type }}</td>
|
||||
<td>{{ r.status }}</td>
|
||||
<td>{{ r.started_at }}</td>
|
||||
<td>{{ r.finished_at or "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Publish Jobs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Artikel</th><th>Status</th><th>Attempts</th><th>WP Post</th><th>Fehler</th><th>Hinweis</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for j in publish_jobs %}
|
||||
<tr>
|
||||
<td>{{ j.id }}</td>
|
||||
<td>#{{ j.article_id }} {{ j.article_title or "-" }}</td>
|
||||
<td>{{ j.status }}</td>
|
||||
<td>{{ j.attempts }}/{{ j.max_attempts }}</td>
|
||||
<td>
|
||||
{% if j.wp_post_url %}
|
||||
<a href="{{ j.wp_post_url }}" target="_blank" rel="noopener">#{{ j.wp_post_id }}</a>
|
||||
{% elif j.wp_post_id %}
|
||||
#{{ j.wp_post_id }}
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if j.error_message %}
|
||||
<span class="badge errcat errcat-{{ j.error_category }}">{{ j.error_category }}</span>
|
||||
<div>{{ j.error_message }}</div>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ j.error_hint or "-" }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
27
backend/templates/admin_login.html
Normal file
27
backend/templates/admin_login.html
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
<!doctype html>
{#
  Admin login page.

  Context variables used by this template:
    title - text for the <title> element
    error - truthy when the previous login attempt failed
#}
<html lang="de">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>{{ title }}</title>
  <link rel="stylesheet" href="/admin/static/admin.css" />
</head>
<body>
  <main class="container login">
    <h1>rss-news Admin</h1>
    <p>Bitte anmelden, um das Tool zu verwalten.</p>
    {% if error %}
    {# role="alert" lets assistive technology announce the failure immediately. #}
    <div class="alert" role="alert">Login fehlgeschlagen. Bitte pruefen.</div>
    {% endif %}
    <form method="post" action="/admin/login" class="card">
      {# autocomplete tokens tell browsers/password managers what each field holds. #}
      <label>Benutzername
        <input type="text" name="username" autocomplete="username" required />
      </label>
      <label>Passwort
        <input type="password" name="password" autocomplete="current-password" required />
      </label>
      <button type="submit">Anmelden</button>
    </form>
  </main>
</body>
</html>
|
||||
143
backend/templates/admin_schedule.html
Normal file
143
backend/templates/admin_schedule.html
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
<!doctype html>
{#
  Publication schedule page.

  Context variables used by this template:
    title, user            - page title and name of the logged-in user
    flash_msg, flash_type  - optional status message; 'error' selects the error style
    hours                  - iterable of hour numbers used as calendar columns
    calendar_days          - iterable of days with .any_booked, .weekday, .date_fmt, .slots
                             (each entry in .slots has .booked and .slot)
    slots                  - flat list of booked slots with .source, .formatted,
                             .article_id, .article_title, .article_status, .wp_post_url
#}
<html lang="de">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>{{ title }}</title>
  <link rel="stylesheet" href="/admin/static/admin.css" />
  <style>
    .schedule-table td, .schedule-table th { padding: 6px 10px; }
    .slot-free { color: #aaa; font-style: italic; }
    .slot-booked-db { color: #1a7a1a; font-weight: bold; }
    .slot-booked-wp { color: #b35a00; font-weight: bold; }
    .badge-db { background: #d4edda; color: #155724; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
    .badge-wp { background: #fff3cd; color: #856404; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
    .summary-bar { display: flex; gap: 1.5rem; margin-bottom: 1rem; font-size: 0.95em; }
  </style>
</head>
<body>
  <header class="topbar">
    <div>
      <h1>rss-news Veröffentlichungsplan</h1>
      <p>Angemeldet als <strong>{{ user }}</strong></p>
    </div>
    <div class="row">
      <a class="linkbtn" href="/admin/dashboard">Dashboard</a>
      <a class="linkbtn" href="/admin/connectivity">Connectivity</a>
      <form method="post" action="/admin/logout">
        <button type="submit" class="secondary">Logout</button>
      </form>
    </div>
  </header>

  <main class="container">
    {% if flash_msg %}
    <section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
      {{ flash_msg }}
    </section>
    {% endif %}

    <section class="card" style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:1rem;">
      <div>
        <h2 style="margin:0;">WordPress → DB Synchronisieren</h2>
        <p class="subtle" style="margin:4px 0 0;">Liest alle geplanten WP-Beiträge und aktualisiert die Slots in der lokalen DB.<br>Nutze dies nach manuellen Änderungen in WordPress.</p>
      </div>
      <form method="post" action="/admin/wp-sync">
        <button type="submit">🔄 Jetzt synchronisieren</button>
      </form>
    </section>

    <section class="card">
      <h2>Slot-Übersicht (nächste 60 Tage)</h2>
      <div class="summary-bar">
        <span>📅 Belegte Slots gesamt: <strong>{{ slots|length }}</strong></span>
        <span>🗄️ Aus Pipeline-DB: <strong>{{ slots|selectattr('source', 'eq', 'db')|list|length }}</strong></span>
        {# Count every non-DB slot: the tables below render anything that is not 'db' as WordPress. #}
        <span>🌐 Nur in WordPress: <strong>{{ slots|rejectattr('source', 'eq', 'db')|list|length }}</strong></span>
      </div>
      <table class="schedule-table">
        <thead>
          <tr>
            <th>Tag</th>
            {% for h in hours %}
            <th>{{ "%02d:00 Uhr"|format(h) }}</th>
            {% endfor %}
          </tr>
        </thead>
        <tbody>
          {# Only days with at least one booked slot get a row. #}
          {% for day in calendar_days %}
          {% if day.any_booked %}
          <tr>
            <td><strong>{{ day.weekday }}</strong> {{ day.date_fmt }}</td>
            {% for s in day.slots %}
            <td>
              {% if s.booked %}
              {% set info = s.slot %}
              {% if info.source == 'db' %}
              <span class="slot-booked-db">✅</span>
              <span class="badge-db">DB</span>
              <div style="font-size:0.85em;">
                {% if info.article_id %}
                <a href="/admin/articles/{{ info.article_id }}">
                  {{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %}
                </a>
                {% endif %}
                {# Fall back to "-" so a missing status is not rendered as "None". #}
                <br /><span class="subtle">Status: {{ info.article_status or "-" }}</span>
                {% if info.wp_post_url %}
                <br /><a href="{{ info.wp_post_url }}" target="_blank" rel="noopener">WP öffnen</a>
                {% endif %}
              </div>
              {% else %}
              <span class="slot-booked-wp">⚠️</span>
              <span class="badge-wp">WP</span>
              {# Fall back to "-" so a missing title is not rendered as "None". #}
              <div style="font-size:0.85em;">{{ info.article_title or "-" }}</div>
              {% endif %}
              {% else %}
              <span class="slot-free">frei</span>
              {% endif %}
            </td>
            {% endfor %}
          </tr>
          {% endif %}
          {% endfor %}
        </tbody>
      </table>
      {% if not slots %}
      <p class="subtle">Keine geplanten Beiträge in den nächsten 60 Tagen.</p>
      {% endif %}
    </section>

    <section class="card">
      <h2>Alle belegten Slots (Liste)</h2>
      <table>
        <thead>
          <tr><th>Datum/Zeit</th><th>Quelle</th><th>Artikel</th><th>Status</th><th>WordPress</th></tr>
        </thead>
        <tbody>
          {% for s in slots %}
          <tr>
            <td>{{ s.formatted }}</td>
            <td>
              {% if s.source == 'db' %}<span class="badge-db">Pipeline-DB</span>
              {% else %}<span class="badge-wp">WordPress</span>{% endif %}
            </td>
            <td>
              {% if s.article_id %}
              {# Same fallback/ellipsis handling as the calendar cell, so the link never has empty text. #}
              <a href="/admin/articles/{{ s.article_id }}">{{ (s.article_title or "Artikel")[:60] }}{% if (s.article_title or "")|length > 60 %}…{% endif %}</a>
              {% else %}
              {{ s.article_title or "-" }}
              {% endif %}
            </td>
            <td>{{ s.article_status or "-" }}</td>
            <td>
              {% if s.wp_post_url %}
              <a href="{{ s.wp_post_url }}" target="_blank" rel="noopener">Draft öffnen</a>
              {% else %}-{% endif %}
            </td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
    </section>
  </main>
</body>
</html>
|
||||
1
backend/tests/__init__.py
Normal file
1
backend/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Tests package."""
|
||||
419
backend/tests/test_admin_ui.py
Normal file
419
backend/tests/test_admin_ui.py
Normal file
|
|
@ -0,0 +1,419 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_source,
|
||||
get_article_by_id,
|
||||
upsert_article,
|
||||
)
|
||||
|
||||
|
||||
class TestAdminUi(unittest.TestCase):
    """Integration tests for the server-rendered admin UI.

    Each test runs against a fresh database file inside a temporary directory
    and uses the admin credentials injected through environment variables in
    ``setUp``.
    """

    _ADMIN_CREDENTIALS = {"username": "admin", "password": "secret"}

    def setUp(self) -> None:
        """Create an isolated environment, initialise the DB and build a client.

        All teardown work is registered via ``addCleanup`` so it also runs when
        a later step of ``setUp`` raises. Cleanups run in reverse order: clear
        the settings cache, restore the environment, remove the temp directory.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        # patch.dict restores pre-existing values instead of blindly popping them.
        env_patch = patch.dict(
            os.environ,
            {
                "APP_DB_PATH": str(Path(self.tmp_dir.name) / "admin_ui.db"),
                "APP_ADMIN_USERNAME": "admin",
                "APP_ADMIN_PASSWORD": "secret",
            },
        )
        env_patch.start()
        self.addCleanup(env_patch.stop)

        config_module.get_settings.cache_clear()
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)

    # ------------------------------------------------------------------ helpers

    def _login(self) -> None:
        """Log the shared test client in through the admin login form."""
        self.client.post(
            "/admin/login",
            data=dict(self._ADMIN_CREDENTIALS),
            follow_redirects=True,
        )

    def _create_source_with_feed(
        self,
        *,
        source_name: str,
        feed_name: str,
        risk_level: str = "green",
        last_reviewed_at=None,
    ) -> tuple:
        """Create an enabled source plus one enabled feed; return ``(source_id, feed_id)``."""
        source_id = create_source(
            SourceCreate(
                name=source_name,
                base_url="https://example.org",
                terms_url="https://example.org/terms",
                license_name="cc-by",
                risk_level=risk_level,
                is_enabled=True,
                notes=None,
                last_reviewed_at=last_reviewed_at,
            )
        )
        feed_id = create_feed(
            FeedCreate(
                name=feed_name,
                url="https://example.org/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )
        return source_id, feed_id

    def _create_article(self, feed_id, **overrides):
        """Upsert an article for ``feed_id`` and return the value of ``upsert_article``.

        ``overrides`` must supply the per-test fields (``source_article_id``,
        ``source_hash``, ``title``, ``source_url``, ``canonical_url``,
        ``source_name_snapshot``, ``status``); every other field falls back to
        the neutral defaults below.
        """
        fields = {
            "feed_id": feed_id,
            "published_at": None,
            "author": "Autor",
            "summary": "Summary",
            "content_raw": "Raw",
            "content_rewritten": None,
            "image_urls_json": None,
            "press_contact": None,
            "source_terms_url_snapshot": "https://example.org/terms",
            "source_license_name_snapshot": "cc-by",
            "legal_checked": False,
            "legal_checked_at": None,
            "legal_note": None,
            "wp_post_id": None,
            "wp_post_url": None,
            "publish_attempts": 0,
            "publish_last_error": None,
            "published_to_wp_at": None,
            "word_count": 1,
            "meta_json": "{}",
        }
        fields.update(overrides)
        return upsert_article(ArticleUpsert(**fields))

    # -------------------------------------------------------------------- tests

    def test_admin_login_and_dashboard(self) -> None:
        """The login page renders and valid credentials lead to the dashboard."""
        login_page = self.client.get("/admin/login")
        self.assertEqual(login_page.status_code, 200)
        self.assertIn("rss-news Admin", login_page.text)

        login = self.client.post(
            "/admin/login",
            data={"username": "admin", "password": "secret"},
            follow_redirects=True,
        )
        self.assertEqual(login.status_code, 200)
        self.assertIn("Admin Dashboard", login.text)

    def test_dashboard_redirects_if_not_logged_in(self) -> None:
        """Anonymous dashboard access is redirected (303) to the login page."""
        res = self.client.get("/admin/dashboard", follow_redirects=False)
        self.assertEqual(res.status_code, 303)
        self.assertEqual(res.headers.get("location"), "/admin/login")

    def test_create_feed_with_empty_source_id_does_not_error(self) -> None:
        """Submitting the feed form with an empty ``source_id`` still redirects."""
        self._login()
        # empty source_id used to cause validation issues in form parsing
        res = self.client.post(
            "/admin/feeds/create",
            data={"name": "Feed X", "url": "https://example.org/feed.xml", "source_id": ""},
            follow_redirects=False,
        )
        self.assertEqual(res.status_code, 303)
        self.assertTrue(res.headers.get("location", "").startswith("/admin/dashboard"))

    def test_article_detail_page_renders(self) -> None:
        """The detail page renders and an image selection is stored in ``meta_json``."""
        _, feed_id = self._create_source_with_feed(
            source_name="Test Source",
            feed_name="Test Feed",
            last_reviewed_at="2026-02-18T00:00:00Z",
        )
        article_id = self._create_article(
            feed_id,
            source_article_id="id-1",
            source_hash="hash-1",
            title="Titel A",
            source_url="https://example.org/a",
            canonical_url="https://example.org/a",
            author="Autor A",
            summary="Summary A",
            content_raw="Volltext A",
            image_urls_json='["https://example.org/img.jpg"]',
            press_contact="Kontakt",
            source_name_snapshot="Test Source",
            word_count=2,
            status="new",
            meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}',
        )

        self._login()
        res = self.client.get(f"/admin/articles/{article_id}", follow_redirects=True)
        self.assertEqual(res.status_code, 200)
        self.assertIn("Artikel-Detail", res.text)
        self.assertIn("Checkliste", res.text)

        decision = self.client.post(
            f"/admin/articles/{article_id}/images/decision",
            data={"image_url": "https://example.org/img.jpg", "action": "select"},
            follow_redirects=True,
        )
        self.assertEqual(decision.status_code, 200)
        self.assertIn("Ausgewähltes Hauptbild", decision.text)

        article = get_article_by_id(article_id)
        self.assertIsNotNone(article)
        # "or ''" keeps the assertion meaningful if meta_json is stored as NULL.
        self.assertIn("selected_url", article.get("meta_json") or "")

    def test_manage_source_and_feed(self) -> None:
        """Update and delete endpoints for sources and feeds all redirect with 303."""
        source_id, feed_id = self._create_source_with_feed(
            source_name="Edit Source",
            feed_name="Edit Feed",
            risk_level="yellow",
        )
        self._login()

        update_source_res = self.client.post(
            f"/admin/sources/{source_id}/update",
            data={
                "name": "Edit Source 2",
                "base_url": "https://example.org/new",
                "terms_url": "https://example.org/new-terms",
                "license_name": "cc0",
                "risk_level": "green",
                "is_enabled": "1",
                "notes": "ok",
                "last_reviewed_at": "2026-02-21T12:00:00Z",
            },
            follow_redirects=False,
        )
        self.assertEqual(update_source_res.status_code, 303)

        update_feed_res = self.client.post(
            f"/admin/feeds/{feed_id}/update",
            data={
                "name": "Edit Feed 2",
                "url": "https://example.org/feed2.xml",
                "source_id": str(source_id),
                "is_enabled": "0",
            },
            follow_redirects=False,
        )
        self.assertEqual(update_feed_res.status_code, 303)

        # Delete the feed first, then the source it referenced.
        delete_feed_res = self.client.post(f"/admin/feeds/{feed_id}/delete", follow_redirects=False)
        self.assertEqual(delete_feed_res.status_code, 303)
        delete_source_res = self.client.post(f"/admin/sources/{source_id}/delete", follow_redirects=False)
        self.assertEqual(delete_source_res.status_code, 303)

    def test_rewrite_save_and_reopen(self) -> None:
        """Saving a rewrite and reopening a published article resets it to ``rewrite``."""
        _, feed_id = self._create_source_with_feed(
            source_name="Test Source",
            feed_name="Test Feed",
            last_reviewed_at="2026-02-18T00:00:00Z",
        )
        article_id = self._create_article(
            feed_id,
            source_article_id="id-published",
            source_hash="hash-published",
            title="Titel Published",
            source_url="https://example.org/published",
            canonical_url="https://example.org/published",
            author="Autor A",
            content_rewritten="<p>Alt</p>",
            source_name_snapshot="Test Source",
            legal_checked=True,
            legal_checked_at="2026-02-21T10:00:00Z",
            wp_post_id=123,
            wp_post_url="https://example.org/?p=123",
            publish_attempts=2,
            published_to_wp_at="2026-02-21T10:10:00Z",
            status="published",
        )
        self._login()

        save_res = self.client.post(
            f"/admin/articles/{article_id}/rewrite-save",
            data={"content_rewritten": "<h2>Neu</h2><p>Text</p>"},
            follow_redirects=False,
        )
        self.assertEqual(save_res.status_code, 303)

        reopen_res = self.client.post(f"/admin/articles/{article_id}/reopen", follow_redirects=False)
        self.assertEqual(reopen_res.status_code, 303)

        article = get_article_by_id(article_id)
        self.assertIsNotNone(article)
        self.assertEqual(article.get("status"), "rewrite")
        self.assertIn("Neu", article.get("content_rewritten") or "")
        self.assertIsNone(article.get("wp_post_id"))

    @patch("backend.app.admin_ui.generate_article_tags")
    @patch("backend.app.admin_ui.rewrite_article_text")
    def test_batch_rewrite_run_processes_planned_articles(self, mock_rewrite_text, mock_tags) -> None:
        """A batch rewrite run moves a ``rewrite`` article to ``approved`` and stores tags."""
        mock_rewrite_text.return_value = "<h2>Neu</h2><p>Text</p>"
        mock_tags.return_value = ["Rheingas", "Monheim"]

        _, feed_id = self._create_source_with_feed(
            source_name="Batch Source",
            feed_name="Batch Feed",
        )
        article_id = self._create_article(
            feed_id,
            source_article_id="batch-1",
            source_hash="batch-hash-1",
            title="Batch Titel",
            source_url="https://example.org/batch",
            canonical_url="https://example.org/batch",
            source_name_snapshot="Batch Source",
            status="rewrite",
        )
        self._login()

        res = self.client.post("/admin/rewrite/run", data={"max_jobs": "10"}, follow_redirects=False)
        self.assertEqual(res.status_code, 303)

        article = get_article_by_id(article_id)
        self.assertIsNotNone(article)
        self.assertEqual(article.get("status"), "approved")
        # "or ''" keeps the assertion meaningful if meta_json is stored as NULL.
        self.assertIn("generated_tags", article.get("meta_json") or "")

    @patch("backend.app.admin_ui.urlopen")
    def test_image_proxy_returns_image_data(self, mock_urlopen) -> None:
        """The image proxy passes the upstream content type through to the client."""

        class _FakeHeaders:
            """Header stand-in that only knows ``content-type``."""

            def get(self, key: str, default=None):
                if key.lower() == "content-type":
                    return "image/jpeg"
                return default

        class _FakeResponse:
            """Context-manager stand-in for the object returned by ``urlopen``."""

            headers = _FakeHeaders()

            def read(self):
                # Minimal byte sequence: JPEG start and end markers only.
                return b"\xff\xd8\xff\xd9"

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc_val, exc_tb):
                return False

        mock_urlopen.return_value = _FakeResponse()

        self._login()
        res = self.client.get("/admin/images/proxy?url=https%3A%2F%2Fexample.org%2Fimg.jpg")
        self.assertEqual(res.status_code, 200)
        self.assertIn("image/jpeg", res.headers.get("content-type", ""))

    @patch("backend.app.admin_ui._run_connectivity_check")
    @patch("backend.app.admin_ui._build_connectivity_targets")
    def test_connectivity_page_renders(self, mock_targets, mock_check) -> None:
        """The connectivity page lists every checked target, passing or failing."""
        mock_targets.return_value = [
            {"label": "OpenAI API", "kind": "host", "value": "api.openai.com"},
            {"label": "WordPress REST", "kind": "url", "value": "https://example.org/wp-json/wp/v2"},
        ]
        # One result per target, consumed in order: first succeeds, second fails.
        mock_check.side_effect = [
            {
                "label": "OpenAI API",
                "kind": "host",
                "target": "api.openai.com",
                "dns_ok": True,
                "dns_info": "1.2.3.4",
                "tcp_ok": True,
                "tcp_info": "port 443 erreichbar",
                "http_ok": True,
                "http_info": "n/a (host-only)",
                "duration_ms": 12,
                "ok": True,
            },
            {
                "label": "WordPress REST",
                "kind": "url",
                "target": "https://example.org/wp-json/wp/v2",
                "dns_ok": False,
                "dns_info": "Name or service not known",
                "tcp_ok": False,
                "tcp_info": "-",
                "http_ok": False,
                "http_info": "-",
                "duration_ms": 10,
                "ok": False,
            },
        ]

        self._login()
        res = self.client.get("/admin/connectivity", follow_redirects=True)
        self.assertEqual(res.status_code, 200)
        self.assertIn("Connectivity Check", res.text)
        self.assertIn("OpenAI API", res.text)
        self.assertIn("WordPress REST", res.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
144
backend/tests/test_api_auth.py
Normal file
144
backend/tests/test_api_auth.py
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestApiAuth(unittest.TestCase):
|
||||
def setUp(self) -> None:
    """Create an isolated environment, initialise the DB and build a client.

    Teardown is registered with ``addCleanup`` (replacing the former
    ``tearDown``) so it also runs when a later step of ``setUp`` raises.
    Cleanups run in reverse order: clear the settings cache, restore the
    environment, remove the temp directory.
    """
    # Local import: no file-level import of ``patch`` is visible in this module.
    from unittest.mock import patch

    self.tmp_dir = tempfile.TemporaryDirectory()
    self.addCleanup(self.tmp_dir.cleanup)

    # patch.dict restores pre-existing values instead of blindly popping them.
    env_patch = patch.dict(
        os.environ,
        {
            "APP_DB_PATH": str(Path(self.tmp_dir.name) / "api.db"),
            "APP_ADMIN_USERNAME": "admin",
            "APP_ADMIN_PASSWORD": "secret",
        },
    )
    env_patch.start()
    self.addCleanup(env_patch.stop)

    config_module.get_settings.cache_clear()
    self.addCleanup(config_module.get_settings.cache_clear)

    init_db()
    self.client = TestClient(app)
|
||||
|
||||
def test_login_and_protected_endpoint(self) -> None:
    """After a successful JSON login the protected endpoint answers 200 with ``ok``."""
    credentials = {"username": "admin", "password": "secret"}
    login_response = self.client.post("/auth/login", json=credentials)
    self.assertEqual(login_response.status_code, 200)

    protected_response = self.client.get("/api/protected")
    self.assertEqual(protected_response.status_code, 200)
    payload = protected_response.json()
    self.assertTrue(payload.get("ok"))
|
||||
|
||||
def test_protected_requires_auth(self) -> None:
    """Without a prior login the protected endpoint answers 401."""
    anonymous_response = self.client.get("/api/protected")
    status = anonymous_response.status_code
    self.assertEqual(status, 401)
|
||||
|
||||
def test_run_detail_endpoint(self) -> None:
    """A run created via the API can be fetched again by its id."""
    credentials = {"username": "admin", "password": "secret"}
    login_response = self.client.post("/auth/login", json=credentials)
    self.assertEqual(login_response.status_code, 200)

    run_payload = {"run_type": "ingestion", "status": "running"}
    create_response = self.client.post("/api/runs", json=run_payload)
    self.assertEqual(create_response.status_code, 200)
    run_id = create_response.json()["id"]

    detail_response = self.client.get(f"/api/runs/{run_id}")
    self.assertEqual(detail_response.status_code, 200)
    fetched_item = detail_response.json()["item"]
    self.assertEqual(fetched_item["id"], run_id)
|
||||
|
||||
def test_source_policy_check_endpoint(self) -> None:
    """The policy check for a minimal ``yellow`` source reports it as not allowed."""
    credentials = {"username": "admin", "password": "secret"}
    login_response = self.client.post("/auth/login", json=credentials)
    self.assertEqual(login_response.status_code, 200)

    # Only name, risk level and enabled flag are sent; all other fields are omitted.
    source_payload = {
        "name": "Policy Source",
        "risk_level": "yellow",
        "is_enabled": True,
    }
    create_response = self.client.post("/api/sources", json=source_payload)
    self.assertEqual(create_response.status_code, 200)
    source_id = create_response.json()["id"]

    check_response = self.client.get(f"/api/sources/{source_id}/policy-check")
    self.assertEqual(check_response.status_code, 200)
    result = check_response.json()
    self.assertFalse(result["allowed"])
    self.assertGreaterEqual(len(result["issues"]), 1)
|
||||
|
||||
def test_articles_export_json_and_csv_contains_relevance(self) -> None:
    """Seed one source, feed and article, then check both export formats expose the relevance fields."""
    credentials = {"username": "admin", "password": "secret"}
    login_response = self.client.post("/auth/login", json=credentials)
    self.assertEqual(login_response.status_code, 200)

    source_payload = {
        "name": "Export Source",
        "base_url": "https://example.org",
        "terms_url": "https://example.org/terms",
        "license_name": "cc-by",
        "risk_level": "green",
        "is_enabled": True,
        "last_reviewed_at": "2026-02-18T00:00:00Z",
    }
    source_response = self.client.post("/api/sources", json=source_payload)
    self.assertEqual(source_response.status_code, 200)
    source_id = source_response.json()["id"]

    feed_payload = {
        "name": "Export Feed",
        "url": "https://example.org/feed.xml",
        "source_id": source_id,
        "is_enabled": True,
    }
    feed_response = self.client.post("/api/feeds", json=feed_payload)
    self.assertEqual(feed_response.status_code, 200)
    feed_id = feed_response.json()["id"]

    article_payload = {
        "feed_id": feed_id,
        "source_article_id": "exp-1",
        "source_hash": "exp-hash-1",
        "title": "Export Artikel",
        "source_url": "https://example.org/article/1",
        "canonical_url": "https://example.org/article/1",
        "published_at": "2026-02-18T00:00:00Z",
        "author": "Autor",
        "summary": "Kurz",
        "content_raw": "Langtext",
        "image_urls_json": "[\"https://example.org/img.jpg\"]",
        "press_contact": "Kontakt",
        "source_name_snapshot": "Export Source",
        "source_terms_url_snapshot": "https://example.org/terms",
        "source_license_name_snapshot": "cc-by",
        "status": "review",
    }
    article_response = self.client.post("/api/articles/upsert", json=article_payload)
    self.assertEqual(article_response.status_code, 200)

    # Fields that must appear in every export format.
    expected_fields = ("published_at", "days_old", "relevance")

    json_export = self.client.get("/api/articles/export?format=json")
    self.assertEqual(json_export.status_code, 200)
    exported = json_export.json()
    self.assertTrue(exported.get("ok"))
    self.assertGreaterEqual(exported.get("count", 0), 1)
    first_item = exported["items"][0]
    for field in expected_fields:
        self.assertIn(field, first_item)

    csv_export = self.client.get("/api/articles/export?format=csv")
    self.assertEqual(csv_export.status_code, 200)
    self.assertIn("text/csv", csv_export.headers.get("content-type", ""))
    csv_body = csv_export.text
    for field in expected_fields:
        self.assertIn(field, csv_body)
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
110
backend/tests/test_article_workflow.py
Normal file
110
backend/tests/test_article_workflow.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestArticleWorkflow(unittest.TestCase):
    """API tests for article status transitions, run against a throw-away SQLite database."""

    def setUp(self) -> None:
        """Point the app at a temporary database, initialise it and log the test client in.

        All teardown work is registered with ``addCleanup`` so it also runs when a
        later step of ``setUp`` raises; ``tearDown`` would be skipped in that case
        and the temp directory and environment variables would leak into other tests.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        env = {
            "APP_DB_PATH": str(Path(self.tmp_dir.name) / "workflow.db"),
            "APP_ADMIN_USERNAME": "admin",
            "APP_ADMIN_PASSWORD": "secret",
        }
        for key, value in env.items():
            os.environ[key] = value
            self.addCleanup(os.environ.pop, key, None)

        config_module.get_settings.cache_clear()
        # Cleanups run last-in-first-out: the settings cache is cleared first, then the
        # environment variables are removed, then the directory is deleted.
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        # Fail here rather than in every later request if the login did not succeed.
        self.assertEqual(login.status_code, 200, login.text)

    def _post_ok(self, url: str, payload: dict) -> dict:
        """POST ``payload`` as JSON, fail unless the response is 200, and return the decoded body."""
        response = self.client.post(url, json=payload)
        self.assertEqual(response.status_code, 200, response.text)
        return response.json()

    def _transition(self, article_id: int, target_status: str):
        """Request a status transition for ``article_id`` and return the raw response."""
        return self.client.post(
            f"/api/articles/{article_id}/transition",
            json={"target_status": target_status},
        )

    def _create_article(self) -> int:
        """Create a source, a feed and one article in status ``new``; return the article id."""
        source = self._post_ok(
            "/api/sources",
            {
                "name": "Workflow Source",
                "base_url": "https://example.org",
                "terms_url": "https://example.org/terms",
                "license_name": "cc-by",
                "risk_level": "green",
                "is_enabled": True,
                "last_reviewed_at": "2026-02-18T00:00:00Z",
            },
        )
        feed = self._post_ok(
            "/api/feeds",
            {
                "name": "Workflow Feed",
                "url": "https://example.org/feed.xml",
                "source_id": source["id"],
                "is_enabled": True,
            },
        )
        article = self._post_ok(
            "/api/articles/upsert",
            {
                "feed_id": feed["id"],
                "source_article_id": "wf-1",
                "source_url": "https://example.org/a1",
                "title": "Workflow Artikel",
                "summary": "s",
                "content_raw": "c",
                "status": "new",
            },
        )
        return article["id"]

    def test_valid_transition_chain(self) -> None:
        """Walk new -> rewrite -> publish -> published -> rewrite and check the final state."""
        article_id = self._create_article()

        for target in ("rewrite", "publish", "published", "rewrite"):
            step = self._transition(article_id, target)
            self.assertEqual(step.status_code, 200)

        final = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(final.status_code, 200)
        item = final.json()["item"]
        self.assertEqual(item["status"], "rewrite")
        self.assertEqual(item["status_ui"], "rewrite")

    def test_invalid_transition_rejected(self) -> None:
        """Jumping straight from ``new`` to ``published`` must be answered with 400."""
        article_id = self._create_article()
        bad = self._transition(article_id, "published")
        self.assertEqual(bad.status_code, 400)

    def test_legacy_review_endpoint_is_gone(self) -> None:
        """The old review route must answer 410."""
        article_id = self._create_article()
        bad = self.client.post(f"/api/articles/{article_id}/review", json={"decision": "approve"})
        self.assertEqual(bad.status_code, 410)

    @patch("backend.app.main.rewrite_article_text")
    def test_rewrite_run_sets_publish_status(self, mock_rewrite) -> None:
        """With the rewrite helper patched, a rewrite run must leave the article in ``publish``."""
        mock_rewrite.return_value = "<h2>Neu</h2><p>Umschreibung</p>"
        article_id = self._create_article()

        # Precondition for the run; the same transition is expected to succeed in
        # test_valid_transition_chain.
        moved = self._transition(article_id, "rewrite")
        self.assertEqual(moved.status_code, 200)

        r = self.client.post(f"/api/articles/{article_id}/rewrite-run")
        self.assertEqual(r.status_code, 200)
        self.assertEqual(r.json()["status"], "publish")

        final = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(final.status_code, 200)
        self.assertEqual(final.json()["item"]["status_ui"], "publish")
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
145
backend/tests/test_db_repositories.py
Normal file
145
backend/tests/test_db_repositories.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
RunCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_run,
|
||||
create_source,
|
||||
finish_run,
|
||||
list_articles,
|
||||
list_feeds,
|
||||
list_runs,
|
||||
list_sources,
|
||||
upsert_article,
|
||||
)
|
||||
|
||||
|
||||
class TestSQLiteRepositories(unittest.TestCase):
    """Exercises the repository functions against a fresh SQLite file per test."""

    def setUp(self) -> None:
        """Create a temporary database file and initialise the schema.

        Cleanup is registered with ``addCleanup`` so it still runs if ``init_db``
        (or any other setup step) raises; ``tearDown`` would be skipped then.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        self.db_path = str(Path(self.tmp_dir.name) / "test.db")
        os.environ["APP_DB_PATH"] = self.db_path
        self.addCleanup(os.environ.pop, "APP_DB_PATH", None)

        config_module.get_settings.cache_clear()
        # Last registered, so it runs first: clear cache, drop env var, delete directory.
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()

    def test_end_to_end_basic_crud(self) -> None:
        """Create source, feed, run and article, upsert the article again, then check the listings."""
        source_id = create_source(
            SourceCreate(
                name="GovData",
                base_url="https://data.gov.de",
                terms_url="https://www.govdata.de/dl-de/by-2-0",
                license_name="dl-de/by-2-0",
                risk_level="green",
                is_enabled=True,
                notes="test source",
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        self.assertGreater(source_id, 0)

        feed_id = create_feed(
            FeedCreate(
                name="GovData RSS",
                url="https://example.org/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )
        self.assertGreater(feed_id, 0)

        run_id = create_run(RunCreate(run_type="ingest", status="running", details="start"))
        self.assertGreater(run_id, 0)
        finish_run(run_id=run_id, status="success", details="ok")

        # Fields that are identical in both upserts below.
        shared_fields = {
            "feed_id": feed_id,
            "source_article_id": "abc-1",
            "source_hash": "hash-abc-1",
            "source_url": "https://example.org/articles/1",
            "canonical_url": "https://example.org/articles/1",
            "published_at": "2026-02-18T00:00:00Z",
            "author": "Max Mustermann",
            "source_name_snapshot": "GovData",
            "source_terms_url_snapshot": "https://www.govdata.de/dl-de/by-2-0",
            "source_license_name_snapshot": "dl-de/by-2-0",
            "publish_last_error": None,
        }

        article_id = upsert_article(
            ArticleUpsert(
                **shared_fields,
                title="Beispielartikel",
                summary="Kurzfassung",
                content_raw="Originaltext",
                content_rewritten="Umschreibung",
                image_urls_json='["https://example.org/img.jpg"]',
                press_contact="Pressekontakt X",
                legal_checked=False,
                legal_checked_at=None,
                legal_note=None,
                wp_post_id=None,
                wp_post_url=None,
                publish_attempts=0,
                published_to_wp_at=None,
                word_count=120,
                status="review",
                meta_json='{"lang":"de"}',
            )
        )
        self.assertGreater(article_id, 0)

        # The second upsert repeats feed, source_article_id, source_hash and source_url
        # and is expected to update the existing row instead of inserting a new one.
        article_id_2 = upsert_article(
            ArticleUpsert(
                **shared_fields,
                title="Beispielartikel aktualisiert",
                summary="Kurzfassung 2",
                content_raw="Originaltext 2",
                content_rewritten="Umschreibung 2",
                image_urls_json='["https://example.org/img2.jpg"]',
                press_contact="Pressekontakt Y",
                legal_checked=True,
                legal_checked_at="2026-02-18T00:10:00Z",
                legal_note="ok",
                wp_post_id=123,
                wp_post_url="https://example.org/wp/123",
                publish_attempts=1,
                published_to_wp_at="2026-02-18T00:12:00Z",
                word_count=140,
                status="approved",
                meta_json='{"lang":"de","v":2}',
            )
        )
        self.assertEqual(article_id, article_id_2)

        self.assertEqual(len(list_sources()), 1)
        self.assertEqual(len(list_feeds()), 1)
        self.assertEqual(len(list_runs()), 1)

        articles = list_articles()
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0]["title"], "Beispielartikel aktualisiert")
        self.assertEqual(articles[0]["status"], "approved")
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
245
backend/tests/test_ingestion.py
Normal file
245
backend/tests/test_ingestion.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.ingestion import run_ingestion
|
||||
from backend.app.repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_source,
|
||||
get_article_by_id,
|
||||
list_articles,
|
||||
upsert_article,
|
||||
)
|
||||
from backend.app.source_extraction import ExtractedArticle
|
||||
|
||||
|
||||
class TestIngestion(unittest.TestCase):
    """Tests ``run_ingestion`` against a temporary database with feed parsing and extraction patched."""

    def setUp(self) -> None:
        """Create a fresh database containing one enabled source and one enabled feed.

        Cleanup is registered with ``addCleanup`` so it still runs when a later
        setup step raises; ``tearDown`` would be skipped in that case.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        os.environ["APP_DB_PATH"] = str(Path(self.tmp_dir.name) / "ingestion.db")
        self.addCleanup(os.environ.pop, "APP_DB_PATH", None)

        config_module.get_settings.cache_clear()
        # Last registered, so it runs first: clear cache, drop env var, delete directory.
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()

        source_id = create_source(
            SourceCreate(
                name="Test Source",
                base_url="https://example.org",
                terms_url="https://example.org/terms",
                license_name="cc-by",
                risk_level="green",
                is_enabled=True,
                notes=None,
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        self.feed_id = create_feed(
            FeedCreate(
                name="Test Feed",
                url="https://example.org/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )

    def _seed_existing_article(self, **overrides) -> int:
        """Upsert an already-processed article on ``self.feed_id`` and return its id.

        ``overrides`` replaces or extends the shared defaults below; callers supply
        the identifying fields, rewritten content, WordPress data, status and meta.
        """
        fields = {
            "feed_id": self.feed_id,
            "published_at": None,
            "author": "Autor",
            "summary": "Alt",
            "content_raw": "Alt Raw",
            "image_urls_json": None,
            "press_contact": "Kontakt Alt",
            "source_name_snapshot": "Test Source",
            "source_terms_url_snapshot": "https://example.org/terms",
            "source_license_name_snapshot": "cc-by",
            "legal_checked": False,
            "legal_checked_at": None,
            "legal_note": None,
            "publish_last_error": None,
            "word_count": 3,
        }
        fields.update(overrides)
        return upsert_article(ArticleUpsert(**fields))

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_deduplicates_by_feed_and_guid(self, mock_parse, mock_extract_article) -> None:
        """Two feed entries sharing one id must end up as a single stored article."""
        mock_extract_article.return_value = ExtractedArticle(
            title="Artikel 1 original",
            author="Autorin A",
            canonical_url="https://example.org/article/1",
            summary="Original Summary",
            content_text="Original Volltext",
            images=["https://example.org/a.jpg"],
            press_contact="Pressekontakt: Team A",
            extraction_error=None,
        )
        mock_parse.return_value = {
            "etag": "etag-1",
            "modified": "Tue, 18 Feb 2026 10:00:00 GMT",
            "entries": [
                {
                    "id": "item-1",
                    "title": "Artikel 1",
                    "link": "https://example.org/article/1",
                    "summary": "A",
                },
                {
                    "id": "item-1",
                    "title": "Artikel 1 aktualisiert",
                    "link": "https://example.org/article/1-neu",
                    "summary": "B",
                },
            ],
        }

        stats = run_ingestion(feed_id=self.feed_id)
        self.assertEqual(stats.status, "success")
        self.assertEqual(stats.entries_seen, 2)

        # Query once and reuse the result for both the count and the field checks.
        articles = list_articles()
        self.assertEqual(len(articles), 1)
        article = articles[0]
        self.assertEqual(article["title"], "Artikel 1 original")
        self.assertEqual(article["author"], "Autorin A")
        self.assertIn("Original Volltext", article["content_raw"] or "")
        self.assertIn("Pressekontakt", article["meta_json"] or "")
        self.assertIsNotNone(article["image_urls_json"])

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_processes_any_enabled_source(self, mock_parse, mock_extract_article) -> None:
        """An enabled feed on a ``yellow`` source must still be parsed."""
        # Traffic-light ("Ampel") risk-level system removed - all enabled feeds are
        # processed regardless of risk_level.
        source_id = create_source(
            SourceCreate(
                name="Any Risk Source",
                base_url="https://example.net",
                terms_url="https://example.net/terms",
                license_name="custom",
                risk_level="yellow",
                is_enabled=True,
                notes=None,
                last_reviewed_at="2026-02-18T00:00:00Z",
            )
        )
        feed_id = create_feed(
            FeedCreate(
                name="Any Risk Feed",
                url="https://example.net/feed.xml",
                source_id=source_id,
                is_enabled=True,
            )
        )

        # Attribute-style stand-ins: an empty parsed feed and an empty extraction result.
        mock_parse.return_value = type("FP", (), {"entries": [], "etag": None, "modified": None})()
        mock_extract_article.return_value = type("E", (), {
            "title": None, "author": None, "summary": None, "content_text": None,
            "canonical_url": None, "images": [], "press_contact": None,
        })()

        stats = run_ingestion(feed_id=feed_id)
        self.assertEqual(stats.status, "success")
        # Feed was processed (feedparser was called), even with yellow risk_level
        mock_parse.assert_called_once()

    @patch("backend.app.ingestion.extract_article")
    @patch("backend.app.ingestion.feedparser.parse")
    def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None:
        """Re-ingesting known entries must not overwrite status, rewrite, WordPress id or meta."""
        existing_closed_id = self._seed_existing_article(
            source_article_id="closed-1",
            source_hash="closed-hash-1",
            title="Alt Closed",
            source_url="https://example.org/closed-article",
            canonical_url="https://example.org/closed-article",
            content_rewritten="<p>Alt Rewrite Closed</p>",
            wp_post_id=42,
            wp_post_url="https://wp.local/?p=42",
            publish_attempts=2,
            published_to_wp_at="2026-02-21T12:00:00Z",
            status="error",  # UI: close
            meta_json='{"generated_tags":["AltTag"]}',
        )
        existing_published_id = self._seed_existing_article(
            source_article_id="published-1",
            source_hash="published-hash-1",
            title="Alt Published",
            source_url="https://example.org/published-article",
            canonical_url="https://example.org/published-article",
            content_rewritten="<p>Alt Rewrite Published</p>",
            wp_post_id=77,
            wp_post_url="https://wp.local/?p=77",
            publish_attempts=3,
            published_to_wp_at="2026-02-21T12:10:00Z",
            status="published",
            meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}',
        )

        mock_extract_article.return_value = ExtractedArticle(
            title="Neu Titel",
            author="Neu Autor",
            canonical_url=None,
            summary="Neu Summary",
            content_text="Neu Volltext",
            images=["https://example.org/a.jpg"],
            press_contact=None,
            extraction_error=None,
        )
        mock_parse.return_value = {
            "etag": "etag-2",
            "modified": "Tue, 18 Feb 2026 11:00:00 GMT",
            "entries": [
                {
                    "id": "closed-1",
                    "title": "Closed Entry",
                    "link": "https://example.org/closed-article",
                    "summary": "X",
                },
                {
                    "id": "published-1",
                    "title": "Published Entry",
                    "link": "https://example.org/published-article",
                    "summary": "Y",
                },
            ],
        }

        stats = run_ingestion(feed_id=self.feed_id)
        self.assertEqual(stats.status, "success")

        closed_row = get_article_by_id(existing_closed_id)
        # A missing row should fail with a clear message, not a KeyError further down.
        self.assertIsNotNone(closed_row, "closed article row disappeared during ingestion")
        self.assertEqual(closed_row["status"], "error")
        self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "")
        self.assertEqual(closed_row.get("wp_post_id"), 42)

        published_row = get_article_by_id(existing_published_id)
        self.assertIsNotNone(published_row, "published article row disappeared during ingestion")
        self.assertEqual(published_row["status"], "published")
        self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "")
        self.assertEqual(published_row.get("wp_post_id"), 77)
        self.assertIn("generated_tags", published_row.get("meta_json") or "")
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
112
backend/tests/test_publisher.py
Normal file
112
backend/tests/test_publisher.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.db import init_db
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
class TestPublisher(unittest.TestCase):
    """API tests for the publisher queue with the WordPress publish call patched out."""

    def setUp(self) -> None:
        """Configure a temporary database plus WordPress settings and log the test client in.

        Cleanup is registered with ``addCleanup`` so the six environment variables
        and the temp directory are released even when a later setup step raises;
        ``tearDown`` would be skipped in that case.
        """
        self.tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.tmp_dir.cleanup)

        env = {
            "APP_DB_PATH": str(Path(self.tmp_dir.name) / "publisher.db"),
            "APP_ADMIN_USERNAME": "admin",
            "APP_ADMIN_PASSWORD": "secret",
            "WORDPRESS_BASE_URL": "https://example.org",
            "WORDPRESS_USERNAME": "wp-user",
            "WORDPRESS_APP_PASSWORD": "wp-pass",
        }
        for key, value in env.items():
            os.environ[key] = value
            self.addCleanup(os.environ.pop, key, None)

        config_module.get_settings.cache_clear()
        # Last registered, so it runs first: clear cache, drop env vars, delete directory.
        self.addCleanup(config_module.get_settings.cache_clear)

        init_db()
        self.client = TestClient(app)
        login = self.client.post("/auth/login", json={"username": "admin", "password": "secret"})
        # Fail here rather than in every later request if the login did not succeed.
        self.assertEqual(login.status_code, 200, login.text)

    def _post_ok(self, url: str, payload: dict) -> dict:
        """POST ``payload`` as JSON, fail unless the response is 200, and return the decoded body."""
        response = self.client.post(url, json=payload)
        self.assertEqual(response.status_code, 200, response.text)
        return response.json()

    def _create_publishable_article(self) -> int:
        """Create a source, a feed and an ``approved`` article with a selected image; return its id."""
        source = self._post_ok(
            "/api/sources",
            {
                "name": "WP Source",
                "base_url": "https://example.org",
                "terms_url": "https://example.org/terms",
                "license_name": "cc-by",
                "risk_level": "green",
                "is_enabled": True,
                "last_reviewed_at": "2026-02-18T00:00:00Z",
            },
        )
        feed = self._post_ok(
            "/api/feeds",
            {
                "name": "WP Feed",
                "url": "https://example.org/feed.xml",
                "source_id": source["id"],
                "is_enabled": True,
            },
        )
        article = self._post_ok(
            "/api/articles/upsert",
            {
                "feed_id": feed["id"],
                "source_article_id": "pub-1",
                "source_hash": "pub-hash-1",
                "title": "Publish Artikel",
                "source_url": "https://example.org/article/1",
                "canonical_url": "https://example.org/article/1",
                "published_at": "2026-02-18T00:00:00Z",
                "author": "Autor",
                "summary": "Kurz",
                "content_raw": "Langtext",
                "image_urls_json": "[\"https://example.org/img.jpg\"]",
                "press_contact": "Kontakt",
                "source_name_snapshot": "WP Source",
                "source_terms_url_snapshot": "https://example.org/terms",
                "source_license_name_snapshot": "cc-by",
                "legal_checked": True,
                "status": "approved",
                "meta_json": "{\"image_review\":{\"selected_url\":\"https://example.org/img.jpg\"}}",
            },
        )
        return article["id"]

    @patch("backend.app.publisher.publish_article_draft")
    def test_enqueue_and_run_publisher(self, mock_publish) -> None:
        """Enqueue one article, run the publisher, and check article and job state afterwards."""
        # The patched publish call reports a WordPress post id and URL.
        mock_publish.return_value = (777, "https://example.org/?p=777")
        article_id = self._create_publishable_article()

        enqueue = self.client.post("/api/publisher/enqueue", json={"article_id": article_id, "max_attempts": 3})
        self.assertEqual(enqueue.status_code, 200)

        run = self.client.post("/api/publisher/run", json={"max_jobs": 5})
        self.assertEqual(run.status_code, 200)
        stats = run.json()["stats"]
        self.assertEqual(stats["success"], 1)

        article = self.client.get(f"/api/articles/{article_id}")
        self.assertEqual(article.status_code, 200)
        item = article.json()["item"]
        self.assertEqual(item["status"], "published")
        self.assertEqual(item["wp_post_id"], 777)
        self.assertIn("?p=777", item["wp_post_url"] or "")

        jobs = self.client.get("/api/publisher/jobs")
        self.assertEqual(jobs.status_code, 200)
        self.assertGreaterEqual(len(jobs.json()["items"]), 1)
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
21
backend/tests/test_relevance.py
Normal file
21
backend/tests/test_relevance.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from datetime import datetime, timezone
|
||||
import unittest
|
||||
|
||||
from backend.app.relevance import article_age_days, article_relevance
|
||||
|
||||
|
||||
class TestRelevance(unittest.TestCase):
    """Checks the article age and relevance helpers against a fixed reference time."""

    def test_article_age_and_relevance(self) -> None:
        """Verify age in days and relevance label for a set of publication timestamps."""
        reference = datetime(2026, 2, 18, 12, 0, 0, tzinfo=timezone.utc)

        # (published_at, expected age in days or None to skip that check, expected label)
        cases = [
            ("2026-02-18T10:00:00Z", 0, "hoch"),
            ("2026-02-14T12:00:00Z", 4, "mittel"),
            ("2025-12-01T00:00:00Z", None, "alt"),
            (None, None, "unbekannt"),
        ]
        for published_at, expected_age, expected_label in cases:
            if expected_age is not None:
                self.assertEqual(article_age_days(published_at, now=reference), expected_age)
            self.assertEqual(article_relevance(published_at, now=reference), expected_label)
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
96
backend/tests/test_source_extraction.py
Normal file
96
backend/tests/test_source_extraction.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app.source_extraction import extract_article
|
||||
|
||||
|
||||
SAMPLE_HTML = """
|
||||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="Demo Meldung von Presseportal" />
|
||||
<meta name="author" content="Max Mustermann" />
|
||||
<meta name="description" content="Kurzbeschreibung aus der Originalseite" />
|
||||
<meta property="og:image" content="/images/demo.jpg" />
|
||||
<link rel="canonical" href="https://www.presseportal.de/pm/118273/6158137" />
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Dies ist der vollstaendige Inhalt des Artikels.</p>
|
||||
<p>Weitere relevante Informationen fuer die Meldung.</p>
|
||||
<h3>Pressekontakt</h3>
|
||||
<p>Musterfirma GmbH, Kontakt: presse@example.org</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
SAMPLE_HTML_AGENTUR = """
|
||||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="Demo Meldung Agentur" />
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<p>Inhalt der Meldung.</p>
|
||||
<h3>Agentur</h3>
|
||||
<p>Agenturname GmbH</p>
|
||||
<p>presse@agentur.example</p>
|
||||
<p>Original-Content von Beispiel</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class _FakeHeaders:
|
||||
@staticmethod
|
||||
def get_content_charset():
|
||||
return "utf-8"
|
||||
|
||||
|
||||
class _FakeResponse:
    """Context-manager test double that serves a fixed body, used as the patched ``urlopen`` result."""

    # Shared headers object; it reports a UTF-8 charset.
    headers = _FakeHeaders()

    def __init__(self, body: str):
        """Store ``body`` UTF-8 encoded so ``read`` hands back bytes."""
        encoded = body.encode("utf-8")
        self._body = encoded

    def read(self):
        """Return the full encoded body."""
        return self._body

    def __enter__(self):
        """Support ``with`` usage by yielding the response itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Never suppress an exception raised inside the ``with`` block."""
        return False
|
||||
|
||||
|
||||
class TestSourceExtraction(unittest.TestCase):
    """Runs ``extract_article`` on canned HTML served through a patched ``urlopen``."""

    @patch("backend.app.source_extraction.urlopen")
    def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None:
        """Title, author, canonical URL, text, summary, image and press contact come from SAMPLE_HTML."""
        page_url = "https://www.presseportal.de/pm/118273/6158137"
        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML)

        result = extract_article(page_url)

        self.assertEqual(result.title, "Demo Meldung von Presseportal")
        self.assertEqual(result.author, "Max Mustermann")
        self.assertEqual(result.canonical_url, "https://www.presseportal.de/pm/118273/6158137")
        self.assertIn("vollstaendige Inhalt", result.content_text or "")
        self.assertIn("Kurzbeschreibung", result.summary or "")
        # The page lists the image as "/images/demo.jpg"; an absolute URL is expected back.
        self.assertIn("https://www.presseportal.de/images/demo.jpg", result.images)
        self.assertIn("Pressekontakt", result.press_contact or "")
        self.assertIsNone(result.extraction_error)

    @patch("backend.app.source_extraction.urlopen")
    def test_extract_article_detects_agentur_block_as_press_contact(self, mock_urlopen) -> None:
        """A block headed 'Agentur' must be reported as the press contact."""
        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML_AGENTUR)

        result = extract_article("https://www.presseportal.de/pm/155103/6210401")

        contact = result.press_contact or ""
        for expected_fragment in ("Agentur", "Agenturname", "presse@agentur.example"):
            self.assertIn(expected_fragment, contact)
|
||||
|
||||
|
||||
# Run the tests in this module through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
139
backend/tests/test_wordpress.py
Normal file
139
backend/tests/test_wordpress.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
import os
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app import config as config_module
|
||||
from backend.app.wordpress import publish_article_draft
|
||||
|
||||
|
||||
class TestWordpressPublish(unittest.TestCase):
|
||||
def setUp(self) -> None:
    """Export the WordPress settings used by these tests and clear the cached settings."""
    os.environ.update(
        {
            "WORDPRESS_BASE_URL": "https://example.org",
            "WORDPRESS_USERNAME": "wp-user",
            "WORDPRESS_APP_PASSWORD": "wp-pass",
        }
    )
    # Clear the settings cache after changing the environment.
    config_module.get_settings.cache_clear()
|
||||
|
||||
def tearDown(self) -> None:
    """Remove the WordPress environment variables again and clear the cached settings."""
    wordpress_keys = ("WORDPRESS_BASE_URL", "WORDPRESS_USERNAME", "WORDPRESS_APP_PASSWORD")
    for name in wordpress_keys:
        os.environ.pop(name, None)
    config_module.get_settings.cache_clear()
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_sets_featured_media_when_selected_image_exists(self, mock_wp_request, mock_upload_media) -> None:
    """With a selected image in ``meta_json``, the uploaded media id must be sent as ``featured_media``."""
    mock_upload_media.return_value = 456
    mock_wp_request.return_value = {"id": 321, "link": "https://example.org/?p=321"}

    article = {
        "title": "Testartikel",
        "content_raw": "Inhalt",
        "source_url": "https://example.com/source",
        "canonical_url": "https://example.com/source",
        "meta_json": '{"image_review":{"selected_url":"https://example.com/image.jpg"}}',
    }
    result = publish_article_draft(article)
    post_id, post_url = result

    self.assertEqual(post_id, 321)
    self.assertIn("?p=321", post_url or "")
    self.assertTrue(mock_upload_media.called)

    # Inspect the keyword payload of the most recent WordPress request.
    sent = mock_wp_request.call_args.kwargs["payload"]
    self.assertEqual(sent.get("featured_media"), 456)
    body = sent.get("content", "")
    self.assertIn("<!-- wp:paragraph -->", body)
    self.assertIn("<p>Inhalt</p>", body)
    self.assertNotIn("excerpt", sent)
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_without_selected_image_has_no_featured_media(self, mock_wp_request, mock_upload_media) -> None:
    """With empty ``meta_json`` no upload may happen and the payload carries no ``featured_media``."""
    mock_wp_request.return_value = {"id": 654, "link": "https://example.org/?p=654"}

    article = {
        "title": "Testartikel",
        "content_raw": "Inhalt",
        "source_url": "https://example.com/source",
        "canonical_url": "https://example.com/source",
        "meta_json": "{}",
    }
    result = publish_article_draft(article)
    post_id, _ = result

    self.assertEqual(post_id, 654)
    self.assertFalse(mock_upload_media.called)

    # Inspect the keyword payload of the most recent WordPress request.
    sent = mock_wp_request.call_args.kwargs["payload"]
    self.assertNotIn("featured_media", sent)
    self.assertIn("<p>Inhalt</p>", sent.get("content", ""))
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
|
||||
@patch("backend.app.wordpress._wp_request")
|
||||
def test_publish_strips_feed_header_and_press_contact(self, mock_wp_request, mock_upload_media) -> None:
|
||||
mock_wp_request.return_value = {"id": 100, "link": "https://example.org/?p=100"}
|
||||
article = {
|
||||
"title": "Header Test",
|
||||
"content_raw": "21.02.2026 10:00\nFirma GmbH\n(ots)\nDas ist der eigentliche Text.\nPressekontakt: Test Person",
|
||||
"source_url": "https://example.com/source",
|
||||
"canonical_url": "https://example.com/source",
|
||||
"meta_json": "{}",
|
||||
}
|
||||
publish_article_draft(article)
|
||||
payload = mock_wp_request.call_args.kwargs["payload"]
|
||||
content = payload.get("content", "")
|
||||
self.assertNotIn("Firma GmbH", content)
|
||||
self.assertNotIn("Pressekontakt", content)
|
||||
self.assertIn("eigentliche Text", content)
|
||||
self.assertNotIn("Artikeldetails", content)
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
|
||||
@patch("backend.app.wordpress._wp_request")
|
||||
def test_publish_resolves_and_sets_tags(self, mock_wp_request, mock_upload_media) -> None:
|
||||
def _fake_wp_request(**kwargs):
|
||||
endpoint = kwargs.get("endpoint", "")
|
||||
method = kwargs.get("method", "")
|
||||
if method == "GET" and endpoint.startswith("tags?search="):
|
||||
if "Rheingas" in endpoint:
|
||||
return [{"id": 11, "name": "Rheingas"}]
|
||||
return []
|
||||
if method == "POST" and endpoint == "tags":
|
||||
name = (kwargs.get("payload") or {}).get("name")
|
||||
if name == "Gasflasche":
|
||||
return {"id": 12, "name": "Gasflasche"}
|
||||
return {"id": 13, "name": str(name)}
|
||||
if method == "POST" and endpoint == "posts":
|
||||
return {"id": 900, "link": "https://example.org/?p=900"}
|
||||
return {}
|
||||
|
||||
mock_wp_request.side_effect = _fake_wp_request
|
||||
article = {
|
||||
"title": "Tag Test",
|
||||
"content_raw": "Inhalt",
|
||||
"source_url": "https://example.com/source",
|
||||
"canonical_url": "https://example.com/source",
|
||||
"meta_json": '{"generated_tags":["Rheingas","Gasflasche"]}',
|
||||
}
|
||||
post_id, _ = publish_article_draft(article)
|
||||
self.assertEqual(post_id, 900)
|
||||
post_calls = [call for call in mock_wp_request.call_args_list if call.kwargs.get("endpoint") == "posts"]
|
||||
self.assertEqual(len(post_calls), 1)
|
||||
payload = post_calls[0].kwargs.get("payload", {})
|
||||
self.assertEqual(payload.get("tags"), [11, 12])
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
|
||||
@patch("backend.app.wordpress._wp_request")
|
||||
def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None:
|
||||
mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"}
|
||||
article = {
|
||||
"title": "Block Test",
|
||||
"content_rewritten": "<h2>Überschrift</h2><p>Absatz 1</p><ul><li>A</li><li>B</li></ul>",
|
||||
"source_url": "https://example.com/source",
|
||||
"canonical_url": "https://example.com/source",
|
||||
"meta_json": "{}",
|
||||
}
|
||||
publish_article_draft(article)
|
||||
payload = mock_wp_request.call_args.kwargs["payload"]
|
||||
content = payload.get("content", "")
|
||||
self.assertIn("<!-- wp:heading", content)
|
||||
self.assertIn("<!-- wp:paragraph -->", content)
|
||||
self.assertIn("<!-- wp:list -->", content)
|
||||
self.assertNotIn("<!-- wp:html -->", content)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
2650
data/articles.json
2650
data/articles.json
File diff suppressed because one or more lines are too long
|
|
@ -10,5 +10,33 @@
|
|||
{
|
||||
"url": "https://www.promobil.de/rss/ratgeber",
|
||||
"name": "Promobil Ratgeber"
|
||||
},
|
||||
{
|
||||
"url": "https://www.presseportal.de/rss/rss2_vts.htx?q=camping&langid=1",
|
||||
"name": "Presseportal Camping"
|
||||
},
|
||||
{
|
||||
"url": "https://www.presseportal.de/rss/rss2_vts.htx?q=wohnmobil&langid=1",
|
||||
"name": "Presseportal Wohnmobil"
|
||||
},
|
||||
{
|
||||
"url": "https://caravan-news.de/rss/schlagzeilen.php",
|
||||
"name": "Caravan News"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/16793724126187652294",
|
||||
"name": "Google Campingplatz"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/987500860911797305",
|
||||
"name": "Google VanLife"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.de/alerts/feeds/03077836356662926441/4770194054838089856",
|
||||
"name": "Google Camping Termine"
|
||||
},
|
||||
{
|
||||
"url": "https://www.google.com/alerts/feeds/03077836356662926441/14685692393152596493",
|
||||
"name": "Google Camping Messe 2025"
|
||||
}
|
||||
]
|
||||
190
docs/AUTOMATION.md
Normal file
190
docs/AUTOMATION.md
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
# Automatischer Pipeline-Betrieb
|
||||
|
||||
## Überblick
|
||||
|
||||
Das System läuft vollautomatisch und benötigt nur noch gelegentliche Telegram-Interaktion.
|
||||
|
||||
```
|
||||
N8N (2× täglich, 08:00 + 16:00 Uhr)
|
||||
└─► POST /api/n8n/pipeline (X-API-Key Header)
|
||||
├── RSS Ingestion (alle aktivierten Feeds)
|
||||
├── Relevanz-Score per GPT (0–100)
|
||||
│ ├── Score ≥ 80 → Rewrite + WP-Draft + Telegram
|
||||
│ ├── Score 60–79 → Telegram-Warnung + manueller Override möglich
|
||||
│ └── Score < 60 → Abgelehnt + tägliche Telegram-Liste
|
||||
└── Pipeline-Zusammenfassung via Telegram
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Einrichtung
|
||||
|
||||
### 1. Umgebungsvariablen setzen
|
||||
|
||||
Kopiere `backend/.env.example` nach `backend/.env` und fülle alle Felder aus:
|
||||
|
||||
```bash
|
||||
cp backend/.env.example backend/.env
|
||||
nano backend/.env
|
||||
```
|
||||
|
||||
Wichtige Variablen:
|
||||
|
||||
| Variable | Beschreibung |
|
||||
|----------|-------------|
|
||||
| `TELEGRAM_BOT_TOKEN` | Bot-Token von @BotFather |
|
||||
| `TELEGRAM_CHAT_ID` | Deine persönliche Chat-ID |
|
||||
| `TELEGRAM_WEBHOOK_SECRET` | Zufälliger String (≥ 20 Zeichen) |
|
||||
| `N8N_API_KEY` | Starker zufälliger API-Key |
|
||||
| `OPENAI_API_KEY` | OpenAI API-Key |
|
||||
| `WORDPRESS_BASE_URL` | WordPress-URL |
|
||||
| `WORDPRESS_USERNAME` | WordPress-Benutzername |
|
||||
| `WORDPRESS_APP_PASSWORD` | WordPress App-Passwort |
|
||||
|
||||
### 2. Telegram-Webhook registrieren
|
||||
|
||||
Nach dem Deployment einmalig aufrufen:
|
||||
|
||||
```bash
|
||||
curl -X POST https://news.vanityontour.de/api/telegram/setup-webhook \
|
||||
-H "Cookie: rss_news_session=<dein-session-token>"
|
||||
```
|
||||
|
||||
Oder über die Admin-UI: Settings → Telegram Webhook einrichten.
|
||||
|
||||
### 3. N8N Workflow einrichten
|
||||
|
||||
In N8N einen neuen Workflow erstellen:
|
||||
|
||||
**Trigger:** Cron
|
||||
- Zeitplan 1: `0 8 * * *` (täglich 08:00)
|
||||
- Zeitplan 2: `0 16 * * *` (täglich 16:00)
|
||||
|
||||
**Aktion:** HTTP Request
|
||||
- Method: `POST`
|
||||
- URL: `https://news.vanityontour.de/api/n8n/pipeline`
|
||||
- Header: `X-API-Key: <dein-n8n-api-key>`
|
||||
|
||||
**Fehlerbehandlung:** Bei HTTP-Fehler → E-Mail/Telegram-Alert
|
||||
|
||||
---
|
||||
|
||||
## Telegram-Befehle
|
||||
|
||||
| Befehl | Funktion |
|
||||
|--------|----------|
|
||||
| `/run` | Pipeline manuell starten |
|
||||
| `/rejected` | Abgelehnte Artikel der letzten 3 Tage anzeigen |
|
||||
| `/status` | Aktuellen Pipeline-Status |
|
||||
| `/help` | Alle Befehle anzeigen |
|
||||
|
||||
---
|
||||
|
||||
## Telegram-Benachrichtigungen
|
||||
|
||||
### Neuer Draft erstellt
|
||||
Wenn ein Artikel erfolgreich verarbeitet wurde:
|
||||
|
||||
```
|
||||
✅ Neuer Draft erstellt
|
||||
📰 [Artikel-Titel]
|
||||
🟢 Relevanz-Score: 87/100
|
||||
📅 Vorgeschlagene Veröffentlichung: Di, 24.03.2026 um 09:00 Uhr
|
||||
🏷 #VanLife #Camping #Wohnmobil
|
||||
🔗 Draft in WordPress öffnen
|
||||
|
||||
[✏️ Neu schreiben] [❌ Verwerfen]
|
||||
```
|
||||
|
||||
### Relevanz-Warnung (Score 60–79)
|
||||
```
|
||||
⚠️ Artikel mit niedrigem Relevanz-Score
|
||||
📰 [Artikel-Titel]
|
||||
🟡 Score: 72/100
|
||||
💬 Artikel behandelt hauptsächlich...
|
||||
🔗 Originalartikel
|
||||
|
||||
[➕ Trotzdem verarbeiten] [❌ Ablehnen]
|
||||
```
|
||||
|
||||
### Abgelehnte Artikel (Ende jedes Runs)
|
||||
Liste aller abgelehnten Artikel mit Override-Buttons für jeden einzelnen.
|
||||
|
||||
---
|
||||
|
||||
## Relevanz-Score
|
||||
|
||||
Der GPT-basierte Score bewertet die Themenrelevanz für den VanLife/Camping-Blog:
|
||||
|
||||
| Score | Aktion |
|
||||
|-------|--------|
|
||||
| 80–100 | Automatisch verarbeiten |
|
||||
| 60–79 | Telegram-Warnung, manueller Override |
|
||||
| 0–59 | Automatisch abgelehnt |
|
||||
|
||||
Themen, die hoch bewertet werden: Campingplätze, Stellplätze, Wohnmobile, Van-Ausbau,
|
||||
Outdoor-Equipment, Wandern, Naturreisen, Roadtrips, Camping-Tipps.
|
||||
|
||||
Schwellwerte sind in `.env` konfigurierbar:
|
||||
```
|
||||
PIPELINE_RELEVANCE_AUTO=80
|
||||
PIPELINE_RELEVANCE_WARN=60
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Veröffentlichungsplan
|
||||
|
||||
- Maximal **2 Beiträge pro Tag**
|
||||
- Bevorzugte Zeiten: **09:00 und 14:00 Uhr** (CET)
|
||||
- Gleichmäßig über die Woche verteilt
|
||||
- Der Vorschlag erscheint in der Telegram-Nachricht
|
||||
- Manuell in WordPress setzen oder über WP Scheduling-Plugin automatisieren
|
||||
|
||||
Einstellbar via:
|
||||
```
|
||||
PIPELINE_MAX_DRAFTS_PER_DAY=2
|
||||
PIPELINE_PUBLISH_HOURS=9,14
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API-Endpunkte (N8N / extern)
|
||||
|
||||
Alle externen Endpunkte benötigen den Header `X-API-Key: <N8N_API_KEY>`.
|
||||
|
||||
| Methode | Endpunkt | Funktion |
|
||||
|---------|----------|----------|
|
||||
| `POST` | `/api/n8n/pipeline` | Komplette Pipeline starten |
|
||||
| `POST` | `/api/n8n/ingest` | Nur RSS-Import (ohne Rewrite) |
|
||||
|
||||
---
|
||||
|
||||
## Deployment (Hetzner via GitHub)
|
||||
|
||||
Das Deployment läuft automatisch über GitHub Actions beim Push auf `main`:
|
||||
|
||||
1. GitHub Action führt Tests aus
|
||||
2. Bei Erfolg: SSH-Deploy auf Hetzner
|
||||
3. `pip install -r requirements.txt`
|
||||
4. Systemd-Dienst `rss-app` neu starten
|
||||
|
||||
Workflow-Dateien: `.github/workflows/test.yml` und `.github/workflows/deploy.yml`
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Pipeline läuft, aber keine Telegram-Nachrichten:**
|
||||
- `TELEGRAM_BOT_TOKEN` und `TELEGRAM_CHAT_ID` prüfen
|
||||
- Webhook-Status prüfen: `GET https://api.telegram.org/bot<TOKEN>/getWebhookInfo`
|
||||
|
||||
**N8N bekommt 401:**
|
||||
- `N8N_API_KEY` in `.env` und N8N-Workflow-Header müssen übereinstimmen
|
||||
|
||||
**Alle Artikel werden abgelehnt:**
|
||||
- `PIPELINE_RELEVANCE_WARN` temporär auf 40 senken zum Testen
|
||||
- Über `/rejected` + Override-Button manuell testen
|
||||
|
||||
**Artikel werden doppelt importiert:**
|
||||
- Deduplication läuft über `source_url` (eindeutig). Bereits verarbeitete Artikel werden nie erneut als Draft angelegt.
|
||||
91
docs/PROJECT_PLAN.md
Normal file
91
docs/PROJECT_PLAN.md
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# Projektplan (Neustart)
|
||||
|
||||
## Leitentscheidungen
|
||||
- Bestehendes Repository wird weiterverwendet.
|
||||
- Kein harter Endtermin: lauffaehig werden, dann iterativ verbessern.
|
||||
- Hetzner bleibt Laufzeitplattform.
|
||||
- WordPress (IONOS) bleibt vorerst Ziel fuer Publikation.
|
||||
- Auth initial nur mit einem User/Password.
|
||||
|
||||
## Zielbild
|
||||
Eine modulare News-Pipeline mit klaren Stufen:
|
||||
1. Feed-Ingestion
|
||||
2. Inhaltsanalyse und Normalisierung
|
||||
3. Rewrite/Anreicherung
|
||||
4. Legal- und Qualitaetschecks
|
||||
5. WordPress-Publikation (Draft-first, Queue + Retry)
|
||||
6. Monitoring/Logging
|
||||
|
||||
## Grobe Zeitplanung (ohne Fixtermine)
|
||||
- Phase 0: ca. 1 Woche
|
||||
- Phase 1: ca. 2-4 Wochen
|
||||
- Phase 2: ca. 2-3 Wochen
|
||||
- Phase 3: fortlaufend
|
||||
|
||||
## Phasen
|
||||
|
||||
### Phase 0 - Grundlagen (jetzt)
|
||||
- Doku und Wiki strukturieren
|
||||
- Source-Policy definieren
|
||||
- Redirect fuer `news.vanityontour.de` setzen
|
||||
- GitHub Project als zentrale Planung scharfstellen
|
||||
|
||||
### Phase 1 - MVP Core
|
||||
- Neues FastAPI-Projektgeruest
|
||||
- SQLite-Datenmodell (feeds, articles, runs, source_policy)
|
||||
- Feed-Import mit Duplikaterkennung
|
||||
- Admin-Login (ein User)
|
||||
- Manuelle Review vor Publish
|
||||
- Admin-UI fuer Rechtscheck, Bildauswahl, Relevanzbewertung
|
||||
|
||||
### Phase 2 - Automation
|
||||
- Job-Queue (asynchron)
|
||||
- Regelbasierte Scheduler
|
||||
- Retry/Dead-Letter-Handling
|
||||
- Robustes Error-Reporting
|
||||
- WordPress-Publisher (Draft) mit Mapping `article_id -> wp_post_id`
|
||||
|
||||
### Phase 3 - Compliance und Skalierung
|
||||
- Source-Whitelisting mit Pflichtfeldern
|
||||
- Pflicht-Attribution pro Artikel
|
||||
- Qualitaetsmetriken und Audit-Logs
|
||||
- Optional: Passkey/WebAuthn
|
||||
|
||||
## Aktueller Stand (Snapshot)
|
||||
- Backend/API + Admin-UI lauffaehig
|
||||
- Feed-Ingestion inkl. Originalartikel-Extraktion (Autor, Pressekontakt, Bilder)
|
||||
- Bildkuration:
|
||||
- automatische Scoring-Reduktion (u. a. Presseportal `story_big` priorisiert)
|
||||
- manuelle Auswahl/Ausblendung im UI
|
||||
- Rechts-/Publish-Gates aktiv:
|
||||
- `legal_checked` Pflicht
|
||||
- Hauptbild-Auswahl Pflicht
|
||||
- Status-Workflow bis `published`
|
||||
- WordPress-Publishing:
|
||||
- Queue + Retry + Job-Historie
|
||||
- Draft-Erstellung/Update erfolgreich getestet
|
||||
- Exporte:
|
||||
- JSON/CSV inkl. Datum/Alter/Relevanz + Attribution/Legal-Felder
|
||||
|
||||
## Naechste Iteration (konkret)
|
||||
1. WordPress `featured_media` Upload aus ausgewaehltem Hauptbild
|
||||
2. Publish-HTML je Artikel verfeinern (strukturierter Body + konsistenter Quellenblock)
|
||||
3. Publisher als periodischen Worker (Timer/Cron/Systemd) auf Hetzner betreiben
|
||||
4. Monitoring/Alerting fuer Queue-Fehler + WP-API Fehlercodes
|
||||
|
||||
## Architekturprinzipien
|
||||
- Idempotente Jobs
|
||||
- Trennung von UI, API, Worker
|
||||
- Strikte Validierung bei Quell-/Lizenzdaten
|
||||
- Expliziter Publish-Schritt, kein blindes Autoposting
|
||||
|
||||
## Risiken
|
||||
- Lizenz-/Nutzungsbedingungen je Quelle variieren stark
|
||||
- Feeds aendern Struktur/Verfuegbarkeit
|
||||
- WordPress-API und Auth koennen regressionsanfaellig sein
|
||||
|
||||
## Erfolgsmetriken
|
||||
- Zeit von Feed-Eingang bis Review-Ready
|
||||
- Quote sauber attribuierter Artikel
|
||||
- Fehlerrate pro Pipeline-Stufe
|
||||
- Anzahl manueller Eingriffe pro Woche
|
||||
81
docs/SOURCE_POLICY.md
Normal file
81
docs/SOURCE_POLICY.md
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
# Source Policy und Feed-Vorschlaege
|
||||
|
||||
## Grundsatz
|
||||
Es werden nur Quellen genutzt, deren Nutzungsbedingungen die geplante Nutzung erlauben oder fuer die eine explizite Genehmigung vorliegt.
|
||||
|
||||
## Pflichtdaten pro Quelle
|
||||
- Quellname
|
||||
- Feed-URL
|
||||
- Originalartikel-URL
|
||||
- Autor/Herausgeber (wenn vorhanden)
|
||||
- Lizenz/Nutzungsgrundlage
|
||||
- Einschraenkungen (kommerziell, Bearbeitung, Bildrechte, Archivierung)
|
||||
- Datum der letzten Pruefung
|
||||
- Link auf Nutzungsbedingungen
|
||||
|
||||
## Einstufung (Ampel)
|
||||
- Gruen: Nutzung fuer geplantes Modell klar erlaubt
|
||||
- Gelb: teilklar/mit Einschraenkungen, manuelle Pruefung erforderlich
|
||||
- Rot: fuer das Modell nicht geeignet ohne Zusatzvertrag
|
||||
|
||||
## Verbindliche Regeln
|
||||
- Keine neue Quelle ohne Eintrag im Source-Register
|
||||
- Kein automatischer Publish bei Gelb/Rot
|
||||
- Bilder separat pruefen (Textrecht != Bildrecht)
|
||||
- Quartalsweiser Re-Check der Terms
|
||||
|
||||
## Ersteinschaetzung (Stand: 16.02.2026)
|
||||
|
||||
### Rot
|
||||
1. Reuters / Thomson Reuters
|
||||
- Grund: Inhalte sind urheberrechtlich geschuetzt; Reproduktion/Verteilung laut Terms nur mit vorheriger Zustimmung.
|
||||
- Folge: Nur mit explizitem Vertrag/Lizenz.
|
||||
- Referenz:
|
||||
- https://www.thomsonreuters.com/en/terms-of-use
|
||||
|
||||
2. tagesschau.de RSS
|
||||
- Grund: Inhalte nur privat/nicht-kommerziell; Veroeffentlichung grundsaetzlich nicht erlaubt (ausser explizit CC-lizenziert).
|
||||
- Folge: Nicht fuer das geplante Modell geeignet.
|
||||
- Referenz:
|
||||
- https://www.tagesschau.de/infoservices/rssfeeds
|
||||
|
||||
### Gelb
|
||||
1. Presseportal / ots
|
||||
- Grund: Redaktionelle Nutzung grundsaetzlich moeglich, aber Verantwortung liegt beim Verwender; darueber hinausgehende Geschaeftsnutzung nur mit Genehmigung.
|
||||
- Folge: Nur mit strikter Einzelpruefung pro Meldung (insb. Bild-/Drittrechte).
|
||||
- Referenz:
|
||||
- https://www.presseportal.de/nutzungsbedingungen
|
||||
- https://www.presseportal.de/feeds/
|
||||
|
||||
2. Bundesbehoerden-RSS ohne explizite freie Weiterverwendungs-Lizenz
|
||||
- Grund: RSS wird bereitgestellt, aber nicht immer als offene Lizenz zur kommerziellen Nachnutzung formuliert.
|
||||
- Folge: Je Behoerde einzeln pruefen und dokumentieren.
|
||||
- Beispiele:
|
||||
- https://www.bundesfinanzministerium.de/Content/DE/Standardartikel/Service/rss_base.html
|
||||
- https://bmas.bund.de/EN/Services/RSS/rss.html
|
||||
|
||||
### Gruen (mit korrekter Attribution)
|
||||
1. GovData / Open-Data-Portale mit `dl-de/by-2-0`, `dl-de/zero-2-0`, `CC BY 4.0` oder `CC0`
|
||||
- Grund: Diese Lizenzen erlauben grundsaetzlich auch kommerzielle Weiterverwendung (je nach Lizenzbedingungen).
|
||||
- Folge: Sehr gut fuer stabile Automatisierung geeignet.
|
||||
- Referenz:
|
||||
- https://www.govdata.de/dl-de/by-2-0
|
||||
- https://data.gov.de/informationen/lizenzen
|
||||
- https://www.dcat-ap.de/def/licenses/dl-zero-de/2.0
|
||||
|
||||
2. EU-Quellen mit expliziter `CC BY 4.0` Wiederverwendungsregel
|
||||
- Grund: EU-Inhalte sind haeufig unter CC BY 4.0 wiederverwendbar, sofern nicht anders gekennzeichnet.
|
||||
- Folge: Geeignet, wenn Drittinhalte ausgenommen werden.
|
||||
- Referenz:
|
||||
- https://commission.europa.eu/legal-notice_en
|
||||
- https://eur-lex.europa.eu/content/help/content/legal-notice/legal-notice.html
|
||||
|
||||
## Quelle im Register freischalten (Definition of Done)
|
||||
- Terms-Link hinterlegt
|
||||
- Lizenzklasse (Gruen/Gelb/Rot) gesetzt
|
||||
- Pflicht-Attribution dokumentiert
|
||||
- Bildrechtsregel dokumentiert
|
||||
- Letzte Pruefung und Verantwortlicher gepflegt
|
||||
|
||||
## Hinweis
|
||||
Keine Rechtsberatung. Bei unklaren oder wirtschaftlich kritischen Quellen ist eine juristische Prüfung sinnvoll.
|
||||
38
docs/TODO.md
Normal file
38
docs/TODO.md
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# ToDo (Ein-Entwickler Setup)
|
||||
|
||||
## Jetzt
|
||||
- [ ] WordPress Beitragsbild-Upload implementieren (`featured_media` aus ausgewaehltem Hauptbild)
|
||||
- [ ] WordPress-HTML-Ausgabe pro Artikel weiter verbessern (sauberes Layout, Quellenblock, Shortcodes falls noetig)
|
||||
- [ ] Publisher Fehlertexte fuer WP-Auth/Media/API in UI klarer darstellen
|
||||
- [ ] End-to-end Publish Smoke-Test dokumentieren (lokal + Hetzner)
|
||||
|
||||
## MVP
|
||||
- [x] Neues Backend-Skelett (`backend/`) aufsetzen (FastAPI)
|
||||
- [x] Datenmodell in SQLite anlegen
|
||||
- [x] Feed-Ingestion Service bauen (ETag/Last-Modified)
|
||||
- [x] Duplikaterkennung ueber `source_url`, `guid`, Hash
|
||||
- [x] Login mit 1 Admin-Account implementieren
|
||||
- [x] Artikel-Review-Maske mit Statusworkflow
|
||||
- [x] WordPress-Publisher als separaten Service implementieren (Queue + Retry + Mapping)
|
||||
- [x] Bildvorschau + manuelle Bildauswahl im Admin-UI
|
||||
- [x] Automatische Bildreduktion/Scoring fuer Presseportal-Quellen
|
||||
- [x] Artikel-Datum + Relevanzscore im UI/Export
|
||||
|
||||
## Recht/Qualitaet
|
||||
- [x] Source-Policy in DB + Admin-UI abbilden
|
||||
- [x] Pflichtfelder je Quelle erzwingen (Autor, URL, Lizenz, Hinweise)
|
||||
- [x] Auto-Block bei fehlender Lizenzinfo
|
||||
- [x] Pro Artikel Attribution-Block generieren
|
||||
- [x] Manuelle Rechtsfreigabe als Publish-Gate
|
||||
|
||||
## Betrieb
|
||||
- [ ] Systemd-Service(s) fuer API/Worker erstellen
|
||||
- [ ] Nginx-Routing fuer neue App einrichten
|
||||
- [ ] Healthcheck-Endpunkte + Monitoring einrichten
|
||||
- [ ] Backup/Restore fuer DB dokumentieren
|
||||
|
||||
## Spaeter
|
||||
- [ ] Passkey/WebAuthn evaluieren und optional einfuehren
|
||||
- [ ] Migration auf PostgreSQL bewerten
|
||||
- [ ] Teilautomatische Freigabe-Regeln definieren
|
||||
- [ ] KI-Rewrite mit Prompt-Versionierung + Qualitaetsmetriken wieder aktivieren
|
||||
37
docs/roadmap-image-dedup.md
Normal file
37
docs/roadmap-image-dedup.md
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# Roadmap: Bild-Deduplizierung & Medien-Hygiene
|
||||
|
||||
## Ziele
|
||||
- Speicherverbrauch reduzieren
|
||||
- Medienbestand konsistent halten
|
||||
- Pipeline stabilisieren (keine Mehrfach-Uploads und -Speicherungen)
|
||||
|
||||
## Vorgehen (sicher und reversibel)
|
||||
1. **Index aufbauen (Read-Only):**
|
||||
- Alle Bilder (`.jpg/.jpeg/.png/.webp/.gif`) in definierten Verzeichnissen scannen
|
||||
- Für jede Datei: `sha256` (Byte-Hash) + `pHash` (perzeptuell) berechnen
|
||||
- Ergebnis als SQLite-Index + CSV-Report speichern
|
||||
|
||||
2. **Kanonisierung & Referenzen prüfen:**
|
||||
- Pro Duplikatgruppe genau **eine** kanonische Datei wählen (größte/neueste)
|
||||
- Alle internen Referenzen (DB/JSON) testweise auf Kanon aktualisieren (Dry-Run)
|
||||
|
||||
3. **Speicher sparen ohne Risiko:**
|
||||
- Nicht-kanonische Dateien durch **Hardlinks** auf den Kanon ersetzen (gleiches FS)
|
||||
- Alternativ: nur löschen, wenn Referenzen **sicher** auf Kanon zeigen
|
||||
|
||||
4. **Prävention für die Zukunft:**
|
||||
- Beim Speichern: **Content-Addressed Storage** (`<sha256>.<ext>`)
|
||||
- In DB ein `content_hash`-Feld mit **Unique-Constraint**
|
||||
- Vor jedem Speichern/Upload: Hash lookup → vorhandene Datei wiederverwenden
|
||||
|
||||
## Akzeptanzkriterien
|
||||
- Report listet alle Duplikatgruppen mit Pfaden und Größenersparnis
|
||||
- Dry-Run zeigt geplante Änderungen ohne Schreibzugriff
|
||||
- Nach „Anwenden“ verweisen alle Referenzen auf die kanonische Datei
|
||||
- Re-Run findet **keine** Duplikate mehr (idempotent)
|
||||
- Rollback möglich via Backup der Reports/Indexdatei
|
||||
|
||||
## Metriken
|
||||
- Anzahl Bilder vorher/nachher
|
||||
- Ersparter Speicher (MB/GB)
|
||||
- Anzahl gruppierter Duplikate
|
||||
29
docs/wiki/Architektur.md
Normal file
29
docs/wiki/Architektur.md
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
# Architektur
|
||||
|
||||
## Zielarchitektur
|
||||
- API: FastAPI
|
||||
- Worker: Queue-basierte Hintergrundjobs
|
||||
- DB: SQLite (MVP), spaeter optional PostgreSQL
|
||||
- Publisher: WordPress REST API
|
||||
- Frontend/Admin: schlanke Web-UI mit Login
|
||||
|
||||
## Pipeline
|
||||
1. Feed Fetch
|
||||
2. Parse + Normalize
|
||||
3. Deduplicate
|
||||
4. Enrichment (Rewrite/Tags)
|
||||
5. Legal/Policy Check
|
||||
6. Publish (pending)
|
||||
|
||||
## Datenobjekte (MVP)
|
||||
- `sources`
|
||||
- `feeds`
|
||||
- `articles`
|
||||
- `article_versions`
|
||||
- `runs`
|
||||
- `policy_checks`
|
||||
|
||||
## Nichtziele (MVP)
|
||||
- Multi-User und Rollen
|
||||
- Vollautomatische Freigabe ohne Review
|
||||
- Komplexe externe SSO-Integration
|
||||
20
docs/wiki/Deployment.md
Normal file
20
docs/wiki/Deployment.md
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Deployment (Hetzner + CloudPanel)
|
||||
|
||||
## Umgebung
|
||||
- Host: Hetzner
|
||||
- Reverse Proxy: Nginx via CloudPanel
|
||||
- Ziel-Domain: `news.vanityontour.de`
|
||||
|
||||
## Aktueller Zustand
|
||||
- Domain ist bis zum Go-Live auf `https://vanityontour.de` umgeleitet.
|
||||
|
||||
## Zielzustand
|
||||
- `news.vanityontour.de` zeigt auf neue App (interner Port, z. B. `127.0.0.1:8501`)
|
||||
- API/Worker laufen als systemd-Services
|
||||
- TLS bleibt ueber CloudPanel/Nginx
|
||||
|
||||
## Mindest-Checks nach Deployment
|
||||
- `curl -I https://news.vanityontour.de`
|
||||
- Login erreichbar
|
||||
- Feed-Import laeuft
|
||||
- WordPress-Testpublikation (pending) erfolgreich
|
||||
19
docs/wiki/Home.md
Normal file
19
docs/wiki/Home.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Wiki Home
|
||||
|
||||
## Zweck
|
||||
Dieses Wiki dokumentiert Architektur, Betrieb, Sicherheit, Recht und Roadmap des Neuaufbaus von `rss-news`.
|
||||
|
||||
## Inhalte
|
||||
- `Architektur.md`
|
||||
- `Deployment.md`
|
||||
- `Security-Auth.md`
|
||||
- `Recht-Quellen.md`
|
||||
- `Operations-Runbook.md`
|
||||
- `Roadmap.md`
|
||||
- `Project-Board.md`
|
||||
|
||||
## Projektsteuerung
|
||||
- GitHub Project #3: https://github.com/users/OliverGiertz/projects/3/views/1
|
||||
|
||||
## Prinzip
|
||||
Dokumentation wird bei jeder relevanten Aenderung im selben Pull Request aktualisiert.
|
||||
43
docs/wiki/Operations-Runbook.md
Normal file
43
docs/wiki/Operations-Runbook.md
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# Operations Runbook
|
||||
|
||||
## Daily Checks
|
||||
- App erreichbar
|
||||
- Queue/Worker aktiv
|
||||
- Letzte Feed-Laeufe erfolgreich
|
||||
- Keine auffaelligen Fehler im Log
|
||||
|
||||
## Incident: Feed-Import faellt aus
|
||||
1. RSS-Quelle erreichbar?
|
||||
2. Parser-Fehler im Log?
|
||||
3. Rate Limits oder Blockaden?
|
||||
4. Retry-Queue pruefen
|
||||
|
||||
## Incident: WordPress Publish faellt aus
|
||||
1. WP API erreichbar?
|
||||
2. Credentials gueltig?
|
||||
3. Payload-Validation/Tag-Fehler?
|
||||
4. Artikel in `pending` statt `failed` markieren, wenn unklar
|
||||
|
||||
## Incident: Telegram-Buttons reagieren nicht / Befehle ignoriert
|
||||
|
||||
**Ursache:** N8N "App Release - Telegram Bot"-Workflow hat den Webhook überschrieben.
|
||||
|
||||
**Prüfen:**
|
||||
```bash
|
||||
curl -s "https://api.telegram.org/bot<TELEGRAM_BOT_TOKEN>/getWebhookInfo" | python3 -m json.tool
|
||||
```
|
||||
→ `url` muss auf `https://news.vanityontour.de/telegram/webhook` zeigen
|
||||
→ `allowed_updates` muss `["message", "callback_query"]` enthalten
|
||||
|
||||
**Webhook zurücksetzen:**
|
||||
```bash
|
||||
curl -s -X POST "https://api.telegram.org/bot<TELEGRAM_BOT_TOKEN>/setWebhook" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url":"https://news.vanityontour.de/telegram/webhook","allowed_updates":["message","callback_query"],"secret_token":"<TELEGRAM_WEBHOOK_SECRET>"}'
|
||||
```
|
||||
|
||||
Vollständige Dokumentation: `projects/webhook/telegram-webhook-reset.md`
|
||||
|
||||
## Backups
|
||||
- SQLite-Dump taeglich
|
||||
- Konfiguration und `.env` sicher sichern
|
||||
28
docs/wiki/Project-Board.md
Normal file
28
docs/wiki/Project-Board.md
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Project Board Workflow
|
||||
|
||||
## Zentrale Steuerung
|
||||
- Board: https://github.com/users/OliverGiertz/projects/3/views/1
|
||||
- Board ist die einzige Quelle fuer Planungsstatus.
|
||||
|
||||
## Arbeitsmodus (1 Entwickler)
|
||||
- Neue Arbeit immer als Issue anlegen
|
||||
- Issue direkt ins Project aufnehmen
|
||||
- Status nur im Project pflegen
|
||||
- PR/Commit auf Issue referenzieren
|
||||
|
||||
## Empfohlene Status-Disziplin
|
||||
- `Todo`: noch nicht begonnen
|
||||
- `In Progress`: aktiv in Arbeit
|
||||
- `Done`: umgesetzt und dokumentiert
|
||||
|
||||
## Konventionen fuer Issues
|
||||
- Prefix fuer Klarheit:
|
||||
- `[MVP]`
|
||||
- `[Infra]`
|
||||
- `[Legal]`
|
||||
- `[Bug]`
|
||||
- Definition of Done in jedem Issue notieren
|
||||
|
||||
## Aktueller Backlog-Hinweis
|
||||
- Thema Userverwaltung ist fuer MVP obsolet (ein Admin-User).
|
||||
- Entsprechende Issues als `deferred` oder `closed` kennzeichnen.
|
||||
35
docs/wiki/Recht-Quellen.md
Normal file
35
docs/wiki/Recht-Quellen.md
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Recht und Quellen
|
||||
|
||||
## Grundregeln
|
||||
- Nur freigegebene Quellen aus Source-Register
|
||||
- Pflicht-Attribution pro Artikel
|
||||
- Rechte fuer Bilder separat pruefen
|
||||
- Kein Autopublish bei unklarer Lizenz
|
||||
|
||||
## Bewertungsmodell
|
||||
- Gruen: Freie Nachnutzung klar erlaubt
|
||||
- Gelb: Nutzung mit Einschraenkungen/Einzelfallpruefung
|
||||
- Rot: Ohne Zusatzlizenz nicht geeignet
|
||||
|
||||
## Aktuelle Referenzen
|
||||
- Reuters/Thomson Reuters Terms: https://www.thomsonreuters.com/en/terms-of-use
|
||||
- Presseportal Nutzungsbedingungen: https://www.presseportal.de/nutzungsbedingungen
|
||||
- tagesschau RSS-Hinweise: https://www.tagesschau.de/infoservices/rssfeeds
|
||||
- Datenlizenz Deutschland BY 2.0: https://www.govdata.de/dl-de/by-2-0
|
||||
- GovData Lizenzen: https://data.gov.de/informationen/lizenzen
|
||||
- EU Legal Notice (CC BY 4.0): https://commission.europa.eu/legal-notice_en
|
||||
|
||||
## Review-Checkliste je Quelle
|
||||
1. Sind Bearbeitung und Veroeffentlichung erlaubt?
|
||||
2. Ist kommerzielle Nutzung erlaubt?
|
||||
3. Gibt es gesonderte Bildrechte?
|
||||
4. Ist die Quellenangabe vorgeschrieben?
|
||||
5. Gibt es Archivierungs- oder Weitergabebeschraenkungen?
|
||||
|
||||
## Operativer Schutz
|
||||
- Source-Register als Pflicht vor Feed-Aktivierung
|
||||
- Auto-Block bei fehlenden Lizenzdaten
|
||||
- Quartalsweiser Terms-Recheck
|
||||
|
||||
## Hinweis
|
||||
Keine Rechtsberatung. Finale Freigabe kritischer Quellen bei Bedarf juristisch validieren.
|
||||
19
docs/wiki/Roadmap.md
Normal file
19
docs/wiki/Roadmap.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Roadmap
|
||||
|
||||
## Jetzt
|
||||
- Doku und Projektstruktur bereinigen
|
||||
- Redirect aktiv
|
||||
- Backlog auf Neustart ausrichten
|
||||
|
||||
## Naechster Schritt
|
||||
- FastAPI-MVP implementieren
|
||||
- Login + Feed-Ingestion + Review + WordPress pending
|
||||
|
||||
## Danach
|
||||
- Worker/Queue
|
||||
- Source-Policy Enforcement
|
||||
- Monitoring/Reporting
|
||||
- Optional Passkey
|
||||
|
||||
## Steuerung
|
||||
Alle Arbeitsitems liegen im GitHub Project #3.
|
||||
16
docs/wiki/Security-Auth.md
Normal file
16
docs/wiki/Security-Auth.md
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Security und Auth
|
||||
|
||||
## Mindestanforderungen
|
||||
- Zugriff auf die WebApp nur mit Login
|
||||
- Ein aktiver Admin-User (kein Rollenmodell im MVP)
|
||||
- Passwort nicht im Repo, nur als Secret auf Server
|
||||
|
||||
## Empfohlene Umsetzung
|
||||
- Session-basierte Auth (HTTP-only Cookies)
|
||||
- Passwort gehasht (Argon2 oder bcrypt)
|
||||
- Rate Limiting auf Login-Endpunkt
|
||||
- CSRF-Schutz fuer Form-Aktionen
|
||||
|
||||
## Spaeter (optional)
|
||||
- Passkey/WebAuthn als zusaetzlicher Login-Faktor
|
||||
- IP-Allowlist fuer Admin-Zugang
|
||||
5915
logs/rss_tool.log
5915
logs/rss_tool.log
File diff suppressed because it is too large
Load diff
|
|
@ -2,8 +2,13 @@
|
|||
|
||||
import streamlit as st
|
||||
from main import load_feeds, save_feeds, load_articles
|
||||
from utils.css_loader import load_css, apply_dark_theme
|
||||
import logging
|
||||
|
||||
# === CSS & Theme laden ===
|
||||
load_css()
|
||||
apply_dark_theme()
|
||||
|
||||
# === Logging vorbereiten ===
|
||||
log_dir = "logs"
|
||||
log_file = f"{log_dir}/rss_tool.log"
|
||||
|
|
@ -15,17 +20,29 @@ logging.basicConfig(
|
|||
|
||||
st.set_page_config(page_title="📡 Feed-Verwaltung")
|
||||
|
||||
st.title("📡 RSS Feed-Verwaltung")
|
||||
# Header
|
||||
st.markdown("""
|
||||
<div class="main-header">
|
||||
<h1>📡 RSS Feed-Verwaltung</h1>
|
||||
<p>Verwalte deine RSS-Feeds zentral und effizient</p>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
feeds = load_feeds()
|
||||
articles = load_articles()
|
||||
|
||||
# === Neuen Feed hinzufügen ===
|
||||
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
||||
st.subheader("➕ Neuen Feed hinzufügen")
|
||||
|
||||
with st.form("add_feed_form"):
|
||||
new_url = st.text_input("Feed URL", "")
|
||||
new_name = st.text_input("Feed Name", "")
|
||||
submitted = st.form_submit_button("Feed hinzufügen")
|
||||
col1, col2 = st.columns(2)
|
||||
with col1:
|
||||
new_url = st.text_input("Feed URL", "", placeholder="https://example.com/feed.xml")
|
||||
with col2:
|
||||
new_name = st.text_input("Feed Name", "", placeholder="Beispiel News")
|
||||
|
||||
submitted = st.form_submit_button("Feed hinzufügen", use_container_width=True)
|
||||
if submitted:
|
||||
if new_url and new_name:
|
||||
if not any(f.get("url") == new_url for f in feeds):
|
||||
|
|
@ -39,33 +56,185 @@ with st.form("add_feed_form"):
|
|||
else:
|
||||
st.error("❌ Bitte gib sowohl URL als auch Name ein.")
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Bestehende Feeds bearbeiten ===
|
||||
st.subheader("🛠️ Vorhandene Feeds bearbeiten oder löschen")
|
||||
st.subheader("🛠️ Vorhandene Feeds verwalten")
|
||||
|
||||
for idx, feed in enumerate(feeds):
|
||||
with st.expander(f"🔗 {feed.get('name')}"):
|
||||
url = st.text_input(f"Feed-URL {idx}", value=feed.get("url"), key=f"url_{idx}")
|
||||
name = st.text_input(f"Feed-Name {idx}", value=feed.get("name"), key=f"name_{idx}")
|
||||
count = sum(1 for a in articles if a.get("source") == feed.get("url"))
|
||||
|
||||
col1, col2 = st.columns(2)
|
||||
if not feeds:
|
||||
st.info("Noch keine Feeds konfiguriert. Füge oben deinen ersten Feed hinzu!")
|
||||
else:
|
||||
for idx, feed in enumerate(feeds):
|
||||
feed_url = feed.get("url", "")
|
||||
feed_name = feed.get("name", "Unbekannt")
|
||||
article_count = sum(1 for a in articles if a.get("source") == feed_url)
|
||||
|
||||
# Feed Card
|
||||
st.markdown(f"""
|
||||
<div class="article-card">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
|
||||
<div>
|
||||
<h3 class="article-title">{feed_name}</h3>
|
||||
<div class="article-meta">{feed_url}</div>
|
||||
</div>
|
||||
<div>
|
||||
<span class="status-badge status-online">{article_count} Artikel</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="article-footer">
|
||||
📰 Verknüpfte Artikel: {article_count}
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# Actions
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
if st.button("💾 Änderungen speichern", key=f"save_{idx}"):
|
||||
old_url, old_name = feed.get("url"), feed.get("name")
|
||||
feeds[idx]["url"] = url
|
||||
feeds[idx]["name"] = name
|
||||
save_feeds(feeds)
|
||||
logging.info(f"✏️ Feed geändert: '{old_name}' ({old_url}) → '{name}' ({url})")
|
||||
st.success("Änderungen gespeichert.")
|
||||
st.rerun()
|
||||
if st.button("💾 Bearbeiten", key=f"edit_{idx}", use_container_width=True):
|
||||
st.session_state[f"edit_mode_{idx}"] = not st.session_state.get(f"edit_mode_{idx}", False)
|
||||
|
||||
with col2:
|
||||
if st.button("🗑️ Feed löschen", key=f"delete_{idx}"):
|
||||
deleted_feed = feeds.pop(idx)
|
||||
save_feeds(feeds)
|
||||
logging.info(f"❌ Feed gelöscht: {deleted_feed.get('name')} ({deleted_feed.get('url')})")
|
||||
st.warning("Feed gelöscht.")
|
||||
st.rerun()
|
||||
if st.button("🔄 Aktualisieren", key=f"refresh_{idx}", use_container_width=True):
|
||||
with st.spinner(f"Aktualisiere Feed '{feed_name}'..."):
|
||||
# Hier könntest du eine einzelne Feed-Update-Funktion implementieren
|
||||
from main import process_articles
|
||||
existing_ids = [a["id"] for a in articles]
|
||||
process_articles(existing_ids)
|
||||
st.success(f"Feed '{feed_name}' aktualisiert!")
|
||||
st.rerun()
|
||||
|
||||
st.caption(f"📰 Verknüpfte Artikel: {count}")
|
||||
with col3:
|
||||
if st.button("🗑️ Löschen", key=f"delete_{idx}", use_container_width=True):
|
||||
# Bestätigung über Session State
|
||||
if not st.session_state.get(f"confirm_delete_{idx}", False):
|
||||
st.session_state[f"confirm_delete_{idx}"] = True
|
||||
st.warning(f"Klicke erneut um '{feed_name}' wirklich zu löschen!")
|
||||
else:
|
||||
deleted_feed = feeds.pop(idx)
|
||||
save_feeds(feeds)
|
||||
logging.info(f"❌ Feed gelöscht: {deleted_feed.get('name')} ({deleted_feed.get('url')})")
|
||||
st.success(f"Feed '{feed_name}' wurde gelöscht.")
|
||||
# Cleanup Session State
|
||||
if f"confirm_delete_{idx}" in st.session_state:
|
||||
del st.session_state[f"confirm_delete_{idx}"]
|
||||
st.rerun()
|
||||
|
||||
# Edit Form (wenn aktiviert)
|
||||
if st.session_state.get(f"edit_mode_{idx}", False):
|
||||
st.markdown('<div class="filter-section" style="margin-top: 1rem;">', unsafe_allow_html=True)
|
||||
st.write("**Feed bearbeiten:**")
|
||||
|
||||
with st.form(f"edit_form_{idx}"):
|
||||
col1, col2 = st.columns(2)
|
||||
with col1:
|
||||
edited_url = st.text_input("Feed-URL", value=feed_url, key=f"edit_url_{idx}")
|
||||
with col2:
|
||||
edited_name = st.text_input("Feed-Name", value=feed_name, key=f"edit_name_{idx}")
|
||||
|
||||
form_col1, form_col2 = st.columns(2)
|
||||
with form_col1:
|
||||
if st.form_submit_button("💾 Änderungen speichern", use_container_width=True):
|
||||
old_url, old_name = feed.get("url"), feed.get("name")
|
||||
feeds[idx]["url"] = edited_url
|
||||
feeds[idx]["name"] = edited_name
|
||||
save_feeds(feeds)
|
||||
logging.info(f"✏️ Feed geändert: '{old_name}' ({old_url}) → '{edited_name}' ({edited_url})")
|
||||
st.success("Änderungen gespeichert!")
|
||||
st.session_state[f"edit_mode_{idx}"] = False
|
||||
st.rerun()
|
||||
|
||||
with form_col2:
|
||||
if st.form_submit_button("❌ Abbrechen", use_container_width=True):
|
||||
st.session_state[f"edit_mode_{idx}"] = False
|
||||
st.rerun()
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Feed-Statistiken ===
|
||||
if feeds:
|
||||
st.markdown("---")
|
||||
st.subheader("📊 Feed-Statistiken")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Feeds Gesamt</div>
|
||||
</div>
|
||||
""".format(len(feeds)), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
total_articles = len(articles)
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Artikel Gesamt</div>
|
||||
</div>
|
||||
""".format(total_articles), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
avg_articles = total_articles // len(feeds) if feeds else 0
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Ø Artikel pro Feed</div>
|
||||
</div>
|
||||
""".format(avg_articles), unsafe_allow_html=True)
|
||||
|
||||
# === Bulk Actions ===
|
||||
if feeds:
|
||||
st.markdown("---")
|
||||
st.subheader("⚡ Bulk-Aktionen")
|
||||
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
with col1:
|
||||
if st.button("🔄 Alle Feeds aktualisieren", use_container_width=True):
|
||||
with st.spinner("Aktualisiere alle Feeds..."):
|
||||
from main import process_articles
|
||||
existing_ids = [a["id"] for a in articles]
|
||||
process_articles(existing_ids)
|
||||
st.success(f"Alle {len(feeds)} Feeds wurden aktualisiert!")
|
||||
st.rerun()
|
||||
|
||||
with col2:
|
||||
if st.button("📊 Feed-Performance anzeigen", use_container_width=True):
|
||||
st.subheader("📈 Feed-Performance")
|
||||
|
||||
# Performance-Daten sammeln
|
||||
feed_performance = []
|
||||
for feed in feeds:
|
||||
feed_url = feed.get("url", "")
|
||||
feed_name = feed.get("name", "Unbekannt")
|
||||
feed_articles = [a for a in articles if a.get("source") == feed_url]
|
||||
|
||||
performance = {
|
||||
"name": feed_name,
|
||||
"url": feed_url,
|
||||
"total_articles": len(feed_articles),
|
||||
"new_articles": len([a for a in feed_articles if a.get("status") == "New"]),
|
||||
"processed_articles": len([a for a in feed_articles if a.get("status") in ["Process", "Online", "WordPress Pending"]])
|
||||
}
|
||||
feed_performance.append(performance)
|
||||
|
||||
# Sortiere nach Artikel-Anzahl
|
||||
feed_performance.sort(key=lambda x: x["total_articles"], reverse=True)
|
||||
|
||||
# Anzeige als Cards
|
||||
for perf in feed_performance:
|
||||
success_rate = (perf["processed_articles"] / perf["total_articles"] * 100) if perf["total_articles"] > 0 else 0
|
||||
|
||||
st.markdown(f"""
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">{perf["name"]}</h3>
|
||||
<div class="article-footer">
|
||||
📰 {perf["total_articles"]} Artikel |
|
||||
🆕 {perf["new_articles"]} Neu |
|
||||
✅ {perf["processed_articles"]} Verarbeitet |
|
||||
📊 {success_rate:.1f}% Success Rate
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
|
@ -1,23 +1,297 @@
|
|||
# log_viewer.py
|
||||
# pages/log_viewer.py
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from utils.css_loader import load_css, apply_dark_theme
|
||||
from datetime import datetime
|
||||
|
||||
# === CSS & Theme laden ===
|
||||
load_css()
|
||||
apply_dark_theme()
|
||||
|
||||
st.set_page_config(page_title="🧾 Log Viewer", layout="wide")
|
||||
st.title("🧾 Letzte Logeinträge anzeigen")
|
||||
|
||||
# Header
|
||||
st.markdown("""
|
||||
<div class="main-header">
|
||||
<h1>🧾 Log Viewer</h1>
|
||||
<p>Überwache Systemaktivitäten und Debug-Informationen</p>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
LOG_FILE = "logs/rss_tool.log"
|
||||
MAX_LINES = 500
|
||||
|
||||
if not os.path.exists(LOG_FILE):
|
||||
st.warning("Keine Logdatei gefunden.")
|
||||
else:
|
||||
with open(LOG_FILE, "r") as f:
|
||||
lines = f.readlines()
|
||||
# === Log-Datei Kontrollen ===
|
||||
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
||||
st.subheader("📁 Log-Datei Optionen")
|
||||
|
||||
st.write(f"Letzte {min(len(lines), MAX_LINES)} Zeilen aus `{LOG_FILE}`:")
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
st.code("".join(lines[-MAX_LINES:]), language="text")
|
||||
with col1:
|
||||
lines_to_show = st.selectbox(
|
||||
"Anzahl Zeilen",
|
||||
[50, 100, 200, 500, 1000],
|
||||
index=3, # Default: 500
|
||||
key="lines_select"
|
||||
)
|
||||
|
||||
if st.button("🔄 Neu laden"):
|
||||
with col2:
|
||||
if st.button("🔄 Neu laden", use_container_width=True):
|
||||
st.rerun()
|
||||
|
||||
with col3:
|
||||
log_level_filter = st.selectbox(
|
||||
"Log Level Filter",
|
||||
["Alle", "INFO", "WARNING", "ERROR", "DEBUG"],
|
||||
key="level_filter"
|
||||
)
|
||||
|
||||
with col4:
|
||||
search_term = st.text_input(
|
||||
"Suche in Logs",
|
||||
placeholder="Suchbegriff...",
|
||||
key="log_search"
|
||||
)
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Log-Datei Status ===
|
||||
if not os.path.exists(LOG_FILE):
|
||||
st.markdown("""
|
||||
<div class="wp-status">
|
||||
<strong>⚠️ Keine Log-Datei gefunden</strong><br>
|
||||
<div class="text-secondary">
|
||||
Die Log-Datei wurde noch nicht erstellt oder befindet sich an einem anderen Ort.<br>
|
||||
Erwarteter Pfad: <code>{}</code>
|
||||
</div>
|
||||
</div>
|
||||
""".format(LOG_FILE), unsafe_allow_html=True)
|
||||
else:
|
||||
# Datei-Informationen
|
||||
file_size = os.path.getsize(LOG_FILE)
|
||||
file_mtime = datetime.fromtimestamp(os.path.getmtime(LOG_FILE))
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{:.1f} KB</div>
|
||||
<div>Dateigröße</div>
|
||||
</div>
|
||||
""".format(file_size / 1024), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number" style="font-size: 1.5rem;">{}</div>
|
||||
<div>Letzte Änderung</div>
|
||||
</div>
|
||||
""".format(file_mtime.strftime("%H:%M:%S")), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
# Zeilen zählen
|
||||
try:
|
||||
with open(LOG_FILE, "r", encoding="utf-8") as f:
|
||||
total_lines = sum(1 for _ in f)
|
||||
except:
|
||||
total_lines = 0
|
||||
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Zeilen Gesamt</div>
|
||||
</div>
|
||||
""".format(total_lines), unsafe_allow_html=True)
|
||||
|
||||
# === Log-Inhalt anzeigen ===
|
||||
try:
|
||||
with open(LOG_FILE, "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
|
||||
# Filter anwenden
|
||||
filtered_lines = []
|
||||
|
||||
for line in lines:
|
||||
# Log Level Filter
|
||||
if log_level_filter != "Alle":
|
||||
if f" - {log_level_filter} - " not in line:
|
||||
continue
|
||||
|
||||
# Suchfilter
|
||||
if search_term and search_term.lower() not in line.lower():
|
||||
continue
|
||||
|
||||
filtered_lines.append(line)
|
||||
|
||||
# Anzahl begrenzen
|
||||
display_lines = filtered_lines[-lines_to_show:] if len(filtered_lines) > lines_to_show else filtered_lines
|
||||
|
||||
# Header für Log-Anzeige
|
||||
st.subheader(f"📋 Log-Einträge ({len(display_lines)} von {len(filtered_lines)} gefilterten Zeilen)")
|
||||
|
||||
if display_lines:
|
||||
# Log-Inhalt mit Syntax-Highlighting
|
||||
log_content = "".join(display_lines)
|
||||
|
||||
# Farbkodierung für verschiedene Log-Level
|
||||
colored_content = log_content
|
||||
colored_content = colored_content.replace(" - ERROR - ", " - 🔴 ERROR - ")
|
||||
colored_content = colored_content.replace(" - WARNING - ", " - 🟡 WARNING - ")
|
||||
colored_content = colored_content.replace(" - INFO - ", " - 🔵 INFO - ")
|
||||
colored_content = colored_content.replace(" - DEBUG - ", " - ⚪ DEBUG - ")
|
||||
|
||||
# Log in Card anzeigen
|
||||
st.markdown("""
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">📄 Log-Ausgabe</h3>
|
||||
<div class="article-meta">
|
||||
Letzte {count} Einträge | Filter: {level} | Suche: "{search}"
|
||||
</div>
|
||||
</div>
|
||||
""".format(
|
||||
count=len(display_lines),
|
||||
level=log_level_filter,
|
||||
search=search_term or "Keine"
|
||||
), unsafe_allow_html=True)
|
||||
|
||||
# Code-Block mit Logs
|
||||
st.code(colored_content, language="text")
|
||||
|
||||
# Download-Button
|
||||
st.download_button(
|
||||
label="💾 Log-Datei herunterladen",
|
||||
data=log_content,
|
||||
file_name=f"rss_tool_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
|
||||
mime="text/plain",
|
||||
use_container_width=True
|
||||
)
|
||||
|
||||
else:
|
||||
st.markdown("""
|
||||
<div class="wp-status">
|
||||
<strong>🔍 Keine Log-Einträge gefunden</strong><br>
|
||||
<div class="text-secondary">
|
||||
Mit den aktuellen Filtern wurden keine Log-Einträge gefunden.<br>
|
||||
Versuche andere Filter-Einstellungen.
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
except Exception as e:
|
||||
st.markdown(f"""
|
||||
<div class="wp-status">
|
||||
<strong>❌ Fehler beim Lesen der Log-Datei</strong><br>
|
||||
<div class="text-secondary">
|
||||
{str(e)}
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# === Log-Level Erklärung ===
|
||||
with st.expander("ℹ️ Log-Level Erklärung", expanded=False):
|
||||
st.markdown("""
|
||||
<div class="article-card">
|
||||
<h3 class="article-title">📖 Log-Level Bedeutung</h3>
|
||||
<div class="article-summary">
|
||||
<strong>🔵 INFO:</strong> Normale Programmaktivitäten (Feed-Updates, Artikel verarbeitet)<br>
|
||||
<strong>🟡 WARNING:</strong> Potentielle Probleme (Duplikate, fehlende Daten)<br>
|
||||
<strong>🔴 ERROR:</strong> Fehler die das Programm beeinträchtigen<br>
|
||||
<strong>⚪ DEBUG:</strong> Detaillierte Entwickler-Informationen
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# === Log-Datei verwalten ===
|
||||
st.markdown("---")
|
||||
st.subheader("🛠️ Log-Datei Verwaltung")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
if st.button("🗑️ Log-Datei leeren", use_container_width=True):
|
||||
if st.button("⚠️ Wirklich leeren?", key="confirm_clear"):
|
||||
try:
|
||||
with open(LOG_FILE, "w", encoding="utf-8") as f:
|
||||
f.write("")
|
||||
st.success("Log-Datei wurde geleert!")
|
||||
st.rerun()
|
||||
except Exception as e:
|
||||
st.error(f"Fehler beim Leeren der Log-Datei: {e}")
|
||||
|
||||
with col2:
|
||||
if st.button("📦 Log archivieren", use_container_width=True):
|
||||
try:
|
||||
archive_name = f"rss_tool_log_archive_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
|
||||
with open(LOG_FILE, "r", encoding="utf-8") as f:
|
||||
log_data = f.read()
|
||||
|
||||
st.download_button(
|
||||
label=f"💾 {archive_name}",
|
||||
data=log_data,
|
||||
file_name=archive_name,
|
||||
mime="text/plain",
|
||||
key="archive_download"
|
||||
)
|
||||
except Exception as e:
|
||||
st.error(f"Fehler beim Archivieren: {e}")
|
||||
|
||||
with col3:
|
||||
if st.button("📊 Log-Statistiken", use_container_width=True):
|
||||
if os.path.exists(LOG_FILE):
|
||||
try:
|
||||
with open(LOG_FILE, "r", encoding="utf-8") as f:
|
||||
all_lines = f.readlines()
|
||||
|
||||
# Statistiken berechnen
|
||||
total_lines = len(all_lines)
|
||||
info_count = sum(1 for line in all_lines if " - INFO - " in line)
|
||||
warning_count = sum(1 for line in all_lines if " - WARNING - " in line)
|
||||
error_count = sum(1 for line in all_lines if " - ERROR - " in line)
|
||||
debug_count = sum(1 for line in all_lines if " - DEBUG - " in line)
|
||||
|
||||
st.subheader("📈 Log-Statistiken")
|
||||
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>🔵 INFO</div>
|
||||
</div>
|
||||
""".format(info_count), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>🟡 WARNING</div>
|
||||
</div>
|
||||
""".format(warning_count), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>🔴 ERROR</div>
|
||||
</div>
|
||||
""".format(error_count), unsafe_allow_html=True)
|
||||
|
||||
with col4:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>⚪ DEBUG</div>
|
||||
</div>
|
||||
""".format(debug_count), unsafe_allow_html=True)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Fehler beim Berechnen der Statistiken: {e}")
|
||||
|
||||
# === Auto-Refresh Option ===
|
||||
if st.checkbox("🔄 Auto-Refresh (30s)", key="auto_refresh"):
|
||||
import time
|
||||
time.sleep(30)
|
||||
st.rerun()
|
||||
4
pytest.ini
Normal file
4
pytest.ini
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
[pytest]
|
||||
testpaths = backend/tests
|
||||
python_files = test_*.py
|
||||
addopts = -q --maxfail=1
|
||||
|
|
@ -53,3 +53,6 @@ typing_extensions==4.14.0
|
|||
tzdata==2025.2
|
||||
urllib3==2.5.0
|
||||
typer[all]==0.12.3
|
||||
# Bild- und Hashing-Bibliotheken für Dedupe
|
||||
Pillow>=10.0.0 # Bildverarbeitung (öffnet Bilder für pHash)
|
||||
ImageHash>=4.3 # Perzeptueller Hash (pHash) für Near-Duplicate Erkennung
|
||||
|
|
|
|||
33
scripts/smoke_backend.sh
Executable file
33
scripts/smoke_backend.sh
Executable file
|
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [[ -z "${BASE_URL:-}" ]]; then
|
||||
echo "BASE_URL fehlt (z. B. https://news.vanityontour.de)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "${APP_ADMIN_USERNAME:-}" || -z "${APP_ADMIN_PASSWORD:-}" ]]; then
|
||||
echo "APP_ADMIN_USERNAME/APP_ADMIN_PASSWORD fehlen"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cookie_file="$(mktemp)"
|
||||
trap 'rm -f "$cookie_file"' EXIT
|
||||
|
||||
echo "[1/4] Healthcheck"
|
||||
curl -fsS "${BASE_URL}/health" | grep -q '"status":"ok"'
|
||||
|
||||
echo "[2/4] Login"
|
||||
curl -fsS -c "$cookie_file" \
|
||||
-H "Content-Type: application/json" \
|
||||
-X POST "${BASE_URL}/auth/login" \
|
||||
-d "{\"username\":\"${APP_ADMIN_USERNAME}\",\"password\":\"${APP_ADMIN_PASSWORD}\"}" \
|
||||
| grep -q '"ok":true'
|
||||
|
||||
echo "[3/4] Protected Endpoint"
|
||||
curl -fsS -b "$cookie_file" "${BASE_URL}/api/protected" | grep -q '"ok":true'
|
||||
|
||||
echo "[4/4] Pipeline Status"
|
||||
curl -fsS -b "$cookie_file" "${BASE_URL}/api/pipeline/status" | grep -q '"stage":"skeleton+db"'
|
||||
|
||||
echo "Smoke test erfolgreich."
|
||||
491
static/styles.css
Normal file
491
static/styles.css
Normal file
|
|
@ -0,0 +1,491 @@
|
|||
/* ===============================================
|
||||
RSS Feed Manager - Zentrale CSS-Datei
|
||||
Dark-Mode optimiert mit Fallbacks
|
||||
=============================================== */
|
||||
|
||||
/* === ROOT VARIABLEN === */
|
||||
:root {
|
||||
/* Dark Mode Farbpalette */
|
||||
--bg-primary: #1e1e1e;
|
||||
--bg-secondary: #2d2d30;
|
||||
--bg-card: #2d2d30;
|
||||
--bg-header: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
--bg-filter: #363636;
|
||||
|
||||
/* Text Farben */
|
||||
--text-primary: #ffffff;
|
||||
--text-secondary: #b0b0b0;
|
||||
--text-muted: #888888;
|
||||
--text-accent: #667eea;
|
||||
|
||||
/* Status Farben */
|
||||
--status-new: #2196f3;
|
||||
--status-new-bg: #1565c0;
|
||||
--status-rewrite: #ff9800;
|
||||
--status-rewrite-bg: #ef6c00;
|
||||
--status-process: #9c27b0;
|
||||
--status-process-bg: #6a1b9a;
|
||||
--status-online: #4caf50;
|
||||
--status-online-bg: #2e7d32;
|
||||
--status-hold: #e91e63;
|
||||
--status-hold-bg: #ad1457;
|
||||
--status-trash: #f44336;
|
||||
--status-trash-bg: #c62828;
|
||||
--status-wp-pending: #00bcd4;
|
||||
--status-wp-pending-bg: #0097a7;
|
||||
|
||||
/* Borders & Shadows */
|
||||
--border-color: #404040;
|
||||
--shadow-light: 0 2px 8px rgba(0, 0, 0, 0.3);
|
||||
--shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.4);
|
||||
|
||||
/* Accent Colors */
|
||||
--gradient-primary: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
--gradient-secondary: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
|
||||
}
|
||||
|
||||
/* === LIGHT MODE FALLBACKS === */
|
||||
[data-theme="light"], .stApp[data-theme="light"] {
|
||||
--bg-primary: #ffffff;
|
||||
--bg-secondary: #f8f9fa;
|
||||
--bg-card: #ffffff;
|
||||
--bg-filter: #f0f2f6;
|
||||
|
||||
--text-primary: #212529;
|
||||
--text-secondary: #495057;
|
||||
--text-muted: #6c757d;
|
||||
--text-accent: #667eea;
|
||||
|
||||
--border-color: #dee2e6;
|
||||
--shadow-light: 0 2px 8px rgba(0, 0, 0, 0.1);
|
||||
--shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
/* === GLOBALE RESETS === */
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
/* === HAUPTCONTAINER === */
|
||||
.main-header {
|
||||
background: var(--bg-header);
|
||||
padding: 2rem;
|
||||
border-radius: 12px;
|
||||
margin-bottom: 2rem;
|
||||
color: var(--text-primary);
|
||||
text-align: center;
|
||||
box-shadow: var(--shadow-light);
|
||||
}
|
||||
|
||||
.main-header h1 {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 2.5rem;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.main-header p {
|
||||
color: rgba(255, 255, 255, 0.9) !important;
|
||||
margin: 0;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
/* === ARTIKEL CARDS === */
|
||||
.article-card {
|
||||
background: var(--bg-card) !important;
|
||||
border-radius: 12px;
|
||||
padding: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
box-shadow: var(--shadow-light);
|
||||
border-left: 4px solid var(--text-accent);
|
||||
border: 1px solid var(--border-color);
|
||||
transition: all 0.3s ease;
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
.article-card:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: var(--shadow-hover);
|
||||
border-color: var(--text-accent);
|
||||
}
|
||||
|
||||
.article-card h3,
|
||||
.article-card .article-title {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 0.5rem 0;
|
||||
font-size: 1.2rem;
|
||||
font-weight: 600;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.article-card .article-meta {
|
||||
color: var(--text-secondary) !important;
|
||||
font-size: 0.9rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.article-card .article-summary {
|
||||
color: var(--text-secondary) !important;
|
||||
line-height: 1.5;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.article-card .article-footer {
|
||||
color: var(--text-muted) !important;
|
||||
font-size: 0.85rem;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
/* === STATUS BADGES === */
|
||||
.status-badge {
|
||||
padding: 0.3rem 0.8rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
margin-right: 0.5rem;
|
||||
display: inline-block;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-new {
|
||||
background-color: var(--status-new-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-rewrite {
|
||||
background-color: var(--status-rewrite-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-process {
|
||||
background-color: var(--status-process-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-online {
|
||||
background-color: var(--status-online-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-hold {
|
||||
background-color: var(--status-hold-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-trash {
|
||||
background-color: var(--status-trash-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
.status-wp-pending {
|
||||
background-color: var(--status-wp-pending-bg) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
/* === FILTER SECTION === */
|
||||
.filter-section {
|
||||
background: var(--bg-filter) !important;
|
||||
padding: 1.5rem;
|
||||
border-radius: 12px;
|
||||
margin-bottom: 2rem;
|
||||
border: 1px solid var(--border-color);
|
||||
box-shadow: var(--shadow-light);
|
||||
}
|
||||
|
||||
.filter-section h3 {
|
||||
color: var(--text-primary) !important;
|
||||
margin: 0 0 1rem 0;
|
||||
font-size: 1.3rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* === STATS CARDS === */
|
||||
.stats-card {
|
||||
background: var(--bg-card) !important;
|
||||
padding: 1.5rem;
|
||||
border-radius: 12px;
|
||||
text-align: center;
|
||||
box-shadow: var(--shadow-light);
|
||||
border: 1px solid var(--border-color);
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
|
||||
.stats-card:hover {
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2.5rem;
|
||||
font-weight: 700;
|
||||
color: var(--text-accent) !important;
|
||||
margin-bottom: 0.5rem;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.stats-card div:last-child {
|
||||
color: var(--text-secondary) !important;
|
||||
font-weight: 500;
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
/* === WORDPRESS STATUS === */
|
||||
.wp-status {
|
||||
background: var(--bg-card) !important;
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
margin: 1rem 0;
|
||||
border-left: 4px solid var(--status-wp-pending);
|
||||
border: 1px solid var(--border-color);
|
||||
box-shadow: var(--shadow-light);
|
||||
}
|
||||
|
||||
.wp-status strong {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
.wp-status small {
|
||||
color: var(--text-muted) !important;
|
||||
}
|
||||
|
||||
/* === IMAGE GALLERY === */
|
||||
.image-gallery {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
overflow-x: auto;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.image-item {
|
||||
min-width: 200px;
|
||||
text-align: center;
|
||||
background: var(--bg-card);
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
border: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.image-item img {
|
||||
border-radius: 6px;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.image-item strong,
|
||||
.image-item p {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
.image-item small {
|
||||
color: var(--text-muted) !important;
|
||||
}
|
||||
|
||||
/* === BUTTONS & ACTIONS === */
|
||||
.action-button {
|
||||
margin: 0.25rem;
|
||||
border-radius: 6px;
|
||||
}
|
||||
|
||||
/* Streamlit Button Overrides */
|
||||
.stButton > button {
|
||||
background: var(--gradient-primary) !important;
|
||||
color: white !important;
|
||||
border: none !important;
|
||||
border-radius: 8px !important;
|
||||
font-weight: 600 !important;
|
||||
transition: all 0.2s ease !important;
|
||||
}
|
||||
|
||||
.stButton > button:hover {
|
||||
transform: translateY(-1px) !important;
|
||||
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
|
||||
}
|
||||
|
||||
/* === SELECTBOX & INPUT OVERRIDES === */
|
||||
.stSelectbox > div > div {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
.stTextInput > div > div > input {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
/* === TABS === */
|
||||
.stTabs [data-baseweb="tab-list"] {
|
||||
background-color: var(--bg-secondary) !important;
|
||||
border-radius: 8px;
|
||||
padding: 0.25rem;
|
||||
}
|
||||
|
||||
.stTabs [data-baseweb="tab"] {
|
||||
color: var(--text-secondary) !important;
|
||||
background-color: transparent !important;
|
||||
border-radius: 6px !important;
|
||||
font-weight: 600 !important;
|
||||
}
|
||||
|
||||
.stTabs [aria-selected="true"] {
|
||||
background-color: var(--text-accent) !important;
|
||||
color: white !important;
|
||||
}
|
||||
|
||||
/* === EXPANDER === */
|
||||
.streamlit-expanderHeader {
|
||||
background-color: var(--bg-card) !important;
|
||||
color: var(--text-primary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-radius: 8px !important;
|
||||
}
|
||||
|
||||
.streamlit-expanderContent {
|
||||
background-color: var(--bg-card) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-top: none !important;
|
||||
}
|
||||
|
||||
/* === METRICS === */
|
||||
.metric-container {
|
||||
background: var(--bg-card) !important;
|
||||
padding: 1rem;
|
||||
border-radius: 8px;
|
||||
border: 1px solid var(--border-color);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.metric-container [data-testid="metric-container"] {
|
||||
background: transparent !important;
|
||||
}
|
||||
|
||||
.metric-container [data-testid="metric-container"] > div {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
/* === CODE BLOCKS === */
|
||||
.stCodeBlock {
|
||||
background-color: var(--bg-secondary) !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
/* === SUCCESS/ERROR/WARNING/INFO === */
|
||||
.stAlert {
|
||||
border-radius: 8px !important;
|
||||
border: 1px solid var(--border-color) !important;
|
||||
}
|
||||
|
||||
.stSuccess {
|
||||
background-color: rgba(76, 175, 80, 0.1) !important;
|
||||
color: var(--status-online) !important;
|
||||
border-color: var(--status-online) !important;
|
||||
}
|
||||
|
||||
.stError {
|
||||
background-color: rgba(244, 67, 54, 0.1) !important;
|
||||
color: var(--status-trash) !important;
|
||||
border-color: var(--status-trash) !important;
|
||||
}
|
||||
|
||||
.stWarning {
|
||||
background-color: rgba(255, 152, 0, 0.1) !important;
|
||||
color: var(--status-rewrite) !important;
|
||||
border-color: var(--status-rewrite) !important;
|
||||
}
|
||||
|
||||
.stInfo {
|
||||
background-color: rgba(33, 150, 243, 0.1) !important;
|
||||
color: var(--status-new) !important;
|
||||
border-color: var(--status-new) !important;
|
||||
}
|
||||
|
||||
/* === SIDEBAR === */
|
||||
.css-1d391kg {
|
||||
background-color: var(--bg-secondary) !important;
|
||||
}
|
||||
|
||||
.css-1d391kg .stMarkdown {
|
||||
color: var(--text-primary) !important;
|
||||
}
|
||||
|
||||
/* === RESPONSIVE DESIGN === */
|
||||
@media (max-width: 768px) {
|
||||
.main-header {
|
||||
padding: 1.5rem;
|
||||
}
|
||||
|
||||
.main-header h1 {
|
||||
font-size: 2rem;
|
||||
}
|
||||
|
||||
.article-card {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.stats-card {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* === UTILITY CLASSES === */
|
||||
.text-primary { color: var(--text-primary) !important; }
|
||||
.text-secondary { color: var(--text-secondary) !important; }
|
||||
.text-muted { color: var(--text-muted) !important; }
|
||||
.text-accent { color: var(--text-accent) !important; }
|
||||
|
||||
.bg-card { background-color: var(--bg-card) !important; }
|
||||
.bg-secondary { background-color: var(--bg-secondary) !important; }
|
||||
|
||||
.border-radius { border-radius: 8px; }
|
||||
.shadow-light { box-shadow: var(--shadow-light); }
|
||||
.shadow-hover { box-shadow: var(--shadow-hover); }
|
||||
|
||||
/* === SCROLLBAR STYLING === */
|
||||
::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-track {
|
||||
background: var(--bg-secondary);
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb {
|
||||
background: var(--text-muted);
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb:hover {
|
||||
background: var(--text-secondary);
|
||||
}
|
||||
|
||||
/* === LOADING SPINNER === */
|
||||
@keyframes spin {
|
||||
0% { transform: rotate(0deg); }
|
||||
100% { transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
.loading-spinner {
|
||||
border: 4px solid var(--bg-secondary);
|
||||
border-top: 4px solid var(--text-accent);
|
||||
border-radius: 50%;
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
animation: spin 1s linear infinite;
|
||||
margin: 0 auto 1rem auto;
|
||||
}
|
||||
|
||||
/* === FOCUS STATES === */
|
||||
.stButton > button:focus,
|
||||
.stSelectbox > div > div:focus,
|
||||
.stTextInput > div > div > input:focus {
|
||||
outline: 2px solid var(--text-accent) !important;
|
||||
outline-offset: 2px !important;
|
||||
}
|
||||
303
tools/image_deduper.py
Normal file
303
tools/image_deduper.py
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
image_deduper.py — Finde und bereinige Bild-Dubletten sicher & reversibel.
|
||||
|
||||
Funktionen:
|
||||
- Scan: Verzeichnisse rekursiv scannen, sha256 + pHash berechnen
|
||||
- Report: CSV + menschenlesbare Zusammenfassung mit Gruppen
|
||||
- Apply: Duplikate auf kanonische Datei umbiegen (Hardlink/Löschen)
|
||||
- Optional: DB-Referenzen aktualisieren (SQLite/SQLModel kompatibel)
|
||||
|
||||
Nutzung (Beispiele):
|
||||
# 1) Nur scannen + reporten (keine Änderungen):
|
||||
python tools/image_deduper.py scan --roots media,assets/images --out-dir .dedupe --phash
|
||||
|
||||
# 2) Report anzeigen:
|
||||
python tools/image_deduper.py report --index .dedupe/index.sqlite --csv
|
||||
|
||||
# 3) Anwenden (Hardlinks setzen, Dry-Run):
|
||||
python tools/image_deduper.py apply --index .dedupe/index.sqlite --mode hardlink --dry-run
|
||||
|
||||
# 4) Anwenden (wirklich ändern):
|
||||
python tools/image_deduper.py apply --index .dedupe/index.sqlite --mode hardlink
|
||||
|
||||
# 5) Referenzen in DB aktualisieren (optional):
|
||||
python tools/image_deduper.py apply --index .dedupe/index.sqlite --update-db sqlite:///./rssbot.db --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import hashlib
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None
|
||||
|
||||
try:
|
||||
import imagehash # type: ignore
|
||||
except ImportError:
|
||||
imagehash = None
|
||||
|
||||
|
||||
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
|
||||
DEFAULT_INDEX = ".dedupe/index.sqlite"
|
||||
DEFAULT_REPORT = ".dedupe/report.csv"
|
||||
|
||||
|
||||
def sha256_file(path: Path, bufsize: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in chunks of *bufsize* bytes, so large files are never
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: stream.read(bufsize), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def calc_phash(path: Path) -> Optional[str]:
    """Return the perceptual hash of the image at *path* as a string.

    Returns None when Pillow or imagehash could not be imported, and also
    when opening or hashing the file fails (best effort: the error is not
    reported).
    """
    libraries_ready = Image is not None and imagehash is not None
    if not libraries_ready:
        return None
    try:
        with Image.open(path) as handle:
            rgb = handle.convert("RGB")
            # hash_size=16 -> 16x16 grid, i.e. a 256-bit hash.
            return str(imagehash.phash(rgb, hash_size=16))
    except Exception:
        return None
|
||||
|
||||
|
||||
def ensure_dir(p: Path):
    """Create directory *p* including missing parents; no-op if it exists."""
    p.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def init_index(db_path: Path):
    """Create the ``files`` table and its indexes in the SQLite file *db_path*.

    Safe to call repeatedly: every statement uses ``IF NOT EXISTS``.
    The connection is closed even when a statement raises.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                size INTEGER NOT NULL,
                mtime REAL NOT NULL,
                sha256 TEXT NOT NULL,
                phash TEXT,
                ext TEXT NOT NULL
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_sha256 ON files (sha256);")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_phash ON files (phash);")
        conn.commit()
    finally:
        # Without this, an error above would leak the open connection.
        conn.close()
|
||||
|
||||
|
||||
def is_image(path: Path) -> bool:
    """Tell whether the suffix of *path* (case-insensitive) is in IMAGE_EXTS."""
    extension = path.suffix.lower()
    return extension in IMAGE_EXTS
|
||||
|
||||
|
||||
def walk_images(roots: List[Path]) -> Iterable[Path]:
    """Yield every regular file below each of *roots* that is_image() accepts."""
    for base in roots:
        yield from (
            candidate
            for candidate in base.rglob("*")
            if candidate.is_file() and is_image(candidate)
        )
|
||||
|
||||
|
||||
def upsert_file(db_path: Path, path: Path, sha256: str, phash: Optional[str]):
    """Insert or refresh the index row for *path* in the SQLite file *db_path*.

    Size and mtime are taken from ``path.stat()``; the extension is stored
    lower-cased. An existing row with the same path is updated in place.
    The connection is closed even when the statement raises.
    """
    # stat() first: if the file vanished we fail before opening the database.
    st = path.stat()
    row = (str(path), st.st_size, st.st_mtime, sha256, phash, path.suffix.lower())
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("""
            INSERT INTO files (path, size, mtime, sha256, phash, ext)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(path) DO UPDATE SET
                size=excluded.size,
                mtime=excluded.mtime,
                sha256=excluded.sha256,
                phash=excluded.phash,
                ext=excluded.ext;
        """, row)
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def group_by_sha256(db_path: Path) -> List[List[Tuple[int, str, int]]]:
    """Return groups of indexed files that share the same sha256.

    Each group is a list of ``(id, path, size)`` tuples with more than one
    member. Groups are ordered by sha256 and members by id, so the result is
    deterministic for a given index.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # One query instead of one query per duplicated hash.
        rows = conn.execute("""
            SELECT sha256, id, path, size
            FROM files
            WHERE sha256 IN (
                SELECT sha256 FROM files GROUP BY sha256 HAVING COUNT(*) > 1
            )
            ORDER BY sha256, id;
        """).fetchall()
    finally:
        conn.close()

    grouped = {}
    for digest, rid, rpath, rsize in rows:
        grouped.setdefault(digest, []).append((rid, rpath, rsize))
    # dict preserves insertion order, i.e. the ORDER BY of the query.
    return list(grouped.values())
|
||||
|
||||
|
||||
def write_csv_report(db_path: Path, csv_path: Path) -> Tuple[int, int]:
    """Write the duplicate report for the index *db_path* to *csv_path*.

    For every sha256 group the member with the largest size (the first one on
    ties) is the canonical file; each other member becomes one CSV row.

    Returns ``(number_of_duplicates, total_bytes_of_duplicates)``.
    """
    groups = group_by_sha256(db_path)
    ensure_dir(csv_path.parent)
    dup_count = 0
    reclaimable = 0
    with csv_path.open("w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["group_id", "canonical_path", "dup_path", "dup_size_bytes"])
        # Empty groups are skipped and do not consume a group id.
        for gid, members in enumerate(grp for grp in groups if grp):
            keeper_path = max(members, key=lambda member: member[2])[1]
            for _rid, member_path, member_size in members:
                if member_path != keeper_path:
                    dup_count += 1
                    reclaimable += member_size
                    writer.writerow([gid, keeper_path, member_path, member_size])
    return dup_count, reclaimable
|
||||
|
||||
|
||||
def apply_hardlink(canonical: Path, dup: Path, dry_run: bool) -> None:
    """Replace *dup* with a hard link to *canonical*.

    Both paths must be on the same filesystem. Nothing happens when *dry_run*
    is true.

    The link is created under a temporary name first and then moved over
    *dup*, so *dup* is only replaced once the link exists. If linking fails
    (different filesystem, missing canonical file, ...), *dup* is left
    untouched.

    Raises:
        FileNotFoundError: if *dup* or *canonical* does not exist.
        OSError: if the link cannot be created or moved into place.
    """
    if dry_run:
        return
    # Keep the old contract: a missing duplicate is an error, not a request
    # to create a new link at that path.
    dup.lstat()
    tmp = dup.with_suffix(dup.suffix + ".dedupe.tmp")
    os.link(canonical, tmp)  # may fail; dup still exists at this point
    try:
        tmp.replace(dup)  # atomic rename over the duplicate
    except OSError:
        # Do not leave the temporary link behind if the swap failed.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def apply_delete(dup: Path, dry_run: bool) -> None:
    """Delete the file *dup*; do nothing when *dry_run* is true."""
    if not dry_run:
        dup.unlink()
|
||||
|
||||
|
||||
@dataclass
class ApplyStats:
    """Counters returned by apply_changes()."""

    # Report rows handled without an exception.
    processed: int = 0
    # Report rows whose handling raised an exception.
    errors: int = 0
    # Sum of dup_size_bytes over the processed rows.
    saved_bytes: int = 0
|
||||
|
||||
|
||||
def apply_changes(csv_report: Path, mode: str, dry_run: bool) -> ApplyStats:
    """Apply the duplicate report *csv_report* row by row.

    Args:
        csv_report: CSV with the columns ``canonical_path``, ``dup_path`` and
            ``dup_size_bytes``.
        mode: ``"hardlink"`` or ``"delete"``. Any other value is counted as
            an error for every row.
        dry_run: passed through to the apply helpers, which then change
            nothing; the counters are still updated.

    Returns:
        An ApplyStats with the processed/error counts and the summed size of
        the processed duplicates.
    """
    stats = ApplyStats()
    # newline="" matches how the report is written and is what the csv
    # module requires so that quoted fields with line breaks round-trip.
    with csv_report.open("r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            canonical = Path(row["canonical_path"])
            dup = Path(row["dup_path"])
            size = int(row["dup_size_bytes"])
            try:
                if mode == "hardlink":
                    apply_hardlink(canonical, dup, dry_run)
                elif mode == "delete":
                    apply_delete(dup, dry_run)
                else:
                    raise ValueError("mode must be 'hardlink' or 'delete'")
                stats.processed += 1
                stats.saved_bytes += size
            except Exception as e:
                # One failing file must not abort the rest of the report.
                stats.errors += 1
                print(f"[ERROR] {dup}: {e}", file=sys.stderr)
    return stats
|
||||
|
||||
|
||||
def parse_roots(roots_arg: str) -> List[Path]:
    """Split the comma-separated *roots_arg* into a list of existing paths.

    Surrounding whitespace is stripped and empty entries are dropped.

    Raises:
        FileNotFoundError: for the first entry that does not exist.
    """
    roots = []
    for token in roots_arg.split(","):
        name = token.strip()
        if name:
            roots.append(Path(name))
    missing = next((root for root in roots if not root.exists()), None)
    if missing is not None:
        raise FileNotFoundError(f"Root not found: {missing}")
    return roots
|
||||
|
||||
|
||||
def cmd_scan(args):
    """Handle ``scan``: index all images below the roots and write the report.

    Reads ``args.roots``, ``args.out_dir``, ``args.index``, ``args.report``
    and ``args.phash``. When ``--index`` / ``--report`` are not given, the
    files are placed inside ``--out-dir``; with the default out dir this is
    identical to DEFAULT_INDEX / DEFAULT_REPORT.
    """
    out_dir = Path(args.out_dir)
    # Previously --out-dir was created but never used for the default paths.
    index = Path(args.index) if args.index else out_dir / Path(DEFAULT_INDEX).name
    report = Path(args.report) if args.report else out_dir / Path(DEFAULT_REPORT).name
    ensure_dir(out_dir)
    ensure_dir(index.parent)
    init_index(index)
    roots = parse_roots(args.roots)

    count = 0
    for path in walk_images(roots):
        try:
            h = sha256_file(path)
            ph = calc_phash(path) if args.phash else None
            upsert_file(index, path, h, ph)
            count += 1
            if count % 500 == 0:
                print(f"... indexed {count} files")
        except Exception as e:
            # Skip unreadable files but keep scanning.
            print(f"[WARN] {path}: {e}", file=sys.stderr)

    dups, savings = write_csv_report(index, report)
    print(f"Indexed {count} images. Found duplicate files: {dups}, potential savings: {savings/1_000_000:.2f} MB")
    print(f"Index: {index}")
    print(f"Report: {report}")
|
||||
|
||||
|
||||
def cmd_report(args):
    """Handle ``report``: rewrite the CSV report from the index and print totals."""
    index_path = Path(args.index or DEFAULT_INDEX)
    report_path = Path(args.report or DEFAULT_REPORT)
    dup_count, byte_total = write_csv_report(index_path, report_path)
    megabytes = byte_total / 1_000_000
    print(f"Duplicates: {dup_count}, potential savings: {megabytes:.2f} MB")
    if not args.csv:
        return
    print(f"CSV written: {report_path}")
|
||||
|
||||
|
||||
def cmd_apply(args):
    """Handle ``apply``: run the report through apply_changes() and print a summary.

    Raises:
        FileNotFoundError: if the report file does not exist.
    """
    report_path = Path(args.report or DEFAULT_REPORT)
    if not report_path.exists():
        raise FileNotFoundError(f"Report not found: {report_path}")

    stats = apply_changes(report_path, args.mode, args.dry_run)
    saved_mb = stats.saved_bytes / 1_000_000
    print(
        f"Processed: {stats.processed}, Errors: {stats.errors}, "
        f"Saved: {saved_mb:.2f} MB (mode={args.mode}, dry_run={args.dry_run})"
    )

    if not args.update_db:
        return
    # Placeholder: this is where DB references could be updated if image
    # paths are stored in a database, e.g. pointing a SQLModel table
    # ImageMeta(content_hash UNIQUE, local_path) at the canonical path.
    print(f"[INFO] DB update requested for: {args.update_db} (implementierung projektspezifisch)")
|
||||
|
||||
|
||||
def main():
    """Parse the command line and dispatch to the scan/report/apply handler."""
    ap = argparse.ArgumentParser(description="Bild-Deduplizierung (scan/report/apply)")
    sub = ap.add_subparsers(dest="cmd", required=True)

    sc = sub.add_parser("scan", help="Verzeichnisse scannen und Index/Report erstellen")
    sc.add_argument("--roots", required=True, help="Kommagetrennte Wurzelpfade, z.B. 'media,assets/images'")
    sc.add_argument("--out-dir", default=".dedupe", help="Ausgabeverzeichnis für Index/Reports")
    sc.add_argument("--index", help="Pfad zur SQLite-Indexdatei (default .dedupe/index.sqlite)")
    sc.add_argument("--report", help="Pfad zum CSV-Report (default .dedupe/report.csv)")
    sc.add_argument("--phash", action="store_true", help="Perzeptuellen Hash berechnen (für zukünftige Near-Dups)")
    sc.set_defaults(func=cmd_scan)

    rp = sub.add_parser("report", help="Report neu generieren/anzeigen")
    rp.add_argument("--index", help="Pfad zur SQLite-Indexdatei")
    rp.add_argument("--report", help="Pfad zum CSV-Report")
    rp.add_argument("--csv", action="store_true", help="CSV-Pfad ausgeben")
    rp.set_defaults(func=cmd_report)

    aply = sub.add_parser("apply", help="Änderungen anwenden (Hardlink/Delete)")
    # The documented usage calls `apply --index ...`; without this option
    # argparse rejects those commands. apply itself works from the report.
    aply.add_argument("--index", help="Pfad zur SQLite-Indexdatei")
    aply.add_argument("--report", help="Pfad zum CSV-Report")
    aply.add_argument("--mode", choices=["hardlink", "delete"], default="hardlink", help="Strategie für Duplikate")
    aply.add_argument("--dry-run", action="store_true", help="Nur anzeigen, nichts ändern")
    aply.add_argument("--update-db", help="Optional: DB-URL für Referenz-Updates (projektspezifisch)")
    aply.set_defaults(func=cmd_apply)

    args = ap.parse_args()
    args.func(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
51
utils/config.py
Normal file
51
utils/config.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import os
|
||||
from typing import Dict, List
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def validate_env() -> Dict:
    """Validate the security-relevant environment variables.

    Returns a dict with the keys ``ok`` (bool), ``errors`` (list of str),
    ``warnings`` (list of str) and ``summary`` (variable name -> whether a
    non-empty value is set).
    """

    def _env(name: str) -> str:
        # Missing variables and whitespace-only values both count as empty.
        return os.getenv(name, "").strip()

    wp_base_url = _env("WP_BASE_URL")
    wp_user = _env("WP_USERNAME")
    wp_pass = _env("WP_PASSWORD")
    wp_b64 = _env("WP_AUTH_BASE64")
    openai_key = _env("OPENAI_API_KEY")

    errors: List[str] = []
    warnings: List[str] = []

    # WP_BASE_URL is mandatory and must carry an http(s) scheme.
    if not wp_base_url:
        errors.append("WP_BASE_URL fehlt in .env")
    elif not wp_base_url.startswith(("http://", "https://")):
        errors.append("WP_BASE_URL muss mit http:// oder https:// beginnen")

    # Credentials: either the Base64 string or username + password.
    if not wp_b64:
        if wp_user and wp_pass:
            warnings.append("WP_AUTH_BASE64 nicht gesetzt – Empfehlung: Base64 nutzen (Application Password)")
        else:
            errors.append("Entweder WP_AUTH_BASE64 oder WP_USERNAME + WP_PASSWORD in .env setzen")

    if not openai_key:
        warnings.append("OPENAI_API_KEY ist nicht gesetzt – Umschreibungsfunktion ist deaktiviert")

    values = (
        ("WP_BASE_URL", wp_base_url),
        ("WP_USERNAME", wp_user),
        ("WP_PASSWORD", wp_pass),
        ("WP_AUTH_BASE64", wp_b64),
        ("OPENAI_API_KEY", openai_key),
    )
    summary = {name: bool(value) for name, value in values}

    return {"ok": not errors, "errors": errors, "warnings": warnings, "summary": summary}
|
||||
|
||||
|
||||
__all__ = ["validate_env"]
|
||||
|
||||
367
utils/css_loader.py
Normal file
367
utils/css_loader.py
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
# utils/css_loader.py
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def load_css():
    """Load ``static/styles.css`` and inject it into the Streamlit app.

    If the file is missing, create_css_file() is asked once to write the
    fallback stylesheet.

    Returns:
        True when the stylesheet was injected, False when it could not be
        created or read (an error is shown via ``st.error`` in that case).
    """
    try:
        css_file = Path(__file__).parent.parent / "static" / "styles.css"

        # Try to create the fallback exactly once. The previous version
        # called load_css() recursively, which never terminated when the
        # file could not be written.
        if not css_file.exists() and not create_css_file():
            return False

        css_content = css_file.read_text(encoding="utf-8")
        st.markdown(f"<style>\n{css_content}\n</style>", unsafe_allow_html=True)
        return True

    except Exception as e:
        st.error(f"Fehler beim Laden der CSS-Datei: {e}")
        return False
|
||||
|
||||
def create_css_file():
    """Write the built-in fallback stylesheet to ``static/styles.css``.

    The ``static`` directory is created if necessary.

    In ``.article-card`` and ``.wp-status`` the ``border`` shorthand is
    declared before ``border-left``: the shorthand resets all four sides, so
    in the opposite order the coloured left stripe was never rendered.

    Returns:
        True on success, False if the file could not be written (an error is
        shown via ``st.error`` in that case).
    """
    css_content = """/* ===============================================
   RSS Feed Manager - Zentrale CSS-Datei
   Dark-Mode optimiert mit Fallbacks
   =============================================== */

/* === ROOT VARIABLEN === */
:root {
    /* Dark Mode Farbpalette */
    --bg-primary: #1e1e1e;
    --bg-secondary: #2d2d30;
    --bg-card: #2d2d30;
    --bg-header: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    --bg-filter: #363636;

    /* Text Farben */
    --text-primary: #ffffff;
    --text-secondary: #b0b0b0;
    --text-muted: #888888;
    --text-accent: #667eea;

    /* Status Farben */
    --status-new: #2196f3;
    --status-new-bg: #1565c0;
    --status-rewrite: #ff9800;
    --status-rewrite-bg: #ef6c00;
    --status-process: #9c27b0;
    --status-process-bg: #6a1b9a;
    --status-online: #4caf50;
    --status-online-bg: #2e7d32;
    --status-hold: #e91e63;
    --status-hold-bg: #ad1457;
    --status-trash: #f44336;
    --status-trash-bg: #c62828;
    --status-wp-pending: #00bcd4;
    --status-wp-pending-bg: #0097a7;

    /* Borders & Shadows */
    --border-color: #404040;
    --shadow-light: 0 2px 8px rgba(0, 0, 0, 0.3);
    --shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.4);

    /* Accent Colors */
    --gradient-primary: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    --gradient-secondary: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
}

/* === LIGHT MODE FALLBACKS === */
[data-theme="light"], .stApp[data-theme="light"] {
    --bg-primary: #ffffff;
    --bg-secondary: #f8f9fa;
    --bg-card: #ffffff;
    --bg-filter: #f0f2f6;

    --text-primary: #212529;
    --text-secondary: #495057;
    --text-muted: #6c757d;
    --text-accent: #667eea;

    --border-color: #dee2e6;
    --shadow-light: 0 2px 8px rgba(0, 0, 0, 0.1);
    --shadow-hover: 0 8px 20px rgba(0, 0, 0, 0.15);
}

/* === HAUPTCONTAINER === */
.main-header {
    background: var(--bg-header);
    padding: 2rem;
    border-radius: 12px;
    margin-bottom: 2rem;
    color: var(--text-primary);
    text-align: center;
    box-shadow: var(--shadow-light);
}

.main-header h1 {
    color: var(--text-primary) !important;
    margin: 0 0 0.5rem 0;
    font-size: 2.5rem;
    font-weight: 700;
}

.main-header p {
    color: rgba(255, 255, 255, 0.9) !important;
    margin: 0;
    font-size: 1.1rem;
}

/* === ARTIKEL CARDS === */
.article-card {
    background: var(--bg-card) !important;
    border-radius: 12px;
    padding: 1.5rem;
    margin-bottom: 1rem;
    box-shadow: var(--shadow-light);
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--text-accent);
    transition: all 0.3s ease;
    color: var(--text-primary) !important;
}

.article-card:hover {
    transform: translateY(-2px);
    box-shadow: var(--shadow-hover);
    border-color: var(--text-accent);
}

.article-card h3,
.article-card .article-title {
    color: var(--text-primary) !important;
    margin: 0 0 0.5rem 0;
    font-size: 1.2rem;
    font-weight: 600;
    line-height: 1.4;
}

.article-card .article-meta {
    color: var(--text-secondary) !important;
    font-size: 0.9rem;
    margin-bottom: 1rem;
}

.article-card .article-summary {
    color: var(--text-secondary) !important;
    line-height: 1.5;
    margin-bottom: 1rem;
}

.article-card .article-footer {
    color: var(--text-muted) !important;
    font-size: 0.85rem;
    display: flex;
    justify-content: space-between;
    align-items: center;
}

/* === STATUS BADGES === */
.status-badge {
    padding: 0.3rem 0.8rem;
    border-radius: 20px;
    font-size: 0.8rem;
    font-weight: 600;
    margin-right: 0.5rem;
    display: inline-block;
    color: white !important;
}

.status-new {
    background-color: var(--status-new-bg) !important;
    color: white !important;
}

.status-rewrite {
    background-color: var(--status-rewrite-bg) !important;
    color: white !important;
}

.status-process {
    background-color: var(--status-process-bg) !important;
    color: white !important;
}

.status-online {
    background-color: var(--status-online-bg) !important;
    color: white !important;
}

.status-hold {
    background-color: var(--status-hold-bg) !important;
    color: white !important;
}

.status-trash {
    background-color: var(--status-trash-bg) !important;
    color: white !important;
}

.status-wp-pending {
    background-color: var(--status-wp-pending-bg) !important;
    color: white !important;
}

/* === FILTER SECTION === */
.filter-section {
    background: var(--bg-filter) !important;
    padding: 1.5rem;
    border-radius: 12px;
    margin-bottom: 2rem;
    border: 1px solid var(--border-color);
    box-shadow: var(--shadow-light);
}

.filter-section h3 {
    color: var(--text-primary) !important;
    margin: 0 0 1rem 0;
    font-size: 1.3rem;
    font-weight: 600;
}

/* === STATS CARDS === */
.stats-card {
    background: var(--bg-card) !important;
    padding: 1.5rem;
    border-radius: 12px;
    text-align: center;
    box-shadow: var(--shadow-light);
    border: 1px solid var(--border-color);
    transition: transform 0.2s ease;
}

.stats-card:hover {
    transform: translateY(-2px);
}

.stats-number {
    font-size: 2.5rem;
    font-weight: 700;
    color: var(--text-accent) !important;
    margin-bottom: 0.5rem;
    display: block;
}

.stats-card div:last-child {
    color: var(--text-secondary) !important;
    font-weight: 500;
    font-size: 1rem;
}

/* === WORDPRESS STATUS === */
.wp-status {
    background: var(--bg-card) !important;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
    border: 1px solid var(--border-color);
    border-left: 4px solid var(--status-wp-pending);
    box-shadow: var(--shadow-light);
}

.wp-status strong {
    color: var(--text-primary) !important;
}

.wp-status small {
    color: var(--text-muted) !important;
}

/* === BUTTONS & ACTIONS === */
.stButton > button {
    background: var(--gradient-primary) !important;
    color: white !important;
    border: none !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: all 0.2s ease !important;
}

.stButton > button:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}

/* === SELECTBOX & INPUT OVERRIDES === */
.stSelectbox > div > div {
    background-color: var(--bg-card) !important;
    color: var(--text-primary) !important;
    border: 1px solid var(--border-color) !important;
}

.stTextInput > div > div > input {
    background-color: var(--bg-card) !important;
    color: var(--text-primary) !important;
    border: 1px solid var(--border-color) !important;
}

/* === RESPONSIVE DESIGN === */
@media (max-width: 768px) {
    .main-header {
        padding: 1.5rem;
    }

    .main-header h1 {
        font-size: 2rem;
    }

    .article-card {
        padding: 1rem;
    }

    .stats-card {
        padding: 1rem;
    }

    .stats-number {
        font-size: 2rem;
    }
}
"""

    try:
        # Create the static folder (and missing parents) if necessary.
        static_dir = Path(__file__).parent.parent / "static"
        static_dir.mkdir(parents=True, exist_ok=True)

        # Write the stylesheet.
        css_file = static_dir / "styles.css"
        with open(css_file, "w", encoding="utf-8") as f:
            f.write(css_content)

        return True
    except Exception as e:
        st.error(f"Fehler beim Erstellen der CSS-Datei: {e}")
        return False
|
||||
|
||||
def apply_dark_theme():
    """Emit a script snippet that sets ``data-theme`` from the OS colour scheme.

    The snippet is passed to ``st.markdown`` with ``unsafe_allow_html=True``.
    NOTE(review): Streamlit may not execute <script> tags injected through
    st.markdown -- confirm that this has any effect in the target version.
    """
    theme_script = """
    <script>
    // Dark Theme Detection und Anwendung
    const prefersDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    if (prefersDark) {
        document.documentElement.setAttribute('data-theme', 'dark');
    } else {
        document.documentElement.setAttribute('data-theme', 'light');
    }
    </script>
    """
    st.markdown(theme_script, unsafe_allow_html=True)
|
||||
|
|
@ -11,17 +11,16 @@ from dotenv import load_dotenv
|
|||
|
||||
load_dotenv()
|
||||
|
||||
# WordPress API Konfiguration
|
||||
WP_BASE_URL = os.getenv("WP_BASE_URL", "https://vanityontour.de")
|
||||
WP_USERNAME = os.getenv("WP_USERNAME", "ogiertz")
|
||||
WP_PASSWORD = os.getenv("WP_PASSWORD", "whNEx9aZCIUXViV89Z3e7Z03")
|
||||
WP_AUTH_BASE64 = os.getenv("WP_AUTH_BASE64", "b2dpZXJ0ejp3aE5FeDlhWkNJVVhWaVY4OVozZTdaMDM=")
|
||||
WP_API_ENDPOINT = f"{WP_BASE_URL}/wp-json/wp/v2"
|
||||
# WordPress API Konfiguration – ausschließlich aus .env
|
||||
WP_BASE_URL = os.getenv("WP_BASE_URL")
|
||||
WP_USERNAME = os.getenv("WP_USERNAME")
|
||||
WP_PASSWORD = os.getenv("WP_PASSWORD")
|
||||
WP_AUTH_BASE64 = os.getenv("WP_AUTH_BASE64")
|
||||
|
||||
# Request-Konfiguration
|
||||
REQUEST_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
USER_AGENT = 'RSS-Feed-Manager/1.6.1'
|
||||
USER_AGENT = 'RSS-Feed-Manager/1.7.x'
|
||||
|
||||
class WordPressUploader:
|
||||
"""
|
||||
|
|
@ -30,41 +29,43 @@ class WordPressUploader:
|
|||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.base_url = WP_BASE_URL
|
||||
self.api_endpoint = WP_API_ENDPOINT
|
||||
# Basis-URL validieren und Endpunkt bauen
|
||||
if not WP_BASE_URL:
|
||||
raise ValueError("WP_BASE_URL nicht gesetzt. Bitte .env konfigurieren.")
|
||||
self.base_url = WP_BASE_URL.rstrip('/')
|
||||
self.api_endpoint = f"{self.base_url}/wp-json/wp/v2"
|
||||
|
||||
# Zugangsdaten (aus .env)
|
||||
self.username = WP_USERNAME
|
||||
self.password = WP_PASSWORD
|
||||
self.auth_base64 = WP_AUTH_BASE64
|
||||
|
||||
|
||||
if not self.auth_base64 and not (self.username and self.password):
|
||||
raise ValueError("WordPress-Authentifizierung nicht konfiguriert. WP_AUTH_BASE64 oder WP_USERNAME + WP_PASSWORD setzen.")
|
||||
|
||||
# Session für bessere Performance
|
||||
self.session = requests.Session()
|
||||
|
||||
|
||||
# Authentifizierung über Authorization Header mit Base64
|
||||
if self.auth_base64:
|
||||
# Verwende bereitgestellten Base64-String
|
||||
self.session.headers.update({
|
||||
'Authorization': f'Basic {self.auth_base64}',
|
||||
'User-Agent': USER_AGENT,
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'application/json'
|
||||
})
|
||||
logging.info("✅ WordPress-Authentifizierung: Verwende bereitgestellten Base64-String")
|
||||
logging.info("✅ WordPress-Authentifizierung: Base64-String verwendet")
|
||||
else:
|
||||
# Fallback: Generiere Base64 aus Username/Password
|
||||
if self.username and self.password:
|
||||
credentials = f"{self.username}:{self.password}"
|
||||
encoded_credentials = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
|
||||
self.session.headers.update({
|
||||
'Authorization': f'Basic {encoded_credentials}',
|
||||
'User-Agent': USER_AGENT,
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'application/json'
|
||||
})
|
||||
logging.info("✅ WordPress-Authentifizierung: Base64 aus Username/Password generiert")
|
||||
else:
|
||||
logging.error("❌ WordPress-Authentifizierung: Weder Base64-String noch Username/Password verfügbar")
|
||||
raise ValueError("WordPress-Authentifizierung nicht konfiguriert")
|
||||
|
||||
credentials = f"{self.username}:{self.password}"
|
||||
encoded_credentials = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
|
||||
self.session.headers.update({
|
||||
'Authorization': f'Basic {encoded_credentials}',
|
||||
'User-Agent': USER_AGENT,
|
||||
'Content-Type': 'application/json',
|
||||
'Accept': 'application/json'
|
||||
})
|
||||
logging.info("✅ WordPress-Authentifizierung: Base64 aus Username/Password generiert")
|
||||
|
||||
# Standard-Kategorie ID ermitteln
|
||||
self.default_category_id = self._get_default_category_id()
|
||||
|
||||
|
|
@ -335,11 +336,7 @@ class WordPressUploader:
|
|||
Testet die Verbindung zur WordPress API mit Base64-Authentifizierung
|
||||
"""
|
||||
try:
|
||||
logging.info("🔧 Teste WordPress-API-Verbindung mit Base64-Auth...")
|
||||
|
||||
# Debug: Auth-Header prüfen
|
||||
auth_header = self.session.headers.get('Authorization', 'Nicht gesetzt')
|
||||
logging.info(f"🔑 Authorization Header: {auth_header[:20]}..." if len(auth_header) > 20 else f"🔑 Authorization Header: {auth_header}")
|
||||
logging.info("🔧 Teste WordPress-API-Verbindung...")
|
||||
|
||||
# Einfache Abfrage der Kategorien als Test
|
||||
response = self.session.get(
|
||||
|
|
@ -349,7 +346,6 @@ class WordPressUploader:
|
|||
)
|
||||
|
||||
logging.info(f"📡 API-Response Status: {response.status_code}")
|
||||
logging.info(f"📡 API-Response Headers: {dict(response.headers)}")
|
||||
|
||||
if response.status_code == 200:
|
||||
logging.info("✅ WordPress-API-Verbindung erfolgreich")
|
||||
|
|
@ -465,4 +461,4 @@ def upload_single_article_to_wordpress(article: Dict) -> Tuple[bool, str, Option
|
|||
return False, connection_msg, None
|
||||
|
||||
# Artikel hochladen
|
||||
return uploader.upload_article(article)
|
||||
return uploader.upload_article(article)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue