feat: rebuild rss-news backend, admin ui, and legal extraction pipeline
This commit is contained in:
parent
d65c55d315
commit
2c331d683b
43 changed files with 3463 additions and 73 deletions
1
backend/app/__init__.py
Normal file
1
backend/app/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Application package."""
|
||||
265
backend/app/admin_ui.py
Normal file
265
backend/app/admin_ui.py
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from fastapi import APIRouter, Form, Request
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from .auth import create_session_token, verify_credentials, verify_session_token
|
||||
from .config import get_settings
|
||||
from .ingestion import run_ingestion
|
||||
from .policy import evaluate_source_policy
|
||||
from .repositories import (
|
||||
FeedCreate,
|
||||
SourceCreate,
|
||||
create_feed,
|
||||
create_source,
|
||||
get_article_by_id,
|
||||
list_articles,
|
||||
list_feeds,
|
||||
list_runs,
|
||||
list_sources,
|
||||
update_article_status,
|
||||
)
|
||||
|
||||
# Module-level objects shared by every admin route in this file.
settings = get_settings()
router = APIRouter(tags=["admin-ui"])
# Templates live in the "templates" directory one level above this package
# (the parent of the directory that contains this file).
templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
|
||||
# Article workflow state machine: maps a current status to the statuses an
# admin may move the article to through the transition endpoint. Also handed
# to the dashboard template as "allowed_transitions".
ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = {
    "new": ("review", "rewrite", "error"),
    "rewrite": ("review", "error"),
    "review": ("approved", "rewrite", "error"),
    "approved": ("published", "error"),
    "published": ("error",),
    "error": ("review", "rewrite"),
}
|
||||
|
||||
|
||||
def _admin_user(request: Request) -> str | None:
    """Return the username carried by the request's session cookie.

    Returns ``None`` when the cookie is missing or empty; otherwise returns
    whatever ``verify_session_token`` yields for the cookie value.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    return verify_session_token(session_token) if session_token else None
|
||||
|
||||
|
||||
def _to_optional_int(raw: str | None) -> int | None:
|
||||
if raw is None:
|
||||
return None
|
||||
value = raw.strip()
|
||||
if value == "":
|
||||
return None
|
||||
return int(value)
|
||||
|
||||
|
||||
def _dashboard_redirect(
    *,
    msg: str | None = None,
    msg_type: str = "success",
    status_filter: str | None = None,
) -> RedirectResponse:
    """Build a 303 redirect to the admin dashboard.

    A non-empty ``msg`` is passed along as the ``msg``/``type`` query
    parameters (the dashboard reads them as flash message and flash type);
    a non-empty ``status_filter`` is forwarded as ``status_filter``.
    """
    params: dict[str, str] = {}
    if msg:
        params.update(msg=msg, type=msg_type)
    if status_filter:
        params["status_filter"] = status_filter

    target = "/admin/dashboard"
    if params:
        target = f"{target}?{urlencode(params)}"
    return RedirectResponse(url=target, status_code=303)
|
||||
|
||||
|
||||
def _parse_meta_json(raw: str | None) -> dict:
|
||||
if not raw:
|
||||
return {}
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
return parsed if isinstance(parsed, dict) else {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
@router.get("/admin", response_class=HTMLResponse)
def admin_index(request: Request):
    """Entry point: send logged-in admins to the dashboard, others to login."""
    destination = "/admin/dashboard" if _admin_user(request) else "/admin/login"
    return RedirectResponse(url=destination, status_code=303)
|
||||
|
||||
|
||||
@router.get("/admin/login", response_class=HTMLResponse)
def admin_login_page(request: Request):
    """Render the login form; the ``error`` query parameter is passed to the template."""
    context = {
        "request": request,
        "title": "Admin Login",
        "error": request.query_params.get("error"),
    }
    return templates.TemplateResponse(request, "admin_login.html", context)
|
||||
|
||||
|
||||
@router.post("/admin/login")
def admin_login(request: Request, username: str = Form(...), password: str = Form(...)):
    """Handle the login form submission.

    On invalid credentials, redirects back to the login page with
    ``?error=1``. On success, issues a signed session token as an HTTP-only
    cookie and redirects to the dashboard.
    """
    if not verify_credentials(username, password):
        return RedirectResponse(url="/admin/login?error=1", status_code=303)

    token = create_session_token(username)
    response = RedirectResponse(url="/admin/dashboard", status_code=303)
    response.set_cookie(
        key=settings.session_cookie_name,
        value=token,
        max_age=settings.session_max_age_seconds,
        httponly=True,
        # Mark the cookie Secure whenever the login itself arrived over HTTPS
        # so the session token is never sent back over plain HTTP. Plain-HTTP
        # (local development) logins keep working exactly as before.
        secure=request.url.scheme == "https",
        samesite="lax",
    )
    return response
|
||||
|
||||
|
||||
@router.post("/admin/logout")
def admin_logout():
    """Clear the session cookie and redirect to the login page."""
    redirect = RedirectResponse(url="/admin/login", status_code=303)
    redirect.delete_cookie(settings.session_cookie_name)
    return redirect
|
||||
|
||||
|
||||
@router.get("/admin/dashboard", response_class=HTMLResponse)
def admin_dashboard(request: Request):
    """Render the admin dashboard.

    Requires a valid session (otherwise redirects to the login page). Loads
    sources with their policy evaluation, feeds, the latest 30 runs and up to
    100 articles, optionally filtered by the ``status_filter`` query
    parameter. Each article dict is enriched in place with its decoded
    metadata and selected extraction fields before rendering.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    status_options = ["new", "rewrite", "review", "approved", "published", "error"]

    sources = list_sources()
    source_policy = {}
    for source in sources:
        source_policy[source["id"]] = evaluate_source_policy(source)
    feeds = list_feeds()
    runs = list_runs(limit=30)

    status_filter = request.query_params.get("status_filter")
    if status_filter not in status_options:
        # Unknown or missing filter: show everything.
        status_filter = ""
        articles = list_articles(limit=100)
    else:
        articles = list_articles(limit=100, status_filter=status_filter)

    def _if_type(value, expected, fallback):
        # Keep the value only when it has the expected type.
        return value if isinstance(value, expected) else fallback

    for article in articles:
        meta = _parse_meta_json(article.get("meta_json"))
        extraction = _if_type(meta.get("extraction"), dict, {})
        article["meta"] = meta
        article["extracted_images"] = _if_type(extraction.get("images"), list, [])
        article["press_contact"] = _if_type(extraction.get("press_contact"), str, None)
        article["extraction_error"] = _if_type(extraction.get("extraction_error"), str, None)

    context = {
        "request": request,
        "title": "Admin Dashboard",
        "user": user,
        "sources": sources,
        "source_policy": source_policy,
        "feeds": feeds,
        "runs": runs,
        "articles": articles,
        "status_options": status_options,
        "allowed_transitions": ALLOWED_TRANSITIONS,
        "status_filter": status_filter,
        "flash_msg": request.query_params.get("msg", ""),
        "flash_type": request.query_params.get("type", "success"),
    }
    return templates.TemplateResponse(request, "admin_dashboard.html", context)
|
||||
|
||||
|
||||
@router.post("/admin/sources/create")
def admin_create_source(
    request: Request,
    name: str = Form(...),
    base_url: str = Form(""),
    terms_url: str = Form(""),
    license_name: str = Form(""),
    risk_level: str = Form("yellow"),
    last_reviewed_at: str = Form(""),
):
    """Create a source from the dashboard form.

    Empty optional fields are stored as ``None``; the source is always
    created enabled and without notes. Any failure is reported back to the
    dashboard as an error flash message.
    """
    if not _admin_user(request):
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        new_source = SourceCreate(
            name=name,
            base_url=base_url or None,
            terms_url=terms_url or None,
            license_name=license_name or None,
            risk_level=risk_level,
            is_enabled=True,
            notes=None,
            last_reviewed_at=last_reviewed_at or None,
        )
        create_source(new_source)
    except Exception as exc:
        return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error")
    return _dashboard_redirect(msg="Quelle gespeichert")
|
||||
|
||||
|
||||
@router.post("/admin/feeds/create")
def admin_create_feed(
    request: Request,
    name: str = Form(...),
    url: str = Form(...),
    source_id: str = Form(""),
):
    """Create an enabled feed from the dashboard form.

    ``source_id`` is optional; a blank value means "no source". Any failure
    (including a non-numeric ``source_id``) is reported as an error flash.
    """
    if not _admin_user(request):
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        new_feed = FeedCreate(
            name=name,
            url=url,
            source_id=_to_optional_int(source_id),
            is_enabled=True,
        )
        create_feed(new_feed)
    except Exception as exc:
        return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error")
    return _dashboard_redirect(msg="Feed gespeichert")
|
||||
|
||||
|
||||
@router.post("/admin/ingestion/run")
def admin_run_ingestion(request: Request, feed_id: str = Form("")):
    """Trigger an ingestion run from the dashboard.

    ``feed_id`` is optional; a blank value is forwarded as ``None``. The
    outcome is reported through a dashboard flash message.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)
    try:
        stats = run_ingestion(feed_id=_to_optional_int(feed_id))
    except Exception as exc:
        return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error")
    if stats.status != "success":
        # run_ingestion reports its own failures through the returned stats
        # instead of raising, so a failed run must be flagged as an error
        # here rather than flashed as a success.
        return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {stats.message}", msg_type="error")
    return _dashboard_redirect(msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/review")
def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")):
    """Approve or reject an article that is currently in ``review``.

    ``approve`` moves the article to ``approved``; ``reject`` sends it back
    to ``rewrite``. Anything else (unknown article, wrong status, unknown
    decision) produces an error flash message.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    article = get_article_by_id(article_id)
    in_review = bool(article) and article.get("status") == "review"
    if not in_review or decision not in {"approve", "reject"}:
        return _dashboard_redirect(msg=f"Review-Aktion ungueltig fuer Artikel #{article_id}", msg_type="error")

    target = "rewrite" if decision == "reject" else "approved"
    update_article_status(article_id, target, actor=user, note=note or None, decision=decision)
    return _dashboard_redirect(msg=f"Artikel #{article_id}: {decision}")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/transition")
def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")):
    """Move an article to ``target_status`` if ``ALLOWED_TRANSITIONS`` permits it.

    Unknown articles and disallowed transitions produce an error flash
    message; the article is left untouched in that case.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    article = get_article_by_id(article_id)
    current = article.get("status") if article else None
    if not article or target_status not in ALLOWED_TRANSITIONS.get(current, ()):
        return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")

    update_article_status(article_id, target_status, actor=user, note=note or None)
    return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}")
|
||||
31
backend/app/auth.py
Normal file
31
backend/app/auth.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import hmac
|
||||
from typing import Optional
|
||||
|
||||
from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _serializer() -> URLSafeTimedSerializer:
    """Build the serializer used to sign and verify session tokens."""
    secret = get_settings().app_secret_key
    return URLSafeTimedSerializer(secret, salt="rss-news-session")
|
||||
|
||||
|
||||
def verify_credentials(username: str, password: str) -> bool:
    """Check a username/password pair against the configured admin account.

    Both comparisons use ``hmac.compare_digest`` and are always evaluated,
    so a wrong username does not short-circuit the password check.

    The values are compared as UTF-8 bytes: ``hmac.compare_digest`` raises
    ``TypeError`` for ``str`` arguments containing non-ASCII characters,
    which would otherwise turn a login attempt with e.g. an umlaut into an
    unhandled error instead of a clean rejection.
    """
    settings = get_settings()
    user_ok = hmac.compare_digest(
        username.encode("utf-8"),
        settings.app_admin_username.encode("utf-8"),
    )
    pw_ok = hmac.compare_digest(
        password.encode("utf-8"),
        settings.app_admin_password.encode("utf-8"),
    )
    return user_ok and pw_ok
|
||||
|
||||
|
||||
def create_session_token(username: str) -> str:
    """Return a signed token whose payload is ``{"username": username}``."""
    claims = {"username": username}
    return _serializer().dumps(claims)
|
||||
|
||||
|
||||
def verify_session_token(token: str) -> Optional[str]:
    """Return the username stored in ``token``, or ``None`` if it is rejected.

    A token is rejected when the serializer raises ``BadSignature`` or
    ``SignatureExpired`` while loading it with the configured maximum age.
    """
    max_age = get_settings().session_max_age_seconds
    try:
        claims = _serializer().loads(token, max_age=max_age)
    except (BadSignature, SignatureExpired):
        return None
    else:
        return claims.get("username")
|
||||
29
backend/app/config.py
Normal file
29
backend/app/config.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
from functools import lru_cache
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    # Application configuration model. The field values below are defaults;
    # presumably they are overridden from the environment / the env files
    # listed in model_config — confirm against pydantic-settings behavior.

    # Prefer backend-specific env file to avoid collisions with legacy root .env
    model_config = SettingsConfigDict(
        env_file=("backend/.env", ".env"),
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_env: str = "development"
    app_name: str = "rss-news-backend"
    # Key used to sign session tokens. The default is a placeholder.
    # NOTE(review): must be replaced with a real secret in any deployment.
    app_secret_key: str = "replace-with-a-long-random-secret"

    # Single admin account checked at login. The default password is a
    # placeholder. NOTE(review): must be overridden in any deployment.
    app_admin_username: str = "admin"
    app_admin_password: str = "change-me"

    session_cookie_name: str = "rss_news_session"
    # 28800 seconds = 8 hours.
    session_max_age_seconds: int = 28800

    # Location of the SQLite database file (relative paths are resolved
    # against the process working directory).
    app_db_path: str = "backend/data/rss_news.db"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the application settings.

    The result is cached, so ``Settings()`` is constructed once per process
    and every caller shares the same instance.
    """
    return Settings()
|
||||
138
backend/app/db.py
Normal file
138
backend/app/db.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator
|
||||
|
||||
from .config import get_settings
|
||||
|
||||
|
||||
def _db_path() -> Path:
    """Return the configured database path, creating its parent directory if needed."""
    db_file = Path(get_settings().app_db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
|
||||
|
||||
|
||||
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a SQLite connection configured for this application.

    Rows are returned as ``sqlite3.Row`` and foreign-key enforcement is
    switched on. The connection is committed when the ``with`` body finishes
    without raising, and is closed in every case.
    """
    connection = sqlite3.connect(_db_path())
    connection.row_factory = sqlite3.Row
    connection.execute("PRAGMA foreign_keys=ON;")
    try:
        yield connection
        # Only reached when the caller's block did not raise.
        connection.commit()
    finally:
        connection.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """Create the SQLite schema idempotently and apply lightweight migrations.

    Runs in three steps so that databases created by older versions can be
    upgraded:

    1. create the tables (no-op for tables that already exist),
    2. add columns that older ``articles`` tables are missing,
    3. create indexes and triggers.

    Step 2 has to happen before step 3: the indexes reference
    ``articles.source_hash``, so creating them on a legacy table that lacks
    the column would fail before the migration ever got a chance to run.
    """
    tables_sql = """
        PRAGMA journal_mode=WAL;

        CREATE TABLE IF NOT EXISTS sources (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            base_url TEXT,
            terms_url TEXT,
            license_name TEXT,
            risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
            is_enabled INTEGER NOT NULL DEFAULT 0,
            notes TEXT,
            last_reviewed_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE IF NOT EXISTS feeds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_id INTEGER,
            name TEXT NOT NULL,
            url TEXT NOT NULL UNIQUE,
            is_enabled INTEGER NOT NULL DEFAULT 1,
            etag TEXT,
            last_modified TEXT,
            last_checked_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
        );

        CREATE TABLE IF NOT EXISTS runs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            run_type TEXT NOT NULL,
            status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
            started_at TEXT NOT NULL DEFAULT (datetime('now')),
            finished_at TEXT,
            details TEXT
        );

        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feed_id INTEGER,
            source_article_id TEXT,
            source_hash TEXT,
            title TEXT NOT NULL,
            source_url TEXT NOT NULL,
            canonical_url TEXT,
            published_at TEXT,
            author TEXT,
            summary TEXT,
            content_raw TEXT,
            content_rewritten TEXT,
            word_count INTEGER DEFAULT 0,
            status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
            meta_json TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
            UNIQUE(source_url)
        );
    """

    indexes_and_triggers_sql = """
        CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
        CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
            ON articles(feed_id, source_article_id)
            WHERE source_article_id IS NOT NULL;
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
            ON articles(source_hash)
            WHERE source_hash IS NOT NULL;
        CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
        CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
        CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
        CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);

        CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
        AFTER UPDATE ON sources
        FOR EACH ROW
        BEGIN
            UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
        END;

        CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
        AFTER UPDATE ON feeds
        FOR EACH ROW
        BEGIN
            UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
        END;

        CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
        AFTER UPDATE ON articles
        FOR EACH ROW
        BEGIN
            UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
    """

    with get_conn() as conn:
        conn.executescript(tables_sql)

        # Lightweight migration for existing DBs created before source_hash
        # was introduced. Must run before the index script below, which
        # references the column.
        existing_columns = {
            row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
        }
        if "source_hash" not in existing_columns:
            conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT")

        conn.executescript(indexes_and_triggers_sql)
|
||||
|
||||
|
||||
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert each ``sqlite3.Row`` into a plain ``dict``, preserving order."""
    return list(map(dict, rows))
|
||||
253
backend/app/ingestion.py
Normal file
253
backend/app/ingestion.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import feedparser
|
||||
|
||||
from .policy import evaluate_source_policy
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
RunCreate,
|
||||
create_run,
|
||||
finish_run,
|
||||
get_feed_by_id,
|
||||
list_enabled_feeds,
|
||||
update_feed_fetch_state,
|
||||
upsert_article,
|
||||
)
|
||||
from .source_extraction import extract_article, extracted_article_to_meta
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IngestionStats:
    """Immutable summary of one ingestion run, as returned by ``run_ingestion``."""

    run_id: int  # id returned by create_run for this run
    feeds_processed: int  # feeds iterated, including blocked and failed ones
    entries_seen: int  # feed entries encountered, including ones without a link
    articles_upserted: int  # entries for which upsert_article returned a truthy id
    status: str  # "success" or "failed"
    message: str  # completion text, or the exception text on failure
|
||||
|
||||
|
||||
# Maximum number of attempts run_ingestion makes to fetch/parse a single feed.
MAX_FEED_FETCH_RETRIES = 3
|
||||
|
||||
|
||||
def _entry_published_iso(entry: dict) -> str | None:
|
||||
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
if not published:
|
||||
return None
|
||||
return datetime(*published[:6], tzinfo=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _entry_text(entry: dict) -> tuple[str, str]:
|
||||
summary = entry.get("summary", "") or ""
|
||||
content = ""
|
||||
if entry.get("content") and isinstance(entry.get("content"), list):
|
||||
first = entry["content"][0]
|
||||
content = first.get("value", "") if isinstance(first, dict) else ""
|
||||
if not content:
|
||||
content = summary
|
||||
return summary, content
|
||||
|
||||
|
||||
def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
    """Return a SHA-256 hex digest fingerprinting a feed entry.

    The fingerprint joins, with ``|``: feed id, the entry's ``id``/``guid``,
    link, stripped title, stripped summary and the published timestamp
    (empty string when unavailable).
    """
    entry_guid = entry.get("id") or entry.get("guid") or ""
    published_iso = _entry_published_iso(entry) or ""
    fingerprint = f"{feed_id}|{entry_guid}|{link}|{title.strip()}|{summary.strip()}|{published_iso}"
    digest = hashlib.sha256()
    digest.update(fingerprint.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _parsed_get(parsed: object, key: str, default: object = None) -> object:
|
||||
if isinstance(parsed, dict):
|
||||
return parsed.get(key, default)
|
||||
return getattr(parsed, key, default)
|
||||
|
||||
|
||||
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch feeds, extract their articles and upsert them, recording a run.

    Args:
        feed_id: When given, only that feed is processed, and only if it
            exists and is enabled. When ``None``, all enabled feeds are
            processed.

    Returns:
        An ``IngestionStats`` summary. This function does not propagate
        exceptions raised inside the processing loop: they mark the run as
        ``failed`` and their text is returned in ``message``.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    # Per-feed outcome records; serialized into the run's details on success.
    feed_results: list[dict[str, object]] = []

    try:
        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()

        for feed in feeds:
            if not feed:
                continue
            # Counted before the policy check, so blocked feeds are included.
            feeds_processed += 1

            # Rebuild a source-shaped dict from the feed's source_* fields so
            # the policy check can be applied to it.
            source_snapshot = {
                "id": feed.get("source_id"),
                "name": feed.get("source_name"),
                "base_url": feed.get("source_base_url"),
                "terms_url": feed.get("source_terms_url"),
                "license_name": feed.get("source_license_name"),
                "risk_level": feed.get("source_risk_level"),
                "last_reviewed_at": feed.get("source_last_reviewed_at"),
                "is_enabled": feed.get("source_is_enabled"),
            }
            policy_issues = evaluate_source_policy(source_snapshot)
            if policy_issues:
                # Any policy issue blocks the feed; nothing is fetched.
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "blocked",
                        "policy_issues": policy_issues,
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            parsed = None
            feed_error = None
            # Retry with a linearly growing pause (0.5s, 1.0s, ...) between attempts.
            for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
                try:
                    parsed = feedparser.parse(
                        feed["url"],
                        etag=feed.get("etag"),
                        modified=feed.get("last_modified"),
                    )
                    break
                except Exception as exc:
                    feed_error = str(exc)
                    if attempt < MAX_FEED_FETCH_RETRIES:
                        time.sleep(0.5 * attempt)

            if parsed is None:
                # All attempts raised; record the last error and move on.
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue

            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            # NOTE(review): when the parse result carries no etag/modified
            # value, None is passed here — confirm update_feed_fetch_state
            # does not wipe previously stored values in that case.
            update_feed_fetch_state(
                feed_id=int(feed["id"]),
                etag=parsed_etag if isinstance(parsed_etag, str) else None,
                last_modified=parsed_modified if isinstance(parsed_modified, str) else None,
            )

            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    # Entries without a link are counted as seen but skipped.
                    continue

                summary, content_raw = _entry_text(entry)
                title = entry.get("title") or "Ohne Titel"
                extracted = extract_article(link)

                # Values extracted from the article page take precedence over
                # the values found in the feed entry.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                final_canonical = extracted.canonical_url or entry.get("link")

                source_hash = _entry_hash(
                    entry,
                    int(feed["id"]),
                    link,
                    final_title,
                    final_summary or "",
                )
                # Provenance stored alongside the article in meta_json.
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": int(feed["id"]),
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                article_id = upsert_article(
                    ArticleUpsert(
                        feed_id=int(feed["id"]),
                        source_article_id=entry.get("id") or entry.get("guid"),
                        source_hash=source_hash,
                        title=final_title,
                        source_url=link,
                        canonical_url=final_canonical,
                        published_at=_entry_published_iso(entry),
                        author=final_author,
                        summary=final_summary,
                        content_raw=final_content_raw,
                        content_rewritten=None,
                        word_count=len((final_content_raw or "").split()),
                        status="new",
                        meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
                    )
                )
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1

            feed_results.append(
                {
                    "feed_id": int(feed["id"]),
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )

        # The run as a whole is "success" even if individual feeds were
        # blocked or failed; those outcomes are in the per-feed records.
        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Any unexpected error aborts the remaining feeds and fails the run;
        # the counters reflect the work done up to that point.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )
|
||||
404
backend/app/main.py
Normal file
404
backend/app/main.py
Normal file
|
|
@ -0,0 +1,404 @@
|
|||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from .admin_ui import router as admin_router
|
||||
from .auth import create_session_token, verify_credentials, verify_session_token
|
||||
from .config import get_settings
|
||||
from .db import init_db
|
||||
from .ingestion import run_ingestion
|
||||
from .policy import evaluate_source_policy, is_source_allowed
|
||||
from .repositories import (
|
||||
ArticleUpsert,
|
||||
FeedCreate,
|
||||
RunCreate,
|
||||
SourceCreate,
|
||||
create_feed as repo_create_feed,
|
||||
create_run,
|
||||
create_source as repo_create_source,
|
||||
finish_run,
|
||||
get_article_by_id,
|
||||
get_feed_by_id,
|
||||
get_run_by_id,
|
||||
get_source_by_id,
|
||||
list_articles as repo_list_articles,
|
||||
list_feeds as repo_list_feeds,
|
||||
list_runs,
|
||||
list_sources as repo_list_sources,
|
||||
update_article_status,
|
||||
upsert_article as repo_upsert_article,
|
||||
)
|
||||
|
||||
# Settings instance used by the app factory and the route handlers below.
settings = get_settings()
|
||||
|
||||
|
||||
@asynccontextmanager
async def app_lifespan(_: FastAPI):
    """Application lifespan hook: initialize the database before yielding."""
    init_db()
    yield
|
||||
|
||||
|
||||
# Application object: JSON API routes are defined in this module, the
# HTML admin UI comes from the admin router.
app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
app.include_router(admin_router)
# Static assets are served from the "static" directory one level above this
# package (the parent of the directory that contains this file).
app.mount(
    "/admin/static",
    StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
    name="admin-static",
)
|
||||
|
||||
|
||||
class LoginRequest(BaseModel):
    # JSON body of POST /auth/login.
    username: str
    password: str
|
||||
|
||||
|
||||
class SourceCreateRequest(BaseModel):
    # JSON body of POST /api/sources.
    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    terms_url: str | None = None
    license_name: str | None = None
    # Restricted to green / yellow / red.
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # Sources created through the API are disabled unless requested otherwise.
    is_enabled: bool = False
    notes: str | None = None
    last_reviewed_at: str | None = None
|
||||
|
||||
|
||||
class FeedCreateRequest(BaseModel):
    # Request body describing a feed to create.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    name: str = Field(min_length=1, max_length=200)
    url: str = Field(min_length=5, max_length=1000)
    source_id: int | None = None
    is_enabled: bool = True
|
||||
|
||||
|
||||
class RunCreateRequest(BaseModel):
    # Request body describing a run to create.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    run_type: str = Field(min_length=2, max_length=100)
    # Restricted to queued / running / success / failed.
    status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class RunFinishRequest(BaseModel):
    # Request body for finishing a run; only the terminal statuses
    # success / failed are accepted.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    status: str = Field(pattern="^(success|failed)$")
    details: str | None = None
|
||||
|
||||
|
||||
class ArticleUpsertRequest(BaseModel):
    # Request body describing an article to insert or update.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    feed_id: int | None = None
    source_article_id: str | None = None
    source_hash: str | None = None
    title: str = Field(min_length=1, max_length=500)
    source_url: str = Field(min_length=5, max_length=2000)
    canonical_url: str | None = None
    published_at: str | None = None
    author: str | None = None
    summary: str | None = None
    content_raw: str | None = None
    content_rewritten: str | None = None
    word_count: int = 0
    # Restricted to the article workflow statuses.
    status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$")
    # Metadata as a string; presumably JSON-encoded — confirm against callers.
    meta_json: str | None = None
|
||||
|
||||
|
||||
class IngestionRunRequest(BaseModel):
    # Request body for triggering ingestion; feed_id is optional.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    feed_id: int | None = None
|
||||
|
||||
|
||||
class ArticleTransitionRequest(BaseModel):
    # Request body for moving an article to another workflow status.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    target_status: str = Field(pattern="^(new|rewrite|review|approved|published|error)$")
    note: str | None = None
|
||||
|
||||
|
||||
class ArticleReviewRequest(BaseModel):
    # Request body for a review decision; only approve / reject are accepted.
    # NOTE(review): the consuming endpoint is not visible in this part of the file.
    decision: str = Field(pattern="^(approve|reject)$")
    note: str | None = None
|
||||
|
||||
|
||||
# Article workflow state machine: current status -> statuses it may move to.
# NOTE(review): presumably meant to mirror the transition table used by the
# admin UI module — keep the two in sync or share a single definition.
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
    "new": {"review", "rewrite", "error"},
    "rewrite": {"review", "error"},
    "review": {"approved", "rewrite", "error"},
    "approved": {"published", "error"},
    "published": {"error"},
    "error": {"review", "rewrite"},
}
|
||||
|
||||
|
||||
def require_auth(request: Request) -> str:
    """Dependency returning the authenticated username.

    Raises:
        HTTPException: 401 when the session cookie is missing or when the
            token it carries fails verification.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    if not session_token:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")

    if username := verify_session_token(session_token):
        return username
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Unauthenticated liveness endpoint reporting service name and DB path."""
    report = {"status": "ok"}
    report["service"] = settings.app_name
    report["db_path"] = settings.app_db_path
    return report
|
||||
|
||||
|
||||
@app.post("/auth/login")
def login(payload: LoginRequest, response: Response) -> dict:
    """Authenticate via JSON body and set the session cookie.

    Raises:
        HTTPException: 401 when the credentials are rejected.
    """
    credentials_ok = verify_credentials(payload.username, payload.password)
    if not credentials_ok:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")

    cookie_options = {
        "max_age": settings.session_max_age_seconds,
        "httponly": True,
        "secure": False,
        "samesite": "lax",
    }
    response.set_cookie(
        key=settings.session_cookie_name,
        value=create_session_token(payload.username),
        **cookie_options,
    )
    return {"ok": True, "username": payload.username}
|
||||
|
||||
|
||||
@app.post("/auth/logout")
def logout(response: Response) -> dict:
    """Delete the session cookie on the outgoing response."""
    cookie_name = settings.session_cookie_name
    response.delete_cookie(cookie_name)
    return {"ok": True}
|
||||
|
||||
|
||||
@app.get("/auth/me")
def me(username: str = Depends(require_auth)) -> dict:
    """Return the identity of the currently authenticated user."""
    identity = {"authenticated": True, "username": username}
    return identity
|
||||
|
||||
|
||||
@app.get("/api/protected")
def protected(username: str = Depends(require_auth)) -> dict:
    """Sample endpoint that only answers authenticated requests."""
    body = {"ok": True, "message": "Protected endpoint"}
    body["username"] = username
    return body
|
||||
|
||||
|
||||
@app.get("/api/pipeline/status")
def pipeline_status(username: str = Depends(require_auth)) -> dict:
    """Report how many sources, feeds and articles are stored.

    The article count is taken from a listing limited to 500 rows, so it is
    capped at 500.
    """
    feed_count = len(repo_list_feeds())
    source_count = len(repo_list_sources())
    article_count = len(repo_list_articles(limit=500))

    counts = {
        "sources": source_count,
        "feeds": feed_count,
        "articles": article_count,
    }
    return {
        "ok": True,
        "stage": "skeleton+db",
        "requested_by": username,
        "counts": counts,
    }
|
||||
|
||||
|
||||
@app.get("/api/sources")
def list_sources(username: str = Depends(require_auth)) -> dict:
    """Return every source row from the repository layer."""
    sources = repo_list_sources()
    return {"ok": True, "items": sources, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/sources/{source_id}/policy-check")
def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
    """Run the source policy rules for one source and report the findings.

    Raises:
        HTTPException: 404 when no source with ``source_id`` exists.
    """
    source = get_source_by_id(source_id)
    if not source:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")

    result = {"ok": True, "source_id": source_id}
    issues = evaluate_source_policy(source)
    result["allowed"] = is_source_allowed(source)
    result["issues"] = issues
    result["requested_by"] = username
    return result
|
||||
|
||||
|
||||
@app.post("/api/sources")
def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Persist a new source built from the request body and return its id."""
    copied_fields = (
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "is_enabled",
        "notes",
        "last_reviewed_at",
    )
    # Copy the request fields one-to-one into the repository DTO.
    record = SourceCreate(**{field: getattr(payload, field) for field in copied_fields})
    new_id = repo_create_source(record)
    return {"ok": True, "id": new_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds")
def list_feeds(username: str = Depends(require_auth)) -> dict:
    """Return every feed row from the repository layer."""
    feeds = repo_list_feeds()
    return {"ok": True, "items": feeds, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/feeds/{feed_id}/policy-check")
def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for the source attached to a feed.

    The feed row carries its source columns under ``source_*`` keys; they are
    copied into a source-shaped dict before ``evaluate_source_policy`` runs.

    Raises:
        HTTPException: 404 when no feed with ``feed_id`` exists.
    """
    feed = get_feed_by_id(feed_id)
    if not feed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")

    prefixed_columns = (
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "last_reviewed_at",
        "is_enabled",
    )
    source_snapshot = {"id": feed.get("source_id")}
    for column in prefixed_columns:
        source_snapshot[column] = feed.get(f"source_{column}")

    issues = evaluate_source_policy(source_snapshot)
    return {
        "ok": True,
        "feed_id": feed_id,
        "allowed": not issues,
        "issues": issues,
        "requested_by": username,
    }
|
||||
|
||||
|
||||
@app.post("/api/feeds")
def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Persist a new feed and return its id.

    Raises:
        HTTPException: 400 wrapping any exception raised while building or
            storing the feed; the original exception text is included in the
            detail message.
    """
    try:
        record = FeedCreate(
            name=payload.name,
            url=payload.url,
            source_id=payload.source_id,
            is_enabled=payload.is_enabled,
        )
        feed_id = repo_create_feed(record)
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc

    return {"ok": True, "id": feed_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs")
def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
    """List runs, passing ``limit`` through to the repository."""
    runs = list_runs(limit=limit)
    return {"ok": True, "items": runs, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/runs/{run_id}")
def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
    """Fetch a single run.

    Raises:
        HTTPException: 404 when the run does not exist.
    """
    run = get_run_by_id(run_id)
    if run:
        return {"ok": True, "item": run, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
|
||||
|
||||
|
||||
@app.post("/api/runs")
def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a run record from the request body and return its id."""
    record = RunCreate(
        run_type=payload.run_type,
        status=payload.status,
        details=payload.details,
    )
    new_run_id = create_run(record)
    return {"ok": True, "id": new_run_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given status and details.

    Args:
        run_id: Id of the run to close.
        payload: Body providing the final ``status`` and optional ``details``.
        username: Authenticated user, injected by ``require_auth``.

    Returns:
        ``{"ok": True, "id": run_id, "requested_by": username}``.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists.
    """
    # finish_run() gives no indication of whether a row was touched, so an
    # unknown id used to be acknowledged with ok=True. Check first, matching
    # the 404 behaviour of GET /api/runs/{run_id}.
    if not get_run_by_id(run_id):
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")

    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    return {"ok": True, "id": run_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/articles")
def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
    """List articles, forwarding ``limit`` and ``status_filter`` to the repository."""
    articles = repo_list_articles(limit=limit, status_filter=status_filter)
    return {"ok": True, "items": articles, "requested_by": username}
|
||||
|
||||
|
||||
@app.get("/api/articles/{article_id}")
def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Fetch a single article.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    article = get_article_by_id(article_id)
    if article:
        return {"ok": True, "item": article, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
|
||||
|
||||
|
||||
@app.post("/api/articles/upsert")
def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
    """Insert or update an article from the request body and return its id."""
    copied_fields = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "content_rewritten",
        "word_count",
        "status",
        "meta_json",
    )
    # Copy the request fields one-to-one into the repository DTO.
    record = ArticleUpsert(**{field: getattr(payload, field) for field in copied_fields})
    article_id = repo_upsert_article(record)
    return {"ok": True, "id": article_id, "requested_by": username}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/transition")
def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
    """Move an article to ``payload.target_status`` if the workflow permits it.

    The permitted targets for the article's current status are looked up in
    ``ALLOWED_ARTICLE_TRANSITIONS``; an unknown current status permits nothing.

    Raises:
        HTTPException: 404 when the article is missing (on lookup or on
            update), 400 when the requested transition is not permitted.
    """
    not_found = HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    article = get_article_by_id(article_id)
    if not article:
        raise not_found

    current_status = article.get("status")
    target_status = payload.target_status
    if target_status not in ALLOWED_ARTICLE_TRANSITIONS.get(current_status, set()):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Ungueltiger Statuswechsel: {current_status} -> {target_status}",
        )

    if not update_article_status(article_id, target_status, actor=username, note=payload.note):
        raise not_found
    return {"ok": True, "id": article_id, "from_status": current_status, "to_status": target_status}
|
||||
|
||||
|
||||
@app.post("/api/articles/{article_id}/review")
def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Record a review decision for an article that is in status ``review``.

    A decision of ``"approve"`` moves the article to ``approved``; any other
    decision value moves it to ``rewrite``.

    Raises:
        HTTPException: 404 when the article is missing (on lookup or on
            update), 400 when the article is not currently in ``review``.
    """
    article = get_article_by_id(article_id)
    if not article:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    current_status = article.get("status")
    if current_status != "review":
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Review nur fuer Status 'review' erlaubt (aktuell: {current_status})",
        )

    if payload.decision == "approve":
        target_status = "approved"
    else:
        target_status = "rewrite"

    was_updated = update_article_status(
        article_id,
        target_status,
        actor=username,
        note=payload.note,
        decision=payload.decision,
    )
    if not was_updated:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    return {
        "ok": True,
        "id": article_id,
        "decision": payload.decision,
        "to_status": target_status,
    }
|
||||
|
||||
|
||||
@app.post("/api/ingestion/run")
def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
    """Trigger an ingestion run and report its outcome and counters.

    ``ok`` in the response is true only when the run status equals
    ``"success"``.
    """
    outcome = run_ingestion(feed_id=payload.feed_id)
    counters = {
        "feeds_processed": outcome.feeds_processed,
        "entries_seen": outcome.entries_seen,
        "articles_upserted": outcome.articles_upserted,
    }
    return {
        "ok": outcome.status == "success",
        "run_id": outcome.run_id,
        "status": outcome.status,
        "message": outcome.message,
        "stats": counters,
        "requested_by": username,
    }
|
||||
35
backend/app/policy.py
Normal file
35
backend/app/policy.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
    """List the policy violations of a source record.

    Args:
        source: Mapping with the keys ``risk_level``, ``terms_url``,
            ``license_name``, ``last_reviewed_at`` and ``is_enabled``; a
            missing or empty mapping counts as "no source assigned".

    Returns:
        Human-readable issue strings in a fixed order; an empty list means the
        source passed every check.
    """
    if not source:
        return ["Keine Quelle zugeordnet"]

    def text_of(key: str) -> str:
        # Missing / None values are treated like empty strings.
        return (source.get(key) or "").strip()

    problems: list[str] = []

    risk = text_of("risk_level").lower()
    if risk != "green":
        problems.append(f"Quelle nicht freigegeben (risk_level={risk or 'unset'})")

    for required in ("terms_url", "license_name", "last_reviewed_at"):
        if not text_of(required):
            problems.append(f"{required} fehlt")

    enabled_flag = source.get("is_enabled", 0) or 0
    if int(enabled_flag) != 1:
        problems.append("Quelle ist deaktiviert")

    return problems
|
||||
|
||||
|
||||
def is_source_allowed(source: dict[str, Any] | None) -> bool:
    """Return True when ``evaluate_source_policy`` reports no issues."""
    issues = evaluate_source_policy(source)
    return not issues
|
||||
416
backend/app/repositories.py
Normal file
416
backend/app/repositories.py
Normal file
|
|
@ -0,0 +1,416 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from .db import get_conn, rows_to_dicts
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceCreate:
    """Immutable input record for inserting a row into ``sources``."""

    # Display name; whitespace is stripped when the row is written.
    name: str
    base_url: str | None
    # Link to the source's terms; the policy check requires it to be set.
    terms_url: str | None
    # License label; the policy check requires it to be set.
    license_name: str | None
    # Only the value "green" passes the policy check.
    risk_level: str
    # Stored as 1/0 in the database.
    is_enabled: bool
    notes: str | None
    # Stored as given; the policy check requires it to be set.
    last_reviewed_at: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FeedCreate:
    """Immutable input record for inserting a row into ``feeds``."""

    # Display name; whitespace is stripped when the row is written.
    name: str
    # Feed URL; whitespace is stripped when the row is written.
    url: str
    # Optional reference to the owning ``sources`` row.
    source_id: int | None
    # Stored as 1/0 in the database.
    is_enabled: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RunCreate:
    """Immutable input record for inserting a row into ``runs``."""

    run_type: str
    status: str
    # Free-form text; optional.
    details: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ArticleUpsert:
    """Immutable input record for inserting or updating a row in ``articles``."""

    feed_id: int | None
    # Together with ``feed_id`` this is the second lookup key for duplicates.
    source_article_id: str | None
    # Third (fallback) lookup key for duplicates.
    source_hash: str | None
    # Whitespace is stripped when the row is written.
    title: str
    # First lookup key for duplicates; stripped when the row is written.
    source_url: str
    canonical_url: str | None
    published_at: str | None
    author: str | None
    summary: str | None
    content_raw: str | None
    content_rewritten: str | None
    word_count: int
    status: str
    # Serialized JSON text; review events are appended under "review_events".
    meta_json: str | None
|
||||
|
||||
|
||||
def create_source(payload: SourceCreate) -> int:
    """Insert a row into ``sources`` and return its row id.

    The name is stripped of surrounding whitespace and ``is_enabled`` is
    stored as 1/0; every other field is written as given.
    """
    insert_sql = """
        INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    row_values = (
        payload.name.strip(),
        payload.base_url,
        payload.terms_url,
        payload.license_name,
        payload.risk_level,
        int(bool(payload.is_enabled)),
        payload.notes,
        payload.last_reviewed_at,
    )
    with get_conn() as conn:
        new_id = int(conn.execute(insert_sql, row_values).lastrowid)
    return new_id
|
||||
|
||||
|
||||
def list_sources() -> list[dict[str, Any]]:
    """Return all rows of ``sources`` as dicts, newest id first."""
    select_sql = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        ORDER BY id DESC
    """
    with get_conn() as conn:
        fetched = conn.execute(select_sql).fetchall()
    return rows_to_dicts(fetched)
|
||||
|
||||
|
||||
def get_source_by_id(source_id: int) -> dict[str, Any] | None:
    """Return the ``sources`` row with the given id as a dict, or None."""
    select_sql = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        WHERE id = ?
    """
    with get_conn() as conn:
        found = conn.execute(select_sql, (source_id,)).fetchone()
    if not found:
        return None
    return dict(found)
|
||||
|
||||
|
||||
def create_feed(payload: FeedCreate) -> int:
    """Insert a row into ``feeds`` and return its row id.

    Name and URL are stripped of surrounding whitespace; ``is_enabled`` is
    stored as 1/0.
    """
    row_values = (
        payload.name.strip(),
        payload.url.strip(),
        payload.source_id,
        int(bool(payload.is_enabled)),
    )
    with get_conn() as conn:
        cursor = conn.execute(
            "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)",
            row_values,
        )
        new_id = int(cursor.lastrowid)
    return new_id
|
||||
|
||||
|
||||
def list_feeds() -> list[dict[str, Any]]:
    """Return all feeds, newest id first, joined with their source columns.

    Source columns are exposed under ``source_*`` aliases and are NULL for
    feeds without a matching source (LEFT JOIN).
    """
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
               s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        ORDER BY f.id DESC
    """
    with get_conn() as conn:
        fetched = conn.execute(select_sql).fetchall()
    return rows_to_dicts(fetched)
|
||||
|
||||
|
||||
def list_enabled_feeds() -> list[dict[str, Any]]:
    """Return feeds with ``is_enabled = 1``, oldest id first, with source columns.

    Source columns are exposed under ``source_*`` aliases and are NULL for
    feeds without a matching source (LEFT JOIN).
    """
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
               s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.is_enabled = 1
        ORDER BY f.id ASC
    """
    with get_conn() as conn:
        fetched = conn.execute(select_sql).fetchall()
    return rows_to_dicts(fetched)
|
||||
|
||||
|
||||
def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
    """Return one feed joined with its source columns as a dict, or None.

    Source columns are exposed under ``source_*`` aliases and are NULL when
    the feed has no matching source (LEFT JOIN).
    """
    select_sql = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
               s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
               s.risk_level AS source_risk_level, s.base_url AS source_base_url,
               s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.id = ?
    """
    with get_conn() as conn:
        found = conn.execute(select_sql, (feed_id,)).fetchone()
    if not found:
        return None
    return dict(found)
|
||||
|
||||
|
||||
def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
    """Store the given ETag / Last-Modified values and stamp ``last_checked_at``.

    ``last_checked_at`` is set by the database via ``datetime('now')``.
    """
    update_sql = """
        UPDATE feeds
        SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
        WHERE id = ?
    """
    bindings = (etag, last_modified, feed_id)
    with get_conn() as conn:
        conn.execute(update_sql, bindings)
|
||||
|
||||
|
||||
def create_run(payload: RunCreate) -> int:
    """Insert a row into ``runs`` and return its row id."""
    row_values = (payload.run_type, payload.status, payload.details)
    with get_conn() as conn:
        cursor = conn.execute(
            "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)",
            row_values,
        )
        new_id = int(cursor.lastrowid)
    return new_id
|
||||
|
||||
|
||||
def finish_run(run_id: int, status: str, details: str | None = None) -> None:
    """Write the final status/details of a run and stamp ``finished_at``.

    ``finished_at`` is set by the database via ``datetime('now')``. Nothing is
    returned, so callers cannot tell from this call whether ``run_id`` existed.
    """
    update_sql = """
        UPDATE runs
        SET status = ?, details = ?, finished_at = datetime('now')
        WHERE id = ?
    """
    bindings = (status, details, run_id)
    with get_conn() as conn:
        conn.execute(update_sql, bindings)
|
||||
|
||||
|
||||
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs as dicts, newest id first.

    ``limit`` is clamped to the range 1..500 before it reaches the query.
    """
    row_cap = min(max(limit, 1), 500)
    select_sql = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        ORDER BY id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        fetched = conn.execute(select_sql, (row_cap,)).fetchall()
    return rows_to_dicts(fetched)
|
||||
|
||||
|
||||
def get_run_by_id(run_id: int) -> dict[str, Any] | None:
    """Return the ``runs`` row with the given id as a dict, or None."""
    select_sql = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        WHERE id = ?
    """
    with get_conn() as conn:
        found = conn.execute(select_sql, (run_id,)).fetchone()
    if not found:
        return None
    return dict(found)
|
||||
|
||||
|
||||
def get_article_by_id(article_id: int) -> dict[str, Any] | None:
    """Return the ``articles`` row with the given id as a dict, or None.

    Unlike the list query in this module, the result includes
    ``content_rewritten``.
    """
    select_sql = """
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
               a.summary, a.content_raw, a.content_rewritten, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at
        FROM articles a
        WHERE a.id = ?
    """
    with get_conn() as conn:
        found = conn.execute(select_sql, (article_id,)).fetchone()
    if not found:
        return None
    return dict(found)
|
||||
|
||||
|
||||
def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
    """Append ``event`` to the ``review_events`` list inside a JSON meta blob.

    Args:
        meta_json: Existing JSON text. Empty, unparsable or non-object input
            is treated as an empty object.
        event: Entry to append.

    Returns:
        The updated object serialized with ``ensure_ascii=False``. A
        ``review_events`` value that is not a list is replaced by a new list
        holding only ``event``.
    """
    parsed: Any = None
    if meta_json:
        try:
            parsed = json.loads(meta_json)
        except Exception:
            # Best effort: a corrupt blob must not block the status change.
            parsed = None

    meta: dict[str, Any] = parsed if isinstance(parsed, dict) else {}
    history = meta.get("review_events")
    meta["review_events"] = [*history, event] if isinstance(history, list) else [event]
    return json.dumps(meta, ensure_ascii=False)
|
||||
|
||||
|
||||
def update_article_status(
    article_id: int,
    new_status: str,
    *,
    actor: str | None = None,
    note: str | None = None,
    decision: str | None = None,
) -> bool:
    """Set an article's status and log the change in its ``meta_json``.

    Args:
        article_id: Id of the article to change.
        new_status: Status value to store.
        actor: Who triggered the change; recorded as "system" when empty.
        note: Optional free-text note stored with the event.
        decision: Optional review decision stored with the event.

    Returns:
        False when the article does not exist, True after the update ran.

    NOTE(review): the article is read through ``get_article_by_id`` and
    written in a separate ``get_conn()`` block, so the read and the write are
    not one statement — confirm that is acceptable for concurrent reviewers.
    """
    current = get_article_by_id(article_id)
    if not current:
        return False

    audit_entry = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        from_status=current.get("status"),
        to_status=new_status,
        actor=actor or "system",
        note=note,
        decision=decision,
    )
    new_meta = _merge_review_event(current.get("meta_json"), audit_entry)

    bindings = (new_status, new_meta, article_id)
    with get_conn() as conn:
        conn.execute("UPDATE articles SET status = ?, meta_json = ? WHERE id = ?", bindings)
    return True
|
||||
|
||||
|
||||
def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
    """Find the id of an already stored article that matches ``payload``.

    Lookups are tried in this order and the first hit wins:
    1. stripped ``source_url``,
    2. ``feed_id`` + ``source_article_id`` (only when both are set),
    3. ``source_hash`` (only when set).

    Returns:
        The matching article id, or None when nothing matches.
    """
    lookups: list[tuple[str, tuple[Any, ...]]] = [
        ("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)),
    ]
    if payload.feed_id is not None and payload.source_article_id:
        lookups.append(
            (
                "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
                (payload.feed_id, payload.source_article_id),
            )
        )
    if payload.source_hash:
        lookups.append(("SELECT id FROM articles WHERE source_hash = ?", (payload.source_hash,)))

    with get_conn() as conn:
        for sql, bindings in lookups:
            hit = conn.execute(sql, bindings).fetchone()
            if hit:
                return int(hit["id"])
    return None
|
||||
|
||||
|
||||
def upsert_article(payload: ArticleUpsert) -> int:
    """Insert a new article or overwrite the matching existing one.

    The existing row is located with ``_resolve_existing_article_id``. Title
    and source URL are stripped of surrounding whitespace; every other field
    is written as given.

    Args:
        payload: Complete article record to store.

    Returns:
        The id of the inserted row (``lastrowid``) or of the updated row.

    NOTE(review): on update, ``status`` and ``meta_json`` are overwritten with
    the payload values. If ingestion re-upserts an article that was already
    reviewed, this presumably resets its status and discards the stored
    ``review_events`` — confirm whether that is intended.
    """
    existing_id = _resolve_existing_article_id(payload)

    # A single column list drives both statements so the INSERT and UPDATE
    # cannot drift apart; ``values`` must stay in the same order.
    columns = (
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "content_rewritten",
        "word_count",
        "status",
        "meta_json",
    )
    values = (
        payload.feed_id,
        payload.source_article_id,
        payload.source_hash,
        payload.title.strip(),
        payload.source_url.strip(),
        payload.canonical_url,
        payload.published_at,
        payload.author,
        payload.summary,
        payload.content_raw,
        payload.content_rewritten,
        payload.word_count,
        payload.status,
        payload.meta_json,
    )

    with get_conn() as conn:
        if existing_id is None:
            column_list = ", ".join(columns)
            placeholders = ", ".join("?" for _ in columns)
            cursor = conn.execute(
                f"INSERT INTO articles ({column_list}) VALUES ({placeholders})",
                values,
            )
            # Same convention as the other create_* helpers in this module;
            # replaces a second SELECT by source_url and the "0" fallback.
            return int(cursor.lastrowid)

        assignments = ", ".join(f"{column} = ?" for column in columns)
        conn.execute(
            f"UPDATE articles SET {assignments} WHERE id = ?",
            (*values, existing_id),
        )
    return int(existing_id)
|
||||
|
||||
|
||||
def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
    """Return articles as dicts, newest id first, with the feed name joined in.

    Args:
        limit: Maximum number of rows; clamped to the range 1..500.
        status_filter: When non-empty, only articles with exactly this status
            are returned.

    Returns:
        Article rows including ``feed_name`` (NULL when the feed is missing).
        ``content_rewritten`` is not part of the selected columns.
    """
    safe_limit = max(1, min(limit, 500))

    # One statement for both cases: previously the SELECT was duplicated per
    # branch, which invites the two column lists drifting apart. Only the
    # constant WHERE fragment is interpolated; all values stay bound.
    if status_filter:
        where_clause = "WHERE a.status = ?"
        bindings: tuple[Any, ...] = (status_filter, safe_limit)
    else:
        where_clause = ""
        bindings = (safe_limit,)

    select_sql = f"""
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
               a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name
        FROM articles a
        LEFT JOIN feeds f ON f.id = a.feed_id
        {where_clause}
        ORDER BY a.id DESC
        LIMIT ?
    """
    with get_conn() as conn:
        rows = conn.execute(select_sql, bindings).fetchall()
    return rows_to_dicts(rows)
|
||||
257
backend/app/source_extraction.py
Normal file
257
backend/app/source_extraction.py
Normal file
|
|
@ -0,0 +1,257 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from html import unescape
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
# Default network timeout, in seconds, for fetching an article page.
DEFAULT_TIMEOUT_SECONDS = 10
# User-Agent header sent with every article page request.
DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedArticle:
    """Immutable result of scraping one article page."""

    title: str | None
    author: str | None
    canonical_url: str | None
    # Meta description, or the start of the content text as a fallback.
    summary: str | None
    # Extracted text blocks joined with newlines.
    content_text: str | None
    # Absolute image URLs, de-duplicated, in discovery order.
    images: list[str]
    press_contact: str | None
    # Text of the exception when fetching/decoding failed, else None.
    extraction_error: str | None = None
|
||||
|
||||
|
||||
def _clean_text(raw: str | None) -> str | None:
    """Turn an HTML fragment into single-line plain text.

    Tags are removed first, then character references are decoded, then all
    whitespace runs are collapsed to one space.

    Args:
        raw: HTML fragment or plain text; may be None or empty.

    Returns:
        The cleaned text, or None when nothing but markup/whitespace remains.
    """
    if not raw:
        return None
    # Strip markup BEFORE decoding entities. The previous order decoded
    # "&lt;" / "&gt;" first, so escaped text such as "1 &lt; 2 and 3 &gt; 2"
    # was then mistaken for a tag and deleted.
    text = re.sub(r"<[^>]+>", " ", raw)
    text = unescape(text)
    # Runs after unescape so decoded non-breaking spaces collapse as well.
    text = re.sub(r"\s+", " ", text).strip()
    return text or None
|
||||
|
||||
|
||||
def _strip_noise(html: str) -> str:
    """Replace ``<script>``, ``<style>`` and ``<noscript>`` elements with a space.

    Elements are removed case-insensitively, one tag kind after the other, each
    from its opening tag up to the nearest matching closing tag.
    """
    cleaned = html
    for tag in ("script", "style", "noscript"):
        cleaned = re.sub(rf"<{tag}[\s\S]*?</{tag}>", " ", cleaned, flags=re.IGNORECASE)
    return cleaned
|
||||
|
||||
|
||||
def _meta_content(html: str, attr: str, value: str) -> str | None:
    """Return the cleaned ``content`` of the first matching ``<meta>`` tag.

    Args:
        html: Page markup to search.
        attr: Name of the identifying attribute, e.g. ``"name"`` or
            ``"property"``. Inserted into the pattern unescaped, so it must be
            a plain attribute name.
        value: Required value of that attribute (matched case-insensitively).

    Returns:
        The content passed through ``_clean_text``, or None when no tag
        matches. Both attribute orders (identifier before or after
        ``content``) are supported.
    """
    identifier = rf"{attr}\s*=\s*[\"']{re.escape(value)}[\"']"
    # Capture up to the SAME quote character that opened the value. The old
    # capture ``[^"']+`` stopped at either quote and therefore truncated
    # content="It's here" to "It".
    content = r"content\s*=\s*(?:\"([^\"]+)\"|'([^']+)')"

    for pattern in (
        rf"<meta[^>]+{identifier}[^>]*{content}[^>]*>",
        # Reversed attribute order: content first, identifier second.
        rf"<meta[^>]+{content}[^>]*{identifier}[^>]*>",
    ):
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            # Exactly one of the two quote alternatives captured something.
            return _clean_text(match.group(1) or match.group(2))
    return None
|
||||
|
||||
|
||||
def _extract_title(html: str) -> str | None:
    """Pick a page title: ``og:title`` meta, then ``<title>``, then first ``<h1>``.

    An empty ``<title>`` falls through to the ``<h1>``; None is returned when
    no candidate yields text.
    """
    og_title = _meta_content(html, "property", "og:title")
    if og_title:
        return og_title

    title_tag = re.search(r"<title[^>]*>([\s\S]*?)</title>", html, re.IGNORECASE)
    if title_tag:
        from_title_tag = _clean_text(title_tag.group(1))
        if from_title_tag:
            return from_title_tag

    heading = re.search(r"<h1[^>]*>([\s\S]*?)</h1>", html, re.IGNORECASE)
    return _clean_text(heading.group(1)) if heading else None
|
||||
|
||||
|
||||
def _extract_canonical(html: str) -> str | None:
    """Return the href of the first ``<link rel="canonical">`` tag, if any.

    Both attribute orders are tried: ``rel`` before ``href`` first, then
    ``href`` before ``rel``.
    """
    rel_first = r"<link[^>]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>"
    href_first = r"<link[^>]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>"
    for pattern in (rel_first, href_first):
        hit = re.search(pattern, html, re.IGNORECASE)
        if hit:
            return _clean_text(hit.group(1))
    return None
|
||||
|
||||
|
||||
def _extract_author(html: str) -> str | None:
    """Guess the article author from meta tags, then from byline heuristics.

    Meta tags are checked in the order ``name=author``, ``article:author``,
    ``og:article:author``. If none yields text, two regex heuristics run over
    the raw markup: a "Von:/Autor:/Autorin:" prefix and an element whose class
    contains "author" or "byline". The first non-empty result wins.
    """
    meta_keys = (
        ("name", "author"),
        ("property", "article:author"),
        ("property", "og:article:author"),
    )
    for attr, value in meta_keys:
        from_meta = _meta_content(html, attr, value)
        if from_meta:
            return from_meta

    byline_patterns = (
        r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})",
        r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<",
    )
    for pattern in byline_patterns:
        hit = re.search(pattern, html, re.IGNORECASE)
        if not hit:
            continue
        candidate = _clean_text(hit.group(1))
        if candidate:
            return candidate
    return None
|
||||
|
||||
|
||||
def _extract_images(html: str, page_url: str) -> list[str]:
    """Collect absolute image URLs from social meta tags and ``<img>`` tags.

    Order of the result: all ``og:image`` metas, then all ``twitter:image``
    metas, then ``<img src>`` values, each in document order. Duplicates
    (after resolving against ``page_url``) and blank values are dropped.

    Args:
        html: Page markup to search.
        page_url: URL of the page, used to resolve relative image paths.

    Returns:
        De-duplicated list of absolute image URLs.
    """
    # Value enclosed by matching quotes; avoids cutting a double-quoted value
    # at an apostrophe (and vice versa).
    quoted = r"(?:\"([^\"]+)\"|'([^']+)')"

    def attribute(tag: str, attr_name: str) -> str | None:
        """Return the value of ``attr_name`` inside one tag, or None."""
        found = re.search(rf"(?<![\w-]){attr_name}\s*=\s*{quoted}", tag, re.IGNORECASE)
        if not found:
            return None
        return found.group(1) if found.group(1) is not None else found.group(2)

    candidates: list[str] = []

    # Inspect each <meta> tag attribute by attribute so that attribute order
    # does not matter, and accept the key in either ``property`` or ``name``:
    # twitter:image is conventionally declared with ``name=``, which the
    # previous ``property``-only pattern never matched.
    meta_tags = re.findall(r"<meta\b[^>]*>", html, re.IGNORECASE)
    for wanted in ("og:image", "twitter:image"):
        for tag in meta_tags:
            declared = {(attribute(tag, key) or "").strip().lower() for key in ("property", "name")}
            if wanted in declared:
                candidates.append(attribute(tag, "content") or "")

    for found in re.finditer(rf"<img[^>]+src\s*=\s*{quoted}[^>]*>", html, re.IGNORECASE):
        candidates.append(found.group(1) if found.group(1) is not None else found.group(2))

    images: list[str] = []
    seen: set[str] = set()
    for raw_src in candidates:
        src = raw_src.strip()
        if not src:
            # A blank value would resolve to the page URL itself.
            continue
        abs_src = urljoin(page_url, src)
        if abs_src not in seen:
            seen.add(abs_src)
            images.append(abs_src)
    return images
|
||||
|
||||
|
||||
def _extract_content_text(html: str) -> str | None:
    """Extract the readable article text as newline-separated blocks.

    The search is limited to the first ``<article>``, else ``<main>``, else
    ``<body>`` element; without any of them (or when it is empty) the whole
    markup is used. Within that section the result consists of, in document
    order:

    * ``<p>`` paragraphs whose cleaned text is longer than two characters,
    * ``<h2>``–``<h4>`` headings that mention a press contact.

    Returns:
        The blocks joined with ``"\\n"``; when no block qualifies, the cleaned
        text of the whole section; None when that is empty too.
    """
    section = None
    for container in ("article", "main", "body"):
        found = re.search(rf"<{container}[^>]*>([\s\S]*?)</{container}>", html, re.IGNORECASE)
        if found:
            section = found.group(1)
            break
    if not section:
        section = html

    contact_heading = re.compile(r"\b(pressekontakt|press contact|kontakt)\b", re.IGNORECASE)
    # Headings and paragraphs are scanned in ONE pass so they keep their
    # document order. Previously all contact headings were emitted before all
    # paragraphs, so the lines following a "Pressekontakt" heading were the
    # article's opening paragraphs instead of the contact details.
    # ``\b`` after the tag name keeps <pre>, <picture>, ... from being taken
    # for <p>.
    block_re = re.compile(r"<(h[2-4]|p)\b[^>]*>([\s\S]*?)</\1\s*>", re.IGNORECASE)

    blocks: list[str] = []
    for found in block_re.finditer(section):
        text = _clean_text(found.group(2))
        if not text:
            continue
        if found.group(1).lower() == "p":
            if len(text) > 2:
                blocks.append(text)
        elif contact_heading.search(text):
            blocks.append(text)

    if blocks:
        return "\n".join(blocks)
    return _clean_text(section)
|
||||
|
||||
|
||||
def _extract_press_contact(content_text: str | None) -> str | None:
    """Pull the press-contact passage out of extracted article text.

    Line-based pass: the first line mentioning "Pressekontakt",
    "Press contact" or "Presse-Kontakt" starts the passage; up to five
    following non-empty lines are added until a line carrying a footer marker
    ("Original-Content von", "OTS:", "newsroom:") is reached.

    Fallback: when no line matches, a regex grabs up to 1200 characters after
    "Pressekontakt", ending at a footer marker or the end of the text.

    Args:
        content_text: Newline-separated article text; may be None or empty.

    Returns:
        The passage collapsed to a single line by ``_clean_text``, or None.
    """
    if not content_text:
        return None

    lines = [line.strip() for line in content_text.split("\n") if line.strip()]
    marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
    # No trailing \b after the colon markers: "\b" cannot match between ":"
    # and a following space, so "OTS: ..." and "newsroom: ..." lines were
    # never recognised as the end of the contact passage.
    stop_re = re.compile(r"\b(?:original-content von\b|ots:|newsroom:)", re.IGNORECASE)

    for idx, line in enumerate(lines):
        if marker_re.search(line):
            chunk = [line]
            for nxt in lines[idx + 1 : idx + 6]:
                if stop_re.search(nxt):
                    break
                chunk.append(nxt)
            return _clean_text("\n".join(chunk))

    match = re.search(
        r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
        content_text,
        re.IGNORECASE,
    )
    if match:
        return _clean_text(match.group(1))
    return None
|
||||
|
||||
|
||||
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
    """Download an article page and extract its metadata and text.

    Args:
        url: Address of the article page. Only ``http`` and ``https`` URLs are
            fetched.
        timeout_seconds: Network timeout passed to ``urlopen``.

    Returns:
        An ``ExtractedArticle``. This function does not raise for fetch
        problems: on any failure while validating, fetching or decoding, every
        content field is None/empty and ``extraction_error`` holds the
        exception text.
    """
    # Local import: the module only imports ``urljoin`` from urllib.parse.
    from urllib.parse import urlsplit

    try:
        # The URL originates from feed data. ``urlopen`` would also serve
        # other schemes such as file://, which must not be reachable through
        # a feed entry, so anything but http(s) is rejected up front.
        scheme = urlsplit(url).scheme.lower()
        if scheme not in ("http", "https"):
            raise ValueError(f"unsupported URL scheme: {scheme or '(none)'}")

        req = Request(
            url=url,
            headers={
                "User-Agent": DEFAULT_USER_AGENT,
                "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
            },
        )
        with urlopen(req, timeout=timeout_seconds) as resp:
            raw = resp.read()
            charset = resp.headers.get_content_charset() or "utf-8"
        html = raw.decode(charset, errors="replace")
    except Exception as exc:
        # Deliberate best effort: extraction problems are reported in the
        # result instead of aborting the caller.
        return ExtractedArticle(
            title=None,
            author=None,
            canonical_url=None,
            summary=None,
            content_text=None,
            images=[],
            press_contact=None,
            extraction_error=str(exc),
        )

    html = _strip_noise(html)
    title = _extract_title(html)
    author = _extract_author(html)
    canonical_url = _extract_canonical(html)
    summary = _meta_content(html, "name", "description")
    content_text = _extract_content_text(html)
    if not summary and content_text:
        # No meta description: fall back to the first 320 characters of text.
        summary = _clean_text(content_text[:320])
    images = _extract_images(html, url)
    press_contact = _extract_press_contact(content_text)

    return ExtractedArticle(
        title=title,
        author=author,
        canonical_url=canonical_url,
        summary=summary,
        content_text=content_text,
        images=images,
        press_contact=press_contact,
        extraction_error=None,
    )
|
||||
|
||||
|
||||
def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
    """Convert an extraction result into a plain dict for metadata storage.

    All fields except ``content_text`` are copied; the ``images`` list is
    passed through by reference, not copied.
    """
    exported_fields = (
        "title",
        "author",
        "canonical_url",
        "summary",
        "images",
        "press_contact",
        "extraction_error",
    )
    return {field: getattr(article, field) for field in exported_fields}
|
||||
Loading…
Add table
Add a link
Reference in a new issue