feat: rebuild rss-news backend, admin ui, and legal extraction pipeline

This commit is contained in:
Oliver 2026-02-18 09:46:44 +01:00
parent d65c55d315
commit 2c331d683b
No known key found for this signature in database
43 changed files with 3463 additions and 73 deletions

1
backend/app/__init__.py Normal file
View file

@ -0,0 +1 @@
"""Application package."""

265
backend/app/admin_ui.py Normal file
View file

@ -0,0 +1,265 @@
from __future__ import annotations
import json
from pathlib import Path
from urllib.parse import urlencode
from fastapi import APIRouter, Form, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from .auth import create_session_token, verify_credentials, verify_session_token
from .config import get_settings
from .ingestion import run_ingestion
from .policy import evaluate_source_policy
from .repositories import (
FeedCreate,
SourceCreate,
create_feed,
create_source,
get_article_by_id,
list_articles,
list_feeds,
list_runs,
list_sources,
update_article_status,
)
# Settings are resolved once at import time and shared by every handler below.
settings = get_settings()
# Router that carries all server-rendered admin pages and form endpoints.
router = APIRouter(tags=["admin-ui"])
# Templates live in the "templates" directory next to this file's parent package directory.
templates = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
# Maps an article's current status to the statuses it may be moved to from the admin UI.
# NOTE(review): main.py defines an equivalent table (ALLOWED_ARTICLE_TRANSITIONS); keep both in sync.
ALLOWED_TRANSITIONS: dict[str, tuple[str, ...]] = {
    "new": ("review", "rewrite", "error"),
    "rewrite": ("review", "error"),
    "review": ("approved", "rewrite", "error"),
    "approved": ("published", "error"),
    "published": ("error",),
    "error": ("review", "rewrite"),
}
def _admin_user(request: Request) -> str | None:
    """Return the user name stored in the session cookie, if any.

    Yields ``None`` when the cookie is missing or empty; otherwise the
    result of ``verify_session_token`` for the cookie value is returned.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    return verify_session_token(session_token) if session_token else None
def _to_optional_int(raw: str | None) -> int | None:
    """Convert an optional form value into an integer.

    ``None`` and blank/whitespace-only strings map to ``None``; any other
    text is passed to ``int`` after stripping surrounding whitespace.

    Raises:
        ValueError: if the non-blank text is not a valid integer literal.
    """
    stripped = "" if raw is None else raw.strip()
    return None if stripped == "" else int(stripped)
def _dashboard_redirect(
    *,
    msg: str | None = None,
    msg_type: str = "success",
    status_filter: str | None = None,
) -> RedirectResponse:
    """Build a 303 redirect to the dashboard with optional flash parameters.

    Args:
        msg: Flash text; when truthy it is sent as ``msg`` together with
            ``type`` (taken from ``msg_type``).
        msg_type: Flash category, only transmitted alongside ``msg``.
        status_filter: Article status filter to keep active after the redirect.
    """
    params: dict[str, str] = {}
    if msg:
        params.update({"msg": msg, "type": msg_type})
    if status_filter:
        params["status_filter"] = status_filter

    target = "/admin/dashboard"
    if params:
        target = f"{target}?{urlencode(params)}"
    return RedirectResponse(url=target, status_code=303)
def _parse_meta_json(raw: str | None) -> dict:
    """Decode an article's ``meta_json`` text into a dict, best effort.

    Returns an empty dict when the input is empty, cannot be decoded, or
    decodes to something other than a JSON object.
    """
    if not raw:
        return {}
    try:
        decoded = json.loads(raw)
    except Exception:
        # Deliberately tolerant: malformed metadata must not break the dashboard.
        return {}
    if isinstance(decoded, dict):
        return decoded
    return {}
@router.get("/admin", response_class=HTMLResponse)
def admin_index(request: Request):
    """Entry point for ``/admin``: send the visitor to the dashboard or the login page."""
    destination = "/admin/dashboard" if _admin_user(request) else "/admin/login"
    return RedirectResponse(url=destination, status_code=303)
@router.get("/admin/login", response_class=HTMLResponse)
def admin_login_page(request: Request):
    """Render the login form, forwarding the ``error`` query parameter to the template."""
    context = {
        "request": request,
        "title": "Admin Login",
        "error": request.query_params.get("error"),
    }
    return templates.TemplateResponse(request, "admin_login.html", context)
@router.post("/admin/login")
def admin_login(request: Request, username: str = Form(...), password: str = Form(...)):
    """Check the submitted credentials and start an admin session.

    Args:
        request: Incoming request; its URL scheme decides the cookie's Secure flag.
        username: Login name from the form.
        password: Password from the form.

    Returns:
        A 303 redirect to ``/admin/login?error=1`` when the credentials are
        rejected, otherwise a 303 redirect to the dashboard that also sets
        the signed session cookie.
    """
    if not verify_credentials(username, password):
        return RedirectResponse(url="/admin/login?error=1", status_code=303)

    response = RedirectResponse(url="/admin/dashboard", status_code=303)
    response.set_cookie(
        key=settings.session_cookie_name,
        value=create_session_token(username),
        max_age=settings.session_max_age_seconds,
        httponly=True,
        # Mark the cookie Secure whenever the login itself arrived over HTTPS so
        # the session token is never replayed over plain HTTP; plain-HTTP
        # (local development) logins keep working exactly as before.
        secure=request.url.scheme == "https",
        samesite="lax",
    )
    return response
@router.post("/admin/logout")
def admin_logout():
    """Drop the session cookie and send the browser back to the login page."""
    redirect = RedirectResponse(url="/admin/login", status_code=303)
    redirect.delete_cookie(settings.session_cookie_name)
    return redirect
@router.get("/admin/dashboard", response_class=HTMLResponse)
def admin_dashboard(request: Request):
    """Render the admin dashboard.

    Unauthenticated visitors are redirected to the login page. Otherwise the
    page receives sources (with their policy issues), feeds, the latest 30
    runs and up to 100 articles, optionally restricted by the
    ``status_filter`` query parameter. Each article is enriched in place
    with values taken from the ``extraction`` section of its ``meta_json``.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    known_statuses = ["new", "rewrite", "review", "approved", "published", "error"]

    sources = list_sources()
    source_policy = {}
    for source in sources:
        source_policy[source["id"]] = evaluate_source_policy(source)
    feeds = list_feeds()
    runs = list_runs(limit=30)

    # Unknown or missing filter values fall back to "no filter".
    status_filter = request.query_params.get("status_filter")
    if status_filter in known_statuses:
        articles = list_articles(limit=100, status_filter=status_filter)
    else:
        status_filter = ""
        articles = list_articles(limit=100)

    for article in articles:
        meta = _parse_meta_json(article.get("meta_json"))
        extraction = meta.get("extraction")
        if not isinstance(extraction, dict):
            extraction = {}
        images = extraction.get("images")
        article["meta"] = meta
        article["extracted_images"] = images if isinstance(images, list) else []
        # Both remaining fields are only exposed when they are plain strings.
        for field in ("press_contact", "extraction_error"):
            text = extraction.get(field)
            article[field] = text if isinstance(text, str) else None

    context = {
        "request": request,
        "title": "Admin Dashboard",
        "user": user,
        "sources": sources,
        "source_policy": source_policy,
        "feeds": feeds,
        "runs": runs,
        "articles": articles,
        "status_options": known_statuses,
        "allowed_transitions": ALLOWED_TRANSITIONS,
        "status_filter": status_filter,
        "flash_msg": request.query_params.get("msg", ""),
        "flash_type": request.query_params.get("type", "success"),
    }
    return templates.TemplateResponse(request, "admin_dashboard.html", context)
@router.post("/admin/sources/create")
def admin_create_source(
    request: Request,
    name: str = Form(...),
    base_url: str = Form(""),
    terms_url: str = Form(""),
    license_name: str = Form(""),
    risk_level: str = Form("yellow"),
    last_reviewed_at: str = Form(""),
):
    """Create a source from the dashboard form.

    Empty optional form fields are stored as ``None``; the source is always
    created enabled and without notes. Any error raised while building or
    saving the source is reported as an error flash on the dashboard.
    """
    if not _admin_user(request):
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        new_source = SourceCreate(
            name=name,
            base_url=base_url or None,
            terms_url=terms_url or None,
            license_name=license_name or None,
            risk_level=risk_level,
            is_enabled=True,
            notes=None,
            last_reviewed_at=last_reviewed_at or None,
        )
        create_source(new_source)
    except Exception as exc:
        return _dashboard_redirect(msg=f"Quelle konnte nicht gespeichert werden: {exc}", msg_type="error")
    else:
        return _dashboard_redirect(msg="Quelle gespeichert")
@router.post("/admin/feeds/create")
def admin_create_feed(
    request: Request,
    name: str = Form(...),
    url: str = Form(...),
    source_id: str = Form(""),
):
    """Create an enabled feed from the dashboard form.

    ``source_id`` arrives as text; a blank value means "no source". Any
    error (including a non-numeric ``source_id``) becomes an error flash.
    """
    if not _admin_user(request):
        return RedirectResponse(url="/admin/login", status_code=303)

    try:
        new_feed = FeedCreate(
            name=name,
            url=url,
            source_id=_to_optional_int(source_id),
            is_enabled=True,
        )
        create_feed(new_feed)
    except Exception as exc:
        return _dashboard_redirect(msg=f"Feed konnte nicht gespeichert werden: {exc}", msg_type="error")
    else:
        return _dashboard_redirect(msg="Feed gespeichert")
@router.post("/admin/ingestion/run")
def admin_run_ingestion(request: Request, feed_id: str = Form("")):
    """Trigger an ingestion run from the dashboard.

    Args:
        request: Incoming request used for the session check.
        feed_id: Optional feed id as form text; blank means "all enabled feeds".

    Returns:
        A redirect to the login page for anonymous visitors, otherwise a
        dashboard redirect whose flash reports the run status and upsert count.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)
    try:
        stats = run_ingestion(feed_id=_to_optional_int(feed_id))
    except Exception as exc:
        return _dashboard_redirect(msg=f"Ingestion fehlgeschlagen: {exc}", msg_type="error")
    # run_ingestion reports failures through stats.status instead of raising,
    # so a failed run must not be flashed with the default "success" type.
    flash_type = "success" if stats.status == "success" else "error"
    return _dashboard_redirect(
        msg=f"Ingestion: {stats.status}, upserts={stats.articles_upserted}",
        msg_type=flash_type,
    )
@router.post("/admin/articles/{article_id}/review")
def admin_review_article(request: Request, article_id: int, decision: str = Form(...), note: str = Form("")):
    """Apply a review decision to an article that is currently in status ``review``.

    Args:
        request: Incoming request used for the session check.
        article_id: Id of the article under review.
        decision: ``"approve"`` moves the article to ``approved``;
            ``"reject"`` sends it back to ``rewrite``.
        note: Optional reviewer note; blank is stored as ``None``.

    Returns:
        A login redirect for anonymous visitors, otherwise a dashboard
        redirect with a success or error flash.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)
    article = get_article_by_id(article_id)
    if article and article.get("status") == "review" and decision in {"approve", "reject"}:
        target = "approved" if decision == "approve" else "rewrite"
        # A falsy result means nothing was updated (the JSON API handler in
        # main.py treats it the same way), so only claim success when the
        # update actually went through.
        if update_article_status(article_id, target, actor=user, note=note or None, decision=decision):
            return _dashboard_redirect(msg=f"Artikel #{article_id}: {decision}")
    return _dashboard_redirect(msg=f"Review-Aktion ungueltig fuer Artikel #{article_id}", msg_type="error")
@router.post("/admin/articles/{article_id}/transition")
def admin_transition_article(request: Request, article_id: int, target_status: str = Form(...), note: str = Form("")):
    """Move an article to another status if ``ALLOWED_TRANSITIONS`` permits it.

    Args:
        request: Incoming request used for the session check.
        article_id: Id of the article to change.
        target_status: Requested new status.
        note: Optional note; blank is stored as ``None``.

    Returns:
        A login redirect for anonymous visitors, otherwise a dashboard
        redirect with a success or error flash.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)
    article = get_article_by_id(article_id)
    if article:
        current = article.get("status")
        if target_status in ALLOWED_TRANSITIONS.get(current, ()):
            # A falsy result means nothing was updated (the JSON API handler
            # in main.py treats it the same way); do not report success then.
            if update_article_status(article_id, target_status, actor=user, note=note or None):
                return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}")
    return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")

31
backend/app/auth.py Normal file
View file

@ -0,0 +1,31 @@
import hmac
from typing import Optional
from itsdangerous import URLSafeTimedSerializer, BadSignature, SignatureExpired
from .config import get_settings
def _serializer() -> URLSafeTimedSerializer:
    """Build the signer used for session tokens from the configured secret key."""
    secret_key = get_settings().app_secret_key
    return URLSafeTimedSerializer(secret_key, salt="rss-news-session")
def verify_credentials(username: str, password: str) -> bool:
    """Check a username/password pair against the configured admin account.

    Both values are compared with ``hmac.compare_digest`` and both
    comparisons are always evaluated before the results are combined.

    Args:
        username: Submitted login name.
        password: Submitted password.

    Returns:
        ``True`` only when both values match the configured credentials.
    """
    settings = get_settings()

    def _as_bytes(text: str) -> bytes:
        # compare_digest only accepts ASCII-only str; comparing UTF-8 bytes
        # keeps non-ASCII input (e.g. umlauts) from raising TypeError.
        # "surrogatepass" additionally tolerates lone surrogates in the input.
        return text.encode("utf-8", "surrogatepass")

    user_ok = hmac.compare_digest(_as_bytes(username), _as_bytes(settings.app_admin_username))
    pw_ok = hmac.compare_digest(_as_bytes(password), _as_bytes(settings.app_admin_password))
    return user_ok and pw_ok
def create_session_token(username: str) -> str:
    """Return a signed, timestamped token that carries ``username``."""
    payload = {"username": username}
    return _serializer().dumps(payload)
def verify_session_token(token: str) -> Optional[str]:
    """Return the user name stored in ``token``.

    ``None`` is returned when the signature is invalid or the token is older
    than the configured ``session_max_age_seconds``.
    """
    max_age = get_settings().session_max_age_seconds
    try:
        data = _serializer().loads(token, max_age=max_age)
    except (BadSignature, SignatureExpired):
        return None
    else:
        return data.get("username")

29
backend/app/config.py Normal file
View file

@ -0,0 +1,29 @@
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration loaded from the environment and env files.

    Unknown keys in the env files are ignored (``extra="ignore"``).
    """

    # Prefer backend-specific env file to avoid collisions with legacy root .env
    model_config = SettingsConfigDict(
        env_file=("backend/.env", ".env"),
        env_file_encoding="utf-8",
        extra="ignore",
    )
    app_env: str = "development"
    app_name: str = "rss-news-backend"
    # NOTE(review): the secret key and admin credentials below are placeholder
    # defaults; presumably they must be overridden in any real deployment - confirm.
    app_secret_key: str = "replace-with-a-long-random-secret"
    app_admin_username: str = "admin"
    app_admin_password: str = "change-me"
    session_cookie_name: str = "rss_news_session"
    # 28800 seconds = 8 hours.
    session_max_age_seconds: int = 28800
    # Relative path to the SQLite database file.
    app_db_path: str = "backend/data/rss_news.db"
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the process-wide ``Settings`` instance (built once, then cached)."""
    loaded = Settings()
    return loaded

138
backend/app/db.py Normal file
View file

@ -0,0 +1,138 @@
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Iterator
from .config import get_settings
def _db_path() -> Path:
    """Return the configured database path, creating its parent directory if needed."""
    db_file = Path(get_settings().app_db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a SQLite connection with row access by name and foreign keys enabled.

    The connection is committed when the ``with`` body finishes without an
    exception and is closed in every case.
    """
    connection = sqlite3.connect(_db_path())
    connection.row_factory = sqlite3.Row
    connection.execute("PRAGMA foreign_keys=ON;")
    try:
        yield connection
        # Only reached when the caller's block did not raise.
        connection.commit()
    finally:
        connection.close()
def init_db() -> None:
    """Create or upgrade the SQLite schema.

    The work happens in three steps on one connection:

    1. create the tables (``IF NOT EXISTS``),
    2. add ``articles.source_hash`` to databases created before that column
       was introduced,
    3. create indexes and ``updated_at`` triggers.

    The migration has to run before step 3 because two indexes reference
    ``articles(source_hash)``; creating them on an old table without that
    column would abort the whole initialisation.
    """
    tables_sql = """
        PRAGMA journal_mode=WAL;
        CREATE TABLE IF NOT EXISTS sources (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            base_url TEXT,
            terms_url TEXT,
            license_name TEXT,
            risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
            is_enabled INTEGER NOT NULL DEFAULT 0,
            notes TEXT,
            last_reviewed_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS feeds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_id INTEGER,
            name TEXT NOT NULL,
            url TEXT NOT NULL UNIQUE,
            is_enabled INTEGER NOT NULL DEFAULT 1,
            etag TEXT,
            last_modified TEXT,
            last_checked_at TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
        );
        CREATE TABLE IF NOT EXISTS runs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            run_type TEXT NOT NULL,
            status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
            started_at TEXT NOT NULL DEFAULT (datetime('now')),
            finished_at TEXT,
            details TEXT
        );
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feed_id INTEGER,
            source_article_id TEXT,
            source_hash TEXT,
            title TEXT NOT NULL,
            source_url TEXT NOT NULL,
            canonical_url TEXT,
            published_at TEXT,
            author TEXT,
            summary TEXT,
            content_raw TEXT,
            content_rewritten TEXT,
            word_count INTEGER DEFAULT 0,
            status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
            meta_json TEXT,
            created_at TEXT NOT NULL DEFAULT (datetime('now')),
            updated_at TEXT NOT NULL DEFAULT (datetime('now')),
            FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
            UNIQUE(source_url)
        );
    """
    indexes_and_triggers_sql = """
        CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
        CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
            ON articles(feed_id, source_article_id)
            WHERE source_article_id IS NOT NULL;
        CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
            ON articles(source_hash)
            WHERE source_hash IS NOT NULL;
        CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
        CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
        CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
        CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
        CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
        AFTER UPDATE ON sources
        FOR EACH ROW
        BEGIN
            UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
        CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
        AFTER UPDATE ON feeds
        FOR EACH ROW
        BEGIN
            UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
        CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
        AFTER UPDATE ON articles
        FOR EACH ROW
        BEGIN
            UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
        END;
    """
    with get_conn() as conn:
        conn.executescript(tables_sql)

        # Lightweight migration for existing DBs created before source_hash was
        # introduced. It must precede the index creation below.
        existing_columns = {
            row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
        }
        if "source_hash" not in existing_columns:
            conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT")

        conn.executescript(indexes_and_triggers_sql)
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert ``sqlite3.Row`` objects into plain dictionaries, preserving order."""
    return list(map(dict, rows))

253
backend/app/ingestion.py Normal file
View file

@ -0,0 +1,253 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import hashlib
import json
import time
from typing import Any
import feedparser
from .policy import evaluate_source_policy
from .repositories import (
ArticleUpsert,
RunCreate,
create_run,
finish_run,
get_feed_by_id,
list_enabled_feeds,
update_feed_fetch_state,
upsert_article,
)
from .source_extraction import extract_article, extracted_article_to_meta
@dataclass(frozen=True)
class IngestionStats:
    """Immutable summary of one ingestion run, as returned by ``run_ingestion``."""

    # Id of the run record created for this ingestion.
    run_id: int
    # Number of feeds visited (feeds blocked by policy or failing to fetch are counted too).
    feeds_processed: int
    # Number of feed entries iterated across all feeds.
    entries_seen: int
    # Number of upserts that returned a truthy article id.
    articles_upserted: int
    # "success" or "failed".
    status: str
    # Human-readable outcome; the exception text when the run failed.
    message: str
# Maximum number of fetch attempts per feed before it is recorded as failed.
MAX_FEED_FETCH_RETRIES = 3
def _entry_published_iso(entry: dict) -> str | None:
    """Return the entry's publication time as an ISO-8601 string in UTC.

    ``published_parsed`` is preferred, ``updated_parsed`` is the fallback;
    ``None`` is returned when neither is present. The first six fields of
    the time tuple are used and labelled as UTC.
    """
    stamp = entry.get("published_parsed") or entry.get("updated_parsed")
    if stamp:
        moment = datetime(*stamp[:6], tzinfo=timezone.utc)
        return moment.isoformat()
    return None
def _entry_text(entry: dict) -> tuple[str, str]:
    """Return ``(summary, content)`` for a feed entry.

    The content is the ``value`` of the first item in the entry's
    ``content`` list when that item is a dict; if no content can be found
    the summary is used for both values.
    """
    summary = entry.get("summary", "") or ""
    body = ""
    blocks = entry.get("content")
    if blocks and isinstance(blocks, list):
        head = blocks[0]
        if isinstance(head, dict):
            body = head.get("value", "")
    return summary, body or summary
def _entry_hash(entry: dict, feed_id: int, link: str, title: str, summary: str) -> str:
    """Return a SHA-256 hex digest that fingerprints a feed entry.

    The fingerprint joins feed id, entry id (``id`` or ``guid``), link,
    stripped title, stripped summary and the publication timestamp with
    ``|`` separators.
    """
    entry_id = entry.get("id") or entry.get("guid") or ""
    published_iso = _entry_published_iso(entry) or ""
    clean_title = title.strip()
    clean_summary = summary.strip()
    raw = f"{feed_id}|{entry_id}|{link}|{clean_title}|{clean_summary}|{published_iso}"
    digest = hashlib.sha256(raw.encode("utf-8"))
    return digest.hexdigest()
def _parsed_get(parsed: object, key: str, default: object = None) -> object:
    """Read ``key`` from a parse result that is either a mapping or a plain object.

    Dicts are queried with ``.get``; anything else with ``getattr``.
    """
    if not isinstance(parsed, dict):
        return getattr(parsed, key, default)
    return parsed.get(key, default)
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
    """Fetch feeds, extract their articles and upsert them into the database.

    A run record is created up front and finished with either "success" or
    "failed". Exceptions raised while processing are not propagated: they
    end the run as "failed" and are reported through the returned stats.

    Args:
        feed_id: When given, only that feed is processed (and only if it is
            enabled); otherwise all enabled feeds are processed.

    Returns:
        ``IngestionStats`` with the run id, counters and final status.
    """
    run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
    feeds_processed = 0
    entries_seen = 0
    articles_upserted = 0
    # Per-feed outcome records, serialised into the run details on success.
    feed_results: list[dict[str, object]] = []
    try:
        if feed_id is not None:
            feed = get_feed_by_id(feed_id)
            # A missing or disabled feed results in an empty (but successful) run.
            feeds = [feed] if feed and int(feed.get("is_enabled", 0)) == 1 else []
        else:
            feeds = list_enabled_feeds()
        for feed in feeds:
            if not feed:
                continue
            feeds_processed += 1
            # The feed row carries its source's fields under a "source_" prefix;
            # rebuild a source-shaped dict for the policy check.
            source_snapshot = {
                "id": feed.get("source_id"),
                "name": feed.get("source_name"),
                "base_url": feed.get("source_base_url"),
                "terms_url": feed.get("source_terms_url"),
                "license_name": feed.get("source_license_name"),
                "risk_level": feed.get("source_risk_level"),
                "last_reviewed_at": feed.get("source_last_reviewed_at"),
                "is_enabled": feed.get("source_is_enabled"),
            }
            policy_issues = evaluate_source_policy(source_snapshot)
            if policy_issues:
                # Any policy issue blocks the feed: nothing is fetched for it.
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "blocked",
                        "policy_issues": policy_issues,
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue
            parsed = None
            feed_error = None
            # Retry with a linearly growing pause (0.5s, 1.0s, ...) between attempts.
            # NOTE(review): the retry only triggers when feedparser.parse raises;
            # confirm whether fetch errors actually surface as exceptions.
            for attempt in range(1, MAX_FEED_FETCH_RETRIES + 1):
                try:
                    parsed = feedparser.parse(
                        feed["url"],
                        etag=feed.get("etag"),
                        modified=feed.get("last_modified"),
                    )
                    break
                except Exception as exc:
                    feed_error = str(exc)
                    if attempt < MAX_FEED_FETCH_RETRIES:
                        time.sleep(0.5 * attempt)
            if parsed is None:
                # All attempts raised: record the last error and move on to the next feed.
                feed_results.append(
                    {
                        "feed_id": int(feed["id"]),
                        "feed_url": feed["url"],
                        "status": "failed",
                        "error": feed_error or "unknown",
                        "entries_seen": 0,
                        "upserts": 0,
                    }
                )
                continue
            # Persist ETag/Last-Modified for conditional requests.
            parsed_etag = _parsed_get(parsed, "etag")
            parsed_modified = _parsed_get(parsed, "modified")
            if parsed_modified and not isinstance(parsed_modified, str):
                parsed_modified = str(parsed_modified)
            # Non-string (including missing) values are passed on as None.
            update_feed_fetch_state(
                feed_id=int(feed["id"]),
                etag=parsed_etag if isinstance(parsed_etag, str) else None,
                last_modified=parsed_modified if isinstance(parsed_modified, str) else None,
            )
            feed_entries_seen = 0
            feed_upserts = 0
            for entry in _parsed_get(parsed, "entries", []):
                entries_seen += 1
                feed_entries_seen += 1
                link = entry.get("link")
                if not link:
                    # Entries without a link are counted as seen but never stored.
                    continue
                summary, content_raw = _entry_text(entry)
                title = entry.get("title") or "Ohne Titel"
                extracted = extract_article(link)
                # Values extracted from the article page win over the feed's own values.
                final_title = extracted.title or title
                final_author = extracted.author or entry.get("author")
                final_summary = extracted.summary or (summary[:1000] if summary else None)
                final_content_raw = extracted.content_text or content_raw
                final_canonical = extracted.canonical_url or entry.get("link")
                source_hash = _entry_hash(
                    entry,
                    int(feed["id"]),
                    link,
                    final_title,
                    final_summary or "",
                )
                # Attribution data stored with the article inside meta_json.
                attribution = {
                    "source_name": feed.get("source_name"),
                    "source_base_url": feed.get("source_base_url"),
                    "source_terms_url": feed.get("source_terms_url"),
                    "source_license_name": feed.get("source_license_name"),
                    "source_risk_level": feed.get("source_risk_level"),
                    "original_link": link,
                    "feed_name": feed.get("name"),
                    "feed_id": int(feed["id"]),
                    "imported_at": datetime.now(timezone.utc).isoformat(),
                }
                extraction_meta: dict[str, Any] = extracted_article_to_meta(extracted)
                extraction_meta["fetched_from"] = link
                article_id = upsert_article(
                    ArticleUpsert(
                        feed_id=int(feed["id"]),
                        source_article_id=entry.get("id") or entry.get("guid"),
                        source_hash=source_hash,
                        title=final_title,
                        source_url=link,
                        canonical_url=final_canonical,
                        published_at=_entry_published_iso(entry),
                        author=final_author,
                        summary=final_summary,
                        content_raw=final_content_raw,
                        content_rewritten=None,
                        word_count=len((final_content_raw or "").split()),
                        status="new",
                        meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
                    )
                )
                # Only upserts that return a truthy id are counted.
                if article_id:
                    articles_upserted += 1
                    feed_upserts += 1
            feed_results.append(
                {
                    "feed_id": int(feed["id"]),
                    "feed_url": feed["url"],
                    "status": "success",
                    "entries_seen": feed_entries_seen,
                    "upserts": feed_upserts,
                }
            )
        # The run itself counts as successful even if single feeds were blocked or failed.
        finish_run(
            run_id=run_id,
            status="success",
            details=json.dumps(
                {
                    "feeds_processed": feeds_processed,
                    "entries_seen": entries_seen,
                    "upserts": articles_upserted,
                    "feeds": feed_results,
                },
                ensure_ascii=False,
            ),
        )
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="success",
            message="Ingestion abgeschlossen",
        )
    except Exception as exc:
        # Any unexpected error aborts the remaining feeds; the counters reflect
        # the work done up to that point.
        finish_run(run_id=run_id, status="failed", details=str(exc))
        return IngestionStats(
            run_id=run_id,
            feeds_processed=feeds_processed,
            entries_seen=entries_seen,
            articles_upserted=articles_upserted,
            status="failed",
            message=str(exc),
        )

404
backend/app/main.py Normal file
View file

@ -0,0 +1,404 @@
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import Depends, FastAPI, HTTPException, Request, Response, status
from pydantic import BaseModel, Field
from fastapi.staticfiles import StaticFiles
from .admin_ui import router as admin_router
from .auth import create_session_token, verify_credentials, verify_session_token
from .config import get_settings
from .db import init_db
from .ingestion import run_ingestion
from .policy import evaluate_source_policy, is_source_allowed
from .repositories import (
ArticleUpsert,
FeedCreate,
RunCreate,
SourceCreate,
create_feed as repo_create_feed,
create_run,
create_source as repo_create_source,
finish_run,
get_article_by_id,
get_feed_by_id,
get_run_by_id,
get_source_by_id,
list_articles as repo_list_articles,
list_feeds as repo_list_feeds,
list_runs,
list_sources as repo_list_sources,
update_article_status,
upsert_article as repo_upsert_article,
)
# Settings are resolved once at import time and shared by the app and all handlers below.
settings = get_settings()
@asynccontextmanager
async def app_lifespan(_: FastAPI):
    """Application lifespan hook: initialise the database before serving requests."""
    # Startup phase.
    init_db()
    # Hand control to the running application; there is no shutdown work.
    yield
# The application object; the lifespan hook initialises the database on startup.
app = FastAPI(title=settings.app_name, lifespan=app_lifespan)
# Server-rendered admin pages.
app.include_router(admin_router)
# Static assets for the admin UI, served from the "static" directory next to the package directory.
app.mount(
    "/admin/static",
    StaticFiles(directory=str(Path(__file__).resolve().parent.parent / "static")),
    name="admin-static",
)
class LoginRequest(BaseModel):
    """Payload accepted by ``POST /auth/login``."""

    username: str
    password: str
class SourceCreateRequest(BaseModel):
    """Payload accepted by ``POST /api/sources``."""

    name: str = Field(min_length=1, max_length=200)
    base_url: str | None = None
    terms_url: str | None = None
    license_name: str | None = None
    # Restricted to the three risk levels: green, yellow or red.
    risk_level: str = Field(default="yellow", pattern="^(green|yellow|red)$")
    # Sources created through the API start disabled unless requested otherwise.
    is_enabled: bool = False
    notes: str | None = None
    last_reviewed_at: str | None = None
class FeedCreateRequest(BaseModel):
    """Payload accepted by ``POST /api/feeds``."""

    name: str = Field(min_length=1, max_length=200)
    # NOTE(review): only the length is validated here, not the URL scheme - confirm
    # whether the value should be limited to http(s) before it reaches the fetcher.
    url: str = Field(min_length=5, max_length=1000)
    source_id: int | None = None
    is_enabled: bool = True
class RunCreateRequest(BaseModel):
    """Payload accepted by ``POST /api/runs``."""

    run_type: str = Field(min_length=2, max_length=100)
    # One of: queued, running, success, failed.
    status: str = Field(default="queued", pattern="^(queued|running|success|failed)$")
    details: str | None = None
class RunFinishRequest(BaseModel):
    """Payload accepted by ``POST /api/runs/{run_id}/finish``."""

    # A run can only be finished as "success" or "failed".
    status: str = Field(pattern="^(success|failed)$")
    details: str | None = None
class ArticleUpsertRequest(BaseModel):
    """Payload accepted by ``POST /api/articles/upsert``.

    The fields are passed one-to-one into ``ArticleUpsert``.
    """

    feed_id: int | None = None
    source_article_id: str | None = None
    source_hash: str | None = None
    title: str = Field(min_length=1, max_length=500)
    source_url: str = Field(min_length=5, max_length=2000)
    canonical_url: str | None = None
    published_at: str | None = None
    author: str | None = None
    summary: str | None = None
    content_raw: str | None = None
    content_rewritten: str | None = None
    word_count: int = 0
    # One of: new, rewrite, review, approved, published, error.
    status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$")
    # Metadata as a string; presumably JSON text - TODO confirm against the repository layer.
    meta_json: str | None = None
class IngestionRunRequest(BaseModel):
    """Payload accepted by ``POST /api/ingestion/run``."""

    # Optional feed to restrict the run to; None is passed through to run_ingestion.
    feed_id: int | None = None
class ArticleTransitionRequest(BaseModel):
    """Payload accepted by ``POST /api/articles/{article_id}/transition``."""

    # Must be a known status; whether the move is allowed is checked by the handler.
    target_status: str = Field(pattern="^(new|rewrite|review|approved|published|error)$")
    note: str | None = None
class ArticleReviewRequest(BaseModel):
    """Payload accepted by ``POST /api/articles/{article_id}/review``."""

    # "approve" or "reject".
    decision: str = Field(pattern="^(approve|reject)$")
    note: str | None = None
# Maps an article's current status to the set of statuses it may be moved to via the API.
# NOTE(review): admin_ui.py defines an equivalent table (ALLOWED_TRANSITIONS); keep both in sync.
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
    "new": {"review", "rewrite", "error"},
    "rewrite": {"review", "error"},
    "review": {"approved", "rewrite", "error"},
    "approved": {"published", "error"},
    "published": {"error"},
    "error": {"review", "rewrite"},
}
def require_auth(request: Request) -> str:
    """Dependency that returns the logged-in user name.

    Raises:
        HTTPException: 401 when the session cookie is missing, or when its
            token fails verification.
    """
    session_token = request.cookies.get(settings.session_cookie_name)
    if not session_token:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Nicht angemeldet")

    user = verify_session_token(session_token)
    if user:
        return user
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Session ungueltig oder abgelaufen")
@app.get("/health")
def health() -> dict:
    """Unauthenticated liveness endpoint reporting service name and database path."""
    report = {"status": "ok"}
    report["service"] = settings.app_name
    report["db_path"] = settings.app_db_path
    return report
@app.post("/auth/login")
def login(payload: LoginRequest, response: Response) -> dict:
    """Authenticate via the JSON API and set the session cookie.

    Raises:
        HTTPException: 401 when the credentials are rejected.
    """
    credentials_ok = verify_credentials(payload.username, payload.password)
    if not credentials_ok:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Ungueltige Zugangsdaten")

    # The cookie is HTTP-only, same-site "lax" and not flagged Secure.
    cookie_options = {
        "max_age": settings.session_max_age_seconds,
        "httponly": True,
        "secure": False,
        "samesite": "lax",
    }
    response.set_cookie(
        key=settings.session_cookie_name,
        value=create_session_token(payload.username),
        **cookie_options,
    )
    return {"ok": True, "username": payload.username}
@app.post("/auth/logout")
def logout(response: Response) -> dict:
    """Remove the session cookie from the client."""
    cookie_name = settings.session_cookie_name
    response.delete_cookie(cookie_name)
    return {"ok": True}
@app.get("/auth/me")
def me(username: str = Depends(require_auth)) -> dict:
    """Return the identity of the authenticated caller."""
    identity = {"authenticated": True, "username": username}
    return identity
@app.get("/api/protected")
def protected(username: str = Depends(require_auth)) -> dict:
    """Sample endpoint that is reachable only with a valid session."""
    body = {"ok": True, "message": "Protected endpoint"}
    body["username"] = username
    return body
@app.get("/api/pipeline/status")
def pipeline_status(username: str = Depends(require_auth)) -> dict:
    """Return simple record counts for sources, feeds and articles.

    The article count is derived from a listing limited to 500 rows, so it
    never exceeds 500.
    """
    feed_count = len(repo_list_feeds())
    source_count = len(repo_list_sources())
    article_count = len(repo_list_articles(limit=500))
    counts = {
        "sources": source_count,
        "feeds": feed_count,
        "articles": article_count,
    }
    return {
        "ok": True,
        "stage": "skeleton+db",
        "requested_by": username,
        "counts": counts,
    }
@app.get("/api/sources")
def list_sources(username: str = Depends(require_auth)) -> dict:
    """List all sources."""
    items = repo_list_sources()
    return {"ok": True, "items": items, "requested_by": username}
@app.get("/api/sources/{source_id}/policy-check")
def source_policy_check(source_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for one source.

    Raises:
        HTTPException: 404 when the source does not exist.
    """
    source = get_source_by_id(source_id)
    if not source:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Quelle nicht gefunden")

    issues = evaluate_source_policy(source)
    allowed = is_source_allowed(source)
    return {
        "ok": True,
        "source_id": source_id,
        "allowed": allowed,
        "issues": issues,
        "requested_by": username,
    }
@app.post("/api/sources")
def create_source(payload: SourceCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a source from the validated payload and return its id."""
    new_source = SourceCreate(
        name=payload.name,
        base_url=payload.base_url,
        terms_url=payload.terms_url,
        license_name=payload.license_name,
        risk_level=payload.risk_level,
        is_enabled=payload.is_enabled,
        notes=payload.notes,
        last_reviewed_at=payload.last_reviewed_at,
    )
    created_id = repo_create_source(new_source)
    return {"ok": True, "id": created_id, "requested_by": username}
@app.get("/api/feeds")
def list_feeds(username: str = Depends(require_auth)) -> dict:
    """List all feeds."""
    items = repo_list_feeds()
    return {"ok": True, "items": items, "requested_by": username}
@app.get("/api/feeds/{feed_id}/policy-check")
def feed_policy_check(feed_id: int, username: str = Depends(require_auth)) -> dict:
    """Evaluate the source policy for the source attached to a feed.

    The feed is allowed only when the policy check reports no issues.

    Raises:
        HTTPException: 404 when the feed does not exist.
    """
    feed = get_feed_by_id(feed_id)
    if not feed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Feed nicht gefunden")

    # Every source field is stored on the feed row under a "source_" prefix.
    source_fields = (
        "id",
        "name",
        "base_url",
        "terms_url",
        "license_name",
        "risk_level",
        "last_reviewed_at",
        "is_enabled",
    )
    source_snapshot = {field: feed.get(f"source_{field}") for field in source_fields}
    issues = evaluate_source_policy(source_snapshot)
    return {
        "ok": True,
        "feed_id": feed_id,
        "allowed": not issues,
        "issues": issues,
        "requested_by": username,
    }
@app.post("/api/feeds")
def create_feed(payload: FeedCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a feed and return its id.

    Raises:
        HTTPException: 400 when building or storing the feed raises.
    """
    try:
        new_feed = FeedCreate(
            name=payload.name,
            url=payload.url,
            source_id=payload.source_id,
            is_enabled=payload.is_enabled,
        )
        created_id = repo_create_feed(new_feed)
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Feed konnte nicht angelegt werden: {exc}") from exc
    else:
        return {"ok": True, "id": created_id, "requested_by": username}
@app.get("/api/runs")
def api_list_runs(limit: int = 50, username: str = Depends(require_auth)) -> dict:
    """List runs, passing ``limit`` through to the repository."""
    items = list_runs(limit=limit)
    return {"ok": True, "items": items, "requested_by": username}
@app.get("/api/runs/{run_id}")
def api_get_run(run_id: int, username: str = Depends(require_auth)) -> dict:
    """Return a single run.

    Raises:
        HTTPException: 404 when the run does not exist.
    """
    found = get_run_by_id(run_id)
    if found:
        return {"ok": True, "item": found, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Run nicht gefunden")
@app.post("/api/runs")
def api_create_run(payload: RunCreateRequest, username: str = Depends(require_auth)) -> dict:
    """Create a run record and return its id."""
    new_run = RunCreate(run_type=payload.run_type, status=payload.status, details=payload.details)
    created_id = create_run(new_run)
    return {"ok": True, "id": created_id, "requested_by": username}
@app.post("/api/runs/{run_id}/finish")
def api_finish_run(run_id: int, payload: RunFinishRequest, username: str = Depends(require_auth)) -> dict:
    """Mark a run as finished with the given status and details.

    The response is always ``ok``; the result of ``finish_run`` is not inspected.
    """
    finish_run(run_id=run_id, status=payload.status, details=payload.details)
    result = {"ok": True, "id": run_id, "requested_by": username}
    return result
@app.get("/api/articles")
def api_list_articles(limit: int = 100, status_filter: str | None = None, username: str = Depends(require_auth)) -> dict:
    """List articles; ``limit`` and ``status_filter`` are passed through to the repository."""
    items = repo_list_articles(limit=limit, status_filter=status_filter)
    return {"ok": True, "items": items, "requested_by": username}
@app.get("/api/articles/{article_id}")
def api_get_article(article_id: int, username: str = Depends(require_auth)) -> dict:
    """Return a single article.

    Raises:
        HTTPException: 404 when the article does not exist.
    """
    found = get_article_by_id(article_id)
    if found:
        return {"ok": True, "item": found, "requested_by": username}
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")
@app.post("/api/articles/upsert")
def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(require_auth)) -> dict:
    """Insert or update an article from the validated payload and return its id."""
    record = ArticleUpsert(
        feed_id=payload.feed_id,
        source_article_id=payload.source_article_id,
        source_hash=payload.source_hash,
        title=payload.title,
        source_url=payload.source_url,
        canonical_url=payload.canonical_url,
        published_at=payload.published_at,
        author=payload.author,
        summary=payload.summary,
        content_raw=payload.content_raw,
        content_rewritten=payload.content_rewritten,
        word_count=payload.word_count,
        status=payload.status,
        meta_json=payload.meta_json,
    )
    stored_id = repo_upsert_article(record)
    return {"ok": True, "id": stored_id, "requested_by": username}
@app.post("/api/articles/{article_id}/transition")
def api_article_transition(article_id: int, payload: ArticleTransitionRequest, username: str = Depends(require_auth)) -> dict:
    """Move an article to ``payload.target_status`` when the transition table allows it.

    Raises:
        HTTPException: 404 when the article is missing (before or during the
            update), 400 when the transition is not allowed.
    """
    not_found = HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    article = get_article_by_id(article_id)
    if not article:
        raise not_found

    current_status = article.get("status")
    requested = payload.target_status
    if requested not in ALLOWED_ARTICLE_TRANSITIONS.get(current_status, set()):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Ungueltiger Statuswechsel: {current_status} -> {requested}",
        )

    if not update_article_status(article_id, requested, actor=username, note=payload.note):
        raise not_found
    return {"ok": True, "id": article_id, "from_status": current_status, "to_status": requested}
@app.post("/api/articles/{article_id}/review")
def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Apply a review decision to an article in status ``review``.

    ``approve`` moves the article to ``approved``; ``reject`` sends it back
    to ``rewrite``.

    Raises:
        HTTPException: 404 when the article is missing (before or during the
            update), 400 when it is not in status ``review``.
    """
    not_found = HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    article = get_article_by_id(article_id)
    if not article:
        raise not_found
    if article.get("status") != "review":
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Review nur fuer Status 'review' erlaubt (aktuell: {article.get('status')})",
        )

    if payload.decision == "approve":
        target_status = "approved"
    else:
        target_status = "rewrite"

    was_updated = update_article_status(
        article_id,
        target_status,
        actor=username,
        note=payload.note,
        decision=payload.decision,
    )
    if not was_updated:
        raise not_found
    return {
        "ok": True,
        "id": article_id,
        "decision": payload.decision,
        "to_status": target_status,
    }
@app.post("/api/ingestion/run")
def api_run_ingestion(payload: IngestionRunRequest, username: str = Depends(require_auth)) -> dict:
    """Run the ingestion for ``payload.feed_id`` and report the outcome.

    ``ok`` is true only when the returned run status equals ``"success"``.
    """
    result = run_ingestion(feed_id=payload.feed_id)
    counters = {
        "feeds_processed": result.feeds_processed,
        "entries_seen": result.entries_seen,
        "articles_upserted": result.articles_upserted,
    }
    return {
        "ok": result.status == "success",
        "run_id": result.run_id,
        "status": result.status,
        "message": result.message,
        "stats": counters,
        "requested_by": username,
    }

35
backend/app/policy.py Normal file
View file

@ -0,0 +1,35 @@
from __future__ import annotations
from typing import Any
def evaluate_source_policy(source: dict[str, Any] | None) -> list[str]:
    """Collect all policy violations for a source record.

    Args:
        source: Mapping with the keys ``risk_level``, ``terms_url``,
            ``license_name``, ``last_reviewed_at`` and ``is_enabled``;
            ``None`` or an empty mapping means "no source assigned".

    Returns:
        Human-readable issue messages in a fixed order; an empty list means
        the source passes every check.
    """
    if not source:
        return ["Keine Quelle zugeordnet"]

    def _text(key: str) -> str:
        # Missing keys and None both count as an empty value.
        return (source.get(key) or "").strip()

    findings: list[str] = []

    level = _text("risk_level").lower()
    if level != "green":
        findings.append(f"Quelle nicht freigegeben (risk_level={level or 'unset'})")

    required_fields = (
        ("terms_url", "terms_url fehlt"),
        ("license_name", "license_name fehlt"),
        ("last_reviewed_at", "last_reviewed_at fehlt"),
    )
    for key, message in required_fields:
        if not _text(key):
            findings.append(message)

    enabled_flag = int(source.get("is_enabled", 0) or 0)
    if enabled_flag != 1:
        findings.append("Quelle ist deaktiviert")

    return findings
def is_source_allowed(source: dict[str, Any] | None) -> bool:
    """Return True when ``evaluate_source_policy`` reports no issues."""
    issues = evaluate_source_policy(source)
    return not issues

416
backend/app/repositories.py Normal file
View file

@ -0,0 +1,416 @@
from __future__ import annotations
from dataclasses import dataclass
import json
from datetime import datetime, timezone
from typing import Any
from .db import get_conn, rows_to_dicts
@dataclass(frozen=True)
class SourceCreate:
    """Immutable input record consumed by ``create_source``."""

    # Display name; create_source strips surrounding whitespace before insert.
    name: str
    # Stored as given (no normalisation in create_source).
    base_url: str | None
    # Policy evaluation reports "terms_url fehlt" when this is empty.
    terms_url: str | None
    # Policy evaluation reports "license_name fehlt" when this is empty.
    license_name: str | None
    # Only the value "green" (case-insensitive) passes evaluate_source_policy.
    risk_level: str
    # Persisted as 1/0 by create_source.
    is_enabled: bool
    notes: str | None
    # Stored as text; NOTE(review): expected format is not enforced here.
    last_reviewed_at: str | None
@dataclass(frozen=True)
class FeedCreate:
    """Immutable input record consumed by ``create_feed``."""

    # Display name; create_feed strips surrounding whitespace before insert.
    name: str
    # Feed URL; create_feed strips surrounding whitespace before insert.
    url: str
    # Optional reference to sources.id (feeds are LEFT JOINed to sources).
    source_id: int | None
    # Persisted as 1/0 by create_feed.
    is_enabled: bool
@dataclass(frozen=True)
class RunCreate:
    """Immutable input record consumed by ``create_run``."""

    # Inserted verbatim into runs.run_type.
    run_type: str
    # Initial status; finish_run later overwrites it.
    status: str
    # Optional free-form details text.
    details: str | None = None
@dataclass(frozen=True)
class ArticleUpsert:
    """Immutable input record consumed by ``upsert_article``.

    Existing rows are matched first by ``source_url``, then by
    ``feed_id`` + ``source_article_id``, then by ``source_hash``.
    """

    feed_id: int | None
    # Feed-level identifier of the entry; only used for matching together with feed_id.
    source_article_id: str | None
    # Last-resort matching key.
    source_hash: str | None
    # Stripped of surrounding whitespace before it is stored.
    title: str
    # Primary matching key; stripped of surrounding whitespace before use.
    source_url: str
    canonical_url: str | None
    published_at: str | None
    author: str | None
    summary: str | None
    content_raw: str | None
    content_rewritten: str | None
    word_count: int
    status: str
    # JSON text; update_article_status appends "review_events" entries to it.
    meta_json: str | None
def create_source(payload: SourceCreate) -> int:
    """Insert a row into ``sources`` and return the new row id.

    The name is stripped of surrounding whitespace and ``is_enabled`` is
    stored as 1/0; all other fields are written as given.
    """
    insert_sql = """
        INSERT INTO sources (name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    with get_conn() as conn:
        enabled_flag = 1 if payload.is_enabled else 0
        params = (
            payload.name.strip(),
            payload.base_url,
            payload.terms_url,
            payload.license_name,
            payload.risk_level,
            enabled_flag,
            payload.notes,
            payload.last_reviewed_at,
        )
        new_id = conn.execute(insert_sql, params).lastrowid
        return int(new_id)
def list_sources() -> list[dict[str, Any]]:
    """Return all rows of ``sources`` as dicts, highest id first."""
    query = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        ORDER BY id DESC
    """
    with get_conn() as conn:
        fetched = conn.execute(query).fetchall()
        return rows_to_dicts(fetched)
def get_source_by_id(source_id: int) -> dict[str, Any] | None:
    """Return the source with the given id as a dict, or None if absent."""
    query = """
        SELECT id, name, base_url, terms_url, license_name, risk_level, is_enabled, notes, last_reviewed_at, created_at, updated_at
        FROM sources
        WHERE id = ?
    """
    with get_conn() as conn:
        record = conn.execute(query, (source_id,)).fetchone()
        if not record:
            return None
        return dict(record)
def create_feed(payload: FeedCreate) -> int:
    """Insert a row into ``feeds`` and return the new row id.

    Name and URL are stripped of surrounding whitespace; ``is_enabled`` is
    stored as 1/0.
    """
    insert_sql = "INSERT INTO feeds (name, url, source_id, is_enabled) VALUES (?, ?, ?, ?)"
    with get_conn() as conn:
        enabled_flag = 1 if payload.is_enabled else 0
        params = (payload.name.strip(), payload.url.strip(), payload.source_id, enabled_flag)
        new_id = conn.execute(insert_sql, params).lastrowid
        return int(new_id)
def list_feeds() -> list[dict[str, Any]]:
    """Return every feed joined with its (optional) source, highest id first.

    Source columns are exposed with a ``source_`` prefix and are NULL for
    feeds without a matching source row.
    """
    query = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
        f.created_at, f.updated_at, s.name AS source_name, s.license_name AS source_license_name,
        s.terms_url AS source_terms_url, s.risk_level AS source_risk_level, s.base_url AS source_base_url,
        s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        ORDER BY f.id DESC
    """
    with get_conn() as conn:
        fetched = conn.execute(query).fetchall()
        return rows_to_dicts(fetched)
def list_enabled_feeds() -> list[dict[str, Any]]:
    """Return feeds with ``is_enabled = 1`` joined with their source, lowest id first.

    Only the feed's own flag is filtered; the source's flag is returned as
    ``source_is_enabled`` but not used as a filter here.
    """
    query = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
        s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
        s.risk_level AS source_risk_level, s.base_url AS source_base_url,
        s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.is_enabled = 1
        ORDER BY f.id ASC
    """
    with get_conn() as conn:
        fetched = conn.execute(query).fetchall()
        return rows_to_dicts(fetched)
def get_feed_by_id(feed_id: int) -> dict[str, Any] | None:
    """Return one feed joined with its (optional) source, or None if absent."""
    query = """
        SELECT f.id, f.name, f.url, f.source_id, f.is_enabled, f.etag, f.last_modified, f.last_checked_at,
        s.name AS source_name, s.license_name AS source_license_name, s.terms_url AS source_terms_url,
        s.risk_level AS source_risk_level, s.base_url AS source_base_url,
        s.last_reviewed_at AS source_last_reviewed_at, s.is_enabled AS source_is_enabled
        FROM feeds f
        LEFT JOIN sources s ON s.id = f.source_id
        WHERE f.id = ?
    """
    with get_conn() as conn:
        record = conn.execute(query, (feed_id,)).fetchone()
        if not record:
            return None
        return dict(record)
def update_feed_fetch_state(feed_id: int, etag: str | None, last_modified: str | None) -> None:
    """Store the latest ``etag``/``last_modified`` values for a feed.

    ``last_checked_at`` is set to the database's current time in the same
    statement.
    """
    update_sql = """
        UPDATE feeds
        SET etag = ?, last_modified = ?, last_checked_at = datetime('now')
        WHERE id = ?
    """
    params = (etag, last_modified, feed_id)
    with get_conn() as conn:
        conn.execute(update_sql, params)
def create_run(payload: RunCreate) -> int:
    """Insert a row into ``runs`` and return the new row id."""
    insert_sql = "INSERT INTO runs (run_type, status, details) VALUES (?, ?, ?)"
    with get_conn() as conn:
        params = (payload.run_type, payload.status, payload.details)
        new_id = conn.execute(insert_sql, params).lastrowid
        return int(new_id)
def finish_run(run_id: int, status: str, details: str | None = None) -> None:
    """Write the final status and details of a run and stamp ``finished_at``.

    ``details`` replaces whatever was stored before, including with NULL
    when it is omitted.
    """
    update_sql = """
        UPDATE runs
        SET status = ?, details = ?, finished_at = datetime('now')
        WHERE id = ?
    """
    params = (status, details, run_id)
    with get_conn() as conn:
        conn.execute(update_sql, params)
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return the most recent runs as dicts, highest id first.

    ``limit`` is clamped to the range 1..500 before it is used.
    """
    query = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        ORDER BY id DESC
        LIMIT ?
    """
    row_cap = max(1, min(limit, 500))
    with get_conn() as conn:
        fetched = conn.execute(query, (row_cap,)).fetchall()
        return rows_to_dicts(fetched)
def get_run_by_id(run_id: int) -> dict[str, Any] | None:
    """Return the run with the given id as a dict, or None if absent."""
    query = """
        SELECT id, run_type, status, started_at, finished_at, details
        FROM runs
        WHERE id = ?
    """
    with get_conn() as conn:
        record = conn.execute(query, (run_id,)).fetchone()
        if not record:
            return None
        return dict(record)
def get_article_by_id(article_id: int) -> dict[str, Any] | None:
    """Return the full article row (including rewritten content) or None."""
    query = """
        SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
        a.summary, a.content_raw, a.content_rewritten, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at
        FROM articles a
        WHERE a.id = ?
    """
    with get_conn() as conn:
        record = conn.execute(query, (article_id,)).fetchone()
        if not record:
            return None
        return dict(record)
def _merge_review_event(meta_json: str | None, event: dict[str, Any]) -> str:
meta: dict[str, Any] = {}
if meta_json:
try:
meta = json.loads(meta_json)
if not isinstance(meta, dict):
meta = {}
except Exception:
meta = {}
events = meta.get("review_events")
if not isinstance(events, list):
events = []
events.append(event)
meta["review_events"] = events
return json.dumps(meta, ensure_ascii=False)
def update_article_status(
    article_id: int,
    new_status: str,
    *,
    actor: str | None = None,
    note: str | None = None,
    decision: str | None = None,
) -> bool:
    """Set an article's status and append an audit event to its ``meta_json``.

    The current status/meta are read and the update is written through the
    same connection, so the read-modify-write of ``meta_json`` is not spread
    over two independent connections.

    Args:
        article_id: Primary key of the article.
        new_status: Status to store. No transition validation happens here;
            callers are expected to have checked it.
        actor: Who triggered the change; recorded as ``"system"`` when empty.
        note: Optional free-text note recorded in the event.
        decision: Optional review decision recorded in the event.

    Returns:
        True when a row was actually updated, False when the article does not
        exist (either at read time or because it vanished before the write).
    """
    with get_conn() as conn:
        row = conn.execute(
            "SELECT status, meta_json FROM articles WHERE id = ?",
            (article_id,),
        ).fetchone()
        if not row:
            return False

        event = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "from_status": row["status"],
            "to_status": new_status,
            "actor": actor or "system",
            "note": note,
            "decision": decision,
        }
        merged_meta = _merge_review_event(row["meta_json"], event)
        cur = conn.execute(
            "UPDATE articles SET status = ?, meta_json = ? WHERE id = ?",
            (new_status, merged_meta, article_id),
        )
        # Report what the UPDATE really did instead of assuming success.
        return cur.rowcount > 0
def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
    """Find the id of an already stored article that matches ``payload``.

    Lookups are tried in order of decreasing strength and the first hit wins:
    stripped ``source_url``, then ``feed_id`` + ``source_article_id`` (only
    when both are present), then ``source_hash`` (only when present).
    Returns None when nothing matches.
    """
    lookups: list[tuple[str, tuple[Any, ...]]] = [
        # 1) strongest key: source_url
        ("SELECT id FROM articles WHERE source_url = ?", (payload.source_url.strip(),)),
    ]
    if payload.feed_id is not None and payload.source_article_id:
        # 2) stable feed+guid combo
        lookups.append(
            (
                "SELECT id FROM articles WHERE feed_id = ? AND source_article_id = ?",
                (payload.feed_id, payload.source_article_id),
            )
        )
    if payload.source_hash:
        # 3) content hash fallback
        lookups.append(("SELECT id FROM articles WHERE source_hash = ?", (payload.source_hash,)))

    with get_conn() as conn:
        for sql, params in lookups:
            hit = conn.execute(sql, params).fetchone()
            if hit:
                return int(hit["id"])
    return None
def upsert_article(payload: ArticleUpsert) -> int:
    """Insert a new article or overwrite the matching existing one.

    An existing row is located via ``_resolve_existing_article_id``. Title and
    source URL are stripped of surrounding whitespace before they are stored.

    NOTE(review): on the update path *every* column is overwritten, including
    ``status``, ``content_rewritten`` and ``meta_json`` (which carries the
    ``review_events`` history). Confirm that re-ingesting an already reviewed
    article is meant to reset these.

    Returns:
        The id of the inserted or updated row.
    """
    existing_id = _resolve_existing_article_id(payload)
    # Same column order for INSERT and UPDATE, so the tuple is built once.
    values = (
        payload.feed_id,
        payload.source_article_id,
        payload.source_hash,
        payload.title.strip(),
        payload.source_url.strip(),
        payload.canonical_url,
        payload.published_at,
        payload.author,
        payload.summary,
        payload.content_raw,
        payload.content_rewritten,
        payload.word_count,
        payload.status,
        payload.meta_json,
    )
    with get_conn() as conn:
        if existing_id is None:
            cur = conn.execute(
                """
                INSERT INTO articles (
                feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
                summary, content_raw, content_rewritten, word_count, status, meta_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                values,
            )
            # Same approach as create_source/create_feed: the insert itself
            # tells us the new id, no follow-up SELECT needed.
            return int(cur.lastrowid)

        conn.execute(
            """
            UPDATE articles
            SET
            feed_id = ?,
            source_article_id = ?,
            source_hash = ?,
            title = ?,
            source_url = ?,
            canonical_url = ?,
            published_at = ?,
            author = ?,
            summary = ?,
            content_raw = ?,
            content_rewritten = ?,
            word_count = ?,
            status = ?,
            meta_json = ?
            WHERE id = ?
            """,
            (*values, existing_id),
        )
    return int(existing_id)
def list_articles(limit: int = 100, status_filter: str | None = None) -> list[dict[str, Any]]:
    """Return articles with their feed name, highest id first.

    ``limit`` is clamped to 1..500. When ``status_filter`` is truthy only
    articles with exactly that status are returned. The result rows do not
    include ``content_rewritten``.
    """
    row_cap = max(1, min(limit, 500))
    if status_filter:
        query = """
            SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
            a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name
            FROM articles a
            LEFT JOIN feeds f ON f.id = a.feed_id
            WHERE a.status = ?
            ORDER BY a.id DESC
            LIMIT ?
        """
        params: tuple[Any, ...] = (status_filter, row_cap)
    else:
        query = """
            SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
            a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name
            FROM articles a
            LEFT JOIN feeds f ON f.id = a.feed_id
            ORDER BY a.id DESC
            LIMIT ?
        """
        params = (row_cap,)

    with get_conn() as conn:
        fetched = conn.execute(query, params).fetchall()
        return rows_to_dicts(fetched)

View file

@ -0,0 +1,257 @@
from __future__ import annotations
from dataclasses import dataclass
from html import unescape
import re
from typing import Any
from urllib.parse import urljoin
from urllib.request import Request, urlopen
# Default value of extract_article's ``timeout_seconds`` (passed to urlopen).
DEFAULT_TIMEOUT_SECONDS = 10
# User-Agent header sent by extract_article with every page request.
DEFAULT_USER_AGENT = "rss-news-bot/1.0 (+https://news.vanityontour.de)"
@dataclass(frozen=True)
class ExtractedArticle:
    """Result of ``extract_article``: fields pulled from one fetched HTML page."""

    # og:title, else <title>, else first <h1>.
    title: str | None
    author: str | None
    # href of <link rel="canonical">, if present.
    canonical_url: str | None
    # Meta description, else the first 320 characters of content_text.
    summary: str | None
    # Extracted paragraphs joined with "\n".
    content_text: str | None
    # Absolute image URLs (resolved against the page URL), de-duplicated.
    images: list[str]
    press_contact: str | None
    # str(exc) when fetching/decoding failed; None on success.
    extraction_error: str | None = None
def _clean_text(raw: str | None) -> str | None:
if not raw:
return None
text = unescape(raw)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text or None
def _strip_noise(html: str) -> str:
html = re.sub(r"<script[\s\S]*?</script>", " ", html, flags=re.IGNORECASE)
html = re.sub(r"<style[\s\S]*?</style>", " ", html, flags=re.IGNORECASE)
html = re.sub(r"<noscript[\s\S]*?</noscript>", " ", html, flags=re.IGNORECASE)
return html
def _meta_content(html: str, attr: str, value: str) -> str | None:
    """Return the cleaned ``content`` of the first matching ``<meta>`` tag.

    A tag matches when its attribute ``attr`` equals ``value``
    (case-insensitive). Both attribute orders are supported; the
    ``attr``-before-``content`` order is searched first across the whole
    document.

    Args:
        html: Page markup to search.
        attr: Attribute name to test, e.g. ``"name"`` or ``"property"``.
            Inserted into the pattern unescaped, so pass plain names only.
        value: Required attribute value; regex-escaped before use.

    Returns:
        The content run through ``_clean_text``, or None when no tag matches
        or the content cleans down to nothing.
    """
    quoted_value = rf"[\"']{re.escape(value)}[\"']"
    # Read the value up to the *matching* closing quote. Excluding both quote
    # characters would cut content="Don't panic" down to "Don".
    quoted_content = r"(?:\"([^\"]+)\"|'([^']+)')"
    patterns = (
        rf"<meta[^>]+{attr}\s*=\s*{quoted_value}[^>]*content\s*=\s*{quoted_content}[^>]*>",
        # handle reversed attribute order
        rf"<meta[^>]+content\s*=\s*{quoted_content}[^>]*{attr}\s*=\s*{quoted_value}[^>]*>",
    )
    for pattern in patterns:
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            # Exactly one of the two capture groups is set.
            return _clean_text(match.group(1) or match.group(2))
    return None
def _extract_title(html: str) -> str | None:
    """Pick a page title: ``og:title``, else ``<title>``, else the first ``<h1>``.

    Empty candidates are skipped for the first two sources; the ``<h1>``
    result is returned as-is (possibly None).
    """
    og_title = _meta_content(html, "property", "og:title")
    if og_title:
        return og_title

    title_tag = re.search(r"<title[^>]*>([\s\S]*?)</title>", html, re.IGNORECASE)
    title_text = _clean_text(title_tag.group(1)) if title_tag else None
    if title_text:
        return title_text

    heading = re.search(r"<h1[^>]*>([\s\S]*?)</h1>", html, re.IGNORECASE)
    return _clean_text(heading.group(1)) if heading else None
def _extract_canonical(html: str) -> str | None:
    """Return the href of ``<link rel="canonical">``, or None if there is none.

    Both attribute orders (``rel`` before ``href`` and the reverse) are
    recognised; the ``rel``-first form is tried first.
    """
    candidates = (
        r"<link[^>]+rel\s*=\s*[\"']canonical[\"'][^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
        r"<link[^>]+href\s*=\s*[\"']([^\"']+)[\"'][^>]*rel\s*=\s*[\"']canonical[\"'][^>]*>",
    )
    for candidate in candidates:
        hit = re.search(candidate, html, re.IGNORECASE)
        if hit:
            return _clean_text(hit.group(1))
    return None
def _extract_author(html: str) -> str | None:
    """Guess the author of a page.

    Meta tags are consulted first (``author``, ``article:author``,
    ``og:article:author``). If none yields a value, two text heuristics run
    over the raw markup: a "Von:/Autor:" style label, then an element whose
    class contains ``author`` or ``byline``.
    """
    meta_keys = (
        ("name", "author"),
        ("property", "article:author"),
        ("property", "og:article:author"),
    )
    # Lazy generator: lookups stop at the first non-empty meta value.
    from_meta = next(
        (found for found in (_meta_content(html, attr, value) for attr, value in meta_keys) if found),
        None,
    )
    if from_meta:
        return from_meta

    heuristics = (
        r"(?:Von|Autor(?:in)?)\s*[:\-]\s*([^<\n\r]{3,120})",
        r"class=[\"'][^\"']*(?:author|byline)[^\"']*[\"'][^>]*>([\s\S]{1,180})<",
    )
    for heuristic in heuristics:
        hit = re.search(heuristic, html, re.IGNORECASE)
        if not hit:
            continue
        candidate = _clean_text(hit.group(1))
        if candidate:
            return candidate
    return None
def _extract_images(html: str, page_url: str) -> list[str]:
images: list[str] = []
seen: set[str] = set()
for prop in ("og:image", "twitter:image"):
pattern = re.compile(
rf"<meta[^>]+property\s*=\s*[\"']{re.escape(prop)}[\"'][^>]*content\s*=\s*[\"']([^\"']+)[\"'][^>]*>",
re.IGNORECASE,
)
for match in pattern.finditer(html):
src = match.group(1).strip()
abs_src = urljoin(page_url, src)
if abs_src not in seen:
seen.add(abs_src)
images.append(abs_src)
for match in re.finditer(r"<img[^>]+src\s*=\s*[\"']([^\"']+)[\"'][^>]*>", html, re.IGNORECASE):
src = match.group(1).strip()
abs_src = urljoin(page_url, src)
if abs_src not in seen:
seen.add(abs_src)
images.append(abs_src)
return images
def _extract_content_text(html: str) -> str | None:
    """Extract the readable body text of a page.

    The first non-empty ``<article>``, ``<main>`` or ``<body>`` element (in
    that order of preference) is used as the content section; without one the
    whole markup is used. Inside the section, paragraphs longer than two
    characters are kept, plus any ``<h2>``-``<h4>`` heading that mentions a
    press contact. Blocks are emitted in document order so that a contact
    heading stays directly in front of the paragraphs that follow it.

    Returns:
        The kept blocks joined with newlines; if none were found, the whole
        section cleaned into one line; None when nothing remains.
    """
    section = None
    for pattern in (
        r"<article[^>]*>([\s\S]*?)</article>",
        r"<main[^>]*>([\s\S]*?)</main>",
        r"<body[^>]*>([\s\S]*?)</body>",
    ):
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            section = match.group(1)
            break
    if not section:
        section = html

    contact_heading_re = re.compile(r"\b(pressekontakt|press contact|kontakt)\b", re.IGNORECASE)
    # One pass over headings and paragraphs keeps them in document order.
    # ``\b`` after the tag name stops <p> from also matching <pre>, <picture>, ...
    block_re = re.compile(r"<(h[2-4]|p)\b[^>]*>([\s\S]*?)</\1\s*>", re.IGNORECASE)

    paragraphs = []
    for block in block_re.finditer(section):
        text = _clean_text(block.group(2))
        if not text:
            continue
        if block.group(1).lower() == "p":
            if len(text) > 2:
                paragraphs.append(text)
        elif contact_heading_re.search(text):
            # Only contact headings are kept; other headings are dropped.
            paragraphs.append(text)

    if paragraphs:
        return "\n".join(paragraphs)
    return _clean_text(section)
def _extract_press_contact(content_text: str | None) -> str | None:
    """Pull the press-contact block out of extracted article text.

    The first line mentioning "Pressekontakt" / "press contact" /
    "Presse-Kontakt" starts the block; up to five following lines are added
    until a footer line ("Original-Content von", "OTS:", "newsroom:") is
    reached. If no such line exists, a looser search for "Pressekontakt"
    anywhere in the text (at most 1200 characters up to a footer marker or the
    end) is used.

    Returns:
        The block collapsed to a single line, or None when no contact section
        is found or ``content_text`` is empty.
    """
    if not content_text:
        return None

    lines = [line.strip() for line in content_text.split("\n") if line.strip()]
    marker_re = re.compile(r"\b(pressekontakt|press contact|presse\-kontakt)\b", re.IGNORECASE)
    # No trailing \b after the colon alternatives: between ":" and a following
    # space there is no word boundary, so "OTS: ..." would never match.
    stop_re = re.compile(r"\b(?:original\-content von\b|ots:|newsroom:)", re.IGNORECASE)

    for idx, line in enumerate(lines):
        if marker_re.search(line):
            chunk = [line]
            for nxt in lines[idx + 1 : idx + 6]:
                if stop_re.search(nxt):
                    break
                chunk.append(nxt)
            return _clean_text("\n".join(chunk))

    match = re.search(
        r"(Pressekontakt[\s\S]{0,1200}?)(?:Original-Content von|OTS:|newsroom:|$)",
        content_text,
        re.IGNORECASE,
    )
    if match:
        return _clean_text(match.group(1))
    return None
def extract_article(url: str, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS) -> ExtractedArticle:
    """Download a page and extract title, author, text, images and press contact.

    This function never raises for fetch problems: any failure while
    requesting or decoding the page is reported through
    ``ExtractedArticle.extraction_error`` with all other fields empty.

    Only ``http`` and ``https`` URLs are fetched. The URL typically comes from
    feed data, and ``urlopen`` would otherwise also serve ``file:`` and other
    schemes.

    Args:
        url: Address of the article page.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        The extracted fields; ``extraction_error`` is None on success.
    """
    from urllib.parse import urlsplit  # local import: only needed for the scheme check

    def _failed(reason: str) -> ExtractedArticle:
        return ExtractedArticle(
            title=None,
            author=None,
            canonical_url=None,
            summary=None,
            content_text=None,
            images=[],
            press_contact=None,
            extraction_error=reason,
        )

    try:
        scheme = urlsplit(url).scheme.lower()
        if scheme not in ("http", "https"):
            return _failed(f"Unsupported URL scheme: {scheme or 'none'}")
        req = Request(
            url=url,
            headers={
                "User-Agent": DEFAULT_USER_AGENT,
                "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
            },
        )
        with urlopen(req, timeout=timeout_seconds) as resp:
            raw = resp.read()
            # Fall back to UTF-8 when the response declares no charset.
            charset = resp.headers.get_content_charset() or "utf-8"
        html = raw.decode(charset, errors="replace")
    except Exception as exc:
        # Deliberate best effort: a single bad page must not abort ingestion.
        return _failed(str(exc))

    html = _strip_noise(html)
    title = _extract_title(html)
    author = _extract_author(html)
    canonical_url = _extract_canonical(html)
    summary = _meta_content(html, "name", "description")
    content_text = _extract_content_text(html)
    if not summary and content_text:
        # No meta description: use the opening of the body text instead.
        summary = _clean_text(content_text[:320])
    images = _extract_images(html, url)
    press_contact = _extract_press_contact(content_text)
    return ExtractedArticle(
        title=title,
        author=author,
        canonical_url=canonical_url,
        summary=summary,
        content_text=content_text,
        images=images,
        press_contact=press_contact,
        extraction_error=None,
    )
def extracted_article_to_meta(article: ExtractedArticle) -> dict[str, Any]:
    """Convert an extraction result into a plain metadata dict.

    All fields except ``content_text`` are copied; the key order follows the
    field list below.
    """
    exported_fields = (
        "title",
        "author",
        "canonical_url",
        "summary",
        "images",
        "press_contact",
        "extraction_error",
    )
    return {field_name: getattr(article, field_name) for field_name in exported_fields}