feat(legal): add structured attribution fields and publish legal gate

This commit is contained in:
Oliver 2026-02-18 10:02:19 +01:00
parent c52363f1a7
commit 5159a6e3b4
10 changed files with 259 additions and 16 deletions

View file

@ -23,6 +23,7 @@ from .repositories import (
list_feeds,
list_runs,
list_sources,
set_article_legal_review,
update_article_status,
)
@ -104,22 +105,22 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]:
checks.append(
{
"label": "Bilder extrahiert",
"status": "ok" if extraction.get("images") else "missing",
"status": "ok" if article.get("image_urls_json") else "missing",
"value": str(len(extraction.get("images", []))) if isinstance(extraction.get("images"), list) else "0",
}
)
checks.append(
{
"label": "Pressekontakt",
"status": "ok" if extraction.get("press_contact") else "missing",
"value": extraction.get("press_contact") or "-",
"status": "ok" if article.get("press_contact") else "missing",
"value": article.get("press_contact") or extraction.get("press_contact") or "-",
}
)
checks.append(
{
"label": "Lizenz/Terms",
"status": "ok" if attribution.get("source_license_name") and attribution.get("source_terms_url") else "missing",
"value": f"{attribution.get('source_license_name') or '-'} | {attribution.get('source_terms_url') or '-'}",
"status": "ok" if article.get("source_license_name_snapshot") and article.get("source_terms_url_snapshot") else "missing",
"value": f"{article.get('source_license_name_snapshot') or attribution.get('source_license_name') or '-'} | {article.get('source_terms_url_snapshot') or attribution.get('source_terms_url') or '-'}",
}
)
checks.append(
@ -129,6 +130,13 @@ def _legal_checklist(article: dict, feed: dict | None) -> list[dict[str, str]]:
"value": feed.get("source_risk_level") if feed else "-",
}
)
checks.append(
{
"label": "Manuelle Rechtsfreigabe",
"status": "ok" if int(article.get("legal_checked", 0)) == 1 else "missing",
"value": article.get("legal_checked_at") or "-",
}
)
return checks
@ -193,9 +201,20 @@ def admin_dashboard(request: Request):
for article in articles:
meta = _parse_meta_json(article.get("meta_json"))
extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
images = []
if article.get("image_urls_json"):
try:
parsed_images = json.loads(article["image_urls_json"])
if isinstance(parsed_images, list):
images = [str(item) for item in parsed_images if item]
except Exception:
images = []
if not images and isinstance(extraction.get("images"), list):
images = extraction.get("images")
article["meta"] = meta
article["extracted_images"] = extraction.get("images") if isinstance(extraction.get("images"), list) else []
article["press_contact"] = extraction.get("press_contact") if isinstance(extraction.get("press_contact"), str) else None
article["extracted_images"] = images
if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
article["press_contact"] = extraction.get("press_contact")
article["extraction_error"] = extraction.get("extraction_error") if isinstance(extraction.get("extraction_error"), str) else None
return templates.TemplateResponse(
@ -232,6 +251,15 @@ def admin_article_detail(request: Request, article_id: int):
meta = _parse_meta_json(article.get("meta_json"))
article["meta"] = meta
extraction = meta.get("extraction") if isinstance(meta.get("extraction"), dict) else {}
if article.get("image_urls_json"):
try:
parsed_images = json.loads(article["image_urls_json"])
if isinstance(parsed_images, list):
extraction["images"] = [str(item) for item in parsed_images if item]
except Exception:
pass
if not article.get("press_contact") and isinstance(extraction.get("press_contact"), str):
article["press_contact"] = extraction.get("press_contact")
article["extraction"] = extraction
feed = get_feed_by_id(int(article["feed_id"])) if article.get("feed_id") else None
checklist = _legal_checklist(article, feed)
@ -251,6 +279,19 @@ def admin_article_detail(request: Request, article_id: int):
)
@router.post("/admin/articles/{article_id}/legal-review")
def admin_article_legal_review(request: Request, article_id: int, approved: str = Form("0"), note: str = Form("")):
    """Handle the admin form that grants or resets an article's legal clearance.

    Args:
        request: Incoming request; used to resolve the logged-in admin user.
        article_id: Id of the article taken from the URL path.
        approved: Form field; only the exact value ``"1"`` counts as approval.
        note: Form field with an optional remark; an empty string is stored
            as ``None``.

    Returns:
        A 303 redirect to the login page when no admin user is resolved, a
        dashboard redirect carrying an error message when the article is not
        found, otherwise a 303 redirect back to the article detail page.
    """
    admin_name = _admin_user(request)
    if not admin_name:
        return RedirectResponse(url="/admin/login", status_code=303)

    # Any form value other than "1" (including the default "0") resets the flag.
    found = set_article_legal_review(
        article_id,
        approved=(approved == "1"),
        note=note if note else None,
        actor=admin_name,
    )
    if found:
        return RedirectResponse(url=f"/admin/articles/{article_id}", status_code=303)
    return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error")
@router.post("/admin/sources/create")
def admin_create_source(
request: Request,
@ -344,6 +385,8 @@ def admin_transition_article(request: Request, article_id: int, target_status: s
if article:
current = article.get("status")
if target_status in ALLOWED_TRANSITIONS.get(current, ()):
if target_status == "published" and int(article.get("legal_checked", 0)) != 1:
return _dashboard_redirect(msg=f"Publish blockiert fuer Artikel #{article_id}: Rechtsfreigabe fehlt", msg_type="error")
update_article_status(article_id, target_status, actor=user, note=note or None)
return _dashboard_redirect(msg=f"Artikel #{article_id}: {current} -> {target_status}")
return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")

View file

@ -81,6 +81,14 @@ def init_db() -> None:
summary TEXT,
content_raw TEXT,
content_rewritten TEXT,
image_urls_json TEXT,
press_contact TEXT,
source_name_snapshot TEXT,
source_terms_url_snapshot TEXT,
source_license_name_snapshot TEXT,
legal_checked INTEGER NOT NULL DEFAULT 0,
legal_checked_at TEXT,
legal_note TEXT,
word_count INTEGER DEFAULT 0,
status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error')),
meta_json TEXT,
@ -130,8 +138,20 @@ def init_db() -> None:
existing_columns = {
row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
}
if "source_hash" not in existing_columns:
conn.execute("ALTER TABLE articles ADD COLUMN source_hash TEXT")
migration_columns = {
"source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
"image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
"press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
"source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
"source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
"source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
"legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
"legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
"legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
}
for column, ddl in migration_columns.items():
if column not in existing_columns:
conn.execute(ddl)
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:

View file

@ -201,6 +201,14 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
summary=final_summary,
content_raw=final_content_raw,
content_rewritten=None,
image_urls_json=json.dumps(extracted.images, ensure_ascii=False) if extracted.images else None,
press_contact=extracted.press_contact,
source_name_snapshot=feed.get("source_name"),
source_terms_url_snapshot=feed.get("source_terms_url"),
source_license_name_snapshot=feed.get("source_license_name"),
legal_checked=False,
legal_checked_at=None,
legal_note=None,
word_count=len((final_content_raw or "").split()),
status="new",
meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),

View file

@ -28,6 +28,7 @@ from .repositories import (
list_feeds as repo_list_feeds,
list_runs,
list_sources as repo_list_sources,
set_article_legal_review,
update_article_status,
upsert_article as repo_upsert_article,
)
@ -96,6 +97,14 @@ class ArticleUpsertRequest(BaseModel):
summary: str | None = None
content_raw: str | None = None
content_rewritten: str | None = None
image_urls_json: str | None = None
press_contact: str | None = None
source_name_snapshot: str | None = None
source_terms_url_snapshot: str | None = None
source_license_name_snapshot: str | None = None
legal_checked: bool = False
legal_checked_at: str | None = None
legal_note: str | None = None
word_count: int = 0
status: str = Field(default="new", pattern="^(new|rewrite|review|approved|published|error)$")
meta_json: str | None = None
@ -115,6 +124,11 @@ class ArticleReviewRequest(BaseModel):
note: str | None = None
class ArticleLegalReviewRequest(BaseModel):
    """JSON body accepted by the article legal-review API endpoint."""

    # True grants the manual legal clearance, False withdraws it.
    approved: bool
    # Optional reviewer remark; defaults to None when the field is omitted.
    note: str | None = None
ALLOWED_ARTICLE_TRANSITIONS: dict[str, set[str]] = {
"new": {"review", "rewrite", "error"},
"rewrite": {"review", "error"},
@ -330,6 +344,14 @@ def api_upsert_article(payload: ArticleUpsertRequest, username: str = Depends(re
summary=payload.summary,
content_raw=payload.content_raw,
content_rewritten=payload.content_rewritten,
image_urls_json=payload.image_urls_json,
press_contact=payload.press_contact,
source_name_snapshot=payload.source_name_snapshot,
source_terms_url_snapshot=payload.source_terms_url_snapshot,
source_license_name_snapshot=payload.source_license_name_snapshot,
legal_checked=payload.legal_checked,
legal_checked_at=payload.legal_checked_at,
legal_note=payload.legal_note,
word_count=payload.word_count,
status=payload.status,
meta_json=payload.meta_json,
@ -351,6 +373,11 @@ def api_article_transition(article_id: int, payload: ArticleTransitionRequest, u
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Ungueltiger Statuswechsel: {current_status} -> {payload.target_status}",
)
if payload.target_status == "published" and int(article.get("legal_checked", 0)) != 1:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Publish gesperrt: Rechtscheck wurde noch nicht freigegeben",
)
updated = update_article_status(article_id, payload.target_status, actor=username, note=payload.note)
if not updated:
@ -358,6 +385,22 @@ def api_article_transition(article_id: int, payload: ArticleTransitionRequest, u
return {"ok": True, "id": article_id, "from_status": current_status, "to_status": payload.target_status}
@app.post("/api/articles/{article_id}/legal-review")
def api_article_legal_review(article_id: int, payload: ArticleLegalReviewRequest, username: str = Depends(require_auth)) -> dict:
    """Grant or revoke the manual legal clearance of an article via the API.

    Args:
        article_id: Id of the article taken from the URL path.
        payload: Request body carrying the ``approved`` flag and optional note.
        username: Authenticated user supplied by ``require_auth``; recorded as
            the actor of the review.

    Returns:
        A dict with ``ok``, the article ``id`` and the requested
        ``legal_checked`` value.

    Raises:
        HTTPException: 404 when no article with ``article_id`` could be updated.
    """
    # set_article_legal_review performs the existence check itself and returns
    # False for an unknown article, so no separate lookup is needed here.
    updated = set_article_legal_review(article_id, approved=payload.approved, note=payload.note, actor=username)
    if not updated:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Artikel nicht gefunden")

    return {
        "ok": True,
        "id": article_id,
        "legal_checked": payload.approved,
    }
@app.post("/api/articles/{article_id}/review")
def api_article_review(article_id: int, payload: ArticleReviewRequest, username: str = Depends(require_auth)) -> dict:
article = get_article_by_id(article_id)

View file

@ -48,6 +48,14 @@ class ArticleUpsert:
summary: str | None
content_raw: str | None
content_rewritten: str | None
image_urls_json: str | None
press_contact: str | None
source_name_snapshot: str | None
source_terms_url_snapshot: str | None
source_license_name_snapshot: str | None
legal_checked: bool
legal_checked_at: str | None
legal_note: str | None
word_count: int
status: str
meta_json: str | None
@ -224,7 +232,10 @@ def get_article_by_id(article_id: int) -> dict[str, Any] | None:
row = conn.execute(
"""
SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
a.summary, a.content_raw, a.content_rewritten, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at
a.summary, a.content_raw, a.content_rewritten, a.image_urls_json, a.press_contact,
a.source_name_snapshot, a.source_terms_url_snapshot, a.source_license_name_snapshot,
a.legal_checked, a.legal_checked_at, a.legal_note,
a.word_count, a.status, a.meta_json, a.created_at, a.updated_at
FROM articles a
WHERE a.id = ?
""",
@ -281,6 +292,31 @@ def update_article_status(
return True
def set_article_legal_review(article_id: int, approved: bool, note: str | None, actor: str | None = None) -> bool:
    """Record a manual legal-review decision for one article.

    Writes ``legal_checked`` (1 or 0), stamps ``legal_checked_at`` with
    SQLite's ``datetime('now')``, stores ``note`` in ``legal_note`` and merges
    a ``legal_review`` event into the article's ``meta_json``.

    Args:
        article_id: Id of the article to update.
        approved: ``True`` to grant the legal clearance, ``False`` to reset it.
        note: Optional reviewer remark; written to ``legal_note`` as given.
        actor: Name recorded in the review event; ``"system"`` when missing.

    Returns:
        ``True`` if an article row was updated, ``False`` if the article does
        not exist or no row matched the UPDATE.
    """
    article = get_article_by_id(article_id)
    if not article:
        return False

    event = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "event": "legal_review",
        "approved": approved,
        "actor": actor or "system",
        "note": note,
    }
    merged_meta = _merge_review_event(article.get("meta_json"), event)

    with get_conn() as conn:
        cursor = conn.execute(
            """
            UPDATE articles
            SET legal_checked = ?, legal_checked_at = datetime('now'), legal_note = ?, meta_json = ?
            WHERE id = ?
            """,
            (1 if approved else 0, note, merged_meta, article_id),
        )
        # The article may have been deleted between the lookup above and this
        # write; report success only when the UPDATE actually matched a row.
        # NOTE(review): assumes get_conn() yields a sqlite3 connection whose
        # execute() returns a cursor exposing rowcount - confirm.
        updated = cursor.rowcount > 0
    return updated
def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
with get_conn() as conn:
# 1) strongest key: source_url
@ -320,8 +356,11 @@ def upsert_article(payload: ArticleUpsert) -> int:
"""
INSERT INTO articles (
feed_id, source_article_id, source_hash, title, source_url, canonical_url, published_at, author,
summary, content_raw, content_rewritten, word_count, status, meta_json
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
summary, content_raw, content_rewritten, image_urls_json, press_contact,
source_name_snapshot, source_terms_url_snapshot, source_license_name_snapshot,
legal_checked, legal_checked_at, legal_note,
word_count, status, meta_json
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
payload.feed_id,
@ -335,6 +374,14 @@ def upsert_article(payload: ArticleUpsert) -> int:
payload.summary,
payload.content_raw,
payload.content_rewritten,
payload.image_urls_json,
payload.press_contact,
payload.source_name_snapshot,
payload.source_terms_url_snapshot,
payload.source_license_name_snapshot,
1 if payload.legal_checked else 0,
payload.legal_checked_at,
payload.legal_note,
payload.word_count,
payload.status,
payload.meta_json,
@ -356,6 +403,14 @@ def upsert_article(payload: ArticleUpsert) -> int:
summary = ?,
content_raw = ?,
content_rewritten = ?,
image_urls_json = ?,
press_contact = ?,
source_name_snapshot = ?,
source_terms_url_snapshot = ?,
source_license_name_snapshot = ?,
legal_checked = ?,
legal_checked_at = ?,
legal_note = ?,
word_count = ?,
status = ?,
meta_json = ?
@ -373,6 +428,14 @@ def upsert_article(payload: ArticleUpsert) -> int:
payload.summary,
payload.content_raw,
payload.content_rewritten,
payload.image_urls_json,
payload.press_contact,
payload.source_name_snapshot,
payload.source_terms_url_snapshot,
payload.source_license_name_snapshot,
1 if payload.legal_checked else 0,
payload.legal_checked_at,
payload.legal_note,
payload.word_count,
payload.status,
payload.meta_json,
@ -392,7 +455,9 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di
rows = conn.execute(
"""
SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name
a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note
FROM articles a
LEFT JOIN feeds f ON f.id = a.feed_id
WHERE a.status = ?
@ -405,7 +470,9 @@ def list_articles(limit: int = 100, status_filter: str | None = None) -> list[di
rows = conn.execute(
"""
SELECT a.id, a.feed_id, a.source_article_id, a.source_hash, a.title, a.source_url, a.canonical_url, a.published_at, a.author,
a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name
a.summary, a.content_raw, a.word_count, a.status, a.meta_json, a.created_at, a.updated_at, f.name AS feed_name,
a.image_urls_json, a.press_contact, a.source_name_snapshot, a.source_terms_url_snapshot,
a.source_license_name_snapshot, a.legal_checked, a.legal_checked_at, a.legal_note
FROM articles a
LEFT JOIN feeds f ON f.id = a.feed_id
ORDER BY a.id DESC

View file

@ -26,6 +26,9 @@
<p><strong>Status:</strong> <span class="badge">{{ article.status }}</span></p>
<p><strong>Autor:</strong> {{ article.author or "-" }}</p>
<p><strong>Feed:</strong> {{ feed.name if feed else "-" }}</p>
<p><strong>Quelle Snapshot:</strong> {{ article.source_name_snapshot or "-" }}</p>
<p><strong>Lizenz Snapshot:</strong> {{ article.source_license_name_snapshot or "-" }}</p>
<p><strong>Terms Snapshot:</strong> {{ article.source_terms_url_snapshot or "-" }}</p>
<p><strong>Quelle:</strong> <a href="{{ article.source_url }}" target="_blank" rel="noopener">{{ article.source_url }}</a></p>
{% if article.canonical_url %}
<p><strong>Canonical:</strong> <a href="{{ article.canonical_url }}" target="_blank" rel="noopener">{{ article.canonical_url }}</a></p>
@ -69,9 +72,9 @@
{% endfor %}
</ul>
{% endif %}
{% if article.extraction.press_contact %}
{% if article.press_contact or article.extraction.press_contact %}
<p><strong>Pressekontakt</strong></p>
<div class="pre">{{ article.extraction.press_contact }}</div>
<div class="pre">{{ article.press_contact or article.extraction.press_contact }}</div>
{% endif %}
{% if article.extraction.extraction_error %}
<p class="subtle">Extraktionsfehler: {{ article.extraction.extraction_error }}</p>
@ -83,8 +86,32 @@
<div class="pre">{{ article.content_raw or "-" }}</div>
</section>
<section class="card">
<h2>Rechtsfreigabe</h2>
<p><strong>Freigabe:</strong>
{% if article.legal_checked %}
<span class="badge ok">Freigegeben</span>
{% else %}
<span class="badge bad">Nicht freigegeben</span>
{% endif %}
</p>
<p><strong>Zeitpunkt:</strong> {{ article.legal_checked_at or "-" }}</p>
<p><strong>Notiz:</strong> {{ article.legal_note or "-" }}</p>
<form method="post" action="/admin/articles/{{ article.id }}/legal-review" class="row">
<select name="approved">
<option value="1">Freigeben</option>
<option value="0">Zurücksetzen</option>
</select>
<input name="note" placeholder="Rechtsnotiz" />
<button type="submit">Speichern</button>
</form>
</section>
<section class="card">
<h2>Status ändern</h2>
{% if not article.legal_checked %}
<p class="subtle">Hinweis: `published` ist erst nach manueller Rechtsfreigabe erlaubt.</p>
{% endif %}
<form method="post" action="/admin/articles/{{ article.id }}/transition" class="row">
<select name="target_status">
{% for s in allowed_transitions %}

View file

@ -151,6 +151,7 @@
</td>
<td><span class="badge">{{ a.status }}</span></td>
<td>
<div class="subtle">Legal: {{ "OK" if a.legal_checked else "offen" }}</div>
{% if a.summary %}
<div><strong>Summary:</strong> {{ a.summary }}</div>
{% endif %}

View file

@ -95,6 +95,14 @@ class TestAdminUi(unittest.TestCase):
summary="Summary A",
content_raw="Volltext A",
content_rewritten=None,
image_urls_json='["https://example.org/img.jpg"]',
press_contact="Kontakt",
source_name_snapshot="Test Source",
source_terms_url_snapshot="https://example.org/terms",
source_license_name_snapshot="cc-by",
legal_checked=False,
legal_checked_at=None,
legal_note=None,
word_count=2,
status="new",
meta_json='{"extraction":{"images":["https://example.org/img.jpg"],"press_contact":"Kontakt"}}',

View file

@ -73,12 +73,22 @@ class TestArticleWorkflow(unittest.TestCase):
self.assertEqual(r1.status_code, 200)
self.assertEqual(r1.json()["to_status"], "approved")
blocked_publish = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"})
self.assertEqual(blocked_publish.status_code, 400)
legal = self.client.post(
f"/api/articles/{article_id}/legal-review",
json={"approved": True, "note": "Rechte geprueft"},
)
self.assertEqual(legal.status_code, 200)
t2 = self.client.post(f"/api/articles/{article_id}/transition", json={"target_status": "published"})
self.assertEqual(t2.status_code, 200)
final = self.client.get(f"/api/articles/{article_id}")
self.assertEqual(final.status_code, 200)
self.assertEqual(final.json()["item"]["status"], "published")
self.assertEqual(final.json()["item"]["legal_checked"], 1)
def test_invalid_transition_rejected(self) -> None:
article_id = self._create_article()

View file

@ -77,6 +77,14 @@ class TestSQLiteRepositories(unittest.TestCase):
summary="Kurzfassung",
content_raw="Originaltext",
content_rewritten="Umschreibung",
image_urls_json='["https://example.org/img.jpg"]',
press_contact="Pressekontakt X",
source_name_snapshot="GovData",
source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0",
source_license_name_snapshot="dl-de/by-2-0",
legal_checked=False,
legal_checked_at=None,
legal_note=None,
word_count=120,
status="review",
meta_json='{"lang":"de"}',
@ -98,6 +106,14 @@ class TestSQLiteRepositories(unittest.TestCase):
summary="Kurzfassung 2",
content_raw="Originaltext 2",
content_rewritten="Umschreibung 2",
image_urls_json='["https://example.org/img2.jpg"]',
press_contact="Pressekontakt Y",
source_name_snapshot="GovData",
source_terms_url_snapshot="https://www.govdata.de/dl-de/by-2-0",
source_license_name_snapshot="dl-de/by-2-0",
legal_checked=True,
legal_checked_at="2026-02-18T00:10:00Z",
legal_note="ok",
word_count=140,
status="approved",
meta_json='{"lang":"de","v":2}',