rss-news/backend/app/db.py
OliverGiertz aaac5def27 feat(pipeline): image caption/credit extraction, no-image exclusion, WP attribution
source_extraction.py:
- New _extract_image_metadata(): extracts figcaption text + copyright/credit
  per image URL using 3 strategies (figure+figcaption, data-* attributes,
  adjacent credit spans)
- ExtractedArticle gets new image_metadata field
- extracted_article_to_meta() includes image_metadata in stored JSON

pipeline.py:
- After auto image selection, check if selected_url is set
- Articles without usable image → status "no_image" (excluded with Telegram notice)
- PipelineStats and summary report include no_image counter

db.py:
- Add "no_image" to articles status CHECK constraint
- Migration: recreates articles table with updated constraint on existing DBs

workflow.py / main.py:
- Map no_image as own UI status with rewrite/close transitions

wordpress.py:
- _upload_featured_media() accepts image_caption param, sends to WP media
- _get_image_meta_for_url() / _build_image_caption() helpers
- _build_attribution_block(): separator + attribution paragraph at article end
  (original link, author, Bildnachweis/credit)
- _build_post_content() appends attribution block

telegram_bot.py:
- notify_pipeline_done() shows 🖼️ no-image count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 07:08:48 +00:00

293 lines
13 KiB
Python

import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Iterator
from .config import get_settings
def _db_path() -> Path:
    """Return the configured database file path.

    The path is read from ``app_db_path`` on the application settings. The
    parent directory is created (including intermediate directories) when it
    does not exist yet, so the caller can open the file right away.
    """
    db_file = Path(get_settings().app_db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    return db_file
@contextmanager
def get_conn() -> Iterator[sqlite3.Connection]:
    """Yield a SQLite connection to the application database.

    The connection returns rows as ``sqlite3.Row`` and has foreign-key
    enforcement switched on. When the ``with`` body finishes without raising,
    the connection is committed; in every case it is closed afterwards.
    """
    connection = sqlite3.connect(_db_path())
    connection.row_factory = sqlite3.Row
    connection.execute("PRAGMA foreign_keys=ON;")
    try:
        yield connection
        # Only reached when the caller's block completed without an exception.
        connection.commit()
    finally:
        # No commit happens on the error path; the connection is just closed.
        connection.close()
# Tables, plus the indexes and triggers that do not touch ``articles``.
# Everything is guarded with IF NOT EXISTS so the script can be re-run safely.
_BASE_SCHEMA_SQL = """
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL,
    base_url TEXT,
    terms_url TEXT,
    license_name TEXT,
    risk_level TEXT NOT NULL DEFAULT 'yellow' CHECK (risk_level IN ('green', 'yellow', 'red')),
    is_enabled INTEGER NOT NULL DEFAULT 0,
    notes TEXT,
    last_reviewed_at TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS feeds (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER,
    name TEXT NOT NULL,
    url TEXT NOT NULL UNIQUE,
    is_enabled INTEGER NOT NULL DEFAULT 1,
    etag TEXT,
    last_modified TEXT,
    last_checked_at TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE SET NULL
);
CREATE TABLE IF NOT EXISTS runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_type TEXT NOT NULL,
    status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
    started_at TEXT NOT NULL DEFAULT (datetime('now')),
    finished_at TEXT,
    details TEXT
);
CREATE TABLE IF NOT EXISTS publish_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    article_id INTEGER NOT NULL,
    status TEXT NOT NULL CHECK (status IN ('queued', 'running', 'success', 'failed')),
    attempts INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 3,
    error_message TEXT,
    wp_post_id INTEGER,
    wp_post_url TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    finished_at TEXT,
    FOREIGN KEY(article_id) REFERENCES articles(id) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    feed_id INTEGER,
    source_article_id TEXT,
    source_hash TEXT,
    title TEXT NOT NULL,
    source_url TEXT NOT NULL,
    canonical_url TEXT,
    published_at TEXT,
    author TEXT,
    summary TEXT,
    content_raw TEXT,
    content_rewritten TEXT,
    image_urls_json TEXT,
    press_contact TEXT,
    source_name_snapshot TEXT,
    source_terms_url_snapshot TEXT,
    source_license_name_snapshot TEXT,
    legal_checked INTEGER NOT NULL DEFAULT 0,
    legal_checked_at TEXT,
    legal_note TEXT,
    wp_post_id INTEGER,
    wp_post_url TEXT,
    publish_attempts INTEGER NOT NULL DEFAULT 0,
    publish_last_error TEXT,
    published_to_wp_at TEXT,
    word_count INTEGER DEFAULT 0,
    status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
    meta_json TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
    UNIQUE(source_url)
);
CREATE INDEX IF NOT EXISTS idx_feeds_source_id ON feeds(source_id);
CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);
CREATE INDEX IF NOT EXISTS idx_publish_jobs_status_created_at ON publish_jobs(status, created_at);
CREATE TRIGGER IF NOT EXISTS trg_sources_updated_at
AFTER UPDATE ON sources
FOR EACH ROW
BEGIN
    UPDATE sources SET updated_at = datetime('now') WHERE id = OLD.id;
END;
CREATE TRIGGER IF NOT EXISTS trg_feeds_updated_at
AFTER UPDATE ON feeds
FOR EACH ROW
BEGIN
    UPDATE feeds SET updated_at = datetime('now') WHERE id = OLD.id;
END;
"""

# Indexes and the updated_at trigger on ``articles``. They are applied after
# the column migrations (some index columns, e.g. source_hash, may only exist
# once those have run) and again after a table rebuild, because dropping the
# old table also drops its indexes and triggers.
_ARTICLES_INDEXES_AND_TRIGGER_SQL = """
CREATE INDEX IF NOT EXISTS idx_articles_source_article_id ON articles(source_article_id);
CREATE INDEX IF NOT EXISTS idx_articles_source_hash ON articles(source_hash);
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_feed_source_article_id
    ON articles(feed_id, source_article_id)
    WHERE source_article_id IS NOT NULL;
CREATE UNIQUE INDEX IF NOT EXISTS uq_articles_source_hash
    ON articles(source_hash)
    WHERE source_hash IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_articles_status ON articles(status);
CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE TRIGGER IF NOT EXISTS trg_articles_updated_at
AFTER UPDATE ON articles
FOR EACH ROW
BEGIN
    UPDATE articles SET updated_at = datetime('now') WHERE id = OLD.id;
END;
"""

# Columns that may be missing on databases created by an older schema, mapped
# to the statement that adds them. Applied in insertion order.
_ARTICLE_COLUMN_MIGRATIONS: dict[str, str] = {
    "relevance_score": "ALTER TABLE articles ADD COLUMN relevance_score INTEGER",
    "scheduled_publish_at": "ALTER TABLE articles ADD COLUMN scheduled_publish_at TEXT",
    "source_hash": "ALTER TABLE articles ADD COLUMN source_hash TEXT",
    "image_urls_json": "ALTER TABLE articles ADD COLUMN image_urls_json TEXT",
    "press_contact": "ALTER TABLE articles ADD COLUMN press_contact TEXT",
    "source_name_snapshot": "ALTER TABLE articles ADD COLUMN source_name_snapshot TEXT",
    "source_terms_url_snapshot": "ALTER TABLE articles ADD COLUMN source_terms_url_snapshot TEXT",
    "source_license_name_snapshot": "ALTER TABLE articles ADD COLUMN source_license_name_snapshot TEXT",
    "legal_checked": "ALTER TABLE articles ADD COLUMN legal_checked INTEGER NOT NULL DEFAULT 0",
    "legal_checked_at": "ALTER TABLE articles ADD COLUMN legal_checked_at TEXT",
    "legal_note": "ALTER TABLE articles ADD COLUMN legal_note TEXT",
    "wp_post_id": "ALTER TABLE articles ADD COLUMN wp_post_id INTEGER",
    "wp_post_url": "ALTER TABLE articles ADD COLUMN wp_post_url TEXT",
    "publish_attempts": "ALTER TABLE articles ADD COLUMN publish_attempts INTEGER NOT NULL DEFAULT 0",
    "publish_last_error": "ALTER TABLE articles ADD COLUMN publish_last_error TEXT",
    "published_to_wp_at": "ALTER TABLE articles ADD COLUMN published_to_wp_at TEXT",
}

# Every column of the rebuilt ``articles`` table. The same list is used for
# both sides of the copy so the INSERT never depends on physical column order.
_ARTICLES_REBUILD_COLUMNS = ", ".join(
    (
        "id",
        "feed_id",
        "source_article_id",
        "source_hash",
        "title",
        "source_url",
        "canonical_url",
        "published_at",
        "author",
        "summary",
        "content_raw",
        "content_rewritten",
        "image_urls_json",
        "press_contact",
        "source_name_snapshot",
        "source_terms_url_snapshot",
        "source_license_name_snapshot",
        "legal_checked",
        "legal_checked_at",
        "legal_note",
        "wp_post_id",
        "wp_post_url",
        "publish_attempts",
        "publish_last_error",
        "published_to_wp_at",
        "word_count",
        "status",
        "meta_json",
        "relevance_score",
        "scheduled_publish_at",
        "created_at",
        "updated_at",
    )
)

# Copy ``articles`` into a table carrying the current status CHECK constraint
# and swap it in. The leading DROP clears a leftover ``articles_v2``.
# NOTE(review): the copy assumes word_count and meta_json already exist on the
# old table; they are not part of the column migrations - confirm for very old
# databases.
_ARTICLES_REBUILD_SQL = f"""
DROP TABLE IF EXISTS articles_v2;
CREATE TABLE articles_v2 (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    feed_id INTEGER,
    source_article_id TEXT,
    source_hash TEXT,
    title TEXT NOT NULL,
    source_url TEXT NOT NULL,
    canonical_url TEXT,
    published_at TEXT,
    author TEXT,
    summary TEXT,
    content_raw TEXT,
    content_rewritten TEXT,
    image_urls_json TEXT,
    press_contact TEXT,
    source_name_snapshot TEXT,
    source_terms_url_snapshot TEXT,
    source_license_name_snapshot TEXT,
    legal_checked INTEGER NOT NULL DEFAULT 0,
    legal_checked_at TEXT,
    legal_note TEXT,
    wp_post_id INTEGER,
    wp_post_url TEXT,
    publish_attempts INTEGER NOT NULL DEFAULT 0,
    publish_last_error TEXT,
    published_to_wp_at TEXT,
    word_count INTEGER DEFAULT 0,
    status TEXT NOT NULL DEFAULT 'new' CHECK (status IN ('new', 'rewrite', 'review', 'approved', 'published', 'error', 'no_image')),
    meta_json TEXT,
    relevance_score INTEGER,
    scheduled_publish_at TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY(feed_id) REFERENCES feeds(id) ON DELETE SET NULL,
    UNIQUE(source_url)
);
INSERT INTO articles_v2 ({_ARTICLES_REBUILD_COLUMNS})
SELECT {_ARTICLES_REBUILD_COLUMNS} FROM articles;
DROP TABLE articles;
ALTER TABLE articles_v2 RENAME TO articles;
"""


def _add_missing_article_columns(conn: sqlite3.Connection) -> None:
    """Add every column from ``_ARTICLE_COLUMN_MIGRATIONS`` that ``articles`` lacks."""
    present = {
        row["name"] for row in conn.execute("PRAGMA table_info(articles)").fetchall()
    }
    for column, ddl in _ARTICLE_COLUMN_MIGRATIONS.items():
        if column not in present:
            conn.execute(ddl)


def _ensure_no_image_status(conn: sqlite3.Connection) -> None:
    """Rebuild ``articles`` when its stored DDL does not allow status 'no_image'.

    SQLite cannot alter a CHECK constraint in place, so the table is copied
    into a new one and swapped. The swap runs inside one explicit transaction:
    on any SQLite error it is rolled back and the error is re-raised.
    """
    ddl_row = conn.execute(
        "SELECT sql FROM sqlite_master WHERE type='table' AND name='articles'"
    ).fetchone()
    if not ddl_row or "'no_image'" in (ddl_row["sql"] or ""):
        return

    # PRAGMA foreign_keys is ignored inside a transaction, so close any open
    # one before switching enforcement off. It must be off while the old
    # table is dropped, otherwise publish_jobs rows would be cascade-deleted.
    conn.commit()
    conn.execute("PRAGMA foreign_keys=OFF;")
    try:
        conn.executescript(
            "BEGIN;\n"
            + _ARTICLES_REBUILD_SQL
            + _ARTICLES_INDEXES_AND_TRIGGER_SQL
            + "COMMIT;\n"
        )
    except sqlite3.Error:
        conn.rollback()
        raise
    finally:
        conn.execute("PRAGMA foreign_keys=ON;")


def init_db() -> None:
    """Create the database schema and migrate databases from older versions.

    Steps, in order:

    1. Create all tables, plus the indexes and triggers that do not touch
       ``articles`` (each guarded by IF NOT EXISTS).
    2. Add columns that are missing on ``articles``.
    3. Create the ``articles`` indexes and trigger. This comes after step 2
       because some of them reference columns that step 2 may have just added.
    4. Rebuild ``articles`` if its CHECK constraint predates the 'no_image'
       status.

    Raises:
        sqlite3.Error: if any schema or migration statement fails.
    """
    with get_conn() as conn:
        conn.executescript(_BASE_SCHEMA_SQL)
        _add_missing_article_columns(conn)
        conn.executescript(_ARTICLES_INDEXES_AND_TRIGGER_SQL)
        _ensure_no_image_status(conn)
def rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
    """Convert each row into a plain ``dict``, preserving the input order."""
    return list(map(dict, rows))