feat(pipeline): add two-stage article quality gate (min word count)

Stage 1 (before OpenAI rewrite): reject if the raw content, with HTML tags stripped, has fewer than pipeline_min_words_raw words (default 120)
Stage 2 (after rewrite): reject if the rewritten text has fewer than pipeline_min_words_rewritten words (default 150)

Both stages set status='error' with a descriptive note and skip WP draft creation.
The reserved publish slot is released so it stays available for the next article.
Quality rejections don't abort the pipeline — processing continues with the next article.

New config settings (overridable via .env):
  PIPELINE_MIN_WORDS_RAW=120
  PIPELINE_MIN_WORDS_REWRITTEN=150

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-04-08 09:42:02 +00:00
parent 94bd93a18a
commit 09dcf6ce36
3 changed files with 45 additions and 1 deletions

View file

@ -46,6 +46,8 @@ class Settings(BaseSettings):
pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
pipeline_max_drafts_per_day: int = 2
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
@lru_cache(maxsize=1)

View file

@ -109,10 +109,35 @@ def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
"""Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url)."""
article_id = int(article["id"])
settings = get_settings()
# ── Quality gate 1: raw content length ──────────────────────────────────
import re as _re
raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
raw_words = len(raw_text.split())
if raw_words < settings.pipeline_min_words_raw:
note = (
f"Zu wenig Rohinhalt: {raw_words} Wörter "
f"(Minimum: {settings.pipeline_min_words_raw})"
)
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
update_article_status(article_id, "error", actor="pipeline", note=note)
raise ValueError(note)
# Rewrite
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id)
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
rewritten = rewrite_article_text(article)
# ── Quality gate 2: rewritten content length ─────────────────────────────
rewritten_words = len(rewritten.split())
if rewritten_words < settings.pipeline_min_words_rewritten:
note = (
f"Rewrite zu kurz: {rewritten_words} Wörter "
f"(Minimum: {settings.pipeline_min_words_rewritten})"
)
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
update_article_status(article_id, "error", actor="pipeline", note=note)
raise ValueError(note)
logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
tags: list[str] = []
try:
@ -342,6 +367,14 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
except Exception as exc:
logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
except ValueError as exc:
# Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
# Release the reserved slot so it's available for the next article
from .scheduler import release_publish_slot
release_publish_slot(article_id)
stats.rejected_articles.append(get_article_by_id(article_id) or {})
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
except Exception as exc:
logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")

View file

@ -165,6 +165,15 @@ def _find_next_free_slot(
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
def release_publish_slot(article_id: int) -> None:
    """Drop the publish slot that was reserved for an article.

    Sets ``scheduled_publish_at`` to NULL on the ``articles`` row whose
    ``id`` equals *article_id*, e.g. when the article is rejected after a
    slot had already been assigned to it.

    Args:
        article_id: Primary key of the article whose slot is cleared.
    """
    clear_slot_sql = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
    params = (article_id,)
    # The id is passed as a bound parameter rather than formatted into the SQL.
    with get_conn() as db:
        db.execute(clear_slot_sql, params)
def suggest_publish_slot() -> str:
"""Return a suggested publish datetime string (CET) for the next free slot."""
wp_occupied = _fetch_wp_occupied_slots()