feat(pipeline): add two-stage article quality gate (min word count)
Stage 1 (before OpenAI rewrite): reject if raw content < pipeline_min_words_raw (default 120).
Stage 2 (after rewrite): reject if rewritten text < pipeline_min_words_rewritten (default 150).

Both stages set status='error' with a descriptive note and skip WP draft creation. The reserved publish slot is released so it stays available for the next article. Quality rejections don't abort the pipeline — processing continues with the next article.

New config settings (overridable via .env):
PIPELINE_MIN_WORDS_RAW=120
PIPELINE_MIN_WORDS_REWRITTEN=150

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
94bd93a18a
commit
09dcf6ce36
3 changed files with 45 additions and 1 deletions
|
|
@ -46,6 +46,8 @@ class Settings(BaseSettings):
|
|||
pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
|
||||
pipeline_max_drafts_per_day: int = 2
|
||||
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
|
||||
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
|
||||
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
|
|
|
|||
|
|
@ -109,10 +109,35 @@ def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
|
|||
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||
"""Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url)."""
|
||||
article_id = int(article["id"])
|
||||
settings = get_settings()
|
||||
|
||||
# ── Quality gate 1: raw content length ──────────────────────────────────
|
||||
import re as _re
|
||||
raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
|
||||
raw_words = len(raw_text.split())
|
||||
if raw_words < settings.pipeline_min_words_raw:
|
||||
note = (
|
||||
f"Zu wenig Rohinhalt: {raw_words} Wörter "
|
||||
f"(Minimum: {settings.pipeline_min_words_raw})"
|
||||
)
|
||||
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
|
||||
update_article_status(article_id, "error", actor="pipeline", note=note)
|
||||
raise ValueError(note)
|
||||
|
||||
# Rewrite
|
||||
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id)
|
||||
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
|
||||
rewritten = rewrite_article_text(article)
|
||||
|
||||
# ── Quality gate 2: rewritten content length ─────────────────────────────
|
||||
rewritten_words = len(rewritten.split())
|
||||
if rewritten_words < settings.pipeline_min_words_rewritten:
|
||||
note = (
|
||||
f"Rewrite zu kurz: {rewritten_words} Wörter "
|
||||
f"(Minimum: {settings.pipeline_min_words_rewritten})"
|
||||
)
|
||||
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
|
||||
update_article_status(article_id, "error", actor="pipeline", note=note)
|
||||
raise ValueError(note)
|
||||
logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
|
||||
tags: list[str] = []
|
||||
try:
|
||||
|
|
@ -342,6 +367,14 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
|||
except Exception as exc:
|
||||
logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
|
||||
|
||||
except ValueError as exc:
|
||||
# Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
|
||||
# Release the reserved slot so it's available for the next article
|
||||
from .scheduler import release_publish_slot
|
||||
release_publish_slot(article_id)
|
||||
stats.rejected_articles.append(get_article_by_id(article_id) or {})
|
||||
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
|
||||
|
||||
except Exception as exc:
|
||||
logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
|
||||
update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
|
||||
|
|
|
|||
|
|
@ -165,6 +165,15 @@ def _find_next_free_slot(
|
|||
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
|
||||
|
||||
|
||||
def release_publish_slot(article_id: int) -> None:
    """Drop the publish slot that was reserved for an article.

    Sets ``scheduled_publish_at`` to NULL on the ``articles`` row whose id
    equals *article_id*, e.g. when the article is rejected after a slot
    had already been assigned to it.
    """
    clear_slot_sql = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
    params = (article_id,)
    with get_conn() as db:
        db.execute(clear_slot_sql, params)
|
||||
|
||||
|
||||
def suggest_publish_slot() -> str:
|
||||
"""Return a suggested publish datetime string (CET) for the next free slot."""
|
||||
wp_occupied = _fetch_wp_occupied_slots()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue