feat(pipeline): add two-stage article quality gate (min word count)
Stage 1 (before OpenAI rewrite): reject if the raw content has fewer words than pipeline_min_words_raw (default 120).
Stage 2 (after rewrite): reject if the rewritten text has fewer words than pipeline_min_words_rewritten (default 150).
Both stages set status='error' with a descriptive note and skip WP draft creation. The reserved publish slot is released so it stays available for the next article. Quality rejections don't abort the pipeline — processing continues with the next article.
New config settings (overridable via .env): PIPELINE_MIN_WORDS_RAW=120, PIPELINE_MIN_WORDS_REWRITTEN=150.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
94bd93a18a
commit
09dcf6ce36
3 changed files with 45 additions and 1 deletions
|
|
@ -46,6 +46,8 @@ class Settings(BaseSettings):
|
||||||
pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
|
pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject
|
||||||
pipeline_max_drafts_per_day: int = 2
|
pipeline_max_drafts_per_day: int = 2
|
||||||
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
|
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
|
||||||
|
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
|
||||||
|
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
@lru_cache(maxsize=1)
|
||||||
|
|
|
||||||
|
|
@ -109,10 +109,35 @@ def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None:
|
||||||
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||||
"""Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url)."""
|
"""Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url)."""
|
||||||
article_id = int(article["id"])
|
article_id = int(article["id"])
|
||||||
|
settings = get_settings()
|
||||||
|
|
||||||
|
# ── Quality gate 1: raw content length ──────────────────────────────────
|
||||||
|
import re as _re
|
||||||
|
raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "")
|
||||||
|
raw_words = len(raw_text.split())
|
||||||
|
if raw_words < settings.pipeline_min_words_raw:
|
||||||
|
note = (
|
||||||
|
f"Zu wenig Rohinhalt: {raw_words} Wörter "
|
||||||
|
f"(Minimum: {settings.pipeline_min_words_raw})"
|
||||||
|
)
|
||||||
|
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
|
||||||
|
update_article_status(article_id, "error", actor="pipeline", note=note)
|
||||||
|
raise ValueError(note)
|
||||||
|
|
||||||
# Rewrite
|
# Rewrite
|
||||||
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id)
|
logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words)
|
||||||
rewritten = rewrite_article_text(article)
|
rewritten = rewrite_article_text(article)
|
||||||
|
|
||||||
|
# ── Quality gate 2: rewritten content length ─────────────────────────────
|
||||||
|
rewritten_words = len(rewritten.split())
|
||||||
|
if rewritten_words < settings.pipeline_min_words_rewritten:
|
||||||
|
note = (
|
||||||
|
f"Rewrite zu kurz: {rewritten_words} Wörter "
|
||||||
|
f"(Minimum: {settings.pipeline_min_words_rewritten})"
|
||||||
|
)
|
||||||
|
logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note)
|
||||||
|
update_article_status(article_id, "error", actor="pipeline", note=note)
|
||||||
|
raise ValueError(note)
|
||||||
logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
|
logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split()))
|
||||||
tags: list[str] = []
|
tags: list[str] = []
|
||||||
try:
|
try:
|
||||||
|
|
@ -342,6 +367,14 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
|
logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc)
|
||||||
|
|
||||||
|
except ValueError as exc:
|
||||||
|
# Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft
|
||||||
|
# Release the reserved slot so it's available for the next article
|
||||||
|
from .scheduler import release_publish_slot
|
||||||
|
release_publish_slot(article_id)
|
||||||
|
stats.rejected_articles.append(get_article_by_id(article_id) or {})
|
||||||
|
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
|
logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc)
|
||||||
update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
|
update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}")
|
||||||
|
|
|
||||||
|
|
@ -165,6 +165,15 @@ def _find_next_free_slot(
|
||||||
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
|
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
|
||||||
|
|
||||||
|
|
||||||
|
def release_publish_slot(article_id: int) -> None:
    """Drop the publish slot that was reserved for an article.

    Sets ``scheduled_publish_at`` to NULL on the ``articles`` row whose ``id``
    equals *article_id*, e.g. when the article is rejected after a slot had
    already been assigned to it.
    """
    clear_slot_sql = "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?"
    params = (article_id,)
    with get_conn() as db:
        db.execute(clear_slot_sql, params)
|
||||||
|
|
||||||
|
|
||||||
def suggest_publish_slot() -> str:
|
def suggest_publish_slot() -> str:
|
||||||
"""Return a suggested publish datetime string (CET) for the next free slot."""
|
"""Return a suggested publish datetime string (CET) for the next free slot."""
|
||||||
wp_occupied = _fetch_wp_occupied_slots()
|
wp_occupied = _fetch_wp_occupied_slots()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue