From 09dcf6ce368079df10dbbc5749c901e806f32f69 Mon Sep 17 00:00:00 2001 From: OliverGiertz Date: Wed, 8 Apr 2026 09:42:02 +0000 Subject: [PATCH] feat(pipeline): add two-stage article quality gate (min word count) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 1 (before OpenAI rewrite): reject if raw content < pipeline_min_words_raw (default 120) Stage 2 (after rewrite): reject if rewritten text < pipeline_min_words_rewritten (default 150) Both stages set status='error' with a descriptive note and skip WP draft creation. The reserved publish slot is released so it stays available for the next article. Quality rejections don't abort the pipeline — processing continues with the next article. New config settings (overridable via .env): PIPELINE_MIN_WORDS_RAW=120 PIPELINE_MIN_WORDS_REWRITTEN=150 Co-Authored-By: Claude Sonnet 4.6 --- backend/app/config.py | 2 ++ backend/app/pipeline.py | 35 ++++++++++++++++++++++++++++++++++- backend/app/scheduler.py | 9 +++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/backend/app/config.py b/backend/app/config.py index d56ce11..713669e 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -46,6 +46,8 @@ class Settings(BaseSettings): pipeline_relevance_warn: int = 60 # >= this: Telegram warning, else reject pipeline_max_drafts_per_day: int = 2 pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) + pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) + pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) @lru_cache(maxsize=1) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index 059ccf5..9cd9059 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -109,10 +109,35 @@ def _store_relevance(article_id: int, relevance: dict[str, Any]) -> None: def _do_rewrite_and_draft(article: dict[str, Any]) -> tuple[int, str | None]: """Rewrite article and create WP draft. Returns (wp_post_id, wp_post_url).""" article_id = int(article["id"]) + settings = get_settings() + + # ── Quality gate 1: raw content length ────────────────────────────────── + import re as _re + raw_text = _re.sub(r"<[^>]+>", " ", article.get("content_raw") or "") + raw_words = len(raw_text.split()) + if raw_words < settings.pipeline_min_words_raw: + note = ( + f"Zu wenig Rohinhalt: {raw_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_raw})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) # Rewrite - logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite", article_id) + logger.info("_do_rewrite_and_draft #%d: starte OpenAI-Rewrite (%d Roh-Wörter)", article_id, raw_words) rewritten = rewrite_article_text(article) + + # ── Quality gate 2: rewritten content length ───────────────────────────── + rewritten_words = len(rewritten.split()) + if rewritten_words < settings.pipeline_min_words_rewritten: + note = ( + f"Rewrite zu kurz: {rewritten_words} Wörter " + f"(Minimum: {settings.pipeline_min_words_rewritten})" + ) + logger.warning("_do_rewrite_and_draft #%d: %s — überspringe", article_id, note) + update_article_status(article_id, "error", actor="pipeline", note=note) + raise ValueError(note) logger.info("_do_rewrite_and_draft #%d: Rewrite fertig (%d Wörter), generiere Tags", article_id, len(rewritten.split())) tags: list[str] = [] try: @@ -342,6 +367,14 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An except Exception as exc: logger.warning("Telegram draft-Benachrichtigung für #%d fehlgeschlagen: %s", article_id, exc) + except ValueError as exc: + # Quality gate rejection (too short etc.) — status already set in _do_rewrite_and_draft + # Release the reserved slot so it's available for the next article + from .scheduler import release_publish_slot + release_publish_slot(article_id) + stats.rejected_articles.append(get_article_by_id(article_id) or {}) + logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) + except Exception as exc: logger.error("Draft-Erstellung für #%d fehlgeschlagen: %s", article_id, exc) update_article_status(article_id, "error", actor="pipeline", note=f"Draft-Fehler: {exc}") diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index a1028fc..0ec38d8 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -165,6 +165,15 @@ def _find_next_free_slot( return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 +def release_publish_slot(article_id: int) -> None: + """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" + with get_conn() as conn: + conn.execute( + "UPDATE articles SET scheduled_publish_at = NULL WHERE id = ?", + (article_id,), + ) + + def suggest_publish_slot() -> str: """Return a suggested publish datetime string (CET) for the next free slot.""" wp_occupied = _fetch_wp_occupied_slots()