feat(pipeline): article age filter, image URL validation, schedule UI, retry button

1. Article age filter (ingestion.py + config.py):
   - New setting pipeline_max_article_age_days=7 (0 = no limit)
   - Skip RSS entries older than N days before expensive extract_article()
   - Prevents old articles from Google Alerts re-entering pipeline

2. Image URL pre-validation (ingestion.py):
   - HEAD request probe for each primary image candidate during ingestion
   - Falls back to next-best candidate if primary returns an HTTP error (4xx/5xx)
   - Network errors treated as OK to avoid false negatives on flaky servers

3. Stale WP draft cleanup (pipeline.py):
   - Quality gate rejections now delete any pre-existing WP draft (wp_post_id)
   - Prevents orphaned drafts when re-running articles that previously had drafts

4. Schedule overview UI (scheduler.py + admin_ui.py + admin_schedule.html):
   - New /admin/schedule page showing calendar grid of all booked slots
   - Distinguishes Pipeline-DB slots from WordPress-only slots
   - Link added to dashboard navigation

5. Retry for failed articles (admin_ui.py + admin_dashboard.html):
   - New POST /admin/articles/{id}/retry endpoint: resets to 'new', releases slot
   - '🔄 Wiederholen' button shown in dashboard for all 'close' (error) articles

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OliverGiertz 2026-04-10 08:44:28 +00:00
parent cf2d826c8a
commit 8676ace102
7 changed files with 344 additions and 5 deletions

View file

@ -929,3 +929,75 @@ def admin_transition_article(request: Request, article_id: int, target_status: s
update_article_status(article_id, target_internal, actor=user, note=note or None)
return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}")
return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")
@router.post("/admin/articles/{article_id}/retry")
def admin_retry_article(request: Request, article_id: int):
    """Put an article back into status 'new' and free its publish slot.

    Unauthenticated requests are redirected to the login page. If the article
    id is unknown, the dashboard is shown with an error message. Otherwise the
    reserved publish slot is released first, then the status is set to 'new'
    with the acting admin recorded as actor.

    NOTE(review): the current status of the article is not checked here, so
    any existing article id is reset — confirm whether this should be limited
    to failed articles.
    """
    actor = _admin_user(request)
    if not actor:
        return RedirectResponse(url="/admin/login", status_code=303)

    if not get_article_by_id(article_id):
        not_found_msg = f"Artikel #{article_id} nicht gefunden"
        return _dashboard_redirect(msg=not_found_msg, msg_type="error")

    # Imported lazily, as in the original handler.
    from .scheduler import release_publish_slot

    # Free the slot before resetting the status, same order as before.
    release_publish_slot(article_id)
    update_article_status(
        article_id,
        "new",
        actor=actor,
        note="Manuell zurückgesetzt für erneuten Pipeline-Versuch",
    )

    success_msg = (
        f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt "
        "und wird beim nächsten Pipeline-Lauf verarbeitet"
    )
    return _dashboard_redirect(msg=success_msg, status_filter="close")
@router.get("/admin/schedule", response_class=HTMLResponse)
def admin_schedule(request: Request):
    """Render the schedule overview page.

    Unauthenticated requests are redirected to the login page. Otherwise the
    booked slots are loaded via ``get_schedule_overview`` and arranged into a
    per-day grid: one entry per day from today through today + 60 days, each
    holding one cell per preferred publish hour. The flat slot list, the grid,
    the preferred hours and the flash message taken from the query string are
    passed to the ``admin_schedule.html`` template.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    from .scheduler import get_schedule_overview, _preferred_hours, _today_cet
    from datetime import timedelta

    weekday_labels = ("Mo", "Di", "Mi", "Do", "Fr", "Sa", "So")

    slots = get_schedule_overview(lookahead_days=60)
    today = _today_cet()
    hours = _preferred_hours()

    # Index booked slots by (ISO date, hour) for constant-time grid lookups.
    booked_by_key = {(entry["date"], entry["hour"]): entry for entry in slots}

    calendar_days = []
    for day_offset in range(61):
        current = today + timedelta(days=day_offset)
        iso_day = current.isoformat()
        cells = [
            {
                "hour": hour,
                "booked": (iso_day, hour) in booked_by_key,
                "slot": booked_by_key.get((iso_day, hour)),
            }
            for hour in hours
        ]
        calendar_days.append(
            {
                "date": iso_day,
                "date_fmt": current.strftime("%d.%m.%Y"),
                "weekday": weekday_labels[current.weekday()],
                "slots": cells,
                "any_booked": any(cell["booked"] for cell in cells),
            }
        )

    context = {
        "request": request,
        "title": "Veröffentlichungsplan",
        "user": user,
        "slots": slots,
        "calendar_days": calendar_days,
        "hours": hours,
        "flash_msg": request.query_params.get("msg", ""),
        "flash_type": request.query_params.get("type", "success"),
    }
    return templates.TemplateResponse(request, "admin_schedule.html", context)

View file

@ -48,6 +48,7 @@ class Settings(BaseSettings):
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit)
@lru_cache(maxsize=1)

View file

@ -1,13 +1,15 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
import hashlib
import json
import re
import time
from typing import Any
from urllib.parse import unquote, urlencode, urlparse, parse_qs
import urllib.error
import urllib.request as _urllib_req
import feedparser
@ -119,6 +121,26 @@ def _normalize_tokens(text: str) -> set[str]:
return {token for token in normalized.split() if len(token) >= 4}
def _probe_image_url(url: str, timeout: int = 5) -> bool:
"""Return True if URL responds without a 4xx/5xx error (HEAD request).
Returns True on network/connection errors so that a flaky server does not
cause a valid image to be silently dropped.
"""
try:
req = _urllib_req.Request(
url,
method="HEAD",
headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
)
with _urllib_req.urlopen(req, timeout=timeout) as resp:
return resp.status < 400
except urllib.error.HTTPError as exc:
return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
except Exception:
return True # network error → don't filter, let WP try later
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
source_host = (urlparse(source_url).hostname or "").lower()
is_presseportal = "presseportal.de" in source_host
@ -184,10 +206,25 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_
deduped.append(image)
ranked = _rank_image_candidates(source_url, title, deduped)
kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep]
if not kept and ranked:
kept = [ranked[0]["url"]]
primary = kept[0] if kept else None
candidates = [item["url"] for item in ranked if item["score"] > -100]
# Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx).
# Network errors are treated as OK to avoid false negatives on flaky servers.
primary = None
kept: list[str] = []
for url in candidates[:4]:
if _probe_image_url(url):
if primary is None:
primary = url
kept.append(url)
if len(kept) >= max_keep:
break
# Fallback: if every probed candidate was rejected (HTTP error status), use the best candidate anyway
if not kept and candidates:
primary = candidates[0]
kept = candidates[:max_keep]
return kept, primary, ranked
@ -265,12 +302,27 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
feed_entries_seen = 0
feed_upserts = 0
from .config import get_settings as _get_settings
_max_age_days = _get_settings().pipeline_max_article_age_days
for entry in _parsed_get(parsed, "entries", []):
entries_seen += 1
feed_entries_seen += 1
link = entry.get("link")
if not link:
continue
# Age filter: skip articles older than max_age_days (0 = no limit)
if _max_age_days > 0:
published_iso = _entry_published_iso(entry)
if published_iso:
try:
published_dt = datetime.fromisoformat(published_iso)
age = datetime.now(timezone.utc) - published_dt
if age > timedelta(days=_max_age_days):
continue
except Exception:
pass # can't parse date → allow through
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
link = _resolve_google_redirect(link)
# Normalize AMP/tracking params (e.g. ?outputType=valid_amp)

View file

@ -374,6 +374,15 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
# Release the reserved slot so it's available for the next article
from .scheduler import release_publish_slot
release_publish_slot(article_id)
# Clean up any stale WP draft from a previous pipeline run
stale = get_article_by_id(article_id)
if stale and stale.get("wp_post_id"):
try:
from .wordpress import delete_wp_post
delete_wp_post(int(stale["wp_post_id"]))
logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
except Exception as del_exc:
logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
stats.quality_gate_rejected += 1
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
# Individual Telegram notification for quality gate rejection

View file

@ -165,6 +165,72 @@ def _find_next_free_slot(
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
    """Return the booked publish slots from today through today + ``lookahead_days``.

    Two sources are merged:

    * rows of the ``articles`` table with a ``scheduled_publish_at`` that is
      not before today and a status other than 'error' / 'no_image'
      (``source == "db"``),
    * slots reported by ``_fetch_wp_occupied_slots()`` that have no matching
      DB slot (``source == "wordpress"``).

    Slots are keyed by (ISO date, hour); if several DB rows share a key, the
    last one in ``scheduled_publish_at`` order wins. Entries whose date cannot
    be parsed are skipped.

    Args:
        lookahead_days: Number of days after today to include (inclusive).

    Returns:
        A list of dicts with the keys ``date``, ``hour``, ``formatted``,
        ``source``, ``article_id``, ``article_title``, ``article_status``,
        ``wp_post_id`` and ``wp_post_url``, sorted by (date, hour).
    """
    # Local import so the block does not depend on the module's import list.
    from datetime import timedelta

    today = _today_cet()
    last_day = today + timedelta(days=lookahead_days)

    # Slots booked in local DB
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
            AND scheduled_publish_at >= ?
            AND status NOT IN ('error', 'no_image')
            ORDER BY scheduled_publish_at
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchall()

    db_slots: dict[tuple[str, int], dict] = {}
    for row in rows:
        try:
            dt = datetime.fromisoformat(row["scheduled_publish_at"])
            slot_date = dt.date()
            formatted = _format_slot(slot_date, dt.hour)
        except (TypeError, ValueError):
            continue  # unparsable timestamp: leave the row out of the overview
        # The upper bound is applied after parsing, so it does not depend on
        # the exact string format stored in the column.
        if slot_date > last_day:
            continue
        day_iso = slot_date.isoformat()
        db_slots[(day_iso, dt.hour)] = {
            "date": day_iso,
            "hour": dt.hour,
            "formatted": formatted,
            "source": "db",
            "article_id": row["id"],
            "article_title": row["title"],
            "article_status": row["status"],
            "wp_post_id": row["wp_post_id"],
            "wp_post_url": row["wp_post_url"],
        }

    # Slots occupied in WordPress but not in local DB
    wp_occupied = _fetch_wp_occupied_slots()
    wp_only: list[dict] = []
    for d_str, h in sorted(wp_occupied):
        if (d_str, h) in db_slots:
            continue
        try:
            d = date.fromisoformat(d_str)
            formatted = _format_slot(d, h)
        except (TypeError, ValueError):
            continue  # unparsable date: skip this WordPress slot
        if not (today <= d <= last_day):
            continue
        wp_only.append({
            "date": d_str,
            "hour": h,
            "formatted": formatted,
            "source": "wordpress",
            "article_id": None,
            "article_title": "(WP-Beitrag außerhalb Pipeline)",
            "article_status": None,
            "wp_post_id": None,
            "wp_post_url": None,
        })

    all_slots = list(db_slots.values()) + wp_only
    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
    return all_slots
def release_publish_slot(article_id: int) -> None:
"""Clear a previously reserved slot (e.g. when article is rejected after slot assignment)."""
with get_conn() as conn:

View file

@ -13,6 +13,7 @@
<p>Angemeldet als <strong>{{ user }}</strong></p>
</div>
<div class="row">
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
<a class="linkbtn" href="/admin/connectivity">Connectivity Check</a>
<form method="post" action="/admin/logout">
<button type="submit" class="secondary">Logout</button>
@ -330,6 +331,11 @@
<span class="subtle">keine Aktion</span>
{% endif %}
</form>
{% if a.status_ui == 'close' %}
<form method="post" action="/admin/articles/{{ a.id }}/retry" class="inline" style="margin-top:4px;">
<button type="submit" title="Artikel auf 'neu' zurücksetzen – wird beim nächsten Pipeline-Lauf erneut verarbeitet">🔄 Wiederholen</button>
</form>
{% endif %}
</td>
</tr>
{% endfor %}

View file

@ -0,0 +1,133 @@
<!doctype html>
<html lang="de">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{{ title }}</title>
<link rel="stylesheet" href="/admin/static/admin.css" />
<style>
.schedule-table td, .schedule-table th { padding: 6px 10px; }
.slot-free { color: #aaa; font-style: italic; }
.slot-booked-db { color: #1a7a1a; font-weight: bold; }
.slot-booked-wp { color: #b35a00; font-weight: bold; }
.badge-db { background: #d4edda; color: #155724; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
.badge-wp { background: #fff3cd; color: #856404; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
.summary-bar { display: flex; gap: 1.5rem; margin-bottom: 1rem; font-size: 0.95em; }
</style>
</head>
<body>
<header class="topbar">
<div>
<h1>rss-news Veröffentlichungsplan</h1>
<p>Angemeldet als <strong>{{ user }}</strong></p>
</div>
<div class="row">
<a class="linkbtn" href="/admin/dashboard">Dashboard</a>
<a class="linkbtn" href="/admin/connectivity">Connectivity</a>
<form method="post" action="/admin/logout">
<button type="submit" class="secondary">Logout</button>
</form>
</div>
</header>
<main class="container">
{% if flash_msg %}
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
{{ flash_msg }}
</section>
{% endif %}
<section class="card">
<h2>Slot-Übersicht (nächste 60 Tage)</h2>
<div class="summary-bar">
<span>📅 Belegte Slots gesamt: <strong>{{ slots|length }}</strong></span>
<span>🗄️ Aus Pipeline-DB: <strong>{{ slots|selectattr('source', 'eq', 'db')|list|length }}</strong></span>
<span>🌐 Nur in WordPress: <strong>{{ slots|selectattr('source', 'eq', 'wordpress')|list|length }}</strong></span>
</div>
<table class="schedule-table">
<thead>
<tr>
<th>Tag</th>
{% for h in hours %}
<th>{{ "%02d:00 Uhr"|format(h) }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
{% for day in calendar_days %}
{% if day.any_booked %}
<tr>
<td><strong>{{ day.weekday }}</strong> {{ day.date_fmt }}</td>
{% for s in day.slots %}
<td>
{% if s.booked %}
{% set info = s.slot %}
{% if info.source == 'db' %}
<span class="slot-booked-db">✅</span>
<span class="badge-db">DB</span>
<div style="font-size:0.85em;">
{% if info.article_id %}
<a href="/admin/articles/{{ info.article_id }}">
{{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %}
</a>
{% endif %}
<br /><span class="subtle">Status: {{ info.article_status }}</span>
{% if info.wp_post_url %}
<br /><a href="{{ info.wp_post_url }}" target="_blank" rel="noopener">WP öffnen</a>
{% endif %}
</div>
{% else %}
<span class="slot-booked-wp">⚠️</span>
<span class="badge-wp">WP</span>
<div style="font-size:0.85em;">{{ info.article_title }}</div>
{% endif %}
{% else %}
<span class="slot-free">frei</span>
{% endif %}
</td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
</tbody>
</table>
{% if not slots %}
<p class="subtle">Keine geplanten Beiträge in den nächsten 60 Tagen.</p>
{% endif %}
</section>
<section class="card">
<h2>Alle belegten Slots (Liste)</h2>
<table>
<thead>
<tr><th>Datum/Zeit</th><th>Quelle</th><th>Artikel</th><th>Status</th><th>WordPress</th></tr>
</thead>
<tbody>
{% for s in slots %}
<tr>
<td>{{ s.formatted }}</td>
<td>
{% if s.source == 'db' %}<span class="badge-db">Pipeline-DB</span>
{% else %}<span class="badge-wp">WordPress</span>{% endif %}
</td>
<td>
{% if s.article_id %}
<a href="/admin/articles/{{ s.article_id }}">{{ (s.article_title or "")[:60] }}</a>
{% else %}
{{ s.article_title or "-" }}
{% endif %}
</td>
<td>{{ s.article_status or "-" }}</td>
<td>
{% if s.wp_post_url %}
<a href="{{ s.wp_post_url }}" target="_blank" rel="noopener">Draft öffnen</a>
{% else %}-{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
</main>
</body>
</html>