feat(pipeline): article age filter, image URL validation, schedule UI, retry button
1. Article age filter (ingestion.py + config.py):
- New setting pipeline_max_article_age_days=7 (0 = no limit)
- Skip RSS entries older than N days before expensive extract_article()
- Prevents old articles from Google Alerts re-entering pipeline
2. Image URL pre-validation (ingestion.py):
- HEAD request probe for each primary image candidate during ingestion
- Falls back to next-best candidate if primary returns an HTTP error (4xx/5xx)
- Network errors treated as OK to avoid false negatives on flaky servers
3. Stale WP draft cleanup (pipeline.py):
- Quality gate rejections now delete any pre-existing WP draft (wp_post_id)
- Prevents orphaned drafts when re-running articles that previously had drafts
4. Schedule overview UI (scheduler.py + admin_ui.py + admin_schedule.html):
- New /admin/schedule page showing calendar grid of all booked slots
- Distinguishes Pipeline-DB slots from WordPress-only slots
- Link added to dashboard navigation
5. Retry for failed articles (admin_ui.py + admin_dashboard.html):
- New POST /admin/articles/{id}/retry endpoint: resets to 'new', releases slot
- '🔄 Wiederholen' button shown in dashboard for all 'close' (error) articles
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
cf2d826c8a
commit
8676ace102
7 changed files with 344 additions and 5 deletions
|
|
@ -929,3 +929,75 @@ def admin_transition_article(request: Request, article_id: int, target_status: s
|
|||
update_article_status(article_id, target_internal, actor=user, note=note or None)
|
||||
return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}")
|
||||
return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/retry")
def admin_retry_article(request: Request, article_id: int):
    """Set an article back to status ``"new"`` and release its publish slot.

    Flow:
      * no admin user on the request -> redirect to ``/admin/login`` (303)
      * no article with ``article_id`` -> dashboard redirect with an error
      * otherwise the reserved slot is released, the status becomes ``"new"``
        and the dashboard is opened with the ``"close"`` status filter.
    """
    actor = _admin_user(request)
    if not actor:
        return RedirectResponse(url="/admin/login", status_code=303)

    if not get_article_by_id(article_id):
        not_found_msg = f"Artikel #{article_id} nicht gefunden"
        return _dashboard_redirect(msg=not_found_msg, msg_type="error")

    # NOTE(review): the article's current status is not inspected, so any
    # existing article can be reset through this endpoint — confirm intended.
    from .scheduler import release_publish_slot

    release_publish_slot(article_id)
    update_article_status(
        article_id,
        "new",
        actor=actor,
        note="Manuell zurückgesetzt für erneuten Pipeline-Versuch",
    )

    success_msg = f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet"
    return _dashboard_redirect(msg=success_msg, status_filter="close")
|
||||
|
||||
|
||||
@router.get("/admin/schedule", response_class=HTMLResponse)
def admin_schedule(request: Request):
    """Render ``admin_schedule.html`` with the booked slots and a calendar grid.

    Unauthenticated requests are redirected to ``/admin/login`` (303).

    The grid has one entry per day for today and the following 60 days; each
    day holds one cell per preferred publish hour, flagged as booked when a
    slot from ``get_schedule_overview`` has the same (ISO date, hour). Slots at
    hours that are not in the preferred list are still passed to the template
    via ``slots`` but get no cell in the grid.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    from .scheduler import get_schedule_overview, _preferred_hours, _today_cet
    from datetime import timedelta

    weekday_labels = ("Mo", "Di", "Mi", "Do", "Fr", "Sa", "So")

    slots = get_schedule_overview(lookahead_days=60)
    today = _today_cet()
    hours = _preferred_hours()

    # Index the booked slots by (ISO date, hour) for direct lookup per cell.
    booked: dict[tuple[str, int], dict] = {}
    for entry in slots:
        booked[(entry["date"], entry["hour"])] = entry

    calendar_days = []
    for offset in range(61):  # offsets 0..60: today plus the next 60 days
        day = today + timedelta(days=offset)
        iso_day = day.isoformat()
        cells = [
            {
                "hour": hour,
                "booked": (iso_day, hour) in booked,
                "slot": booked.get((iso_day, hour)),
            }
            for hour in hours
        ]
        calendar_days.append(
            {
                "date": iso_day,
                "date_fmt": day.strftime("%d.%m.%Y"),
                "weekday": weekday_labels[day.weekday()],
                "slots": cells,
                "any_booked": any(cell["booked"] for cell in cells),
            }
        )

    context = {
        "request": request,
        "title": "Veröffentlichungsplan",
        "user": user,
        "slots": slots,
        "calendar_days": calendar_days,
        "hours": hours,
        "flash_msg": request.query_params.get("msg", ""),
        "flash_type": request.query_params.get("type", "success"),
    }
    return templates.TemplateResponse(request, "admin_schedule.html", context)
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ class Settings(BaseSettings):
|
|||
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
|
||||
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
|
||||
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
|
||||
pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,15 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
from urllib.parse import unquote, urlencode, urlparse, parse_qs
|
||||
import urllib.error
|
||||
import urllib.request as _urllib_req
|
||||
|
||||
import feedparser
|
||||
|
||||
|
|
@ -119,6 +121,26 @@ def _normalize_tokens(text: str) -> set[str]:
|
|||
return {token for token in normalized.split() if len(token) >= 4}
|
||||
|
||||
|
||||
def _probe_image_url(url: str, timeout: int = 5) -> bool:
    """Return False only when a HEAD request shows the URL is definitively broken.

    Args:
        url: Image URL to probe.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        ``True`` when the final response status is below 400, when the server
        merely rejects the HEAD method (405 / 501), or when the probe could
        not be completed at all (network error, timeout, malformed URL).
        ``False`` for any other 4xx/5xx response.

    Inconclusive outcomes return ``True`` on purpose so that a flaky server,
    or one that does not implement HEAD, does not cause a valid image to be
    silently dropped.
    """
    # These statuses say "HEAD is not supported here", not "the resource is
    # missing"; a regular GET for the image may still succeed.
    head_not_supported = (405, 501)
    try:
        req = _urllib_req.Request(
            url,
            method="HEAD",
            headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
        )
        with _urllib_req.urlopen(req, timeout=timeout) as resp:
            return resp.status < 400
    except urllib.error.HTTPError as exc:
        if exc.code in head_not_supported:
            return True
        return exc.code < 400  # unresolved 3xx is OK; other 4xx/5xx are not
    except Exception:
        # Deliberate best effort: any non-HTTP failure (DNS, timeout, TLS,
        # malformed URL) is inconclusive, so do not filter the image here.
        return True
|
||||
|
||||
|
||||
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
|
||||
source_host = (urlparse(source_url).hostname or "").lower()
|
||||
is_presseportal = "presseportal.de" in source_host
|
||||
|
|
@ -184,10 +206,25 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_
|
|||
deduped.append(image)
|
||||
|
||||
ranked = _rank_image_candidates(source_url, title, deduped)
|
||||
kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep]
|
||||
if not kept and ranked:
|
||||
kept = [ranked[0]["url"]]
|
||||
primary = kept[0] if kept else None
|
||||
candidates = [item["url"] for item in ranked if item["score"] > -100]
|
||||
|
||||
# Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx).
|
||||
# Network errors are treated as OK to avoid false negatives on flaky servers.
|
||||
primary = None
|
||||
kept: list[str] = []
|
||||
for url in candidates[:4]:
|
||||
if _probe_image_url(url):
|
||||
if primary is None:
|
||||
primary = url
|
||||
kept.append(url)
|
||||
if len(kept) >= max_keep:
|
||||
break
|
||||
|
||||
# Fallback: if all probes failed with network errors, use best candidate anyway
|
||||
if not kept and candidates:
|
||||
primary = candidates[0]
|
||||
kept = candidates[:max_keep]
|
||||
|
||||
return kept, primary, ranked
|
||||
|
||||
|
||||
|
|
@ -265,12 +302,27 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
|||
|
||||
feed_entries_seen = 0
|
||||
feed_upserts = 0
|
||||
from .config import get_settings as _get_settings
|
||||
_max_age_days = _get_settings().pipeline_max_article_age_days
|
||||
for entry in _parsed_get(parsed, "entries", []):
|
||||
entries_seen += 1
|
||||
feed_entries_seen += 1
|
||||
link = entry.get("link")
|
||||
if not link:
|
||||
continue
|
||||
|
||||
# Age filter: skip articles older than max_age_days (0 = no limit)
|
||||
if _max_age_days > 0:
|
||||
published_iso = _entry_published_iso(entry)
|
||||
if published_iso:
|
||||
try:
|
||||
published_dt = datetime.fromisoformat(published_iso)
|
||||
age = datetime.now(timezone.utc) - published_dt
|
||||
if age > timedelta(days=_max_age_days):
|
||||
continue
|
||||
except Exception:
|
||||
pass # can't parse date → allow through
|
||||
|
||||
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
|
||||
link = _resolve_google_redirect(link)
|
||||
# Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
|
||||
|
|
|
|||
|
|
@ -374,6 +374,15 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
|||
# Release the reserved slot so it's available for the next article
|
||||
from .scheduler import release_publish_slot
|
||||
release_publish_slot(article_id)
|
||||
# Clean up any stale WP draft from a previous pipeline run
|
||||
stale = get_article_by_id(article_id)
|
||||
if stale and stale.get("wp_post_id"):
|
||||
try:
|
||||
from .wordpress import delete_wp_post
|
||||
delete_wp_post(int(stale["wp_post_id"]))
|
||||
logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
|
||||
except Exception as del_exc:
|
||||
logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
|
||||
stats.quality_gate_rejected += 1
|
||||
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
|
||||
# Individual Telegram notification for quality gate rejection
|
||||
|
|
|
|||
|
|
@ -165,6 +165,72 @@ def _find_next_free_slot(
|
|||
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
|
||||
|
||||
|
||||
def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
    """Return booked scheduling slots (DB + WP) from today through today + N days.

    Args:
        lookahead_days: Number of days after today to include. The window is
            inclusive on both ends, i.e. ``today .. today + lookahead_days``.

    Returns:
        A list of slot dicts sorted by ``(date, hour)``. Each dict has the keys
        ``date`` (ISO string), ``hour``, ``formatted``, ``source`` (``"db"`` or
        ``"wordpress"``), ``article_id``, ``article_title``,
        ``article_status``, ``wp_post_id`` and ``wp_post_url``. Slots that
        exist only in WordPress carry ``None`` for the article/WP fields and a
        placeholder title.

    Rows or WordPress entries whose date cannot be parsed or formatted are
    skipped rather than failing the whole overview.
    """
    # Local import: only ``date`` and ``datetime`` are known to be available
    # at module level from the code in this function.
    from datetime import timedelta

    today = _today_cet()
    last_day = today + timedelta(days=lookahead_days)

    # Slots booked in the local DB (lower bound applied in SQL).
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
            AND scheduled_publish_at >= ?
            AND status NOT IN ('error', 'no_image')
            ORDER BY scheduled_publish_at
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchall()

    db_slots: dict[tuple[str, int], dict] = {}
    for row in rows:
        try:
            dt = datetime.fromisoformat(row["scheduled_publish_at"])
            slot_day = dt.date()
            if slot_day > last_day:
                # Beyond the requested lookahead window.
                continue
            day_iso = slot_day.isoformat()
            db_slots[(day_iso, dt.hour)] = {
                "date": day_iso,
                "hour": dt.hour,
                "formatted": _format_slot(slot_day, dt.hour),
                "source": "db",
                "article_id": row["id"],
                "article_title": row["title"],
                "article_status": row["status"],
                "wp_post_id": row["wp_post_id"],
                "wp_post_url": row["wp_post_url"],
            }
        except Exception:
            # Best effort: one malformed row must not break the overview.
            continue

    # Slots occupied in WordPress but not in the local DB.
    wp_occupied = _fetch_wp_occupied_slots()
    wp_only: list[dict] = []
    for d_str, h in sorted(wp_occupied):
        if (d_str, h) in db_slots:
            continue
        try:
            d = date.fromisoformat(d_str)
            if today <= d <= last_day:
                wp_only.append({
                    "date": d_str,
                    "hour": h,
                    "formatted": _format_slot(d, h),
                    "source": "wordpress",
                    "article_id": None,
                    "article_title": "(WP-Beitrag außerhalb Pipeline)",
                    "article_status": None,
                    "wp_post_id": None,
                    "wp_post_url": None,
                })
        except Exception:
            # Best effort: skip WordPress entries with an unusable date.
            continue

    all_slots = list(db_slots.values()) + wp_only
    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
    return all_slots
|
||||
|
||||
|
||||
def release_publish_slot(article_id: int) -> None:
|
||||
"""Clear a previously reserved slot (e.g. when article is rejected after slot assignment)."""
|
||||
with get_conn() as conn:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue