feat(pipeline): article age filter, image URL validation, schedule UI, retry button
1. Article age filter (ingestion.py + config.py):
- New setting pipeline_max_article_age_days=7 (0 = no limit)
- Skip RSS entries older than N days before expensive extract_article()
- Prevents old articles from Google Alerts re-entering pipeline
2. Image URL pre-validation (ingestion.py):
- HEAD request probe for each primary image candidate during ingestion
- Falls back to the next-best candidate if the primary returns an HTTP error status (4xx/5xx)
- Network errors treated as OK to avoid false negatives on flaky servers
3. Stale WP draft cleanup (pipeline.py):
- Quality gate rejections now delete any pre-existing WP draft (wp_post_id)
- Prevents orphaned drafts when re-running articles that previously had drafts
4. Schedule overview UI (scheduler.py + admin_ui.py + admin_schedule.html):
- New /admin/schedule page showing calendar grid of all booked slots
- Distinguishes Pipeline-DB slots from WordPress-only slots
- Link added to dashboard navigation
5. Retry for failed articles (admin_ui.py + admin_dashboard.html):
- New POST /admin/articles/{id}/retry endpoint: resets to 'new', releases slot
- '🔄 Wiederholen' button shown in dashboard for all 'close' (error) articles
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
cf2d826c8a
commit
8676ace102
7 changed files with 344 additions and 5 deletions
|
|
@ -929,3 +929,75 @@ def admin_transition_article(request: Request, article_id: int, target_status: s
|
|||
update_article_status(article_id, target_internal, actor=user, note=note or None)
|
||||
return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}")
|
||||
return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error")
|
||||
|
||||
|
||||
@router.post("/admin/articles/{article_id}/retry")
def admin_retry_article(request: Request, article_id: int):
    """Put an article back into status 'new' so it can be processed again.

    Unauthenticated requests are redirected to the login page. If no article
    with ``article_id`` exists, the dashboard is shown with an error message.
    Otherwise the article's publish slot is released, its status is set to
    ``"new"`` (recorded with the acting admin and a fixed note) and the
    dashboard is shown filtered to ``close`` with a confirmation message.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    # Only existence matters here; the article record itself is not used.
    if not get_article_by_id(article_id):
        return _dashboard_redirect(
            msg=f"Artikel #{article_id} nicht gefunden",
            msg_type="error",
        )

    # Imported at call time, as in the rest of this module's scheduler usage.
    from .scheduler import release_publish_slot

    # Free the slot first, then reset the status.
    release_publish_slot(article_id)

    reset_note = "Manuell zurückgesetzt für erneuten Pipeline-Versuch"
    update_article_status(article_id, "new", actor=user, note=reset_note)

    confirmation = (
        f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet"
    )
    return _dashboard_redirect(msg=confirmation, status_filter="close")
|
||||
|
||||
|
||||
@router.get("/admin/schedule", response_class=HTMLResponse)
def admin_schedule(request: Request):
    """Render the schedule overview page (booked slots from DB and WordPress).

    Unauthenticated requests are redirected to the login page. Otherwise the
    booked slots are fetched with a 60-day look-ahead and arranged into a
    calendar grid: one row per day (today plus the following 60 days), one
    cell per preferred publish hour. Both the flat slot list and the grid are
    handed to the ``admin_schedule.html`` template.
    """
    user = _admin_user(request)
    if not user:
        return RedirectResponse(url="/admin/login", status_code=303)

    from .scheduler import get_schedule_overview, _preferred_hours, _today_cet
    from datetime import timedelta

    # German two-letter weekday labels, indexed by date.weekday() (Monday = 0).
    weekday_labels = ("Mo", "Di", "Mi", "Do", "Fr", "Sa", "So")

    slots = get_schedule_overview(lookahead_days=60)
    today = _today_cet()
    hours = _preferred_hours()

    # Index booked slots by (ISO date, hour) for constant-time cell lookup.
    booked: dict[tuple[str, int], dict] = {}
    for entry in slots:
        booked[(entry["date"], entry["hour"])] = entry

    calendar_days = []
    for offset in range(61):
        day = today + timedelta(days=offset)
        iso_day = day.isoformat()

        # One cell per preferred hour; "slot" is None for free cells.
        cells = [
            {
                "hour": hour,
                "booked": (iso_day, hour) in booked,
                "slot": booked.get((iso_day, hour)),
            }
            for hour in hours
        ]

        row = {
            "date": iso_day,
            "date_fmt": day.strftime("%d.%m.%Y"),
            "weekday": weekday_labels[day.weekday()],
            "slots": cells,
            # The template only renders days that have at least one booking.
            "any_booked": any(cell["booked"] for cell in cells),
        }
        calendar_days.append(row)

    query = request.query_params
    context = {
        "request": request,
        "title": "Veröffentlichungsplan",
        "user": user,
        "slots": slots,
        "calendar_days": calendar_days,
        "hours": hours,
        "flash_msg": query.get("msg", ""),
        "flash_type": query.get("type", "success"),
    }
    return templates.TemplateResponse(request, "admin_schedule.html", context)
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ class Settings(BaseSettings):
|
|||
pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET)
|
||||
pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject)
|
||||
pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject)
|
||||
pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,15 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
from urllib.parse import unquote, urlencode, urlparse, parse_qs
|
||||
import urllib.error
|
||||
import urllib.request as _urllib_req
|
||||
|
||||
import feedparser
|
||||
|
||||
|
|
@ -119,6 +121,26 @@ def _normalize_tokens(text: str) -> set[str]:
|
|||
return {token for token in normalized.split() if len(token) >= 4}
|
||||
|
||||
|
||||
def _probe_image_url(url: str, timeout: int = 5) -> bool:
|
||||
"""Return True if URL responds without a 4xx/5xx error (HEAD request).
|
||||
|
||||
Returns True on network/connection errors so that a flaky server does not
|
||||
cause a valid image to be silently dropped.
|
||||
"""
|
||||
try:
|
||||
req = _urllib_req.Request(
|
||||
url,
|
||||
method="HEAD",
|
||||
headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"},
|
||||
)
|
||||
with _urllib_req.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status < 400
|
||||
except urllib.error.HTTPError as exc:
|
||||
return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not
|
||||
except Exception:
|
||||
return True # network error → don't filter, let WP try later
|
||||
|
||||
|
||||
def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]:
|
||||
source_host = (urlparse(source_url).hostname or "").lower()
|
||||
is_presseportal = "presseportal.de" in source_host
|
||||
|
|
@ -184,10 +206,25 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_
|
|||
deduped.append(image)
|
||||
|
||||
ranked = _rank_image_candidates(source_url, title, deduped)
|
||||
kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep]
|
||||
if not kept and ranked:
|
||||
kept = [ranked[0]["url"]]
|
||||
primary = kept[0] if kept else None
|
||||
candidates = [item["url"] for item in ranked if item["score"] > -100]
|
||||
|
||||
# Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx).
|
||||
# Network errors are treated as OK to avoid false negatives on flaky servers.
|
||||
primary = None
|
||||
kept: list[str] = []
|
||||
for url in candidates[:4]:
|
||||
if _probe_image_url(url):
|
||||
if primary is None:
|
||||
primary = url
|
||||
kept.append(url)
|
||||
if len(kept) >= max_keep:
|
||||
break
|
||||
|
||||
# Fallback: if every probed candidate was rejected (HTTP error status), use the best candidate anyway
|
||||
if not kept and candidates:
|
||||
primary = candidates[0]
|
||||
kept = candidates[:max_keep]
|
||||
|
||||
return kept, primary, ranked
|
||||
|
||||
|
||||
|
|
@ -265,12 +302,27 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
|||
|
||||
feed_entries_seen = 0
|
||||
feed_upserts = 0
|
||||
from .config import get_settings as _get_settings
|
||||
_max_age_days = _get_settings().pipeline_max_article_age_days
|
||||
for entry in _parsed_get(parsed, "entries", []):
|
||||
entries_seen += 1
|
||||
feed_entries_seen += 1
|
||||
link = entry.get("link")
|
||||
if not link:
|
||||
continue
|
||||
|
||||
# Age filter: skip articles older than max_age_days (0 = no limit)
|
||||
if _max_age_days > 0:
|
||||
published_iso = _entry_published_iso(entry)
|
||||
if published_iso:
|
||||
try:
|
||||
published_dt = datetime.fromisoformat(published_iso)
|
||||
age = datetime.now(timezone.utc) - published_dt
|
||||
if age > timedelta(days=_max_age_days):
|
||||
continue
|
||||
except Exception:
|
||||
pass # can't parse date → allow through
|
||||
|
||||
# Resolve Google redirect URLs (google.com/url?...&url=<actual_url>&...)
|
||||
link = _resolve_google_redirect(link)
|
||||
# Normalize AMP/tracking params (e.g. ?outputType=valid_amp)
|
||||
|
|
|
|||
|
|
@ -374,6 +374,15 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An
|
|||
# Release the reserved slot so it's available for the next article
|
||||
from .scheduler import release_publish_slot
|
||||
release_publish_slot(article_id)
|
||||
# Clean up any stale WP draft from a previous pipeline run
|
||||
stale = get_article_by_id(article_id)
|
||||
if stale and stale.get("wp_post_id"):
|
||||
try:
|
||||
from .wordpress import delete_wp_post
|
||||
delete_wp_post(int(stale["wp_post_id"]))
|
||||
logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"])
|
||||
except Exception as del_exc:
|
||||
logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc)
|
||||
stats.quality_gate_rejected += 1
|
||||
logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc)
|
||||
# Individual Telegram notification for quality gate rejection
|
||||
|
|
|
|||
|
|
@ -165,6 +165,72 @@ def _find_next_free_slot(
|
|||
return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9
|
||||
|
||||
|
||||
def get_schedule_overview(lookahead_days: int = 60) -> list[dict]:
    """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.

    The window runs from today up to and including ``today + lookahead_days``.
    A negative ``lookahead_days`` is treated as 0 (today only).

    Each slot is a dict with the keys ``date`` (ISO date string), ``hour``,
    ``formatted``, ``source`` (``"db"`` or ``"wordpress"``), ``article_id``,
    ``article_title``, ``article_status``, ``wp_post_id`` and ``wp_post_url``.
    Slots that are occupied in WordPress but have no matching DB entry get
    ``source="wordpress"`` and ``None`` for the article/WP fields.

    Args:
        lookahead_days: Number of days after today to include.

    Returns:
        Slot dicts sorted by ``(date, hour)``.
    """
    # Local import so this function does not depend on the module's import block.
    from datetime import timedelta

    today = _today_cet()
    last_day = today + timedelta(days=max(lookahead_days, 0))

    # Slots booked in local DB
    with get_conn() as conn:
        rows = conn.execute(
            """
            SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at
            FROM articles
            WHERE scheduled_publish_at IS NOT NULL
            AND scheduled_publish_at >= ?
            AND status NOT IN ('error', 'no_image')
            ORDER BY scheduled_publish_at
            """,
            (today.isoformat() + "T00:00:00",),
        ).fetchall()

    db_slots: dict[tuple[str, int], dict] = {}
    for row in rows:
        try:
            dt = datetime.fromisoformat(row["scheduled_publish_at"])
            slot_day = dt.date()
            if slot_day > last_day:
                continue  # beyond the requested look-ahead window
            key = (slot_day.isoformat(), dt.hour)
            db_slots[key] = {
                "date": slot_day.isoformat(),
                "hour": dt.hour,
                "formatted": _format_slot(slot_day, dt.hour),
                "source": "db",
                "article_id": row["id"],
                "article_title": row["title"],
                "article_status": row["status"],
                "wp_post_id": row["wp_post_id"],
                "wp_post_url": row["wp_post_url"],
            }
        except Exception:
            # Best effort: a row with an unparsable timestamp must not break
            # the whole overview, so it is skipped.
            continue

    # Slots occupied in WordPress but not in local DB
    wp_occupied = _fetch_wp_occupied_slots()
    wp_only: list[dict] = []
    for d_str, h in sorted(wp_occupied):
        if (d_str, h) in db_slots:
            continue
        try:
            d = date.fromisoformat(d_str)
            if today <= d <= last_day:
                wp_only.append({
                    "date": d_str,
                    "hour": h,
                    "formatted": _format_slot(d, h),
                    "source": "wordpress",
                    "article_id": None,
                    "article_title": "(WP-Beitrag außerhalb Pipeline)",
                    "article_status": None,
                    "wp_post_id": None,
                    "wp_post_url": None,
                })
        except Exception:
            # Best effort: skip WordPress entries whose date cannot be parsed.
            continue

    all_slots = list(db_slots.values()) + wp_only
    all_slots.sort(key=lambda s: (s["date"], s["hour"]))
    return all_slots
|
||||
|
||||
|
||||
def release_publish_slot(article_id: int) -> None:
|
||||
"""Clear a previously reserved slot (e.g. when article is rejected after slot assignment)."""
|
||||
with get_conn() as conn:
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/schedule">Veröffentlichungsplan</a>
|
||||
<a class="linkbtn" href="/admin/connectivity">Connectivity Check</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
|
|
@ -330,6 +331,11 @@
|
|||
<span class="subtle">keine Aktion</span>
|
||||
{% endif %}
|
||||
</form>
|
||||
{% if a.status_ui == 'close' %}
|
||||
<form method="post" action="/admin/articles/{{ a.id }}/retry" class="inline" style="margin-top:4px;">
|
||||
<button type="submit" title="Artikel auf 'neu' zurücksetzen – wird beim nächsten Pipeline-Lauf erneut verarbeitet">🔄 Wiederholen</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
|
|
|
|||
133
backend/templates/admin_schedule.html
Normal file
133
backend/templates/admin_schedule.html
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
<!doctype html>
|
||||
<html lang="de">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>{{ title }}</title>
|
||||
<link rel="stylesheet" href="/admin/static/admin.css" />
|
||||
<style>
|
||||
.schedule-table td, .schedule-table th { padding: 6px 10px; }
|
||||
.slot-free { color: #aaa; font-style: italic; }
|
||||
.slot-booked-db { color: #1a7a1a; font-weight: bold; }
|
||||
.slot-booked-wp { color: #b35a00; font-weight: bold; }
|
||||
.badge-db { background: #d4edda; color: #155724; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
|
||||
.badge-wp { background: #fff3cd; color: #856404; padding: 2px 6px; border-radius: 4px; font-size: 0.75em; }
|
||||
.summary-bar { display: flex; gap: 1.5rem; margin-bottom: 1rem; font-size: 0.95em; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>rss-news Veröffentlichungsplan</h1>
|
||||
<p>Angemeldet als <strong>{{ user }}</strong></p>
|
||||
</div>
|
||||
<div class="row">
|
||||
<a class="linkbtn" href="/admin/dashboard">Dashboard</a>
|
||||
<a class="linkbtn" href="/admin/connectivity">Connectivity</a>
|
||||
<form method="post" action="/admin/logout">
|
||||
<button type="submit" class="secondary">Logout</button>
|
||||
</form>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="container">
|
||||
{% if flash_msg %}
|
||||
<section class="card flash {{ 'flash-error' if flash_type == 'error' else 'flash-success' }}">
|
||||
{{ flash_msg }}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="card">
|
||||
<h2>Slot-Übersicht (nächste 60 Tage)</h2>
|
||||
<div class="summary-bar">
|
||||
<span>📅 Belegte Slots gesamt: <strong>{{ slots|length }}</strong></span>
|
||||
<span>🗄️ Aus Pipeline-DB: <strong>{{ slots|selectattr('source', 'eq', 'db')|list|length }}</strong></span>
|
||||
<span>🌐 Nur in WordPress: <strong>{{ slots|selectattr('source', 'eq', 'wordpress')|list|length }}</strong></span>
|
||||
</div>
|
||||
<table class="schedule-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Tag</th>
|
||||
{% for h in hours %}
|
||||
<th>{{ "%02d:00 Uhr"|format(h) }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for day in calendar_days %}
|
||||
{% if day.any_booked %}
|
||||
<tr>
|
||||
<td><strong>{{ day.weekday }}</strong> {{ day.date_fmt }}</td>
|
||||
{% for s in day.slots %}
|
||||
<td>
|
||||
{% if s.booked %}
|
||||
{% set info = s.slot %}
|
||||
{% if info.source == 'db' %}
|
||||
<span class="slot-booked-db">✅</span>
|
||||
<span class="badge-db">DB</span>
|
||||
<div style="font-size:0.85em;">
|
||||
{% if info.article_id %}
|
||||
<a href="/admin/articles/{{ info.article_id }}">
|
||||
{{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %}
|
||||
</a>
|
||||
{% endif %}
|
||||
<br /><span class="subtle">Status: {{ info.article_status }}</span>
|
||||
{% if info.wp_post_url %}
|
||||
<br /><a href="{{ info.wp_post_url }}" target="_blank" rel="noopener">WP öffnen</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% else %}
|
||||
<span class="slot-booked-wp">⚠️</span>
|
||||
<span class="badge-wp">WP</span>
|
||||
<div style="font-size:0.85em;">{{ info.article_title }}</div>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<span class="slot-free">frei</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% if not slots %}
|
||||
<p class="subtle">Keine geplanten Beiträge in den nächsten 60 Tagen.</p>
|
||||
{% endif %}
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Alle belegten Slots (Liste)</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Datum/Zeit</th><th>Quelle</th><th>Artikel</th><th>Status</th><th>WordPress</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for s in slots %}
|
||||
<tr>
|
||||
<td>{{ s.formatted }}</td>
|
||||
<td>
|
||||
{% if s.source == 'db' %}<span class="badge-db">Pipeline-DB</span>
|
||||
{% else %}<span class="badge-wp">WordPress</span>{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if s.article_id %}
|
||||
<a href="/admin/articles/{{ s.article_id }}">{{ (s.article_title or "")[:60] }}</a>
|
||||
{% else %}
|
||||
{{ s.article_title or "-" }}
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ s.article_status or "-" }}</td>
|
||||
<td>
|
||||
{% if s.wp_post_url %}
|
||||
<a href="{{ s.wp_post_url }}" target="_blank" rel="noopener">Draft öffnen</a>
|
||||
{% else %}-{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Add table
Add a link
Reference in a new issue