diff --git a/backend/app/admin_ui.py b/backend/app/admin_ui.py index 689efce..26085ff 100644 --- a/backend/app/admin_ui.py +++ b/backend/app/admin_ui.py @@ -929,3 +929,75 @@ def admin_transition_article(request: Request, article_id: int, target_status: s update_article_status(article_id, target_internal, actor=user, note=note or None) return _dashboard_redirect(msg=f"Artikel #{article_id}: {current_ui} -> {target_ui}") return _dashboard_redirect(msg=f"Ungueltiger Statuswechsel fuer Artikel #{article_id}", msg_type="error") + + +@router.post("/admin/articles/{article_id}/retry") +def admin_retry_article(request: Request, article_id: int): + """Reset a failed article to 'new' so the pipeline picks it up on next run.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + article = get_article_by_id(article_id) + if not article: + return _dashboard_redirect(msg=f"Artikel #{article_id} nicht gefunden", msg_type="error") + + from .scheduler import release_publish_slot + release_publish_slot(article_id) + update_article_status(article_id, "new", actor=user, note="Manuell zurückgesetzt für erneuten Pipeline-Versuch") + return _dashboard_redirect( + msg=f"Artikel #{article_id} wurde auf 'neu' zurückgesetzt und wird beim nächsten Pipeline-Lauf verarbeitet", + status_filter="close", + ) + + +@router.get("/admin/schedule", response_class=HTMLResponse) +def admin_schedule(request: Request): + """Schedule overview: all booked slots from DB and WordPress.""" + user = _admin_user(request) + if not user: + return RedirectResponse(url="/admin/login", status_code=303) + + from .scheduler import get_schedule_overview, _preferred_hours, _today_cet + from datetime import timedelta + + slots = get_schedule_overview(lookahead_days=60) + today = _today_cet() + hours = _preferred_hours() + + # Build a calendar grid: for each day in the next 60 days, show each preferred hour slot + booked: dict[tuple[str, int], dict] = {(s["date"], s["hour"]): s for s in slots} + calendar_days = [] + for offset in range(0, 61): + d = today + timedelta(days=offset) + d_str = d.isoformat() + day_slots = [] + for h in hours: + key = (d_str, h) + day_slots.append({ + "hour": h, + "booked": key in booked, + "slot": booked.get(key), + }) + calendar_days.append({ + "date": d_str, + "date_fmt": d.strftime("%d.%m.%Y"), + "weekday": ["Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"][d.weekday()], + "slots": day_slots, + "any_booked": any(s["booked"] for s in day_slots), + }) + + return templates.TemplateResponse( + request, + "admin_schedule.html", + { + "request": request, + "title": "Veröffentlichungsplan", + "user": user, + "slots": slots, + "calendar_days": calendar_days, + "hours": hours, + "flash_msg": request.query_params.get("msg", ""), + "flash_type": request.query_params.get("type", "success"), + }, + ) diff --git a/backend/app/config.py b/backend/app/config.py index 713669e..24c3902 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -48,6 +48,7 @@ class Settings(BaseSettings): pipeline_publish_hours: str = "9,14" # comma-separated preferred publish hours (CET) pipeline_min_words_raw: int = 120 # minimum words in raw content before rewrite (else reject) pipeline_min_words_rewritten: int = 150 # minimum words in rewritten content (else reject) + pipeline_max_article_age_days: int = 7 # skip articles older than N days during ingestion (0 = no limit) @lru_cache(maxsize=1) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 30140ca..391af92 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -1,13 +1,15 @@ from __future__ import annotations from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone import hashlib import json import re import time from typing import Any from urllib.parse import unquote, urlencode, urlparse, parse_qs +import urllib.error +import urllib.request as _urllib_req import feedparser @@ -119,6 +121,26 @@ def _normalize_tokens(text: str) -> set[str]: return {token for token in normalized.split() if len(token) >= 4} +def _probe_image_url(url: str, timeout: int = 5) -> bool: + """Return True if URL responds without a 4xx/5xx error (HEAD request). + + Returns True on network/connection errors so that a flaky server does not + cause a valid image to be silently dropped. + """ + try: + req = _urllib_req.Request( + url, + method="HEAD", + headers={"User-Agent": "Mozilla/5.0 (compatible; rss-news/1.0)"}, + ) + with _urllib_req.urlopen(req, timeout=timeout) as resp: + return resp.status < 400 + except urllib.error.HTTPError as exc: + return exc.code < 400 # 3xx redirects are OK; 4xx/5xx are not + except Exception: + return True # network error → don't filter, let WP try later + + def _rank_image_candidates(source_url: str, title: str, images: list[str]) -> list[dict[str, Any]]: source_host = (urlparse(source_url).hostname or "").lower() is_presseportal = "presseportal.de" in source_host @@ -184,10 +206,25 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_ deduped.append(image) ranked = _rank_image_candidates(source_url, title, deduped) - kept = [item["url"] for item in ranked if item["score"] > 0][:max_keep] - if not kept and ranked: - kept = [ranked[0]["url"]] - primary = kept[0] if kept else None + candidates = [item["url"] for item in ranked if item["score"] > -100] + + # Probe top candidates (max 4) to skip definitively broken URLs (HTTP 4xx). + # Network errors are treated as OK to avoid false negatives on flaky servers. + primary = None + kept: list[str] = [] + for url in candidates[:4]: + if _probe_image_url(url): + if primary is None: + primary = url + kept.append(url) + if len(kept) >= max_keep: + break + + # Fallback: if all probes failed with network errors, use best candidate anyway + if not kept and candidates: + primary = candidates[0] + kept = candidates[:max_keep] + return kept, primary, ranked @@ -265,12 +302,27 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: feed_entries_seen = 0 feed_upserts = 0 + from .config import get_settings as _get_settings + _max_age_days = _get_settings().pipeline_max_article_age_days for entry in _parsed_get(parsed, "entries", []): entries_seen += 1 feed_entries_seen += 1 link = entry.get("link") if not link: continue + + # Age filter: skip articles older than max_age_days (0 = no limit) + if _max_age_days > 0: + published_iso = _entry_published_iso(entry) + if published_iso: + try: + published_dt = datetime.fromisoformat(published_iso) + age = datetime.now(timezone.utc) - published_dt + if age > timedelta(days=_max_age_days): + continue + except Exception: + pass # can't parse date → allow through + # Resolve Google redirect URLs (google.com/url?...&url=&...) link = _resolve_google_redirect(link) # Normalize AMP/tracking params (e.g. ?outputType=valid_amp) diff --git a/backend/app/pipeline.py b/backend/app/pipeline.py index d9766ae..93a251b 100644 --- a/backend/app/pipeline.py +++ b/backend/app/pipeline.py @@ -374,6 +374,15 @@ def _process_article(article: dict[str, Any], stats: PipelineStats, settings: An # Release the reserved slot so it's available for the next article from .scheduler import release_publish_slot release_publish_slot(article_id) + # Clean up any stale WP draft from a previous pipeline run + stale = get_article_by_id(article_id) + if stale and stale.get("wp_post_id"): + try: + from .wordpress import delete_wp_post + delete_wp_post(int(stale["wp_post_id"])) + logger.info("Artikel #%d: veralteten WP-Draft #%s gelöscht", article_id, stale["wp_post_id"]) + except Exception as del_exc: + logger.warning("Artikel #%d: WP-Draft konnte nicht gelöscht werden: %s", article_id, del_exc) stats.quality_gate_rejected += 1 logger.info("Artikel #%d wegen Qualitätsprüfung abgelehnt: %s", article_id, exc) # Individual Telegram notification for quality gate rejection diff --git a/backend/app/scheduler.py b/backend/app/scheduler.py index ff5cecf..8f8d498 100644 --- a/backend/app/scheduler.py +++ b/backend/app/scheduler.py @@ -165,6 +165,72 @@ def _find_next_free_slot( return tomorrow, _preferred_hours()[0] if _preferred_hours() else 9 +def get_schedule_overview(lookahead_days: int = 60) -> list[dict]: + """Return all booked scheduling slots (DB + WP) for the next N days, sorted by date.""" + today = _today_cet() + hours = _preferred_hours() + + # Slots booked in local DB + with get_conn() as conn: + rows = conn.execute( + """ + SELECT id, title, status, wp_post_id, wp_post_url, scheduled_publish_at + FROM articles + WHERE scheduled_publish_at IS NOT NULL + AND scheduled_publish_at >= ? + AND status NOT IN ('error', 'no_image') + ORDER BY scheduled_publish_at + """, + (today.isoformat() + "T00:00:00",), + ).fetchall() + + db_slots: dict[tuple[str, int], dict] = {} + for row in rows: + try: + dt = datetime.fromisoformat(row["scheduled_publish_at"]) + key = (dt.date().isoformat(), dt.hour) + db_slots[key] = { + "date": dt.date().isoformat(), + "hour": dt.hour, + "formatted": _format_slot(dt.date(), dt.hour), + "source": "db", + "article_id": row["id"], + "article_title": row["title"], + "article_status": row["status"], + "wp_post_id": row["wp_post_id"], + "wp_post_url": row["wp_post_url"], + } + except Exception: + pass + + # Slots occupied in WordPress but not in local DB + wp_occupied = _fetch_wp_occupied_slots() + wp_only: list[dict] = [] + for d_str, h in sorted(wp_occupied): + if (d_str, h) in db_slots: + continue + try: + d = date.fromisoformat(d_str) + if d >= today: + wp_only.append({ + "date": d_str, + "hour": h, + "formatted": _format_slot(d, h), + "source": "wordpress", + "article_id": None, + "article_title": "(WP-Beitrag außerhalb Pipeline)", + "article_status": None, + "wp_post_id": None, + "wp_post_url": None, + }) + except Exception: + pass + + all_slots = list(db_slots.values()) + wp_only + all_slots.sort(key=lambda s: (s["date"], s["hour"])) + return all_slots + + def release_publish_slot(article_id: int) -> None: """Clear a previously reserved slot (e.g. when article is rejected after slot assignment).""" with get_conn() as conn: diff --git a/backend/templates/admin_dashboard.html b/backend/templates/admin_dashboard.html index 15f3daf..67738c7 100644 --- a/backend/templates/admin_dashboard.html +++ b/backend/templates/admin_dashboard.html @@ -13,6 +13,7 @@

Angemeldet als {{ user }}

+ Veröffentlichungsplan Connectivity Check
@@ -330,6 +331,11 @@ keine Aktion {% endif %}
+ {% if a.status_ui == 'close' %} +
+ +
+ {% endif %} {% endfor %} diff --git a/backend/templates/admin_schedule.html b/backend/templates/admin_schedule.html new file mode 100644 index 0000000..f585b00 --- /dev/null +++ b/backend/templates/admin_schedule.html @@ -0,0 +1,133 @@ + + + + + + {{ title }} + + + + +
+
+

rss-news Veröffentlichungsplan

+

Angemeldet als {{ user }}

+
+
+ Dashboard + Connectivity +
+ +
+
+
+ +
+ {% if flash_msg %} +
+ {{ flash_msg }} +
+ {% endif %} + +
+

Slot-Übersicht (nächste 60 Tage)

+
+ 📅 Belegte Slots gesamt: {{ slots|length }} + 🗄️ Aus Pipeline-DB: {{ slots|selectattr('source', 'eq', 'db')|list|length }} + 🌐 Nur in WordPress: {{ slots|selectattr('source', 'eq', 'wordpress')|list|length }} +
+ + + + + {% for h in hours %} + + {% endfor %} + + + + {% for day in calendar_days %} + {% if day.any_booked %} + + + {% for s in day.slots %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
Tag{{ "%02d:00 Uhr"|format(h) }}
{{ day.weekday }} {{ day.date_fmt }} + {% if s.booked %} + {% set info = s.slot %} + {% if info.source == 'db' %} + + DB +
+ {% if info.article_id %} + + {{ (info.article_title or "Artikel")[:50] }}{% if (info.article_title or "")|length > 50 %}…{% endif %} + + {% endif %} +
Status: {{ info.article_status }} + {% if info.wp_post_url %} +
WP öffnen + {% endif %} +
+ {% else %} + ⚠️ + WP +
{{ info.article_title }}
+ {% endif %} + {% else %} + frei + {% endif %} +
+ {% if not slots %} +

Keine geplanten Beiträge in den nächsten 60 Tagen.

+ {% endif %} +
+ +
+

Alle belegten Slots (Liste)

+ + + + + + {% for s in slots %} + + + + + + + + {% endfor %} + +
Datum/ZeitQuelleArtikelStatusWordPress
{{ s.formatted }} + {% if s.source == 'db' %}Pipeline-DB + {% else %}WordPress{% endif %} + + {% if s.article_id %} + {{ (s.article_title or "")[:60] }} + {% else %} + {{ s.article_title or "-" }} + {% endif %} + {{ s.article_status or "-" }} + {% if s.wp_post_url %} + Draft öffnen + {% else %}-{% endif %} +
+
+
+ +