From 93f52f72b948aa997491cce0ae7a33e89197a88c Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 14:51:36 +0100 Subject: [PATCH] fix(ingestion): preserve article workflow data and skip closed items on re-import --- backend/app/ingestion.py | 111 +++++++++++++++++++++-------- backend/app/repositories.py | 7 ++ backend/tests/test_ingestion.py | 119 +++++++++++++++++++++++++++++++- 3 files changed, 206 insertions(+), 31 deletions(-) diff --git a/backend/app/ingestion.py b/backend/app/ingestion.py index 872a1b0..1ba6b6c 100644 --- a/backend/app/ingestion.py +++ b/backend/app/ingestion.py @@ -16,6 +16,7 @@ from .repositories import ( ArticleUpsert, RunCreate, create_run, + find_existing_article_for_upsert, finish_run, get_feed_by_id, list_enabled_feeds, @@ -135,6 +136,20 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_ return kept, primary, ranked +def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str: + meta: dict[str, Any] = {} + if existing_meta_json: + try: + parsed = json.loads(existing_meta_json) + if isinstance(parsed, dict): + meta = parsed + except Exception: + meta = {} + meta["attribution"] = attribution + meta["extraction"] = extraction_meta + return json.dumps(meta, ensure_ascii=False) + + def run_ingestion(feed_id: int | None = None) -> IngestionStats: run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started")) feeds_processed = 0 @@ -268,37 +283,73 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats: "total_candidates": len(extracted.images), "ranked": ranked_images, } - article_id = upsert_article( - ArticleUpsert( - feed_id=int(feed["id"]), - source_article_id=entry.get("id") or entry.get("guid"), - source_hash=source_hash, - title=final_title, - source_url=link, - canonical_url=final_canonical, - published_at=_entry_published_iso(entry), - author=final_author, - summary=final_summary, - content_raw=final_content_raw, - content_rewritten=None, - image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, - press_contact=extracted.press_contact, - source_name_snapshot=feed.get("source_name"), - source_terms_url_snapshot=feed.get("source_terms_url"), - source_license_name_snapshot=feed.get("source_license_name"), - legal_checked=False, - legal_checked_at=None, - legal_note=None, - wp_post_id=None, - wp_post_url=None, - publish_attempts=0, - publish_last_error=None, - published_to_wp_at=None, - word_count=len((final_content_raw or "").split()), - status="new", - meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), - ) + base_payload = ArticleUpsert( + feed_id=int(feed["id"]), + source_article_id=entry.get("id") or entry.get("guid"), + source_hash=source_hash, + title=final_title, + source_url=link, + canonical_url=final_canonical, + published_at=_entry_published_iso(entry), + author=final_author, + summary=final_summary, + content_raw=final_content_raw, + content_rewritten=None, + image_urls_json=json.dumps(selected_images, ensure_ascii=False) if selected_images else None, + press_contact=extracted.press_contact, + source_name_snapshot=feed.get("source_name"), + source_terms_url_snapshot=feed.get("source_terms_url"), + source_license_name_snapshot=feed.get("source_license_name"), + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=None, + wp_post_url=None, + publish_attempts=0, + publish_last_error=None, + published_to_wp_at=None, + word_count=len((final_content_raw or "").split()), + status="new", + meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False), ) + existing = find_existing_article_for_upsert(base_payload) + if existing and existing.get("status") == "error": + # Explicitly closed article: ignore on subsequent ingestion runs. + continue + + payload = base_payload + if existing: + payload = ArticleUpsert( + feed_id=base_payload.feed_id, + source_article_id=base_payload.source_article_id, + source_hash=base_payload.source_hash, + title=base_payload.title, + source_url=base_payload.source_url, + canonical_url=base_payload.canonical_url, + published_at=base_payload.published_at, + author=base_payload.author, + summary=base_payload.summary, + content_raw=base_payload.content_raw, + content_rewritten=existing.get("content_rewritten"), + image_urls_json=base_payload.image_urls_json, + press_contact=base_payload.press_contact or existing.get("press_contact"), + source_name_snapshot=base_payload.source_name_snapshot, + source_terms_url_snapshot=base_payload.source_terms_url_snapshot, + source_license_name_snapshot=base_payload.source_license_name_snapshot, + legal_checked=bool(int(existing.get("legal_checked", 0))), + legal_checked_at=existing.get("legal_checked_at"), + legal_note=existing.get("legal_note"), + wp_post_id=existing.get("wp_post_id"), + wp_post_url=existing.get("wp_post_url"), + publish_attempts=int(existing.get("publish_attempts", 0)), + publish_last_error=existing.get("publish_last_error"), + published_to_wp_at=existing.get("published_to_wp_at"), + word_count=base_payload.word_count, + status=existing.get("status") or "new", + meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta), + ) + + article_id = upsert_article(payload) if article_id: articles_upserted += 1 feed_upserts += 1 diff --git a/backend/app/repositories.py b/backend/app/repositories.py index bff6e87..0ee5380 100644 --- a/backend/app/repositories.py +++ b/backend/app/repositories.py @@ -633,6 +633,13 @@ def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None: return None +def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None: + article_id = _resolve_existing_article_id(payload) + if article_id is None: + return None + return get_article_by_id(article_id) + + def upsert_article(payload: ArticleUpsert) -> int: existing_id = _resolve_existing_article_id(payload) with get_conn() as conn: diff --git a/backend/tests/test_ingestion.py b/backend/tests/test_ingestion.py index 342c216..a36b62e 100644 --- a/backend/tests/test_ingestion.py +++ b/backend/tests/test_ingestion.py @@ -7,7 +7,16 @@ from unittest.mock import patch from backend.app import config as config_module from backend.app.db import init_db from backend.app.ingestion import run_ingestion -from backend.app.repositories import FeedCreate, SourceCreate, create_feed, create_source, list_articles +from backend.app.repositories import ( + ArticleUpsert, + FeedCreate, + SourceCreate, + create_feed, + create_source, + get_article_by_id, + list_articles, + upsert_article, +) from backend.app.source_extraction import ExtractedArticle @@ -118,6 +127,114 @@ class TestIngestion(unittest.TestCase): mock_parse.assert_not_called() mock_extract_article.assert_not_called() + @patch("backend.app.ingestion.extract_article") + @patch("backend.app.ingestion.feedparser.parse") + def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None: + existing_closed_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="closed-1", + source_hash="closed-hash-1", + title="Alt Closed", + source_url="https://example.org/closed-article", + canonical_url="https://example.org/closed-article", + published_at=None, + author="Autor", + summary="Alt", + content_raw="Alt Raw", + content_rewritten="

Alt Rewrite Closed

", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=42, + wp_post_url="https://wp.local/?p=42", + publish_attempts=2, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:00:00Z", + word_count=3, + status="error", # UI: close + meta_json='{"generated_tags":["AltTag"]}', + ) + ) + existing_published_id = upsert_article( + ArticleUpsert( + feed_id=self.feed_id, + source_article_id="published-1", + source_hash="published-hash-1", + title="Alt Published", + source_url="https://example.org/published-article", + canonical_url="https://example.org/published-article", + published_at=None, + author="Autor", + summary="Alt", + content_raw="Alt Raw", + content_rewritten="

Alt Rewrite Published

", + image_urls_json=None, + press_contact="Kontakt Alt", + source_name_snapshot="Test Source", + source_terms_url_snapshot="https://example.org/terms", + source_license_name_snapshot="cc-by", + legal_checked=False, + legal_checked_at=None, + legal_note=None, + wp_post_id=77, + wp_post_url="https://wp.local/?p=77", + publish_attempts=3, + publish_last_error=None, + published_to_wp_at="2026-02-21T12:10:00Z", + word_count=3, + status="published", + meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}', + ) + ) + + mock_extract_article.return_value = ExtractedArticle( + title="Neu Titel", + author="Neu Autor", + canonical_url=None, + summary="Neu Summary", + content_text="Neu Volltext", + images=["https://example.org/a.jpg"], + press_contact=None, + extraction_error=None, + ) + mock_parse.return_value = { + "etag": "etag-2", + "modified": "Tue, 18 Feb 2026 11:00:00 GMT", + "entries": [ + { + "id": "closed-1", + "title": "Closed Entry", + "link": "https://example.org/closed-article", + "summary": "X", + }, + { + "id": "published-1", + "title": "Published Entry", + "link": "https://example.org/published-article", + "summary": "Y", + }, + ], + } + + stats = run_ingestion(feed_id=self.feed_id) + self.assertEqual(stats.status, "success") + closed_row = get_article_by_id(existing_closed_id) or {} + self.assertEqual(closed_row["status"], "error") + self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "") + self.assertEqual(closed_row.get("wp_post_id"), 42) + + published_row = get_article_by_id(existing_published_id) or {} + self.assertEqual(published_row["status"], "published") + self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "") + self.assertEqual(published_row.get("wp_post_id"), 77) + self.assertIn("generated_tags", published_row.get("meta_json") or "") + if __name__ == "__main__": unittest.main()