fix(ingestion): preserve article workflow data and skip closed items on re-import
This commit is contained in:
parent
b0f995d5c9
commit
93f52f72b9
3 changed files with 206 additions and 31 deletions
|
|
@ -16,6 +16,7 @@ from .repositories import (
|
||||||
ArticleUpsert,
|
ArticleUpsert,
|
||||||
RunCreate,
|
RunCreate,
|
||||||
create_run,
|
create_run,
|
||||||
|
find_existing_article_for_upsert,
|
||||||
finish_run,
|
finish_run,
|
||||||
get_feed_by_id,
|
get_feed_by_id,
|
||||||
list_enabled_feeds,
|
list_enabled_feeds,
|
||||||
|
|
@ -135,6 +136,20 @@ def _select_relevant_images(source_url: str, title: str, images: list[str], max_
|
||||||
return kept, primary, ranked
|
return kept, primary, ranked
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_ingestion_meta(existing_meta_json: str | None, attribution: dict[str, Any], extraction_meta: dict[str, Any]) -> str:
|
||||||
|
meta: dict[str, Any] = {}
|
||||||
|
if existing_meta_json:
|
||||||
|
try:
|
||||||
|
parsed = json.loads(existing_meta_json)
|
||||||
|
if isinstance(parsed, dict):
|
||||||
|
meta = parsed
|
||||||
|
except Exception:
|
||||||
|
meta = {}
|
||||||
|
meta["attribution"] = attribution
|
||||||
|
meta["extraction"] = extraction_meta
|
||||||
|
return json.dumps(meta, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
|
run_id = create_run(RunCreate(run_type="ingestion", status="running", details="started"))
|
||||||
feeds_processed = 0
|
feeds_processed = 0
|
||||||
|
|
@ -268,8 +283,7 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
"total_candidates": len(extracted.images),
|
"total_candidates": len(extracted.images),
|
||||||
"ranked": ranked_images,
|
"ranked": ranked_images,
|
||||||
}
|
}
|
||||||
article_id = upsert_article(
|
base_payload = ArticleUpsert(
|
||||||
ArticleUpsert(
|
|
||||||
feed_id=int(feed["id"]),
|
feed_id=int(feed["id"]),
|
||||||
source_article_id=entry.get("id") or entry.get("guid"),
|
source_article_id=entry.get("id") or entry.get("guid"),
|
||||||
source_hash=source_hash,
|
source_hash=source_hash,
|
||||||
|
|
@ -298,7 +312,44 @@ def run_ingestion(feed_id: int | None = None) -> IngestionStats:
|
||||||
status="new",
|
status="new",
|
||||||
meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
|
meta_json=json.dumps({"attribution": attribution, "extraction": extraction_meta}, ensure_ascii=False),
|
||||||
)
|
)
|
||||||
|
existing = find_existing_article_for_upsert(base_payload)
|
||||||
|
if existing and existing.get("status") == "error":
|
||||||
|
# Explicitly closed article: ignore on subsequent ingestion runs.
|
||||||
|
continue
|
||||||
|
|
||||||
|
payload = base_payload
|
||||||
|
if existing:
|
||||||
|
payload = ArticleUpsert(
|
||||||
|
feed_id=base_payload.feed_id,
|
||||||
|
source_article_id=base_payload.source_article_id,
|
||||||
|
source_hash=base_payload.source_hash,
|
||||||
|
title=base_payload.title,
|
||||||
|
source_url=base_payload.source_url,
|
||||||
|
canonical_url=base_payload.canonical_url,
|
||||||
|
published_at=base_payload.published_at,
|
||||||
|
author=base_payload.author,
|
||||||
|
summary=base_payload.summary,
|
||||||
|
content_raw=base_payload.content_raw,
|
||||||
|
content_rewritten=existing.get("content_rewritten"),
|
||||||
|
image_urls_json=base_payload.image_urls_json,
|
||||||
|
press_contact=base_payload.press_contact or existing.get("press_contact"),
|
||||||
|
source_name_snapshot=base_payload.source_name_snapshot,
|
||||||
|
source_terms_url_snapshot=base_payload.source_terms_url_snapshot,
|
||||||
|
source_license_name_snapshot=base_payload.source_license_name_snapshot,
|
||||||
|
legal_checked=bool(int(existing.get("legal_checked", 0))),
|
||||||
|
legal_checked_at=existing.get("legal_checked_at"),
|
||||||
|
legal_note=existing.get("legal_note"),
|
||||||
|
wp_post_id=existing.get("wp_post_id"),
|
||||||
|
wp_post_url=existing.get("wp_post_url"),
|
||||||
|
publish_attempts=int(existing.get("publish_attempts", 0)),
|
||||||
|
publish_last_error=existing.get("publish_last_error"),
|
||||||
|
published_to_wp_at=existing.get("published_to_wp_at"),
|
||||||
|
word_count=base_payload.word_count,
|
||||||
|
status=existing.get("status") or "new",
|
||||||
|
meta_json=_merge_ingestion_meta(existing.get("meta_json"), attribution, extraction_meta),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
article_id = upsert_article(payload)
|
||||||
if article_id:
|
if article_id:
|
||||||
articles_upserted += 1
|
articles_upserted += 1
|
||||||
feed_upserts += 1
|
feed_upserts += 1
|
||||||
|
|
|
||||||
|
|
@ -633,6 +633,13 @@ def _resolve_existing_article_id(payload: ArticleUpsert) -> int | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_existing_article_for_upsert(payload: ArticleUpsert) -> dict[str, Any] | None:
    """Return the stored article that an upsert of *payload* would hit.

    The target row is resolved with ``_resolve_existing_article_id``; when
    that yields no id, ``None`` is returned. Otherwise the result of
    ``get_article_by_id`` for that id is passed through unchanged.
    """
    matched_id = _resolve_existing_article_id(payload)
    return None if matched_id is None else get_article_by_id(matched_id)
|
||||||
|
|
||||||
|
|
||||||
def upsert_article(payload: ArticleUpsert) -> int:
|
def upsert_article(payload: ArticleUpsert) -> int:
|
||||||
existing_id = _resolve_existing_article_id(payload)
|
existing_id = _resolve_existing_article_id(payload)
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,16 @@ from unittest.mock import patch
|
||||||
from backend.app import config as config_module
|
from backend.app import config as config_module
|
||||||
from backend.app.db import init_db
|
from backend.app.db import init_db
|
||||||
from backend.app.ingestion import run_ingestion
|
from backend.app.ingestion import run_ingestion
|
||||||
from backend.app.repositories import FeedCreate, SourceCreate, create_feed, create_source, list_articles
|
from backend.app.repositories import (
|
||||||
|
ArticleUpsert,
|
||||||
|
FeedCreate,
|
||||||
|
SourceCreate,
|
||||||
|
create_feed,
|
||||||
|
create_source,
|
||||||
|
get_article_by_id,
|
||||||
|
list_articles,
|
||||||
|
upsert_article,
|
||||||
|
)
|
||||||
from backend.app.source_extraction import ExtractedArticle
|
from backend.app.source_extraction import ExtractedArticle
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -118,6 +127,114 @@ class TestIngestion(unittest.TestCase):
|
||||||
mock_parse.assert_not_called()
|
mock_parse.assert_not_called()
|
||||||
mock_extract_article.assert_not_called()
|
mock_extract_article.assert_not_called()
|
||||||
|
|
||||||
|
# NOTE: patch decorators are applied bottom-up, so the feedparser.parse mock
# is injected first (mock_parse) and the extract_article mock second.
@patch("backend.app.ingestion.extract_article")
@patch("backend.app.ingestion.feedparser.parse")
def test_ingestion_preserves_existing_work_and_skips_closed(self, mock_parse, mock_extract_article) -> None:
    """Re-ingesting known entries must not clobber stored workflow data.

    Seeds one article with status "error" and one with status "published",
    then runs ingestion against a mocked feed that returns entries with the
    same ids and links. Afterwards both rows must still carry their original
    status, rewritten content and WordPress post id, and the published row's
    meta JSON must still contain its "generated_tags" key.
    """
    # Seed 1: an article with status "error", carrying rewrite and publish data.
    existing_closed_id = upsert_article(
        ArticleUpsert(
            feed_id=self.feed_id,
            source_article_id="closed-1",
            source_hash="closed-hash-1",
            title="Alt Closed",
            source_url="https://example.org/closed-article",
            canonical_url="https://example.org/closed-article",
            published_at=None,
            author="Autor",
            summary="Alt",
            content_raw="Alt Raw",
            content_rewritten="<p>Alt Rewrite Closed</p>",
            image_urls_json=None,
            press_contact="Kontakt Alt",
            source_name_snapshot="Test Source",
            source_terms_url_snapshot="https://example.org/terms",
            source_license_name_snapshot="cc-by",
            legal_checked=False,
            legal_checked_at=None,
            legal_note=None,
            wp_post_id=42,
            wp_post_url="https://wp.local/?p=42",
            publish_attempts=2,
            publish_last_error=None,
            published_to_wp_at="2026-02-21T12:00:00Z",
            word_count=3,
            status="error",  # "error" is the status the UI's close action sets
            meta_json='{"generated_tags":["AltTag"]}',
        )
    )
    # Seed 2: a published article with extra meta keys (tags, image review).
    existing_published_id = upsert_article(
        ArticleUpsert(
            feed_id=self.feed_id,
            source_article_id="published-1",
            source_hash="published-hash-1",
            title="Alt Published",
            source_url="https://example.org/published-article",
            canonical_url="https://example.org/published-article",
            published_at=None,
            author="Autor",
            summary="Alt",
            content_raw="Alt Raw",
            content_rewritten="<p>Alt Rewrite Published</p>",
            image_urls_json=None,
            press_contact="Kontakt Alt",
            source_name_snapshot="Test Source",
            source_terms_url_snapshot="https://example.org/terms",
            source_license_name_snapshot="cc-by",
            legal_checked=False,
            legal_checked_at=None,
            legal_note=None,
            wp_post_id=77,
            wp_post_url="https://wp.local/?p=77",
            publish_attempts=3,
            publish_last_error=None,
            published_to_wp_at="2026-02-21T12:10:00Z",
            word_count=3,
            status="published",
            meta_json='{"generated_tags":["Rheingas"],"image_review":{"selected_url":"https://img.local/1.jpg"}}',
        )
    )

    # Every extraction call returns the same "new" content, which differs
    # from the seeded title/author/summary/text values.
    mock_extract_article.return_value = ExtractedArticle(
        title="Neu Titel",
        author="Neu Autor",
        canonical_url=None,
        summary="Neu Summary",
        content_text="Neu Volltext",
        images=["https://example.org/a.jpg"],
        press_contact=None,
        extraction_error=None,
    )
    # The mocked feed lists both seeded articles again: the entry ids and
    # links equal the seeded source_article_id / source_url values.
    mock_parse.return_value = {
        "etag": "etag-2",
        "modified": "Tue, 18 Feb 2026 11:00:00 GMT",
        "entries": [
            {
                "id": "closed-1",
                "title": "Closed Entry",
                "link": "https://example.org/closed-article",
                "summary": "X",
            },
            {
                "id": "published-1",
                "title": "Published Entry",
                "link": "https://example.org/published-article",
                "summary": "Y",
            },
        ],
    }

    stats = run_ingestion(feed_id=self.feed_id)
    self.assertEqual(stats.status, "success")
    # `or {}` turns a missing row into a KeyError on the status lookup below
    # instead of a TypeError from subscripting None.
    closed_row = get_article_by_id(existing_closed_id) or {}
    self.assertEqual(closed_row["status"], "error")
    self.assertIn("Alt Rewrite Closed", closed_row.get("content_rewritten") or "")
    self.assertEqual(closed_row.get("wp_post_id"), 42)

    published_row = get_article_by_id(existing_published_id) or {}
    self.assertEqual(published_row["status"], "published")
    self.assertIn("Alt Rewrite Published", published_row.get("content_rewritten") or "")
    self.assertEqual(published_row.get("wp_post_id"), 77)
    # Substring check on the raw JSON text: the pre-existing key must survive.
    self.assertIn("generated_tags", published_row.get("meta_json") or "")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue