feat: rebuild rss-news backend, admin ui, and legal extraction pipeline

2026-02-18 09:46:44 +01:00 · 2026-02-18 09:46:44 +01:00 · 2c331d683b
commit 2c331d683b
parent d65c55d315
43 changed files with 3463 additions and 73 deletions
--- a/backend/tests/test_source_extraction.py
+++ b/backend/tests/test_source_extraction.py
@ -0,0 +1,69 @@
+import unittest
+from unittest.mock import patch
+
+from backend.app.source_extraction import extract_article
+
+
+SAMPLE_HTML = """
+<!doctype html>
+<html lang="de">
+<head>
+  <meta charset="utf-8" />
+  <meta property="og:title" content="Demo Meldung von Presseportal" />
+  <meta name="author" content="Max Mustermann" />
+  <meta name="description" content="Kurzbeschreibung aus der Originalseite" />
+  <meta property="og:image" content="/images/demo.jpg" />
+  <link rel="canonical" href="https://www.presseportal.de/pm/118273/6158137" />
+</head>
+<body>
+  <article>
+    <p>Dies ist der vollstaendige Inhalt des Artikels.</p>
+    <p>Weitere relevante Informationen fuer die Meldung.</p>
+    <h3>Pressekontakt</h3>
+    <p>Musterfirma GmbH, Kontakt: presse@example.org</p>
+  </article>
+</body>
+</html>
+"""
+
+
+class _FakeHeaders:
+    @staticmethod
+    def get_content_charset():
+        return "utf-8"
+
+
+class _FakeResponse:
+    headers = _FakeHeaders()
+
+    def __init__(self, body: str):
+        self._body = body.encode("utf-8")
+
+    def read(self):
+        return self._body
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return False
+
+
+class TestSourceExtraction(unittest.TestCase):
+    @patch("backend.app.source_extraction.urlopen")
+    def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None:
+        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML)
+
+        extracted = extract_article("https://www.presseportal.de/pm/118273/6158137")
+        self.assertEqual(extracted.title, "Demo Meldung von Presseportal")
+        self.assertEqual(extracted.author, "Max Mustermann")
+        self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137")
+        self.assertIn("vollstaendige Inhalt", extracted.content_text or "")
+        self.assertIn("Kurzbeschreibung", extracted.summary or "")
+        self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images)
+        self.assertIn("Pressekontakt", extracted.press_contact or "")
+        self.assertIsNone(extracted.extraction_error)
+
+
+if __name__ == "__main__":
+    unittest.main()