feat: rebuild rss-news backend, admin ui, and legal extraction pipeline
This commit is contained in:
parent
d65c55d315
commit
2c331d683b
43 changed files with 3463 additions and 73 deletions
69
backend/tests/test_source_extraction.py
Normal file
69
backend/tests/test_source_extraction.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from backend.app.source_extraction import extract_article
|
||||
|
||||
|
||||
# HTML fixture served by the fake response in the test below.  It carries the
# pieces the assertions look at: og:title, author and description meta tags,
# a relative og:image path, a canonical link, two body paragraphs and a
# "Pressekontakt" section.  Stray "|" gutter lines that had leaked into the
# literal were removed so the markup contains no spurious text nodes.
SAMPLE_HTML = """
<!doctype html>
<html lang="de">
<head>
<meta charset="utf-8" />
<meta property="og:title" content="Demo Meldung von Presseportal" />
<meta name="author" content="Max Mustermann" />
<meta name="description" content="Kurzbeschreibung aus der Originalseite" />
<meta property="og:image" content="/images/demo.jpg" />
<link rel="canonical" href="https://www.presseportal.de/pm/118273/6158137" />
</head>
<body>
<article>
<p>Dies ist der vollstaendige Inhalt des Artikels.</p>
<p>Weitere relevante Informationen fuer die Meldung.</p>
<h3>Pressekontakt</h3>
<p>Musterfirma GmbH, Kontakt: presse@example.org</p>
</article>
</body>
</html>
"""
|
||||
|
||||
|
||||
class _FakeHeaders:
|
||||
@staticmethod
|
||||
def get_content_charset():
|
||||
return "utf-8"
|
||||
|
||||
|
||||
class _FakeResponse:
    """In-memory response double returned by the patched ``urlopen``.

    It offers a ``headers`` attribute, a ``read()`` method and the
    context-manager protocol, so it can be used in a ``with`` statement.
    """

    # Class-level attribute: one _FakeHeaders instance shared by all responses.
    headers = _FakeHeaders()

    def __init__(self, body: str):
        """Keep *body* as UTF-8 encoded bytes for later ``read()`` calls."""
        self._body = body.encode("utf-8")

    def read(self):
        """Return the whole encoded body; every call yields the same bytes."""
        return self._body

    def __enter__(self):
        """Enter the ``with`` block, yielding this response object."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block without doing any cleanup."""
        # Returning False lets an exception raised inside the block propagate.
        return False
|
||||
|
||||
|
||||
class TestSourceExtraction(unittest.TestCase):
    """Checks ``extract_article`` against a canned HTML page."""

    @patch("backend.app.source_extraction.urlopen")
    def test_extract_article_parses_author_images_and_press_contact(self, mock_urlopen) -> None:
        """Extract the sample page and verify each field the test cares about.

        ``urlopen`` as referenced by ``backend.app.source_extraction`` is
        replaced with a mock whose return value is a ``_FakeResponse``
        wrapping ``SAMPLE_HTML``.
        """
        mock_urlopen.return_value = _FakeResponse(SAMPLE_HTML)

        extracted = extract_article("https://www.presseportal.de/pm/118273/6158137")

        # Metadata expected to match the page's meta tags and canonical link.
        self.assertEqual(extracted.title, "Demo Meldung von Presseportal")
        self.assertEqual(extracted.author, "Max Mustermann")
        self.assertEqual(extracted.canonical_url, "https://www.presseportal.de/pm/118273/6158137")

        # ``or ""`` keeps assertIn from raising TypeError when a field is None,
        # so a missing value shows up as an ordinary assertion failure.
        self.assertIn("vollstaendige Inhalt", extracted.content_text or "")
        self.assertIn("Kurzbeschreibung", extracted.summary or "")

        # The page gives og:image as a relative path; the test expects the
        # absolute URL on the presseportal.de host in ``images``.
        self.assertIn("https://www.presseportal.de/images/demo.jpg", extracted.images)
        self.assertIn("Pressekontakt", extracted.press_contact or "")
        self.assertIsNone(extracted.extraction_error)
|
||||
|
||||
|
||||
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue