diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 8da5fc5..150bcd1 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -229,6 +229,42 @@ def _as_block_paragraphs(text: str) -> str: return "\n".join(lines) +def _strip_html_tags(raw: str) -> str: + text = re.sub(r"<[^>]+>", " ", raw or "") + return re.sub(r"\s+", " ", text).strip() + + +def _html_to_wp_blocks(html: str) -> str: + src = (html or "").strip() + if not src: + return "" + pattern = re.compile( + r"]*>[\s\S]*?|]*>[\s\S]*?

|]*>[\s\S]*?|]*>[\s\S]*?", + re.IGNORECASE, + ) + blocks: list[str] = [] + for match in pattern.finditer(src): + block_html = match.group(0).strip() + if not block_html: + continue + tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE) + tag = (tag_match.group(1).lower() if tag_match else "") + if tag == "p": + blocks.append(f"{block_html}") + elif tag in {"ul", "ol"}: + ordered = tag == "ol" + if ordered: + blocks.append(f'{block_html}') + else: + blocks.append(f"{block_html}") + elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit(): + level = int(tag[1]) + blocks.append(f'{block_html}') + if blocks: + return "\n".join(blocks) + return _as_block_paragraphs(_strip_html_tags(src)) + + def _as_block_heading(level: int, text: str) -> str: safe_level = min(6, max(1, int(level))) return f'{escape(text)}' @@ -254,60 +290,18 @@ def _sanitize_publish_text(text: str) -> str: def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: - source_url = article.get("source_url") or "" - canonical_url = article.get("canonical_url") or source_url summary = (article.get("summary") or "").strip() body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() body_text = _sanitize_publish_text(body_text) if not body_text: body_text = summary - # Keep existing HTML if already present, otherwise wrap plain text into block paragraphs. has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) - body_html = body_text if has_html else _as_block_paragraphs(body_text) + body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text) if not body_html: body_html = "

Kein Inhalt verfügbar.

" - elif has_html: - body_html = f"\n{body_html}\n" - - author = (article.get("author") or "").strip() - published_at = (article.get("published_at") or "").strip() - source_name = (article.get("source_name_snapshot") or "").strip() - license_name = (article.get("source_license_name_snapshot") or "").strip() - terms_url = (article.get("source_terms_url_snapshot") or "").strip() - - lead_html = f"

{escape(summary)}

\n" if summary else "" - - facts: list[str] = [] - if author: - facts.append(f"Autor: {escape(author)}") - if published_at: - facts.append(f"Veröffentlicht (Quelle): {escape(published_at)}") - if source_name: - facts.append(f"Quelle: {escape(source_name)}") - if license_name: - facts.append(f"Lizenz: {escape(license_name)}") - if terms_url: - facts.append(f"Lizenzhinweise: {escape(terms_url)}") - - facts_html = "" - if facts: - facts_html = _as_block_heading(3, "Artikeldetails") + "\n" + _as_block_list(facts) - - attribution_parts = [ - _as_block_heading(3, "Quelle"), - f'

Originalartikel: {escape(source_url)}

', - ] - if canonical_url and canonical_url != source_url: - attribution_parts.append( - f'

Canonical: {escape(canonical_url)}

' - ) - attribution_html = "\n".join(attribution_parts) - - content = f"{lead_html}{body_html}\n\n{facts_html}\n{attribution_html}".strip() - excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() - excerpt = excerpt_source[:220] if excerpt_source else None - return content, excerpt + content = body_html.strip() + return content, None def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index 4cafc55..20b0618 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -38,9 +38,9 @@ class TestWordpressPublish(unittest.TestCase): self.assertTrue(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertEqual(payload.get("featured_media"), 456) - self.assertIn("

Quelle

", payload.get("content", "")) - self.assertIn("Originalartikel", payload.get("content", "")) - self.assertEqual(payload.get("excerpt"), "Inhalt") + self.assertIn("", payload.get("content", "")) + self.assertIn("

Inhalt

", payload.get("content", "")) + self.assertNotIn("excerpt", payload) @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -79,6 +79,7 @@ class TestWordpressPublish(unittest.TestCase): self.assertNotIn("Firma GmbH", content) self.assertNotIn("Pressekontakt", content) self.assertIn("eigentliche Text", content) + self.assertNotIn("Artikeldetails", content) @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -114,6 +115,25 @@ class TestWordpressPublish(unittest.TestCase): payload = post_calls[0].kwargs.get("payload", {}) self.assertEqual(payload.get("tags"), [11, 12]) + @patch("backend.app.wordpress._upload_featured_media") + @patch("backend.app.wordpress._wp_request") + def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None: + mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"} + article = { + "title": "Block Test", + "content_rewritten": "

Überschrift

Absatz 1

  • A
  • B
", + "source_url": "https://example.com/source", + "canonical_url": "https://example.com/source", + "meta_json": "{}", + } + publish_article_draft(article) + payload = mock_wp_request.call_args.kwargs["payload"] + content = payload.get("content", "") + self.assertIn("", content) + self.assertIn("", content) + self.assertNotIn("", content) + if __name__ == "__main__": unittest.main()