feat(wordpress): publish true Gutenberg blocks and remove auto summary/details sections

This commit is contained in:
Oliver 2026-02-21 14:55:20 +01:00
parent 93f52f72b9
commit 6332a9a399
No known key found for this signature in database
2 changed files with 62 additions and 48 deletions

View file

@ -229,6 +229,42 @@ def _as_block_paragraphs(text: str) -> str:
return "\n".join(lines)
def _strip_html_tags(raw: str) -> str:
text = re.sub(r"<[^>]+>", " ", raw or "")
return re.sub(r"\s+", " ", text).strip()
def _html_to_wp_blocks(html: str) -> str:
src = (html or "").strip()
if not src:
return ""
pattern = re.compile(
r"<h([2-6])[^>]*>[\s\S]*?</h\1>|<p[^>]*>[\s\S]*?</p>|<ul[^>]*>[\s\S]*?</ul>|<ol[^>]*>[\s\S]*?</ol>",
re.IGNORECASE,
)
blocks: list[str] = []
for match in pattern.finditer(src):
block_html = match.group(0).strip()
if not block_html:
continue
tag_match = re.match(r"<([a-z0-9]+)", block_html, re.IGNORECASE)
tag = (tag_match.group(1).lower() if tag_match else "")
if tag == "p":
blocks.append(f"<!-- wp:paragraph -->{block_html}<!-- /wp:paragraph -->")
elif tag in {"ul", "ol"}:
ordered = tag == "ol"
if ordered:
blocks.append(f'<!-- wp:list {{"ordered":true}} -->{block_html}<!-- /wp:list -->')
else:
blocks.append(f"<!-- wp:list -->{block_html}<!-- /wp:list -->")
elif tag.startswith("h") and len(tag) == 2 and tag[1].isdigit():
level = int(tag[1])
blocks.append(f'<!-- wp:heading {{"level":{level}}} -->{block_html}<!-- /wp:heading -->')
if blocks:
return "\n".join(blocks)
return _as_block_paragraphs(_strip_html_tags(src))
def _as_block_heading(level: int, text: str) -> str:
safe_level = min(6, max(1, int(level)))
return f'<!-- wp:heading {{"level":{safe_level}}} --><h{safe_level}>{escape(text)}</h{safe_level}><!-- /wp:heading -->'
@ -254,60 +290,18 @@ def _sanitize_publish_text(text: str) -> str:
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
    """Assemble the WordPress post content for *article*.

    Reads the rewritten (or raw) article text, sanitizes it, and converts it
    to Gutenberg block markup. Returns a ``(content, excerpt)`` tuple.

    NOTE(review): this listing looks like a rendered commit diff whose +/-
    markers were lost, so pre-change and post-change lines appear side by
    side (two ``body_html`` assignments, statements after a ``return``).
    Presumably only one variant of each exists in the real file - confirm
    against the repository before relying on the flow below.
    """
    source_url = article.get("source_url") or ""
    canonical_url = article.get("canonical_url") or source_url
    summary = (article.get("summary") or "").strip()
    # Prefer the rewritten text; fall back to the raw text, then to the summary.
    body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
    body_text = _sanitize_publish_text(body_text)
    if not body_text:
        body_text = summary
    # Keep existing HTML if already present, otherwise wrap plain text into block paragraphs.
    has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text))
    # NOTE(review): the next two assignments look like the removed and the
    # added variant of the same line; as listed, only the second takes effect.
    body_html = body_text if has_html else _as_block_paragraphs(body_text)
    body_html = _html_to_wp_blocks(body_text) if has_html else _as_block_paragraphs(body_text)
    if not body_html:
        # Placeholder paragraph so the post body is never empty.
        body_html = "<!-- wp:paragraph --><p>Kein Inhalt verfügbar.</p><!-- /wp:paragraph -->"
    # NOTE(review): presumably a removed branch - the test for HTML input in
    # this change asserts that no wp:html wrapper is emitted. Confirm.
    elif has_html:
        body_html = f"<!-- wp:html -->\n{body_html}\n<!-- /wp:html -->"
    author = (article.get("author") or "").strip()
    published_at = (article.get("published_at") or "").strip()
    source_name = (article.get("source_name_snapshot") or "").strip()
    license_name = (article.get("source_license_name_snapshot") or "").strip()
    terms_url = (article.get("source_terms_url_snapshot") or "").strip()
    # Italic lead paragraph built from the summary, when there is one.
    lead_html = f"<!-- wp:paragraph --><p><em>{escape(summary)}</em></p><!-- /wp:paragraph -->\n" if summary else ""
    # "Artikeldetails" section: one list entry per non-empty metadata field.
    facts: list[str] = []
    if author:
        facts.append(f"<strong>Autor:</strong> {escape(author)}")
    if published_at:
        facts.append(f"<strong>Veröffentlicht (Quelle):</strong> {escape(published_at)}")
    if source_name:
        facts.append(f"<strong>Quelle:</strong> {escape(source_name)}")
    if license_name:
        facts.append(f"<strong>Lizenz:</strong> {escape(license_name)}")
    if terms_url:
        facts.append(f"<strong>Lizenzhinweise:</strong> <a href=\"{escape(terms_url)}\">{escape(terms_url)}</a>")
    facts_html = ""
    if facts:
        facts_html = _as_block_heading(3, "Artikeldetails") + "\n" + _as_block_list(facts)
    # "Quelle" section: link to the original article, plus the canonical URL
    # when it differs from the source URL.
    attribution_parts = [
        _as_block_heading(3, "Quelle"),
        f'<!-- wp:paragraph --><p>Originalartikel: <a href="{escape(source_url)}">{escape(source_url)}</a></p><!-- /wp:paragraph -->',
    ]
    if canonical_url and canonical_url != source_url:
        attribution_parts.append(
            f'<!-- wp:paragraph --><p>Canonical: <a href="{escape(canonical_url)}">{escape(canonical_url)}</a></p><!-- /wp:paragraph -->'
        )
    attribution_html = "\n".join(attribution_parts)
    content = f"{lead_html}{body_html}\n\n{facts_html}\n{attribution_html}".strip()
    # Excerpt: the summary, or the whitespace-collapsed body, cut to 220 characters.
    excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip()
    excerpt = excerpt_source[:220] if excerpt_source else None
    return content, excerpt
    # NOTE(review): unreachable as listed. Presumably these two lines are the
    # post-change replacement for the lead/facts/attribution/excerpt section
    # above (content is the block markup only, no excerpt) - confirm.
    content = body_html.strip()
    return content, None
def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:

View file

@ -38,9 +38,9 @@ class TestWordpressPublish(unittest.TestCase):
self.assertTrue(mock_upload_media.called)
payload = mock_wp_request.call_args.kwargs["payload"]
self.assertEqual(payload.get("featured_media"), 456)
self.assertIn("<h3>Quelle</h3>", payload.get("content", ""))
self.assertIn("Originalartikel", payload.get("content", ""))
self.assertEqual(payload.get("excerpt"), "Inhalt")
self.assertIn("<!-- wp:paragraph -->", payload.get("content", ""))
self.assertIn("<p>Inhalt</p>", payload.get("content", ""))
self.assertNotIn("excerpt", payload)
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
@ -79,6 +79,7 @@ class TestWordpressPublish(unittest.TestCase):
self.assertNotIn("Firma GmbH", content)
self.assertNotIn("Pressekontakt", content)
self.assertIn("eigentliche Text", content)
self.assertNotIn("Artikeldetails", content)
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
@ -114,6 +115,25 @@ class TestWordpressPublish(unittest.TestCase):
payload = post_calls[0].kwargs.get("payload", {})
self.assertEqual(payload.get("tags"), [11, 12])
@patch("backend.app.wordpress._upload_featured_media")
@patch("backend.app.wordpress._wp_request")
def test_publish_converts_html_to_wp_blocks_without_html_block(self, mock_wp_request, mock_upload_media) -> None:
    """HTML bodies are published as native blocks, not inside a wp:html wrapper."""
    mock_wp_request.return_value = {"id": 111, "link": "https://example.org/?p=111"}
    source = "https://example.com/source"

    publish_article_draft(
        {
            "title": "Block Test",
            "content_rewritten": "<h2>Überschrift</h2><p>Absatz 1</p><ul><li>A</li><li>B</li></ul>",
            "source_url": source,
            "canonical_url": source,
            "meta_json": "{}",
        }
    )

    # Inspect the payload of the most recent WordPress request.
    content = mock_wp_request.call_args.kwargs["payload"].get("content", "")
    for marker in ("<!-- wp:heading", "<!-- wp:paragraph -->", "<!-- wp:list -->"):
        self.assertIn(marker, content)
    self.assertNotIn("<!-- wp:html -->", content)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()