From 24d8e5ad0fdbd76608cdb1920e2c3149288655df Mon Sep 17 00:00:00 2001
From: Oliver G <oliver@giertz.biz>
Date: Sat, 21 Feb 2026 13:09:00 +0100
Subject: [PATCH] feat(wordpress): improve post html structure and excerpt
 generation

---
 backend/app/wordpress.py        | 86 ++++++++++++++++++++++++++++-----
 backend/tests/test_wordpress.py |  4 ++
 2 files changed, 79 insertions(+), 11 deletions(-)
diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py
index 756a346..fbf4443 100644
--- a/backend/app/wordpress.py
+++ b/backend/app/wordpress.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 import base64
+from html import escape
 import json
 import mimetypes
 from pathlib import Path
+import re
 from typing import Any
 from urllib.parse import urlparse
 from urllib.request import Request, urlopen
@@ -130,6 +132,75 @@ def _upload_featured_media(
     return media_id
 
 
+def _as_paragraph_html(text: str) -> str:
+    """Return ``text`` as HTML-escaped ``<p>`` blocks, one per blank-line-separated paragraph."""
+    # Normalise CRLF/CR first so Windows-style input splits into paragraphs too.
+    normalized = text.replace("\r\n", "\n").replace("\r", "\n").strip()
+    # A separator line may hold stray spaces or tabs, so allow whitespace between the breaks.
+    chunks = [chunk.strip() for chunk in re.split(r"\n\s*\n", normalized)]
+    # Single line breaks inside a paragraph are soft wraps: fold them into one space.
+    folded = (re.sub(r"\s*\n\s*", " ", chunk) for chunk in chunks if chunk)
+    return "\n".join(f"<p>{escape(paragraph)}</p>" for paragraph in folded)
+
+
+def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
+    """Build the WordPress post HTML and a plain-text excerpt for ``article``.
+
+    Returns ``(content, excerpt)``. ``content`` joins lead, body, fact list, press
+    contact and source attribution; ``excerpt`` is the first 220 characters of the
+    summary, else of the tag-stripped body, or ``None`` when both are empty.
+    """
+
+    def field(key: str) -> str:
+        # str() keeps non-string values (e.g. a datetime) from breaking .strip().
+        return str(article.get(key) or "").strip()
+
+    source_url = article.get("source_url") or ""
+    canonical_url = article.get("canonical_url") or source_url
+    summary = field("summary")
+    body_text = field("content_rewritten") or field("content_raw")
+    # When the summary has to stand in for the body, do not repeat it as the lead.
+    lead_html = f"<p><em>{escape(summary)}</em></p>\n" if summary and body_text else ""
+    body_text = body_text or summary
+
+    # Keep existing HTML if already present, otherwise wrap plain text into paragraphs.
+    has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text))
+    body_html = body_text if has_html else _as_paragraph_html(body_text)
+    if not body_html:
+        body_html = "<p>Kein Inhalt verfügbar.</p>"
+
+    # Label/value pairs for the fact list; entries with an empty value are left out.
+    labelled = (
+        ("Autor", field("author")),
+        ("Veröffentlicht (Quelle)", field("published_at")),
+        ("Quelle", field("source_name_snapshot")),
+        ("Lizenz", field("source_license_name_snapshot")),
+    )
+    facts = [f"<li><strong>{label}:</strong> {escape(value)}</li>" for label, value in labelled if value]
+    terms_url = field("source_terms_url_snapshot")
+    if terms_url:
+        facts.append(f"<li><strong>Lizenzhinweise:</strong> <a href=\"{escape(terms_url)}\">{escape(terms_url)}</a></li>")
+    facts_html = ("<h3>Artikeldetails</h3>\n<ul>\n" + "\n".join(facts) + "\n</ul>\n") if facts else ""
+
+    press_contact = field("press_contact")
+    press_contact_html = f"<h3>Pressekontakt</h3>\n<p>{escape(press_contact)}</p>\n" if press_contact else ""
+    attribution_html = (
+        "<hr />\n<section class=\"rss-news-attribution\">\n"
+        "<h3>Quelle</h3>\n"
+        f"<p>Originalartikel: <a href=\"{escape(source_url)}\">{escape(source_url)}</a></p>\n"
+    )
+    if canonical_url and canonical_url != source_url:
+        attribution_html += f"<p>Canonical: <a href=\"{escape(canonical_url)}\">{escape(canonical_url)}</a></p>\n"
+    attribution_html += "</section>"
+
+    content = f"{lead_html}{body_html}\n\n{facts_html}{press_contact_html}{attribution_html}".strip()
+    # Tags are stripped first so the 220-character cut cannot land inside raw markup.
+    excerpt_body = re.sub(r"</?[a-zA-Z][^>]*>", " ", body_text) if has_html else body_text
+    excerpt_source = summary or re.sub(r"\s+", " ", excerpt_body).strip()
+    excerpt = excerpt_source[:220] if excerpt_source else None
+    return content, excerpt
+
+
 def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
     settings = get_settings()
     if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password:
@@ -137,18 +208,9 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
 
     auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password)
 
-    source_url = article.get("source_url") or ""
-    canonical_url = article.get("canonical_url") or source_url
     title = (article.get("title") or "Ohne Titel").strip()
-    body = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
-    if not body:
-        body = article.get("summary") or ""
-
-    footer = "\n\n<hr />\n<p><strong>Quelle:</strong> "
-    footer += f"<a href=\"{source_url}\">{source_url}</a></p>"
-    if canonical_url and canonical_url != source_url:
-        footer += f"\n<p><strong>Canonical:</strong> <a href=\"{canonical_url}\">{canonical_url}</a></p>"
-    content = f"{body}{footer}"
+    content, excerpt = _build_post_content(article)
+    source_url = article.get("source_url") or ""
 
     featured_media_id = None
     selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
@@ -166,6 +228,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
         "content": content,
         "status": settings.wordpress_default_status,
     }
+    if excerpt:
+        payload["excerpt"] = excerpt
     if featured_media_id:
         payload["featured_media"] = featured_media_id
 
diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py
index f12c6e1..2c9094e 100644
--- a/backend/tests/test_wordpress.py
+++ b/backend/tests/test_wordpress.py
@@ -38,6 +38,9 @@ class TestWordpressPublish(unittest.TestCase):
         self.assertTrue(mock_upload_media.called)
         payload = mock_wp_request.call_args.kwargs["payload"]
         self.assertEqual(payload.get("featured_media"), 456)
+        self.assertIn("<h3>Quelle</h3>", payload.get("content", ""))
+        self.assertIn("Originalartikel", payload.get("content", ""))
+        self.assertEqual(payload.get("excerpt"), "Inhalt")
 
     @patch("backend.app.wordpress._upload_featured_media")
     @patch("backend.app.wordpress._wp_request")
@@ -57,6 +60,7 @@ class TestWordpressPublish(unittest.TestCase):
         self.assertFalse(mock_upload_media.called)
         payload = mock_wp_request.call_args.kwargs["payload"]
         self.assertNotIn("featured_media", payload)
+        self.assertIn("<p>Inhalt</p>", payload.get("content", ""))
 
 
 if __name__ == "__main__":