From 24d8e5ad0fdbd76608cdb1920e2c3149288655df Mon Sep 17 00:00:00 2001 From: Oliver G Date: Sat, 21 Feb 2026 13:09:00 +0100 Subject: [PATCH] feat(wordpress): improve post html structure and excerpt generation --- backend/app/wordpress.py | 86 ++++++++++++++++++++++++++++----- backend/tests/test_wordpress.py | 4 ++ 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/backend/app/wordpress.py b/backend/app/wordpress.py index 756a346..fbf4443 100644 --- a/backend/app/wordpress.py +++ b/backend/app/wordpress.py @@ -1,9 +1,11 @@ from __future__ import annotations import base64 +from html import escape import json import mimetypes from pathlib import Path +import re from typing import Any from urllib.parse import urlparse from urllib.request import Request, urlopen @@ -130,6 +132,75 @@ def _upload_featured_media( return media_id +def _as_paragraph_html(text: str) -> str: + chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()] + if not chunks: + return "" + lines = [] + for chunk in chunks: + compact = re.sub(r"\s*\n\s*", " ", chunk) + lines.append(f"

{escape(compact)}

") + return "\n".join(lines) + + +def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]: + source_url = article.get("source_url") or "" + canonical_url = article.get("canonical_url") or source_url + summary = (article.get("summary") or "").strip() + body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip() + if not body_text: + body_text = summary + + # Keep existing HTML if already present, otherwise wrap plain text into paragraphs. + has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text)) + body_html = body_text if has_html else _as_paragraph_html(body_text) + if not body_html: + body_html = "

Kein Inhalt verfügbar.

" + + author = (article.get("author") or "").strip() + published_at = (article.get("published_at") or "").strip() + source_name = (article.get("source_name_snapshot") or "").strip() + license_name = (article.get("source_license_name_snapshot") or "").strip() + terms_url = (article.get("source_terms_url_snapshot") or "").strip() + press_contact = (article.get("press_contact") or "").strip() + + lead_html = f"

{escape(summary)}

\n" if summary else "" + + facts: list[str] = [] + if author: + facts.append(f"
  • Autor: {escape(author)}
  • ") + if published_at: + facts.append(f"
  • Veröffentlicht (Quelle): {escape(published_at)}
  • ") + if source_name: + facts.append(f"
  • Quelle: {escape(source_name)}
  • ") + if license_name: + facts.append(f"
  • Lizenz: {escape(license_name)}
  • ") + if terms_url: + facts.append(f"
  • Lizenzhinweise: {escape(terms_url)}
  • ") + + facts_html = ( + "

    Artikeldetails

    \n\n" + if facts + else "" + ) + press_contact_html = ( + f"

    Pressekontakt

    \n

    {escape(press_contact)}

    \n" if press_contact else "" + ) + attribution_html = ( + "
    \n
    \n" + "

    Quelle

    \n" + f"

    Originalartikel: {escape(source_url)}

    \n" + ) + if canonical_url and canonical_url != source_url: + attribution_html += f"

    Canonical: {escape(canonical_url)}

    \n" + attribution_html += "
    " + + content = f"{lead_html}{body_html}\n\n{facts_html}{press_contact_html}{attribution_html}".strip() + excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip() + excerpt = excerpt_source[:220] if excerpt_source else None + return content, excerpt + + def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: settings = get_settings() if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password: @@ -137,18 +208,9 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password) - source_url = article.get("source_url") or "" - canonical_url = article.get("canonical_url") or source_url title = (article.get("title") or "Ohne Titel").strip() - body = (article.get("content_rewritten") or article.get("content_raw") or "").strip() - if not body: - body = article.get("summary") or "" - - footer = "\n\n
    \n

    Quelle: " - footer += f"{source_url}

    " - if canonical_url and canonical_url != source_url: - footer += f"\n

    Canonical: {canonical_url}

    " - content = f"{body}{footer}" + content, excerpt = _build_post_content(article) + source_url = article.get("source_url") or "" featured_media_id = None selected_image_url = _selected_image_url_from_meta(article.get("meta_json")) @@ -166,6 +228,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]: "content": content, "status": settings.wordpress_default_status, } + if excerpt: + payload["excerpt"] = excerpt if featured_media_id: payload["featured_media"] = featured_media_id diff --git a/backend/tests/test_wordpress.py b/backend/tests/test_wordpress.py index f12c6e1..2c9094e 100644 --- a/backend/tests/test_wordpress.py +++ b/backend/tests/test_wordpress.py @@ -38,6 +38,9 @@ class TestWordpressPublish(unittest.TestCase): self.assertTrue(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertEqual(payload.get("featured_media"), 456) + self.assertIn("

    Quelle

    ", payload.get("content", "")) + self.assertIn("Originalartikel", payload.get("content", "")) + self.assertEqual(payload.get("excerpt"), "Inhalt") @patch("backend.app.wordpress._upload_featured_media") @patch("backend.app.wordpress._wp_request") @@ -57,6 +60,7 @@ class TestWordpressPublish(unittest.TestCase): self.assertFalse(mock_upload_media.called) payload = mock_wp_request.call_args.kwargs["payload"] self.assertNotIn("featured_media", payload) + self.assertIn("

    Inhalt

    ", payload.get("content", "")) if __name__ == "__main__":