feat(wordpress): improve post html structure and excerpt generation
This commit is contained in:
parent
e68b6a41fd
commit
24d8e5ad0f
2 changed files with 79 additions and 11 deletions
|
|
@ -1,9 +1,11 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from html import escape
|
||||
import json
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
|
|
@ -130,6 +132,75 @@ def _upload_featured_media(
|
|||
return media_id
|
||||
|
||||
|
||||
def _as_paragraph_html(text: str) -> str:
|
||||
chunks = [chunk.strip() for chunk in re.split(r"\n{2,}", text.strip()) if chunk.strip()]
|
||||
if not chunks:
|
||||
return ""
|
||||
lines = []
|
||||
for chunk in chunks:
|
||||
compact = re.sub(r"\s*\n\s*", " ", chunk)
|
||||
lines.append(f"<p>{escape(compact)}</p>")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_post_content(article: dict[str, Any]) -> tuple[str, str | None]:
|
||||
source_url = article.get("source_url") or ""
|
||||
canonical_url = article.get("canonical_url") or source_url
|
||||
summary = (article.get("summary") or "").strip()
|
||||
body_text = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
|
||||
if not body_text:
|
||||
body_text = summary
|
||||
|
||||
# Keep existing HTML if already present, otherwise wrap plain text into paragraphs.
|
||||
has_html = bool(re.search(r"<[a-zA-Z][^>]*>", body_text))
|
||||
body_html = body_text if has_html else _as_paragraph_html(body_text)
|
||||
if not body_html:
|
||||
body_html = "<p>Kein Inhalt verfügbar.</p>"
|
||||
|
||||
author = (article.get("author") or "").strip()
|
||||
published_at = (article.get("published_at") or "").strip()
|
||||
source_name = (article.get("source_name_snapshot") or "").strip()
|
||||
license_name = (article.get("source_license_name_snapshot") or "").strip()
|
||||
terms_url = (article.get("source_terms_url_snapshot") or "").strip()
|
||||
press_contact = (article.get("press_contact") or "").strip()
|
||||
|
||||
lead_html = f"<p><em>{escape(summary)}</em></p>\n" if summary else ""
|
||||
|
||||
facts: list[str] = []
|
||||
if author:
|
||||
facts.append(f"<li><strong>Autor:</strong> {escape(author)}</li>")
|
||||
if published_at:
|
||||
facts.append(f"<li><strong>Veröffentlicht (Quelle):</strong> {escape(published_at)}</li>")
|
||||
if source_name:
|
||||
facts.append(f"<li><strong>Quelle:</strong> {escape(source_name)}</li>")
|
||||
if license_name:
|
||||
facts.append(f"<li><strong>Lizenz:</strong> {escape(license_name)}</li>")
|
||||
if terms_url:
|
||||
facts.append(f"<li><strong>Lizenzhinweise:</strong> <a href=\"{escape(terms_url)}\">{escape(terms_url)}</a></li>")
|
||||
|
||||
facts_html = (
|
||||
"<h3>Artikeldetails</h3>\n<ul>\n" + "\n".join(facts) + "\n</ul>\n"
|
||||
if facts
|
||||
else ""
|
||||
)
|
||||
press_contact_html = (
|
||||
f"<h3>Pressekontakt</h3>\n<p>{escape(press_contact)}</p>\n" if press_contact else ""
|
||||
)
|
||||
attribution_html = (
|
||||
"<hr />\n<section class=\"rss-news-attribution\">\n"
|
||||
"<h3>Quelle</h3>\n"
|
||||
f"<p>Originalartikel: <a href=\"{escape(source_url)}\">{escape(source_url)}</a></p>\n"
|
||||
)
|
||||
if canonical_url and canonical_url != source_url:
|
||||
attribution_html += f"<p>Canonical: <a href=\"{escape(canonical_url)}\">{escape(canonical_url)}</a></p>\n"
|
||||
attribution_html += "</section>"
|
||||
|
||||
content = f"{lead_html}{body_html}\n\n{facts_html}{press_contact_html}{attribution_html}".strip()
|
||||
excerpt_source = summary or re.sub(r"\s+", " ", body_text).strip()
|
||||
excerpt = excerpt_source[:220] if excerpt_source else None
|
||||
return content, excerpt
|
||||
|
||||
|
||||
def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
||||
settings = get_settings()
|
||||
if not settings.wordpress_base_url or not settings.wordpress_username or not settings.wordpress_app_password:
|
||||
|
|
@ -137,18 +208,9 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
|
||||
auth = _auth_header(settings.wordpress_username, settings.wordpress_app_password)
|
||||
|
||||
source_url = article.get("source_url") or ""
|
||||
canonical_url = article.get("canonical_url") or source_url
|
||||
title = (article.get("title") or "Ohne Titel").strip()
|
||||
body = (article.get("content_rewritten") or article.get("content_raw") or "").strip()
|
||||
if not body:
|
||||
body = article.get("summary") or ""
|
||||
|
||||
footer = "\n\n<hr />\n<p><strong>Quelle:</strong> "
|
||||
footer += f"<a href=\"{source_url}\">{source_url}</a></p>"
|
||||
if canonical_url and canonical_url != source_url:
|
||||
footer += f"\n<p><strong>Canonical:</strong> <a href=\"{canonical_url}\">{canonical_url}</a></p>"
|
||||
content = f"{body}{footer}"
|
||||
content, excerpt = _build_post_content(article)
|
||||
source_url = article.get("source_url") or ""
|
||||
|
||||
featured_media_id = None
|
||||
selected_image_url = _selected_image_url_from_meta(article.get("meta_json"))
|
||||
|
|
@ -166,6 +228,8 @@ def publish_article_draft(article: dict[str, Any]) -> tuple[int, str | None]:
|
|||
"content": content,
|
||||
"status": settings.wordpress_default_status,
|
||||
}
|
||||
if excerpt:
|
||||
payload["excerpt"] = excerpt
|
||||
if featured_media_id:
|
||||
payload["featured_media"] = featured_media_id
|
||||
|
||||
|
|
|
|||
|
|
@ -38,6 +38,9 @@ class TestWordpressPublish(unittest.TestCase):
|
|||
self.assertTrue(mock_upload_media.called)
|
||||
payload = mock_wp_request.call_args.kwargs["payload"]
|
||||
self.assertEqual(payload.get("featured_media"), 456)
|
||||
self.assertIn("<h3>Quelle</h3>", payload.get("content", ""))
|
||||
self.assertIn("Originalartikel", payload.get("content", ""))
|
||||
self.assertEqual(payload.get("excerpt"), "Inhalt")
|
||||
|
||||
@patch("backend.app.wordpress._upload_featured_media")
|
||||
@patch("backend.app.wordpress._wp_request")
|
||||
|
|
@ -57,6 +60,7 @@ class TestWordpressPublish(unittest.TestCase):
|
|||
self.assertFalse(mock_upload_media.called)
|
||||
payload = mock_wp_request.call_args.kwargs["payload"]
|
||||
self.assertNotIn("featured_media", payload)
|
||||
self.assertIn("<p>Inhalt</p>", payload.get("content", ""))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue