Grundfunktionen optimiert

This commit is contained in:
Oliver 2025-08-16 11:13:10 +02:00
parent 050e08859c
commit 0c84dd1a1a
No known key found for this signature in database
8 changed files with 4866 additions and 315 deletions

750
app.py
View file

@ -1,4 +1,4 @@
# app.py (aktualisiert mit Feed-Dropdown) # app.py
import streamlit as st import streamlit as st
from datetime import datetime from datetime import datetime
@ -13,151 +13,659 @@ from main import (
from utils.dalle_generator import generate_dalle_image from utils.dalle_generator import generate_dalle_image
import os import os
from collections import Counter from collections import Counter
import time
st.set_page_config(page_title="📰 RSS Artikel Manager", layout="wide") # === Page Configuration ===
st.title("📰 RSS Artikel Manager") st.set_page_config(
page_title="📰 RSS Artikel Manager",
layout="wide",
initial_sidebar_state="collapsed"
)
# === Sidebar: Feed-Verwaltung === # === Custom CSS für modernes Design ===
st.sidebar.header("📡 RSS Feeds verwalten") st.markdown("""
feeds = load_feeds() <style>
new_feed = st.sidebar.text_input("Neuen RSS Feed hinzufügen") /* Hauptcontainer */
if st.sidebar.button("Feed hinzufügen"): .main-header {
if new_feed and new_feed not in [f.get("url", f) for f in feeds]: background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
feeds.append({"url": new_feed, "name": "Neuer Feed"}) padding: 2rem;
save_feeds(feeds) border-radius: 10px;
st.sidebar.success("Feed hinzugefügt") margin-bottom: 2rem;
color: white;
text-align: center;
}
/* Artikel Cards */
.article-card {
background: white;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
border-left: 4px solid #667eea;
transition: transform 0.2s;
}
.article-card:hover {
transform: translateY(-2px);
box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15);
}
/* Status Badges */
.status-badge {
padding: 0.3rem 0.8rem;
border-radius: 20px;
font-size: 0.8rem;
font-weight: bold;
margin-right: 0.5rem;
}
.status-new { background-color: #e3f2fd; color: #1976d2; }
.status-rewrite { background-color: #fff3e0; color: #f57c00; }
.status-process { background-color: #f3e5f5; color: #7b1fa2; }
.status-online { background-color: #e8f5e8; color: #388e3c; }
.status-hold { background-color: #fce4ec; color: #c2185b; }
.status-trash { background-color: #ffebee; color: #d32f2f; }
/* Filter Section */
.filter-section {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
margin-bottom: 2rem;
}
/* Stats Cards */
.stats-card {
background: white;
padding: 1.5rem;
border-radius: 10px;
text-align: center;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.stats-number {
font-size: 2rem;
font-weight: bold;
color: #667eea;
}
/* Action Buttons */
.action-button {
margin: 0.25rem;
}
/* Image Gallery */
.image-gallery {
display: flex;
gap: 1rem;
overflow-x: auto;
padding: 1rem 0;
}
.image-item {
min-width: 200px;
text-align: center;
}
</style>
""", unsafe_allow_html=True)
if st.sidebar.button("🔄 Alle Feeds neu laden"): # === Initialize Session State ===
existing_ids = [a["id"] for a in load_articles()] if 'selected_articles' not in st.session_state:
process_articles(existing_ids) st.session_state.selected_articles = set()
st.rerun() if 'search_query' not in st.session_state:
st.session_state.search_query = ""
if 'status_filter' not in st.session_state:
st.session_state.status_filter = "New"
if 'feed_filter' not in st.session_state:
st.session_state.feed_filter = "Alle"
if st.sidebar.button("✍️ Artikel umschreiben (Rewrite)"): # === Helper Functions ===
def get_status_badge(status):
    """Return an HTML ``<span>`` badge for an article status.

    Args:
        status: Status label of the article. The known labels are "New",
            "Rewrite", "Process", "Online", "On Hold" and "Trash".

    Returns:
        An HTML snippet carrying the ``status-badge`` class plus a
        status-specific colour class. Unknown labels fall back to the
        ``status-new`` colour class.
    """
    # Local import keeps the helper self-contained.
    from html import escape

    status_classes = {
        "New": "status-new",
        "Rewrite": "status-rewrite",
        "Process": "status-process",
        "Online": "status-online",
        "On Hold": "status-hold",
        "Trash": "status-trash"
    }
    class_name = status_classes.get(status, "status-new")
    # Callers render this with unsafe_allow_html=True, so the label must be
    # escaped; the known labels contain no special characters and are unchanged.
    return f'<span class="status-badge {class_name}">{escape(str(status))}</span>'
def format_date(date_str):
    """Format an article date string for display.

    RFC 822/2822 dates (e.g. ``"Fri, 11 Jul 2025 08:54:54 +0200"`` or
    ``"... GMT"``) are rendered as ``DD.MM.YYYY HH:MM``. Anything else falls
    back to a plain slice of the input.

    Args:
        date_str: Raw date value stored on the article.

    Returns:
        The formatted date, or an empty string when no date is given.
    """
    from email.utils import parsedate_to_datetime

    if not date_str:
        return ""
    if not isinstance(date_str, str):
        date_str = str(date_str)

    try:
        # Unlike strptime's %z, this accepts the literal "GMT" suffix and
        # negative UTC offsets.
        return parsedate_to_datetime(date_str).strftime("%d.%m.%Y %H:%M")
    except (TypeError, ValueError):
        # Not an RFC 2822 date (older Python versions raise TypeError here).
        pass

    if "GMT" in date_str or "+" in date_str:
        # Unparseable value that carries a zone marker: keep only the date part.
        return date_str[:10]
    # ISO-like "YYYY-MM-DDTHH:MM..." value: keep date and minutes.
    return date_str[:16].replace("T", " ")
def get_word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Falsy input (empty string, ``None``) counts as zero words.
    """
    if not text:
        return 0
    words = text.split()
    return len(words)
def show_notification(message, type="success"):
    """Show ``message`` with the Streamlit call named by ``type``.

    Supported types are "success", "error", "warning" and "info"; any other
    value displays nothing.
    """
    supported_types = ("success", "error", "warning", "info")
    if type in supported_types:
        # Each supported type matches the name of the st.* function to call.
        getattr(st, type)(message)
# === Feed-Filter === # === Header ===
source_to_name = {f.get("url"): f.get("name", "unidentified") for f in feeds} st.markdown("""
source_counter = Counter([a.get("source", "unidentified") for a in articles]) <div class="main-header">
<h1>📰 RSS Artikel Manager</h1>
<p>Moderne Verwaltung deiner RSS-Feeds und Artikel</p>
</div>
""", unsafe_allow_html=True)
feed_options = ["Alle ({})".format(len(articles))] # === Tab Navigation ===
feed_map = {} tab1, tab2, tab3, tab4, tab5 = st.tabs([
"📋 Dashboard",
"📰 Artikel",
"📡 Feeds",
"🖼️ Bilder",
"📊 Statistiken"
])
for source, count in source_counter.items(): # === Dashboard Tab ===
import html  # stdlib; escapes feed-supplied text before it is embedded in raw HTML

# Markup for one statistics card: first the number, then its label.
STATS_CARD_TEMPLATE = """
<div class="stats-card">
<div class="stats-number">{}</div>
<div>{}</div>
</div>
"""

# Markup for one entry in the "latest articles" preview.
RECENT_ARTICLE_TEMPLATE = """
<div class="article-card">
<div style="display: flex; justify-content: space-between; align-items: center;">
<div>
<strong>{title}</strong>
<br>
<small>{date}</small>
</div>
<div>
{badge}
</div>
</div>
</div>
"""

with tab1:
    st.header("📊 Übersicht")

    # Load the data once per run; the later tabs reuse both names.
    all_articles = load_articles()
    feeds = load_feeds()

    # Headline statistics: one card per column.
    new_count = len([a for a in all_articles if a.get("status") == "New"])
    online_count = len([a for a in all_articles if a.get("status") == "Online"])
    stats_cards = [
        (len(all_articles), "Gesamt Artikel"),
        (new_count, "Neue Artikel"),
        (len(feeds), "RSS Feeds"),
        (online_count, "Online"),
    ]
    col1, col2, col3, col4 = st.columns(4)
    for column, (value, label) in zip((col1, col2, col3, col4), stats_cards):
        with column:
            st.markdown(STATS_CARD_TEMPLATE.format(value, label), unsafe_allow_html=True)

    st.markdown("<br>", unsafe_allow_html=True)

    # Quick actions
    st.subheader("⚡ Schnellaktionen")
    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("🔄 Alle Feeds aktualisieren", use_container_width=True):
            with st.spinner("Feeds werden aktualisiert..."):
                existing_ids = [a["id"] for a in all_articles]
                process_articles(existing_ids)
            show_notification("Feeds erfolgreich aktualisiert!")
            # Presumably a short pause so the message is readable before the rerun.
            time.sleep(1)
            st.rerun()

    with col2:
        if st.button("✍️ Artikel umschreiben", use_container_width=True):
            rewrite_count = len([a for a in all_articles if a.get("status") == "Rewrite"])
            if rewrite_count > 0:
                with st.spinner(f"{rewrite_count} Artikel werden umgeschrieben..."):
                    rewrite_articles()
                show_notification(f"{rewrite_count} Artikel erfolgreich umgeschrieben!")
                time.sleep(1)
                st.rerun()
            else:
                show_notification("Keine Artikel zum Umschreiben gefunden.", "info")

    with col3:
        # Only reports how many articles are in the trash; nothing is deleted here.
        if st.button("🧹 Aufräumen", use_container_width=True):
            trash_count = len([a for a in all_articles if a.get("status") == "Trash"])
            if trash_count > 0:
                show_notification(f"{trash_count} Artikel im Papierkorb gefunden.", "info")
            else:
                show_notification("Keine Artikel zum Aufräumen gefunden.", "info")

    # Preview of the latest articles
    st.subheader("🕒 Neueste Artikel")
    # NOTE(review): this is a plain string sort, so dates stored in different
    # formats will not order chronologically. `or ""` keeps a None date from
    # breaking the comparison.
    recent_articles = sorted(all_articles, key=lambda x: x.get("date") or "", reverse=True)[:5]

    for article in recent_articles:
        # Title and date come from external feeds, so escape them before they
        # go into raw HTML. The badge is already HTML and is inserted as is.
        card_html = RECENT_ARTICLE_TEMPLATE.format(
            title=html.escape(str(article.get('title', 'Kein Titel'))),
            date=html.escape(str(format_date(article.get('date', '')))),
            badge=get_status_badge(article.get('status', 'New')),
        )
        st.markdown(card_html, unsafe_allow_html=True)
selected_feed_label = st.selectbox("Feed-Auswahl", feed_options) # === Artikel Tab ===
with tab2:
if selected_feed_label != feed_options[0]: # nicht „Alle“ st.header("📰 Artikel verwalten")
selected_source = feed_map[selected_feed_label]
articles = [a for a in articles if a.get("source", "unidentified") == selected_source] # Filter Section
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
# === Artikel-Tabelle === st.subheader("🔍 Filter & Suche")
if articles:
st.markdown("### 📄 Übersichtstabelle") col1, col2, col3 = st.columns(3)
st.write("**Spaltenübersicht:** Auswahl | Datum | Titel | Zusammenfassung | Wörter | Tags | Status")
with col1:
for article in articles: status_options = ["Alle", "New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
st.session_state.status_filter = st.selectbox(
"Status",
status_options,
index=status_options.index(st.session_state.status_filter)
)
with col2:
# Feed Filter
source_to_name = {f.get("url"): f.get("name", "Unbekannt") for f in feeds}
source_counter = Counter([a.get("source", "Unbekannt") for a in all_articles])
feed_options = ["Alle"]
feed_map = {"Alle": None}
for source, count in source_counter.items():
name = source_to_name.get(source, "Unbekannt")
label = f"{name} ({count})"
feed_options.append(label)
feed_map[label] = source
selected_feed_label = st.selectbox("Feed", feed_options)
st.session_state.feed_filter = selected_feed_label
with col3:
st.session_state.search_query = st.text_input(
"Suche",
value=st.session_state.search_query,
placeholder="Titel, Text oder Tags durchsuchen..."
)
st.markdown('</div>', unsafe_allow_html=True)
# Filter anwenden
filtered_articles = all_articles
# Status Filter
if st.session_state.status_filter != "Alle":
filtered_articles = [a for a in filtered_articles if a.get("status") == st.session_state.status_filter]
# Feed Filter
if st.session_state.feed_filter != "Alle":
selected_source = feed_map[st.session_state.feed_filter]
filtered_articles = [a for a in filtered_articles if a.get("source") == selected_source]
# Suche
if st.session_state.search_query:
query = st.session_state.search_query.lower()
filtered_articles = [
a for a in filtered_articles
if query in a.get("title", "").lower()
or query in a.get("text", "").lower()
or any(query in tag.lower() for tag in a.get("tags", []))
]
# Ergebnisse anzeigen
st.write(f"**{len(filtered_articles)} Artikel gefunden**")
# Artikel Cards
for article in filtered_articles:
has_incomplete_images = any( has_incomplete_images = any(
not all(k in img and img[k] for k in ("caption", "copyright", "copyright_url")) not all(k in img and img[k] for k in ("caption", "copyright", "copyright_url"))
for img in article.get("images", []) for img in article.get("images", [])
) )
cols = st.columns([0.05, 0.1, 0.2, 0.25, 0.05, 0.2, 0.15]) # Article Card
with cols[0]: st.markdown('<div class="article-card">', unsafe_allow_html=True)
st.checkbox("", key=f"select_{article['id']}")
with cols[1]: # Header
date_str = article["date"] col1, col2 = st.columns([3, 1])
if "GMT" in date_str or "+" in date_str:
date_str = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z").strftime("%d.%m.%y") with col1:
else: title = article.get("title", "Kein Titel")
date_str = date_str[:10]
st.markdown(date_str)
with cols[2]:
title = f"**{article['title']}**"
if has_incomplete_images: if has_incomplete_images:
title += " ⚠️" title += " ⚠️"
st.markdown(title) st.markdown(f"**{title}**")
with cols[3]: st.markdown(f"📅 {format_date(article.get('date', ''))}")
st.markdown(article.get("summary", "")[:150])
with cols[4]: with col2:
st.markdown(str(len(article.get("text", "").split()))) st.markdown(get_status_badge(article.get("status", "New")), unsafe_allow_html=True)
with cols[5]:
st.markdown(", ".join(article.get("tags", []))) # Content Preview
with cols[6]: summary = article.get("summary", "")[:200]
if len(summary) == 200:
summary += "..."
st.markdown(summary)
# Meta Info
col1, col2, col3 = st.columns(3)
with col1:
st.markdown(f"📝 **{get_word_count(article.get('text', ''))} Wörter**")
with col2:
tags = article.get("tags", [])
if tags:
st.markdown(f"🏷️ {', '.join(tags[:3])}{'...' if len(tags) > 3 else ''}")
with col3:
source_name = source_to_name.get(article.get("source", ""), "Unbekannt")
st.markdown(f"📡 {source_name}")
# Actions
col1, col2, col3, col4 = st.columns(4)
with col1:
# Status ändern
status_options = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"] status_options = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
current_status = article.get("status", "New") current_status = article.get("status", "New")
new_status = st.selectbox("", status_options, index=status_options.index(current_status), key=f"status_{article['id']}") new_status = st.selectbox(
"Status",
status_options,
index=status_options.index(current_status),
key=f"status_{article['id']}"
)
if new_status != current_status: if new_status != current_status:
article["status"] = new_status # Artikel in der Liste finden und aktualisieren
for idx, art in enumerate(all_articles): for idx, art in enumerate(all_articles):
if art["id"] == article["id"]: if art["id"] == article["id"]:
all_articles[idx] = article all_articles[idx]["status"] = new_status
break break
save_articles(all_articles) save_articles(all_articles)
show_notification(f"Status auf '{new_status}' geändert!")
time.sleep(0.5)
st.rerun() st.rerun()
with st.expander(f"🔍 {article['title']}"): with col2:
st.markdown("#### ✍️ Artikeltext") if st.button("📋 Text kopieren", key=f"copy_{article['id']}"):
st.code(f"{article['title']}\n\n{article['text']}\n\nQuelle: {article['link']}", language="markdown") text_to_copy = f"{article['title']}\n\n{article['text']}\n\nQuelle: {article['link']}"
st.code(text_to_copy, language="markdown")
st.markdown("#### 🌿 Tags") show_notification("Text bereit zum Kopieren!")
st.code(", ".join(article.get("tags", [])), language="markdown")
with col3:
st.markdown("#### 🖼️ Bilder") if st.button("🔗 Original öffnen", key=f"link_{article['id']}"):
for i, img in enumerate(article.get("images", [])): st.markdown(f"[🔗 Artikel öffnen]({article.get('link', '#')})")
st.image(img["url"], caption=img.get("caption", "Kein Titel"), use_column_width=True)
with col4:
with st.form(f"edit_image_{article['id']}_{i}", clear_on_submit=False): # Details anzeigen
caption = st.text_input("Bildtitel", value=img.get("caption", "")) if st.button("📖 Details", key=f"details_{article['id']}"):
copyright = st.text_input("Copyright", value=img.get("copyright", "")) st.session_state[f"show_details_{article['id']}"] = not st.session_state.get(f"show_details_{article['id']}", False)
copyright_url = st.text_input("Quelle", value=img.get("copyright_url", ""))
if st.form_submit_button("Änderungen speichern"): # Details Section (wenn erweitert)
img["caption"] = caption or "Kein Bildtitel vorhanden" if st.session_state.get(f"show_details_{article['id']}", False):
img["copyright"] = copyright or "Unbekannt" st.markdown("---")
img["copyright_url"] = copyright_url or "#"
for idx, art in enumerate(all_articles): # Artikel Text
if art["id"] == article["id"]: with st.expander("📝 Volltext", expanded=False):
all_articles[idx] = article st.code(article.get("text", ""), language="markdown")
break
save_articles(all_articles) # Tags bearbeiten
st.success("Bilddaten gespeichert") with st.expander("🏷️ Tags bearbeiten", expanded=False):
current_tags = ", ".join(article.get("tags", []))
new_tags = st.text_area("Tags (getrennt durch Komma)", value=current_tags, key=f"tags_{article['id']}")
if st.button("Tags speichern", key=f"save_tags_{article['id']}"):
tag_list = [tag.strip() for tag in new_tags.split(",") if tag.strip()]
for idx, art in enumerate(all_articles):
if art["id"] == article["id"]:
all_articles[idx]["tags"] = tag_list
break
save_articles(all_articles)
show_notification("Tags gespeichert!")
st.rerun()
# Bilder
if article.get("images"):
with st.expander("🖼️ Bilder verwalten", expanded=False):
for i, img in enumerate(article.get("images", [])):
col1, col2 = st.columns([1, 2])
with col1:
st.image(img["url"], width=200)
with col2:
caption = st.text_input("Bildtitel", value=img.get("caption", ""), key=f"caption_{article['id']}_{i}")
copyright_text = st.text_input("Copyright", value=img.get("copyright", ""), key=f"copyright_{article['id']}_{i}")
copyright_url = st.text_input("Quelle URL", value=img.get("copyright_url", ""), key=f"copyright_url_{article['id']}_{i}")
if st.button("Bilddaten speichern", key=f"save_img_{article['id']}_{i}"):
img["caption"] = caption or "Kein Bildtitel vorhanden"
img["copyright"] = copyright_text or "Unbekannt"
img["copyright_url"] = copyright_url or "#"
for idx, art in enumerate(all_articles):
if art["id"] == article["id"]:
all_articles[idx] = article
break
save_articles(all_articles)
show_notification("Bilddaten gespeichert!")
st.rerun()
# DALL-E Bildgenerierung
if st.button("🪄 KI-Bild generieren", key=f"dalle_{article['id']}"): if st.button("🪄 KI-Bild generieren", key=f"dalle_{article['id']}"):
if not any(img.get("copyright") == "OpenAI DALL·E" for img in article.get("images", [])): if not any(img.get("copyright") == "OpenAI DALL·E" for img in article.get("images", [])):
prompt = article["title"] with st.spinner("Bild wird generiert..."):
image_url = generate_dalle_image(prompt) prompt = article["title"]
if image_url: image_url = generate_dalle_image(prompt)
article.setdefault("images", []).append({ if image_url:
"url": image_url, article.setdefault("images", []).append({
"alt": f"KI-generiertes Titelbild zu: {prompt}", "url": image_url,
"caption": f"KI-generiertes Titelbild zu: {prompt}", "alt": f"KI-generiertes Titelbild zu: {prompt}",
"copyright": "OpenAI DALL·E", "caption": f"KI-generiertes Titelbild zu: {prompt}",
"copyright_url": "https://openai.com/dall-e" "copyright": "OpenAI DALL·E",
}) "copyright_url": "https://openai.com/dall-e"
for idx, art in enumerate(all_articles): })
if art["id"] == article["id"]: for idx, art in enumerate(all_articles):
all_articles[idx] = article if art["id"] == article["id"]:
break all_articles[idx] = article
save_articles(all_articles) break
st.success("DALL·E-Bild erfolgreich hinzugefügt") save_articles(all_articles)
st.rerun() show_notification("DALL·E-Bild erfolgreich hinzugefügt!")
else: st.rerun()
st.error("Fehler beim Erzeugen des Bildes.") else:
show_notification("Fehler beim Erzeugen des Bildes.", "error")
else: else:
st.info("Ein KI-generiertes Bild ist bereits vorhanden.") show_notification("Ein KI-generiertes Bild ist bereits vorhanden.", "info")
else:
st.info("Keine Artikel für den gewählten Status gefunden.") st.markdown('</div>', unsafe_allow_html=True)
import html  # stdlib; escapes user-entered feed data before it is rendered as HTML

# Markup for one feed card in the list below.
FEED_CARD_TEMPLATE = """
<div class="article-card">
<div style="display: flex; justify-content: space-between; align-items: center;">
<div>
<strong>{name}</strong>
<br>
<small>{url}</small>
<br>
<span style="color: #667eea;">📰 {count} Artikel</span>
</div>
<div>
<span class="status-badge status-online">{count} Artikel</span>
</div>
</div>
</div>
"""

# === Feeds tab ===
with tab3:
    st.header("📡 RSS Feeds verwalten")

    # Add a new feed
    with st.expander(" Neuen Feed hinzufügen", expanded=False):
        col1, col2 = st.columns(2)
        with col1:
            new_url = st.text_input("Feed URL")
        with col2:
            new_name = st.text_input("Feed Name")

        if st.button("Feed hinzufügen", use_container_width=True):
            if not (new_url and new_name):
                show_notification("Bitte URL und Name eingeben.", "error")
            elif any(f.get("url") == new_url for f in feeds):
                show_notification("Dieser Feed existiert bereits.", "warning")
            else:
                feeds.append({"url": new_url, "name": new_name})
                save_feeds(feeds)
                show_notification(f"Feed '{new_name}' hinzugefügt!")
                st.rerun()

    # List the configured feeds
    for idx, feed in enumerate(feeds):
        feed_url = feed.get("url", "")
        feed_name = feed.get("name", "Unbekannt")
        article_count = sum(1 for a in all_articles if a.get("source") == feed_url)

        # The edit flag needs its own session-state key. Reusing the button's
        # widget key ("edit_feed_{idx}") makes Streamlit raise on assignment,
        # because button values cannot be set through st.session_state.
        edit_flag = f"editing_feed_{idx}"

        st.markdown(
            FEED_CARD_TEMPLATE.format(
                # Name and URL are free-text user input rendered as raw HTML.
                name=html.escape(str(feed_name)),
                url=html.escape(str(feed_url)),
                count=article_count,
            ),
            unsafe_allow_html=True,
        )

        # Per-feed actions
        col1, col2, col3 = st.columns(3)
        with col1:
            if st.button("✏️ Bearbeiten", key=f"edit_feed_{idx}"):
                st.session_state[edit_flag] = not st.session_state.get(edit_flag, False)

        with col2:
            if st.button("🔄 Aktualisieren", key=f"refresh_feed_{idx}"):
                with st.spinner("Feed wird aktualisiert..."):
                    existing_ids = [a["id"] for a in all_articles]
                    # TODO: a single-feed update function could be used here;
                    # for now the general article update is called.
                    process_articles(existing_ids)
                show_notification(f"Feed '{feed_name}' aktualisiert!")
                st.rerun()

        with col3:
            if st.button("🗑️ Löschen", key=f"delete_feed_{idx}"):
                # Popping during iteration is safe here only because the
                # script reruns immediately afterwards.
                feeds.pop(idx)
                save_feeds(feeds)
                show_notification(f"Feed '{feed_name}' gelöscht!", "warning")
                st.rerun()

        # Edit form, shown while the edit flag is set
        if st.session_state.get(edit_flag, False):
            with st.form(f"edit_form_{idx}"):
                new_feed_url = st.text_input("URL", value=feed_url)
                new_feed_name = st.text_input("Name", value=feed_name)

                if st.form_submit_button("Änderungen speichern"):
                    feeds[idx]["url"] = new_feed_url
                    feeds[idx]["name"] = new_feed_name
                    save_feeds(feeds)
                    show_notification("Feed aktualisiert!")
                    st.session_state[edit_flag] = False
                    st.rerun()
# === Images tab ===
with tab4:
    st.header("🖼️ Bilderverwaltung")

    # Collect every image of every article, annotated with the title and id
    # of the article it belongs to.
    all_images = [
        {
            **img,
            "article_title": article.get("title", "Unbekannt"),
            "article_id": article.get("id"),
        }
        for article in all_articles
        for img in article.get("images", [])
    ]

    if not all_images:
        st.info("Keine Bilder gefunden.")
    else:
        st.write(f"**{len(all_images)} Bilder gefunden**")

        # Spread the images round-robin over three columns.
        cols = st.columns(3)
        for idx, img in enumerate(all_images):
            source_url = img.get("copyright_url")
            with cols[idx % 3]:
                st.image(img["url"], use_column_width=True)
                st.markdown(f"**{img.get('caption', 'Kein Titel')}**")
                st.markdown(f"📰 {img['article_title']}")
                st.markdown(f"©️ {img.get('copyright', 'Unbekannt')}")
                # "#" is the placeholder stored when no source URL is known.
                if source_url and source_url != "#":
                    st.markdown(f"[🔗 Quelle]({img['copyright_url']})")
# === Statistics tab ===
with tab5:
    st.header("📊 Detaillierte Statistiken")

    # Status distribution
    status_counts = Counter(a.get("status", "New") for a in all_articles)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("📈 Status Verteilung")
        for status, count in status_counts.items():
            if all_articles:
                percentage = count / len(all_articles) * 100
            else:
                percentage = 0
            st.markdown(f"{get_status_badge(status)} {count} ({percentage:.1f}%)", unsafe_allow_html=True)

    with col2:
        st.subheader("📡 Artikel pro Feed")
        # Sources without a configured feed are grouped under "Unbekannt".
        feed_counts = Counter(
            source_to_name.get(a.get("source", ""), "Unbekannt") for a in all_articles
        )
        for feed_name, count in feed_counts.most_common():
            st.markdown(f"**{feed_name}:** {count} Artikel")

    # Text statistics
    st.subheader("📝 Textstatistiken")
    word_counts = [get_word_count(a.get("text", "")) for a in all_articles]

    if word_counts:
        average_words = sum(word_counts) // len(word_counts)
        metrics = (
            ("Durchschnittliche Wortanzahl", f"{average_words}"),
            ("Längster Artikel", f"{max(word_counts)} Wörter"),
            ("Kürzester Artikel", f"{min(word_counts)} Wörter"),
        )
        for column, (label, value) in zip(st.columns(3), metrics):
            with column:
                st.metric(label, value)

    # Most frequent tags (a simple list in place of a tag cloud)
    st.subheader("🏷️ Häufigste Tags")
    all_tags = [tag for article in all_articles for tag in article.get("tags", [])]

    if not all_tags:
        st.info("Keine Tags gefunden.")
    else:
        tag_counts = Counter(all_tags)
        for tag, count in tag_counts.most_common(10):
            st.markdown(f"**{tag}:** {count}x verwendet")

File diff suppressed because one or more lines are too long

20
internal/git.sh Normal file
View file

@ -0,0 +1,20 @@
# Fetch the current state of main/master
git checkout main
git pull origin main
# Create a new feature branch
git checkout -b feature/neue-funktion
# Develop and commit
git add .
git commit -m "Neue Funktion implementiert"
# Push the branch to the remote repository
git push -u origin feature/neue-funktion
# List all branches
git branch -a
# Show the current branch
git branch --show-current

View file

@ -218,3 +218,296 @@
2025-07-11 08:54:54,834 - INFO - 5 neue Artikel gespeichert. 2025-07-11 08:54:54,834 - INFO - 5 neue Artikel gespeichert.
2025-07-11 09:34:42,951 - INFO - ❌ Feed gelöscht: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber) 2025-07-11 09:34:42,951 - INFO - ❌ Feed gelöscht: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber)
2025-07-11 09:35:05,863 - INFO - 🔗 Neuer Feed hinzugefügt: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber) 2025-07-11 09:35:05,863 - INFO - 🔗 Neuer Feed hinzugefügt: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber)
2025-07-28 09:17:09,355 - INFO - ✍️ Umschreiben von: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
2025-07-28 09:17:19,759 - INFO - Retrying request to /chat/completions in 0.484478 seconds
2025-07-28 09:17:30,386 - INFO - Retrying request to /chat/completions in 0.765465 seconds
2025-07-28 09:17:41,238 - ERROR - ❌ Fehler beim Umschreiben von 'Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!':
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 101, in map_httpcore_exceptions
yield
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 256, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 236, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 101, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 78, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 124, in _connect
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_backends/sync.py", line 207, in connect_tcp
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
self.gen.throw(value)
~~~~~~~~~~~~~~^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
httpcore.ConnectError: [Errno 8] nodename nor servname provided, or not known
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 972, in request
response = self._client.send(
request,
stream=stream or self._should_stream_response_body(request=request),
**kwargs,
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 914, in send
response = self._send_handling_auth(
request,
...<2 lines>...
history=[],
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
request,
follow_redirects=follow_redirects,
history=history,
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 249, in handle_request
with map_httpcore_exceptions():
~~~~~~~~~~~~~~~~~~~~~~~^^
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
self.gen.throw(value)
~~~~~~~~~~~~~~^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ConnectError: [Errno 8] nodename nor servname provided, or not known
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/main.py", line 145, in rewrite_articles
response = openai.chat.completions.create(
model="gpt-4",
...<3 lines>...
]
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_utils/_utils.py", line 287, in wrapper
return func(*args, **kwargs)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/resources/chat/completions/completions.py", line 1087, in create
return self._post(
~~~~~~~~~~^
"/chat/completions",
^^^^^^^^^^^^^^^^^^^^
...<43 lines>...
stream_cls=Stream[ChatCompletionChunk],
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1249, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1004, in request
raise APIConnectionError(request=request) from err
openai.APIConnectionError: Connection error.
2025-07-28 09:18:02,091 - INFO - ✍️ Umschreiben von: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
2025-07-28 09:18:02,094 - INFO - Retrying request to /chat/completions in 0.415304 seconds
2025-07-28 09:18:02,517 - INFO - Retrying request to /chat/completions in 0.899018 seconds
2025-07-28 09:18:03,419 - ERROR - ❌ Fehler beim Umschreiben von 'Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans':
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 101, in map_httpcore_exceptions
yield
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 256, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 236, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 101, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 78, in handle_request
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 124, in _connect
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_backends/sync.py", line 207, in connect_tcp
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
self.gen.throw(value)
~~~~~~~~~~~~~~^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
httpcore.ConnectError: [Errno 8] nodename nor servname provided, or not known
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 972, in request
response = self._client.send(
request,
stream=stream or self._should_stream_response_body(request=request),
**kwargs,
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 914, in send
response = self._send_handling_auth(
request,
...<2 lines>...
history=[],
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
request,
follow_redirects=follow_redirects,
history=history,
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 249, in handle_request
with map_httpcore_exceptions():
~~~~~~~~~~~~~~~~~~~~~~~^^
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
self.gen.throw(value)
~~~~~~~~~~~~~~^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ConnectError: [Errno 8] nodename nor servname provided, or not known
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/oliver/Documents/rss-news/main.py", line 145, in rewrite_articles
response = openai.chat.completions.create(
model="gpt-4",
...<3 lines>...
]
)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_utils/_utils.py", line 287, in wrapper
return func(*args, **kwargs)
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/resources/chat/completions/completions.py", line 1087, in create
return self._post(
~~~~~~~~~~^
"/chat/completions",
^^^^^^^^^^^^^^^^^^^^
...<43 lines>...
stream_cls=Stream[ChatCompletionChunk],
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1249, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1004, in request
raise APIConnectionError(request=request) from err
openai.APIConnectionError: Connection error.
2025-07-28 09:18:43,426 - INFO - ✍️ Umschreiben von: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
2025-07-28 09:19:04,744 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:19:09,962 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:19:09,964 - INFO - ✅ Artikel umgeschrieben: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
2025-07-28 09:19:09,964 - INFO - ✍️ Umschreiben von: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
2025-07-28 09:19:23,989 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:19:27,267 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:19:27,269 - INFO - ✅ Artikel umgeschrieben: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
2025-07-28 09:19:27,276 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
2025-07-28 09:27:10,258 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
2025-07-28 09:27:26,502 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
2025-07-28 09:27:26,514 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-yKWdGDCQJZBOCQ4V4HoD40A0.png?st=2025-07-28T06%3A27%3A26Z&se=2025-07-28T08%3A27%3A26Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=cc612491-d948-4d2e-9821-2683df3719f5&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-07-27T22%3A07%3A39Z&ske=2025-07-28T22%3A07%3A39Z&sks=b&skv=2024-08-04&sig=HUMRhg2FbaKnLil%2BMbyvNemVeBcrvTODpctkfQyFHPc%3D
2025-07-28 09:39:35,087 - INFO - Lade Feed: https://www.camping-news.de/rss/
2025-07-28 09:39:35,473 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
2025-07-28 09:39:35,473 - INFO - Lade Feed: https://www.promobil.de/rss/news
2025-07-28 09:39:35,914 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
2025-07-28 09:39:35,915 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
2025-07-28 09:39:36,365 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/stunt-auf-wohnwagen-jensen-ackles-countdown/
2025-07-28 09:39:36,584 - INFO - 16 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/stunt-auf-wohnwagen-jensen-ackles-countdown/
2025-07-28 09:39:36,585 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/clever-campen-podcast-wie-passen-gravelbikes-und-camping-zusamen/
2025-07-28 09:39:36,793 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/clever-campen-podcast-wie-passen-gravelbikes-und-camping-zusamen/
2025-07-28 09:39:36,794 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/stellplatz-radar-30-tage-kostenlos-alle-plus-funktionen/
2025-07-28 09:39:36,999 - INFO - 15 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/stellplatz-radar-30-tage-kostenlos-alle-plus-funktionen/
2025-07-28 09:39:36,999 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/wann-der-digitale-fahrzeugschein-fuer-alle-kommt/
2025-07-28 09:39:37,219 - INFO - 14 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/wann-der-digitale-fahrzeugschein-fuer-alle-kommt/
2025-07-28 09:39:37,220 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/umfrage-welches-bad-brauchen-sie-im-wohnmobil/
2025-07-28 09:39:37,439 - INFO - 22 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/umfrage-welches-bad-brauchen-sie-im-wohnmobil/
2025-07-28 09:39:37,440 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/umfrage-kaffeegenuss-camping-wohnmobil-wohnwagen/
2025-07-28 09:39:37,744 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/umfrage-kaffeegenuss-camping-wohnmobil-wohnwagen/
2025-07-28 09:39:37,746 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/meinung-privates-mietbad-beim-camping-komfort-oder-stilbruch/
2025-07-28 09:39:37,983 - INFO - 17 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/meinung-privates-mietbad-beim-camping-komfort-oder-stilbruch/
2025-07-28 09:39:37,984 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/caravan-salon-duesseldorf-eine-einmalige-gelegenheit-fuer-camping-zubehoer-shopper/
2025-07-28 09:39:38,242 - INFO - 20 Bilder gefunden bei https://www.promobil.de/tipps/caravan-salon-duesseldorf-eine-einmalige-gelegenheit-fuer-camping-zubehoer-shopper/
2025-07-28 09:39:38,244 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/caravan-salon-2025-6-gruende-besuch-messe/
2025-07-28 09:39:38,476 - INFO - 20 Bilder gefunden bei https://www.promobil.de/tipps/caravan-salon-2025-6-gruende-besuch-messe/
2025-07-28 09:39:38,479 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/tipps-schutz-gegen-sommerhitze-hitzestau-wohnmobil/
2025-07-28 09:39:38,758 - INFO - 24 Bilder gefunden bei https://www.promobil.de/tipps/tipps-schutz-gegen-sommerhitze-hitzestau-wohnmobil/
2025-07-28 09:39:38,759 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/ratgeber/dethleffs-reiselust-praemie-2025-rabatte-wohnmobile/
2025-07-28 09:39:39,019 - INFO - 16 Bilder gefunden bei https://www.promobil.de/ratgeber/dethleffs-reiselust-praemie-2025-rabatte-wohnmobile/
2025-07-28 09:39:39,021 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/promobil-newsletter-ab-sofort-zwei-mal-die-woche-samstags-keine-camping-news-verpassen/
2025-07-28 09:39:39,254 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/promobil-newsletter-ab-sofort-zwei-mal-die-woche-samstags-keine-camping-news-verpassen/
2025-07-28 09:39:39,256 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/campingtourismus-boomt-drittes-camping-rekordjahr-in-folge/
2025-07-28 09:39:39,516 - INFO - 18 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/campingtourismus-boomt-drittes-camping-rekordjahr-in-folge/
2025-07-28 09:39:39,517 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/ratgeber/camping-ausruester-herzog-beantragt-insolvenz-wohnmobilhandel-belastet-vorzelthersteller/
2025-07-28 09:39:39,729 - INFO - 13 Bilder gefunden bei https://www.promobil.de/ratgeber/camping-ausruester-herzog-beantragt-insolvenz-wohnmobilhandel-belastet-vorzelthersteller/
2025-07-28 09:39:39,731 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/bordtechnik-einmaleins-einsteiger-tipps-fuer-den-campingurlaub/
2025-07-28 09:39:39,951 - INFO - 13 Bilder gefunden bei https://www.promobil.de/tipps/bordtechnik-einmaleins-einsteiger-tipps-fuer-den-campingurlaub/
2025-07-28 09:39:39,952 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/gotthard-brenner-reschenpass-wo-wohnmobile-und-camper-heute-geduld-brauchen/
2025-07-28 09:39:40,197 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/gotthard-brenner-reschenpass-wo-wohnmobile-und-camper-heute-geduld-brauchen/
2025-07-28 09:39:40,199 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/leichtbautricks-wohnmobilhersteller/
2025-07-28 09:39:40,458 - INFO - 13 Bilder gefunden bei https://www.promobil.de/tipps/leichtbautricks-wohnmobilhersteller/
2025-07-28 09:39:40,462 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/slowenische-campingmarken-sind-laengst-auf-dem-deutschen-markt-angekommen/
2025-07-28 09:39:40,695 - INFO - 16 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/slowenische-campingmarken-sind-laengst-auf-dem-deutschen-markt-angekommen/
2025-07-28 09:39:40,697 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/recap-folge-2-bella-italia-camping-auf-deutsch/
2025-07-28 09:39:40,919 - INFO - 14 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/recap-folge-2-bella-italia-camping-auf-deutsch/
2025-07-28 09:39:40,922 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/diese-kostenlosen-apps-muessen-camper-kennen/
2025-07-28 09:39:41,210 - INFO - 17 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/diese-kostenlosen-apps-muessen-camper-kennen/
2025-07-28 09:39:41,210 - INFO - 20 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
2025-07-28 09:39:41,211 - INFO - Lade Feed: https://caravan.fm/
2025-07-28 09:39:44,233 - INFO - 0 neue Artikel gefunden in https://caravan.fm/
2025-07-28 09:39:44,238 - INFO - 20 neue Artikel gespeichert.
2025-07-28 09:42:36,590 - INFO - ❌ Feed gelöscht: Neuer Feed (https://caravan.fm/)
2025-07-28 09:44:53,801 - INFO - ✍️ Umschreiben von: Pannen und Probleme im Wohnmobil & Wohnwagen: Erste Hilfe für die Camper-Bordtechnik
2025-07-28 09:45:18,500 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:45:21,113 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-28 09:45:21,126 - INFO - ✅ Artikel umgeschrieben: Pannen und Probleme im Wohnmobil & Wohnwagen: Erste Hilfe für die Camper-Bordtechnik
2025-07-28 09:45:21,146 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
2025-07-28 10:29:47,016 - INFO - Lade Feed: https://www.camping-news.de/rss/
2025-07-28 10:29:47,407 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
2025-07-28 10:29:47,407 - INFO - Lade Feed: https://www.promobil.de/rss/news
2025-07-28 10:29:47,719 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
2025-07-28 10:29:47,719 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
2025-07-28 10:29:48,183 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
2025-07-28 10:29:48,183 - INFO - Keine neuen Artikel gefunden.
2025-07-29 19:30:44,481 - INFO - Lade Feed: https://www.camping-news.de/rss/
2025-07-29 19:30:44,923 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
2025-07-29 19:30:44,923 - INFO - Lade Feed: https://www.promobil.de/rss/news
2025-07-29 19:30:45,348 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
2025-07-29 19:30:45,348 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
2025-07-29 19:30:45,899 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
2025-07-29 19:30:45,899 - INFO - Keine neuen Artikel gefunden.
2025-08-15 09:44:18,677 - INFO - Lade Feed: https://www.camping-news.de/rss/
2025-08-15 09:44:18,993 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
2025-08-15 09:44:18,993 - INFO - Lade Feed: https://www.promobil.de/rss/news
2025-08-15 09:44:19,241 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
2025-08-15 09:44:19,241 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
2025-08-15 09:44:19,550 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/weitere-ratgeber/frankreichs-autobahnen-kein-adac-schutz-bei-pannen/
2025-08-15 09:44:19,709 - INFO - 🔍 12 img-Tags gefunden
2025-08-15 09:44:19,710 - INFO - ✅ Bild hinzugefügt: Bild aus Originalartikel...
2025-08-15 09:44:19,710 - ERROR - ❌ Unerwarteter Fehler bei Bildextraktion von https://www.promobil.de/weitere-ratgeber/frankreichs-autobahnen-kein-adac-schutz-bei-pannen/: unsupported operand type(s) for *: 'NoneType' and 'NoneType'
2025-08-15 09:44:19,710 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/weitere-ratgeber/warntafel-wahnsinn-in-italien-anbringen-an-fahrradtraegern-trotz-neuer-gesetze-empfohlen/
2025-08-15 09:44:19,856 - INFO - 🔍 13 img-Tags gefunden
2025-08-15 09:44:19,856 - INFO - ✅ Bild hinzugefügt: 02/2024, Fahrradträger mit Warntafel...
2025-08-15 09:44:19,856 - INFO - ✅ Bild hinzugefügt: Bild aus Originalartikel...
2025-08-15 09:44:19,857 - ERROR - ❌ Unerwarteter Fehler bei Bildextraktion von https://www.promobil.de/weitere-ratgeber/warntafel-wahnsinn-in-italien-anbringen-an-fahrradtraegern-trotz-neuer-gesetze-empfohlen/: unsupported operand type(s) for *: 'NoneType' and 'NoneType'
2025-08-15 09:44:19,859 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/tipps/achtung-mautschock-warum-viele-wohnmobile-bald-eine-go-box-brauchen/
2025-08-15 09:44:20,025 - INFO - 🔍 20 img-Tags gefunden
2025-08-15 09:44:20,025 - INFO - ✅ Bild hinzugefügt: Maut, Basis, Wissen, Österreich, Vignette, Go-Box,...
2025-08-15 09:44:20,025 - INFO - ✅ Bild hinzugefügt: Wohnmobil, Küste, Parkplatz, Wohnmobil, Mann...
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: 05/2025, Spanien Polizei Verkehr...
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: f_Autohof, Restaurant, essen, vegetarisch...
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: Supercheck, Dethleffs Magic Edition T 2 EB, Seiten...
2025-08-15 09:44:20,026 - INFO - 🎉 5 Bilder erfolgreich extrahiert von https://www.promobil.de/tipps/achtung-mautschock-warum-viele-wohnmobile-bald-eine-go-box-brauchen/
2025-08-15 09:44:20,026 - INFO - 3 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
2025-08-15 09:44:20,038 - INFO - 3 neue Artikel gespeichert.
2025-08-15 09:45:55,607 - INFO - ✍️ Umschreiben von: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
2025-08-15 09:46:09,158 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-15 09:46:11,508 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-15 09:46:11,564 - INFO - ✅ Artikel umgeschrieben: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
2025-08-15 09:46:11,565 - INFO - ✍️ Umschreiben von: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
2025-08-15 09:46:32,092 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-15 09:46:34,549 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-15 09:46:34,552 - INFO - ✅ Artikel umgeschrieben: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
2025-08-15 09:46:34,571 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
2025-08-15 09:48:30,972 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
2025-08-15 09:48:42,548 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
2025-08-15 09:48:42,559 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-Ksiks2ssSZxpEFf1MQedlap1.png?st=2025-08-15T06%3A48%3A42Z&se=2025-08-15T08%3A48%3A42Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=8b33a531-2df9-46a3-bc02-d4b1430a422c&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-08-14T16%3A59%3A16Z&ske=2025-08-15T16%3A59%3A16Z&sks=b&skv=2024-08-04&sig=e0/ULpNgNLwixo3UapqnxHgR18t4HCpyEtnbmik33yA%3D
2025-08-15 09:51:43,090 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
2025-08-15 09:51:53,907 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
2025-08-15 09:51:53,914 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-Pl4ik6W7mTrv2MhIbdWlwgOL.png?st=2025-08-15T06%3A51%3A53Z&se=2025-08-15T08%3A51%3A53Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=b1a0ae1f-618f-4548-84fd-8b16cacd5485&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-08-14T15%3A09%3A17Z&ske=2025-08-15T15%3A09%3A17Z&sks=b&skv=2024-08-04&sig=RHIFlJLMumrcr/jEskOVfqJ%2Bns0pDS2HM8l5siBfLmM%3D
2025-08-15 09:55:42,370 - INFO - Lade Feed: https://www.camping-news.de/rss/
2025-08-15 09:55:42,639 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
2025-08-15 09:55:42,640 - INFO - Lade Feed: https://www.promobil.de/rss/news
2025-08-15 09:55:42,843 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
2025-08-15 09:55:42,843 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
2025-08-15 09:55:43,180 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
2025-08-15 09:55:43,180 - INFO - Keine neuen Artikel gefunden.

491
main.py
View file

@ -10,6 +10,8 @@ import logging
import openai import openai
from utils.image_extractor import extract_images_with_metadata from utils.image_extractor import extract_images_with_metadata
from utils.article_extractor import extract_full_article from utils.article_extractor import extract_full_article
import hashlib
import time
load_dotenv() load_dotenv()
@ -17,10 +19,15 @@ load_dotenv()
log_dir = "logs" log_dir = "logs"
os.makedirs(log_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "rss_tool.log") log_file = os.path.join(log_dir, "rss_tool.log")
# Logging-Format verbessern
logging.basicConfig( logging.basicConfig(
filename=log_file,
level=logging.INFO, level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s" format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
handlers=[
logging.FileHandler(log_file, encoding='utf-8'),
logging.StreamHandler() # Auch in Konsole ausgeben
]
) )
openai.api_key = os.getenv("OPENAI_API_KEY") openai.api_key = os.getenv("OPENAI_API_KEY")
@ -29,156 +36,412 @@ ARTICLES_FILE = "data/articles.json"
FEEDS_FILE = "data/feeds.json" FEEDS_FILE = "data/feeds.json"
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"] VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
# === Datenordner erstellen ===
os.makedirs("data", exist_ok=True)
def generate_article_id(title, link, date):
    """Build a deterministic article ID from title, link and date.

    The three values are joined with underscores, encoded as UTF-8 and
    hashed with MD5. The hash serves only as an identifier, not for
    security purposes.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    raw_key = "{}_{}_{}".format(title, link, date)
    digest = hashlib.md5(raw_key.encode("utf-8"))
    return digest.hexdigest()
def is_duplicate_article(new_article, existing_articles):
    """Return True if ``new_article`` already exists in ``existing_articles``.

    An article counts as a duplicate when either

    * its stripped link is non-empty and equals an existing article's
      stripped link, or
    * both titles are non-empty and their word-set similarity (see
      ``calculate_similarity``) is strictly greater than 0.9.

    Titles are compared lower-cased and stripped. Missing or ``None``
    titles/links are treated as empty strings and never match.

    Args:
        new_article: Mapping with optional ``"title"`` and ``"link"`` keys.
        existing_articles: Iterable of mappings with the same keys.

    Returns:
        ``True`` on the first match, otherwise ``False``.
    """
    # `or ""` (instead of a .get default) also covers keys that are present
    # but explicitly None, e.g. null values loaded from JSON.
    new_title = (new_article.get("title") or "").lower().strip()
    new_link = (new_article.get("link") or "").strip()

    for existing in existing_articles:
        # Exact URL match.
        existing_link = (existing.get("link") or "").strip()
        if new_link and existing_link and new_link == existing_link:
            return True

        # Near-identical titles.
        existing_title = (existing.get("title") or "").lower().strip()
        if new_title and existing_title:
            if calculate_similarity(new_title, existing_title) > 0.9:
                return True

    return False
def calculate_similarity(text1, text2):
    """Return the word-set overlap of two texts as a value in [0.0, 1.0].

    Both texts are split on whitespace into sets of words; the result is
    the size of the intersection divided by the size of the union
    (Jaccard index). Two texts without any words are considered
    identical (1.0); if only one of them has no words the result is 0.0.
    """
    tokens_a = set(text1.split())
    tokens_b = set(text2.split())

    if not tokens_a and not tokens_b:
        return 1.0
    if not (tokens_a and tokens_b):
        return 0.0

    shared = len(tokens_a & tokens_b)
    combined = len(tokens_a | tokens_b)
    if combined > 0:
        return shared / combined
    return 0.0
def load_feeds(): def load_feeds():
if not os.path.exists(FEEDS_FILE): """Lädt RSS-Feeds aus der JSON-Datei"""
try:
if not os.path.exists(FEEDS_FILE):
logging.info("Feeds-Datei existiert nicht, erstelle leere Liste")
return []
with open(FEEDS_FILE, "r", encoding='utf-8') as f:
feeds = json.load(f)
logging.info(f"{len(feeds)} Feeds geladen")
return feeds
except Exception as e:
logging.error(f"❌ Fehler beim Laden der Feeds: {e}")
return [] return []
with open(FEEDS_FILE, "r") as f:
return json.load(f)
def save_feeds(feeds): def save_feeds(feeds):
with open(FEEDS_FILE, "w") as f: """Speichert RSS-Feeds in die JSON-Datei"""
json.dump(feeds, f, indent=2) try:
with open(FEEDS_FILE, "w", encoding='utf-8') as f:
json.dump(feeds, f, indent=2, ensure_ascii=False)
logging.info(f"{len(feeds)} Feeds gespeichert")
except Exception as e:
logging.error(f"❌ Fehler beim Speichern der Feeds: {e}")
def load_articles(): def load_articles():
if not os.path.exists(ARTICLES_FILE): """Lädt Artikel aus der JSON-Datei"""
try:
if not os.path.exists(ARTICLES_FILE):
logging.info("Artikel-Datei existiert nicht, erstelle leere Liste")
return []
with open(ARTICLES_FILE, "r", encoding='utf-8') as f:
articles = json.load(f)
# Status-Validierung
for article in articles:
if article.get("status") not in VALID_STATUSES:
article["status"] = "New"
logging.warning(f"⚠️ Ungültiger Status für Artikel '{article.get('title', 'Unbekannt')}' korrigiert")
logging.info(f"{len(articles)} Artikel geladen")
return articles
except Exception as e:
logging.error(f"❌ Fehler beim Laden der Artikel: {e}")
return [] return []
with open(ARTICLES_FILE, "r") as f:
articles = json.load(f)
for article in articles:
if article.get("status") not in VALID_STATUSES:
article["status"] = "New"
return articles
def save_articles(articles):
    """Persist the given articles to the JSON articles file.

    Articles lacking an ``"id"`` or ``"title"`` key are skipped with a
    warning. The remaining articles are serialised *before* the file is
    opened for writing, so a serialisation error cannot truncate the
    existing file. Errors are logged and swallowed (best-effort save).

    Args:
        articles: Iterable of article dicts.
    """
    try:
        # Validate before saving
        valid_articles = []
        for article in articles:
            if "id" in article and "title" in article:
                valid_articles.append(article)
            else:
                logging.warning(f"⚠️ Ungültiger Artikel übersprungen: {article}")

        payload = json.dumps(valid_articles, indent=2, ensure_ascii=False)
        with open(ARTICLES_FILE, "w", encoding='utf-8') as f:
            f.write(payload)
        logging.info(f"{len(valid_articles)} Artikel gespeichert")
    except Exception as e:
        logging.error(f"❌ Fehler beim Speichern der Artikel: {e}")
def clean_html_content(content):
    """Strip HTML markup from *content* and return whitespace-normalised text.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted.

    Args:
        content: HTML (or plain text) string; may be empty or ``None``.

    Returns:
        The cleaned text. ``""`` for empty/``None`` input, so callers can
        always call string methods on the result. If parsing fails, the
        error is logged and the original *content* is returned unchanged.
    """
    if not content:
        return ""

    try:
        soup = BeautifulSoup(content, "html.parser")

        # Drop script and style elements entirely
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Extract the text and collapse runs of whitespace
        return " ".join(soup.get_text(" ", strip=True).split())
    except Exception as e:
        logging.error(f"❌ Fehler beim Bereinigen des HTML-Inhalts: {e}")
        return content
def fetch_and_process_feed(feed_url, existing_articles):
    """Load and process a single RSS feed.

    Parses *feed_url* with feedparser, builds an article dict for every
    entry and keeps those that ``is_duplicate_article`` does not flag
    against *existing_articles*. Images are fetched only for kept articles.

    Args:
        feed_url: URL of the RSS feed.
        existing_articles: Articles to compare against for duplicates.

    Returns:
        List of new article dicts; an empty list when the feed has no
        entries or a critical error occurs (errors are logged).
    """
    new_articles = []
    feed_name = "Unbekannt"

    try:
        logging.info(f"🔄 Verarbeite Feed: {feed_url}")

        # Parse the feed
        feed = feedparser.parse(feed_url)

        if hasattr(feed, 'feed') and hasattr(feed.feed, 'title'):
            feed_name = feed.feed.title

        logging.info(f"📡 Feed-Name: {feed_name}")

        if not feed.entries:
            logging.warning(f"⚠️ Keine Einträge in Feed gefunden: {feed_url}")
            return []

        logging.info(f"📰 {len(feed.entries)} Einträge gefunden")

        for entry in feed.entries:
            try:
                # Extract the basic fields
                title = entry.get("title", "Kein Titel")
                date = entry.get("published", datetime.now().isoformat())
                link = entry.get("link", "")
                summary = entry.get("summary", "")

                # Extract the content: full content, else description, else summary
                content = ""
                if hasattr(entry, 'content') and entry.content:
                    content = entry.content[0].get("value", "")
                elif hasattr(entry, 'description'):
                    content = entry.description
                else:
                    content = summary

                # Strip HTML
                clean_text = clean_html_content(content)

                # Fetch the full text for short articles (fewer than 50 words)
                if len(clean_text.split()) < 50 and link:
                    logging.info(f"🔍 Kurzer Artikel erkannt, versuche Volltext-Extraktion: {title}")
                    fetched_text = extract_full_article(link)
                    # Only use the fetched text when it is longer than the feed text
                    if len(fetched_text.split()) > len(clean_text.split()):
                        clean_text = fetched_text
                        logging.info(f"✅ Volltext extrahiert: {len(clean_text.split())} Wörter")

                # Generate the article ID
                article_id = generate_article_id(title, link, date)

                # Build the new article
                new_article = {
                    "id": article_id,
                    "title": title,
                    "date": date,
                    "summary": summary[:300] + "..." if len(summary) > 300 else summary,
                    "text": clean_text,
                    "tags": [],
                    "status": "New",
                    "link": link,
                    "images": [],
                    "source": feed_url,
                    "source_name": feed_name,
                    "created_at": datetime.now().isoformat(),
                    "word_count": len(clean_text.split())
                }

                # Duplicate check
                if not is_duplicate_article(new_article, existing_articles):
                    # Extract images; a failure here does not discard the article
                    if link:
                        try:
                            images = extract_images_with_metadata(link)
                            new_article["images"] = images
                            logging.info(f"🖼️ {len(images)} Bilder für '{title}' extrahiert")
                        except Exception as e:
                            logging.error(f"❌ Fehler bei Bildextraktion für '{title}': {e}")

                    new_articles.append(new_article)
                    logging.info(f"✅ Neuer Artikel hinzugefügt: {title}")
                else:
                    logging.info(f"🔄 Duplikat übersprungen: {title}")

            except Exception as e:
                # One broken entry must not abort the rest of the feed
                logging.error(f"❌ Fehler beim Verarbeiten des Eintrags '{entry.get('title', 'Unbekannt')}': {e}")
                continue

        logging.info(f"✅ Feed verarbeitet: {len(new_articles)} neue Artikel aus {feed_url}")
        return new_articles

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Verarbeiten von {feed_url}: {e}")
        return []
def process_articles(existing_ids=None):
    """Process every configured RSS feed and store the new articles.

    Feeds are fetched one after another with a one-second pause in
    between. Articles found in one feed are added to the duplicate-check
    list used for the following feeds. The article file is rewritten only
    when at least one new article was found.

    Args:
        existing_ids: Unused by this function.
    """
    try:
        started = time.time()
        logging.info("🚀 Starte Artikel-Verarbeitung")

        feeds = load_feeds()
        all_articles = load_articles()

        if not feeds:
            logging.warning("⚠️ Keine RSS-Feeds konfiguriert")
            return

        # Separate list used for the duplicate check
        known_articles = all_articles.copy()
        added_total = 0

        for feed in feeds:
            feed_url = feed.get("url") if isinstance(feed, dict) else feed
            if not feed_url:
                logging.warning("⚠️ Feed ohne URL übersprungen")
                continue

            try:
                fresh = fetch_and_process_feed(feed_url, known_articles)

                # Add to the full list and to the duplicate-check list
                all_articles.extend(fresh)
                known_articles.extend(fresh)
                added_total += len(fresh)

                # Short pause between feeds
                time.sleep(1)
            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten von Feed {feed_url}: {e}")
                continue

        if added_total <= 0:
            logging.info(" Keine neuen Artikel gefunden")
        else:
            save_articles(all_articles)
            processing_time = time.time() - started
            logging.info(f"🎉 Verarbeitung abgeschlossen: {added_total} neue Artikel in {processing_time:.2f}s hinzugefügt")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler bei der Artikel-Verarbeitung: {e}")
def rewrite_articles():
    """Rewrite every article with status ``"Rewrite"`` via the OpenAI API.

    For each such article the text is rewritten, tags are generated, the
    status is set to ``"Process"`` and missing image metadata is filled
    with defaults. A failure on one article is logged and does not stop
    the others. The articles are saved when at least one rewrite
    succeeded, and the final log line reports the number of articles that
    were actually rewritten (not the number of candidates).
    """
    try:
        logging.info("✍️ Starte Artikel-Umschreibung")

        articles = load_articles()
        rewrite_articles_list = [a for a in articles if a.get("status") == "Rewrite"]

        if not rewrite_articles_list:
            logging.info(" Keine Artikel zum Umschreiben gefunden")
            return

        if not openai.api_key:
            logging.error("❌ OpenAI API-Key nicht konfiguriert")
            return

        rewritten_count = 0

        for article in rewrite_articles_list:
            try:
                logging.info(f"✍️ Umschreiben von: {article['title']}")

                # Rewrite the article
                prompt = (
                    "Schreibe den folgenden Artikel um und fasse ihn verständlich zusammen.\n"
                    "Behalte die wichtigsten Informationen bei, aber formuliere alles neu:\n"
                    f"{article['text']}"
                )

                response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du bist ein professioneller Redakteur, der Artikel umschreibt und verbessert."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=1500,
                    temperature=0.7
                )

                new_text = response.choices[0].message.content.strip()

                # Generate tags
                tag_prompt = (
                    "Erstelle 3-5 passende, kurze Stichwörter (Tags) für diesen Artikel.\n"
                    "Gib nur die Tags zurück, getrennt durch Kommas:\n"
                    f"{new_text}"
                )

                tag_response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du generierst präzise Tags für Blog-Artikel."},
                        {"role": "user", "content": tag_prompt}
                    ],
                    max_tokens=100,
                    temperature=0.5
                )

                tags_raw = tag_response.choices[0].message.content.strip()
                tags = [tag.strip().strip(',') for tag in tags_raw.split(",") if tag.strip()]

                # Update the article only after both API calls succeeded
                article["text"] = new_text
                article["tags"] = tags
                article["status"] = "Process"
                article["rewritten_at"] = datetime.now().isoformat()
                article["word_count"] = len(new_text.split())

                # Fill in missing image metadata
                for img in article.get("images", []):
                    if "caption" not in img or not img["caption"]:
                        img["caption"] = "Kein Bildtitel vorhanden"
                    if "copyright" not in img or not img["copyright"]:
                        img["copyright"] = "Unbekannt"
                    if "copyright_url" not in img or not img["copyright_url"]:
                        img["copyright_url"] = "#"

                logging.info(f"✅ Artikel erfolgreich umgeschrieben: {article['title']}")
                rewritten_count += 1

                # Short pause between API calls
                time.sleep(2)

            except Exception as e:
                logging.error(f"❌ Fehler beim Umschreiben von '{article['title']}': {e}")
                continue

        if rewritten_count:
            save_articles(articles)
            logging.info(f"🎉 {rewritten_count} Artikel erfolgreich umgeschrieben")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Umschreiben: {e}")
def get_article_stats():
    """Compute summary statistics over all stored articles.

    Returns:
        A dict with ``total_articles``, ``status_distribution``,
        ``word_count_stats`` (``average``/``min``/``max``, empty when no
        article has a word count), ``source_distribution`` and
        ``images_count``. An empty dict is returned when an error occurs
        (the error is logged).
    """
    try:
        articles = load_articles()

        # Articles per status
        by_status = {}
        for entry in articles:
            status_key = entry.get("status", "New")
            by_status[status_key] = by_status.get(status_key, 0) + 1

        # Word-count summary over articles with a non-zero word count
        counts = [entry.get("word_count", 0) for entry in articles if entry.get("word_count")]
        word_summary = {}
        if counts:
            word_summary = {
                "average": sum(counts) // len(counts),
                "min": min(counts),
                "max": max(counts)
            }

        # Articles per source
        by_source = {}
        for entry in articles:
            source_key = entry.get("source_name", "Unbekannt")
            by_source[source_key] = by_source.get(source_key, 0) + 1

        # Total number of images
        image_total = sum(len(entry.get("images", [])) for entry in articles)

        return {
            "total_articles": len(articles),
            "status_distribution": by_status,
            "word_count_stats": word_summary,
            "source_distribution": by_source,
            "images_count": image_total
        }

    except Exception as e:
        logging.error(f"❌ Fehler beim Erstellen der Statistiken: {e}")
        return {}

View file

@ -2,26 +2,362 @@
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging
import time
from typing import Optional
# Configuration
REQUEST_TIMEOUT = 15  # passed as `timeout` to requests.get
MAX_RETRIES = 3  # upper bound for download attempts in extract_full_article
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Site-specific selectors.
# Each selector is a dict with a 'tag' and either a 'class' or an 'id';
# a 'class' of None means "match the tag alone".
CONTENT_SELECTORS = {
    # Promobil and camping sites
    'promobil.de': [
        {'tag': 'div', 'class': 'article__text'},
        {'tag': 'div', 'class': 'article-content'},
        {'tag': 'div', 'class': 'content-text'}
    ],
    'camping.info': [
        {'tag': 'div', 'class': 'article-body'},
        {'tag': 'div', 'class': 'post-content'}
    ],
    'caravaning.de': [
        {'tag': 'div', 'class': 'article__content'},
        {'tag': 'div', 'class': 'entry-content'}
    ],
    # Default WordPress selectors
    'wordpress': [
        {'tag': 'div', 'class': 'entry-content'},
        {'tag': 'div', 'class': 'post-content'},
        {'tag': 'div', 'class': 'content'},
        {'tag': 'main', 'class': 'main-content'},
        {'tag': 'article', 'class': None}
    ],
    # Generic fallbacks
    'generic': [
        {'tag': 'article', 'class': None},
        {'tag': 'div', 'class': 'content'},
        {'tag': 'div', 'class': 'post'},
        {'tag': 'div', 'class': 'entry'},
        {'tag': 'main', 'class': None},
        {'tag': 'div', 'id': 'content'},
        {'tag': 'div', 'id': 'main'}
    ]
}
def get_domain_from_url(url: str) -> str:
    """Return the lower-cased network location (host[:port]) of *url*.

    Args:
        url: Absolute URL.

    Returns:
        The netloc part in lower case, or ``""`` when *url* is not a
        string, has no netloc, or cannot be parsed.
    """
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return ""

    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. unbalanced
        # IPv6 brackets)
        return ""
def get_selectors_for_domain(domain: str) -> list:
    """Return the content selectors to try for *domain*.

    The first site-specific key of ``CONTENT_SELECTORS`` that occurs as a
    substring of *domain* wins; otherwise the generic selectors are
    returned. The ``'wordpress'`` and ``'generic'`` keys are never matched
    against the domain.
    """
    site_keys = (key for key in CONTENT_SELECTORS if key not in ('wordpress', 'generic'))
    for site_key in site_keys:
        if site_key in domain:
            return CONTENT_SELECTORS[site_key]

    return CONTENT_SELECTORS['generic']
def is_wordpress_site(soup: BeautifulSoup) -> bool:
    """Detect whether a parsed page looks like a WordPress site.

    Checks, in order: a ``<meta name="generator">`` tag mentioning
    WordPress, any ``<link>`` whose ``href`` contains ``/wp-``, and a
    ``<link rel="https://api.w.org/">`` tag.

    Returns:
        ``True`` when one of the markers is present, ``False`` otherwise
        or when inspecting the document fails.
    """
    try:
        # WordPress generator meta tag
        generator = soup.find('meta', attrs={'name': 'generator'})
        if generator and 'wordpress' in generator.get('content', '').lower():
            return True

        # WordPress-specific link tags; one match is enough
        if soup.find('link', href=lambda href: href and '/wp-' in href):
            return True

        # WordPress REST API discovery link
        if soup.find('link', attrs={'rel': 'https://api.w.org/'}):
            return True

        return False
    except Exception as e:
        # Best-effort detection: never let a parsing oddity break extraction,
        # but do not swallow KeyboardInterrupt/SystemExit either.
        logging.debug(f"WordPress detection failed: {e}")
        return False
def clean_extracted_text(text: str) -> str:
    """Remove navigation/boilerplate lines from extracted text.

    The text is split on newlines. A line is dropped when it is shorter
    than 10 characters after stripping, contains one of the boilerplate
    keywords (case-insensitive substring match), or contains more than
    three navigation symbols. The remaining lines are joined with single
    spaces and all whitespace runs are collapsed.

    Args:
        text: Raw extracted text; may be empty or ``None``.

    Returns:
        The cleaned single-line text, ``""`` for empty input.
    """
    if not text:
        return ""

    # Typical navigation/footer keywords (loop-invariant, built once)
    # NOTE(review): keywords are matched as substrings of the whole line, so
    # e.g. 'teilen' also matches "Vorteilen" - confirm this is intended.
    skip_patterns = (
        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
        'mehr zum thema', 'ähnliche artikel', 'kommentare',
        'anzeige', 'werbung', 'advertisement'
    )
    nav_symbols = '|•→←↑↓'

    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()

        # Very short lines are probably navigation or ads
        if len(line) < 10:
            continue

        lowered = line.lower()
        if any(pattern in lowered for pattern in skip_patterns):
            continue

        # Many separator/arrow symbols indicate a navigation bar
        if sum(1 for char in line if char in nav_symbols) > 3:
            continue

        cleaned_lines.append(line)

    # Join the lines and collapse repeated whitespace
    return ' '.join(' '.join(cleaned_lines).split())
def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
    """Try each selector in turn and return the first sufficiently long text.

    A selector matches by class if it has one, else by id, else by tag
    alone. Script, style, nav, header, footer and aside elements are
    removed from a matched element (this mutates *soup*). The cleaned text
    is returned when the raw text has more than 50 words.

    Returns:
        The cleaned text of the first qualifying element, or ``""``.
    """
    for selector in selectors:
        try:
            css_class = selector.get('class')
            element_id = selector.get('id')
            if css_class:
                lookup = {'class_': css_class}
            elif element_id:
                lookup = {'id': element_id}
            else:
                lookup = {}

            element = soup.find(selector['tag'], **lookup)
            if not element:
                continue

            # Remove non-content children
            for unwanted in element(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            raw_text = element.get_text(' ', strip=True)

            # Only accept the element when it holds enough text
            if len(raw_text.split()) > 50:
                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
                return clean_extracted_text(raw_text)

        except Exception as e:
            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
            continue

    return ""
def extract_from_paragraphs(soup: BeautifulSoup) -> str:
    """Fallback extraction: collect the text of all ``<p>`` elements.

    Paragraphs with more than 20 characters are joined with spaces. The
    cleaned result is returned when the joined text has more than 30 words.

    Returns:
        The cleaned text, or ``""`` when there is too little text or an
        error occurs (the error is logged).
    """
    try:
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return ""

        # Keep only the longer paragraphs
        paragraph_texts = (p.get_text(strip=True) for p in paragraphs)
        combined_text = ' '.join(t for t in paragraph_texts if len(t) > 20)

        if len(combined_text.split()) <= 30:
            return ""

        logging.info(f"✅ Fallback-Extraktion aus {len(paragraphs)} Paragraphen")
        return clean_extracted_text(combined_text)

    except Exception as e:
        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
        return ""
def extract_full_article(url: str) -> str:
    """Download *url* and extract the full article text.

    Extraction strategies are tried in this order: site-specific selectors
    (with the WordPress selectors prepended when the page is detected as
    WordPress), generic selectors, ``<p>`` paragraphs, and finally the
    whole ``<body>``. Network errors are retried up to ``MAX_RETRIES``
    times with exponential backoff.

    Args:
        url: Article URL; empty values return ``""`` immediately.

    Returns:
        The extracted text, or ``""`` when nothing usable was found or the
        request failed.
    """
    if not url:
        return ""

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {retries + 1})")

            # HTTP request with browser-like headers
            headers = {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }

            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
            response.raise_for_status()

            # response.encoding is None when requests cannot derive a charset
            # from the headers; guard it so .lower() cannot raise and abort
            # the extraction.
            if (response.encoding or '').lower() in ['iso-8859-1', 'windows-1252']:
                response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, "html.parser")

            # Determine the site-specific selectors
            domain = get_domain_from_url(url)
            selectors = get_selectors_for_domain(domain)

            # Prefer the WordPress selectors on WordPress sites
            if is_wordpress_site(soup):
                logging.info("🔧 WordPress-Site erkannt")
                selectors = CONTENT_SELECTORS['wordpress'] + selectors

            # 1st attempt: site-specific selectors
            extracted_text = extract_with_selectors(soup, selectors)

            if extracted_text and len(extracted_text.split()) > 50:
                logging.info(f"🎉 Erfolgreiche Extraktion: {len(extracted_text.split())} Wörter")
                return extracted_text

            # 2nd attempt: generic selectors
            if not extracted_text:
                logging.info("🔄 Fallback auf generische Selektoren")
                extracted_text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])

                if extracted_text and len(extracted_text.split()) > 50:
                    logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # 3rd attempt: paragraph extraction
            if not extracted_text:
                logging.info("🔄 Fallback auf Paragraph-Extraktion")
                extracted_text = extract_from_paragraphs(soup)

                if extracted_text and len(extracted_text.split()) > 30:
                    logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # Last attempt: the whole body text
            if not extracted_text:
                logging.info("🔄 Letzter Fallback: Body-Text")
                body = soup.find('body')
                if body:
                    # Remove navigation, header, footer and similar elements
                    for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
                        element.decompose()

                    body_text = body.get_text(' ', strip=True)
                    if len(body_text.split()) > 100:
                        extracted_text = clean_extracted_text(body_text)
                        logging.info(f"⚠️ Body-Extraktion: {len(extracted_text.split())} Wörter")
                        return extracted_text

            # No usable text found
            if not extracted_text:
                logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
                return ""

            return extracted_text

        except requests.RequestException as e:
            retries += 1
            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {retries}): {e}")
            if retries < MAX_RETRIES:
                time.sleep(2 ** retries)  # exponential backoff
                continue
            else:
                logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
                return ""

        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
            return ""

    return ""
def extract_article_summary(full_text: str, max_length: int = 300) -> str:
    """Build a short summary from the leading sentences of *full_text*.

    The text is split on ``.``. Of the first five pieces, those with at
    least 20 characters are collected until adding another would exceed
    *max_length*. If no sentence qualifies (for example the first usable
    sentence is already longer than *max_length*), the summary falls back
    to the beginning of the text instead of being empty.

    Args:
        full_text: Full article text.
        max_length: Maximum length of the summary in characters.

    Returns:
        The summary, at most *max_length* characters; ``""`` for empty
        input.
    """
    if not full_text:
        return ""

    summary_sentences = []
    current_length = 0

    # Only the first five pieces are considered
    for sentence in full_text.split('.')[:5]:
        sentence = sentence.strip()
        if len(sentence) < 20:  # skip fragments that are too short
            continue

        if current_length + len(sentence) > max_length:
            break

        summary_sentences.append(sentence)
        current_length += len(sentence)

    if not summary_sentences:
        # Nothing qualified: truncate the text rather than return nothing
        return full_text.strip()[:max_length].rstrip()

    summary = '. '.join(summary_sentences)
    if not summary.endswith('.'):
        summary += '.'

    return summary[:max_length]
def validate_extracted_content(text: str) -> bool:
    """Check whether extracted text looks like usable article content.

    The text is rejected when it is empty or shorter than 100 characters
    after stripping, has fewer than 50 words, consists of more than 5%
    navigation symbols, or has an average word length below 3.
    """
    if not text or len(text.strip()) < 100:
        return False

    tokens = text.split()
    if len(tokens) < 50:
        return False

    # Share of navigation symbols in the whole text
    symbol_total = sum(text.count(symbol) for symbol in '|•→←↑↓')
    if symbol_total > len(text) * 0.05:
        return False

    # Very short words on average indicate navigation rather than prose
    mean_word_length = sum(map(len, tokens)) / len(tokens)
    return mean_word_length >= 3

View file

@ -2,59 +2,325 @@
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin, urlparse
import logging import logging
import time
from typing import List, Dict
# Configuration
MAX_IMAGES = 5  # maximum number of images returned per article
MIN_IMAGE_SIZE = 100  # minimum width/height in pixels
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}  # accepted URL path endings
REQUEST_TIMEOUT = 10  # passed as `timeout` to requests.get
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
def is_valid_image_url(url: str) -> bool:
    """Check whether *url* looks like a usable article image.

    The URL must be absolute (scheme and host), its path must end with one
    of ``ALLOWED_EXTENSIONS``, and none of its alphanumeric tokens may be
    a blacklisted term (optionally with a plural ``s``). Matching whole
    tokens instead of substrings keeps paths such as
    ``/wp-content/uploads/`` valid, which a substring match on ``ads``
    would reject.

    Returns:
        ``True`` when the URL passes all checks, ``False`` otherwise
        (including non-string or unparsable input).
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        return False

    # Check the file extension
    if not parsed.path.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
        return False

    # The URL must be complete
    if not parsed.scheme or not parsed.netloc:
        return False

    # Terms that mark unwanted images
    blacklist = {
        'avatar', 'profile', 'icon', 'logo', 'banner',
        'advertisement', 'ads', 'tracking', 'pixel', 'social'
    }

    # Split the URL into alphanumeric tokens ("/wp-content/uploads/a.jpg"
    # -> wp, content, uploads, a, jpg) and compare whole tokens only
    tokens = "".join(ch if ch.isalnum() else " " for ch in url.lower()).split()
    return not any(
        token in blacklist or token.rstrip('s') in blacklist
        for token in tokens
    )
def get_image_dimensions(img_tag) -> tuple:
    """Read the image dimensions from the tag's ``width``/``height`` attributes.

    Values may be plain integers or carry a ``px`` suffix. The ``style``
    attribute is not evaluated.

    Args:
        img_tag: Object with a dict-like ``get`` method (e.g. an ``<img>`` tag).

    Returns:
        ``(width, height)`` as integers, or ``(None, None)`` when either
        attribute is missing or not a pixel value (e.g. ``"100%"``).
    """
    def _to_pixels(value):
        # Accept "640", 640 and "640px"
        text = str(value).strip().lower()
        if text.endswith('px'):
            text = text[:-2].strip()
        return int(text)

    try:
        width = img_tag.get('width')
        height = img_tag.get('height')

        if width and height:
            return _to_pixels(width), _to_pixels(height)
    except (AttributeError, TypeError, ValueError):
        # Missing tag or non-numeric attribute value
        return None, None

    return None, None
def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """Collect the available metadata for one ``<img>`` tag.

    The image URL is taken from ``src``, ``data-src`` or ``data-lazy-src``
    and resolved against *base_url*. Caption and copyright come from a
    ``<figcaption>`` in the nearest figure/div/span/p ancestor; failing
    that, the caption comes from a short ``<small>``/``<em>``/``<i>`` text
    there, then from the title, the alt text, or a default.

    Returns:
        A dict with ``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``width``, ``height`` and ``title``; ``None``
        when the tag has no source, the URL is rejected, the image is
        smaller than ``MIN_IMAGE_SIZE`` or an error occurs.
    """
    try:
        source = img_tag.get('src') or img_tag.get('data-src') or img_tag.get('data-lazy-src')
        if not source:
            return None

        img_url = urljoin(base_url, source)
        if not is_valid_image_url(img_url):
            return None

        alt_text = img_tag.get('alt', '').strip()
        title = img_tag.get('title', '').strip()

        # Skip very small images when both dimensions are known
        width, height = get_image_dimensions(img_tag)
        too_small = width and height and (width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE)
        if too_small:
            return None

        caption = ""
        copyright_text = "Unbekannt"
        copyright_url = base_url

        container = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if container:
            figcaption = container.find('figcaption')
            if figcaption:
                caption = figcaption.get_text(strip=True)

                # A link inside the figcaption is taken as the copyright source
                credit_link = figcaption.find('a')
                if credit_link:
                    copyright_url = urljoin(base_url, credit_link.get('href', ''))
                    copyright_text = credit_link.get_text(strip=True)

            # Alternative: a short text in small/em/i elements near the image
            small_texts = [
                candidate.get_text(strip=True)
                for candidate in container.find_all(['small', 'em', 'i'], limit=3)
            ]
            if not caption:
                caption = next((t for t in small_texts if 10 < len(t) < 200), "")

        # Caption fallback
        if not caption:
            caption = title or alt_text or "Bild aus Originalartikel"

        return {
            "url": img_url,
            "alt": alt_text,
            "caption": caption[:300] if caption else "Kein Bildtitel vorhanden",
            "copyright": copyright_text or "Unbekannt",
            "copyright_url": copyright_url or base_url,
            "width": width,
            "height": height,
            "title": title
        }

    except Exception as e:
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """Extract up to ``MAX_IMAGES`` images with metadata from an article page.

    The page is downloaded, every ``<img>`` tag is passed to
    ``extract_image_metadata`` and duplicate URLs are skipped. Collection
    stops once ``MAX_IMAGES`` images were found; the result is sorted by
    pixel area, largest first. Images without known dimensions count as
    area 0 and therefore sort last.

    Args:
        article_url: URL of the article; empty values yield ``[]``.

    Returns:
        List of image metadata dicts; ``[]`` on network or other errors
        (the error is logged).
    """
    images = []

    if not article_url:
        return images

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # HTTP request with browser-like headers
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates

        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)

                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])
                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")

                    if len(images) >= MAX_IMAGES:
                        break

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Sort by area, largest first. The width/height keys exist with the
        # value None when unknown, so `or 0` is required: .get(key, 0) would
        # still return None and make the multiplication raise.
        images.sort(
            key=lambda img: (img.get('width') or 0) * (img.get('height') or 0),
            reverse=True,
        )

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # enforce the limit once more

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
def validate_image_url(url: str) -> bool:
    """Check via an HTTP HEAD request whether *url* serves an image.

    Redirects are followed, so an image behind a redirect is judged by its
    final response.

    Returns:
        ``True`` when the final response has status 200 and a content type
        containing ``image``; ``False`` otherwise or when the request fails.
    """
    try:
        # requests.head does not follow redirects unless asked to
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException:
        return False

    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
def extract_featured_image(article_url: str) -> Dict:
    """Look up the article's featured image in its meta tags.

    The OpenGraph ``og:image`` tag is checked first, then the Twitter card
    ``twitter:image`` tag. The first one whose resolved URL passes
    ``is_valid_image_url`` is used.

    Returns:
        An image metadata dict with ``type`` set to ``"featured"``, or
        ``None`` when no valid image is found or an error occurs.
    """
    try:
        response = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Lookups in priority order: OpenGraph image, then Twitter card image
        meta_lookups = (
            {'property': 'og:image'},
            {'attrs': {'name': 'twitter:image'}},
        )
        for lookup in meta_lookups:
            meta_tag = soup.find('meta', **lookup)
            if not (meta_tag and meta_tag.get('content')):
                continue

            img_url = urljoin(article_url, meta_tag['content'])
            if is_valid_image_url(img_url):
                return {
                    "url": img_url,
                    "alt": "Featured Image",
                    "caption": "Hauptbild des Artikels",
                    "copyright": "Unbekannt",
                    "copyright_url": article_url,
                    "type": "featured"
                }

        return None

    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """Clean and normalise a list of image metadata dicts.

    Entries without a URL, or whose URL fails ``is_valid_image_url``, are
    dropped. Text fields are stripped and length-limited; an empty caption
    or copyright gets a default, and a missing or ``"#"`` copyright URL is
    replaced by the image URL. Entries that raise are logged and skipped.
    """
    def _stripped(source, key, fallback=""):
        # Missing and falsy values are replaced by the fallback before stripping
        return (source.get(key) or fallback).strip()

    cleaned_images = []

    for raw in images:
        try:
            # Validate the URL
            if not raw.get("url") or not is_valid_image_url(raw["url"]):
                continue

            entry = {
                "url": raw["url"].strip(),
                "alt": _stripped(raw, "alt")[:200],
                "caption": _stripped(raw, "caption", "Kein Bildtitel vorhanden")[:300],
                "copyright": _stripped(raw, "copyright", "Unbekannt")[:100],
                "copyright_url": _stripped(raw, "copyright_url", "#"),
                "width": raw.get("width"),
                "height": raw.get("height"),
                "title": _stripped(raw, "title")[:200]
            }

            # Whitespace-only values are empty after stripping: apply defaults
            entry["caption"] = entry["caption"] or "Kein Bildtitel vorhanden"
            entry["copyright"] = entry["copyright"] or "Unbekannt"
            if entry["copyright_url"] in ("", "#"):
                entry["copyright_url"] = raw["url"]  # image URL as fallback

            cleaned_images.append(entry)

        except Exception as e:
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue

    return cleaned_images
# Main entry point kept for compatibility with the existing code
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """Extended image extraction: featured image plus content images.

    The featured image (if any) comes first, followed by the images from
    ``extract_images_with_metadata``. Duplicate URLs are removed (first
    occurrence wins), the metadata is cleaned and at most ``MAX_IMAGES``
    images are returned.
    """
    candidates = []

    # 1. Featured image
    featured = extract_featured_image(article_url)
    if featured:
        candidates.append(featured)

    # 2. Regular image extraction
    candidates.extend(extract_images_with_metadata(article_url))

    # 3. Drop duplicates; dicts keep insertion order and setdefault keeps
    #    the first image seen for each URL
    unique_by_url = {}
    for image in candidates:
        unique_by_url.setdefault(image["url"], image)

    # 4. Clean the metadata and enforce the limit
    return clean_image_metadata(list(unique_by_url.values()))[:MAX_IMAGES]

236
utils/ui_helpers.py Normal file
View file

@ -0,0 +1,236 @@
# utils/ui_helpers.py
import streamlit as st
from datetime import datetime
import logging
def show_toast(message, type="success", duration=3):
    """
    Display a notification through the matching Streamlit status element.

    ``type`` selects one of "success", "error", "warning" or "info"; any
    other value displays nothing. ``duration`` is accepted but not used.
    """
    for level in ("success", "error", "warning", "info"):
        if type == level:
            # The Streamlit function carries the same name as the level.
            getattr(st, level)(message)
            break
def format_datetime(date_str):
    """
    Format a date string as ``DD.MM.YYYY HH:MM`` for display.

    Supported inputs:
      * RFC 2822 dates as used in RSS feeds, with a named zone ("GMT") or
        a numeric offset of either sign ("+0200", "-0500").
      * ISO 8601 strings containing "T", with optional "Z" or UTC offset.

    Any other string is returned truncated to its first 16 characters, and
    non-string values are returned as ``str(value)``. The time is shown as
    given in the input; no timezone conversion takes place.

    Args:
        date_str: The raw date value, usually a string.

    Returns:
        The formatted date, or a truncated/stringified fallback.
    """
    from email.utils import parsedate_to_datetime

    if not isinstance(date_str, str):
        return str(date_str)

    # RFC 2822 first: strptime's %z rejects the literal "GMT", and checking
    # for "+" or "T" misroutes ISO offsets and weekday names like "Tue".
    try:
        parsed = parsedate_to_datetime(date_str)
    except (TypeError, ValueError, IndexError):
        # Older Python versions raise TypeError for unparsable input,
        # newer ones raise ValueError.
        parsed = None

    try:
        if parsed is None:
            if "T" not in date_str:
                return date_str[:16]
            # fromisoformat only understands a trailing "Z" on newer
            # Python versions, so normalize it to an explicit offset.
            parsed = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        return parsed.strftime("%d.%m.%Y %H:%M")
    except Exception as e:
        # Display helper: never let a malformed date break the page.
        logging.warning(f"Datum konnte nicht formatiert werden: {date_str} - {e}")
        return str(date_str)[:16]
def get_status_color(status):
    """
    Return the hex color associated with an article status.

    Statuses without an entry in the palette get the same blue as "New".
    """
    fallback = "#2196f3"
    palette = dict((
        ("New", "#2196f3"),
        ("Rewrite", "#ff9800"),
        ("Process", "#9c27b0"),
        ("Online", "#4caf50"),
        ("On Hold", "#e91e63"),
        ("Trash", "#f44336"),
    ))
    try:
        return palette[status]
    except KeyError:
        return fallback
def create_status_badge(status):
    """
    Build the HTML snippet for a pill-shaped status badge.

    The badge uses the status color from ``get_status_color`` for the text,
    with "20" and "40" appended to it for background and border.
    """
    tint = get_status_color(status)
    markup_lines = (
        "",
        '<span style="',
        f"background-color: {tint}20;",
        f"color: {tint};",
        "padding: 0.25rem 0.5rem;",
        "border-radius: 12px;",
        "font-size: 0.8rem;",
        "font-weight: 600;",
        f"border: 1px solid {tint}40;",
        f'">{status}</span>',
        "",
    )
    return "\n".join(markup_lines)
def truncate_text(text, max_length=150):
    """
    Shorten text to roughly ``max_length`` characters.

    Text that already fits is returned unchanged. Longer text is cut at
    ``max_length``, trimmed back to the last space inside that cut (if there
    is one) and suffixed with "...". Falsy input yields an empty string.
    """
    if not text:
        return ""
    if len(text) > max_length:
        clipped = text[:max_length]
        last_space = clipped.rfind(' ')
        if last_space != -1:
            # Drop the trailing fragment so the cut lands on a word boundary.
            clipped = clipped[:last_space]
        return clipped + "..."
    return text
def calculate_reading_time(text):
    """
    Estimate the reading time in minutes at 200 words per minute.

    Falsy input gives 0; any other text gives at least 1 minute. The word
    count is divided with integer division, so partial minutes are dropped.
    """
    if not text:
        return 0
    minutes = len(text.split()) // 200
    return minutes if minutes > 1 else 1
def validate_url(url):
    """
    Check whether ``url`` is a syntactically valid http(s) URL.

    Accepted hosts are domain names, ``localhost`` and dotted IPv4
    addresses, optionally followed by a port and a path or query. Top-level
    domains may be up to 63 letters long (or a punycode ``xn--`` label), so
    newer TLDs such as ``.technology`` are accepted.

    Args:
        url: The value to check.

    Returns:
        True if ``url`` is a string matching the pattern, otherwise False.
        Non-string input returns False instead of raising.
    """
    import re

    if not isinstance(url, str):
        return False

    pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # domain labels...
        r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'  # ...plus TLD
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IPv4 address
        r'(?::\d+)?'  # optional port
        # \Z instead of $: "$" would also accept a trailing newline.
        r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)
    return pattern.match(url) is not None
def create_article_card_html(article, source_name="Unbekannt"):
    """
    Build the HTML for an article card.

    The card shows title, date, word count, reading time, image count,
    status badge, a shortened summary, the source name and up to three tags.
    A warning icon is appended to the title when any image lacks a caption,
    copyright or copyright URL.

    Title, date, summary, source name and tags are HTML-escaped before they
    are inserted, because they originate from external content. Keys that
    are present but ``None`` (``images``, ``text``, ``tags``) are treated as
    empty.

    Args:
        article: Article dict; all keys are optional.
        source_name: Display name of the feed the article came from.

    Returns:
        The card markup as a string.
    """
    from html import escape

    images = article.get("images") or []
    tags = article.get("tags") or []
    body_text = article.get("text") or ""
    status = article.get('status', 'New')

    word_count = len(body_text.split())
    reading_time = calculate_reading_time(body_text)

    # Flag images that are missing any of the required metadata fields.
    incomplete_images = any(
        not all(k in img and img[k] for k in ("caption", "copyright", "copyright_url"))
        for img in images
    )
    warning_icon = " ⚠️" if incomplete_images else ""

    title = escape(str(article.get('title', 'Kein Titel')))
    date_text = escape(str(format_datetime(article.get('date', ''))))
    # Escape after truncating so an entity is never cut in half.
    summary = escape(str(truncate_text(article.get('summary', ''), 200)))
    source = escape(str(source_name))
    tag_text = ', '.join(escape(str(tag)) for tag in tags[:3])
    tag_suffix = '...' if len(tags) > 3 else ''
    image_info = '• 🖼️ ' + str(len(images)) + ' Bilder' if images else ''

    return f"""
<div style="
background: white;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
border-left: 4px solid {get_status_color(status)};
transition: transform 0.2s ease;
" onmouseover="this.style.transform='translateY(-2px)'" onmouseout="this.style.transform='translateY(0)'">
<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
<div style="flex: 1;">
<h3 style="margin: 0 0 0.5rem 0; color: #333; font-size: 1.1rem;">
{title}{warning_icon}
</h3>
<div style="font-size: 0.85rem; color: #666; margin-bottom: 0.5rem;">
📅 {date_text}
📝 {word_count} Wörter
{reading_time} Min Lesezeit
{image_info}
</div>
</div>
<div>
{create_status_badge(status)}
</div>
</div>
<div style="margin-bottom: 1rem; color: #555; line-height: 1.4;">
{summary}
</div>
<div style="display: flex; justify-content: space-between; align-items: center; font-size: 0.8rem; color: #888;">
<div>
📡 {source}
</div>
<div>
🏷 {tag_text}{tag_suffix}
</div>
</div>
</div>
"""
def create_stats_card(title, value, icon="📊", color="#667eea"):
    """
    Build the HTML for a statistics card.

    The card stacks the icon, the value (in the accent color) and the title,
    and uses ``color`` for its top border as well.
    """
    container_style = "\n".join((
        "",
        "background: white;",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "text-align: center;",
        "box-shadow: 0 2px 8px rgba(0,0,0,0.1);",
        f"border-top: 4px solid {color};",
        "",
    ))
    icon_row = f'<div style="font-size: 2rem; margin-bottom: 0.5rem;">{icon}</div>'
    value_row = (
        f'<div style="font-size: 2rem; font-weight: bold; color: {color}; '
        f'margin-bottom: 0.5rem;">{value}</div>'
    )
    title_row = f'<div style="color: #666; font-weight: 500;">{title}</div>'

    return "\n".join((
        "",
        f'<div style="{container_style}">',
        icon_row,
        value_row,
        title_row,
        "</div>",
        "",
    ))
def show_loading_spinner(text="Lädt..."):
    """
    Render a CSS loading spinner with a caption into a new placeholder.

    Args:
        text: Caption shown below the spinner.

    Returns:
        Whatever ``st.empty().markdown(...)`` returns for the rendered
        element.
    """
    # The CSS braces of the @keyframes rule are doubled: inside an f-string
    # a single "{" opens a replacement field, which made this module fail to
    # import with a SyntaxError.
    return st.empty().markdown(f"""
<div style="text-align: center; padding: 2rem;">
<div style="
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
border-radius: 50%;
width: 40px;
height: 40px;
animation: spin 1s linear infinite;
margin: 0 auto 1rem auto;
"></div>
<div style="color: #666;">{text}</div>
</div>
<style>
@keyframes spin {{
0% {{ transform: rotate(0deg); }}
100% {{ transform: rotate(360deg); }}
}}
</style>
""", unsafe_allow_html=True)
def create_filter_section():
    """
    Return the opening HTML of the filter area.

    The markup consists of a styled container ``<div>`` and a heading; the
    container is not closed within the returned string.
    """
    markup_lines = (
        "",
        '<div style="',
        "background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "margin-bottom: 2rem;",
        "box-shadow: 0 4px 6px rgba(0,0,0,0.1);",
        '">',
        '<h3 style="margin: 0 0 1rem 0; color: #333;">🔍 Filter & Suche</h3>',
        "",
    )
    return "\n".join(markup_lines)
def get_error_message(error_type, details=""):
    """
    Return the formatted, user-facing message for an error category.

    Known categories are "feed_error", "save_error", "api_error",
    "validation_error" and "network_error"; anything else produces the
    generic "unknown error" message. ``details`` is appended to the text.
    """
    prefixes = {
        "feed_error": "❌ Fehler beim Laden des Feeds: ",
        "save_error": "❌ Fehler beim Speichern: ",
        "api_error": "❌ API-Fehler: ",
        "validation_error": "⚠️ Validierungsfehler: ",
        "network_error": "🌐 Netzwerkfehler: ",
    }
    prefix = prefixes.get(error_type, "❌ Unbekannter Fehler: ")
    return f"{prefix}{details}"