Grundfunktionen optimiert
This commit is contained in:
parent
050e08859c
commit
0c84dd1a1a
8 changed files with 4866 additions and 315 deletions
750
app.py
750
app.py
|
|
@ -1,4 +1,4 @@
|
|||
# app.py (aktualisiert mit Feed-Dropdown)
|
||||
# app.py
|
||||
|
||||
import streamlit as st
|
||||
from datetime import datetime
|
||||
|
|
@ -13,151 +13,659 @@ from main import (
|
|||
from utils.dalle_generator import generate_dalle_image
|
||||
import os
|
||||
from collections import Counter
|
||||
import time
|
||||
|
||||
st.set_page_config(page_title="📰 RSS Artikel Manager", layout="wide")
|
||||
st.title("📰 RSS Artikel Manager")
|
||||
# === Page Configuration ===
|
||||
st.set_page_config(
|
||||
page_title="📰 RSS Artikel Manager",
|
||||
layout="wide",
|
||||
initial_sidebar_state="collapsed"
|
||||
)
|
||||
|
||||
# === Sidebar: Feed-Verwaltung ===
|
||||
st.sidebar.header("📡 RSS Feeds verwalten")
|
||||
feeds = load_feeds()
|
||||
new_feed = st.sidebar.text_input("Neuen RSS Feed hinzufügen")
|
||||
if st.sidebar.button("Feed hinzufügen"):
|
||||
if new_feed and new_feed not in [f.get("url", f) for f in feeds]:
|
||||
feeds.append({"url": new_feed, "name": "Neuer Feed"})
|
||||
save_feeds(feeds)
|
||||
st.sidebar.success("Feed hinzugefügt")
|
||||
# === Custom CSS für modernes Design ===
|
||||
st.markdown("""
|
||||
<style>
|
||||
/* Hauptcontainer */
|
||||
.main-header {
|
||||
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
|
||||
padding: 2rem;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 2rem;
|
||||
color: white;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Artikel Cards */
|
||||
.article-card {
|
||||
background: white;
|
||||
border-radius: 12px;
|
||||
padding: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
||||
border-left: 4px solid #667eea;
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
|
||||
.article-card:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
/* Status Badges */
|
||||
.status-badge {
|
||||
padding: 0.3rem 0.8rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.8rem;
|
||||
font-weight: bold;
|
||||
margin-right: 0.5rem;
|
||||
}
|
||||
|
||||
.status-new { background-color: #e3f2fd; color: #1976d2; }
|
||||
.status-rewrite { background-color: #fff3e0; color: #f57c00; }
|
||||
.status-process { background-color: #f3e5f5; color: #7b1fa2; }
|
||||
.status-online { background-color: #e8f5e8; color: #388e3c; }
|
||||
.status-hold { background-color: #fce4ec; color: #c2185b; }
|
||||
.status-trash { background-color: #ffebee; color: #d32f2f; }
|
||||
|
||||
/* Filter Section */
|
||||
.filter-section {
|
||||
background: #f8f9fa;
|
||||
padding: 1.5rem;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
/* Stats Cards */
|
||||
.stats-card {
|
||||
background: white;
|
||||
padding: 1.5rem;
|
||||
border-radius: 10px;
|
||||
text-align: center;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.stats-number {
|
||||
font-size: 2rem;
|
||||
font-weight: bold;
|
||||
color: #667eea;
|
||||
}
|
||||
|
||||
/* Action Buttons */
|
||||
.action-button {
|
||||
margin: 0.25rem;
|
||||
}
|
||||
|
||||
/* Image Gallery */
|
||||
.image-gallery {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
overflow-x: auto;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.image-item {
|
||||
min-width: 200px;
|
||||
text-align: center;
|
||||
}
|
||||
</style>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
if st.sidebar.button("🔄 Alle Feeds neu laden"):
|
||||
existing_ids = [a["id"] for a in load_articles()]
|
||||
process_articles(existing_ids)
|
||||
st.rerun()
|
||||
# === Initialize Session State ===
|
||||
if 'selected_articles' not in st.session_state:
|
||||
st.session_state.selected_articles = set()
|
||||
if 'search_query' not in st.session_state:
|
||||
st.session_state.search_query = ""
|
||||
if 'status_filter' not in st.session_state:
|
||||
st.session_state.status_filter = "New"
|
||||
if 'feed_filter' not in st.session_state:
|
||||
st.session_state.feed_filter = "Alle"
|
||||
|
||||
if st.sidebar.button("✍️ Artikel umschreiben (Rewrite)"):
|
||||
rewrite_articles()
|
||||
st.rerun()
|
||||
# === Helper Functions ===
|
||||
def get_status_badge(status):
    """Build the HTML snippet for a colored status badge.

    Args:
        status: Article status label, e.g. "New" or "On Hold".

    Returns:
        A ``<span>`` element carrying the ``status-badge`` class plus the
        CSS class mapped to the status. Statuses without a mapping use the
        ``status-new`` class.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    status_classes = {
        "New": "status-new",
        "Rewrite": "status-rewrite",
        "Process": "status-process",
        "Online": "status-online",
        "On Hold": "status-hold",
        "Trash": "status-trash",
    }
    class_name = status_classes.get(status, "status-new")
    # The returned markup is rendered as raw HTML, so the label is escaped to
    # keep a malformed or hostile status value from injecting markup.
    label = escape(str(status))
    return f'<span class="status-badge {class_name}">{label}</span>'
|
||||
|
||||
# === Hauptbereich: Artikelübersicht ===
|
||||
st.header("📋 Artikelübersicht")
|
||||
status_filter = st.selectbox("Status filtern", ["Alle", "New", "Rewrite", "Process", "Online", "On Hold", "Trash"], index=1)
|
||||
def format_date(date_str):
    """Format an article date string for display.

    RFC 2822 style dates (as used in RSS ``pubDate`` fields, e.g.
    ``"Mon, 01 Jan 2024 12:00:00 GMT"`` or ``"... +0200"``) are rendered as
    ``DD.MM.YYYY HH:MM``. Other strings are treated as ISO-like and
    truncated.

    Args:
        date_str: Raw date string; may be empty or ``None``.

    Returns:
        The formatted date, or an empty string when no date is given.
    """
    # Function-scope import keeps the block self-contained.
    from email.utils import parsedate_to_datetime

    if not date_str:
        return ""
    date_str = str(date_str)

    try:
        # Handles named zones such as "GMT", which strptime's %z rejects.
        parsed = parsedate_to_datetime(date_str)
    except (TypeError, ValueError):
        # Not an RFC 2822 date; fall through to the ISO-style handling.
        parsed = None
    if parsed is not None:
        return parsed.strftime("%d.%m.%Y %H:%M")

    if "GMT" in date_str or "+" in date_str:
        # Unparseable string carrying a zone marker: keep only the date part.
        return date_str[:10]
    return date_str[:16].replace("T", " ")
|
||||
|
||||
all_articles = load_articles()
|
||||
articles = all_articles
|
||||
def get_word_count(text):
    """Return the number of whitespace-separated words in *text*.

    Empty or missing text counts as zero words.
    """
    if not text:
        return 0
    words = text.split()
    return len(words)
|
||||
|
||||
if status_filter != "Alle":
|
||||
articles = [a for a in articles if a.get("status") == status_filter]
|
||||
def show_notification(message, type="success"):
    """Show *message* using the Streamlit alert that matches *type*.

    Supported types are "success", "error", "warning" and "info"; any other
    value displays nothing.
    """
    # Each supported type name is also the name of the Streamlit call to use.
    if type in ("success", "error", "warning", "info"):
        getattr(st, type)(message)
|
||||
|
||||
# === Feed-Filter ===
|
||||
source_to_name = {f.get("url"): f.get("name", "unidentified") for f in feeds}
|
||||
source_counter = Counter([a.get("source", "unidentified") for a in articles])
|
||||
# === Header ===
|
||||
st.markdown("""
|
||||
<div class="main-header">
|
||||
<h1>📰 RSS Artikel Manager</h1>
|
||||
<p>Moderne Verwaltung deiner RSS-Feeds und Artikel</p>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
feed_options = ["Alle ({})".format(len(articles))]
|
||||
feed_map = {}
|
||||
# === Tab Navigation ===
|
||||
tab1, tab2, tab3, tab4, tab5 = st.tabs([
|
||||
"📋 Dashboard",
|
||||
"📰 Artikel",
|
||||
"📡 Feeds",
|
||||
"🖼️ Bilder",
|
||||
"📊 Statistiken"
|
||||
])
|
||||
|
||||
for source, count in source_counter.items():
|
||||
name = source_to_name.get(source, "unidentified")
|
||||
label = f"{name} ({count})"
|
||||
feed_options.append(label)
|
||||
feed_map[label] = source
|
||||
# === Dashboard Tab ===
|
||||
with tab1:
|
||||
st.header("📊 Übersicht")
|
||||
|
||||
# Lade Daten
|
||||
all_articles = load_articles()
|
||||
feeds = load_feeds()
|
||||
|
||||
# Statistiken
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Gesamt Artikel</div>
|
||||
</div>
|
||||
""".format(len(all_articles)), unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
new_count = len([a for a in all_articles if a.get("status") == "New"])
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Neue Artikel</div>
|
||||
</div>
|
||||
""".format(new_count), unsafe_allow_html=True)
|
||||
|
||||
with col3:
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>RSS Feeds</div>
|
||||
</div>
|
||||
""".format(len(feeds)), unsafe_allow_html=True)
|
||||
|
||||
with col4:
|
||||
online_count = len([a for a in all_articles if a.get("status") == "Online"])
|
||||
st.markdown("""
|
||||
<div class="stats-card">
|
||||
<div class="stats-number">{}</div>
|
||||
<div>Online</div>
|
||||
</div>
|
||||
""".format(online_count), unsafe_allow_html=True)
|
||||
|
||||
st.markdown("<br>", unsafe_allow_html=True)
|
||||
|
||||
# Quick Actions
|
||||
st.subheader("⚡ Schnellaktionen")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
if st.button("🔄 Alle Feeds aktualisieren", use_container_width=True):
|
||||
with st.spinner("Feeds werden aktualisiert..."):
|
||||
existing_ids = [a["id"] for a in all_articles]
|
||||
process_articles(existing_ids)
|
||||
show_notification("Feeds erfolgreich aktualisiert!")
|
||||
time.sleep(1)
|
||||
st.rerun()
|
||||
|
||||
with col2:
|
||||
if st.button("✍️ Artikel umschreiben", use_container_width=True):
|
||||
rewrite_count = len([a for a in all_articles if a.get("status") == "Rewrite"])
|
||||
if rewrite_count > 0:
|
||||
with st.spinner(f"{rewrite_count} Artikel werden umgeschrieben..."):
|
||||
rewrite_articles()
|
||||
show_notification(f"{rewrite_count} Artikel erfolgreich umgeschrieben!")
|
||||
time.sleep(1)
|
||||
st.rerun()
|
||||
else:
|
||||
show_notification("Keine Artikel zum Umschreiben gefunden.", "info")
|
||||
|
||||
with col3:
|
||||
if st.button("🧹 Aufräumen", use_container_width=True):
|
||||
trash_count = len([a for a in all_articles if a.get("status") == "Trash"])
|
||||
if trash_count > 0:
|
||||
show_notification(f"{trash_count} Artikel im Papierkorb gefunden.", "info")
|
||||
else:
|
||||
show_notification("Keine Artikel zum Aufräumen gefunden.", "info")
|
||||
|
||||
# Neueste Artikel Preview
|
||||
st.subheader("🕒 Neueste Artikel")
|
||||
recent_articles = sorted(all_articles, key=lambda x: x.get("date", ""), reverse=True)[:5]
|
||||
|
||||
for article in recent_articles:
|
||||
st.markdown(f"""
|
||||
<div class="article-card">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<div>
|
||||
<strong>{article.get('title', 'Kein Titel')}</strong>
|
||||
<br>
|
||||
<small>{format_date(article.get('date', ''))}</small>
|
||||
</div>
|
||||
<div>
|
||||
{get_status_badge(article.get('status', 'New'))}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
selected_feed_label = st.selectbox("Feed-Auswahl", feed_options)
|
||||
|
||||
if selected_feed_label != feed_options[0]: # nicht „Alle“
|
||||
selected_source = feed_map[selected_feed_label]
|
||||
articles = [a for a in articles if a.get("source", "unidentified") == selected_source]
|
||||
|
||||
# === Artikel-Tabelle ===
|
||||
if articles:
|
||||
st.markdown("### 📄 Übersichtstabelle")
|
||||
st.write("**Spaltenübersicht:** Auswahl | Datum | Titel | Zusammenfassung | Wörter | Tags | Status")
|
||||
|
||||
for article in articles:
|
||||
# === Artikel Tab ===
|
||||
with tab2:
|
||||
st.header("📰 Artikel verwalten")
|
||||
|
||||
# Filter Section
|
||||
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
||||
st.subheader("🔍 Filter & Suche")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
status_options = ["Alle", "New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
|
||||
st.session_state.status_filter = st.selectbox(
|
||||
"Status",
|
||||
status_options,
|
||||
index=status_options.index(st.session_state.status_filter)
|
||||
)
|
||||
|
||||
with col2:
|
||||
# Feed Filter
|
||||
source_to_name = {f.get("url"): f.get("name", "Unbekannt") for f in feeds}
|
||||
source_counter = Counter([a.get("source", "Unbekannt") for a in all_articles])
|
||||
|
||||
feed_options = ["Alle"]
|
||||
feed_map = {"Alle": None}
|
||||
|
||||
for source, count in source_counter.items():
|
||||
name = source_to_name.get(source, "Unbekannt")
|
||||
label = f"{name} ({count})"
|
||||
feed_options.append(label)
|
||||
feed_map[label] = source
|
||||
|
||||
selected_feed_label = st.selectbox("Feed", feed_options)
|
||||
st.session_state.feed_filter = selected_feed_label
|
||||
|
||||
with col3:
|
||||
st.session_state.search_query = st.text_input(
|
||||
"Suche",
|
||||
value=st.session_state.search_query,
|
||||
placeholder="Titel, Text oder Tags durchsuchen..."
|
||||
)
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# Filter anwenden
|
||||
filtered_articles = all_articles
|
||||
|
||||
# Status Filter
|
||||
if st.session_state.status_filter != "Alle":
|
||||
filtered_articles = [a for a in filtered_articles if a.get("status") == st.session_state.status_filter]
|
||||
|
||||
# Feed Filter
|
||||
if st.session_state.feed_filter != "Alle":
|
||||
selected_source = feed_map[st.session_state.feed_filter]
|
||||
filtered_articles = [a for a in filtered_articles if a.get("source") == selected_source]
|
||||
|
||||
# Suche
|
||||
if st.session_state.search_query:
|
||||
query = st.session_state.search_query.lower()
|
||||
filtered_articles = [
|
||||
a for a in filtered_articles
|
||||
if query in a.get("title", "").lower()
|
||||
or query in a.get("text", "").lower()
|
||||
or any(query in tag.lower() for tag in a.get("tags", []))
|
||||
]
|
||||
|
||||
# Ergebnisse anzeigen
|
||||
st.write(f"**{len(filtered_articles)} Artikel gefunden**")
|
||||
|
||||
# Artikel Cards
|
||||
for article in filtered_articles:
|
||||
has_incomplete_images = any(
|
||||
not all(k in img and img[k] for k in ("caption", "copyright", "copyright_url"))
|
||||
for img in article.get("images", [])
|
||||
)
|
||||
|
||||
cols = st.columns([0.05, 0.1, 0.2, 0.25, 0.05, 0.2, 0.15])
|
||||
with cols[0]:
|
||||
st.checkbox("", key=f"select_{article['id']}")
|
||||
with cols[1]:
|
||||
date_str = article["date"]
|
||||
if "GMT" in date_str or "+" in date_str:
|
||||
date_str = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z").strftime("%d.%m.%y")
|
||||
else:
|
||||
date_str = date_str[:10]
|
||||
st.markdown(date_str)
|
||||
with cols[2]:
|
||||
title = f"**{article['title']}**"
|
||||
|
||||
# Article Card
|
||||
st.markdown('<div class="article-card">', unsafe_allow_html=True)
|
||||
|
||||
# Header
|
||||
col1, col2 = st.columns([3, 1])
|
||||
|
||||
with col1:
|
||||
title = article.get("title", "Kein Titel")
|
||||
if has_incomplete_images:
|
||||
title += " ⚠️"
|
||||
st.markdown(title)
|
||||
with cols[3]:
|
||||
st.markdown(article.get("summary", "")[:150])
|
||||
with cols[4]:
|
||||
st.markdown(str(len(article.get("text", "").split())))
|
||||
with cols[5]:
|
||||
st.markdown(", ".join(article.get("tags", [])))
|
||||
with cols[6]:
|
||||
st.markdown(f"**{title}**")
|
||||
st.markdown(f"📅 {format_date(article.get('date', ''))}")
|
||||
|
||||
with col2:
|
||||
st.markdown(get_status_badge(article.get("status", "New")), unsafe_allow_html=True)
|
||||
|
||||
# Content Preview
|
||||
summary = article.get("summary", "")[:200]
|
||||
if len(summary) == 200:
|
||||
summary += "..."
|
||||
st.markdown(summary)
|
||||
|
||||
# Meta Info
|
||||
col1, col2, col3 = st.columns(3)
|
||||
with col1:
|
||||
st.markdown(f"📝 **{get_word_count(article.get('text', ''))} Wörter**")
|
||||
with col2:
|
||||
tags = article.get("tags", [])
|
||||
if tags:
|
||||
st.markdown(f"🏷️ {', '.join(tags[:3])}{'...' if len(tags) > 3 else ''}")
|
||||
with col3:
|
||||
source_name = source_to_name.get(article.get("source", ""), "Unbekannt")
|
||||
st.markdown(f"📡 {source_name}")
|
||||
|
||||
# Actions
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
||||
with col1:
|
||||
# Status ändern
|
||||
status_options = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
|
||||
current_status = article.get("status", "New")
|
||||
new_status = st.selectbox("", status_options, index=status_options.index(current_status), key=f"status_{article['id']}")
|
||||
new_status = st.selectbox(
|
||||
"Status",
|
||||
status_options,
|
||||
index=status_options.index(current_status),
|
||||
key=f"status_{article['id']}"
|
||||
)
|
||||
|
||||
if new_status != current_status:
|
||||
article["status"] = new_status
|
||||
# Artikel in der Liste finden und aktualisieren
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx] = article
|
||||
all_articles[idx]["status"] = new_status
|
||||
break
|
||||
save_articles(all_articles)
|
||||
show_notification(f"Status auf '{new_status}' geändert!")
|
||||
time.sleep(0.5)
|
||||
st.rerun()
|
||||
|
||||
with st.expander(f"🔍 {article['title']}"):
|
||||
st.markdown("#### ✍️ Artikeltext")
|
||||
st.code(f"{article['title']}\n\n{article['text']}\n\nQuelle: {article['link']}", language="markdown")
|
||||
|
||||
st.markdown("#### 🌿 Tags")
|
||||
st.code(", ".join(article.get("tags", [])), language="markdown")
|
||||
|
||||
st.markdown("#### 🖼️ Bilder")
|
||||
for i, img in enumerate(article.get("images", [])):
|
||||
st.image(img["url"], caption=img.get("caption", "Kein Titel"), use_column_width=True)
|
||||
|
||||
with st.form(f"edit_image_{article['id']}_{i}", clear_on_submit=False):
|
||||
caption = st.text_input("Bildtitel", value=img.get("caption", ""))
|
||||
copyright = st.text_input("Copyright", value=img.get("copyright", ""))
|
||||
copyright_url = st.text_input("Quelle", value=img.get("copyright_url", ""))
|
||||
if st.form_submit_button("Änderungen speichern"):
|
||||
img["caption"] = caption or "Kein Bildtitel vorhanden"
|
||||
img["copyright"] = copyright or "Unbekannt"
|
||||
img["copyright_url"] = copyright_url or "#"
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx] = article
|
||||
break
|
||||
save_articles(all_articles)
|
||||
st.success("Bilddaten gespeichert")
|
||||
|
||||
|
||||
with col2:
|
||||
if st.button("📋 Text kopieren", key=f"copy_{article['id']}"):
|
||||
text_to_copy = f"{article['title']}\n\n{article['text']}\n\nQuelle: {article['link']}"
|
||||
st.code(text_to_copy, language="markdown")
|
||||
show_notification("Text bereit zum Kopieren!")
|
||||
|
||||
with col3:
|
||||
if st.button("🔗 Original öffnen", key=f"link_{article['id']}"):
|
||||
st.markdown(f"[🔗 Artikel öffnen]({article.get('link', '#')})")
|
||||
|
||||
with col4:
|
||||
# Details anzeigen
|
||||
if st.button("📖 Details", key=f"details_{article['id']}"):
|
||||
st.session_state[f"show_details_{article['id']}"] = not st.session_state.get(f"show_details_{article['id']}", False)
|
||||
|
||||
# Details Section (wenn erweitert)
|
||||
if st.session_state.get(f"show_details_{article['id']}", False):
|
||||
st.markdown("---")
|
||||
|
||||
# Artikel Text
|
||||
with st.expander("📝 Volltext", expanded=False):
|
||||
st.code(article.get("text", ""), language="markdown")
|
||||
|
||||
# Tags bearbeiten
|
||||
with st.expander("🏷️ Tags bearbeiten", expanded=False):
|
||||
current_tags = ", ".join(article.get("tags", []))
|
||||
new_tags = st.text_area("Tags (getrennt durch Komma)", value=current_tags, key=f"tags_{article['id']}")
|
||||
|
||||
if st.button("Tags speichern", key=f"save_tags_{article['id']}"):
|
||||
tag_list = [tag.strip() for tag in new_tags.split(",") if tag.strip()]
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx]["tags"] = tag_list
|
||||
break
|
||||
save_articles(all_articles)
|
||||
show_notification("Tags gespeichert!")
|
||||
st.rerun()
|
||||
|
||||
# Bilder
|
||||
if article.get("images"):
|
||||
with st.expander("🖼️ Bilder verwalten", expanded=False):
|
||||
for i, img in enumerate(article.get("images", [])):
|
||||
col1, col2 = st.columns([1, 2])
|
||||
|
||||
with col1:
|
||||
st.image(img["url"], width=200)
|
||||
|
||||
with col2:
|
||||
caption = st.text_input("Bildtitel", value=img.get("caption", ""), key=f"caption_{article['id']}_{i}")
|
||||
copyright_text = st.text_input("Copyright", value=img.get("copyright", ""), key=f"copyright_{article['id']}_{i}")
|
||||
copyright_url = st.text_input("Quelle URL", value=img.get("copyright_url", ""), key=f"copyright_url_{article['id']}_{i}")
|
||||
|
||||
if st.button("Bilddaten speichern", key=f"save_img_{article['id']}_{i}"):
|
||||
img["caption"] = caption or "Kein Bildtitel vorhanden"
|
||||
img["copyright"] = copyright_text or "Unbekannt"
|
||||
img["copyright_url"] = copyright_url or "#"
|
||||
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx] = article
|
||||
break
|
||||
save_articles(all_articles)
|
||||
show_notification("Bilddaten gespeichert!")
|
||||
st.rerun()
|
||||
|
||||
# DALL-E Bildgenerierung
|
||||
if st.button("🪄 KI-Bild generieren", key=f"dalle_{article['id']}"):
|
||||
if not any(img.get("copyright") == "OpenAI DALL·E" for img in article.get("images", [])):
|
||||
prompt = article["title"]
|
||||
image_url = generate_dalle_image(prompt)
|
||||
if image_url:
|
||||
article.setdefault("images", []).append({
|
||||
"url": image_url,
|
||||
"alt": f"KI-generiertes Titelbild zu: {prompt}",
|
||||
"caption": f"KI-generiertes Titelbild zu: {prompt}",
|
||||
"copyright": "OpenAI DALL·E",
|
||||
"copyright_url": "https://openai.com/dall-e"
|
||||
})
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx] = article
|
||||
break
|
||||
save_articles(all_articles)
|
||||
st.success("DALL·E-Bild erfolgreich hinzugefügt")
|
||||
st.rerun()
|
||||
else:
|
||||
st.error("Fehler beim Erzeugen des Bildes.")
|
||||
with st.spinner("Bild wird generiert..."):
|
||||
prompt = article["title"]
|
||||
image_url = generate_dalle_image(prompt)
|
||||
if image_url:
|
||||
article.setdefault("images", []).append({
|
||||
"url": image_url,
|
||||
"alt": f"KI-generiertes Titelbild zu: {prompt}",
|
||||
"caption": f"KI-generiertes Titelbild zu: {prompt}",
|
||||
"copyright": "OpenAI DALL·E",
|
||||
"copyright_url": "https://openai.com/dall-e"
|
||||
})
|
||||
for idx, art in enumerate(all_articles):
|
||||
if art["id"] == article["id"]:
|
||||
all_articles[idx] = article
|
||||
break
|
||||
save_articles(all_articles)
|
||||
show_notification("DALL·E-Bild erfolgreich hinzugefügt!")
|
||||
st.rerun()
|
||||
else:
|
||||
show_notification("Fehler beim Erzeugen des Bildes.", "error")
|
||||
else:
|
||||
st.info("Ein KI-generiertes Bild ist bereits vorhanden.")
|
||||
else:
|
||||
st.info("Keine Artikel für den gewählten Status gefunden.")
|
||||
show_notification("Ein KI-generiertes Bild ist bereits vorhanden.", "info")
|
||||
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
# === Feeds Tab ===
|
||||
with tab3:
|
||||
st.header("📡 RSS Feeds verwalten")
|
||||
|
||||
# Feed hinzufügen
|
||||
with st.expander("➕ Neuen Feed hinzufügen", expanded=False):
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
with col1:
|
||||
new_url = st.text_input("Feed URL")
|
||||
with col2:
|
||||
new_name = st.text_input("Feed Name")
|
||||
|
||||
if st.button("Feed hinzufügen", use_container_width=True):
|
||||
if new_url and new_name:
|
||||
if not any(f.get("url") == new_url for f in feeds):
|
||||
feeds.append({"url": new_url, "name": new_name})
|
||||
save_feeds(feeds)
|
||||
show_notification(f"Feed '{new_name}' hinzugefügt!")
|
||||
st.rerun()
|
||||
else:
|
||||
show_notification("Dieser Feed existiert bereits.", "warning")
|
||||
else:
|
||||
show_notification("Bitte URL und Name eingeben.", "error")
|
||||
|
||||
# Feeds anzeigen
|
||||
for idx, feed in enumerate(feeds):
|
||||
feed_url = feed.get("url", "")
|
||||
feed_name = feed.get("name", "Unbekannt")
|
||||
article_count = sum(1 for a in all_articles if a.get("source") == feed_url)
|
||||
|
||||
st.markdown(f"""
|
||||
<div class="article-card">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<div>
|
||||
<strong>{feed_name}</strong>
|
||||
<br>
|
||||
<small>{feed_url}</small>
|
||||
<br>
|
||||
<span style="color: #667eea;">📰 {article_count} Artikel</span>
|
||||
</div>
|
||||
<div>
|
||||
<span class="status-badge status-online">{article_count} Artikel</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# Feed Actions
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
if st.button("✏️ Bearbeiten", key=f"edit_feed_{idx}"):
|
||||
st.session_state[f"edit_feed_{idx}"] = not st.session_state.get(f"edit_feed_{idx}", False)
|
||||
|
||||
with col2:
|
||||
if st.button("🔄 Aktualisieren", key=f"refresh_feed_{idx}"):
|
||||
with st.spinner("Feed wird aktualisiert..."):
|
||||
existing_ids = [a["id"] for a in all_articles]
|
||||
# Hier könntest du eine einzelne Feed-Update-Funktion implementieren
|
||||
process_articles(existing_ids)
|
||||
show_notification(f"Feed '{feed_name}' aktualisiert!")
|
||||
st.rerun()
|
||||
|
||||
with col3:
|
||||
if st.button("🗑️ Löschen", key=f"delete_feed_{idx}"):
|
||||
feeds.pop(idx)
|
||||
save_feeds(feeds)
|
||||
show_notification(f"Feed '{feed_name}' gelöscht!", "warning")
|
||||
st.rerun()
|
||||
|
||||
# Edit Form
|
||||
if st.session_state.get(f"edit_feed_{idx}", False):
|
||||
with st.form(f"edit_form_{idx}"):
|
||||
new_feed_url = st.text_input("URL", value=feed_url)
|
||||
new_feed_name = st.text_input("Name", value=feed_name)
|
||||
|
||||
if st.form_submit_button("Änderungen speichern"):
|
||||
feeds[idx]["url"] = new_feed_url
|
||||
feeds[idx]["name"] = new_feed_name
|
||||
save_feeds(feeds)
|
||||
show_notification("Feed aktualisiert!")
|
||||
st.session_state[f"edit_feed_{idx}"] = False
|
||||
st.rerun()
|
||||
|
||||
# === Bilder Tab ===
|
||||
with tab4:
|
||||
st.header("🖼️ Bilderverwaltung")
|
||||
|
||||
# Alle Bilder sammeln
|
||||
all_images = []
|
||||
for article in all_articles:
|
||||
for img in article.get("images", []):
|
||||
img_data = img.copy()
|
||||
img_data["article_title"] = article.get("title", "Unbekannt")
|
||||
img_data["article_id"] = article.get("id")
|
||||
all_images.append(img_data)
|
||||
|
||||
if all_images:
|
||||
st.write(f"**{len(all_images)} Bilder gefunden**")
|
||||
|
||||
# Bilder in Spalten anzeigen
|
||||
cols = st.columns(3)
|
||||
for idx, img in enumerate(all_images):
|
||||
with cols[idx % 3]:
|
||||
st.image(img["url"], use_column_width=True)
|
||||
st.markdown(f"**{img.get('caption', 'Kein Titel')}**")
|
||||
st.markdown(f"📰 {img['article_title']}")
|
||||
st.markdown(f"©️ {img.get('copyright', 'Unbekannt')}")
|
||||
|
||||
if img.get("copyright_url") and img["copyright_url"] != "#":
|
||||
st.markdown(f"[🔗 Quelle]({img['copyright_url']})")
|
||||
else:
|
||||
st.info("Keine Bilder gefunden.")
|
||||
|
||||
# === Statistiken Tab ===
|
||||
with tab5:
|
||||
st.header("📊 Detaillierte Statistiken")
|
||||
|
||||
# Status Verteilung
|
||||
status_counts = Counter([a.get("status", "New") for a in all_articles])
|
||||
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
with col1:
|
||||
st.subheader("📈 Status Verteilung")
|
||||
for status, count in status_counts.items():
|
||||
percentage = (count / len(all_articles) * 100) if all_articles else 0
|
||||
st.markdown(f"{get_status_badge(status)} {count} ({percentage:.1f}%)", unsafe_allow_html=True)
|
||||
|
||||
with col2:
|
||||
st.subheader("📡 Artikel pro Feed")
|
||||
feed_counts = Counter([source_to_name.get(a.get("source", ""), "Unbekannt") for a in all_articles])
|
||||
for feed_name, count in feed_counts.most_common():
|
||||
st.markdown(f"**{feed_name}:** {count} Artikel")
|
||||
|
||||
# Weitere Statistiken
|
||||
st.subheader("📝 Textstatistiken")
|
||||
|
||||
word_counts = [get_word_count(a.get("text", "")) for a in all_articles]
|
||||
if word_counts:
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("Durchschnittliche Wortanzahl", f"{sum(word_counts) // len(word_counts)}")
|
||||
|
||||
with col2:
|
||||
st.metric("Längster Artikel", f"{max(word_counts)} Wörter")
|
||||
|
||||
with col3:
|
||||
st.metric("Kürzester Artikel", f"{min(word_counts)} Wörter")
|
||||
|
||||
# Tag Cloud Simulation
|
||||
st.subheader("🏷️ Häufigste Tags")
|
||||
all_tags = []
|
||||
for article in all_articles:
|
||||
all_tags.extend(article.get("tags", []))
|
||||
|
||||
if all_tags:
|
||||
tag_counts = Counter(all_tags)
|
||||
for tag, count in tag_counts.most_common(10):
|
||||
st.markdown(f"**{tag}:** {count}x verwendet")
|
||||
else:
|
||||
st.info("Keine Tags gefunden.")
|
||||
2653
data/articles.json
2653
data/articles.json
File diff suppressed because one or more lines are too long
20
internal/git.sh
Normal file
20
internal/git.sh
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# Aktuellen Stand vom main/master holen
|
||||
git checkout main
|
||||
git pull origin main
|
||||
|
||||
# Neuen Feature-Branch erstellen
|
||||
git checkout -b feature/neue-funktion
|
||||
|
||||
# Entwickeln und committen
|
||||
git add .
|
||||
git commit -m "Neue Funktion implementiert"
|
||||
|
||||
# Branch auf Remote-Repository pushen
|
||||
git push -u origin feature/neue-funktion
|
||||
|
||||
|
||||
# Alle Branches anzeigen
|
||||
git branch -a
|
||||
|
||||
# Aktuellen Branch anzeigen
|
||||
git branch --show-current
|
||||
|
|
@ -218,3 +218,296 @@
|
|||
2025-07-11 08:54:54,834 - INFO - 5 neue Artikel gespeichert.
|
||||
2025-07-11 09:34:42,951 - INFO - ❌ Feed gelöscht: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber)
|
||||
2025-07-11 09:35:05,863 - INFO - 🔗 Neuer Feed hinzugefügt: Promobil Ratgeber (https://www.promobil.de/rss/ratgeber)
|
||||
2025-07-28 09:17:09,355 - INFO - ✍️ Umschreiben von: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
|
||||
2025-07-28 09:17:19,759 - INFO - Retrying request to /chat/completions in 0.484478 seconds
|
||||
2025-07-28 09:17:30,386 - INFO - Retrying request to /chat/completions in 0.765465 seconds
|
||||
2025-07-28 09:17:41,238 - ERROR - ❌ Fehler beim Umschreiben von 'Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!':
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 101, in map_httpcore_exceptions
|
||||
yield
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 250, in handle_request
|
||||
resp = self._pool.handle_request(req)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 256, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 236, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 101, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 78, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 124, in _connect
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_backends/sync.py", line 207, in connect_tcp
|
||||
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
|
||||
self.gen.throw(value)
|
||||
~~~~~~~~~~~~~~^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
|
||||
httpcore.ConnectError: [Errno 8] nodename nor servname provided, or not known
|
||||
|
||||
The above exception was the direct cause of the following exception:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 972, in request
|
||||
response = self._client.send(
|
||||
request,
|
||||
stream=stream or self._should_stream_response_body(request=request),
|
||||
**kwargs,
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 914, in send
|
||||
response = self._send_handling_auth(
|
||||
request,
|
||||
...<2 lines>...
|
||||
history=[],
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 942, in _send_handling_auth
|
||||
response = self._send_handling_redirects(
|
||||
request,
|
||||
follow_redirects=follow_redirects,
|
||||
history=history,
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
|
||||
response = self._send_single_request(request)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 1014, in _send_single_request
|
||||
response = transport.handle_request(request)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 249, in handle_request
|
||||
with map_httpcore_exceptions():
|
||||
~~~~~~~~~~~~~~~~~~~~~~~^^
|
||||
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
|
||||
self.gen.throw(value)
|
||||
~~~~~~~~~~~~~~^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 118, in map_httpcore_exceptions
|
||||
raise mapped_exc(message) from exc
|
||||
httpx.ConnectError: [Errno 8] nodename nor servname provided, or not known
|
||||
|
||||
The above exception was the direct cause of the following exception:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/main.py", line 145, in rewrite_articles
|
||||
response = openai.chat.completions.create(
|
||||
model="gpt-4",
|
||||
...<3 lines>...
|
||||
]
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_utils/_utils.py", line 287, in wrapper
|
||||
return func(*args, **kwargs)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/resources/chat/completions/completions.py", line 1087, in create
|
||||
return self._post(
|
||||
~~~~~~~~~~^
|
||||
"/chat/completions",
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
...<43 lines>...
|
||||
stream_cls=Stream[ChatCompletionChunk],
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
)
|
||||
^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1249, in post
|
||||
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
|
||||
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1004, in request
|
||||
raise APIConnectionError(request=request) from err
|
||||
openai.APIConnectionError: Connection error.
|
||||
2025-07-28 09:18:02,091 - INFO - ✍️ Umschreiben von: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
|
||||
2025-07-28 09:18:02,094 - INFO - Retrying request to /chat/completions in 0.415304 seconds
|
||||
2025-07-28 09:18:02,517 - INFO - Retrying request to /chat/completions in 0.899018 seconds
|
||||
2025-07-28 09:18:03,419 - ERROR - ❌ Fehler beim Umschreiben von 'Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans':
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 101, in map_httpcore_exceptions
|
||||
yield
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 250, in handle_request
|
||||
resp = self._pool.handle_request(req)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 256, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection_pool.py", line 236, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 101, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 78, in handle_request
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_sync/connection.py", line 124, in _connect
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_backends/sync.py", line 207, in connect_tcp
|
||||
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
|
||||
self.gen.throw(value)
|
||||
~~~~~~~~~~~~~~^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
|
||||
httpcore.ConnectError: [Errno 8] nodename nor servname provided, or not known
|
||||
|
||||
The above exception was the direct cause of the following exception:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 972, in request
|
||||
response = self._client.send(
|
||||
request,
|
||||
stream=stream or self._should_stream_response_body(request=request),
|
||||
**kwargs,
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 914, in send
|
||||
response = self._send_handling_auth(
|
||||
request,
|
||||
...<2 lines>...
|
||||
history=[],
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 942, in _send_handling_auth
|
||||
response = self._send_handling_redirects(
|
||||
request,
|
||||
follow_redirects=follow_redirects,
|
||||
history=history,
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
|
||||
response = self._send_single_request(request)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_client.py", line 1014, in _send_single_request
|
||||
response = transport.handle_request(request)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 249, in handle_request
|
||||
with map_httpcore_exceptions():
|
||||
~~~~~~~~~~~~~~~~~~~~~~~^^
|
||||
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 162, in __exit__
|
||||
self.gen.throw(value)
|
||||
~~~~~~~~~~~~~~^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/httpx/_transports/default.py", line 118, in map_httpcore_exceptions
|
||||
raise mapped_exc(message) from exc
|
||||
httpx.ConnectError: [Errno 8] nodename nor servname provided, or not known
|
||||
|
||||
The above exception was the direct cause of the following exception:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/Users/oliver/Documents/rss-news/main.py", line 145, in rewrite_articles
|
||||
response = openai.chat.completions.create(
|
||||
model="gpt-4",
|
||||
...<3 lines>...
|
||||
]
|
||||
)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_utils/_utils.py", line 287, in wrapper
|
||||
return func(*args, **kwargs)
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/resources/chat/completions/completions.py", line 1087, in create
|
||||
return self._post(
|
||||
~~~~~~~~~~^
|
||||
"/chat/completions",
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
...<43 lines>...
|
||||
stream_cls=Stream[ChatCompletionChunk],
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
)
|
||||
^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1249, in post
|
||||
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
|
||||
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/Users/oliver/Documents/rss-news/.venv/lib/python3.13/site-packages/openai/_base_client.py", line 1004, in request
|
||||
raise APIConnectionError(request=request) from err
|
||||
openai.APIConnectionError: Connection error.
|
||||
2025-07-28 09:18:43,426 - INFO - ✍️ Umschreiben von: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
|
||||
2025-07-28 09:19:04,744 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:19:09,962 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:19:09,964 - INFO - ✅ Artikel umgeschrieben: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
|
||||
2025-07-28 09:19:09,964 - INFO - ✍️ Umschreiben von: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
|
||||
2025-07-28 09:19:23,989 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:19:27,267 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:19:27,269 - INFO - ✅ Artikel umgeschrieben: Camper-Radio Caravan.fm : Radiosender speziell für Camping-Fans
|
||||
2025-07-28 09:19:27,276 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
|
||||
2025-07-28 09:27:10,258 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Preisschock bei Wohnmobilversicherungen: Versicherung gestiegen? Das können Sie tun!
|
||||
2025-07-28 09:27:26,502 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:27:26,514 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-yKWdGDCQJZBOCQ4V4HoD40A0.png?st=2025-07-28T06%3A27%3A26Z&se=2025-07-28T08%3A27%3A26Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=cc612491-d948-4d2e-9821-2683df3719f5&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-07-27T22%3A07%3A39Z&ske=2025-07-28T22%3A07%3A39Z&sks=b&skv=2024-08-04&sig=HUMRhg2FbaKnLil%2BMbyvNemVeBcrvTODpctkfQyFHPc%3D
|
||||
2025-07-28 09:39:35,087 - INFO - Lade Feed: https://www.camping-news.de/rss/
|
||||
2025-07-28 09:39:35,473 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
|
||||
2025-07-28 09:39:35,473 - INFO - Lade Feed: https://www.promobil.de/rss/news
|
||||
2025-07-28 09:39:35,914 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
|
||||
2025-07-28 09:39:35,915 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
|
||||
2025-07-28 09:39:36,365 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/stunt-auf-wohnwagen-jensen-ackles-countdown/
|
||||
2025-07-28 09:39:36,584 - INFO - 16 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/stunt-auf-wohnwagen-jensen-ackles-countdown/
|
||||
2025-07-28 09:39:36,585 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/clever-campen-podcast-wie-passen-gravelbikes-und-camping-zusamen/
|
||||
2025-07-28 09:39:36,793 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/clever-campen-podcast-wie-passen-gravelbikes-und-camping-zusamen/
|
||||
2025-07-28 09:39:36,794 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/stellplatz-radar-30-tage-kostenlos-alle-plus-funktionen/
|
||||
2025-07-28 09:39:36,999 - INFO - 15 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/stellplatz-radar-30-tage-kostenlos-alle-plus-funktionen/
|
||||
2025-07-28 09:39:36,999 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/wann-der-digitale-fahrzeugschein-fuer-alle-kommt/
|
||||
2025-07-28 09:39:37,219 - INFO - 14 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/wann-der-digitale-fahrzeugschein-fuer-alle-kommt/
|
||||
2025-07-28 09:39:37,220 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/umfrage-welches-bad-brauchen-sie-im-wohnmobil/
|
||||
2025-07-28 09:39:37,439 - INFO - 22 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/umfrage-welches-bad-brauchen-sie-im-wohnmobil/
|
||||
2025-07-28 09:39:37,440 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/umfrage-kaffeegenuss-camping-wohnmobil-wohnwagen/
|
||||
2025-07-28 09:39:37,744 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/umfrage-kaffeegenuss-camping-wohnmobil-wohnwagen/
|
||||
2025-07-28 09:39:37,746 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/meinung-privates-mietbad-beim-camping-komfort-oder-stilbruch/
|
||||
2025-07-28 09:39:37,983 - INFO - 17 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/meinung-privates-mietbad-beim-camping-komfort-oder-stilbruch/
|
||||
2025-07-28 09:39:37,984 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/caravan-salon-duesseldorf-eine-einmalige-gelegenheit-fuer-camping-zubehoer-shopper/
|
||||
2025-07-28 09:39:38,242 - INFO - 20 Bilder gefunden bei https://www.promobil.de/tipps/caravan-salon-duesseldorf-eine-einmalige-gelegenheit-fuer-camping-zubehoer-shopper/
|
||||
2025-07-28 09:39:38,244 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/caravan-salon-2025-6-gruende-besuch-messe/
|
||||
2025-07-28 09:39:38,476 - INFO - 20 Bilder gefunden bei https://www.promobil.de/tipps/caravan-salon-2025-6-gruende-besuch-messe/
|
||||
2025-07-28 09:39:38,479 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/tipps-schutz-gegen-sommerhitze-hitzestau-wohnmobil/
|
||||
2025-07-28 09:39:38,758 - INFO - 24 Bilder gefunden bei https://www.promobil.de/tipps/tipps-schutz-gegen-sommerhitze-hitzestau-wohnmobil/
|
||||
2025-07-28 09:39:38,759 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/ratgeber/dethleffs-reiselust-praemie-2025-rabatte-wohnmobile/
|
||||
2025-07-28 09:39:39,019 - INFO - 16 Bilder gefunden bei https://www.promobil.de/ratgeber/dethleffs-reiselust-praemie-2025-rabatte-wohnmobile/
|
||||
2025-07-28 09:39:39,021 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/promobil-newsletter-ab-sofort-zwei-mal-die-woche-samstags-keine-camping-news-verpassen/
|
||||
2025-07-28 09:39:39,254 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/promobil-newsletter-ab-sofort-zwei-mal-die-woche-samstags-keine-camping-news-verpassen/
|
||||
2025-07-28 09:39:39,256 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/campingtourismus-boomt-drittes-camping-rekordjahr-in-folge/
|
||||
2025-07-28 09:39:39,516 - INFO - 18 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/campingtourismus-boomt-drittes-camping-rekordjahr-in-folge/
|
||||
2025-07-28 09:39:39,517 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/ratgeber/camping-ausruester-herzog-beantragt-insolvenz-wohnmobilhandel-belastet-vorzelthersteller/
|
||||
2025-07-28 09:39:39,729 - INFO - 13 Bilder gefunden bei https://www.promobil.de/ratgeber/camping-ausruester-herzog-beantragt-insolvenz-wohnmobilhandel-belastet-vorzelthersteller/
|
||||
2025-07-28 09:39:39,731 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/bordtechnik-einmaleins-einsteiger-tipps-fuer-den-campingurlaub/
|
||||
2025-07-28 09:39:39,951 - INFO - 13 Bilder gefunden bei https://www.promobil.de/tipps/bordtechnik-einmaleins-einsteiger-tipps-fuer-den-campingurlaub/
|
||||
2025-07-28 09:39:39,952 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/gotthard-brenner-reschenpass-wo-wohnmobile-und-camper-heute-geduld-brauchen/
|
||||
2025-07-28 09:39:40,197 - INFO - 13 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/gotthard-brenner-reschenpass-wo-wohnmobile-und-camper-heute-geduld-brauchen/
|
||||
2025-07-28 09:39:40,199 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/tipps/leichtbautricks-wohnmobilhersteller/
|
||||
2025-07-28 09:39:40,458 - INFO - 13 Bilder gefunden bei https://www.promobil.de/tipps/leichtbautricks-wohnmobilhersteller/
|
||||
2025-07-28 09:39:40,462 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/slowenische-campingmarken-sind-laengst-auf-dem-deutschen-markt-angekommen/
|
||||
2025-07-28 09:39:40,695 - INFO - 16 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/slowenische-campingmarken-sind-laengst-auf-dem-deutschen-markt-angekommen/
|
||||
2025-07-28 09:39:40,697 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/recap-folge-2-bella-italia-camping-auf-deutsch/
|
||||
2025-07-28 09:39:40,919 - INFO - 14 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/recap-folge-2-bella-italia-camping-auf-deutsch/
|
||||
2025-07-28 09:39:40,922 - INFO - 📷 Extrahiere Bilder von https://www.promobil.de/weitere-ratgeber/diese-kostenlosen-apps-muessen-camper-kennen/
|
||||
2025-07-28 09:39:41,210 - INFO - 17 Bilder gefunden bei https://www.promobil.de/weitere-ratgeber/diese-kostenlosen-apps-muessen-camper-kennen/
|
||||
2025-07-28 09:39:41,210 - INFO - 20 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
|
||||
2025-07-28 09:39:41,211 - INFO - Lade Feed: https://caravan.fm/
|
||||
2025-07-28 09:39:44,233 - INFO - 0 neue Artikel gefunden in https://caravan.fm/
|
||||
2025-07-28 09:39:44,238 - INFO - 20 neue Artikel gespeichert.
|
||||
2025-07-28 09:42:36,590 - INFO - ❌ Feed gelöscht: Neuer Feed (https://caravan.fm/)
|
||||
2025-07-28 09:44:53,801 - INFO - ✍️ Umschreiben von: Pannen und Probleme im Wohnmobil & Wohnwagen: Erste Hilfe für die Camper-Bordtechnik
|
||||
2025-07-28 09:45:18,500 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:45:21,113 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-07-28 09:45:21,126 - INFO - ✅ Artikel umgeschrieben: Pannen und Probleme im Wohnmobil & Wohnwagen: Erste Hilfe für die Camper-Bordtechnik
|
||||
2025-07-28 09:45:21,146 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
|
||||
2025-07-28 10:29:47,016 - INFO - Lade Feed: https://www.camping-news.de/rss/
|
||||
2025-07-28 10:29:47,407 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
|
||||
2025-07-28 10:29:47,407 - INFO - Lade Feed: https://www.promobil.de/rss/news
|
||||
2025-07-28 10:29:47,719 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
|
||||
2025-07-28 10:29:47,719 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
|
||||
2025-07-28 10:29:48,183 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
|
||||
2025-07-28 10:29:48,183 - INFO - Keine neuen Artikel gefunden.
|
||||
2025-07-29 19:30:44,481 - INFO - Lade Feed: https://www.camping-news.de/rss/
|
||||
2025-07-29 19:30:44,923 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
|
||||
2025-07-29 19:30:44,923 - INFO - Lade Feed: https://www.promobil.de/rss/news
|
||||
2025-07-29 19:30:45,348 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
|
||||
2025-07-29 19:30:45,348 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
|
||||
2025-07-29 19:30:45,899 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
|
||||
2025-07-29 19:30:45,899 - INFO - Keine neuen Artikel gefunden.
|
||||
2025-08-15 09:44:18,677 - INFO - Lade Feed: https://www.camping-news.de/rss/
|
||||
2025-08-15 09:44:18,993 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
|
||||
2025-08-15 09:44:18,993 - INFO - Lade Feed: https://www.promobil.de/rss/news
|
||||
2025-08-15 09:44:19,241 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
|
||||
2025-08-15 09:44:19,241 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
|
||||
2025-08-15 09:44:19,550 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/weitere-ratgeber/frankreichs-autobahnen-kein-adac-schutz-bei-pannen/
|
||||
2025-08-15 09:44:19,709 - INFO - 🔍 12 img-Tags gefunden
|
||||
2025-08-15 09:44:19,710 - INFO - ✅ Bild hinzugefügt: Bild aus Originalartikel...
|
||||
2025-08-15 09:44:19,710 - ERROR - ❌ Unerwarteter Fehler bei Bildextraktion von https://www.promobil.de/weitere-ratgeber/frankreichs-autobahnen-kein-adac-schutz-bei-pannen/: unsupported operand type(s) for *: 'NoneType' and 'NoneType'
|
||||
2025-08-15 09:44:19,710 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/weitere-ratgeber/warntafel-wahnsinn-in-italien-anbringen-an-fahrradtraegern-trotz-neuer-gesetze-empfohlen/
|
||||
2025-08-15 09:44:19,856 - INFO - 🔍 13 img-Tags gefunden
|
||||
2025-08-15 09:44:19,856 - INFO - ✅ Bild hinzugefügt: 02/2024, Fahrradträger mit Warntafel...
|
||||
2025-08-15 09:44:19,856 - INFO - ✅ Bild hinzugefügt: Bild aus Originalartikel...
|
||||
2025-08-15 09:44:19,857 - ERROR - ❌ Unerwarteter Fehler bei Bildextraktion von https://www.promobil.de/weitere-ratgeber/warntafel-wahnsinn-in-italien-anbringen-an-fahrradtraegern-trotz-neuer-gesetze-empfohlen/: unsupported operand type(s) for *: 'NoneType' and 'NoneType'
|
||||
2025-08-15 09:44:19,859 - INFO - 🖼️ Starte Bildextraktion von: https://www.promobil.de/tipps/achtung-mautschock-warum-viele-wohnmobile-bald-eine-go-box-brauchen/
|
||||
2025-08-15 09:44:20,025 - INFO - 🔍 20 img-Tags gefunden
|
||||
2025-08-15 09:44:20,025 - INFO - ✅ Bild hinzugefügt: Maut, Basis, Wissen, Österreich, Vignette, Go-Box,...
|
||||
2025-08-15 09:44:20,025 - INFO - ✅ Bild hinzugefügt: Wohnmobil, Küste, Parkplatz, Wohnmobil, Mann...
|
||||
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: 05/2025, Spanien Polizei Verkehr...
|
||||
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: f_Autohof, Restaurant, essen, vegetarisch...
|
||||
2025-08-15 09:44:20,026 - INFO - ✅ Bild hinzugefügt: Supercheck, Dethleffs Magic Edition T 2 EB, Seiten...
|
||||
2025-08-15 09:44:20,026 - INFO - 🎉 5 Bilder erfolgreich extrahiert von https://www.promobil.de/tipps/achtung-mautschock-warum-viele-wohnmobile-bald-eine-go-box-brauchen/
|
||||
2025-08-15 09:44:20,026 - INFO - 3 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
|
||||
2025-08-15 09:44:20,038 - INFO - 3 neue Artikel gespeichert.
|
||||
2025-08-15 09:45:55,607 - INFO - ✍️ Umschreiben von: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
|
||||
2025-08-15 09:46:09,158 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:46:11,508 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:46:11,564 - INFO - ✅ Artikel umgeschrieben: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
|
||||
2025-08-15 09:46:11,565 - INFO - ✍️ Umschreiben von: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
|
||||
2025-08-15 09:46:32,092 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:46:34,549 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:46:34,552 - INFO - ✅ Artikel umgeschrieben: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
|
||||
2025-08-15 09:46:34,571 - INFO - Alle Artikel mit Status 'Rewrite' wurden verarbeitet.
|
||||
2025-08-15 09:48:30,972 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Fahrradtransport in Italien: Warntafel bei Fahrradträgern doch wieder Pflicht?
|
||||
2025-08-15 09:48:42,548 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:48:42,559 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-Ksiks2ssSZxpEFf1MQedlap1.png?st=2025-08-15T06%3A48%3A42Z&se=2025-08-15T08%3A48%3A42Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=8b33a531-2df9-46a3-bc02-d4b1430a422c&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-08-14T16%3A59%3A16Z&ske=2025-08-15T16%3A59%3A16Z&sks=b&skv=2024-08-04&sig=e0/ULpNgNLwixo3UapqnxHgR18t4HCpyEtnbmik33yA%3D
|
||||
2025-08-15 09:51:43,090 - INFO - 🧠 Generiere DALL·E-Bild für Prompt: Neue Mautregeln für Wohnmobile mit 3,5 t: Diese Camper brauchen ab dem Stichtag eine GO-Box
|
||||
2025-08-15 09:51:53,907 - INFO - HTTP Request: POST https://api.openai.com/v1/images/generations "HTTP/1.1 200 OK"
|
||||
2025-08-15 09:51:53,914 - INFO - ✅ Bild generiert: https://oaidalleapiprodscus.blob.core.windows.net/private/org-YimPc01cYtOXjUpCATUqDABw/user-eA31w0vmy3fOrb3G64Ygndsr/img-Pl4ik6W7mTrv2MhIbdWlwgOL.png?st=2025-08-15T06%3A51%3A53Z&se=2025-08-15T08%3A51%3A53Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=b1a0ae1f-618f-4548-84fd-8b16cacd5485&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-08-14T15%3A09%3A17Z&ske=2025-08-15T15%3A09%3A17Z&sks=b&skv=2024-08-04&sig=RHIFlJLMumrcr/jEskOVfqJ%2Bns0pDS2HM8l5siBfLmM%3D
|
||||
2025-08-15 09:55:42,370 - INFO - Lade Feed: https://www.camping-news.de/rss/
|
||||
2025-08-15 09:55:42,639 - INFO - 0 neue Artikel gefunden in https://www.camping-news.de/rss/
|
||||
2025-08-15 09:55:42,640 - INFO - Lade Feed: https://www.promobil.de/rss/news
|
||||
2025-08-15 09:55:42,843 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/news
|
||||
2025-08-15 09:55:42,843 - INFO - Lade Feed: https://www.promobil.de/rss/ratgeber
|
||||
2025-08-15 09:55:43,180 - INFO - 0 neue Artikel gefunden in https://www.promobil.de/rss/ratgeber
|
||||
2025-08-15 09:55:43,180 - INFO - Keine neuen Artikel gefunden.
|
||||
|
|
|
|||
491
main.py
491
main.py
|
|
@ -10,6 +10,8 @@ import logging
|
|||
import openai
|
||||
from utils.image_extractor import extract_images_with_metadata
|
||||
from utils.article_extractor import extract_full_article
|
||||
import hashlib
|
||||
import time
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
|
@ -17,10 +19,15 @@ load_dotenv()
|
|||
log_dir = "logs"
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
log_file = os.path.join(log_dir, "rss_tool.log")
|
||||
|
||||
# Logging-Format verbessern
|
||||
logging.basicConfig(
|
||||
filename=log_file,
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(log_file, encoding='utf-8'),
|
||||
logging.StreamHandler() # Auch in Konsole ausgeben
|
||||
]
|
||||
)
|
||||
|
||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
|
@ -29,156 +36,412 @@ ARTICLES_FILE = "data/articles.json"
|
|||
FEEDS_FILE = "data/feeds.json"
|
||||
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
|
||||
|
||||
# === Datenordner erstellen ===
|
||||
os.makedirs("data", exist_ok=True)
|
||||
|
||||
def generate_article_id(title, link, date):
    """Return a stable hexadecimal ID for an article.

    The three values are joined with underscores and hashed with MD5, so
    identical inputs always produce the same 32-character hex string.

    Args:
        title: Article title.
        link: Article URL.
        date: Publication date of the article.

    Returns:
        The MD5 hex digest of ``"{title}_{link}_{date}"`` encoded as UTF-8.
    """
    identifier = f"{title}_{link}_{date}"
    # MD5 is only a fingerprint here, not a security primitive; saying so
    # keeps the call usable on FIPS-restricted Python builds. The digest is
    # unchanged by this flag.
    return hashlib.md5(
        identifier.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
|
||||
|
||||
def is_duplicate_article(new_article, existing_articles):
    """Check whether an article already exists in a list of articles.

    An article counts as a duplicate when its link equals the link of an
    existing article, or when its lower-cased title has a similarity above
    0.9 to an existing title as computed by ``calculate_similarity``.

    Args:
        new_article: Mapping with optional ``"title"`` and ``"link"`` entries.
        existing_articles: Iterable of mappings with the same entries.

    Returns:
        True if a matching article was found, otherwise False.
    """
    # ``or ""`` also covers keys that are present but hold None, which
    # ``.get(key, "")`` alone would pass through to ``.lower()``/``.strip()``.
    new_title = (new_article.get("title") or "").lower().strip()
    new_link = (new_article.get("link") or "").strip()

    for existing in existing_articles:
        # Exact URL match
        existing_link = (existing.get("link") or "").strip()
        if new_link and existing_link and new_link == existing_link:
            return True

        # Very similar titles (similarity above 0.9)
        existing_title = (existing.get("title") or "").lower().strip()
        if new_title and existing_title:
            if calculate_similarity(new_title, existing_title) > 0.9:
                return True

    return False
|
||||
|
||||
def calculate_similarity(text1, text2):
    """Return the word-overlap (Jaccard) similarity of two texts.

    Both texts are split on whitespace into sets of words; the result is
    the number of shared words divided by the number of distinct words in
    either text.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float between 0.0 and 1.0. Two texts without any words yield 1.0;
        exactly one text without words yields 0.0.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union cannot be empty and no
    # zero-division guard is needed.
    return len(words1 & words2) / len(words1 | words2)
|
||||
|
||||
def load_feeds():
    """Load the RSS feed list from ``FEEDS_FILE``.

    Returns:
        The list parsed from the JSON file. An empty list is returned when
        the file does not exist, cannot be read or parsed, or does not
        contain a JSON list; each of these cases is logged.
    """
    try:
        if not os.path.exists(FEEDS_FILE):
            logging.info("Feeds-Datei existiert nicht, erstelle leere Liste")
            return []

        with open(FEEDS_FILE, "r", encoding='utf-8') as f:
            feeds = json.load(f)

        # Callers append to and iterate over the result, so anything other
        # than a list (e.g. ``null`` or an object) is treated as a load error.
        if not isinstance(feeds, list):
            logging.error(
                f"❌ Fehler beim Laden der Feeds: {FEEDS_FILE} enthält keine JSON-Liste"
            )
            return []

        logging.info(f"✅ {len(feeds)} Feeds geladen")
        return feeds
    except Exception as e:
        # Best effort: a broken feeds file must not take the app down.
        logging.error(f"❌ Fehler beim Laden der Feeds: {e}")
        return []
|
||||
|
||||
|
||||
def save_feeds(feeds):
    """Write the RSS feed list to the JSON file ``FEEDS_FILE``.

    The file is written as UTF-8 with ``ensure_ascii=False`` so non-ASCII
    feed names stay readable. Errors are logged, not raised.

    Args:
        feeds: JSON-serialisable feed list.
    """
    try:
        with open(FEEDS_FILE, "w", encoding='utf-8') as f:
            json.dump(feeds, f, indent=2, ensure_ascii=False)
        logging.info(f"✅ {len(feeds)} Feeds gespeichert")
    except Exception as e:
        logging.error(f"❌ Fehler beim Speichern der Feeds: {e}")
|
||||
|
||||
def load_articles():
    """Load all articles from the JSON file ``ARTICLES_FILE``.

    Articles whose ``"status"`` is not contained in ``VALID_STATUSES`` are
    reset to ``"New"`` in the returned list (the file itself is not
    rewritten here).

    Returns:
        The list of article dicts. An empty list is returned when the file
        does not exist or when reading/parsing fails (the error is logged,
        not raised).
    """
    try:
        if not os.path.exists(ARTICLES_FILE):
            logging.info("Artikel-Datei existiert nicht, erstelle leere Liste")
            return []

        with open(ARTICLES_FILE, "r", encoding='utf-8') as f:
            articles = json.load(f)

        # Normalise unknown or missing statuses.
        for article in articles:
            if article.get("status") not in VALID_STATUSES:
                article["status"] = "New"
                logging.warning(f"⚠️ Ungültiger Status für Artikel '{article.get('title', 'Unbekannt')}' korrigiert")

        logging.info(f"✅ {len(articles)} Artikel geladen")
        return articles
    except Exception as e:
        # Best effort: a broken articles file must not take the application down.
        logging.error(f"❌ Fehler beim Laden der Artikel: {e}")
        return []
|
||||
|
||||
|
||||
def save_articles(articles):
    """Write the given articles to the JSON file ``ARTICLES_FILE``.

    Only entries that contain both an ``"id"`` and a ``"title"`` key are
    written; all others are skipped with a warning. The file is written as
    UTF-8 with ``ensure_ascii=False``. Errors are logged, not raised.

    Args:
        articles: Iterable of article dicts.
    """
    try:
        # Validate before writing so malformed entries never reach the file.
        valid_articles = []
        for article in articles:
            if "id" in article and "title" in article:
                valid_articles.append(article)
            else:
                logging.warning(f"⚠️ Ungültiger Artikel übersprungen: {article}")

        with open(ARTICLES_FILE, "w", encoding='utf-8') as f:
            json.dump(valid_articles, f, indent=2, ensure_ascii=False)

        logging.info(f"✅ {len(valid_articles)} Artikel gespeichert")
    except Exception as e:
        logging.error(f"❌ Fehler beim Speichern der Artikel: {e}")
|
||||
|
||||
def clean_html_content(content):
    """Strip markup from an HTML fragment and return its plain text.

    ``<script>`` and ``<style>`` elements are removed, the remaining text is
    extracted and all whitespace runs are collapsed to single spaces.

    Args:
        content: HTML string; may be empty or None.

    Returns:
        The cleaned text. Empty or None input yields ``""`` so callers can
        always call ``.split()`` on the result. If parsing fails, the error
        is logged and ``content`` is returned unchanged.
    """
    if not content:
        return ""

    try:
        soup = BeautifulSoup(content, "html.parser")

        # Script and style bodies are not article text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # split()/join collapses any remaining whitespace runs.
        return " ".join(soup.get_text(" ", strip=True).split())
    except Exception as e:
        logging.error(f"❌ Fehler beim Bereinigen des HTML-Inhalts: {e}")
        return content
|
||||
|
||||
# Automatischer Volltext-Fetch bei zu wenig Wörtern
|
||||
if len(clean_text.split()) < 50 and entry.get("link"):
|
||||
fetched_text = extract_full_article(entry["link"])
|
||||
if len(fetched_text.split()) > len(clean_text.split()):
|
||||
clean_text = fetched_text
|
||||
def fetch_and_process_feed(feed_url, existing_articles):
    """Download one RSS feed and build article dicts for its new entries.

    For every entry the title/link are checked against ``existing_articles``
    and against the articles already collected from this feed. Only for
    non-duplicates is the (potentially slow) content work done: HTML
    cleaning, full-text extraction for entries with fewer than 50 words, and
    image extraction.

    Args:
        feed_url: URL of the RSS feed.
        existing_articles: Already known article dicts, used for duplicate
            detection. The list is not modified.

    Returns:
        List of new article dicts. An empty list is returned when the feed
        has no entries or a critical error occurs (errors are logged).
    """
    new_articles = []
    feed_name = "Unbekannt"

    try:
        logging.info(f"🔄 Verarbeite Feed: {feed_url}")

        feed = feedparser.parse(feed_url)

        if hasattr(feed, 'feed') and hasattr(feed.feed, 'title'):
            feed_name = feed.feed.title
            logging.info(f"📡 Feed-Name: {feed_name}")

        if not feed.entries:
            logging.warning(f"⚠️ Keine Einträge in Feed gefunden: {feed_url}")
            return []

        logging.info(f"📰 {len(feed.entries)} Einträge gefunden")

        for entry in feed.entries:
            try:
                # Basic fields.
                title = entry.get("title", "Kein Titel")
                date = entry.get("published", datetime.now().isoformat())
                link = entry.get("link", "")
                summary = entry.get("summary", "")

                # Duplicate check first, so known articles do not trigger
                # network requests on every run. The placeholder title is
                # deliberately not used for matching; otherwise all untitled
                # entries would look like duplicates of each other.
                candidate = {"title": entry.get("title", ""), "link": link}
                if (is_duplicate_article(candidate, existing_articles)
                        or is_duplicate_article(candidate, new_articles)):
                    logging.info(f"🔄 Duplikat übersprungen: {title}")
                    continue

                # Pick the richest content field available.
                if hasattr(entry, 'content') and entry.content:
                    content = entry.content[0].get("value", "")
                elif hasattr(entry, 'description'):
                    content = entry.description
                else:
                    content = summary

                clean_text = clean_html_content(content)

                # Short feed text: try to fetch the full article instead.
                if len(clean_text.split()) < 50 and link:
                    logging.info(f"🔍 Kurzer Artikel erkannt, versuche Volltext-Extraktion: {title}")
                    fetched_text = extract_full_article(link)
                    if len(fetched_text.split()) > len(clean_text.split()):
                        clean_text = fetched_text
                        logging.info(f"✅ Volltext extrahiert: {len(clean_text.split())} Wörter")

                new_article = {
                    "id": generate_article_id(title, link, date),
                    "title": title,
                    "date": date,
                    "summary": summary[:300] + "..." if len(summary) > 300 else summary,
                    "text": clean_text,
                    "tags": [],
                    "status": "New",
                    "link": link,
                    "images": [],
                    "source": feed_url,
                    "source_name": feed_name,
                    "created_at": datetime.now().isoformat(),
                    "word_count": len(clean_text.split()),
                }

                # Image extraction is optional; a failure keeps the article.
                if link:
                    try:
                        images = extract_images_with_metadata(link)
                        new_article["images"] = images
                        logging.info(f"🖼️ {len(images)} Bilder für '{title}' extrahiert")
                    except Exception as e:
                        logging.error(f"❌ Fehler bei Bildextraktion für '{title}': {e}")

                new_articles.append(new_article)
                logging.info(f"✅ Neuer Artikel hinzugefügt: {title}")

            except Exception as e:
                # One broken entry must not abort the whole feed.
                logging.error(f"❌ Fehler beim Verarbeiten des Eintrags '{entry.get('title', 'Unbekannt')}': {e}")
                continue

        logging.info(f"✅ Feed verarbeitet: {len(new_articles)} neue Artikel aus {feed_url}")
        return new_articles

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Verarbeiten von {feed_url}: {e}")
        return []
|
||||
|
||||
images = extract_images_with_metadata(entry.link)
|
||||
|
||||
new_articles.append({
|
||||
"id": article_id,
|
||||
"title": title,
|
||||
"date": date,
|
||||
"summary": summary,
|
||||
"text": clean_text,
|
||||
"tags": [],
|
||||
"status": "New",
|
||||
"link": entry.get("link", ""),
|
||||
"images": images,
|
||||
"source": feed_url
|
||||
})
|
||||
|
||||
return new_articles
|
||||
|
||||
|
||||
def process_articles(existing_ids=None):
    """Fetch all configured RSS feeds and persist the newly found articles.

    Feeds are loaded via ``load_feeds``; each one is processed with
    ``fetch_and_process_feed``. Articles found in one feed are added to the
    duplicate-detection list before the next feed is processed. The article
    file is only rewritten when at least one new article was found. All
    errors are logged, not raised.

    Args:
        existing_ids: Unused; kept so existing callers that still pass the
            argument keep working.
    """
    try:
        start_time = time.time()
        logging.info("🚀 Starte Artikel-Verarbeitung")

        feeds = load_feeds()
        all_articles = load_articles()

        if not feeds:
            logging.warning("⚠️ Keine RSS-Feeds konfiguriert")
            return

        # Separate list for duplicate detection; grows with every feed.
        existing_articles = all_articles.copy()
        total_new_articles = 0

        for feed in feeds:
            # Feeds may be stored as dicts ({"url": ...}) or as plain URL strings.
            feed_url = feed.get("url") if isinstance(feed, dict) else feed

            if not feed_url:
                logging.warning("⚠️ Feed ohne URL übersprungen")
                continue

            try:
                new_articles = fetch_and_process_feed(feed_url, existing_articles)

                for article in new_articles:
                    all_articles.append(article)
                    existing_articles.append(article)  # visible to later feeds

                total_new_articles += len(new_articles)

                # Short pause between feeds.
                time.sleep(1)

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten von Feed {feed_url}: {e}")
                continue

        if total_new_articles > 0:
            save_articles(all_articles)
            processing_time = time.time() - start_time
            logging.info(f"🎉 Verarbeitung abgeschlossen: {total_new_articles} neue Artikel in {processing_time:.2f}s hinzugefügt")
        else:
            logging.info("ℹ️ Keine neuen Artikel gefunden")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler bei der Artikel-Verarbeitung: {e}")
|
||||
|
||||
def rewrite_articles():
    """Rewrite every article with status ``"Rewrite"`` via the OpenAI API.

    For each such article the text is rewritten, tags are generated, missing
    image metadata is filled with placeholders and the status is set to
    ``"Process"``. The article file is saved once at the end if at least one
    article was rewritten. Errors for a single article are logged and the
    article is skipped; nothing is raised to the caller.
    """
    try:
        logging.info("✍️ Starte Artikel-Umschreibung")

        articles = load_articles()
        rewrite_articles_list = [a for a in articles if a.get("status") == "Rewrite"]

        if not rewrite_articles_list:
            logging.info("ℹ️ Keine Artikel zum Umschreiben gefunden")
            return

        if not openai.api_key:
            logging.error("❌ OpenAI API-Key nicht konfiguriert")
            return

        rewritten = 0  # number of articles that were actually rewritten

        for article in rewrite_articles_list:
            try:
                logging.info(f"✍️ Umschreiben von: {article['title']}")

                # Rewrite the article text.
                prompt = (
                    "Schreibe den folgenden Artikel um und fasse ihn verständlich zusammen.\n"
                    "Behalte die wichtigsten Informationen bei, aber formuliere alles neu:\n"
                    "\n"
                    f"{article['text']}"
                )

                response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du bist ein professioneller Redakteur, der Artikel umschreibt und verbessert."},
                        {"role": "user", "content": prompt},
                    ],
                    max_tokens=1500,
                    temperature=0.7,
                )
                new_text = response.choices[0].message.content.strip()

                # Generate tags for the rewritten text.
                tag_prompt = (
                    "Erstelle 3-5 passende, kurze Stichwörter (Tags) für diesen Artikel.\n"
                    "Gib nur die Tags zurück, getrennt durch Kommas:\n"
                    "\n"
                    f"{new_text}"
                )

                tag_response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du generierst präzise Tags für Blog-Artikel."},
                        {"role": "user", "content": tag_prompt},
                    ],
                    max_tokens=100,
                    temperature=0.5,
                )
                tags_raw = tag_response.choices[0].message.content.strip()
                # Accept comma- as well as newline-separated answers.
                tags = [
                    tag.strip(" ,")
                    for tag in tags_raw.replace("\n", ",").split(",")
                    if tag.strip(" ,")
                ]

                # Update the article in place (it is part of ``articles``).
                article["text"] = new_text
                article["tags"] = tags
                article["status"] = "Process"
                article["rewritten_at"] = datetime.now().isoformat()
                article["word_count"] = len(new_text.split())

                # Fill in missing or empty image metadata with placeholders.
                for img in article.get("images", []):
                    if not img.get("caption"):
                        img["caption"] = "Kein Bildtitel vorhanden"
                    if not img.get("copyright"):
                        img["copyright"] = "Unbekannt"
                    if not img.get("copyright_url"):
                        img["copyright_url"] = "#"

                logging.info(f"✅ Artikel erfolgreich umgeschrieben: {article['title']}")
                rewritten += 1

                # Short pause between API calls.
                time.sleep(2)

            except Exception as e:
                logging.error(f"❌ Fehler beim Umschreiben von '{article['title']}': {e}")
                continue

        if rewritten:
            save_articles(articles)
            # Report only the articles that succeeded, not all candidates.
            logging.info(f"🎉 {rewritten} Artikel erfolgreich umgeschrieben")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Umschreiben: {e}")
|
||||
def get_article_stats():
    """Collect summary statistics over all stored articles.

    Returns:
        A dict with the keys ``total_articles``, ``status_distribution``,
        ``word_count_stats`` (``average``/``min``/``max``; empty when no
        article has a non-zero word count), ``source_distribution`` and
        ``images_count``. An empty dict is returned if anything fails (the
        error is logged).
    """
    try:
        articles = load_articles()

        by_status = {}
        by_source = {}
        word_counts = []
        image_total = 0

        # Single pass that feeds all four aggregates.
        for article in articles:
            status = article.get("status", "New")
            by_status[status] = by_status.get(status, 0) + 1

            source = article.get("source_name", "Unbekannt")
            by_source[source] = by_source.get(source, 0) + 1

            count = article.get("word_count")
            if count:
                word_counts.append(count)

            image_total += len(article.get("images", []))

        word_count_stats = {}
        if word_counts:
            word_count_stats = {
                "average": sum(word_counts) // len(word_counts),
                "min": min(word_counts),
                "max": max(word_counts),
            }

        return {
            "total_articles": len(articles),
            "status_distribution": by_status,
            "word_count_stats": word_count_stats,
            "source_distribution": by_source,
            "images_count": image_total,
        }

    except Exception as e:
        logging.error(f"❌ Fehler beim Erstellen der Statistiken: {e}")
        return {}
|
||||
|
|
@ -2,26 +2,362 @@
|
|||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
# Konfiguration
|
||||
REQUEST_TIMEOUT = 15
|
||||
MAX_RETRIES = 3
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
|
||||
# Website-spezifische Selektoren
|
||||
CONTENT_SELECTORS = {
|
||||
# Promobil & Camping-spezifisch
|
||||
'promobil.de': [
|
||||
{'tag': 'div', 'class': 'article__text'},
|
||||
{'tag': 'div', 'class': 'article-content'},
|
||||
{'tag': 'div', 'class': 'content-text'}
|
||||
],
|
||||
'camping.info': [
|
||||
{'tag': 'div', 'class': 'article-body'},
|
||||
{'tag': 'div', 'class': 'post-content'}
|
||||
],
|
||||
'caravaning.de': [
|
||||
{'tag': 'div', 'class': 'article__content'},
|
||||
{'tag': 'div', 'class': 'entry-content'}
|
||||
],
|
||||
|
||||
# WordPress Standard-Selektoren
|
||||
'wordpress': [
|
||||
{'tag': 'div', 'class': 'entry-content'},
|
||||
{'tag': 'div', 'class': 'post-content'},
|
||||
{'tag': 'div', 'class': 'content'},
|
||||
{'tag': 'main', 'class': 'main-content'},
|
||||
{'tag': 'article', 'class': None}
|
||||
],
|
||||
|
||||
# Allgemeine Fallbacks
|
||||
'generic': [
|
||||
{'tag': 'article', 'class': None},
|
||||
{'tag': 'div', 'class': 'content'},
|
||||
{'tag': 'div', 'class': 'post'},
|
||||
{'tag': 'div', 'class': 'entry'},
|
||||
{'tag': 'main', 'class': None},
|
||||
{'tag': 'div', 'id': 'content'},
|
||||
{'tag': 'div', 'id': 'main'}
|
||||
]
|
||||
}
|
||||
|
||||
def get_domain_from_url(url: str) -> str:
    """Return the lower-cased network location (host[:port]) of ``url``.

    Args:
        url: Absolute URL.

    Returns:
        The ``netloc`` part in lower case, or ``""`` when ``url`` is not a
        string, has no network location, or cannot be parsed.
    """
    from urllib.parse import urlparse

    # urlparse() accepts None/bytes and would then return bytes, not str.
    if not isinstance(url, str):
        return ""

    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # Raised for malformed URLs, e.g. an unbalanced IPv6 bracket.
        return ""
|
||||
|
||||
def get_selectors_for_domain(domain: str) -> list:
    """Return the content selectors configured for ``domain``.

    A site-specific entry of ``CONTENT_SELECTORS`` is used when ``domain`` is
    that site or one of its subdomains (``www.promobil.de`` matches
    ``promobil.de``; ``notpromobil.de`` does not). The pseudo keys
    ``'wordpress'`` and ``'generic'`` are never matched by name.

    Args:
        domain: Host name, optionally with ``user@`` prefix and ``:port``
            suffix as found in a URL's netloc.

    Returns:
        The matching selector list, or ``CONTENT_SELECTORS['generic']`` when
        no site-specific entry applies.
    """
    # Reduce a netloc such as "user@www.example.com:8080" to the bare host.
    host = (domain or "").lower().rsplit("@", 1)[-1].split(":", 1)[0]

    for known_domain, selectors in CONTENT_SELECTORS.items():
        if known_domain in ('wordpress', 'generic'):
            continue
        if host == known_domain or host.endswith("." + known_domain):
            return selectors

    # No site-specific match; the generic list is the fallback.
    return CONTENT_SELECTORS['generic']
|
||||
|
||||
def is_wordpress_site(soup: BeautifulSoup) -> bool:
    """Detect whether a parsed page looks like a WordPress site.

    The page counts as WordPress if any of these is found:
    a ``<meta name="generator">`` whose content mentions "wordpress",
    a ``<link>`` whose ``href`` contains ``/wp-``, or
    a ``<link rel="https://api.w.org/">`` (REST API discovery link).

    Args:
        soup: Parsed HTML document.

    Returns:
        True if one of the markers is present, otherwise False (also when
        the lookup itself fails).
    """
    try:
        # Generator meta tag; the content attribute may be missing.
        generator = soup.find('meta', attrs={'name': 'generator'})
        if generator and 'wordpress' in (generator.get('content') or '').lower():
            return True

        # One matching link is enough, so find() suffices.
        if soup.find('link', href=lambda x: x and '/wp-' in x) is not None:
            return True

        # REST API discovery link.
        return soup.find('link', attrs={'rel': 'https://api.w.org/'}) is not None
    except Exception as e:
        logging.debug(f"WordPress-Erkennung fehlgeschlagen: {e}")
        return False
|
||||
|
||||
def clean_extracted_text(text: str) -> str:
    """Remove navigation/advert lines from extracted text and normalise spaces.

    The text is split on newlines. A line is dropped when it is shorter than
    10 characters after stripping, contains one of the skip patterns
    (case-insensitive substring match), or contains more than three of the
    characters ``|•→←↑↓``. The remaining lines are joined with single spaces.

    NOTE(review): the filter works per line, so input that is one long line
    is kept or dropped as a whole.

    Args:
        text: Raw extracted text; may be empty or None.

    Returns:
        The cleaned single-line text, or ``""`` for empty input.
    """
    if not text:
        return ""

    # Built once, not once per line.
    skip_patterns = (
        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
        'mehr zum thema', 'ähnliche artikel', 'kommentare',
        'anzeige', 'werbung', 'advertisement',
    )
    nav_chars = '|•→←↑↓'

    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()

        # Very short lines are most likely navigation or adverts.
        if len(line) < 10:
            continue

        # Typical navigation and footer phrases.
        lowered = line.lower()
        if any(pattern in lowered for pattern in skip_patterns):
            continue

        # Many separator symbols indicate a menu or breadcrumb line.
        if sum(1 for c in line if c in nav_chars) > 3:
            continue

        cleaned_lines.append(line)

    # Join the lines and collapse repeated whitespace.
    return ' '.join(' '.join(cleaned_lines).split())
|
||||
|
||||
def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
    """Try a list of selectors and return the first sufficiently long text.

    Each selector is a dict with a ``'tag'`` and optionally a ``'class'`` or
    ``'id'``. For the first matching element, script/style/nav/header/footer/
    aside children are removed; if the remaining text has more than 50 words
    it is cleaned with ``clean_extracted_text`` and returned.

    Args:
        soup: Parsed HTML document (matched elements are modified in place).
        selectors: Selector dicts, tried in order.

    Returns:
        The cleaned text of the first successful selector, or ``""``.
    """
    unwanted_tags = ['script', 'style', 'nav', 'header', 'footer', 'aside']

    for selector in selectors:
        try:
            css_class = selector.get('class')
            element_id = selector.get('id')

            # The class takes precedence over the id; otherwise match by tag only.
            if css_class:
                element = soup.find(selector['tag'], class_=css_class)
            elif element_id:
                element = soup.find(selector['tag'], id=element_id)
            else:
                element = soup.find(selector['tag'])

            if not element:
                continue

            # Remove non-content children before reading the text.
            for unwanted in element(unwanted_tags):
                unwanted.decompose()

            text = element.get_text(' ', strip=True)

            # Only accept the element if it carries enough text.
            if len(text.split()) > 50:
                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
                return clean_extracted_text(text)

        except Exception as e:
            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
            continue

    return ""
|
||||
|
||||
def extract_from_paragraphs(soup: BeautifulSoup) -> str:
    """Fallback extraction: build the article text from all ``<p>`` elements.

    Paragraphs with more than 20 characters are joined with spaces. The
    result is only used when it has more than 30 words; it is then cleaned
    with ``clean_extracted_text``.

    Args:
        soup: Parsed HTML document.

    Returns:
        The cleaned text, or ``""`` when there are no paragraphs, too little
        text, or an error occurred (errors are logged).
    """
    try:
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return ""

        # Keep only the longer paragraphs.
        paragraph_texts = (p.get_text(strip=True) for p in paragraphs)
        combined_text = ' '.join(t for t in paragraph_texts if len(t) > 20)

        if len(combined_text.split()) <= 30:
            return ""

        logging.info(f"✅ Fallback-Extraktion aus {len(paragraphs)} Paragraphen")
        return clean_extracted_text(combined_text)

    except Exception as e:
        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
        return ""
|
||||
|
||||
def extract_full_article(url: str) -> str:
    """Download a page and extract its article text.

    Extraction strategies are tried in this order: domain-specific selectors
    (prefixed by the WordPress selectors when the page looks like WordPress),
    generic selectors, all ``<p>`` elements, and finally the whole ``<body>``
    without navigation elements. Network errors are retried up to
    ``MAX_RETRIES`` times with exponential backoff.

    Args:
        url: URL of the article; empty/None yields ``""``.

    Returns:
        The extracted text, or ``""`` when nothing usable was found or the
        request failed (errors are logged, not raised).
    """
    if not url:
        return ""

    # Browser-like headers; some sites reject the default client headers.
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {attempt})")

            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
            response.raise_for_status()

            # response.encoding is None when the server sends no charset;
            # guard it so .lower() cannot abort the whole extraction.
            if (response.encoding or '').lower() in ('iso-8859-1', 'windows-1252'):
                response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, "html.parser")

            # Pick the selectors for this site.
            selectors = get_selectors_for_domain(get_domain_from_url(url))
            if is_wordpress_site(soup):
                logging.info("🔧 WordPress-Site erkannt")
                selectors = CONTENT_SELECTORS['wordpress'] + selectors

            # 1st attempt: domain-specific selectors.
            extracted_text = extract_with_selectors(soup, selectors)
            if extracted_text and len(extracted_text.split()) > 50:
                logging.info(f"🎉 Erfolgreiche Extraktion: {len(extracted_text.split())} Wörter")
                return extracted_text

            # 2nd attempt: generic selectors.
            if not extracted_text:
                logging.info("🔄 Fallback auf generische Selektoren")
                extracted_text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])
                if extracted_text and len(extracted_text.split()) > 50:
                    logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # 3rd attempt: paragraph extraction.
            if not extracted_text:
                logging.info("🔄 Fallback auf Paragraph-Extraktion")
                extracted_text = extract_from_paragraphs(soup)
                if extracted_text and len(extracted_text.split()) > 30:
                    logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # 4th attempt: the whole body without navigation elements.
            if not extracted_text:
                logging.info("🔄 Letzter Fallback: Body-Text")
                body = soup.find('body')
                if body:
                    for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
                        element.decompose()

                    body_text = body.get_text(' ', strip=True)
                    if len(body_text.split()) > 100:
                        extracted_text = clean_extracted_text(body_text)
                        logging.info(f"⚠️ Body-Extraktion: {len(extracted_text.split())} Wörter")
                        return extracted_text

            if not extracted_text:
                logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
                return ""

            # Text was found but was below the word thresholds above.
            return extracted_text

        except requests.RequestException as e:
            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {attempt}): {e}")

            if attempt < MAX_RETRIES:
                time.sleep(2 ** attempt)  # exponential backoff
                continue

            logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
            return ""

        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
            return ""

    return ""
|
||||
|
||||
def extract_article_summary(full_text: str, max_length: int = 300) -> str:
    """Build a short summary from the first sentences of ``full_text``.

    The text is split on ``'.'``; of the first five fragments those with at
    least 20 characters are collected until adding another one would push
    the summary (including the ``". "`` separators and the final ``"."``)
    beyond ``max_length``. The summary therefore always ends on a full
    sentence and is never cut mid-word.

    Args:
        full_text: Article text; empty/None yields ``""``.
        max_length: Maximum length of the summary in characters.

    Returns:
        The summary, or ``""`` when no sentence fits.
    """
    if not full_text:
        return ""

    summary_sentences = []
    current_length = 0

    for raw_sentence in full_text.split('.')[:5]:  # look at the first 5 fragments only
        sentence = raw_sentence.strip()

        if len(sentence) < 20:  # too short to be meaningful
            continue

        # First sentence: text + final "."; later ones: ". " + text.
        needed = len(sentence) + (2 if summary_sentences else 1)
        if current_length + needed > max_length:
            break

        summary_sentences.append(sentence)
        current_length += needed

    summary = '. '.join(summary_sentences)
    if summary and not summary.endswith('.'):
        summary += '.'

    # Safety net only; the budget above already keeps the length in bounds.
    return summary[:max_length]
|
||||
|
||||
def validate_extracted_content(text: str) -> bool:
    """Decide whether extracted text looks like usable article content.

    The text is rejected when it is empty or shorter than 100 characters
    (after stripping), has fewer than 50 words, consists of more than 5 %
    navigation symbols (``|•→←↑↓``), or has an average word length below 3.

    Args:
        text: Extracted text; may be empty or None.

    Returns:
        True if all checks pass, otherwise False.
    """
    if not text or len(text.strip()) < 100:
        return False

    words = text.split()

    # At least 50 words.
    if len(words) < 50:
        return False

    # Too many separator symbols indicate navigation rather than prose.
    special_chars = sum(1 for c in text if c in '|•→←↑↓')
    if special_chars > len(text) * 0.05:
        return False

    # Very short average word length also indicates navigation.
    avg_word_length = sum(len(word) for word in words) / len(words)
    return avg_word_length >= 3
|
||||
|
|
@ -2,59 +2,325 @@
|
|||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict
|
||||
|
||||
# Konfiguration
|
||||
MAX_IMAGES = 5
|
||||
MIN_IMAGE_SIZE = 100 # Mindestgröße in Pixeln
|
||||
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
|
||||
REQUEST_TIMEOUT = 10
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
|
||||
def extract_images_with_metadata(article_url):
|
||||
def is_valid_image_url(url: str) -> bool:
    """Check whether ``url`` points to an image worth keeping.

    A URL is accepted when its path ends with one of ``ALLOWED_EXTENSIONS``,
    it has both a scheme and a network location, and it contains none of the
    blacklisted terms (avatar, profile, icon, logo, banner, advertisement,
    ads, tracking, pixel, social).

    Blacklisted terms are matched as whole words (optional plural ``s``, not
    adjacent to other letters), so ``/wp-content/uploads/`` is not rejected
    because of the ``ads`` inside ``uploads``.

    Args:
        url: Absolute image URL.

    Returns:
        True if the URL passes all checks, otherwise False (also when the
        URL cannot be parsed).
    """
    import re

    try:
        parsed = urlparse(url)
        path = parsed.path.lower()

        # File extension check.
        if not any(path.endswith(ext) for ext in ALLOWED_EXTENSIONS):
            return False

        # The URL must be absolute.
        if not parsed.scheme or not parsed.netloc:
            return False

        # Blacklist of unwanted image kinds, matched as whole words.
        blacklist = (
            r'(?<![a-z])'
            r'(?:avatar|profile|icon|logo|banner|advertisement|ads|tracking|pixel|social)'
            r's?(?![a-z])'
        )
        return re.search(blacklist, url.lower()) is None

    except (AttributeError, TypeError, ValueError):
        return False
|
||||
|
||||
def get_image_dimensions(img_tag) -> tuple:
    """Read the image size from the ``width``/``height`` attributes of a tag.

    Args:
        img_tag: Object with a ``get(name)`` method for attribute lookup
            (e.g. an ``<img>`` tag).

    Returns:
        ``(width, height)`` as ints when both attributes are present and
        purely numeric, otherwise ``(None, None)`` (e.g. for ``"100%"``,
        ``"auto"`` or missing attributes).
    """
    try:
        width = img_tag.get('width')
        height = img_tag.get('height')

        if width and height:
            return int(width), int(height)
    except (AttributeError, TypeError, ValueError):
        # Non-numeric values such as "100%", or not a tag at all.
        pass

    return None, None
|
||||
|
||||
def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """
    Collect URL, texts, dimensions, caption and copyright for one ``<img>`` tag.

    Args:
        img_tag: Image element offering ``get`` and ``find_parent``
            (presumably a BeautifulSoup ``Tag`` -- confirm against the caller).
        base_url: URL of the page the tag came from; used to resolve relative
            links and as the default copyright URL.

    Returns:
        Dict with the keys ``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``width``, ``height`` and ``title``; or None when
        the tag has no usable source, is smaller than ``MIN_IMAGE_SIZE`` or
        an error occurred (errors are logged, not raised).
    """
    try:
        # Lazy-loading pages often put a placeholder (e.g. a data: URI) into
        # ``src`` and the real file into ``data-src``. Try every candidate in
        # turn and keep the first that validates, instead of giving up when
        # the first non-empty attribute is unusable.
        img_url = None
        for attribute in ('src', 'data-src', 'data-lazy-src'):
            raw_source = img_tag.get(attribute)
            if not raw_source:
                continue
            candidate = urljoin(base_url, raw_source)
            if is_valid_image_url(candidate):
                img_url = candidate
                break

        if img_url is None:
            return None

        alt_text = (img_tag.get('alt') or '').strip()
        title = (img_tag.get('title') or '').strip()

        width, height = get_image_dimensions(img_tag)

        # Skip very small images (only possible when both dimensions are known).
        if width and height and (width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE):
            return None

        caption = ""
        copyright_text = "Unbekannt"
        copyright_url = base_url

        # Look for caption and copyright in the nearest enclosing container.
        parent = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if parent:
            figcaption = parent.find('figcaption')
            if figcaption:
                caption = figcaption.get_text(strip=True)

                # A link inside the figcaption is taken as the copyright source.
                copyright_link = figcaption.find('a')
                if copyright_link:
                    copyright_url = urljoin(base_url, copyright_link.get('href', ''))
                    copyright_text = copyright_link.get_text(strip=True)

            # Alternative: short texts below the image, used only while no
            # caption has been found yet.
            if not caption:
                for candidate in parent.find_all(['small', 'em', 'i'], limit=3):
                    text = candidate.get_text(strip=True)
                    if 10 < len(text) < 200:  # plausible caption length
                        caption = text
                        break

        # Caption fallback chain.
        if not caption:
            caption = title or alt_text or "Bild aus Originalartikel"

        return {
            "url": img_url,
            "alt": alt_text,
            "caption": caption[:300] if caption else "Kein Bildtitel vorhanden",
            "copyright": copyright_text or "Unbekannt",
            "copyright_url": copyright_url or base_url,
            "width": width,
            "height": height,
            "title": title,
        }

    except Exception as e:
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
|
||||
|
||||
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """
    Main entry point: download an article page and extract its images.

    Every ``<img>`` tag is passed to ``extract_image_metadata``; duplicates
    (same URL) are skipped and collection stops after ``MAX_IMAGES`` hits.
    The collected images are then ordered by pixel area, largest first;
    images without known dimensions keep their document order at the end.

    Args:
        article_url: URL of the article page.

    Returns:
        List of image metadata dicts (at most ``MAX_IMAGES``). An empty list
        is returned for an empty URL and on any network or parsing error
        (errors are logged, not raised).
    """
    images = []

    if not article_url:
        return images

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # Browser-like request headers.
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates

        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)

                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])

                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")

                    # Maximum reached?
                    if len(images) >= MAX_IMAGES:
                        break

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Sort by area, largest first. The metadata dicts always carry the
        # width/height keys, but with value None when the size is unknown, so
        # ``.get(key, 0)`` would still yield None and ``None * None`` would
        # raise -- which the outer handler turned into an empty result.
        images.sort(
            key=lambda x: (x.get('width') or 0) * (x.get('height') or 0),
            reverse=True,
        )

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # cap once more to be safe

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
|
||||
|
||||
def validate_image_url(url: str) -> bool:
    """
    Check whether an image is actually reachable.

    Sends a HEAD request and accepts the URL when the final response has
    status 200 and an image content type.

    Args:
        url: Image URL to probe.

    Returns:
        True if the image is reachable, False otherwise (including on any
        request error or for an empty URL).
    """
    if not url:
        return False

    try:
        # requests.head does not follow redirects by default; images served
        # through a redirect (CDN, http -> https) would otherwise look
        # unreachable. The user agent matches the other requests of this module.
        response = requests.head(
            url,
            timeout=5,
            allow_redirects=True,
            headers={'User-Agent': USER_AGENT},
        )
    except requests.RequestException:
        return False

    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
|
||||
|
||||
def extract_featured_image(article_url: str) -> Dict:
    """
    Try to find the main/featured image of an article.

    The page is downloaded and its ``<meta>`` tags are inspected: the
    OpenGraph image is preferred, the Twitter Card image is the fallback.

    Args:
        article_url: URL of the article page.

    Returns:
        Image metadata dict (with ``"type": "featured"``) for the first meta
        image that passes ``is_valid_image_url``; None if there is none or an
        error occurred (errors are logged, not raised).
    """
    # Attribute filters in order of preference: OpenGraph, then Twitter Card.
    meta_selectors = ({'property': 'og:image'}, {'name': 'twitter:image'})

    try:
        page = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        page.raise_for_status()

        document = BeautifulSoup(page.content, "html.parser")

        for selector in meta_selectors:
            meta_tag = document.find('meta', attrs=selector)
            if not (meta_tag and meta_tag.get('content')):
                continue

            image_url = urljoin(article_url, meta_tag['content'])
            if not is_valid_image_url(image_url):
                continue

            return {
                "url": image_url,
                "alt": "Featured Image",
                "caption": "Hauptbild des Artikels",
                "copyright": "Unbekannt",
                "copyright_url": article_url,
                "type": "featured",
            }

        return None

    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
|
||||
|
||||
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """
    Clean up and normalise image metadata.

    Entries without a valid image URL are dropped. Text fields are stripped
    and truncated (alt/title 200, caption 300, copyright 100 characters) and
    empty fields receive default values. An existing ``type`` marker (e.g.
    ``"featured"``) is carried over.

    Args:
        images: Image metadata dicts as produced by the extractors.

    Returns:
        New list of cleaned dicts; malformed entries are logged and skipped.
    """
    cleaned_images = []

    for img in images:
        try:
            # Strip before validating: surrounding whitespace would make the
            # extension check fail and drop an otherwise valid image.
            url = (img.get("url") or "").strip()
            if not url or not is_valid_image_url(url):
                continue

            cleaned_img = {
                "url": url,
                "alt": (img.get("alt") or "").strip()[:200],
                "caption": (img.get("caption") or "").strip()[:300] or "Kein Bildtitel vorhanden",
                "copyright": (img.get("copyright") or "").strip()[:100] or "Unbekannt",
                "copyright_url": (img.get("copyright_url") or "").strip(),
                "width": img.get("width"),
                "height": img.get("height"),
                "title": (img.get("title") or "").strip()[:200],
            }

            # Missing or placeholder link: fall back to the (cleaned) image URL.
            if cleaned_img["copyright_url"] in ("", "#"):
                cleaned_img["copyright_url"] = url

            # Keep the marker set for featured images instead of dropping it.
            if img.get("type"):
                cleaned_img["type"] = img["type"]

            cleaned_images.append(cleaned_img)

        except (AttributeError, TypeError) as e:
            # Entry is not a dict or carries non-string fields.
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue

    return cleaned_images
|
||||
|
||||
# Main entry point, kept for compatibility with the existing code
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """
    Extended image extraction with fallback strategies.

    Combines the featured image (if any) with the images found in the page
    content, removes duplicate URLs (first occurrence wins), cleans the
    metadata and caps the result at ``MAX_IMAGES``.

    Args:
        article_url: URL of the article page.

    Returns:
        List of cleaned image metadata dicts.
    """
    candidates = []

    # 1. Featured image first, so it survives de-duplication.
    featured = extract_featured_image(article_url)
    if featured:
        candidates.append(featured)

    # 2. Regular in-content images.
    candidates += extract_images_with_metadata(article_url)

    # 3. Drop duplicates; dicts keep insertion order and setdefault keeps the
    #    first image seen for each URL.
    first_by_url = {}
    for image in candidates:
        first_by_url.setdefault(image["url"], image)

    # 4. Clean the metadata and apply the limit.
    return clean_image_metadata(list(first_by_url.values()))[:MAX_IMAGES]
|
||||
236
utils/ui_helpers.py
Normal file
236
utils/ui_helpers.py
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
# utils/ui_helpers.py
|
||||
|
||||
import streamlit as st
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
def show_toast(message, type="success", duration=3):
    """
    Show a notification using Streamlit's status elements.

    Args:
        message: Text to display.
        type: One of "success", "error", "warning" or "info". Any other value
            is logged and rendered as "info", so the message is never
            silently dropped.
        duration: Accepted for interface compatibility; not used by this
            implementation.
    """
    renderers = {
        "success": st.success,
        "error": st.error,
        "warning": st.warning,
        "info": st.info,
    }

    renderer = renderers.get(type) if isinstance(type, str) else None
    if renderer is None:
        logging.warning("show_toast: unknown type %r, falling back to 'info'", type)
        renderer = st.info

    renderer(message)
|
||||
|
||||
def format_datetime(date_str):
    """
    Format date/time strings for better readability.

    Understands ISO 8601 strings (``2024-01-01T12:30:00Z``, with or without a
    UTC offset) and RFC 2822 dates as used in RSS feeds
    (``Mon, 01 Jan 2024 12:30:00 GMT`` / ``+0100`` / ``-0500``). No time zone
    conversion is performed; the wall-clock time of the input is shown.

    Args:
        date_str: Date string; other objects are returned as ``str(obj)``.

    Returns:
        ``"DD.MM.YYYY HH:MM"`` when the input could be parsed, otherwise the
        first 16 characters of the input.
    """
    from email.utils import parsedate_to_datetime  # local import: stdlib RFC 2822 parser

    if not isinstance(date_str, str):
        return str(date_str)

    parsed = None

    # ISO 8601 first: such strings may contain "+" (offset) and must not be
    # mistaken for RFC 2822 dates.
    if "T" in date_str:
        try:
            parsed = datetime.fromisoformat(date_str.strip().replace("Z", "+00:00"))
        except ValueError:
            parsed = None

    # RFC 2822: handles "GMT" as well as numeric offsets of either sign,
    # which strptime's %z directive does not cover completely.
    if parsed is None:
        try:
            parsed = parsedate_to_datetime(date_str)
        except (TypeError, ValueError, IndexError, OverflowError):
            parsed = None

    if parsed is not None:
        return parsed.strftime("%d.%m.%Y %H:%M")

    # Warn only for strings that looked like one of the supported formats.
    if any(marker in date_str for marker in ("GMT", "+", "T")):
        logging.warning(f"Datum konnte nicht formatiert werden: {date_str}")
    return date_str[:16]
|
||||
|
||||
def get_status_color(status):
    """
    Return the colour that belongs to a status.

    Args:
        status: Status name ("New", "Rewrite", "Process", "Online",
            "On Hold" or "Trash").

    Returns:
        Hex colour string; unknown statuses get the colour of "New".
    """
    fallback = "#2196f3"
    palette = dict([
        ("New", "#2196f3"),
        ("Rewrite", "#ff9800"),
        ("Process", "#9c27b0"),
        ("Online", "#4caf50"),
        ("On Hold", "#e91e63"),
        ("Trash", "#f44336"),
    ])
    try:
        return palette[status]
    except KeyError:
        return fallback
|
||||
|
||||
def create_status_badge(status):
    """
    Build the HTML for a status badge.

    Args:
        status: Status name; it is shown as the badge text and selects the
            colour via ``get_status_color``.

    Returns:
        HTML string containing one ``<span>`` styled as a pill. The colour is
        used for the text and, with the suffixes ``20``/``40`` appended, for
        background and border.
    """
    color = get_status_color(status)
    declarations = (
        f"background-color: {color}20;",
        f"color: {color};",
        "padding: 0.25rem 0.5rem;",
        "border-radius: 12px;",
        "font-size: 0.8rem;",
        "font-weight: 600;",
        f"border: 1px solid {color}40;",
    )
    # Leading/trailing empty items reproduce the newlines that frame the markup.
    pieces = ["", '<span style="', *declarations, f'">{status}</span>', ""]
    return "\n".join(pieces)
|
||||
|
||||
def truncate_text(text, max_length=150):
    """
    Shorten text to a maximum length.

    Text longer than ``max_length`` is cut at that position, trimmed back to
    the last space (if there is one) and suffixed with ``"..."``.

    Args:
        text: Text to shorten; falsy values yield an empty string.
        max_length: Number of characters kept before trimming.

    Returns:
        The original text if it is short enough, else the shortened text.
    """
    if not text:
        return ""

    if len(text) > max_length:
        clipped = text[:max_length]
        head, space, _ = clipped.rpartition(' ')
        # Without any space in the clipped part, keep it as a whole.
        return (head if space else clipped) + "..."

    return text
|
||||
|
||||
def calculate_reading_time(text):
    """
    Estimate the reading time at 200 words per minute.

    Args:
        text: Text to measure; falsy values yield 0.

    Returns:
        Whole minutes (rounded down), but at least 1 for any non-empty text.
    """
    if not text:
        return 0

    full_minutes, _ = divmod(len(text.split()), 200)
    return full_minutes if full_minutes > 1 else 1
|
||||
|
||||
def validate_url(url):
    """
    Validate an http(s) URL.

    Accepts domain names (including long and punycode top-level domains),
    ``localhost`` and dotted IPv4 addresses, each with an optional port and
    an optional path/query.

    Args:
        url: Value to check.

    Returns:
        True if ``url`` is a string that matches completely, else False.
    """
    import re

    if not isinstance(url, str):
        return False

    pattern = re.compile(
        r'https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # domain labels ...
        # ... and the top-level domain: up to 63 letters (".photography",
        # ".technology") or a punycode label ("xn--...").
        r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'
        r'localhost|'  # localhost ...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ... or IPv4 address
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)',  # optional path / query
        re.IGNORECASE,
    )
    # fullmatch instead of "^...$": "$" would also accept a trailing newline.
    return pattern.fullmatch(url) is not None
|
||||
|
||||
def create_article_card_html(article, source_name="Unbekannt"):
    """
    Build the HTML for an article card.

    All feed-supplied texts (title, date, summary, tags, source name) are
    HTML-escaped before they are placed into the markup, so markup or script
    fragments inside a feed cannot break or hijack the page.

    Args:
        article: Article dict; the keys ``title``, ``date``, ``text``,
            ``summary``, ``status``, ``tags`` and ``images`` are read, all of
            them optional (missing or None values fall back to defaults).
        source_name: Display name of the feed the article came from.

    Returns:
        HTML string for one card. A warning sign is appended to the title
        when any image lacks a caption, copyright or copyright URL.
    """
    from html import escape  # local import: stdlib only, keeps the block self-contained

    images = article.get("images") or []
    tags = article.get("tags") or []
    body_text = article.get("text") or ""
    status = article.get("status") or "New"

    word_count = len(body_text.split())
    reading_time = calculate_reading_time(body_text)

    # Flag images with incomplete attribution.
    incomplete_images = any(
        not all(img.get(key) for key in ("caption", "copyright", "copyright_url"))
        for img in images
    )
    warning_icon = " ⚠️" if incomplete_images else ""

    title = escape(str(article.get("title") or "Kein Titel"))
    date_label = escape(format_datetime(article.get("date", "")))
    # Truncate first, escape afterwards, so no entity gets cut in half.
    summary = escape(truncate_text(article.get("summary", ""), 200))
    source_label = escape(str(source_name))
    tag_label = escape(", ".join(str(tag) for tag in tags[:3])) + ("..." if len(tags) > 3 else "")
    image_label = "• 🖼️ " + str(len(images)) + " Bilder" if images else ""

    status_color = get_status_color(status)
    status_badge = create_status_badge(status)

    # The markup lines stay unindented, exactly as in the original literal.
    return f"""
<div style="
background: white;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
border-left: 4px solid {status_color};
transition: transform 0.2s ease;
" onmouseover="this.style.transform='translateY(-2px)'" onmouseout="this.style.transform='translateY(0)'">

<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
<div style="flex: 1;">
<h3 style="margin: 0 0 0.5rem 0; color: #333; font-size: 1.1rem;">
{title}{warning_icon}
</h3>
<div style="font-size: 0.85rem; color: #666; margin-bottom: 0.5rem;">
📅 {date_label} •
📝 {word_count} Wörter •
⏱️ {reading_time} Min Lesezeit
{image_label}
</div>
</div>
<div>
{status_badge}
</div>
</div>

<div style="margin-bottom: 1rem; color: #555; line-height: 1.4;">
{summary}
</div>

<div style="display: flex; justify-content: space-between; align-items: center; font-size: 0.8rem; color: #888;">
<div>
📡 {source_label}
</div>
<div>
🏷️ {tag_label}
</div>
</div>
</div>
"""
|
||||
|
||||
def create_stats_card(title, value, icon="📊", color="#667eea"):
    """
    Build the HTML for a statistics card.

    Args:
        title: Label shown below the value.
        value: Figure shown in the middle of the card.
        icon: Symbol shown at the top of the card.
        color: CSS colour used for the top border and the value.

    Returns:
        HTML string for one card.
    """
    container_style = (
        "background: white;",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "text-align: center;",
        "box-shadow: 0 2px 8px rgba(0,0,0,0.1);",
        f"border-top: 4px solid {color};",
    )
    rows = (
        f'<div style="font-size: 2rem; margin-bottom: 0.5rem;">{icon}</div>',
        f'<div style="font-size: 2rem; font-weight: bold; color: {color}; margin-bottom: 0.5rem;">{value}</div>',
        f'<div style="color: #666; font-weight: 500;">{title}</div>',
    )
    # Leading/trailing empty items reproduce the newlines that frame the markup.
    pieces = ["", '<div style="', *container_style, '">', *rows, "</div>", ""]
    return "\n".join(pieces)
|
||||
|
||||
def show_loading_spinner(text="Lädt..."):
    """
    Show a CSS loading spinner with a caption.

    The markup is rendered into a fresh ``st.empty()`` placeholder.

    Args:
        text: Caption shown below the spinner.

    Returns:
        Whatever ``st.empty().markdown(...)`` returns
        (NOTE(review): confirm whether callers can use it to remove the
        spinner again).
    """
    # The CSS braces of the @keyframes rule are doubled ({{ }}): inside an
    # f-string a single brace opens a replacement field, which made this
    # literal a SyntaxError and the whole module unimportable.
    return st.empty().markdown(f"""
<div style="text-align: center; padding: 2rem;">
<div style="
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
border-radius: 50%;
width: 40px;
height: 40px;
animation: spin 1s linear infinite;
margin: 0 auto 1rem auto;
"></div>
<div style="color: #666;">{text}</div>
</div>
<style>
@keyframes spin {{
0% {{ transform: rotate(0deg); }}
100% {{ transform: rotate(360deg); }}
}}
</style>
""", unsafe_allow_html=True)
|
||||
|
||||
def create_filter_section():
    """
    Build the opening HTML of the filter area.

    Returns:
        HTML string with a styled container ``<div>`` and the heading
        "Filter & Suche". The container is not closed in this string.
    """
    container_style = (
        "background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "margin-bottom: 2rem;",
        "box-shadow: 0 4px 6px rgba(0,0,0,0.1);",
    )
    heading = '<h3 style="margin: 0 0 1rem 0; color: #333;">🔍 Filter & Suche</h3>'
    # Leading/trailing empty items reproduce the newlines that frame the markup.
    pieces = ["", '<div style="', *container_style, '">', heading, ""]
    return "\n".join(pieces)
|
||||
|
||||
def get_error_message(error_type, details=""):
    """
    Return a formatted error message.

    Args:
        error_type: One of "feed_error", "save_error", "api_error",
            "validation_error" or "network_error".
        details: Extra information appended to the message.

    Returns:
        The message for ``error_type`` followed by ``details``; unknown
        types yield a generic "unknown error" message.
    """
    prefixes = {
        "feed_error": "❌ Fehler beim Laden des Feeds: ",
        "save_error": "❌ Fehler beim Speichern: ",
        "api_error": "❌ API-Fehler: ",
        "validation_error": "⚠️ Validierungsfehler: ",
        "network_error": "🌐 Netzwerkfehler: ",
    }
    prefix = prefixes.get(error_type, "❌ Unbekannter Fehler: ")
    return f"{prefix}{details}"
|
||||
Loading…
Add table
Add a link
Reference in a new issue