Grundfunktionen optimiert

This commit is contained in:
Oliver 2025-08-16 11:13:10 +02:00
parent 050e08859c
commit 0c84dd1a1a
No known key found for this signature in database
8 changed files with 4866 additions and 315 deletions

View file

@ -2,26 +2,362 @@
import requests
from bs4 import BeautifulSoup
import logging
import time
from typing import Optional
# Configuration
REQUEST_TIMEOUT = 15
MAX_RETRIES = 3
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# Site-specific selectors.
# Each entry maps a key (a domain fragment, or the special keys 'wordpress'
# and 'generic') to an ordered list of selector dicts. A selector dict names
# a 'tag' and either a 'class' or an 'id'; a 'class' of None means the bare
# tag is searched for.
CONTENT_SELECTORS = {
    # Promobil & camping-specific sites
    'promobil.de': [
        {'tag': 'div', 'class': 'article__text'},
        {'tag': 'div', 'class': 'article-content'},
        {'tag': 'div', 'class': 'content-text'}
    ],
    'camping.info': [
        {'tag': 'div', 'class': 'article-body'},
        {'tag': 'div', 'class': 'post-content'}
    ],
    'caravaning.de': [
        {'tag': 'div', 'class': 'article__content'},
        {'tag': 'div', 'class': 'entry-content'}
    ],
    # WordPress default selectors
    'wordpress': [
        {'tag': 'div', 'class': 'entry-content'},
        {'tag': 'div', 'class': 'post-content'},
        {'tag': 'div', 'class': 'content'},
        {'tag': 'main', 'class': 'main-content'},
        {'tag': 'article', 'class': None}
    ],
    # General fallbacks
    'generic': [
        {'tag': 'article', 'class': None},
        {'tag': 'div', 'class': 'content'},
        {'tag': 'div', 'class': 'post'},
        {'tag': 'div', 'class': 'entry'},
        {'tag': 'main', 'class': None},
        {'tag': 'div', 'id': 'content'},
        {'tag': 'div', 'id': 'main'}
    ]
}
def get_domain_from_url(url: str) -> str:
    """Return the lower-cased network location of *url*.

    The network location is the ``host[:port]`` part as reported by
    ``urllib.parse.urlparse``. An empty string is returned when the URL has
    no network location or cannot be parsed at all.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import urlparse

    try:
        return urlparse(url).netloc.lower()
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 bracket);
        # TypeError/AttributeError: a non-string argument was passed.
        return ""
def get_selectors_for_domain(domain: str) -> list:
    """Return the selector list registered for *domain*.

    Every key of ``CONTENT_SELECTORS`` except the special ``'wordpress'`` and
    ``'generic'`` entries is tested as a substring of *domain*; the first hit
    wins. Without a hit the ``'generic'`` selectors are returned. WordPress
    is not detected here because that requires the parsed page.
    """
    special_keys = ('wordpress', 'generic')
    site_keys = (key for key in CONTENT_SELECTORS if key not in special_keys)
    matched_key = next((key for key in site_keys if key in domain), None)
    if matched_key is not None:
        return CONTENT_SELECTORS[matched_key]
    return CONTENT_SELECTORS['generic']
def is_wordpress_site(soup: BeautifulSoup) -> bool:
    """Report whether the parsed page looks like a WordPress site.

    Three indicators are checked in order and the first hit returns True:
    a ``<meta name="generator">`` tag mentioning WordPress, any ``<link>``
    whose ``href`` contains ``/wp-``, and the WordPress REST API discovery
    link (``rel="https://api.w.org/"``).

    Detection is best effort: any error while inspecting the tree is logged
    at debug level and reported as "not WordPress".
    """
    try:
        # WordPress generator meta tag
        generator = soup.find('meta', attrs={'name': 'generator'})
        if generator and 'wordpress' in (generator.get('content') or '').lower():
            return True
        # WordPress-specific link tags; only existence matters, so stop at
        # the first match instead of collecting all of them.
        if soup.find('link', href=lambda href: href and '/wp-' in href) is not None:
            return True
        # WordPress REST API discovery link
        return soup.find('link', attrs={'rel': 'https://api.w.org/'}) is not None
    except Exception as exc:
        logging.debug(f"WordPress-Erkennung fehlgeschlagen: {exc}")
        return False
def clean_extracted_text(text: str) -> str:
    """Strip navigation, advertising and footer noise from extracted text.

    The text is processed line by line. A line is dropped when it

    * is shorter than 10 characters after stripping,
    * contains a word that starts with one of the boilerplate terms
      (cookie, Datenschutz, Impressum, ...), or
    * contains more than three separator glyphs (``|``, bullets, arrows).

    Boilerplate terms are matched at the start of a word only. Plain
    substring matching would discard real content such as "Bewerbung"
    (contains "werbung"), "Vorteilen" (contains "teilen") or "Tagblatt"
    (contains "agb").

    Returns the surviving lines joined by single spaces with all runs of
    whitespace collapsed, or an empty string for empty input.
    """
    import re

    if not text:
        return ""

    skip_patterns = (
        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
        'mehr zum thema', 'ähnliche artikel', 'kommentare',
        'anzeige', 'werbung', 'advertisement',
    )
    # Built once per call instead of once per line.
    boilerplate = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in skip_patterns) + r')',
        re.IGNORECASE,
    )
    separator_glyphs = '|•→←↑↓'

    kept_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # Very short lines are most likely navigation or ads.
        if len(line) < 10:
            continue
        # Typical navigation/footer wording.
        if boilerplate.search(line):
            continue
        # Too many separator glyphs indicates a menu or breadcrumb line.
        if sum(ch in separator_glyphs for ch in line) > 3:
            continue
        kept_lines.append(line)

    # Join and collapse repeated whitespace.
    return ' '.join(' '.join(kept_lines).split())
def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
    """Try each selector in order and return the first sufficiently long text.

    A selector dict is resolved by class when ``'class'`` is set, otherwise
    by id when ``'id'`` is set, otherwise by tag name alone. Script, style,
    nav, header, footer and aside descendants of a matched element are
    removed from the tree before its text is read. The text is accepted when
    it has more than 50 words and is then passed through
    ``clean_extracted_text``. Returns an empty string when nothing qualifies.
    """
    noise_tags = ['script', 'style', 'nav', 'header', 'footer', 'aside']

    for selector in selectors:
        try:
            css_class = selector.get('class')
            element_id = selector.get('id')
            if css_class:
                element = soup.find(selector['tag'], class_=css_class)
            elif element_id:
                element = soup.find(selector['tag'], id=element_id)
            else:
                element = soup.find(selector['tag'])

            if not element:
                continue

            # Remove non-content descendants (this mutates the soup).
            for unwanted in element(noise_tags):
                unwanted.decompose()

            text = element.get_text(' ', strip=True)
            # Only accept the match when it carries enough text.
            if len(text.split()) > 50:
                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
                return clean_extracted_text(text)
        except Exception as e:
            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
            continue

    return ""
def extract_from_paragraphs(soup: BeautifulSoup) -> str:
    """Fallback extraction: collect the text of all ``<p>`` tags.

    Paragraphs with 20 characters or fewer are ignored. The remaining
    paragraphs are joined with spaces; the result is cleaned and returned
    only when it has more than 30 words. Otherwise, and on any error, an
    empty string is returned.
    """
    try:
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return ""

        # Keep only the longer paragraphs.
        stripped = (p.get_text(strip=True) for p in paragraphs)
        combined_text = ' '.join(text for text in stripped if len(text) > 20)

        if len(combined_text.split()) <= 30:
            return ""

        logging.info(f"✅ Fallback-Extraktion aus {len(paragraphs)} Paragraphen")
        return clean_extracted_text(combined_text)
    except Exception as e:
        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
        return ""
def extract_full_article(url: str) -> str:
    """Main entry point: extract the full article text from *url*.

    The page is fetched with browser-like headers and up to ``MAX_RETRIES``
    attempts; network errors trigger exponential backoff between attempts.
    Extraction strategies are tried in this order:

    1. domain-specific selectors (WordPress selectors are put first when
       the page is detected as WordPress),
    2. the generic selectors,
    3. all ``<p>`` tags,
    4. the whole ``<body>`` after removing navigation-like elements.

    Returns the extracted text, or an empty string when *url* is empty, all
    attempts fail, or no usable content is found.
    """
    if not url:
        return ""

    # Request headers do not change between attempts, so build them once.
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    retries = 0
    while retries < MAX_RETRIES:
        try:
            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {retries + 1})")
            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
            response.raise_for_status()

            # Force UTF-8 over the legacy single-byte encodings.
            # response.encoding may be None, so guard before calling lower().
            if (response.encoding or '').lower() in ['iso-8859-1', 'windows-1252']:
                response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, "html.parser")

            # Determine domain-specific selectors.
            domain = get_domain_from_url(url)
            selectors = get_selectors_for_domain(domain)

            # Prefer WordPress selectors when the page is a WordPress site.
            if is_wordpress_site(soup):
                logging.info("🔧 WordPress-Site erkannt")
                selectors = CONTENT_SELECTORS['wordpress'] + selectors

            # Attempt 1: domain-specific selectors
            extracted_text = extract_with_selectors(soup, selectors)
            if extracted_text and len(extracted_text.split()) > 50:
                logging.info(f"🎉 Erfolgreiche Extraktion: {len(extracted_text.split())} Wörter")
                return extracted_text

            # Attempt 2: generic selectors
            if not extracted_text:
                logging.info("🔄 Fallback auf generische Selektoren")
                extracted_text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])
                if extracted_text and len(extracted_text.split()) > 50:
                    logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # Attempt 3: paragraph extraction
            if not extracted_text:
                logging.info("🔄 Fallback auf Paragraph-Extraktion")
                extracted_text = extract_from_paragraphs(soup)
                if extracted_text and len(extracted_text.split()) > 30:
                    logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(extracted_text.split())} Wörter")
                    return extracted_text

            # Attempt 4: whole body text
            if not extracted_text:
                logging.info("🔄 Letzter Fallback: Body-Text")
                body = soup.find('body')
                if body:
                    # Remove navigation, header, footer and similar elements.
                    for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
                        element.decompose()
                    body_text = body.get_text(' ', strip=True)
                    if len(body_text.split()) > 100:
                        extracted_text = clean_extracted_text(body_text)
                        logging.info(f"⚠️ Body-Extraktion: {len(extracted_text.split())} Wörter")
                        return extracted_text

            # No usable text found.
            if not extracted_text:
                logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
                return ""

            # A short but non-empty result from an earlier step is returned as is.
            return extracted_text

        except requests.RequestException as e:
            retries += 1
            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {retries}): {e}")
            if retries < MAX_RETRIES:
                time.sleep(2 ** retries)  # exponential backoff
                continue
            logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
            return ""
        except Exception as e:
            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
            return ""

    return ""
def extract_article_summary(full_text: str, max_length: int = 300) -> str:
    """Build a short summary from the leading sentences of *full_text*.

    The text is split on ``.`` and only the first five fragments are
    considered. Fragments shorter than 20 characters are skipped. Sentences
    are added while the finished summary, including the ``". "`` joiners and
    the closing period, still fits into *max_length*. Because the joiners
    are part of the budget, the summary always ends on a complete sentence
    and is never cut mid-word.

    Returns an empty string for empty input or when not even the first
    usable sentence fits.
    """
    if not full_text:
        return ""

    chosen = []
    joined_length = 0  # length of '. '.join(chosen)

    for fragment in full_text.split('.')[:5]:  # inspect at most five fragments
        sentence = fragment.strip()
        if len(sentence) < 20:  # skip fragments that are too short
            continue
        joiner = 2 if chosen else 0
        # +1 accounts for the closing period of the finished summary.
        if joined_length + joiner + len(sentence) + 1 > max_length:
            break
        chosen.append(sentence)
        joined_length += joiner + len(sentence)

    if not chosen:
        return ""
    return '. '.join(chosen) + '.'
def validate_extracted_content(text: str) -> bool:
    """Decide whether extracted text is usable article content.

    The text is rejected when it is empty or shorter than 100 characters
    after stripping, has fewer than 50 words, consists of more than 5 %
    separator glyphs (``|``, bullets, arrows), or has an average word length
    below 3 characters. Otherwise it is accepted.
    """
    if not text or len(text.strip()) < 100:
        return False

    words = text.split()
    # Require at least 50 words.
    if len(words) < 50:
        return False

    # Too many separator glyphs indicates navigation rather than prose.
    glyph_count = sum(1 for ch in text if ch in '|•→←↑↓')
    if glyph_count > len(text) * 0.05:
        return False

    # Very short average word length indicates navigation rather than prose.
    mean_word_length = sum(map(len, words)) / len(words)
    return mean_word_length >= 3

View file

@ -2,59 +2,325 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
import logging
import time
from typing import List, Dict
# Configuration
MAX_IMAGES = 5
MIN_IMAGE_SIZE = 100 # minimum size in pixels
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
REQUEST_TIMEOUT = 10
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
def is_valid_image_url(url: str) -> bool:
    """Check whether *url* points to an acceptable image.

    A URL is accepted when it is absolute (has scheme and host), its path
    ends with one of ``ALLOWED_EXTENSIONS``, and none of its tokens starts
    with a blacklisted term (avatar, logo, ads, ...).

    Blacklist terms are matched only at the start of an alphanumeric token.
    Plain substring matching would reject every URL containing "uploads" or
    "downloads", because both contain "ads".

    Returns False for anything that cannot be parsed.
    """
    import re

    blacklist_patterns = (
        'avatar', 'profile', 'icon', 'logo', 'banner',
        'advertisement', 'ads', 'tracking', 'pixel', 'social',
    )
    try:
        parsed = urlparse(url)
        # The URL must be absolute.
        if not parsed.scheme or not parsed.netloc:
            return False
        # The path must end with an allowed image extension.
        if not parsed.path.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
            return False
        # Reject unwanted images (avatars, logos, ads, tracking pixels, ...).
        unwanted = r'(?<![a-z0-9])(?:' + '|'.join(blacklist_patterns) + r')'
        return re.search(unwanted, url.lower()) is None
    except (ValueError, TypeError, AttributeError):
        return False
def get_image_dimensions(img_tag) -> tuple:
    """Read the image dimensions from the ``width``/``height`` attributes.

    Returns ``(width, height)`` as integers when both attributes are present
    and numeric. In every other case (missing attribute, values such as
    ``"100%"`` or ``"120px"``, or an object without ``get``) the result is
    ``(None, None)``. Dimensions given only in a ``style`` attribute are not
    evaluated.
    """
    try:
        width = img_tag.get('width')
        height = img_tag.get('height')
        if width and height:
            return int(width), int(height)
    except (AttributeError, TypeError, ValueError):
        # Non-numeric attribute values or an unusable tag object.
        pass
    return None, None
def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """Collect the available metadata for a single ``<img>`` tag.

    The image source is taken from ``src``, ``data-src`` or
    ``data-lazy-src`` (in that order) and resolved against *base_url*.
    Returns None when there is no source, the resolved URL is rejected by
    ``is_valid_image_url``, both dimensions are known and one is below
    ``MIN_IMAGE_SIZE``, or an error occurs. Otherwise returns a dict with
    the keys url, alt, caption, copyright, copyright_url, width, height and
    title.
    """
    try:
        # Image source
        src = img_tag.get('src') or img_tag.get('data-src') or img_tag.get('data-lazy-src')
        if not src:
            return None

        img_url = urljoin(base_url, src)
        if not is_valid_image_url(img_url):
            return None

        alt_text = img_tag.get('alt', '').strip()
        title = img_tag.get('title', '').strip()

        # Skip very small images when both dimensions are known.
        width, height = get_image_dimensions(img_tag)
        if width and height and min(width, height) < MIN_IMAGE_SIZE:
            return None

        caption = ""
        copyright_text = "Unbekannt"
        copyright_url = base_url

        # Look for caption and copyright in the nearest enclosing element.
        parent = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if parent:
            figcaption = parent.find('figcaption')
            if figcaption:
                caption = figcaption.get_text(strip=True)
                # A link inside the figcaption is taken as the copyright holder.
                copyright_link = figcaption.find('a')
                if copyright_link:
                    copyright_url = urljoin(base_url, copyright_link.get('href', ''))
                    copyright_text = copyright_link.get_text(strip=True)

            # Alternative: small-print text near the image, used only while
            # no caption has been found yet.
            for candidate in parent.find_all(['small', 'em', 'i'], limit=3):
                candidate_text = candidate.get_text(strip=True)
                if 10 < len(candidate_text) < 200 and not caption:
                    caption = candidate_text

        # Caption fallback
        if not caption:
            caption = title or alt_text or "Bild aus Originalartikel"

        return {
            "url": img_url,
            "alt": alt_text,
            "caption": caption[:300] if caption else "Kein Bildtitel vorhanden",
            "copyright": copyright_text or "Unbekannt",
            "copyright_url": copyright_url or base_url,
            "width": width,
            "height": height,
            "title": title
        }
    except Exception as e:
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """Main entry point: extract images with metadata from an article page.

    The page is fetched with browser-like headers, every ``<img>`` tag is
    passed to ``extract_image_metadata``, duplicates (by URL) are dropped
    and collection stops after ``MAX_IMAGES`` images. The result is sorted
    by pixel area, largest first; images without known dimensions count as
    area 0.

    Returns an empty list when *article_url* is empty or on any error.
    """
    images = []
    if not article_url:
        return images

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # HTTP request with browser-like headers
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates
        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)
                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])
                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")
                    # Stop once the maximum is reached.
                    if len(images) >= MAX_IMAGES:
                        break
            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Sort by area, larger images first. The 'width'/'height' keys exist
        # but hold None when unknown, so dict.get's default never applies;
        # `or 0` keeps the multiplication from raising TypeError.
        images.sort(
            key=lambda img: (img.get('width') or 0) * (img.get('height') or 0),
            reverse=True,
        )

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # limit again as a safeguard

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
def validate_image_url(url: str) -> bool:
    """Check whether an image is actually reachable.

    Sends a HEAD request (5 second timeout) and returns True when the final
    response has status 200 and an ``image`` content type. Redirects are
    followed explicitly because ``requests.head`` does not follow them by
    default, which would report redirected images as unreachable.

    Returns False on any request error.
    """
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
    except (requests.RequestException, ValueError):
        return False
    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
def extract_featured_image(article_url: str) -> Dict:
    """Try to find the main (featured) image of an article.

    The page is fetched and the OpenGraph ``og:image`` meta tag is checked
    first, then the Twitter card ``twitter:image`` meta tag. The first one
    whose resolved URL passes ``is_valid_image_url`` is returned as a
    metadata dict with ``"type": "featured"``. Returns None when neither
    qualifies or on any error.
    """
    try:
        response = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Lookups in priority order: OpenGraph image, then Twitter card image.
        meta_lookups = (
            {'property': 'og:image'},
            {'attrs': {'name': 'twitter:image'}},
        )
        for lookup in meta_lookups:
            meta_tag = soup.find('meta', **lookup)
            if not (meta_tag and meta_tag.get('content')):
                continue
            img_url = urljoin(article_url, meta_tag['content'])
            if is_valid_image_url(img_url):
                return {
                    "url": img_url,
                    "alt": "Featured Image",
                    "caption": "Hauptbild des Artikels",
                    "copyright": "Unbekannt",
                    "copyright_url": article_url,
                    "type": "featured"
                }
        return None
    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """Clean and normalise a list of image metadata dicts.

    Entries without a URL, or whose URL is rejected by
    ``is_valid_image_url``, are dropped. Text fields are stripped and
    truncated (alt/title 200, caption 300, copyright 100 characters) and
    empty fields receive default values. A missing copyright URL falls back
    to the image URL. Entries that raise during cleaning are logged and
    skipped.
    """
    def _stripped(img, key, fallback):
        # Missing or falsy values are replaced by the fallback before stripping.
        return (img.get(key) or fallback).strip()

    cleaned_images = []
    for img in images:
        try:
            # Validate the URL first.
            if not img.get("url") or not is_valid_image_url(img["url"]):
                continue

            entry = {
                "url": img["url"].strip(),
                "alt": _stripped(img, "alt", "")[:200],
                "caption": _stripped(img, "caption", "Kein Bildtitel vorhanden")[:300],
                "copyright": _stripped(img, "copyright", "Unbekannt")[:100],
                "copyright_url": _stripped(img, "copyright_url", "#"),
                "width": img.get("width"),
                "height": img.get("height"),
                "title": _stripped(img, "title", "")[:200]
            }

            # Fill fields that became empty through stripping.
            entry["caption"] = entry["caption"] or "Kein Bildtitel vorhanden"
            entry["copyright"] = entry["copyright"] or "Unbekannt"
            if entry["copyright_url"] in ("", "#"):
                entry["copyright_url"] = img["url"]  # image URL as fallback

            cleaned_images.append(entry)
        except Exception as e:
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue
    return cleaned_images
# Main entry point for better compatibility with the existing code
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """Extended image extraction that combines several strategies.

    The featured image (if any) comes first, followed by the images from
    ``extract_images_with_metadata``. Duplicate URLs are removed keeping the
    first occurrence, the metadata is cleaned, and at most ``MAX_IMAGES``
    entries are returned.
    """
    candidates = []

    # 1. Featured image
    featured = extract_featured_image(article_url)
    if featured:
        candidates.append(featured)

    # 2. Regular image extraction
    candidates.extend(extract_images_with_metadata(article_url))

    # 3. Remove duplicates; dicts keep insertion order and setdefault keeps
    #    the first image seen for each URL.
    by_url = {}
    for img in candidates:
        by_url.setdefault(img["url"], img)

    # 4. Clean metadata and limit the result
    return clean_image_metadata(list(by_url.values()))[:MAX_IMAGES]

236
utils/ui_helpers.py Normal file
View file

@ -0,0 +1,236 @@
# utils/ui_helpers.py
import streamlit as st
from datetime import datetime
import logging
def show_toast(message, type="success", duration=3):
    """Show a notification of the given type.

    *type* selects the Streamlit call: "success", "error", "warning" or
    "info". Any other value shows nothing. *duration* is accepted but not
    used.
    """
    renderers = (
        ("success", st.success),
        ("error", st.error),
        ("warning", st.warning),
        ("info", st.info),
    )
    for kind, render in renderers:
        if type == kind:
            render(message)
            break
def format_datetime(date_str):
    """Format a date string as ``DD.MM.YYYY HH:MM`` for display.

    Two input formats are recognised:

    * ISO 8601 timestamps containing ``T`` (a trailing ``Z`` is treated as
      UTC), e.g. ``2025-08-16T11:13:10+02:00``;
    * RFC 2822 dates as used in feeds, e.g.
      ``Sat, 16 Aug 2025 11:13:10 GMT`` or ``... +0200``.

    The time is shown as written in the input; no time-zone conversion is
    applied. Strings in any other format are returned truncated to 16
    characters. Non-string values are returned as ``str(value)``.
    """
    # Function-scope import: only this helper needs the RFC 2822 parser.
    from email.utils import parsedate_to_datetime

    if not isinstance(date_str, str):
        return str(date_str)

    text = date_str.strip()
    parsed = None

    # ISO 8601 is tried first. Day names and "GMT" also contain a "T", so a
    # failure here simply falls through to the RFC 2822 parser.
    if "T" in text:
        try:
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            parsed = None

    if parsed is None:
        try:
            # Handles "GMT" as well as numeric offsets of either sign.
            parsed = parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError):
            parsed = None

    if parsed is not None:
        return parsed.strftime("%d.%m.%Y %H:%M")

    # Warn only when the value looked like a structured timestamp.
    if "T" in date_str or "GMT" in date_str or "+" in date_str:
        logging.warning(f"Datum konnte nicht formatiert werden: {date_str}")
    return date_str[:16]
def get_status_color(status):
    """Return the hex colour associated with an article status.

    Unknown statuses get the same colour as "New" (``#2196f3``).
    """
    default_color = "#2196f3"
    palette = dict([
        ("New", "#2196f3"),
        ("Rewrite", "#ff9800"),
        ("Process", "#9c27b0"),
        ("Online", "#4caf50"),
        ("On Hold", "#e91e63"),
        ("Trash", "#f44336"),
    ])
    try:
        return palette[status]
    except KeyError:
        return default_color
def create_status_badge(status):
    """Build the HTML for a status badge.

    The status colour from ``get_status_color`` is used for the text; the
    same colour with the suffixes ``20`` and ``40`` appended is used for the
    background and the border.
    """
    color = get_status_color(status)
    background = f"{color}20"
    border = f"{color}40"
    return f"""
<span style="
background-color: {background};
color: {color};
padding: 0.25rem 0.5rem;
border-radius: 12px;
font-size: 0.8rem;
font-weight: 600;
border: 1px solid {border};
">{status}</span>
"""
def truncate_text(text, max_length=150):
    """Shorten *text* to roughly *max_length* characters.

    Text that already fits is returned unchanged and falsy input yields an
    empty string. Longer text is cut at *max_length*, trimmed back to the
    last space inside that prefix (if there is one) and suffixed with
    ``"..."``.
    """
    if not text:
        return ""
    if len(text) <= max_length:
        return text

    clipped = text[:max_length]
    last_space = clipped.rfind(' ')
    if last_space != -1:
        clipped = clipped[:last_space]
    return clipped + "..."
def calculate_reading_time(text):
    """Estimate the reading time in minutes at 200 words per minute.

    Returns 0 for empty input, otherwise the word count divided by 200
    (rounded down) with a minimum of 1.
    """
    if not text:
        return 0
    minutes = len(text.split()) // 200
    return minutes if minutes > 1 else 1
def validate_url(url):
    """Validate an http(s) URL.

    Accepts URLs whose host is a domain name, ``localhost`` or a dotted
    IPv4 address, with an optional port and path. The top-level domain may
    be 2 to 63 letters long, so newer TLDs such as ``.camping`` are
    accepted. Non-string input is reported as invalid instead of raising.
    """
    import re

    if not isinstance(url, str):
        return False

    pattern = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IP
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return pattern.match(url) is not None
def create_article_card_html(article, source_name="Unbekannt"):
    """Build the HTML for an article card.

    The card shows title, date, word count, reading time, image count,
    status badge, truncated summary, source name and up to three tags. A
    warning icon is appended to the title when any image lacks a caption,
    copyright or copyright URL. Values are inserted into the HTML as they
    are, without escaping.
    """
    images = article.get("images", [])
    body_text = article.get("text", "")
    status = article.get('status', 'New')
    tags = article.get('tags', [])

    has_images = len(images) > 0
    word_count = len(body_text.split())
    reading_time = calculate_reading_time(body_text)

    # Flag images with incomplete attribution.
    required_keys = ("caption", "copyright", "copyright_url")
    incomplete_images = any(
        not all(key in img and img[key] for key in required_keys)
        for img in images
    )
    warning_icon = " ⚠️" if incomplete_images else ""

    accent_color = get_status_color(status)
    title = article.get('title', 'Kein Titel')
    date_label = format_datetime(article.get('date', ''))
    image_note = '• 🖼️ ' + str(len(images)) + ' Bilder' if has_images else ''
    badge_html = create_status_badge(status)
    summary = truncate_text(article.get('summary', ''), 200)
    tag_label = ', '.join(tags[:3])
    tag_overflow = '...' if len(tags) > 3 else ''

    return f"""
<div style="
background: white;
border-radius: 12px;
padding: 1.5rem;
margin-bottom: 1rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
border-left: 4px solid {accent_color};
transition: transform 0.2s ease;
" onmouseover="this.style.transform='translateY(-2px)'" onmouseout="this.style.transform='translateY(0)'">
<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
<div style="flex: 1;">
<h3 style="margin: 0 0 0.5rem 0; color: #333; font-size: 1.1rem;">
{title}{warning_icon}
</h3>
<div style="font-size: 0.85rem; color: #666; margin-bottom: 0.5rem;">
📅 {date_label}
📝 {word_count} Wörter
{reading_time} Min Lesezeit
{image_note}
</div>
</div>
<div>
{badge_html}
</div>
</div>
<div style="margin-bottom: 1rem; color: #555; line-height: 1.4;">
{summary}
</div>
<div style="display: flex; justify-content: space-between; align-items: center; font-size: 0.8rem; color: #888;">
<div>
📡 {source_name}
</div>
<div>
🏷 {tag_label}{tag_overflow}
</div>
</div>
</div>
"""
def create_stats_card(title, value, icon="📊", color="#667eea"):
    """Build the HTML for a statistics card.

    The card shows *icon*, *value* and *title* stacked vertically; *color*
    is used for the top border and for the value text.
    """
    html_lines = [
        "",
        '<div style="',
        "background: white;",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "text-align: center;",
        "box-shadow: 0 2px 8px rgba(0,0,0,0.1);",
        f"border-top: 4px solid {color};",
        '">',
        f'<div style="font-size: 2rem; margin-bottom: 0.5rem;">{icon}</div>',
        f'<div style="font-size: 2rem; font-weight: bold; color: {color}; margin-bottom: 0.5rem;">{value}</div>',
        f'<div style="color: #666; font-weight: 500;">{title}</div>',
        "</div>",
        "",
    ]
    return "\n".join(html_lines)
def show_loading_spinner(text="Lädt..."):
    """Render a CSS loading spinner with a caption.

    The markup is written into a fresh ``st.empty()`` placeholder and the
    result of its ``markdown`` call is returned. *text* is inserted into the
    HTML as is.
    """
    # The literal braces of the CSS @keyframes rule are doubled because this
    # is an f-string; single braces would be parsed as replacement fields
    # and raise a SyntaxError when the module is imported.
    return st.empty().markdown(f"""
<div style="text-align: center; padding: 2rem;">
<div style="
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
border-radius: 50%;
width: 40px;
height: 40px;
animation: spin 1s linear infinite;
margin: 0 auto 1rem auto;
"></div>
<div style="color: #666;">{text}</div>
</div>
<style>
@keyframes spin {{
0% {{ transform: rotate(0deg); }}
100% {{ transform: rotate(360deg); }}
}}
</style>
""", unsafe_allow_html=True)
def create_filter_section():
    """Return the opening HTML of the filter area.

    The markup consists of a styled container ``<div>`` and a heading. The
    container is not closed in the returned string.
    """
    html_lines = (
        "",
        '<div style="',
        "background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);",
        "border-radius: 12px;",
        "padding: 1.5rem;",
        "margin-bottom: 2rem;",
        "box-shadow: 0 4px 6px rgba(0,0,0,0.1);",
        '">',
        '<h3 style="margin: 0 0 1rem 0; color: #333;">🔍 Filter & Suche</h3>',
        "",
    )
    return "\n".join(html_lines)
def get_error_message(error_type, details=""):
    """Return a formatted error message for *error_type*.

    Known types are feed_error, save_error, api_error, validation_error and
    network_error. Any other type yields the generic "unknown error"
    message. *details* is appended after a colon.
    """
    prefixes = {
        "feed_error": "❌ Fehler beim Laden des Feeds",
        "save_error": "❌ Fehler beim Speichern",
        "api_error": "❌ API-Fehler",
        "validation_error": "⚠️ Validierungsfehler",
        "network_error": "🌐 Netzwerkfehler",
    }
    prefix = prefixes.get(error_type, "❌ Unbekannter Fehler")
    return f"{prefix}: {details}"