Grundfunktionen optimiert

2025-08-16 11:13:10 +02:00 · 2025-08-16 11:13:10 +02:00 · 0c84dd1a1a
commit 0c84dd1a1a
parent 050e08859c
8 changed files with 4866 additions and 315 deletions
--- a/utils/article_extractor.py
+++ b/utils/article_extractor.py
@ -2,26 +2,362 @@

 import requests
 from bs4 import BeautifulSoup
+import logging
+import time
+from typing import Optional
+
+# Configuration
+REQUEST_TIMEOUT = 15  # passed as ``timeout`` to requests.get in extract_full_article
+MAX_RETRIES = 3  # upper bound on fetch attempts in extract_full_article
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+
+# Site-specific selectors.
+# Each key maps to an ordered list of selector dicts: 'tag' plus an optional
+# 'class' or 'id'. extract_with_selectors tries the entries in list order.
+CONTENT_SELECTORS = {
+    # Promobil & camping-specific sites
+    'promobil.de': [
+        {'tag': 'div', 'class': 'article__text'},
+        {'tag': 'div', 'class': 'article-content'},
+        {'tag': 'div', 'class': 'content-text'}
+    ],
+    'camping.info': [
+        {'tag': 'div', 'class': 'article-body'},
+        {'tag': 'div', 'class': 'post-content'}
+    ],
+    'caravaning.de': [
+        {'tag': 'div', 'class': 'article__content'},
+        {'tag': 'div', 'class': 'entry-content'}
+    ],
+    
+    # Standard WordPress selectors (prepended when is_wordpress_site matches)
+    'wordpress': [
+        {'tag': 'div', 'class': 'entry-content'},
+        {'tag': 'div', 'class': 'post-content'},
+        {'tag': 'div', 'class': 'content'},
+        {'tag': 'main', 'class': 'main-content'},
+        {'tag': 'article', 'class': None}
+    ],
+    
+    # General fallbacks (returned when no site-specific key matches)
+    'generic': [
+        {'tag': 'article', 'class': None},
+        {'tag': 'div', 'class': 'content'},
+        {'tag': 'div', 'class': 'post'},
+        {'tag': 'div', 'class': 'entry'},
+        {'tag': 'main', 'class': None},
+        {'tag': 'div', 'id': 'content'},
+        {'tag': 'div', 'id': 'main'}
+    ]
+}
+
+def get_domain_from_url(url: str) -> str:
+    """
+    Return the lower-cased network location (host, plus port if given) of a URL.
+
+    Returns an empty string when ``url`` is not a string or cannot be parsed.
+    """
+    from urllib.parse import urlparse
+
+    if not isinstance(url, str):
+        # urlparse(None) does not raise but yields bytes results; keep the
+        # documented str return type instead.
+        return ""
+    try:
+        return urlparse(url).netloc.lower()
+    except ValueError:
+        # Raised for a malformed authority part, e.g. an unbalanced "[".
+        return ""
+
+def get_selectors_for_domain(domain: str) -> list:
+    """
+    Look up the content selectors for a domain.
+
+    The first site-specific key of CONTENT_SELECTORS that occurs as a
+    substring of ``domain`` wins; otherwise the 'generic' list is returned.
+    """
+    reserved_keys = ('wordpress', 'generic')
+    site_match = next(
+        (
+            key for key in CONTENT_SELECTORS
+            if key not in reserved_keys and key in domain
+        ),
+        None,
+    )
+    if site_match is not None:
+        return CONTENT_SELECTORS[site_match]
+
+    # WordPress is detected later from the parsed page, not from the domain.
+    return CONTENT_SELECTORS['generic']
+
+def is_wordpress_site(soup: BeautifulSoup) -> bool:
+    """
+    Detect a WordPress site from markers in the parsed page.
+
+    Checks, in order: the generator meta tag, any <link> whose href contains
+    '/wp-', and the REST API discovery link. Returns False when none match
+    or when the lookup itself fails.
+    """
+    try:
+        # WordPress generator meta tag
+        generator = soup.find('meta', attrs={'name': 'generator'})
+        if generator and 'wordpress' in (generator.get('content') or '').lower():
+            return True
+
+        # WordPress-specific link tags; one hit is enough, so find() suffices
+        if soup.find('link', href=lambda x: x and '/wp-' in x):
+            return True
+
+        # WordPress REST API discovery link
+        if soup.find('link', attrs={'rel': 'https://api.w.org/'}):
+            return True
+
+        return False
+    except Exception as e:
+        # Detection is best-effort: report the failure instead of hiding it,
+        # and let KeyboardInterrupt/SystemExit propagate.
+        logging.debug(f"WordPress-Erkennung fehlgeschlagen: {e}")
+        return False
+
+def clean_extracted_text(text: str, max_boilerplate_length: int = 120) -> str:
+    """
+    Remove navigation/boilerplate lines from extracted text.
+
+    The text is split on newlines. A line is dropped when it is shorter than
+    10 characters, when it is at most ``max_boilerplate_length`` characters
+    long and contains a boilerplate keyword, or when it holds more than three
+    navigation symbols. The surviving lines are joined with single spaces.
+
+    Args:
+        text: raw extracted text.
+        max_boilerplate_length: longest line that keyword matching may drop.
+            Longer lines are treated as article content, because keywords
+            match as substrings ('teilen' occurs in 'Vorteilen') and callers
+            often pass the whole article as one line.
+
+    Returns:
+        The cleaned text, or "" for empty input.
+    """
+    if not text:
+        return ""
+
+    # Typical navigation/footer phrases (runtime data, matched as substrings)
+    skip_patterns = (
+        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
+        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
+        'mehr zum thema', 'ähnliche artikel', 'kommentare',
+        'anzeige', 'werbung', 'advertisement'
+    )
+    nav_symbols = '|•→←↑↓'
+    cleaned_lines = []
+
+    for line in text.split('\n'):
+        line = line.strip()
+
+        # Very short lines are most likely navigation or ads
+        if len(line) < 10:
+            continue
+
+        # Keywords only disqualify short lines; see max_boilerplate_length
+        if len(line) <= max_boilerplate_length:
+            lowered = line.lower()
+            if any(pattern in lowered for pattern in skip_patterns):
+                continue
+
+        # Lines crowded with separator symbols are navigation bars
+        if sum(line.count(symbol) for symbol in nav_symbols) > 3:
+            continue
+
+        cleaned_lines.append(line)
+
+    # Join the lines and collapse repeated whitespace
+    return ' '.join(' '.join(cleaned_lines).split())
+
+def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
+    """
+    Try a list of selectors in order and return the first usable text.
+
+    Each selector is a dict with 'tag' and optionally 'class' or 'id'. A
+    match counts only when its raw text has more than 50 words AND the
+    cleaned text is non-empty; otherwise the next selector is tried.
+
+    Note: script/style/nav/header/footer/aside children of every matched
+    element are removed from ``soup`` in place.
+
+    Returns:
+        The cleaned text of the first usable match, or "".
+    """
+    for selector in selectors:
+        try:
+            tag_name = selector['tag']
+            if selector.get('class'):
+                element = soup.find(tag_name, class_=selector['class'])
+            elif selector.get('id'):
+                element = soup.find(tag_name, id=selector['id'])
+            else:
+                element = soup.find(tag_name)
+
+            if element is None:
+                continue
+
+            # Strip non-content children before reading the text
+            for unwanted in element(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+                unwanted.decompose()
+
+            text = element.get_text(' ', strip=True)
+            if len(text.split()) <= 50:
+                continue
+
+            # Cleaning may discard everything; in that case keep looking
+            # instead of reporting an empty result for the whole page.
+            cleaned = clean_extracted_text(text)
+            if cleaned:
+                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
+                return cleaned
+
+        except Exception as e:
+            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
+            continue
+
+    return ""
+
+def extract_from_paragraphs(soup: BeautifulSoup) -> str:
+    """
+    Fallback: build the article text from all <p> tags.
+
+    Paragraphs with more than 20 characters are kept. They are joined with
+    newlines so that clean_extracted_text can filter them one by one; a
+    single boilerplate paragraph then no longer affects the others.
+
+    Returns:
+        The cleaned text when the kept paragraphs hold more than 30 words,
+        otherwise "".
+    """
+    try:
+        paragraphs = soup.find_all('p')
+
+        if not paragraphs:
+            return ""
+
+        # Keep only the longer paragraphs
+        texts = [
+            paragraph_text
+            for paragraph_text in (p.get_text(strip=True) for p in paragraphs)
+            if len(paragraph_text) > 20
+        ]
+
+        combined_text = '\n'.join(texts)
+
+        if len(combined_text.split()) > 30:
+            logging.info(f"✅ Fallback-Extraktion aus {len(paragraphs)} Paragraphen")
+            return clean_extracted_text(combined_text)
+
+        return ""
+
+    except Exception as e:
+        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
+        return ""

 def extract_full_article(url: str) -> str:
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        # Promobil & WordPress & allgemeine Fallbacks
-        candidates = [
-            {"tag": "div", "class_": "article__text"},     # Promobil
-            {"tag": "div", "class_": "entry-content"},     # WordPress Standard
-            {"tag": "article", "class_": None},            # Generisch
-        ]
-
-        for selector in candidates:
-            el = soup.find(selector["tag"], class_=selector["class_"])
-            if el and len(el.get_text(strip=True).split()) > 50:
-                return el.get_text(" ", strip=True)
-
-        # Fallback: ganzer Seiteninhalt
-        return soup.get_text(" ", strip=True)
-    except Exception:
+    """
+    Main entry point: fetch ``url`` and extract the full article text.
+
+    Strategies are tried in order: domain/WordPress selectors, generic
+    selectors, paragraph collection, cleaned <body> text.
+
+    Network errors are retried up to MAX_RETRIES times with exponential
+    backoff. Client errors (4xx other than 408/429) are not retried because
+    repeating the request cannot change the answer.
+
+    Returns:
+        The extracted text, or "" when ``url`` is empty, the request finally
+        fails, or no usable content is found.
+    """
+    if not url:
         return ""
+
+    # Browser-like request headers; identical for every attempt
+    headers = {
+        'User-Agent': USER_AGENT,
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    retries = 0
+
+    while retries < MAX_RETRIES:
+        try:
+            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {retries + 1})")
+
+            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
+            response.raise_for_status()
+
+            # response.encoding is None when no charset can be derived from
+            # the headers; guard against that before lower-casing. Legacy
+            # single-byte declarations are overridden with UTF-8.
+            declared_encoding = (response.encoding or '').lower()
+            if declared_encoding in ('iso-8859-1', 'windows-1252'):
+                response.encoding = 'utf-8'
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Determine domain-specific selectors
+            domain = get_domain_from_url(url)
+            selectors = get_selectors_for_domain(domain)
+
+            # WordPress sites get the WordPress selectors tried first
+            if is_wordpress_site(soup):
+                logging.info("🔧 WordPress-Site erkannt")
+                selectors = CONTENT_SELECTORS['wordpress'] + selectors
+
+            # Attempt 1: domain-specific selectors
+            extracted_text = extract_with_selectors(soup, selectors)
+
+            if extracted_text and len(extracted_text.split()) > 50:
+                logging.info(f"🎉 Erfolgreiche Extraktion: {len(extracted_text.split())} Wörter")
+                return extracted_text
+
+            # Attempt 2: generic selectors
+            if not extracted_text:
+                logging.info("🔄 Fallback auf generische Selektoren")
+                extracted_text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])
+
+            if extracted_text and len(extracted_text.split()) > 50:
+                logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(extracted_text.split())} Wörter")
+                return extracted_text
+
+            # Attempt 3: paragraph extraction
+            if not extracted_text:
+                logging.info("🔄 Fallback auf Paragraph-Extraktion")
+                extracted_text = extract_from_paragraphs(soup)
+
+            if extracted_text and len(extracted_text.split()) > 30:
+                logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(extracted_text.split())} Wörter")
+                return extracted_text
+
+            # Attempt 4: whole body text
+            if not extracted_text:
+                logging.info("🔄 Letzter Fallback: Body-Text")
+                body = soup.find('body')
+                if body:
+                    # Remove navigation, header, footer and non-content tags
+                    for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
+                        element.decompose()
+
+                    body_text = body.get_text(' ', strip=True)
+                    if len(body_text.split()) > 100:
+                        extracted_text = clean_extracted_text(body_text)
+                        logging.info(f"⚠️ Body-Extraktion: {len(extracted_text.split())} Wörter")
+                        return extracted_text
+
+            # No usable text found
+            if not extracted_text:
+                logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
+                return ""
+
+            # A short but non-empty result from an earlier attempt
+            return extracted_text
+
+        except requests.RequestException as e:
+            retries += 1
+            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {retries}): {e}")
+
+            # Compare against None explicitly: a Response for an error
+            # status is falsy, so a truthiness test would skip it.
+            failed_response = getattr(e, 'response', None)
+            status = getattr(failed_response, 'status_code', None)
+            if status is not None and 400 <= status < 500 and status not in (408, 429):
+                logging.error(f"❌ HTTP-Fehler {status} bei {url} – kein erneuter Versuch")
+                return ""
+
+            if retries < MAX_RETRIES:
+                time.sleep(2 ** retries)  # exponential backoff
+                continue
+
+            logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
+            return ""
+
+        except Exception as e:
+            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
+            return ""
+
+    return ""
+
+def extract_article_summary(full_text: str, max_length: int = 300) -> str:
+    """
+    Build a short summary from the leading sentences of a text.
+
+    The text is split on '.', and only the first five fragments are
+    considered; fragments shorter than 20 characters are skipped. Sentences
+    are added while the finished summary (including separators and the
+    final period) still fits into ``max_length``.
+
+    When even the first usable sentence is too long, it is cut at a word
+    boundary and suffixed with '...' instead of returning nothing.
+
+    Returns:
+        A summary of at most ``max_length`` characters, or "" when there is
+        no usable sentence.
+    """
+    if not full_text or max_length <= 0:
+        return ""
+
+    candidates = [part.strip() for part in full_text.split('.')[:5]]
+    candidates = [sentence for sentence in candidates if len(sentence) >= 20]
+    if not candidates:
+        return ""
+
+    summary_sentences = []
+    joined_length = 0
+    for sentence in candidates:
+        separator = 2 if summary_sentences else 0  # the '. ' between sentences
+        # +1 accounts for the closing period added below
+        if joined_length + separator + len(sentence) + 1 > max_length:
+            break
+        summary_sentences.append(sentence)
+        joined_length += separator + len(sentence)
+
+    if summary_sentences:
+        return ('. '.join(summary_sentences) + '.')[:max_length]
+
+    # The first sentence alone exceeds the budget: shorten it
+    first_sentence = candidates[0]
+    if max_length <= 3:
+        return first_sentence[:max_length]
+    clipped = first_sentence[:max_length - 3].rsplit(' ', 1)[0]
+    return clipped + '...'
+
+def validate_extracted_content(text: str) -> bool:
+    """
+    Decide whether extracted content looks like usable article text.
+
+    Requires at least 100 stripped characters and 50 words, no more than 5%
+    navigation symbols, and an average word length of at least 3.
+    """
+    if not text:
+        return False
+    if len(text.strip()) < 100:
+        return False
+
+    tokens = text.split()
+    token_count = len(tokens)
+    if token_count < 50:
+        return False
+
+    # Too many separator symbols point to navigation markup
+    nav_symbol_count = sum(1 for ch in text if ch in '|•→←↑↓')
+    if nav_symbol_count > len(text) * 0.05:
+        return False
+
+    # Very short average words also point to navigation rather than prose
+    mean_token_length = sum(map(len, tokens)) / token_count
+    return mean_token_length >= 3
--- a/utils/image_extractor.py
+++ b/utils/image_extractor.py
@ -2,59 +2,325 @@

 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 import logging
+import time
+from typing import List, Dict

+# Configuration
+MAX_IMAGES = 5  # upper bound on the number of images returned per article
+MIN_IMAGE_SIZE = 100  # minimum width/height in pixels; smaller images are skipped
+ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}  # accepted URL path suffixes
+REQUEST_TIMEOUT = 10  # passed as ``timeout`` to requests.get
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

-def extract_images_with_metadata(article_url):
+def is_valid_image_url(url: str) -> bool:
    """
-    Versucht, Bilder mit Bildunterschrift und Copyright aus dem Originalartikel zu extrahieren.
-    Gibt eine Liste mit Dictionaries zurück: {url, alt, copyright, copyright_url, caption}
+    Check whether a URL plausibly points to a usable content image.
+
+    The URL must be absolute (scheme and host), its path must end with one
+    of ALLOWED_EXTENSIONS, and it must not contain a blacklisted word.
+    Blacklisted words match as whole words (optionally plural, delimited by
+    non-letters), so 'ads' rejects '/ads/banner.jpg' but no longer rejects
+    '/wp-content/uploads/photo.jpg'.
+
+    Returns False for anything that cannot be parsed.
+    """
+    import re
+
+    try:
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+
+        # Check the file extension
+        if not path.endswith(tuple(ALLOWED_EXTENSIONS)):
+            return False
+
+        # Check that the URL is absolute
+        if not parsed.scheme or not parsed.netloc:
+            return False
+
+        # Blacklist for unwanted images
+        blacklist_patterns = [
+            'avatar', 'profile', 'icon', 'logo', 'banner',
+            'advertisement', 'ads', 'tracking', 'pixel', 'social'
+        ]
+        blacklist_re = re.compile(
+            r'(?<![a-z])(?:' + '|'.join(blacklist_patterns) + r')s?(?![a-z])'
+        )
+
+        return blacklist_re.search(url.lower()) is None
+
+    except Exception:
+        return False
+
+def get_image_dimensions(img_tag) -> tuple:
+    """
+    Read the pixel dimensions from an image tag's width/height attributes.
+
+    Accepts plain numbers and values with a 'px' suffix ("640", "640px").
+    Percentages and other non-pixel values are ignored. The style attribute
+    is not evaluated.
+
+    Args:
+        img_tag: any object with a ``get(name)`` method for attributes.
+
+    Returns:
+        ``(width, height)`` as ints when both are known, else ``(None, None)``.
+    """
+    import re
+
+    def _to_pixels(raw):
+        # Convert one attribute value to int pixels, or None if unusable
+        if raw is None:
+            return None
+        match = re.fullmatch(r'\s*(\d+)(?:\.\d+)?\s*(?:px)?\s*', str(raw), re.IGNORECASE)
+        return int(match.group(1)) if match else None
+
+    try:
+        width = _to_pixels(img_tag.get('width'))
+        height = _to_pixels(img_tag.get('height'))
+    except AttributeError:
+        # img_tag offers no attribute access (e.g. None)
+        return None, None
+
+    if width is not None and height is not None:
+        return width, height
+    return None, None
+
+def extract_image_metadata(img_tag, base_url: str) -> Dict:
+    """
+    Collect the metadata available for a single <img> tag.
+
+    Args:
+        img_tag: parsed image tag supporting ``get`` and ``find_parent``.
+        base_url: page URL; used to resolve relative links and as the
+            default copyright URL.
+
+    Returns:
+        A dict with the keys url, alt, caption, copyright, copyright_url,
+        width, height and title. Returns None when the tag has no usable
+        source, the URL is rejected by is_valid_image_url, the declared
+        size is below MIN_IMAGE_SIZE, or an error occurs.
+    """
+    try:
+        # Lazy-loading pages often put an inline "data:" placeholder into
+        # src and the real address into data-src/data-lazy-src, so skip
+        # placeholders instead of letting them shadow the real URL.
+        sources = (
+            img_tag.get('src'),
+            img_tag.get('data-src'),
+            img_tag.get('data-lazy-src'),
+        )
+        src = next(
+            (s for s in sources if s and not s.strip().lower().startswith('data:')),
+            None,
+        )
+        if not src:
+            return None
+
+        img_url = urljoin(base_url, src)
+
+        if not is_valid_image_url(img_url):
+            return None
+
+        alt_text = img_tag.get('alt', '').strip()
+        title = img_tag.get('title', '').strip()
+
+        width, height = get_image_dimensions(img_tag)
+
+        # Skip very small images (only when both dimensions are known)
+        if width and height and (width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE):
+            return None
+
+        # Caption and copyright are looked up in the nearest parent element
+        caption = ""
+        copyright_text = "Unbekannt"
+        copyright_url = base_url
+
+        parent = img_tag.find_parent(['figure', 'div', 'span', 'p'])
+        if parent:
+            figcaption = parent.find('figcaption')
+            if figcaption:
+                caption = figcaption.get_text(strip=True)
+
+                # A link inside the figcaption is taken as copyright source
+                copyright_link = figcaption.find('a')
+                if copyright_link:
+                    copyright_url = urljoin(base_url, copyright_link.get('href', ''))
+                    copyright_text = copyright_link.get_text(strip=True)
+
+            # Alternative: small/emphasised text near the image
+            caption_candidates = parent.find_all(['small', 'em', 'i'], limit=3)
+            for candidate in caption_candidates:
+                text = candidate.get_text(strip=True)
+                if len(text) > 10 and len(text) < 200:  # plausible caption length
+                    if not caption:  # only when no caption was found yet
+                        caption = text
+
+        # Fallback for the caption
+        if not caption:
+            caption = title or alt_text or "Bild aus Originalartikel"
+
+        return {
+            "url": img_url,
+            "alt": alt_text,
+            "caption": caption[:300] if caption else "Kein Bildtitel vorhanden",
+            "copyright": copyright_text or "Unbekannt",
+            "copyright_url": copyright_url or base_url,
+            "width": width,
+            "height": height,
+            "title": title
+        }
+
+    except Exception as e:
+        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
+        return None
+
+def extract_images_with_metadata(article_url: str) -> List[Dict]:
+    """
+    Main entry point: extract images with metadata from an article page.
+
+    Fetches the page, runs extract_image_metadata on every <img> tag,
+    drops duplicate URLs, stops after MAX_IMAGES accepted images and
+    returns them sorted by declared area (largest first; images without
+    declared dimensions count as area 0).
+
+    Returns an empty list for an empty URL or on any error.
    """
    images = []
-    try:
-        logging.info(f"📷 Extrahiere Bilder von {article_url}")
-        response = requests.get(article_url, timeout=10)
-        if response.status_code != 200:
-            logging.warning(f"Keine gültige Antwort von {article_url} (Status {response.status_code})")
-            return []
-
-        soup = BeautifulSoup(response.content, "html.parser")
-
-        for img_tag in soup.find_all("img"):
-            src = img_tag.get("src")
-            if not src:
-                continue
-
-            img_url = urljoin(article_url, src)
-            alt_text = img_tag.get("alt", "").strip()
-
-            copyright_text = "Unbekannt"
-            copyright_link = article_url
-            caption = alt_text or "Bild aus Originalartikel"
-
-            parent = img_tag.find_parent(["figure", "div"])
-            if parent:
-                figcaption = parent.find("figcaption")
-                if figcaption:
-                    caption = figcaption.get_text(strip=True)
-                    link_tag = figcaption.find("a")
-                    if link_tag and link_tag.has_attr("href"):
-                        copyright_link = link_tag["href"]
-                        copyright_text = link_tag.get_text(strip=True)
-
-            image_data = {
-                "url": img_url,
-                "alt": alt_text,
-                "caption": caption or "Kein Bildtitel vorhanden",
-                "copyright": copyright_text or "Unbekannt",
-                "copyright_url": copyright_link or article_url
-            }
-            images.append(image_data)
-
-        logging.info(f"{len(images)} Bilder gefunden bei {article_url}")
+
+    if not article_url:
        return images
-
+
+    try:
+        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")
+
+        # HTTP request with browser-like headers
+        headers = {
+            'User-Agent': USER_AGENT,
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+        }
+
+        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        img_tags = soup.find_all("img")
+        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")
+
+        processed_urls = set()  # avoid duplicates
+
+        for img_tag in img_tags:
+            try:
+                image_data = extract_image_metadata(img_tag, article_url)
+
+                if image_data and image_data["url"] not in processed_urls:
+                    images.append(image_data)
+                    processed_urls.add(image_data["url"])
+
+                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")
+
+                    # Maximum reached?
+                    if len(images) >= MAX_IMAGES:
+                        break
+
+            except Exception as e:
+                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
+                continue
+
+        # Sort by area, larger first. The width/height keys are always
+        # present but may hold None, so dict.get's default never applies;
+        # `or 0` prevents a None * None TypeError that discarded all images.
+        images.sort(
+            key=lambda x: (x.get('width') or 0) * (x.get('height') or 0),
+            reverse=True,
+        )
+
+        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
+        return images[:MAX_IMAGES]  # cap once more to be safe
+
+    except requests.RequestException as e:
+        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
+        return []
    except Exception as e:
-        logging.exception(f"Fehler bei der Bildextraktion aus {article_url}:")
-        return []
+        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
+        return []
+
+def validate_image_url(url: str) -> bool:
+    """
+    Check whether an image is actually reachable.
+
+    Sends a HEAD request and follows redirects (requests does not follow
+    them for HEAD by default, so a redirected image used to be reported as
+    unreachable). The configured User-Agent is sent like in the other
+    requests of this module.
+
+    Returns:
+        True when the final response has status 200 and an image
+        content type; False otherwise, including on any request error.
+    """
+    try:
+        response = requests.head(
+            url,
+            timeout=5,
+            allow_redirects=True,
+            headers={'User-Agent': USER_AGENT},
+        )
+    except Exception:
+        # Best-effort check: unreachable or invalid URLs count as invalid
+        return False
+
+    content_type = response.headers.get('content-type', '').lower()
+    return response.status_code == 200 and 'image' in content_type
+
+def extract_featured_image(article_url: str) -> Dict:
+    """
+    Try to find the main/featured image of an article.
+
+    Looks at the OpenGraph image meta tag first and the Twitter card image
+    second; the first one whose resolved URL passes is_valid_image_url is
+    returned as a metadata dict. Returns None when nothing fits or an
+    error occurs.
+    """
+    # Lookup arguments for soup.find('meta', ...), in priority order
+    meta_lookups = (
+        {'property': 'og:image'},
+        {'attrs': {'name': 'twitter:image'}},
+    )
+
+    try:
+        response = requests.get(
+            article_url,
+            timeout=REQUEST_TIMEOUT,
+            headers={'User-Agent': USER_AGENT},
+        )
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        for lookup in meta_lookups:
+            meta_tag = soup.find('meta', **lookup)
+            if not (meta_tag and meta_tag.get('content')):
+                continue
+
+            img_url = urljoin(article_url, meta_tag['content'])
+            if is_valid_image_url(img_url):
+                return {
+                    "url": img_url,
+                    "alt": "Featured Image",
+                    "caption": "Hauptbild des Artikels",
+                    "copyright": "Unbekannt",
+                    "copyright_url": article_url,
+                    "type": "featured"
+                }
+
+        return None
+
+    except Exception as e:
+        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
+        return None
+
+def clean_image_metadata(images: List[Dict]) -> List[Dict]:
+    """
+    Clean up and normalise image metadata.
+
+    Entries without a URL accepted by is_valid_image_url are dropped. Text
+    fields are stripped and length-limited, and empty fields receive
+    default values. Entries that raise while being processed are logged
+    and skipped.
+    """
+    cleaned_images = []
+
+    for img in images:
+        try:
+            raw_url = img.get("url")
+            if not raw_url or not is_valid_image_url(raw_url):
+                continue
+
+            def _text(key, default=""):
+                # Stripped field value, falling back to ``default`` when unset
+                return (img.get(key) or default).strip()
+
+            caption = _text("caption", "Kein Bildtitel vorhanden")[:300]
+            copyright_text = _text("copyright", "Unbekannt")[:100]
+            copyright_url = _text("copyright_url", "#")
+
+            # The image URL itself is the fallback copyright link
+            if not copyright_url or copyright_url == "#":
+                copyright_url = img["url"]
+
+            cleaned_images.append({
+                "url": raw_url.strip(),
+                "alt": _text("alt")[:200],
+                "caption": caption or "Kein Bildtitel vorhanden",
+                "copyright": copyright_text or "Unbekannt",
+                "copyright_url": copyright_url,
+                "width": img.get("width"),
+                "height": img.get("height"),
+                "title": _text("title")[:200]
+            })
+
+        except Exception as e:
+            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
+            continue
+
+    return cleaned_images
+
+# Main function, kept for better compatibility with the existing code
+def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
+    """
+    Extended image extraction with fallback strategies.
+
+    Combines the featured image (if any) with the regular in-page images,
+    keeps the first occurrence of each URL, cleans the metadata and returns
+    at most MAX_IMAGES entries.
+    """
+    # 1. Featured image first, 2. followed by the regular images
+    featured = extract_featured_image(article_url)
+    candidates = [featured] if featured else []
+    candidates.extend(extract_images_with_metadata(article_url))
+
+    # 3. Drop duplicates; dicts keep insertion order, first entry wins
+    unique_by_url = {}
+    for img in candidates:
+        unique_by_url.setdefault(img["url"], img)
+
+    # 4. Clean the metadata and cap the result
+    return clean_image_metadata(list(unique_by_url.values()))[:MAX_IMAGES]
--- a/utils/ui_helpers.py
+++ b/utils/ui_helpers.py
@ -0,0 +1,236 @@
+# utils/ui_helpers.py
+
+import streamlit as st
+from datetime import datetime
+import logging
+
+def show_toast(message, type="success", duration=3):
+    """
+    Show a notification using the Streamlit element matching ``type``.
+
+    Supported types are "success", "error", "warning" and "info"; any
+    other value shows nothing. ``duration`` is accepted but not used.
+    """
+    if type in ("success", "error", "warning", "info"):
+        # The Streamlit functions carry the same names as the types
+        getattr(st, type)(message)
+
+def format_datetime(date_str):
+    """
+    Format a date string as ``DD.MM.YYYY HH:MM`` for display.
+
+    ISO 8601 strings (with 'T', optionally ending in 'Z' or an offset) and
+    RFC 2822 feed dates ("Sat, 16 Aug 2025 11:13:10 GMT" or "... +0200")
+    are recognised. The time is shown as written in the string; no
+    timezone conversion takes place.
+
+    Returns:
+        The formatted date; the first 16 characters of the input when it
+        cannot be parsed; ``str(date_str)`` for non-string input.
+    """
+    if not isinstance(date_str, str):
+        return str(date_str)
+
+    from email.utils import parsedate_to_datetime
+
+    try:
+        parsed = None
+
+        # ISO first: offsets contain '+', so this must not be mistaken for
+        # an RFC 2822 date. Day names such as "Tue" also contain 'T'; those
+        # simply fail here and continue below.
+        if "T" in date_str:
+            try:
+                parsed = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+            except ValueError:
+                parsed = None
+
+        # RFC 2822, including the textual "GMT" zone that %z cannot parse
+        if parsed is None:
+            try:
+                parsed = parsedate_to_datetime(date_str)
+            except (TypeError, ValueError, IndexError):
+                parsed = None
+
+        if parsed is not None:
+            return parsed.strftime("%d.%m.%Y %H:%M")
+    except Exception as e:
+        logging.warning(f"Datum konnte nicht formatiert werden: {date_str} - {e}")
+
+    return date_str[:16]
+
+def get_status_color(status):
+    """
+    Return the hex colour for an article status.
+
+    Unknown statuses get the colour of "New".
+    """
+    palette = {
+        "New": "#2196f3",
+        "Rewrite": "#ff9800",
+        "Process": "#9c27b0",
+        "Online": "#4caf50",
+        "On Hold": "#e91e63",
+        "Trash": "#f44336",
+    }
+    try:
+        return palette[status]
+    except KeyError:
+        return "#2196f3"
+
+def create_status_badge(status):
+    """
+    Build the HTML for a coloured status badge.
+
+    The colour comes from get_status_color. The status text is HTML-escaped
+    before it is placed into the markup, so unexpected status values cannot
+    inject tags.
+    """
+    import html
+
+    color = get_status_color(status)
+    label = html.escape(str(status))
+    return f"""
+    <span style="
+        background-color: {color}20; 
+        color: {color}; 
+        padding: 0.25rem 0.5rem; 
+        border-radius: 12px; 
+        font-size: 0.8rem; 
+        font-weight: 600;
+        border: 1px solid {color}40;
+    ">{label}</span>
+    """
+
+def truncate_text(text, max_length=150):
+    """
+    Shorten text to roughly ``max_length`` characters.
+
+    Text within the limit is returned unchanged. Longer text is cut at the
+    limit, trimmed back to the last space (if any) and suffixed with "...".
+    Falsy input yields "".
+    """
+    if not text:
+        return ""
+
+    if len(text) > max_length:
+        clipped = text[:max_length]
+        last_space = clipped.rfind(' ')
+        if last_space != -1:
+            clipped = clipped[:last_space]
+        return clipped + "..."
+
+    return text
+
+def calculate_reading_time(text):
+    """
+    Estimate the reading time in minutes at 200 words per minute.
+
+    Returns 0 for falsy input and at least 1 otherwise.
+    """
+    if not text:
+        return 0
+
+    minutes = len(text.split()) // 200
+    return minutes if minutes > 1 else 1
+
+def validate_url(url):
+    """
+    Validate an http(s) URL.
+
+    Accepts a domain name, ``localhost`` or an IPv4 address, with optional
+    port and path/query. Top-level domains may be 2-63 letters long or a
+    punycode label, so newer TLDs such as ``.technology`` are accepted.
+
+    Returns False for non-string input instead of raising.
+    """
+    import re
+
+    if not isinstance(url, str):
+        return False
+
+    pattern = re.compile(
+        r'^https?://'  # http:// or https://
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # domain labels...
+        r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'  # ...and the TLD
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IP
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+    return pattern.match(url) is not None
+
+def create_article_card_html(article, source_name="Unbekannt"):
+    """
+    Build the HTML for an article card.
+
+    Args:
+        article: dict; the keys read are title, date, text, summary, status,
+            tags and images. Missing or None values fall back to defaults.
+        source_name: display name of the article's source.
+
+    All text taken from the article or the source name is HTML-escaped,
+    because feed content is untrusted and the card is meant to be rendered
+    as raw HTML. A warning icon follows the title when an image lacks
+    caption, copyright or copyright_url.
+
+    Returns:
+        The card markup as a string.
+    """
+    from html import escape
+
+    images = article.get("images") or []
+    tags = article.get("tags") or []
+    text = article.get("text") or ""
+    status = article.get('status', 'New')
+
+    word_count = len(text.split())
+    reading_time = calculate_reading_time(text)
+
+    # Check for images with incomplete metadata
+    incomplete_images = any(
+        not all(img.get(k) for k in ("caption", "copyright", "copyright_url"))
+        for img in images
+    )
+    warning_icon = " ⚠️" if incomplete_images else ""
+
+    # Escape after truncating so that no HTML entity is cut in half
+    title = escape(str(article.get('title', 'Kein Titel')))
+    summary = escape(truncate_text(article.get('summary', ''), 200))
+    published = escape(format_datetime(article.get('date', '')))
+    source = escape(str(source_name))
+    tag_line = escape(', '.join(str(tag) for tag in tags[:3])) + ('...' if len(tags) > 3 else '')
+    image_info = f"• 🖼️ {len(images)} Bilder" if images else ""
+
+    return f"""
+    <div style="
+        background: white;
+        border-radius: 12px;
+        padding: 1.5rem;
+        margin-bottom: 1rem;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        border-left: 4px solid {get_status_color(status)};
+        transition: transform 0.2s ease;
+    " onmouseover="this.style.transform='translateY(-2px)'" onmouseout="this.style.transform='translateY(0)'">
+        
+        <div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
+            <div style="flex: 1;">
+                <h3 style="margin: 0 0 0.5rem 0; color: #333; font-size: 1.1rem;">
+                    {title}{warning_icon}
+                </h3>
+                <div style="font-size: 0.85rem; color: #666; margin-bottom: 0.5rem;">
+                    📅 {published} • 
+                    📝 {word_count} Wörter • 
+                    ⏱️ {reading_time} Min Lesezeit
+                    {image_info}
+                </div>
+            </div>
+            <div>
+                {create_status_badge(status)}
+            </div>
+        </div>
+        
+        <div style="margin-bottom: 1rem; color: #555; line-height: 1.4;">
+            {summary}
+        </div>
+        
+        <div style="display: flex; justify-content: space-between; align-items: center; font-size: 0.8rem; color: #888;">
+            <div>
+                📡 {source}
+            </div>
+            <div>
+                🏷️ {tag_line}
+            </div>
+        </div>
+    </div>
+    """
+
+def create_stats_card(title, value, icon="📊", color="#667eea"):
+    """
+    Build the HTML for a statistics card.
+
+    The card shows the icon, the value in the accent colour and the title
+    underneath; ``color`` is also used for the top border.
+    """
+    card_template = """
+    <div style="
+        background: white;
+        border-radius: 12px;
+        padding: 1.5rem;
+        text-align: center;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        border-top: 4px solid {color};
+    ">
+        <div style="font-size: 2rem; margin-bottom: 0.5rem;">{icon}</div>
+        <div style="font-size: 2rem; font-weight: bold; color: {color}; margin-bottom: 0.5rem;">{value}</div>
+        <div style="color: #666; font-weight: 500;">{title}</div>
+    </div>
+    """
+    return card_template.format(title=title, value=value, icon=icon, color=color)
+
+def show_loading_spinner(text="Lädt..."):
+    """
+    Render an animated loading spinner with a caption.
+
+    Args:
+        text: caption shown below the spinner.
+
+    Returns:
+        Whatever ``st.empty().markdown(...)`` returns.
+    """
+    # The markup is an f-string, so literal CSS braces must be doubled.
+    # Single braces are parsed as replacement fields and cause a SyntaxError
+    # that prevents the whole module from being imported.
+    return st.empty().markdown(f"""
+    <div style="text-align: center; padding: 2rem;">
+        <div style="
+            border: 4px solid #f3f3f3;
+            border-top: 4px solid #667eea;
+            border-radius: 50%;
+            width: 40px;
+            height: 40px;
+            animation: spin 1s linear infinite;
+            margin: 0 auto 1rem auto;
+        "></div>
+        <div style="color: #666;">{text}</div>
+    </div>
+    <style>
+    @keyframes spin {{
+        0% {{ transform: rotate(0deg); }}
+        100% {{ transform: rotate(360deg); }}
+    }}
+    </style>
+    """, unsafe_allow_html=True)
+
+def create_filter_section():
+    """
+    Return the opening markup of the filter area.
+
+    The markup contains the styled container start and its heading; the
+    container <div> is not closed here.
+    """
+    markup_lines = (
+        '',
+        '    <div style="',
+        '        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);',
+        '        border-radius: 12px;',
+        '        padding: 1.5rem;',
+        '        margin-bottom: 2rem;',
+        '        box-shadow: 0 4px 6px rgba(0,0,0,0.1);',
+        '    ">',
+        '        <h3 style="margin: 0 0 1rem 0; color: #333;">🔍 Filter & Suche</h3>',
+        '    ',
+    )
+    return '\n'.join(markup_lines)
+
+def get_error_message(error_type, details=""):
+    """
+    Return the formatted error message for an error type.
+
+    ``details`` is appended to the message text. Unknown error types
+    produce the generic "unknown error" message.
+    """
+    templates = {
+        "feed_error": "❌ Fehler beim Laden des Feeds: {}",
+        "save_error": "❌ Fehler beim Speichern: {}",
+        "api_error": "❌ API-Fehler: {}",
+        "validation_error": "⚠️ Validierungsfehler: {}",
+        "network_error": "🌐 Netzwerkfehler: {}",
+    }
+    template = templates.get(error_type, "❌ Unbekannter Fehler: {}")
+    return template.format(details)