Grundfunktionen optimiert

2025-08-16 11:13:10 +02:00 · 2025-08-16 11:13:10 +02:00 · 0c84dd1a1a
commit 0c84dd1a1a
parent 050e08859c
8 changed files with 4866 additions and 315 deletions
--- a/utils/article_extractor.py
+++ b/utils/article_extractor.py
@ -2,26 +2,362 @@

 import requests
 from bs4 import BeautifulSoup
+import logging
+import time
+from typing import Optional
+
+# Configuration
+# REQUEST_TIMEOUT: seconds passed as the `timeout` argument of requests.get().
+# MAX_RETRIES: upper bound on fetch attempts in extract_full_article().
+# USER_AGENT: browser-like User-Agent header sent with each request.
+REQUEST_TIMEOUT = 15
+MAX_RETRIES = 3
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+
+# Site-specific selectors.
+# Maps a key to an ordered list of selector dicts. Each dict names a 'tag' and
+# optionally a 'class' or an 'id'; when neither is set the tag alone is matched.
+# extract_with_selectors() tries the entries of a list in order.
+# The keys 'wordpress' and 'generic' are selector groups, not domains;
+# get_selectors_for_domain() skips them when matching a domain.
+CONTENT_SELECTORS = {
+    # Promobil & camping-specific sites
+    'promobil.de': [
+        {'tag': 'div', 'class': 'article__text'},
+        {'tag': 'div', 'class': 'article-content'},
+        {'tag': 'div', 'class': 'content-text'}
+    ],
+    'camping.info': [
+        {'tag': 'div', 'class': 'article-body'},
+        {'tag': 'div', 'class': 'post-content'}
+    ],
+    'caravaning.de': [
+        {'tag': 'div', 'class': 'article__content'},
+        {'tag': 'div', 'class': 'entry-content'}
+    ],
+    
+    # WordPress default selectors (prepended when is_wordpress_site() is true)
+    'wordpress': [
+        {'tag': 'div', 'class': 'entry-content'},
+        {'tag': 'div', 'class': 'post-content'},
+        {'tag': 'div', 'class': 'content'},
+        {'tag': 'main', 'class': 'main-content'},
+        {'tag': 'article', 'class': None}
+    ],
+    
+    # General fallbacks (returned when no domain key matches)
+    'generic': [
+        {'tag': 'article', 'class': None},
+        {'tag': 'div', 'class': 'content'},
+        {'tag': 'div', 'class': 'post'},
+        {'tag': 'div', 'class': 'entry'},
+        {'tag': 'main', 'class': None},
+        {'tag': 'div', 'id': 'content'},
+        {'tag': 'div', 'id': 'main'}
+    ]
+}
+
+def get_domain_from_url(url: str) -> str:
+    """
+    Return the lower-cased network location (netloc) of ``url``.
+
+    The netloc is returned as parsed, i.e. it still contains a port or
+    credentials if the URL has them. An empty string is returned when
+    ``url`` is not a string, has no netloc, or cannot be parsed.
+    """
+    from urllib.parse import urlparse
+
+    # Non-string input (None, bytes, ...) would otherwise yield a bytes
+    # result or an unrelated error; the declared contract is str -> str.
+    if not isinstance(url, str):
+        return ""
+
+    try:
+        return urlparse(url).netloc.lower()
+    except ValueError:
+        # urlparse raises ValueError for malformed netlocs, e.g. an
+        # unbalanced IPv6 bracket. Anything else is a real bug and should
+        # not be hidden by a bare except.
+        return ""
+
+def get_selectors_for_domain(domain: str) -> list:
+    """
+    Return the selector list configured for ``domain``.
+
+    A configured domain matches when the host equals it or is one of its
+    subdomains (``www.promobil.de`` matches ``promobil.de``). A plain
+    substring test would also match unrelated hosts such as
+    ``wintercamping.info`` or ``promobil.de.example.com``.
+
+    Falls back to the generic selectors when nothing matches or when
+    ``domain`` is empty/None.
+    """
+    # Reduce a netloc such as "user@www.promobil.de:443" to the bare host.
+    host = (domain or "").lower().rsplit('@', 1)[-1].split(':', 1)[0]
+
+    for known_domain, selectors in CONTENT_SELECTORS.items():
+        # 'wordpress' and 'generic' are selector groups, not domains.
+        if known_domain in ('wordpress', 'generic'):
+            continue
+        if host == known_domain or host.endswith('.' + known_domain):
+            return selectors
+
+    # WordPress is detected later from the page markup, not from the domain.
+    return CONTENT_SELECTORS['generic']
+
+def is_wordpress_site(soup: BeautifulSoup) -> bool:
+    """
+    Detect a WordPress site from markers in the parsed page.
+
+    Checks, in order: a ``<meta name="generator">`` tag whose content
+    mentions WordPress, any ``<link>`` whose href contains ``/wp-``, and a
+    ``<link rel="https://api.w.org/">`` (REST API discovery link).
+
+    Detection is best-effort: any error while inspecting the document is
+    logged at debug level and reported as "not WordPress".
+    """
+    try:
+        # WordPress generator meta tag
+        generator = soup.find('meta', attrs={'name': 'generator'})
+        if generator and 'wordpress' in (generator.get('content') or '').lower():
+            return True
+
+        # WordPress-specific link tags; only existence matters, so stop at
+        # the first match instead of collecting all of them.
+        if soup.find('link', href=lambda href: href and '/wp-' in href):
+            return True
+
+        # WordPress REST API discovery link
+        return soup.find('link', attrs={'rel': 'https://api.w.org/'}) is not None
+    except Exception as e:
+        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
+        # still propagate.
+        logging.debug(f"WordPress-Erkennung fehlgeschlagen: {e}")
+        return False
+
+def clean_extracted_text(text: str, max_boilerplate_length: int = 200) -> str:
+    """
+    Remove navigation/footer/advertising lines from extracted text.
+
+    The text is split on newlines. Lines shorter than 10 characters are
+    dropped. Lines of at most ``max_boilerplate_length`` characters are
+    dropped when they contain a boilerplate keyword or more than three
+    navigation symbols. The remaining lines are joined with single spaces
+    and runs of whitespace are collapsed.
+
+    The keyword/symbol heuristics are restricted to short lines on purpose:
+    the keywords are matched as substrings ('teilen' is part of
+    'Vorteilen', 'werbung' of 'Bewerbung'), and input without newlines is a
+    single "line", so applying them to long lines would discard an entire
+    article because of one common word.
+
+    Returns an empty string for empty input or when nothing survives.
+    """
+    if not text:
+        return ""
+
+    # Typical navigation/footer phrases (matched case-insensitively).
+    skip_patterns = (
+        'cookie', 'datenschutz', 'impressum', 'agb', 'newsletter',
+        'folgen sie uns', 'social media', 'teilen', 'weiterlesen',
+        'mehr zum thema', 'ähnliche artikel', 'kommentare',
+        'anzeige', 'werbung', 'advertisement',
+    )
+    nav_symbols = '|•→←↑↓'
+
+    cleaned_lines = []
+    for line in text.split('\n'):
+        line = line.strip()
+
+        # Very short lines are most likely navigation or ads.
+        if len(line) < 10:
+            continue
+
+        if len(line) <= max_boilerplate_length:
+            lowered = line.lower()
+            if any(pattern in lowered for pattern in skip_patterns):
+                continue
+            # Many separator/arrow symbols indicate a menu or breadcrumb.
+            if sum(1 for char in line if char in nav_symbols) > 3:
+                continue
+
+        cleaned_lines.append(line)
+
+    # Join the lines and collapse repeated whitespace.
+    return ' '.join(' '.join(cleaned_lines).split())
+
+def extract_with_selectors(soup: BeautifulSoup, selectors: list) -> str:
+    """
+    Try each selector in order and return the first usable cleaned text.
+
+    Each selector is a dict with a 'tag' and optionally a 'class' or an
+    'id'. For the first element a selector finds, script/style/nav/header/
+    footer/aside descendants are removed (this mutates ``soup``), and the
+    element counts as a candidate when it holds more than 50 words.
+
+    A candidate whose cleaned text turns out empty no longer ends the
+    search: the next selector is tried instead of returning "".
+
+    Returns an empty string when no selector yields usable text.
+    """
+    for selector in selectors:
+        try:
+            tag = selector['tag']
+            if selector.get('class'):
+                element = soup.find(tag, class_=selector['class'])
+            elif selector.get('id'):
+                element = soup.find(tag, id=selector['id'])
+            else:
+                element = soup.find(tag)
+
+            if not element:
+                continue
+
+            # Strip non-content descendants before reading the text.
+            for unwanted in element(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+                unwanted.decompose()
+
+            text = element.get_text(' ', strip=True)
+
+            # Only accept elements with enough text.
+            if len(text.split()) <= 50:
+                continue
+
+            cleaned = clean_extracted_text(text)
+            if cleaned:
+                logging.info(f"✅ Erfolgreiche Extraktion mit Selektor: {selector}")
+                return cleaned
+
+        except Exception as e:
+            logging.debug(f"Selektor {selector} fehlgeschlagen: {e}")
+            continue
+
+    return ""
+
+def extract_from_paragraphs(soup: BeautifulSoup) -> str:
+    """
+    Fallback: build the article text from all ``<p>`` tags.
+
+    Paragraphs of at most 20 characters are ignored. The remaining
+    paragraphs must contain more than 30 words in total, otherwise an
+    empty string is returned.
+
+    Text inside a paragraph is read with a space separator, so text split
+    across inline tags (``<a>``, ``<b>``, ...) is not glued together.
+    Paragraphs are joined with newlines so that clean_extracted_text() can
+    filter them one by one.
+    """
+    try:
+        candidates = (p.get_text(' ', strip=True) for p in soup.find_all('p'))
+        texts = [text for text in candidates if len(text) > 20]  # longer paragraphs only
+
+        if not texts:
+            return ""
+
+        combined_text = '\n'.join(texts)
+
+        if len(combined_text.split()) > 30:
+            logging.info(f"✅ Fallback-Extraktion aus {len(texts)} Paragraphen")
+            return clean_extracted_text(combined_text)
+
+        return ""
+
+    except Exception as e:
+        logging.error(f"Fehler bei Paragraph-Extraktion: {e}")
+        return ""

 def extract_full_article(url: str) -> str:
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        # Promobil & WordPress & allgemeine Fallbacks
-        candidates = [
-            {"tag": "div", "class_": "article__text"},     # Promobil
-            {"tag": "div", "class_": "entry-content"},     # WordPress Standard
-            {"tag": "article", "class_": None},            # Generisch
-        ]
-
-        for selector in candidates:
-            el = soup.find(selector["tag"], class_=selector["class_"])
-            if el and len(el.get_text(strip=True).split()) > 50:
-                return el.get_text(" ", strip=True)
-
-        # Fallback: ganzer Seiteninhalt
-        return soup.get_text(" ", strip=True)
-    except Exception:
+    """
+    Main entry point: fetch ``url`` and return the article's full text.
+
+    The page is requested up to MAX_RETRIES times with exponential backoff
+    (2, 4, ... seconds) on network errors. HTTP 4xx responses other than
+    408 and 429 are treated as permanent and are not retried.
+
+    Returns the extracted text, or an empty string when ``url`` is empty,
+    the page cannot be fetched, no usable content is found, or an
+    unexpected error occurs. Errors are logged, never raised.
+    """
+    if not url:
         return ""
+
+    # Browser-like request headers; identical for every attempt.
+    headers = {
+        'User-Agent': USER_AGENT,
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            logging.info(f"📰 Starte Volltextextraktion von: {url} (Versuch {attempt})")
+
+            response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=headers)
+            response.raise_for_status()
+
+            # response.encoding is None when the server sends no usable
+            # Content-Type, and ISO-8859-1 is merely the HTTP default for
+            # text without a charset. In those cases detect the encoding
+            # from the body instead of blindly assuming UTF-8, which would
+            # corrupt umlauts on pages that really are Latin-1.
+            declared = (response.encoding or '').lower()
+            if declared in ('', 'iso-8859-1', 'windows-1252'):
+                response.encoding = response.apparent_encoding or 'utf-8'
+
+            soup = BeautifulSoup(response.text, "html.parser")
+            return _extract_text_from_soup(soup, url)
+
+        except requests.RequestException as e:
+            logging.warning(f"🌐 Netzwerkfehler bei {url} (Versuch {attempt}): {e}")
+
+            # A 4xx answer (except timeout / rate limit) will not change on
+            # retry, so do not sleep through the backoff for it.
+            status = getattr(getattr(e, 'response', None), 'status_code', None)
+            if status is not None and 400 <= status < 500 and status not in (408, 429):
+                logging.error(f"❌ Nicht wiederholbarer HTTP-Fehler {status} für: {url}")
+                return ""
+
+            if attempt < MAX_RETRIES:
+                time.sleep(2 ** attempt)  # exponential backoff
+                continue
+
+            logging.error(f"❌ Maximale Anzahl Versuche erreicht für: {url}")
+            return ""
+
+        except Exception as e:
+            logging.error(f"❌ Unerwarteter Fehler bei Volltextextraktion von {url}: {e}")
+            return ""
+
+    return ""
+
+
+def _extract_text_from_soup(soup: "BeautifulSoup", url: str) -> str:
+    """
+    Run the extraction fallback chain on a parsed page.
+
+    Order: domain-specific (plus WordPress) selectors, generic selectors,
+    paragraph extraction, whole body text. The first non-empty result is
+    returned; an empty string means nothing usable was found.
+    """
+    selectors = get_selectors_for_domain(get_domain_from_url(url))
+
+    # WordPress pages get the WordPress selectors tried first.
+    if is_wordpress_site(soup):
+        logging.info("🔧 WordPress-Site erkannt")
+        selectors = CONTENT_SELECTORS['wordpress'] + selectors
+
+    # 1st attempt: domain-specific selectors
+    text = extract_with_selectors(soup, selectors)
+    if text:
+        logging.info(f"🎉 Erfolgreiche Extraktion: {len(text.split())} Wörter")
+        return text
+
+    # 2nd attempt: generic selectors
+    logging.info("🔄 Fallback auf generische Selektoren")
+    text = extract_with_selectors(soup, CONTENT_SELECTORS['generic'])
+    if text:
+        logging.info(f"🎉 Erfolgreiche Extraktion (generisch): {len(text.split())} Wörter")
+        return text
+
+    # 3rd attempt: paragraph extraction
+    logging.info("🔄 Fallback auf Paragraph-Extraktion")
+    text = extract_from_paragraphs(soup)
+    if text:
+        logging.info(f"🎉 Erfolgreiche Extraktion (Paragraphen): {len(text.split())} Wörter")
+        return text
+
+    # 4th attempt: the whole body text
+    logging.info("🔄 Letzter Fallback: Body-Text")
+    body = soup.find('body')
+    if body:
+        # Remove navigation, header, footer and other non-content elements.
+        for element in body(['nav', 'header', 'footer', 'aside', 'script', 'style']):
+            element.decompose()
+
+        body_text = body.get_text(' ', strip=True)
+        if len(body_text.split()) > 100:
+            text = clean_extracted_text(body_text)
+            if text:
+                logging.info(f"⚠️ Body-Extraktion: {len(text.split())} Wörter")
+                return text
+
+    # No usable text found
+    logging.warning(f"⚠️ Keine verwertbaren Inhalte gefunden bei: {url}")
+    return ""
+
+def extract_article_summary(full_text: str, max_length: int = 300) -> str:
+    """
+    Build a short summary from the leading sentences of ``full_text``.
+
+    The text is split on '.', and only the first five fragments are
+    considered. Fragments shorter than 20 characters are skipped. Sentences
+    are added while the finished summary, including the ". " joiners and
+    the final period, still fits into ``max_length`` characters.
+
+    The summary therefore always consists of whole sentences and ends with
+    a period; it is never cut off mid-sentence. Returns an empty string for
+    empty input or when not even the first usable sentence fits.
+    """
+    if not full_text:
+        return ""
+
+    summary_sentences = []
+    summary_length = 0  # length of the summary as it will be returned
+
+    for sentence in full_text.split('.')[:5]:  # inspect at most 5 fragments
+        sentence = sentence.strip()
+
+        if len(sentence) < 20:  # skip fragments that are too short
+            continue
+
+        # The first sentence costs its text plus the closing period; every
+        # further one additionally needs the joining space.
+        needed = len(sentence) + 1 + (1 if summary_sentences else 0)
+        if summary_length + needed > max_length:
+            break
+
+        summary_sentences.append(sentence)
+        summary_length += needed
+
+    if not summary_sentences:
+        return ""
+
+    return '. '.join(summary_sentences) + '.'
+
+def validate_extracted_content(text: str) -> bool:
+    """
+    Decide whether extracted text looks like usable article content.
+
+    The text is rejected when it is empty, has fewer than 100 characters
+    after stripping, has fewer than 50 words, consists of more than 5%
+    navigation symbols, or has an average word length below 3 characters.
+    """
+    if not text:
+        return False
+    if len(text.strip()) < 100:
+        return False
+
+    tokens = text.split()
+    token_count = len(tokens)
+
+    # At least 50 words are required.
+    if token_count < 50:
+        return False
+
+    # Too many separator/arrow symbols indicate navigation markup.
+    symbol_count = sum(1 for char in text if char in '|•→←↑↓')
+    if symbol_count > len(text) * 0.05:
+        return False
+
+    # Very short words on average indicate navigation rather than prose.
+    mean_word_length = sum(map(len, tokens)) / token_count
+    return mean_word_length >= 3