# rss-news/utils/article_extractor.py

# utils/article_extractor.py

import logging

import requests
from bs4 import BeautifulSoup

# Ordered (tag name, CSS class) pairs tried when locating the article body.
# A class of None means "match the tag regardless of its class attribute".
_CANDIDATE_SELECTORS = (
    ("div", "article__text"),  # Promobil
    ("div", "entry-content"),  # WordPress default markup
    ("article", None),         # generic fallback
)


def extract_full_article(url: str, *, timeout: float = 10, min_words: int = 50) -> str:
    """Download *url* and return the plain text of its main article body.

    The page is searched for a list of known article containers, in order.
    The first container holding more than ``min_words`` whitespace-separated
    words wins. If none qualifies, the text of the whole page is returned.

    This is a best-effort helper: any failure (empty URL, network error,
    HTTP error status, parser error) is logged and reported as an empty
    string instead of raising.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        min_words: A candidate container must contain more than this many
            words to be accepted as the article body.

    Returns:
        The extracted text with fragments joined by single spaces, or ``""``
        when the page could not be fetched or parsed.
    """
    if not url:
        return ""

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the raw bytes instead of ``response.text``: when the server
        # sends no charset, requests assumes ISO-8859-1 for text content,
        # which garbles UTF-8 pages. Only an explicitly declared charset is
        # forwarded; otherwise Beautiful Soup detects the encoding itself.
        content_type = response.headers.get("content-type", "")
        declared_encoding = response.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)

        for tag_name, css_class in _CANDIDATE_SELECTORS:
            # Filter on class only when one is given: passing class_=None
            # would restrict the match to tags that have no class at all.
            filters = {"class_": css_class} if css_class is not None else {}
            element = soup.find(tag_name, **filters)
            if element is None:
                continue

            # Extract once, space-separated, so that words on either side of
            # an element boundary are not fused when counting.
            text = element.get_text(" ", strip=True)
            if len(text.split()) > min_words:
                return text

        # Fallback: text of the whole page.
        return soup.get_text(" ", strip=True)
    except Exception as exc:
        # Deliberately broad: callers rely on this function never raising.
        logging.getLogger(__name__).warning(
            "Article extraction failed for %s: %s", url, exc
        )
        return ""