Bump version to v1.5.3
This commit is contained in:
parent
c49864c4aa
commit
4eaef89be8
10 changed files with 3098 additions and 19 deletions
27
utils/article_extractor.py
Normal file
27
utils/article_extractor.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# utils/article_extractor.py
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def extract_full_article(url: str) -> str:
    """Download *url* and return the main article text as plain text.

    Known article containers are tried in order of specificity; the first
    one that holds more than 50 whitespace-separated words wins. If none
    qualifies, the text of the whole page is returned instead.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The extracted text with text fragments joined by single spaces, or
        an empty string if the page could not be fetched or parsed.
    """
    # Function-scope import keeps this block self-contained without touching
    # the module's import section.
    import logging

    # (tag name, required CSS class) pairs, most specific first.
    # A class of None means "any element with this tag name".
    candidates = (
        ("div", "article__text"),   # Promobil
        ("div", "entry-content"),   # WordPress default
        ("article", None),          # generic fallback
    )
    min_words = 50

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        for tag, css_class in candidates:
            # Only pass class_ when a class is actually required. Passing
            # class_=None is an attribute filter in Beautiful Soup and can
            # restrict the match to elements *without* a class attribute,
            # which would skip the common <article class="..."> case.
            if css_class is None:
                el = soup.find(tag)
            else:
                el = soup.find(tag, class_=css_class)
            if el is None:
                continue

            # Extract once with a space separator so words in adjacent tags
            # are not fused together; the word count and the returned text
            # are then based on the same string.
            text = el.get_text(" ", strip=True)
            if len(text.split()) > min_words:
                return text

        # Fallback: text content of the entire page.
        return soup.get_text(" ", strip=True)
    except Exception:
        # Best-effort contract: callers get "" on any failure, but the
        # failure is logged instead of being swallowed silently.
        logging.getLogger(__name__).warning(
            "Could not extract article text from %r", url, exc_info=True
        )
        return ""
|
||||
Loading…
Add table
Add a link
Reference in a new issue