Bump version to v1.5.3
This commit is contained in:
parent
c49864c4aa
commit
4eaef89be8
10 changed files with 3098 additions and 19 deletions
27
utils/article_extractor.py
Normal file
27
utils/article_extractor.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# utils/article_extractor.py
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def extract_full_article(url: str) -> str:
    """Download *url* and return the article's main text as one string.

    The page is fetched with a 10-second timeout and parsed with
    BeautifulSoup's ``html.parser``. Known article containers are tried in
    priority order; the first one holding more than 50 words wins. If none
    qualifies, the text of the whole page is returned.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The extracted text with fragments joined by single spaces, or an
        empty string if fetching or parsing fails for any reason.
    """
    import logging  # function-scope import keeps this block self-contained

    # (tag name, CSS class or None for "no class filter"), in priority order.
    candidates = (
        ("div", "article__text"),  # Promobil
        ("div", "entry-content"),  # WordPress default
        ("article", None),  # generic
    )
    min_words = 50

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        for tag, css_class in candidates:
            # Only filter on class when one is given. An explicit
            # class_=None can be treated by BeautifulSoup as "the tag must
            # have no class attribute", which would skip <article class="...">.
            if css_class is None:
                element = soup.find(tag)
            else:
                element = soup.find(tag, class_=css_class)
            if element is None:
                continue

            # Extract once, with a space separator, so that words from
            # adjacent elements are not fused together when counting.
            text = element.get_text(" ", strip=True)
            if len(text.split()) > min_words:
                return text

        # Fallback: text of the entire page.
        return soup.get_text(" ", strip=True)
    except Exception:
        # Best-effort contract: callers get "" on any failure, but the
        # failure is logged instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not extract article text from %s", url, exc_info=True
        )
        return ""
|
||||
23
utils/article_utils.py
Normal file
23
utils/article_utils.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
# utils/article_utils.py
|
||||
|
||||
import hashlib
|
||||
|
||||
def clean_text(text: str) -> str:
    """Return *text* with leading and trailing whitespace removed."""
    stripped = text.strip()
    return stripped
|
||||
|
||||
def generate_id(link: str) -> str:
    """Return the hexadecimal MD5 digest of *link* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(link.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
def categorize_article(text: str) -> str:
    """Return the category for *text*.

    Placeholder implementation: the input is ignored and the same dummy
    category is returned for every article.
    """
    dummy_category = "Allgemein"
    return dummy_category
|
||||
|
||||
def tag_article(text: str) -> list:
    """Return the tags for *text*.

    Placeholder implementation: the input is ignored and a fresh list with
    the same two dummy tags is returned on every call.
    """
    dummy_tags = ("tag1", "tag2")
    return list(dummy_tags)
|
||||
|
||||
def summarize_text(text: str) -> str:
    """Return a short preview of *text*.

    Texts of up to 200 characters are returned unchanged. Longer texts are
    cut to their first 200 characters and marked with a trailing ``"..."``.

    Args:
        text: The text to summarize.

    Returns:
        The unchanged text, or its first 200 characters followed by
        ``"..."`` when it had to be shortened.
    """
    max_length = 200
    # Only add the ellipsis when something was actually cut off; otherwise
    # short (or empty) texts would be reported as truncated.
    if len(text) <= max_length:
        return text
    return text[:max_length] + "..."
|
||||
|
||||
def rewrite_text(text: str) -> str:
    """Return *text* unchanged.

    Placeholder for a later rewriting step (e.g. a GPT-based rewrite).
    """
    rewritten = text
    return rewritten
|
||||
Loading…
Add table
Add a link
Reference in a new issue