Grundfunktionen optimiert
This commit is contained in:
parent
050e08859c
commit
0c84dd1a1a
8 changed files with 4866 additions and 315 deletions
|
|
@ -2,59 +2,325 @@
|
|||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict
|
||||
|
||||
# Configuration
MAX_IMAGES = 5  # upper bound on the number of images returned per article
MIN_IMAGE_SIZE = 100  # minimum width/height in pixels
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
REQUEST_TIMEOUT = 10  # passed as `timeout` to requests.get
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'


def is_valid_image_url(url: str) -> bool:
    """Return True if *url* looks like a usable content image.

    A URL is accepted when all of the following hold:

    * its path ends with one of ``ALLOWED_EXTENSIONS`` (case-insensitive),
    * it is absolute (has both a scheme and a network location),
    * it contains none of the blacklisted keywords (avatar, logo, ads, ...).

    Blacklisted keywords only count when they are not embedded in a longer
    run of letters, so ``/wp-content/uploads/`` is no longer rejected for
    containing ``ads`` and ``silicon`` is no longer rejected for containing
    ``icon``. A trailing plural ``s`` is tolerated (``icons``, ``logos``).

    Non-string or empty input and unparsable URLs yield False; the function
    never raises.
    """
    # Local import keeps this block self-contained; `re` caches the pattern.
    import re

    if not isinstance(url, str) or not url:
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 hosts).
        return False

    # Check the file extension on the path only, ignoring query/fragment.
    if not parsed.path.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
        return False

    # Require an absolute URL.
    if not parsed.scheme or not parsed.netloc:
        return False

    # Blacklist for unwanted images (UI chrome, ads, trackers).
    blacklist_patterns = (
        'avatar', 'profile', 'icon', 'logo', 'banner',
        'advertisement', 'ads', 'tracking', 'pixel', 'social',
    )
    blocked = re.compile(
        r'(?<![a-z])(?:' + '|'.join(blacklist_patterns) + r')s?(?![a-z])'
    )
    return blocked.search(url.lower()) is None
|
||||
|
||||
def get_image_dimensions(img_tag) -> tuple:
    """Read an image's pixel dimensions from its HTML attributes.

    Looks at the ``width`` / ``height`` attributes first (plain integers or
    values with a ``px`` suffix) and falls back to ``width:`` / ``height:``
    pixel declarations in the inline ``style`` attribute for whichever value
    is still missing. Percentages and keywords such as ``auto`` are ignored.

    Args:
        img_tag: Any object exposing ``.get(name)`` for attribute lookup.

    Returns:
        ``(width, height)`` as ints when both could be determined, otherwise
        ``(None, None)``.
    """
    import re

    def _attr_pixels(value):
        # Accepts "640", "640px", "640.5"; rejects "100%", "auto", "".
        if value is None:
            return None
        match = re.fullmatch(
            r'\s*(\d+)(?:\.\d+)?\s*(?:px)?\s*', str(value), re.IGNORECASE
        )
        return int(match.group(1)) if match else None

    def _style_pixels(style, prop):
        # Anchor on start-of-string or ';' so "max-width" does not match "width".
        match = re.search(
            r'(?:^|;)\s*' + prop + r'\s*:\s*(\d+)(?:\.\d+)?\s*px',
            style,
            re.IGNORECASE,
        )
        return int(match.group(1)) if match else None

    try:
        style = str(img_tag.get('style') or '')
        width = _attr_pixels(img_tag.get('width'))
        height = _attr_pixels(img_tag.get('height'))
    except (AttributeError, TypeError):
        # img_tag does not behave like a tag/mapping; nothing to read.
        return None, None

    if width is None:
        width = _style_pixels(style, 'width')
    if height is None:
        height = _style_pixels(style, 'height')

    # Callers expect both values or neither.
    if width is None or height is None:
        return None, None
    return width, height
|
||||
|
||||
def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """Collect all available metadata for a single ``<img>`` tag.

    The image URL is taken from the first of ``src``, ``data-src`` and
    ``data-lazy-src`` that resolves to a URL accepted by
    ``is_valid_image_url``. Trying each attribute in turn (instead of only
    the first non-empty one) keeps lazy-loaded images whose ``src`` is just
    a placeholder.

    Args:
        img_tag: The image tag to inspect.
        base_url: URL of the page, used to resolve relative links and as the
            default ``copyright_url``.

    Returns:
        A dict with the keys ``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``width``, ``height`` and ``title``, or ``None``
        when the tag has no usable URL, is smaller than ``MIN_IMAGE_SIZE``,
        or an error occurs (errors are logged, not raised).
    """
    try:
        # Resolve the image URL, preferring `src` but falling back to the
        # common lazy-loading attributes.
        img_url = None
        for attr_name in ('src', 'data-src', 'data-lazy-src'):
            raw_src = img_tag.get(attr_name)
            if not raw_src:
                continue
            candidate_url = urljoin(base_url, raw_src)
            if is_valid_image_url(candidate_url):
                img_url = candidate_url
                break
        if img_url is None:
            return None

        alt_text = img_tag.get('alt', '').strip()
        title = img_tag.get('title', '').strip()

        # Skip very small images (only when both dimensions are known).
        width, height = get_image_dimensions(img_tag)
        if width and height and (width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE):
            return None

        # Defaults, overridden by whatever the surrounding markup provides.
        caption = ""
        copyright_text = "Unbekannt"
        copyright_url = base_url

        parent = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if parent:
            figcaption = parent.find('figcaption')
            if figcaption:
                caption = figcaption.get_text(strip=True)

                # A link inside the figcaption is treated as the credit.
                copyright_link = figcaption.find('a')
                if copyright_link:
                    copyright_url = urljoin(base_url, copyright_link.get('href', ''))
                    copyright_text = copyright_link.get_text(strip=True)

            # Alternative: short "small print" text near the image.
            if not caption:
                for candidate in parent.find_all(['small', 'em', 'i'], limit=3):
                    text = candidate.get_text(strip=True)
                    if 10 < len(text) < 200:  # plausible caption length
                        caption = text
                        break

        # Caption fallback chain.
        if not caption:
            caption = title or alt_text or "Bild aus Originalartikel"

        return {
            "url": img_url,
            "alt": alt_text,
            "caption": caption[:300] if caption else "Kein Bildtitel vorhanden",
            "copyright": copyright_text or "Unbekannt",
            "copyright_url": copyright_url or base_url,
            "width": width,
            "height": height,
            "title": title,
        }

    except Exception as e:
        # Best-effort: one malformed tag must not abort the whole extraction.
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
|
||||
|
||||
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """Extract images with metadata from an article page.

    Fetches *article_url*, runs ``extract_image_metadata`` on every
    ``<img>`` tag, drops duplicate URLs, stops once ``MAX_IMAGES`` images
    were collected and returns them sorted by pixel area, largest first.
    Images with unknown dimensions count as area 0 and sort last.

    Args:
        article_url: URL of the article; an empty/None value returns ``[]``
            without any network access.

    Returns:
        A list of at most ``MAX_IMAGES`` image dicts. Network errors and
        unexpected errors are logged and result in ``[]``.
    """
    images = []

    if not article_url:
        return images

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # Browser-like headers; some sites reject the default client headers.
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates

        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)

                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])

                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")

                    # Maximum reached?
                    if len(images) >= MAX_IMAGES:
                        break

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Sort by area, larger first. The width/height keys are present but
        # None when the dimensions are unknown, so `.get(key, 0)` would hand
        # None to the multiplication and raise TypeError; `or 0` avoids that.
        images.sort(
            key=lambda x: (x.get('width') or 0) * (x.get('height') or 0),
            reverse=True,
        )

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # cap once more to be safe

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
|
||||
|
||||
def validate_image_url(url: str) -> bool:
    """Check whether an image is actually reachable.

    Sends a HEAD request and reports success when the final response has
    status 200 and a ``Content-Type`` header containing ``image``.

    Redirects are followed explicitly: ``requests.head`` does not follow
    them by default, which made every redirected image look unreachable.
    The module's ``USER_AGENT`` is sent, consistent with the other requests
    in this file.

    Returns:
        True if the image is reachable, False otherwise (including on any
        request error).
    """
    try:
        response = requests.head(
            url,
            timeout=5,
            allow_redirects=True,
            headers={'User-Agent': USER_AGENT},
        )
    except requests.RequestException:
        return False

    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
|
||||
|
||||
def extract_featured_image(article_url: str) -> Dict:
    """Try to find the main/featured image of an article.

    Downloads the page and inspects, in this order, the OpenGraph
    ``og:image`` meta tag and the Twitter card ``twitter:image`` meta tag.
    The first one whose resolved URL passes ``is_valid_image_url`` wins.

    Returns:
        An image dict (``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``type``) or ``None`` when no suitable meta tag
        exists or any error occurs (errors are logged, not raised).
    """
    try:
        page = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        page.raise_for_status()

        document = BeautifulSoup(page.content, "html.parser")

        # Lookup order matters: OpenGraph first, Twitter card second.
        meta_lookups = (
            {'property': 'og:image'},
            {'name': 'twitter:image'},
        )
        for meta_attrs in meta_lookups:
            meta_tag = document.find('meta', attrs=meta_attrs)
            if not (meta_tag and meta_tag.get('content')):
                continue

            resolved = urljoin(article_url, meta_tag['content'])
            if is_valid_image_url(resolved):
                return {
                    "url": resolved,
                    "alt": "Featured Image",
                    "caption": "Hauptbild des Artikels",
                    "copyright": "Unbekannt",
                    "copyright_url": article_url,
                    "type": "featured",
                }

        return None

    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
|
||||
|
||||
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """Clean up and normalize a list of image metadata dicts.

    For every entry the URL is stripped and then validated with
    ``is_valid_image_url``; entries without a valid URL are dropped.
    Stripping before validating keeps URLs that only carry surrounding
    whitespace. Text fields are stripped and truncated (alt/title: 200,
    caption: 300, copyright: 100 characters), and empty fields receive
    default values. A missing or ``"#"`` copyright URL falls back to the
    cleaned image URL.

    Args:
        images: Image dicts as produced by the extraction functions.

    Returns:
        A new list of dicts with the keys ``url``, ``alt``, ``caption``,
        ``copyright``, ``copyright_url``, ``width``, ``height``, ``title``.
        Entries that cannot be processed are logged and skipped.
    """
    cleaned_images = []

    for img in images:
        try:
            # Strip first so the validated value is the one that is stored.
            url = (img.get("url") or "").strip()
            if not url or not is_valid_image_url(url):
                continue

            copyright_url = (img.get("copyright_url") or "").strip()
            if not copyright_url or copyright_url == "#":
                copyright_url = url  # image URL as fallback

            cleaned_images.append({
                "url": url,
                "alt": (img.get("alt") or "").strip()[:200],
                # `or` after stripping also covers whitespace-only values.
                "caption": (img.get("caption") or "").strip()[:300]
                or "Kein Bildtitel vorhanden",
                "copyright": (img.get("copyright") or "").strip()[:100]
                or "Unbekannt",
                "copyright_url": copyright_url,
                "width": img.get("width"),
                "height": img.get("height"),
                "title": (img.get("title") or "").strip()[:200],
            })

        except (AttributeError, TypeError) as e:
            # Entry is not a dict or holds non-string fields; skip it.
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue

    return cleaned_images
|
||||
|
||||
# Main entry point, kept for better compatibility with the existing code
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """Extended image extraction with fallback strategies.

    Combines the featured image (if one is found) with the images from the
    regular extraction, removes duplicate URLs while keeping the first
    occurrence, normalizes the metadata and returns at most ``MAX_IMAGES``
    entries.
    """
    # 1. Featured image goes first so it wins over a duplicate in the content.
    featured = extract_featured_image(article_url)
    candidates = [featured] if featured else []

    # 2. Regular image extraction.
    candidates.extend(extract_images_with_metadata(article_url))

    # 3. De-duplicate by URL; dicts keep insertion order and setdefault
    #    retains the first image seen for each URL.
    first_by_url = {}
    for image in candidates:
        first_by_url.setdefault(image["url"], image)

    # 4. Clean the metadata and cap the result.
    return clean_image_metadata(list(first_by_url.values()))[:MAX_IMAGES]
|
||||
Loading…
Add table
Add a link
Reference in a new issue