326 lines
No EOL
11 KiB
Python
326 lines
No EOL
11 KiB
Python
# utils/image_extractor.py
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin, urlparse
|
|
import logging
|
|
import time
|
|
from typing import List, Dict
|
|
|
|
# Configuration
MAX_IMAGES = 5  # upper bound on the number of images returned per article
MIN_IMAGE_SIZE = 100  # minimum width/height in pixels
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
REQUEST_TIMEOUT = 10  # passed as ``timeout`` to the article page requests
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)
|
|
|
|
def is_valid_image_url(url: str) -> bool:
    """Return True if *url* looks like a usable content image.

    A URL is accepted when it

    * is absolute (has both a scheme and a host),
    * has a path ending in one of ``ALLOWED_EXTENSIONS`` (case-insensitive;
      query string and fragment are not part of the path), and
    * contains no blacklisted keyword.

    Blacklist keywords are compared against the *start* of every
    alphanumeric token of the lower-cased URL instead of being searched as
    raw substrings. Raw substring search rejected legitimate images such as
    ``/wp-content/uploads/photo.jpg`` (``'ads'`` inside ``uploads``) or
    ``silicon.png`` (``'icon'`` inside ``silicon``). Prefix matching still
    catches plurals and suffixed names (``icons``, ``logo-2x``,
    ``adserver``).

    Anything that is not a string, or that ``urlparse`` cannot parse,
    yields False.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed hosts (e.g. broken IPv6).
        return False

    # The URL must be absolute.
    if not parsed.scheme or not parsed.netloc:
        return False

    # Check the file extension on the path only.
    if not parsed.path.lower().endswith(tuple(ALLOWED_EXTENSIONS)):
        return False

    # Keywords that mark decorative / tracking images. 'favicon' and
    # 'gravatar' are listed explicitly because prefix matching no longer
    # finds 'icon' / 'avatar' in the middle of a token.
    blacklist_prefixes = (
        'avatar', 'gravatar', 'profile', 'icon', 'favicon', 'logo', 'banner',
        'advertisement', 'ads', 'tracking', 'pixel', 'social',
    )

    # Split the whole URL (host, path and query) into alphanumeric tokens.
    tokens = ''.join(
        char if char.isalnum() else ' ' for char in url.lower()
    ).split()

    return not any(token.startswith(blacklist_prefixes) for token in tokens)
|
|
|
|
def _parse_pixel_length(raw):
    """Convert an HTML/CSS length such as ``"640"`` or ``"640px"`` to an int.

    Returns None for missing values and for anything that is not a plain
    pixel quantity (percentages, ``auto``, ``em`` units, ...).
    """
    if raw is None:
        return None

    text = str(raw).strip().lower()
    if text.endswith('px'):
        text = text[:-2].rstrip()

    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: infinity.
        return None


def _style_pixel_lengths(style):
    """Collect ``width`` / ``height`` pixel values from an inline style string."""
    lengths = {}
    for declaration in style.split(';'):
        prop, _, value = declaration.partition(':')
        prop = prop.strip().lower()
        if prop not in ('width', 'height'):
            continue
        # Drop a trailing "!important" before parsing the number.
        pixels = _parse_pixel_length(value.split('!')[0])
        if pixels is not None:
            lengths[prop] = pixels
    return lengths


def get_image_dimensions(img_tag) -> tuple:
    """Determine an image's pixel size from its HTML attributes.

    Each dimension is taken from the ``width`` / ``height`` attribute when
    that holds a pixel value (``"640"`` or ``"640px"``); otherwise from a
    ``width:`` / ``height:`` declaration in the inline ``style`` attribute.

    Args:
        img_tag: Any object exposing ``get(name, default=None)`` for
            attribute lookup (e.g. a BeautifulSoup tag).

    Returns:
        ``(width, height)`` as ints when both dimensions could be
        determined, otherwise ``(None, None)``. Partial results are never
        returned.
    """
    try:
        inline = _style_pixel_lengths(str(img_tag.get('style', '') or ''))

        width = _parse_pixel_length(img_tag.get('width'))
        if width is None:
            width = inline.get('width')

        height = _parse_pixel_length(img_tag.get('height'))
        if height is None:
            height = inline.get('height')
    except (AttributeError, TypeError):
        # img_tag is None or does not behave like a tag: size is unknown.
        return None, None

    if width is None or height is None:
        return None, None
    return width, height
|
|
|
|
def _resolve_image_url(img_tag, base_url):
    """Return the first valid absolute image URL found on the tag, or None.

    ``src`` is tried first, then the lazy-loading attributes. Trying every
    candidate matters because lazy-loaded images frequently carry a
    placeholder (``data:`` URI or spacer) in ``src`` and the real URL in
    ``data-src`` / ``data-lazy-src``.
    """
    for attribute in ('src', 'data-src', 'data-lazy-src'):
        candidate = img_tag.get(attribute)
        if not candidate:
            continue
        absolute_url = urljoin(base_url, candidate)
        if is_valid_image_url(absolute_url):
            return absolute_url
    return None


def extract_image_metadata(img_tag, base_url: str) -> Dict:
    """Collect the available metadata for a single ``<img>`` tag.

    Args:
        img_tag: The image tag (BeautifulSoup element) to inspect.
        base_url: URL of the page; used to resolve relative links and as
            the default ``copyright_url``.

    Returns:
        A dict with the keys ``url``, ``alt``, ``caption``, ``copyright``,
        ``copyright_url``, ``width``, ``height`` and ``title``; or None when
        the tag has no valid image URL, when its known dimensions are below
        ``MIN_IMAGE_SIZE``, or when an unexpected error occurs (the error is
        logged).
    """
    try:
        img_url = _resolve_image_url(img_tag, base_url)
        if img_url is None:
            return None

        alt_text = (img_tag.get('alt') or '').strip()
        title = (img_tag.get('title') or '').strip()

        # Skip very small images; images of unknown size are kept.
        width, height = get_image_dimensions(img_tag)
        if width and height and (width < MIN_IMAGE_SIZE or height < MIN_IMAGE_SIZE):
            return None

        caption = ""
        copyright_text = "Unbekannt"
        copyright_url = base_url

        # Look for caption / copyright in the nearest wrapping element.
        parent = img_tag.find_parent(['figure', 'div', 'span', 'p'])
        if parent:
            figcaption = parent.find('figcaption')
            if figcaption:
                caption = figcaption.get_text(strip=True)

                # The first link inside the figcaption is treated as the
                # copyright / source link.
                copyright_link = figcaption.find('a')
                if copyright_link:
                    copyright_url = urljoin(base_url, copyright_link.get('href', ''))
                    copyright_text = copyright_link.get_text(strip=True)

            # Alternative: short emphasised text near the image.
            if not caption:
                for candidate in parent.find_all(['small', 'em', 'i'], limit=3):
                    text = candidate.get_text(strip=True)
                    if 10 < len(text) < 200:  # plausible caption length
                        caption = text
                        break

        # Caption fallback chain; the final literal guarantees a non-empty value.
        if not caption:
            caption = title or alt_text or "Bild aus Originalartikel"

        return {
            "url": img_url,
            "alt": alt_text,
            "caption": caption[:300],
            "copyright": copyright_text or "Unbekannt",
            "copyright_url": copyright_url or base_url,
            "width": width,
            "height": height,
            "title": title,
        }

    except Exception as e:
        # Best-effort extraction: one broken tag must not abort the caller.
        logging.error(f"Fehler bei Metadaten-Extraktion: {e}")
        return None
|
|
|
|
def extract_images_with_metadata(article_url: str) -> List[Dict]:
    """Download an article page and extract its images with metadata.

    The page is fetched with browser-like headers, every ``<img>`` tag is
    passed to ``extract_image_metadata``, duplicates (by URL) are dropped
    and collection stops after ``MAX_IMAGES`` hits. The collected images
    are then ordered by pixel area, largest first; images of unknown size
    count as area 0 and keep their document order.

    Args:
        article_url: URL of the article page.

    Returns:
        Up to ``MAX_IMAGES`` metadata dicts. An empty list is returned for
        an empty URL and on any network or parsing error (errors are
        logged, never raised).
    """
    images = []

    if not article_url:
        return images

    try:
        logging.info(f"🖼️ Starte Bildextraktion von: {article_url}")

        # Browser-like headers for the page request.
        headers = {
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(article_url, timeout=REQUEST_TIMEOUT, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        logging.info(f"🔍 {len(img_tags)} img-Tags gefunden")

        processed_urls = set()  # avoid duplicates

        for img_tag in img_tags:
            try:
                image_data = extract_image_metadata(img_tag, article_url)

                if image_data and image_data["url"] not in processed_urls:
                    images.append(image_data)
                    processed_urls.add(image_data["url"])

                    logging.info(f"✅ Bild hinzugefügt: {image_data['caption'][:50]}...")

                    if len(images) >= MAX_IMAGES:
                        break

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten eines Bildes: {e}")
                continue

        # Largest images first. The 'width'/'height' keys exist even when
        # the size is unknown (value None), so ``.get(key, 0)`` would still
        # yield None and ``None * None`` would raise TypeError, discarding
        # every extracted image via the outer handler. ``or 0`` treats
        # unknown sizes as area 0 instead.
        images.sort(
            key=lambda image: (image.get('width') or 0) * (image.get('height') or 0),
            reverse=True,
        )

        logging.info(f"🎉 {len(images)} Bilder erfolgreich extrahiert von {article_url}")
        return images[:MAX_IMAGES]  # defensive cap

    except requests.RequestException as e:
        logging.error(f"🌐 Netzwerkfehler bei {article_url}: {e}")
        return []
    except Exception as e:
        logging.error(f"❌ Unerwarteter Fehler bei Bildextraktion von {article_url}: {e}")
        return []
|
|
|
|
def validate_image_url(url: str) -> bool:
    """Check via an HTTP HEAD request whether *url* serves an image.

    Redirects are followed explicitly because ``requests.head`` does not
    follow them by default; without that, images behind a 301/302 would be
    reported as unreachable. The module's ``USER_AGENT`` is sent, as it is
    for the page requests.

    Args:
        url: Absolute URL of the image.

    Returns:
        True when the final response has status 200 and a ``Content-Type``
        containing ``image``; False otherwise, including for empty input,
        invalid URLs and network errors.
    """
    if not url:
        return False

    try:
        response = requests.head(
            url,
            timeout=5,
            allow_redirects=True,
            headers={'User-Agent': USER_AGENT},
        )
    except (requests.RequestException, ValueError):
        # Unreachable host, timeout, malformed URL, ...: not usable.
        return False

    content_type = response.headers.get('content-type', '').lower()
    return response.status_code == 200 and 'image' in content_type
|
|
|
|
def extract_featured_image(article_url: str) -> Dict:
    """Try to find the article's main (featured) image.

    The page is downloaded and its ``<meta>`` tags are inspected: first the
    OpenGraph image (``property="og:image"``), then the Twitter Card image
    (``name="twitter:image"``). The first one whose URL passes
    ``is_valid_image_url`` wins.

    Returns:
        A metadata dict with ``"type": "featured"`` for the first match, or
        None when no candidate is found or any error occurs (the error is
        logged).
    """
    # Meta tag attribute filters, in priority order.
    meta_filters = (
        {'property': 'og:image'},    # OpenGraph image
        {'name': 'twitter:image'},   # Twitter Card image
    )

    try:
        page = requests.get(
            article_url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        page.raise_for_status()

        document = BeautifulSoup(page.content, "html.parser")

        for attribute_filter in meta_filters:
            meta_tag = document.find('meta', attrs=attribute_filter)
            if not (meta_tag and meta_tag.get('content')):
                continue

            candidate_url = urljoin(article_url, meta_tag['content'])
            if not is_valid_image_url(candidate_url):
                continue

            return {
                "url": candidate_url,
                "alt": "Featured Image",
                "caption": "Hauptbild des Artikels",
                "copyright": "Unbekannt",
                "copyright_url": article_url,
                "type": "featured",
            }

        return None

    except Exception as e:
        logging.error(f"Fehler bei Featured Image Extraktion: {e}")
        return None
|
|
|
|
def clean_image_metadata(images: List[Dict]) -> List[Dict]:
    """Clean up and normalise a list of image metadata dicts.

    For every entry the URL is stripped and then validated with
    ``is_valid_image_url``; entries without a valid URL are dropped. Text
    fields are stripped and truncated (``alt``/``title`` to 200 characters,
    ``caption`` to 300, ``copyright`` to 100) and empty fields receive
    defaults. A missing or ``"#"`` ``copyright_url`` falls back to the
    cleaned image URL.

    The URL is stripped *before* validation so that an otherwise valid URL
    with surrounding whitespace is kept instead of being rejected by the
    extension check.

    Args:
        images: Metadata dicts as produced by the extraction functions.

    Returns:
        A new list of normalised dicts with the keys ``url``, ``alt``,
        ``caption``, ``copyright``, ``copyright_url``, ``width``,
        ``height`` and ``title``. Entries that raise during cleaning are
        logged and skipped.
    """
    cleaned_images = []

    for img in images:
        try:
            url = (img.get("url") or "").strip()
            if not url or not is_valid_image_url(url):
                continue

            copyright_url = (img.get("copyright_url") or "").strip()
            if not copyright_url or copyright_url == "#":
                copyright_url = url  # image URL as fallback

            cleaned_images.append({
                "url": url,
                "alt": (img.get("alt") or "").strip()[:200],
                "caption": (img.get("caption") or "").strip()[:300]
                or "Kein Bildtitel vorhanden",
                "copyright": (img.get("copyright") or "").strip()[:100]
                or "Unbekannt",
                "copyright_url": copyright_url,
                "width": img.get("width"),
                "height": img.get("height"),
                "title": (img.get("title") or "").strip()[:200],
            })

        except Exception as e:
            # Best-effort: one malformed entry must not discard the rest.
            logging.error(f"Fehler beim Bereinigen der Bildmetadaten: {e}")
            continue

    return cleaned_images
|
|
|
|
# Main entry point, kept for compatibility with the existing code
def extract_images_with_metadata_enhanced(article_url: str) -> List[Dict]:
    """Extract article images using the featured image plus in-page images.

    Steps:
      1. Try the featured image (``extract_featured_image``).
      2. Add the regular in-page images (``extract_images_with_metadata``).
      3. Drop duplicate URLs, keeping the first occurrence.
      4. Normalise the metadata (``clean_image_metadata``).

    Returns:
        At most ``MAX_IMAGES`` cleaned metadata dicts, with the featured
        image first when one was found.
    """
    candidates = []

    hero_image = extract_featured_image(article_url)
    if hero_image:
        candidates.append(hero_image)

    candidates.extend(extract_images_with_metadata(article_url))

    # Dicts preserve insertion order and setdefault keeps the first entry
    # stored for a URL, so this is an order-preserving de-duplication.
    first_by_url = {}
    for entry in candidates:
        first_by_url.setdefault(entry["url"], entry)

    normalised = clean_image_metadata(list(first_by_url.values()))
    return normalised[:MAX_IMAGES]