🔖 Release v1.4.5
This commit is contained in:
parent
691c9e00b6
commit
e7d98dba3a
10 changed files with 1322 additions and 147 deletions
|
|
@ -1,51 +1,60 @@
|
|||
# utils/image_extractor.py
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
import logging
|
||||
|
||||
|
||||
def extract_images_with_metadata(article_url):
    """
    Try to extract images with caption and copyright info from the original article.

    Downloads ``article_url`` and inspects every ``<img>`` tag that carries a
    ``src`` attribute. For each image, the nearest enclosing ``<figure>`` or
    ``<div>`` is searched for a ``<figcaption>``; its text becomes the caption
    and the first link inside it (if it has an ``href``) provides the
    copyright text and URL.

    Args:
        article_url: URL of the article page to scan.

    Returns:
        A list of dictionaries with the keys
        ``{url, alt, caption, copyright, copyright_url}``. An empty list is
        returned when the response status is not 200 or when any exception
        occurs (the exception is logged, not propagated).
    """
    try:
        logging.info(f"📷 Extrahiere Bilder von {article_url}")
        response = requests.get(article_url, timeout=10)
        if response.status_code != 200:
            logging.warning(f"Keine gültige Antwort von {article_url} (Status {response.status_code})")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        images = []

        for img_tag in soup.find_all("img"):
            src = img_tag.get("src")
            if not src:
                continue

            # Build an absolute URL; ``src`` may be relative to the article.
            img_url = urljoin(article_url, src)
            alt_text = img_tag.get("alt", "").strip()

            # Defaults used when no surrounding <figcaption> provides better data.
            copyright_text = "Unbekannt"
            copyright_link = article_url
            caption = alt_text or "Bild aus Originalartikel"

            # Look for a credit line in the enclosing <figure> or <div>.
            parent = img_tag.find_parent(["figure", "div"])
            if parent:
                figcaption = parent.find("figcaption")
                if figcaption:
                    caption = figcaption.get_text(strip=True)
                    link_tag = figcaption.find("a")
                    if link_tag and link_tag.has_attr("href"):
                        # Resolve relative credit links the same way as the image src.
                        copyright_link = urljoin(article_url, link_tag["href"])
                        copyright_text = link_tag.get_text(strip=True)

            # The ``or`` fallbacks cover empty strings from the figcaption/link text.
            image_data = {
                "url": img_url,
                "alt": alt_text,
                "caption": caption or "Kein Bildtitel vorhanden",
                "copyright": copyright_text or "Unbekannt",
                "copyright_url": copyright_link or article_url,
            }
            images.append(image_data)

        logging.info(f"{len(images)} Bilder gefunden bei {article_url}")
        return images

    except Exception:
        # Best-effort extraction: log the full traceback and report no images.
        logging.exception(f"Fehler bei der Bildextraktion aus {article_url}:")
        return []
|
||||
Loading…
Add table
Add a link
Reference in a new issue