Tabellenansicht & Statusverwaltung integriert, Rewrite-Batchfunktion hinzugefügt (v1.3.1)

This commit is contained in:
Oliver 2025-07-04 20:14:12 +02:00
parent 2f7f2a1eb7
commit fe2191e6c8
9 changed files with 281 additions and 361 deletions

51
utils/image_extractor.py Normal file
View file

@ -0,0 +1,51 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def _find_copyright(img_tag, article_url):
    """Return ``(text, link)`` from the figcaption that belongs to *img_tag*.

    Both values are empty strings when no caption (or no link inside the
    caption) is found. A link is resolved against *article_url* so that a
    relative ``href`` becomes an absolute URL.
    """
    # Prefer the enclosing <figure>: with markup such as
    # <figure><div><img></div><figcaption>..</figcaption></figure> the nearest
    # <div> does not contain the caption. Fall back to the nearest <div> for
    # pages that do not use <figure> at all.
    container = img_tag.find_parent("figure")
    if container is None:
        container = img_tag.find_parent("div")
    if container is None:
        return "", ""

    caption = container.find("figcaption")
    if caption is None:
        return "", ""

    text = caption.get_text(strip=True)
    # First anchor that actually carries an href attribute.
    link_tag = caption.find("a", href=True)
    link = urljoin(article_url, link_tag["href"]) if link_tag is not None else ""
    return text, link


def extract_images_with_metadata(article_url):
    """Extract the images of an article together with caption/copyright data.

    The page at *article_url* is downloaded (10 s timeout) and every ``<img>``
    that has a non-empty ``src`` is reported.

    Args:
        article_url: URL of the original article; also used as the base for
            resolving relative image and link URLs.

    Returns:
        A list of dicts with the keys ``url`` (absolute image URL), ``alt``
        (alt text, or a German placeholder when missing), ``copyright_text``
        (figcaption text, or ``"Unbekannt"``) and ``copyright_link`` (absolute
        URL of the first link in the figcaption, or *article_url*).
        An empty list is returned when the response status is not 200 or when
        any error occurs; errors are printed, never raised.
    """
    try:
        response = requests.get(article_url, timeout=10)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        images = []
        for img_tag in soup.find_all("img"):
            # Tolerate a missing attribute and surrounding whitespace.
            src = (img_tag.get("src") or "").strip()
            if not src:
                continue

            copyright_text, copyright_link = _find_copyright(img_tag, article_url)
            images.append({
                # Build the absolute image URL.
                "url": urljoin(article_url, src),
                "alt": (img_tag.get("alt") or "").strip() or "Bild aus Originalartikel",
                "copyright_text": copyright_text or "Unbekannt",
                "copyright_link": copyright_link or article_url,
            })
        return images
    except Exception as e:
        # Deliberate best-effort boundary: a failing download or parse must
        # not break the caller, so report the problem and return nothing.
        print(f"[extract_images_with_metadata] Fehler bei {article_url}: {e}")
        return []