288 lines
8.3 KiB
Python
288 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
image_deduper.py — Finde und bereinige Bild-Dubletten sicher & reversibel.
|
|
|
|
Neu:
|
|
- collect: Bild-URLs aus JSON/HTML/Markdown/Text sammeln und lokal in .media_cache/ speichern
|
|
- scan/report/apply: wie gehabt (Hardlink/Delete), jetzt standardmäßig mit Cache nutzbar
|
|
|
|
Workflows:
|
|
# A) Nur remote-Referenzen vorhanden → erst sammeln, dann deduplizieren
|
|
python tools/image_deduper.py collect --sources processed_articles.json,content/ --cache .media_cache
|
|
python tools/image_deduper.py scan --roots .media_cache --out-dir .dedupe --phash
|
|
python tools/image_deduper.py apply --report .dedupe/report.csv --mode hardlink --dry-run
|
|
python tools/image_deduper.py apply --report .dedupe/report.csv --mode hardlink
|
|
|
|
# B) Lokale Bild-Ordner vorhanden
|
|
python tools/image_deduper.py scan --roots ./media,./static/images --out-dir .dedupe --phash
|
|
|
|
Hinweise:
|
|
- "collect" speichert Bilder content-addressed (<sha256>.<ext>) und lädt identische Dateien nicht doppelt.
|
|
- Für HTML/Markdown werden Bild-URLs geparst; für JSON werden alle Stringwerte mit Bild-URL-Muster extrahiert.
|
|
"""
|
|
|
|
import argparse
|
|
import csv
|
|
import hashlib
|
|
import json
|
|
import mimetypes
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Iterable, List, Optional, Tuple, Set
|
|
from urllib.parse import urlparse
|
|
|
|
# Optionale Parser/Helpers
|
|
try:
|
|
from PIL import Image
|
|
except ImportError:
|
|
Image = None
|
|
|
|
try:
|
|
import imagehash # type: ignore
|
|
except ImportError:
|
|
imagehash = None
|
|
|
|
try:
|
|
import requests
|
|
except ImportError:
|
|
requests = None
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
BeautifulSoup = None
|
|
|
|
try:
|
|
from tqdm import tqdm
|
|
except ImportError:
|
|
tqdm = None
|
|
|
|
|
|
# Lower-case file suffixes (with leading dot) that count as images;
# is_image() and is_image_url() compare lower-cased suffixes against this set.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
# Default paths for the SQLite index, the CSV report and the download cache
# (NOTE(review): the call sites that consume these defaults are outside this view).
DEFAULT_INDEX = ".dedupe/index.sqlite"
DEFAULT_REPORT = ".dedupe/report.csv"
DEFAULT_CACHE = ".media_cache"

# Matches absolute http(s) URLs, case-insensitively: the shortest run of
# non-space, non-quote, non-angle-bracket characters up to a known image
# extension, optionally followed by a "?query" part. The pattern has no
# capturing groups, so findall() yields whole URLs.
URL_RE = re.compile(
    r"""https?://[^\s"'<>]+?\.(?:jpg|jpeg|png|webp|gif)(?:\?[^\s"'<>]*)?""",
    re.IGNORECASE,
)
|
|
|
|
def human_mb(nbytes: int) -> str:
    """Format a byte count as decimal megabytes with two fraction digits.

    One megabyte is taken as 1,000,000 bytes (not 2**20).
    """
    megabytes = nbytes / 1_000_000
    return f"{megabytes:.2f} MB"
|
|
|
|
|
|
# ---------------------------
|
|
# Hashing & pHash
|
|
# ---------------------------
|
|
def sha256_file(path: Path, bufsize: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in chunks of *bufsize* bytes so that large files are
    never loaded into memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(bufsize), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
|
|
|
|
|
def calc_phash(path: Path) -> Optional[str]:
    """Return the perceptual hash of the image at *path* as a string.

    Returns None when Pillow or imagehash is not installed, or when the
    file cannot be opened or hashed (best effort: errors are swallowed).
    """
    deps_available = Image is not None and imagehash is not None
    if not deps_available:
        return None
    try:
        with Image.open(path) as handle:
            rgb = handle.convert("RGB")
            # hash_size=16 gives a 16x16 grid, i.e. a 256-bit hash.
            return str(imagehash.phash(rgb, hash_size=16))
    except Exception:
        return None
|
|
|
|
|
|
# ---------------------------
|
|
# Index (SQLite)
|
|
# ---------------------------
|
|
def ensure_dir(p: Path):
    """Create directory *p* including missing parents; do nothing if it exists."""
    os.makedirs(p, exist_ok=True)
|
|
|
|
|
|
def init_index(db_path: Path):
    """Create the SQLite index database at *db_path* if it is not set up yet.

    Creates the ``files`` table (one row per path, unique on ``path``) plus
    lookup indexes on ``sha256`` and ``phash``. Every statement uses
    ``IF NOT EXISTS``, so calling this repeatedly is safe.

    The parent directory is created when missing, because
    ``sqlite3.connect`` cannot open a database inside a non-existent folder.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(db_path))
    try:
        # "with conn" commits on success and rolls back on error; it does
        # NOT close the connection, hence the surrounding try/finally.
        with conn:
            conn.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                size INTEGER NOT NULL,
                mtime REAL NOT NULL,
                sha256 TEXT NOT NULL,
                phash TEXT,
                ext TEXT NOT NULL
            );
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_sha256 ON files (sha256);")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_phash ON files (phash);")
    finally:
        conn.close()
|
|
|
|
|
|
def is_image(path: Path) -> bool:
    """Return True when the suffix of *path*, lower-cased, is in IMAGE_EXTS."""
    suffix = path.suffix.lower()
    return suffix in IMAGE_EXTS
|
|
|
|
|
|
def walk_images(roots: List[Path]) -> Iterable[Path]:
    """Yield every image file found recursively below the given roots.

    Roots that do not exist are skipped silently, so one bad entry among
    several roots does not abort the walk.
    """
    for base in (candidate for candidate in roots if candidate.exists()):
        yield from (
            entry
            for entry in base.rglob("*")
            if entry.is_file() and is_image(entry)
        )
|
|
|
|
|
|
def upsert_file(db_path: Path, path: Path, sha256: str, phash: Optional[str]):
    """Insert or refresh the index row for *path*.

    Size and mtime are read from the file system via ``path.stat()``; the
    extension is stored lower-cased. An existing row with the same ``path``
    is updated in place (``ON CONFLICT(path) DO UPDATE``).

    Raises:
        OSError: if *path* cannot be stat'ed.
        sqlite3.Error: if the database operation fails.
    """
    st = path.stat()
    row = (str(path), st.st_size, st.st_mtime, sha256, phash, path.suffix.lower())
    conn = sqlite3.connect(str(db_path))
    try:
        # "with conn" commits or rolls back; the finally clause guarantees
        # the connection is closed even when the statement raises.
        with conn:
            conn.execute("""
            INSERT INTO files (path, size, mtime, sha256, phash, ext)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(path) DO UPDATE SET
                size=excluded.size,
                mtime=excluded.mtime,
                sha256=excluded.sha256,
                phash=excluded.phash,
                ext=excluded.ext;
            """, row)
    finally:
        conn.close()
|
|
|
|
|
|
def group_by_sha256(db_path: Path) -> List[List[Tuple[int, str, int]]]:
    """Return groups of indexed files that share the same SHA-256 digest.

    Only digests that occur more than once are returned. Each group is a
    list of ``(id, path, size)`` tuples. Groups are ordered by digest and
    members by row id, so the result is deterministic across runs.

    Raises:
        sqlite3.Error: if the database cannot be queried.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # One ordered query instead of one query per duplicate digest.
        rows = conn.execute("""
        SELECT sha256, id, path, size
        FROM files
        WHERE sha256 IN (
            SELECT sha256 FROM files GROUP BY sha256 HAVING COUNT(*) > 1
        )
        ORDER BY sha256, id;
        """).fetchall()
    finally:
        conn.close()

    # dicts keep insertion order, so the groups stay sorted by digest.
    grouped = {}
    for digest, rid, rpath, rsize in rows:
        grouped.setdefault(digest, []).append((rid, rpath, rsize))
    return list(grouped.values())
|
|
|
|
|
|
def write_csv_report(db_path: Path, csv_path: Path) -> Tuple[int, int]:
    """Write the duplicate report as CSV and return ``(dup_count, bytes)``.

    One row is written per duplicate with the columns ``group_id``,
    ``canonical_path``, ``dup_path`` and ``dup_size_bytes``. The parent
    directory of *csv_path* is created when missing. The second element of
    the returned tuple is the summed size of all duplicate rows.
    """
    groups = group_by_sha256(db_path)
    ensure_dir(csv_path.parent)

    dup_count = 0
    reclaimable = 0
    with csv_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["group_id", "canonical_path", "dup_path", "dup_size_bytes"])
        # Empty groups are dropped before numbering, so ids stay consecutive.
        for gid, members in enumerate(group for group in groups if group):
            # The largest file becomes the canonical copy; on equal sizes
            # max() keeps the first member.
            keeper_path = max(members, key=lambda entry: entry[2])[1]
            for _rid, member_path, member_size in members:
                if member_path != keeper_path:
                    dup_count += 1
                    reclaimable += member_size
                    writer.writerow([gid, keeper_path, member_path, member_size])
    return dup_count, reclaimable
|
|
|
|
|
|
# ---------------------------
|
|
# Apply (Hardlink/Delete)
|
|
# ---------------------------
|
|
def apply_hardlink(canonical: Path, dup: Path, dry_run: bool) -> None:
    """Replace *dup* with a hard link to *canonical* without risking data loss.

    The link is first created under a temporary name next to *dup* and then
    renamed over it. *dup* is therefore only replaced once the link exists;
    if linking fails (e.g. different file systems, missing canonical) the
    duplicate is left untouched.

    Args:
        canonical: File to keep; must be on the same file system as *dup*.
        dup: Duplicate file to be replaced by the hard link.
        dry_run: When true, nothing is touched.

    Raises:
        FileNotFoundError: if *dup* or *canonical* does not exist.
        OSError: if the link or the rename fails.
    """
    if dry_run:
        return
    dup.lstat()  # raises FileNotFoundError for a missing duplicate
    if canonical.samefile(dup):
        # Already the same inode: nothing to gain. Skipping also avoids
        # rename() being a no-op between two links of one file, which
        # would leave the temporary name behind.
        return
    tmp = dup.with_name(dup.name + ".dedupe.tmp")
    tmp.unlink(missing_ok=True)  # leftover from an interrupted earlier run
    os.link(canonical, tmp)  # requires the same file system
    try:
        tmp.replace(dup)
    except OSError:
        tmp.unlink(missing_ok=True)
        raise
|
|
|
|
|
|
def apply_delete(dup: Path, dry_run: bool) -> None:
    """Delete the duplicate file *dup* unless *dry_run* is set.

    Raises:
        FileNotFoundError: if *dup* does not exist (real runs only).
    """
    if not dry_run:
        dup.unlink(missing_ok=False)
|
|
|
|
|
|
@dataclass
class ApplyStats:
    """Counters collected while applying a duplicate report."""

    # Rows handled without an error.
    processed: int = 0
    # Rows whose action raised an exception.
    errors: int = 0
    # Summed dup_size_bytes of the successfully handled rows.
    saved_bytes: int = 0
|
|
|
|
|
|
def apply_changes(csv_report: Path, mode: str, dry_run: bool) -> ApplyStats:
    """Apply the actions listed in a duplicate report and return counters.

    Each CSV row (columns ``canonical_path``, ``dup_path``,
    ``dup_size_bytes``) is handled independently: a failing or malformed row
    is reported on stderr and counted in ``errors``, and processing
    continues with the next row.

    Args:
        csv_report: Report file to read.
        mode: ``"hardlink"`` or ``"delete"``; any other value makes every
            row count as an error.
        dry_run: Passed through to the action helpers, which then do not
            modify anything.

    Returns:
        ApplyStats with processed rows, failed rows and summed duplicate size.
    """
    stats = ApplyStats()
    # newline="" is what the csv module expects; without it, quoted fields
    # containing line breaks are not read back correctly.
    with csv_report.open("r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # Used for error output in case the row cannot even be parsed.
            dup_label = row.get("dup_path") or "<unknown>"
            try:
                canonical = Path(row["canonical_path"])
                dup = Path(row["dup_path"])
                dup_label = dup
                size = int(row["dup_size_bytes"])
                if mode == "hardlink":
                    apply_hardlink(canonical, dup, dry_run)
                elif mode == "delete":
                    # Never drop a duplicate whose kept copy has vanished
                    # since the report was written.
                    if not canonical.is_file():
                        raise FileNotFoundError(
                            f"canonical file missing, refusing to delete duplicate: {canonical}"
                        )
                    apply_delete(dup, dry_run)
                else:
                    raise ValueError("mode must be 'hardlink' or 'delete'")
            except Exception as e:
                stats.errors += 1
                print(f"[ERROR] {dup_label}: {e}", file=sys.stderr)
            else:
                stats.processed += 1
                stats.saved_bytes += size
    return stats
|
|
|
|
|
|
# ---------------------------
|
|
# Collect (URLs -> Cache)
|
|
# ---------------------------
|
|
def is_image_url(url: str) -> bool:
    """Return True when *url* looks like it points to an image.

    The URL either matches URL_RE or, as a fallback, the suffix of its path
    component (lower-cased) is one of IMAGE_EXTS.
    """
    matched = URL_RE.search(url) is not None
    if not matched:
        # Fallback: judge by the extension of the URL's path component.
        suffix = Path(urlparse(url).path).suffix.lower()
        matched = suffix in IMAGE_EXTS
    return matched
|
|
|
|
|
|
def extract_urls_from_text(text: str) -> Set[str]:
    """Return the set of all image URLs that URL_RE finds in *text*."""
    # URL_RE has no capturing groups, so group(0) equals what findall() yields.
    return {match.group(0) for match in URL_RE.finditer(text)}
|
|
|
|
|
|
def _srcset_urls(srcset: str) -> List[str]:
    """Return the URLs of a ``srcset`` attribute value, without descriptors."""
    found = []
    pos, end = 0, len(srcset)
    while pos < end:
        # Skip whitespace and commas separating the candidates.
        while pos < end and (srcset[pos].isspace() or srcset[pos] == ","):
            pos += 1
        start = pos
        while pos < end and not srcset[pos].isspace():
            pos += 1
        url = srcset[start:pos]
        if not url:
            break
        if url.endswith(","):
            # "a.jpg, b.jpg": the comma directly ends this candidate.
            url = url.rstrip(",")
        else:
            # Skip descriptors such as "2x" or "640w" up to the next comma.
            comma = srcset.find(",", pos)
            pos = end if comma == -1 else comma + 1
        if url:
            found.append(url)
    return found


def extract_urls_from_html(text: str) -> Set[str]:
    """Collect image URLs from an HTML document.

    With BeautifulSoup available, ``<img>`` and ``<source>`` tags are
    inspected: ``src`` and ``data-src`` are both considered (lazy-loading
    markup often carries a placeholder in ``src`` and the real image in
    ``data-src``), as are the candidates of ``srcset`` / ``data-srcset``.
    Values are kept as written, so relative URLs stay relative. Bare URLs
    found anywhere in the text by the regex scan are always added.

    Without BeautifulSoup, or when parsing fails, only the regex scan is used.
    """
    if BeautifulSoup is None:
        return extract_urls_from_text(text)

    urls: Set[str] = set()
    try:
        soup = BeautifulSoup(text, "html.parser")
        for tag in soup.find_all(["img", "source"]):
            candidates = []
            for attr in ("src", "data-src"):
                value = tag.get(attr)
                if isinstance(value, str) and value:
                    candidates.append(value)
            for attr in ("srcset", "data-srcset"):
                value = tag.get(attr)
                if isinstance(value, str):
                    candidates.extend(_srcset_urls(value))
            urls.update(c for c in candidates if is_image_url(c))
    except Exception:
        # Best effort: broken markup must not stop collection; whatever was
        # gathered so far is kept and the regex scan below still runs.
        pass
    # Fallback for bare URLs outside of tag attributes.
    urls |= extract_urls_from_text(text)
    return urls
|
|
|
|
|
|
def extract_urls_from_json(obj) -> Set[str]:
|
|
urls = set()
|
|
if isinstance(obj, dict):
|
|
for v in obj.values():
|
|
urls |= extract
|