Grundfunktionen optimiert
This commit is contained in:
parent
050e08859c
commit
0c84dd1a1a
8 changed files with 4866 additions and 315 deletions
491
main.py
491
main.py
|
|
@ -10,6 +10,8 @@ import logging
|
|||
import openai
|
||||
from utils.image_extractor import extract_images_with_metadata
|
||||
from utils.article_extractor import extract_full_article
|
||||
import hashlib
|
||||
import time
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
|
@ -17,10 +19,15 @@ load_dotenv()
|
|||
# Directory and file that receive the application log.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "rss_tool.log")

# Richer log format: include the calling function and line number.
# ``filename=`` must not be combined with ``handlers=`` (basicConfig raises
# ValueError), so the log file is attached through an explicit FileHandler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s",
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler(),  # also echo log records to the console
    ],
)
|
||||
|
||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
|
@ -29,156 +36,412 @@ ARTICLES_FILE = "data/articles.json"
|
|||
# JSON file holding the configured RSS feeds.
FEEDS_FILE = "data/feeds.json"
# Workflow states an article may be in; anything else is reset to "New" on load.
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]

# === Create the data directory ===
# Derived from FEEDS_FILE so the directory and the file path cannot drift apart.
os.makedirs(os.path.dirname(FEEDS_FILE), exist_ok=True)
|
||||
|
||||
def generate_article_id(title, link, date):
    """Return a deterministic ID for an article built from several attributes.

    The ID is the hexadecimal MD5 digest of ``"{title}_{link}_{date}"``
    encoded as UTF-8. MD5 is used purely as a compact fingerprint here, not
    for any security purpose.

    Args:
        title: Article headline.
        link: Article URL.
        date: Publication date string.

    Returns:
        The 32-character lowercase hex digest.
    """
    identifier = f"{title}_{link}_{date}"
    return hashlib.md5(identifier.encode("utf-8")).hexdigest()
|
||||
|
||||
def is_duplicate_article(new_article, existing_articles):
    """Check whether an article already exists (extended duplicate detection).

    An article counts as a duplicate when either

    * its non-empty link equals an existing article's link exactly (after
      stripping surrounding whitespace), or
    * its lower-cased title has a word-set similarity strictly greater than
      0.9 with an existing title, as computed by ``calculate_similarity``.

    Args:
        new_article: Mapping with optional ``"title"`` and ``"link"`` keys.
        existing_articles: Iterable of mappings with the same optional keys.

    Returns:
        True if a matching article is found, otherwise False.
    """
    # ``or ""`` also covers keys that are present but explicitly None.
    new_title = (new_article.get("title") or "").lower().strip()
    new_link = (new_article.get("link") or "").strip()

    for existing in existing_articles:
        # Exact URL match.
        existing_link = (existing.get("link") or "").strip()
        if new_link and new_link == existing_link:
            return True

        # Very similar titles; only compared when both are non-empty.
        existing_title = (existing.get("title") or "").lower().strip()
        if new_title and existing_title:
            if calculate_similarity(new_title, existing_title) > 0.9:
                return True

    return False
|
||||
|
||||
def calculate_similarity(text1, text2):
    """Compute the similarity of two texts (simplified word-set method).

    Both texts are split on whitespace into sets of words; the result is the
    size of the intersection divided by the size of the union.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``: 1.0 when both texts contain no words,
        0.0 when exactly one of them contains no words.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    # Both sets are non-empty here, so the union cannot be empty.
    return len(words1 & words2) / len(words1 | words2)
|
||||
|
||||
def load_feeds():
    """Load the RSS feeds from the JSON file ``FEEDS_FILE``.

    Returns:
        The list decoded from the file. An empty list is returned when the
        file does not exist, cannot be read, is not valid JSON, or does not
        contain a JSON list; these failures are logged instead of raised.
    """
    try:
        # Open directly instead of checking existence first: avoids a race
        # between the check and the open.
        with open(FEEDS_FILE, "r", encoding='utf-8') as f:
            feeds = json.load(f)
    except FileNotFoundError:
        logging.info("Feeds-Datei existiert nicht, erstelle leere Liste")
        return []
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.error(f"❌ Fehler beim Laden der Feeds: {e}")
        return []

    if not isinstance(feeds, list):
        logging.error(
            f"❌ Fehler beim Laden der Feeds: Liste erwartet, {type(feeds).__name__} gefunden"
        )
        return []

    logging.info(f"✅ {len(feeds)} Feeds geladen")
    return feeds
|
||||
|
||||
|
||||
def save_feeds(feeds):
    """Save the RSS feeds to the JSON file ``FEEDS_FILE``.

    The data is written to a temporary sibling file first and then moved over
    the target with ``os.replace``, so a failure while serialising does not
    leave a truncated feeds file behind.

    Args:
        feeds: JSON-serialisable collection of feeds.

    Returns:
        None. Errors are logged and not raised.
    """
    tmp_path = f"{FEEDS_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding='utf-8') as f:
            json.dump(feeds, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, FEEDS_FILE)
        logging.info(f"✅ {len(feeds)} Feeds gespeichert")
    except Exception as e:
        # Best-effort boundary: saving must never crash the caller.
        logging.error(f"❌ Fehler beim Speichern der Feeds: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Temp file was never created or has already been moved.
            pass
|
||||
|
||||
def load_articles():
    """Load the articles from the JSON file ``ARTICLES_FILE``.

    Every article whose ``"status"`` is not one of ``VALID_STATUSES`` is reset
    to ``"New"`` (with a warning). Entries that are not JSON objects are
    skipped with a warning instead of aborting the whole load.

    Returns:
        The list of article dicts. An empty list is returned when the file
        does not exist, cannot be read, is not valid JSON, or does not
        contain a JSON list; these failures are logged instead of raised.
    """
    try:
        with open(ARTICLES_FILE, "r", encoding='utf-8') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        logging.info("Artikel-Datei existiert nicht, erstelle leere Liste")
        return []
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.error(f"❌ Fehler beim Laden der Artikel: {e}")
        return []

    if not isinstance(loaded, list):
        logging.error(
            f"❌ Fehler beim Laden der Artikel: Liste erwartet, {type(loaded).__name__} gefunden"
        )
        return []

    articles = []
    for article in loaded:
        if not isinstance(article, dict):
            logging.warning(f"⚠️ Ungültiger Artikel übersprungen: {article}")
            continue

        # Status validation.
        if article.get("status") not in VALID_STATUSES:
            article["status"] = "New"
            logging.warning(f"⚠️ Ungültiger Status für Artikel '{article.get('title', 'Unbekannt')}' korrigiert")

        articles.append(article)

    logging.info(f"✅ {len(articles)} Artikel geladen")
    return articles
|
||||
|
||||
|
||||
def save_articles(articles):
    """Save the articles to the JSON file ``ARTICLES_FILE``.

    Only entries that are dicts containing both an ``"id"`` and a ``"title"``
    key are written; everything else is skipped with a warning. The data is
    written to a temporary sibling file and then moved over the target with
    ``os.replace``, so a failure while serialising does not leave a truncated
    articles file behind.

    Args:
        articles: Iterable of article dicts.

    Returns:
        None. Errors are logged and not raised.
    """
    tmp_path = f"{ARTICLES_FILE}.tmp"
    try:
        # Validation before saving.
        valid_articles = []
        for article in articles:
            if isinstance(article, dict) and "id" in article and "title" in article:
                valid_articles.append(article)
            else:
                logging.warning(f"⚠️ Ungültiger Artikel übersprungen: {article}")

        with open(tmp_path, "w", encoding='utf-8') as f:
            json.dump(valid_articles, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, ARTICLES_FILE)

        logging.info(f"✅ {len(valid_articles)} Artikel gespeichert")
    except Exception as e:
        # Best-effort boundary: saving must never crash the caller.
        logging.error(f"❌ Fehler beim Speichern der Artikel: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Temp file was never created or has already been moved.
            pass
|
||||
|
||||
def clean_html_content(content):
    """Clean HTML content and extract its plain text.

    ``<script>`` and ``<style>`` elements are removed, the remaining text is
    extracted and all runs of whitespace are collapsed to single spaces.

    Args:
        content: HTML markup; may be empty or None.

    Returns:
        The cleaned text. Empty or None input yields ``""`` so callers can
        always call ``.split()`` on the result. If parsing fails, the error
        is logged and ``content`` is returned unchanged.
    """
    if not content:
        return ""

    try:
        soup = BeautifulSoup(content, "html.parser")

        # Remove script and style tags including their contents.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Extract the text, then collapse superfluous whitespace.
        return " ".join(soup.get_text(" ", strip=True).split())
    except Exception as e:
        logging.error(f"❌ Fehler beim Bereinigen des HTML-Inhalts: {e}")
        return content
|
||||
def _entry_raw_content(entry, summary):
    """Return the richest raw (HTML) body available for a feed entry."""
    if hasattr(entry, 'content') and entry.content:
        return entry.content[0].get("value", "")
    if hasattr(entry, 'description'):
        return entry.description
    return summary


def fetch_and_process_feed(feed_url, existing_articles):
    """Load and process a single RSS feed.

    For every entry that is not a duplicate of an article in
    ``existing_articles`` (or of one already collected from this feed), an
    article dict is built from the cleaned entry content. Entries with fewer
    than 50 words trigger a full-text extraction from the entry link, and
    images are extracted from the link as well.

    The duplicate check runs before the full-text and image extraction so
    that already-known entries do not cause extra fetches on every run.

    Args:
        feed_url: URL of the feed to parse.
        existing_articles: Already stored article dicts used for duplicate
            detection; this list is only read, never modified.

    Returns:
        List of newly created article dicts. An empty list is returned when
        the feed has no entries or processing fails; errors are logged and
        not raised.
    """
    new_articles = []
    feed_name = "Unbekannt"

    try:
        logging.info(f"🔄 Verarbeite Feed: {feed_url}")

        # Parse the feed.
        feed = feedparser.parse(feed_url)

        if hasattr(feed, 'feed') and hasattr(feed.feed, 'title'):
            feed_name = feed.feed.title
            logging.info(f"📡 Feed-Name: {feed_name}")

        if not feed.entries:
            logging.warning(f"⚠️ Keine Einträge in Feed gefunden: {feed_url}")
            return []

        logging.info(f"📰 {len(feed.entries)} Einträge gefunden")

        for entry in feed.entries:
            try:
                # Basic information.
                title = entry.get("title", "Kein Titel")
                date = entry.get("published", datetime.now().isoformat())
                link = entry.get("link", "")
                summary = entry.get("summary", "")

                # Duplicate check first: it only needs title and link, and
                # it also catches repeats within this same feed.
                candidate = {"title": title, "link": link}
                if (is_duplicate_article(candidate, existing_articles)
                        or is_duplicate_article(candidate, new_articles)):
                    logging.info(f"🔄 Duplikat übersprungen: {title}")
                    continue

                # Extract the content and strip the HTML.
                content = _entry_raw_content(entry, summary)
                clean_text = clean_html_content(content) or ""

                # Full-text extraction for short articles.
                if len(clean_text.split()) < 50 and link:
                    logging.info(f"🔍 Kurzer Artikel erkannt, versuche Volltext-Extraktion: {title}")
                    try:
                        # ``or ""``: presumably the helper may return None
                        # on failure (verify against utils.article_extractor).
                        fetched_text = extract_full_article(link) or ""
                    except Exception as e:
                        # Keep the short feed text rather than dropping the entry.
                        logging.error(f"❌ Fehler bei Volltext-Extraktion für '{title}': {e}")
                        fetched_text = ""
                    if len(fetched_text.split()) > len(clean_text.split()):
                        clean_text = fetched_text
                        logging.info(f"✅ Volltext extrahiert: {len(clean_text.split())} Wörter")

                # Create the new article.
                new_article = {
                    "id": generate_article_id(title, link, date),
                    "title": title,
                    "date": date,
                    "summary": summary[:300] + "..." if len(summary) > 300 else summary,
                    "text": clean_text,
                    "tags": [],
                    "status": "New",
                    "link": link,
                    "images": [],
                    "source": feed_url,
                    "source_name": feed_name,
                    "created_at": datetime.now().isoformat(),
                    "word_count": len(clean_text.split()),
                }

                # Extract images; a failure here must not drop the article.
                if link:
                    try:
                        images = extract_images_with_metadata(link) or []
                        new_article["images"] = images
                        logging.info(f"🖼️ {len(images)} Bilder für '{title}' extrahiert")
                    except Exception as e:
                        logging.error(f"❌ Fehler bei Bildextraktion für '{title}': {e}")

                new_articles.append(new_article)
                logging.info(f"✅ Neuer Artikel hinzugefügt: {title}")

            except Exception as e:
                # One broken entry must not abort the rest of the feed.
                logging.error(f"❌ Fehler beim Verarbeiten des Eintrags '{entry.get('title', 'Unbekannt')}': {e}")
                continue

        logging.info(f"✅ Feed verarbeitet: {len(new_articles)} neue Artikel aus {feed_url}")
        return new_articles

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Verarbeiten von {feed_url}: {e}")
        return []
|
||||
|
||||
|
||||
def process_articles(existing_ids=None):
    """Process all RSS feeds and add the new articles.

    Loads the configured feeds and the stored articles, fetches every feed
    via ``fetch_and_process_feed`` and, if at least one new article was
    found, saves the combined list with ``save_articles``.

    Args:
        existing_ids: Unused; kept only so existing callers that still pass
            it keep working. Duplicate detection works on the stored
            articles instead.

    Returns:
        None. Errors are logged and not raised.
    """
    try:
        start_time = time.time()
        logging.info("🚀 Starte Artikel-Verarbeitung")

        feeds = load_feeds()
        all_articles = load_articles()

        if not feeds:
            logging.warning("⚠️ Keine RSS-Feeds konfiguriert")
            return

        total_new_articles = 0
        last_index = len(feeds) - 1

        for index, feed in enumerate(feeds):
            # A feed is either a dict with a "url" key or the bare URL.
            feed_url = feed.get("url") if isinstance(feed, dict) else feed

            if not feed_url:
                logging.warning("⚠️ Feed ohne URL übersprungen")
                continue

            try:
                # ``all_articles`` doubles as the duplicate-check list:
                # articles found in earlier feeds are already in it.
                new_articles = fetch_and_process_feed(feed_url, all_articles)
                all_articles.extend(new_articles)
                total_new_articles += len(new_articles)

                # Short pause between feeds; pointless after the last one.
                if index < last_index:
                    time.sleep(1)

            except Exception as e:
                logging.error(f"❌ Fehler beim Verarbeiten von Feed {feed_url}: {e}")
                continue

        # Save the articles.
        if total_new_articles > 0:
            save_articles(all_articles)
            processing_time = time.time() - start_time
            logging.info(f"🎉 Verarbeitung abgeschlossen: {total_new_articles} neue Artikel in {processing_time:.2f}s hinzugefügt")
        else:
            logging.info("ℹ️ Keine neuen Artikel gefunden")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler bei der Artikel-Verarbeitung: {e}")
|
||||
|
||||
def _parse_tags(tags_raw):
    """Split a model reply into tags, accepting commas and line breaks."""
    candidates = tags_raw.replace("\n", ",").split(",")
    return [tag.strip() for tag in candidates if tag.strip()]


def _fill_image_defaults(images):
    """Give every image dict placeholder values for missing or empty metadata."""
    defaults = {
        "caption": "Kein Bildtitel vorhanden",
        "copyright": "Unbekannt",
        "copyright_url": "#",
    }
    for img in images:
        if not isinstance(img, dict):
            continue
        for key, placeholder in defaults.items():
            if not img.get(key):
                img[key] = placeholder


def rewrite_articles():
    """Rewrite all articles whose status is 'Rewrite'.

    For each such article the text is rewritten through the OpenAI chat
    completions API, tags are generated from the rewritten text, the status
    is set to ``"Process"``, ``"rewritten_at"`` and ``"word_count"`` are
    updated and missing image metadata is filled with placeholders. The
    articles are saved once at the end if at least one was rewritten.

    Returns:
        None. Errors are logged and not raised; a failure for one article
        does not stop the remaining ones.
    """
    try:
        logging.info("✍️ Starte Artikel-Umschreibung")

        articles = load_articles()
        rewrite_articles_list = [a for a in articles if a.get("status") == "Rewrite"]

        if not rewrite_articles_list:
            logging.info("ℹ️ Keine Artikel zum Umschreiben gefunden")
            return

        if not openai.api_key:
            logging.error("❌ OpenAI API-Key nicht konfiguriert")
            return

        # Count successes so the final log reports rewritten articles,
        # not merely attempted ones.
        rewritten = 0

        for article in rewrite_articles_list:
            title = article.get("title", "Unbekannt")
            try:
                logging.info(f"✍️ Umschreiben von: {title}")

                # Rewrite the article.
                prompt = (
                    "Schreibe den folgenden Artikel um und fasse ihn verständlich zusammen.\n"
                    "Behalte die wichtigsten Informationen bei, aber formuliere alles neu:\n"
                    "\n"
                    f"{article['text']}"
                )

                response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du bist ein professioneller Redakteur, der Artikel umschreibt und verbessert."},
                        {"role": "user", "content": prompt},
                    ],
                    max_tokens=1500,
                    temperature=0.7,
                )

                new_text = (response.choices[0].message.content or "").strip()
                if not new_text:
                    # Never replace the article body with an empty reply.
                    raise ValueError("leere Antwort vom Modell")

                # Generate tags.
                tag_prompt = (
                    "Erstelle 3-5 passende, kurze Stichwörter (Tags) für diesen Artikel.\n"
                    "Gib nur die Tags zurück, getrennt durch Kommas:\n"
                    "\n"
                    f"{new_text}"
                )

                tag_response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "Du generierst präzise Tags für Blog-Artikel."},
                        {"role": "user", "content": tag_prompt},
                    ],
                    max_tokens=100,
                    temperature=0.5,
                )

                tags = _parse_tags(tag_response.choices[0].message.content or "")

                # Update the article only after both API calls succeeded.
                article["text"] = new_text
                article["tags"] = tags
                article["status"] = "Process"
                article["rewritten_at"] = datetime.now().isoformat()
                article["word_count"] = len(new_text.split())

                # Complete the image metadata where necessary.
                _fill_image_defaults(article.get("images") or [])

                logging.info(f"✅ Artikel erfolgreich umgeschrieben: {title}")
                rewritten += 1

                # Short pause between API calls.
                time.sleep(2)

            except Exception as e:
                logging.error(f"❌ Fehler beim Umschreiben von '{title}': {e}")
                continue

        if rewritten:
            save_articles(articles)
            logging.info(f"🎉 {rewritten} Artikel erfolgreich umgeschrieben")

    except Exception as e:
        logging.error(f"❌ Kritischer Fehler beim Umschreiben: {e}")
|
||||
def get_article_stats():
    """Return statistics about the stored articles.

    Returns:
        A dict with the keys

        * ``"total_articles"``: number of articles,
        * ``"status_distribution"``: article count per status (a missing
          status counts as ``"New"``),
        * ``"word_count_stats"``: ``"average"`` (floor division), ``"min"``
          and ``"max"`` over all truthy ``"word_count"`` values, or ``{}``
          when there are none,
        * ``"source_distribution"``: article count per ``"source_name"`` (a
          missing name counts as ``"Unbekannt"``),
        * ``"images_count"``: total number of images across all articles.

        An empty dict is returned if an error occurs; the error is logged.
    """
    from collections import Counter

    try:
        articles = load_articles()

        # Word-count statistics; articles without a count are ignored.
        word_counts = [a["word_count"] for a in articles if a.get("word_count")]
        word_count_stats = {}
        if word_counts:
            word_count_stats = {
                "average": sum(word_counts) // len(word_counts),
                "min": min(word_counts),
                "max": max(word_counts),
            }

        return {
            "total_articles": len(articles),
            "status_distribution": dict(
                Counter(a.get("status", "New") for a in articles)
            ),
            "word_count_stats": word_count_stats,
            "source_distribution": dict(
                Counter(a.get("source_name", "Unbekannt") for a in articles)
            ),
            # ``or []``: an explicit null "images" value must not break the count.
            "images_count": sum(len(a.get("images") or []) for a in articles),
        }

    except Exception as e:
        logging.error(f"❌ Fehler beim Erstellen der Statistiken: {e}")
        return {}
|
||||
Loading…
Add table
Add a link
Reference in a new issue