# main.py
import feedparser
|
|
import json
|
|
import os
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from dotenv import load_dotenv
|
|
import logging
|
|
from utils.image_extractor import extract_images_with_metadata
|
|
import openai
|
|
|
|
# Load environment variables (such as OPENAI_API_KEY) via python-dotenv
# before anything below reads them.
load_dotenv()

# Configure logging: INFO and above is appended to logs/rss_tool.log.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    # An explicit UTF-8 file handler replaces ``filename=`` because the log
    # messages written by this module contain umlauts and emoji, which the
    # platform default encoding (e.g. cp1252 on Windows) cannot represent.
    # FileHandler appends by default, just like ``filename=`` did.
    handlers=[
        logging.FileHandler(
            os.path.join(log_dir, "rss_tool.log"), encoding="utf-8"
        )
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

openai.api_key = os.getenv("OPENAI_API_KEY")

# JSON files used as persistent storage, relative to the working directory.
ARTICLES_FILE = "data/articles.json"
FEEDS_FILE = "data/feeds.json"

# Allowed values for an article's "status" field.
VALID_STATUSES = ["New", "Rewrite", "Process", "Online", "On Hold", "Trash"]
|
|
|
|
|
|
def load_feeds():
    """Return the feed list stored in ``FEEDS_FILE``.

    Returns:
        The parsed JSON content of the file, or an empty list when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON. This is deliberately not swallowed, so that a damaged file
            is never silently replaced by an empty list on the next save.
    """
    if not os.path.exists(FEEDS_FILE):
        return []
    # Read as UTF-8 explicitly instead of relying on the locale encoding, so
    # a hand-edited file with non-ASCII characters loads on every platform.
    with open(FEEDS_FILE, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
def save_feeds(feeds):
    """Write *feeds* to ``FEEDS_FILE`` as indented JSON.

    Args:
        feeds: JSON-serialisable feed list to persist.

    The parent directory of ``FEEDS_FILE`` is created when it is missing, so
    the very first save on a fresh checkout does not fail.
    """
    parent_dir = os.path.dirname(FEEDS_FILE)
    # dirname is "" for a bare file name; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(FEEDS_FILE, "w", encoding="utf-8") as f:
        json.dump(feeds, f, indent=2)
|
|
|
|
def load_articles():
    """Return the articles stored in ``ARTICLES_FILE`` with normalised status.

    Returns:
        The parsed list of article dicts, or an empty list when the file does
        not exist. Any article whose "status" is missing or not contained in
        ``VALID_STATUSES`` has it reset to "New" (in the returned objects
        only; the file itself is not rewritten here).

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(ARTICLES_FILE):
        return []
    # Read as UTF-8 explicitly instead of relying on the locale encoding.
    with open(ARTICLES_FILE, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Make sure every article carries a valid status.
    for article in articles:
        if article.get("status") not in VALID_STATUSES:
            article["status"] = "New"
    return articles
|
|
|
|
def save_articles(articles):
    """Write *articles* to ``ARTICLES_FILE`` as indented JSON.

    Args:
        articles: JSON-serialisable list of article dicts to persist.

    The parent directory of ``ARTICLES_FILE`` is created when it is missing,
    so the very first save on a fresh checkout does not fail.
    """
    parent_dir = os.path.dirname(ARTICLES_FILE)
    # dirname is "" for a bare file name; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(ARTICLES_FILE, "w", encoding="utf-8") as f:
        json.dump(articles, f, indent=2)
|
|
|
|
def fetch_and_process_feed(feed_url, existing_ids):
    """Parse one feed and build article records for entries not seen before.

    Args:
        feed_url: Feed location handed to ``feedparser.parse``.
        existing_ids: Container of article IDs to skip. Only membership tests
            are performed on it; it is not modified.

    Returns:
        A list of article dicts with the keys "id", "title", "date",
        "summary", "text", "tags", "status", "link" and "images". New
        articles always start with status "New" and no tags.
    """
    feed = feedparser.parse(feed_url)
    new_articles = []
    # IDs already emitted from this feed, so an entry that is listed twice in
    # the same feed is only returned once.
    seen_ids = set()

    for entry in feed.entries:
        article_id = entry.get("id") or entry.get("link")
        if not article_id or article_id in existing_ids or article_id in seen_ids:
            continue
        seen_ids.add(article_id)

        title = entry.get("title", "Kein Titel")
        date = entry.get("published", datetime.now().isoformat())
        summary = entry.get("summary", "")
        link = entry.get("link", "")

        # "content" may be missing or an empty list; fall back to a single
        # empty item so the [0] lookup cannot raise IndexError.
        content_items = entry.get("content") or [{}]
        content = content_items[0].get("value") or entry.get("description") or ""

        # Strip all markup and keep only the visible text.
        clean_text = BeautifulSoup(content, "html.parser").get_text(" ", strip=True)

        # Entries identified only by "id" may have no link at all; attribute
        # access (entry.link) would raise and abort the whole feed.
        images = []
        if link:
            try:
                images = extract_images_with_metadata(link)
            except Exception as e:
                # Image extraction is best effort: one failing article page
                # must not discard every other article of this feed.
                logging.warning(f"Bilder konnten nicht geladen werden für {link}: {e}")

        new_articles.append({
            "id": article_id,
            "title": title,
            "date": date,
            "summary": summary,
            "text": clean_text,
            "tags": [],
            "status": "New",
            "link": link,
            "images": images,
        })

    return new_articles
|
|
|
|
def process_articles(existing_ids):
    """Fetch all configured feeds and persist articles that are not stored yet.

    Args:
        existing_ids: Iterable of article IDs the caller already knows about;
            entries with these IDs are skipped. May be empty or None.

    Feeds come from ``load_feeds()``; each item is either a dict with a "url"
    key or the URL itself. A failure in one feed is logged and does not stop
    the remaining feeds. New articles are appended to the stored list and
    written with ``save_articles()`` only when at least one was found.
    """
    feeds = load_feeds()
    all_articles = load_articles()

    # Everything that must not be added again: the caller's IDs plus the IDs
    # already stored. Passing the combined set to the fetcher lets it skip
    # stored articles before doing any per-article work.
    known_ids = set(existing_ids or ())
    known_ids.update(article.get("id") for article in all_articles)

    new_entries = []
    for feed in feeds:
        url = feed.get("url") if isinstance(feed, dict) else feed
        if not url:
            continue

        try:
            logging.info(f"Lade Feed: {url}")
            entries = fetch_and_process_feed(url, known_ids)
        except Exception as e:
            # Boundary handler: keep going with the other feeds, but record
            # the traceback so the failure can be diagnosed.
            logging.exception(f"Fehler beim Verarbeiten von {url}: {e}")
            continue

        # Drop duplicates, including the same article delivered by an earlier
        # feed in this run, and remember the accepted IDs.
        fresh = []
        for entry in entries:
            if entry["id"] in known_ids:
                continue
            known_ids.add(entry["id"])
            fresh.append(entry)

        new_entries.extend(fresh)
        logging.info(f"{len(fresh)} neue Artikel gefunden in {url}")

    if new_entries:
        all_articles.extend(new_entries)
        save_articles(all_articles)
        logging.info(f"{len(new_entries)} neue Artikel gespeichert.")
    else:
        logging.info("Keine neuen Artikel gefunden.")
|
|
|
|
def _chat_completion(system_prompt, user_prompt, model):
    """Send one system/user exchange to the OpenAI chat API and return the stripped reply text."""
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # The reply content can be None; normalise it to an empty string.
    return (response.choices[0].message.content or "").strip()


def _parse_tags(tags_raw):
    """Split a comma- or newline-separated model reply into a clean list of tags."""
    import re

    tags = []
    for piece in tags_raw.replace("\n", ",").split(","):
        tag = piece.strip(" ,")
        # Remove a leading list marker ("- ", "* ", "1. ", "2) ") if the
        # model answered with a bulleted or numbered list.
        tag = re.sub(r"^(?:[-*\u2022]|\d+[.)])\s+", "", tag).strip()
        if tag:
            tags.append(tag)
    return tags


def _complete_image_metadata(article):
    """Fill in placeholder caption/copyright fields on the article's images."""
    for img in article.get("images") or []:
        if not isinstance(img, dict):
            continue
        img.setdefault("caption", "Kein Bildtitel vorhanden")
        img.setdefault("copyright", "Unbekannt")
        img.setdefault("copyright_url", "#")


def rewrite_articles(model="gpt-4"):
    """Rewrite every stored article whose status is "Rewrite".

    Args:
        model: Name of the OpenAI chat model used for both the rewrite and
            the tag generation. Defaults to "gpt-4".

    For each matching article the text is rewritten, tags are generated and
    missing image metadata is filled with placeholders; the article then
    moves to status "Process". An article is only modified after both model
    calls succeeded, so a failure leaves it untouched in status "Rewrite"
    for the next run. All articles are saved once at the end.
    """
    articles = load_articles()

    for article in articles:
        if article.get("status") != "Rewrite":
            continue

        # Resolved once with a default so that logging in the error path
        # cannot itself fail on an article without a title.
        title = article.get("title", "Kein Titel")

        try:
            logging.info(f"✍️ Umschreiben von: {title}")
            prompt = (
                "Schreibe folgenden Artikel um und fasse ihn verständlich zusammen:"
                f"\n\n{article.get('text', '')}"
            )
            new_text = _chat_completion(
                "Du bist ein professioneller Redakteur.", prompt, model
            )
            if not new_text:
                # Never replace the article body with an empty rewrite.
                raise ValueError("Leere Antwort vom Modell")

            tag_prompt = (
                "Erstelle 3 passende, kurze Stichwörter (Tags) für diesen Artikel:"
                f"\n\n{new_text}"
            )
            tags = _parse_tags(
                _chat_completion("Du bist ein Blog-Tag-Generator.", tag_prompt, model)
            )
        except Exception as e:
            logging.error(f"❌ Fehler beim Umschreiben von '{title}': {e}")
            continue

        # Both model calls succeeded: apply all changes together.
        article["text"] = f"{title}\n\n{new_text}"
        article["status"] = "Process"
        article["tags"] = tags

        # Make sure the image metadata is complete.
        _complete_image_metadata(article)

        logging.info(f"✅ Artikel umgeschrieben: {title}")

    save_articles(articles)
    logging.info("Alle Artikel mit Status 'Rewrite' wurden verarbeitet.")
|