Initial commit: Daily News App
Internal company news aggregation and analysis system
- Automated news crawling (Digitimes, Economic Daily News 經濟日報, Commercial Times 工商時報)
- AI-powered summarization (Gemini / OpenAI / Ollama)
- Group management and subscription notifications
- Python cache files cleaned up

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
app/services/__init__.py (new file, 19 lines added)
@@ -0,0 +1,19 @@
"""
Service modules.
"""
from app.services.llm_service import generate_summary, test_llm_connection
from app.services.notification_service import send_email, send_report_notifications
from app.services.crawler_service import get_crawler, BaseCrawler
from app.services.scheduler_service import init_scheduler, shutdown_scheduler, run_daily_crawl

__all__ = [
    "generate_summary",
    "test_llm_connection",
    "send_email",
    "send_report_notifications",
    "get_crawler",
    "BaseCrawler",
    "init_scheduler",
    "shutdown_scheduler",
    "run_daily_crawl",
]
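The package re-exports each service's entry points, so callers can pull everything from app.services directly; a minimal import sketch (consumer code assumed, not part of this commit):

# Assumed consumer code, not part of this commit.
from app.services import get_crawler, generate_summary, send_report_notifications, init_scheduler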
app/services/crawler_service.py (new file, 322 lines added)
@@ -0,0 +1,322 @@
"""
News crawler service module.
Supports Digitimes, Economic Daily News (經濟日報), and Commercial Times (工商時報).
"""
import time
import re
from datetime import datetime, date
from typing import Optional, List, Dict, Any
from abc import ABC, abstractmethod
import httpx
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
import logging

from app.core.config import settings

logger = logging.getLogger(__name__)


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self):
        self.session = httpx.Client(
            timeout=30,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )
        self.delay = settings.crawl_request_delay

    def _wait(self):
        """Pause between requests."""
        time.sleep(self.delay)

    @abstractmethod
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        pass

    @abstractmethod
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        pass

    def close(self):
        """Close the HTTP session."""
        self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (paid subscription)."""

    BASE_URL = "https://www.digitimes.com.tw"

    def __init__(self, username: str, password: str):
        super().__init__()
        self.username = username
        self.password = password
        self.is_logged_in = False

    def login(self) -> bool:
        """Log in to Digitimes."""
        try:
            # Fetch the login page first (establishes session cookies)
            login_page = self.session.get(f"{self.BASE_URL}/member/login.asp")

            # Submit the login request
            login_data = {
                "uid": self.username,
                "pwd": self.password,
                "remember": "1"
            }

            response = self.session.post(
                f"{self.BASE_URL}/member/login_check.asp",
                data=login_data,
                follow_redirects=True
            )

            # Judge login success from the response
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
            return self.is_logged_in

        except Exception as e:
            logger.error("Digitimes 登入失敗", exc_info=True)
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        if not self.is_logged_in:
            self.login()

        articles = []

        for keyword in keywords:
            self._wait()

            try:
                # Search endpoint
                search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                # Parse the search results
                for item in soup.select(".search-result-item, .news-item"):
                    title_elem = item.select_one("h3 a, .title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    # Extract the publication date
                    date_elem = item.select_one(".date, .time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
                            pass

                    # Keep only today's news
                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "digitimes"
                        })

            except Exception as e:
                logger.warning(f"Digitimes 抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        if not self.is_logged_in:
            self.login()

        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            # Try several content selectors
            content_selectors = [".article-body", ".content", "#article-content", ".main-content"]

            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Remove unwanted elements
                    for unwanted in content_elem.select("script, style, .ad, .advertisement"):
                        unwanted.decompose()
                    return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("Digitimes 內容抓取失敗", exc_info=True)
            return None


class UDNCrawler(BaseCrawler):
    """Economic Daily News (經濟日報) crawler."""

    BASE_URL = "https://money.udn.com"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".story-list__news, .news-item"):
                    title_elem = item.select_one("h3 a, .story-list__text a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    date_elem = item.select_one("time, .story-list__time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "udn"
                        })

            except Exception as e:
                logger.warning(f"經濟日報抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one("#story_body_content, .article-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("經濟日報內容抓取失敗", exc_info=True)
            return None


class CTEECrawler(BaseCrawler):
    """Commercial Times (工商時報) crawler."""

    BASE_URL = "https://ctee.com.tw"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/?s={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".post-item, article.post"):
                    title_elem = item.select_one("h2 a, .post-title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")

                    date_elem = item.select_one("time, .post-date")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
                        try:
                            pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "ctee"
                        })

            except Exception as e:
                logger.warning(f"工商時報抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one(".entry-content, .post-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("工商時報內容抓取失敗", exc_info=True)
            return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given source code."""
    if source_code == "digitimes":
        return DigitimesCrawler(
            settings.digitimes_username,
            settings.digitimes_password
        )
    elif source_code == "udn":
        return UDNCrawler()
    elif source_code == "ctee":
        return CTEECrawler()
    else:
        raise ValueError(f"不支援的新聞來源: {source_code}")
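For reference, a minimal sketch of driving the factory above by hand; the source code "udn" and the keywords are illustrative values, and the real orchestration lives in scheduler_service.py below:

# Assumed ad-hoc usage, not part of this commit; values are illustrative.
from app.services.crawler_service import get_crawler

crawler = get_crawler("udn")
try:
    # Fetch today's matching articles, then pull the full text of each one
    articles = crawler.get_article_list(["半導體", "AI"])
    for item in articles:
        body = crawler.get_article_content(item["url"])
        print(item["title"], item["url"], len(body or ""))
finally:
    crawler.close()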
app/services/llm_service.py (new file, 176 lines added)
@@ -0,0 +1,176 @@
"""
LLM service module.
Supports Google Gemini, OpenAI, and Ollama.
"""
import time
from typing import Optional
import httpx

from app.core.config import settings


def get_llm_client():
    """Return an LLM client for the configured provider."""
    provider = settings.llm_provider

    if provider == "gemini":
        import google.generativeai as genai
        genai.configure(api_key=settings.gemini_api_key)
        return genai
    elif provider == "openai":
        from openai import OpenAI
        return OpenAI(api_key=settings.openai_api_key)
    elif provider == "ollama":
        return None  # Called directly via httpx

    raise ValueError(f"不支援的 LLM 提供者: {provider}")


def generate_summary(group, articles: list) -> str:
    """
    Generate the AI summary.

    Args:
        group: Group object (provides ai_background and ai_prompt)
        articles: List of news articles

    Returns:
        The combined summary text
    """
    if not articles:
        return "無相關新聞可供摘要。"

    # Assemble the article contents
    articles_text = ""
    for i, article in enumerate(articles, 1):
        articles_text += f"""
---
新聞 {i}:{article.title}
來源:{article.source.name if article.source else '未知'}
內容:{article.content[:1000] if article.content else article.summary or '無內容'}
---
"""

    # Build the prompts
    system_prompt = f"""你是一位專業的產業分析師,負責彙整每日新聞並產出精闢的綜合分析報告。

背景資訊:
{group.ai_background or '無特定背景資訊'}

摘要方向:
{group.ai_prompt or '請綜合分析以下新聞的重點、趨勢與潛在影響。'}
"""

    user_prompt = f"""請根據以下 {len(articles)} 則新聞,產出一份繁體中文的綜合分析報告:

{articles_text}

請注意:
1. 使用繁體中文
2. 整合相關主題,避免逐條列舉
3. 突出重要趨勢與影響
4. 控制在 500 字以內
"""

    provider = settings.llm_provider

    try:
        if provider == "gemini":
            import google.generativeai as genai
            genai.configure(api_key=settings.gemini_api_key)
            model = genai.GenerativeModel(settings.gemini_model or "gemini-1.5-pro")
            response = model.generate_content(
                f"{system_prompt}\n\n{user_prompt}",
                generation_config={
                    "temperature": 0.7,
                    "max_output_tokens": 2048,
                    "top_p": 0.95,
                    "top_k": 40
                }
            )
            return response.text

        elif provider == "openai":
            from openai import OpenAI
            client = OpenAI(api_key=settings.openai_api_key)
            response = client.chat.completions.create(
                model=settings.openai_model or "gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=2048,
                temperature=0.7
            )
            return response.choices[0].message.content

        elif provider == "ollama":
            response = httpx.post(
                f"{settings.ollama_endpoint}/api/generate",
                json={
                    "model": settings.ollama_model or "llama3",
                    "prompt": f"{system_prompt}\n\n{user_prompt}",
                    "stream": False,
                    "options": {
                        "temperature": 0.7,
                        "num_predict": 2048,
                        "top_p": 0.9,
                        "top_k": 40
                    }
                },
                timeout=120
            )
            return response.json().get("response", "")

        # Unsupported provider: surface it through the failure message below
        raise ValueError(f"不支援的 LLM 提供者: {provider}")

    except Exception as e:
        return f"摘要產生失敗:{str(e)}"


def test_llm_connection(provider: str, model: str) -> dict:
    """
    Test the LLM connection.

    Returns:
        {"success": bool, "response_time_ms": int, "message": str}
    """
    start_time = time.time()

    try:
        if provider == "gemini":
            import google.generativeai as genai
            genai.configure(api_key=settings.gemini_api_key)
            gen_model = genai.GenerativeModel(model)
            response = gen_model.generate_content(
                "Hello",
                generation_config={"max_output_tokens": 10}
            )
            elapsed = int((time.time() - start_time) * 1000)
            return {"success": True, "response_time_ms": elapsed}

        elif provider == "openai":
            from openai import OpenAI
            client = OpenAI(api_key=settings.openai_api_key)
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=10
            )
            elapsed = int((time.time() - start_time) * 1000)
            return {"success": True, "response_time_ms": elapsed}

        elif provider == "ollama":
            response = httpx.post(
                f"{settings.ollama_endpoint}/api/generate",
                json={"model": model, "prompt": "Hello", "stream": False},
                timeout=30
            )
            elapsed = int((time.time() - start_time) * 1000)
            if response.status_code == 200:
                return {"success": True, "response_time_ms": elapsed}
            return {"success": False, "message": f"HTTP {response.status_code}"}

        return {"success": False, "message": f"不支援的提供者: {provider}"}

    except Exception as e:
        elapsed = int((time.time() - start_time) * 1000)
        return {"success": False, "response_time_ms": elapsed, "message": str(e)}
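A quick way to exercise the two entry points above from a shell; the provider and model names are illustrative and depend on your settings:

# Assumed ad-hoc usage, not part of this commit; provider/model are illustrative.
from app.services.llm_service import test_llm_connection

result = test_llm_connection("ollama", "llama3")
print(result)  # e.g. {"success": True, "response_time_ms": 412} when the endpoint answers

# generate_summary(group, articles) expects ORM objects; scheduler_service.py below shows the real call site.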
app/services/notification_service.py (new file, 203 lines added)
@@ -0,0 +1,203 @@
"""
Notification service module.
Handles email delivery.
"""
import smtplib
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import Optional
from html import escape
from sqlalchemy.orm import Session
import logging

from app.core.config import settings
from app.models import Report, Subscription, User, NotificationLog, NotificationStatus

logger = logging.getLogger(__name__)


def send_email(to_email: str, subject: str, html_content: str) -> bool:
    """
    Send an email.

    Returns:
        Whether the send succeeded
    """
    if not settings.smtp_host:
        logger.warning("SMTP 未設定,跳過發送")
        return False

    try:
        msg = MIMEMultipart("alternative")
        msg["Subject"] = subject
        msg["From"] = f"{settings.smtp_from_name} <{settings.smtp_from_email}>"
        msg["To"] = to_email

        html_part = MIMEText(html_content, "html", "utf-8")
        msg.attach(html_part)

        with smtplib.SMTP(settings.smtp_host, settings.smtp_port) as server:
            server.starttls()
            if settings.smtp_username and settings.smtp_password:
                server.login(settings.smtp_username, settings.smtp_password)
            server.sendmail(settings.smtp_from_email, to_email, msg.as_string())

        return True

    except Exception as e:
        logger.error("Email 發送失敗", exc_info=True)
        return False


def create_report_email_content(report: Report, base_url: str = "") -> str:
    """Build the report notification email body."""
    summary = report.edited_summary or report.ai_summary or "無摘要內容"

    # Truncate the summary to the first 500 characters
    if len(summary) > 500:
        summary = summary[:500] + "..."

    # Escape HTML special characters to prevent XSS
    safe_title = escape(report.title)
    safe_group_name = escape(report.group.name)
    safe_summary = escape(summary)
    safe_base_url = escape(base_url)

    html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; }}
.container {{ max-width: 600px; margin: 0 auto; padding: 20px; }}
.header {{ background: #4a6fa5; color: white; padding: 20px; text-align: center; }}
.content {{ padding: 20px; background: #f9f9f9; }}
.summary {{ background: white; padding: 15px; border-left: 4px solid #4a6fa5; margin: 15px 0; }}
.button {{ display: inline-block; padding: 12px 24px; background: #4a6fa5; color: white; text-decoration: none; border-radius: 4px; }}
.footer {{ text-align: center; padding: 20px; color: #666; font-size: 12px; }}
</style>
</head>
<body>
<div class="container">
    <div class="header">
        <h1 style="margin:0;">每日報導</h1>
    </div>
    <div class="content">
        <h2>{safe_title}</h2>
        <p>
            <strong>群組:</strong>{safe_group_name}<br>
            <strong>日期:</strong>{report.report_date}
        </p>
        <div class="summary">
            <h3>摘要</h3>
            <p>{safe_summary}</p>
        </div>
        <p style="text-align: center; margin-top: 30px;">
            <a href="{safe_base_url}/reports/{report.id}" class="button">閱讀完整報告</a>
        </p>
    </div>
    <div class="footer">
        <p>此郵件由每日報導系統自動發送</p>
        <p>如不想收到通知,請至系統調整訂閱設定</p>
    </div>
</div>
</body>
</html>
"""

    return html


def send_report_notifications(db: Session, report: Report) -> int:
    """
    Send report notifications to subscribers.

    Returns:
        Number of successful sends
    """
    # Users subscribed to this group with email notifications enabled
    subscriptions = db.query(Subscription).filter(
        Subscription.group_id == report.group_id,
        Subscription.email_notify == True
    ).all()

    sent_count = 0

    for sub in subscriptions:
        user = db.query(User).filter(User.id == sub.user_id).first()
        if not user or not user.email or not user.is_active:
            continue

        # Create the notification log entry
        notification = NotificationLog(
            user_id=user.id,
            report_id=report.id,
            notification_type="email",
            subject=f"【每日報導】{report.title}",
            content=report.edited_summary or report.ai_summary
        )
        db.add(notification)

        # Send the email
        html_content = create_report_email_content(report)
        success = send_email(
            user.email,
            f"【每日報導】{report.title}",
            html_content
        )

        if success:
            notification.status = NotificationStatus.SENT
            notification.sent_at = datetime.utcnow()
            sent_count += 1
        else:
            notification.status = NotificationStatus.FAILED
            notification.error_message = "發送失敗"

    db.commit()
    return sent_count


def send_delay_notification(db: Session, report: Report) -> int:
    """
    Send delayed-publication notifications.

    Returns:
        Number of successful sends
    """
    subscriptions = db.query(Subscription).filter(
        Subscription.group_id == report.group_id,
        Subscription.email_notify == True
    ).all()

    sent_count = 0

    for sub in subscriptions:
        user = db.query(User).filter(User.id == sub.user_id).first()
        if not user or not user.email or not user.is_active:
            continue

        # Escape HTML special characters to prevent XSS
        safe_group_name = escape(report.group.name)
        html_content = f"""
<html>
<body>
    <h2>報告延遲通知</h2>
    <p>您訂閱的「{safe_group_name}」今日報告延遲發布,敬請稍後。</p>
    <p>造成不便,敬請見諒。</p>
</body>
</html>
"""

        success = send_email(
            user.email,
            f"【每日報導】{report.group.name} 報告延遲通知",
            html_content
        )

        if success:
            sent_count += 1

    return sent_count
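A minimal sketch of re-sending notifications for an existing report, assuming a report row is already in the database (the report id is illustrative):

# Assumed ad-hoc usage, not part of this commit; the report id is illustrative.
from app.db.session import SessionLocal
from app.models import Report
from app.services.notification_service import send_report_notifications

db = SessionLocal()
try:
    report = db.query(Report).filter(Report.id == 1).first()
    if report:
        sent = send_report_notifications(db, report)
        print(f"sent {sent} notifications")
finally:
    db.close()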
app/services/scheduler_service.py (new file, 277 lines added)
@@ -0,0 +1,277 @@
"""
Scheduler service module.
Handles the daily news crawl and report generation.
"""
from datetime import datetime, date
from typing import List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
import logging

from app.db.session import SessionLocal
from app.core.config import settings
from app.models import (
    NewsSource, NewsArticle, CrawlJob, CrawlStatus,
    Group, Keyword, ArticleGroupMatch, Report, ReportArticle, ReportStatus
)
from app.services.crawler_service import get_crawler
from app.services.llm_service import generate_summary
from app.services.notification_service import send_delay_notification

logger = logging.getLogger(__name__)


scheduler = BackgroundScheduler()


def run_daily_crawl():
    """Run the daily news crawl."""
    logger.info("開始每日新聞抓取...")

    db = SessionLocal()

    try:
        # All active news sources
        sources = db.query(NewsSource).filter(NewsSource.is_active == True).all()

        # All active keywords, deduplicated
        all_keywords = db.query(Keyword).filter(Keyword.is_active == True).all()
        keywords_list = list(set([kw.keyword for kw in all_keywords]))

        for source in sources:
            logger.info(f"抓取來源: {source.name}")

            # Create the crawl job record
            job = CrawlJob(
                source_id=source.id,
                status=CrawlStatus.RUNNING,
                scheduled_at=datetime.now(),
                started_at=datetime.now()
            )
            db.add(job)
            db.commit()

            crawler = None
            try:
                # Get the crawler for this source
                crawler = get_crawler(source.code)

                # Fetch the article list
                articles_data = crawler.get_article_list(keywords_list)

                articles_count = 0
                for article_data in articles_data:
                    # Skip articles that already exist
                    existing = db.query(NewsArticle).filter(
                        NewsArticle.source_id == source.id,
                        NewsArticle.url == article_data["url"]
                    ).first()

                    if existing:
                        continue

                    # Fetch the full text
                    content = crawler.get_article_content(article_data["url"])

                    # Store the article
                    article = NewsArticle(
                        source_id=source.id,
                        title=article_data["title"],
                        url=article_data["url"],
                        content=content,
                        published_at=article_data.get("published_at"),
                        crawled_at=datetime.now()
                    )
                    db.add(article)
                    db.commit()
                    db.refresh(article)

                    # Keyword matching
                    match_article_to_groups(db, article)

                    articles_count += 1

                # Update the job status
                job.status = CrawlStatus.COMPLETED
                job.completed_at = datetime.now()
                job.articles_count = articles_count

            except Exception as e:
                job.status = CrawlStatus.FAILED
                job.completed_at = datetime.now()
                job.error_message = str(e)
                job.retry_count += 1
                logger.error(f"抓取失敗 (來源: {source.name})", exc_info=True)
            finally:
                # Release the HTTP session even when the crawl fails
                if crawler:
                    crawler.close()

            db.commit()

        # Generate today's reports
        generate_daily_reports(db)

        logger.info("每日新聞抓取完成")

    except Exception as e:
        logger.error("抓取過程發生錯誤", exc_info=True)
    finally:
        db.close()


def match_article_to_groups(db: Session, article: NewsArticle):
    """Match an article against the active groups."""
    # All active groups and their keywords
    groups = db.query(Group).filter(Group.is_active == True).all()

    article_text = f"{article.title} {article.content or ''}"

    for group in groups:
        keywords = db.query(Keyword).filter(
            Keyword.group_id == group.id,
            Keyword.is_active == True
        ).all()

        matched_keywords = []
        for kw in keywords:
            if kw.keyword.lower() in article_text.lower():
                matched_keywords.append(kw.keyword)

        if matched_keywords:
            # Compute the match score
            score = len(matched_keywords) / len(keywords) * 100 if keywords else 0

            match = ArticleGroupMatch(
                article_id=article.id,
                group_id=group.id,
                matched_keywords=matched_keywords,
                match_score=score
            )
            db.add(match)

    db.commit()


def generate_daily_reports(db: Session):
    """Generate today's reports."""
    logger.info("產生今日報告...")

    today = date.today()
    groups = db.query(Group).filter(Group.is_active == True).all()

    for group in groups:
        # Skip groups whose report for today already exists
        existing = db.query(Report).filter(
            Report.group_id == group.id,
            Report.report_date == today
        ).first()

        if existing:
            continue

        # Articles matched to this group today
        matches = db.query(ArticleGroupMatch).filter(
            ArticleGroupMatch.group_id == group.id
        ).join(NewsArticle).filter(
            NewsArticle.crawled_at >= datetime.combine(today, datetime.min.time())
        ).all()

        if not matches:
            continue

        # Create the report
        report = Report(
            group_id=group.id,
            title=f"{group.name}日報 - {today.strftime('%Y/%m/%d')}",
            report_date=today,
            status=ReportStatus.DRAFT
        )
        db.add(report)
        db.commit()
        db.refresh(report)

        # Link the articles
        articles = []
        for match in matches:
            article = db.query(NewsArticle).filter(NewsArticle.id == match.article_id).first()
            if article:
                ra = ReportArticle(
                    report_id=report.id,
                    article_id=article.id,
                    is_included=True
                )
                db.add(ra)
                articles.append(article)

        db.commit()

        # Generate the AI summary
        if articles:
            summary = generate_summary(group, articles)
            report.ai_summary = summary
            report.status = ReportStatus.PENDING
            db.commit()

        logger.info(f"已產生報告: {report.title} ({len(articles)} 篇文章)")


def check_publish_deadline():
    """Check the publication deadline."""
    db = SessionLocal()

    try:
        today = date.today()

        # Reports for today that have not been published yet
        pending_reports = db.query(Report).filter(
            Report.report_date == today,
            Report.status.in_([ReportStatus.DRAFT, ReportStatus.PENDING])
        ).all()

        for report in pending_reports:
            report.status = ReportStatus.DELAYED
            send_delay_notification(db, report)

        db.commit()

    finally:
        db.close()


def init_scheduler():
    """Initialize the scheduler."""
    # Parse the crawl schedule time
    crawl_time = settings.crawl_schedule_time.split(":")
    crawl_hour = int(crawl_time[0])
    crawl_minute = int(crawl_time[1])

    deadline_time = "09:00".split(":")  # could be read from settings
    deadline_hour = int(deadline_time[0])
    deadline_minute = int(deadline_time[1])

    # Daily crawl job
    scheduler.add_job(
        run_daily_crawl,
        CronTrigger(hour=crawl_hour, minute=crawl_minute),
        id="daily_crawl",
        replace_existing=True
    )

    # Publish-deadline check
    scheduler.add_job(
        check_publish_deadline,
        CronTrigger(hour=deadline_hour, minute=deadline_minute),
        id="check_deadline",
        replace_existing=True
    )

    # Start the scheduler
    if not scheduler.running:
        scheduler.start()

    logger.info(f"排程器已啟動: 每日 {settings.crawl_schedule_time} 抓取")


def shutdown_scheduler():
    """Shut down the scheduler."""
    if scheduler.running:
        scheduler.shutdown()
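The scheduler is intended to start and stop with the application; a sketch of wiring init_scheduler and shutdown_scheduler into a FastAPI lifespan handler (this wiring is assumed and not part of this commit):

# Assumed application wiring, not part of this commit.
from contextlib import asynccontextmanager
from fastapi import FastAPI

from app.services.scheduler_service import init_scheduler, shutdown_scheduler

@asynccontextmanager
async def lifespan(app: FastAPI):
    init_scheduler()        # registers the daily crawl and deadline-check cron jobs
    yield
    shutdown_scheduler()    # stops the APScheduler background thread

app = FastAPI(lifespan=lifespan)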