Initial commit: Daily News App

Internal corporate news aggregation and analysis system
- Automated news crawling (Digitimes, 經濟日報, 工商時報)
- AI-powered summaries (Gemini/OpenAI/Ollama)
- Group management and subscription notifications
- Python cache files cleaned up

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
donald
2025-12-03 23:53:24 +08:00
commit db0f0bbfe7
50 changed files with 11883 additions and 0 deletions

app/services/__init__.py Normal file

@@ -0,0 +1,19 @@
"""
Service modules
"""
from app.services.llm_service import generate_summary, test_llm_connection
from app.services.notification_service import send_email, send_report_notifications
from app.services.crawler_service import get_crawler, BaseCrawler
from app.services.scheduler_service import init_scheduler, shutdown_scheduler, run_daily_crawl
__all__ = [
"generate_summary",
"test_llm_connection",
"send_email",
"send_report_notifications",
"get_crawler",
"BaseCrawler",
"init_scheduler",
"shutdown_scheduler",
"run_daily_crawl"
]
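
Since the package re-exports everything the rest of the app needs, a one-off manual run only has to touch app.services. A minimal sketch, assuming it is executed with the same environment as the app (not part of this commit):

# Hypothetical maintenance script; uses only the names exported above.
from app.services import run_daily_crawl

if __name__ == "__main__":
    # Runs one crawl + report-generation cycle without waiting for the scheduler.
    run_daily_crawl()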

app/services/crawler_service.py Normal file

@@ -0,0 +1,322 @@
"""
News crawler service module
Supports Digitimes, 經濟日報 (UDN), and 工商時報 (CTEE)
"""
import time
import re
from datetime import datetime, date
from typing import Optional, List, Dict, Any
from abc import ABC, abstractmethod
import httpx
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
import logging
from app.core.config import settings
logger = logging.getLogger(__name__)
class BaseCrawler(ABC):
"""爬蟲基礎類別"""
def __init__(self):
self.session = httpx.Client(
timeout=30,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
)
self.delay = settings.crawl_request_delay
def _wait(self):
"""請求間隔"""
time.sleep(self.delay)
@abstractmethod
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
pass
@abstractmethod
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
pass
def close(self):
"""關閉連線"""
self.session.close()
class DigitimesCrawler(BaseCrawler):
"""Digitimes 爬蟲(付費訂閱)"""
BASE_URL = "https://www.digitimes.com.tw"
def __init__(self, username: str, password: str):
super().__init__()
self.username = username
self.password = password
self.is_logged_in = False
def login(self) -> bool:
"""登入 Digitimes"""
try:
# Fetch the login page
login_page = self.session.get(f"{self.BASE_URL}/member/login.asp")
# Submit the login request
login_data = {
"uid": self.username,
"pwd": self.password,
"remember": "1"
}
response = self.session.post(
f"{self.BASE_URL}/member/login_check.asp",
data=login_data,
follow_redirects=True
)
# Judge login success from the response; a 200 status alone is not a reliable signal
self.is_logged_in = "logout" in response.text.lower()
return self.is_logged_in
except Exception as e:
logger.error("Digitimes 登入失敗", exc_info=True)
return False
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
if not self.is_logged_in:
self.login()
articles = []
for keyword in keywords:
self._wait()
try:
# Search endpoint
search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
# Parse the search results
for item in soup.select(".search-result-item, .news-item"):
title_elem = item.select_one("h3 a, .title a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
if not url.startswith("http"):
url = f"{self.BASE_URL}{url}"
# Extract the publication date
date_elem = item.select_one(".date, .time")
pub_date = None
if date_elem:
date_text = date_elem.get_text(strip=True)
try:
pub_date = datetime.strptime(date_text, "%Y/%m/%d")
except ValueError:
pass
# Keep only today's articles
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "digitimes"
})
except Exception as e:
logger.warning(f"Digitimes 抓取失敗 (關鍵字: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
if not self.is_logged_in:
self.login()
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
# Try multiple content selectors
content_selectors = [".article-body", ".content", "#article-content", ".main-content"]
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
# Strip unwanted elements
for unwanted in content_elem.select("script, style, .ad, .advertisement"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
logger.warning("Digitimes 內容抓取失敗", exc_info=True)
return None
class UDNCrawler(BaseCrawler):
"""經濟日報爬蟲"""
BASE_URL = "https://money.udn.com"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
articles = []
for keyword in keywords:
self._wait()
try:
search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".story-list__news, .news-item"):
title_elem = item.select_one("h3 a, .story-list__text a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
if not url.startswith("http"):
url = f"{self.BASE_URL}{url}"
date_elem = item.select_one("time, .story-list__time")
pub_date = None
if date_elem:
date_text = date_elem.get_text(strip=True)
try:
pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
except ValueError:
pass
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "udn"
})
except Exception as e:
logger.warning(f"經濟日報抓取失敗 (關鍵字: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
content_elem = soup.select_one("#story_body_content, .article-content")
if content_elem:
for unwanted in content_elem.select("script, style, .ad"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
logger.warning("經濟日報內容抓取失敗", exc_info=True)
return None
class CTEECrawler(BaseCrawler):
"""工商時報爬蟲"""
BASE_URL = "https://ctee.com.tw"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
articles = []
for keyword in keywords:
self._wait()
try:
search_url = f"{self.BASE_URL}/?s={keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".post-item, article.post"):
title_elem = item.select_one("h2 a, .post-title a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
date_elem = item.select_one("time, .post-date")
pub_date = None
if date_elem:
date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
try:
pub_date = datetime.fromisoformat(date_text[:10])
except ValueError:
pass
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "ctee"
})
except Exception as e:
logger.warning(f"工商時報抓取失敗 (關鍵字: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
content_elem = soup.select_one(".entry-content, .post-content")
if content_elem:
for unwanted in content_elem.select("script, style, .ad"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
logger.warning("工商時報內容抓取失敗", exc_info=True)
return None
def get_crawler(source_code: str) -> BaseCrawler:
"""取得對應的爬蟲實例"""
if source_code == "digitimes":
return DigitimesCrawler(
settings.digitimes_username,
settings.digitimes_password
)
elif source_code == "udn":
return UDNCrawler()
elif source_code == "ctee":
return CTEECrawler()
else:
raise ValueError(f"不支援的新聞來源: {source_code}")

app/services/llm_service.py Normal file

@@ -0,0 +1,176 @@
"""
LLM service module
Supports Google Gemini, OpenAI, and Ollama
"""
import time
from typing import Optional
import httpx
from app.core.config import settings
def get_llm_client():
"""取得 LLM 客戶端"""
provider = settings.llm_provider
if provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=settings.gemini_api_key)
return genai
elif provider == "openai":
from openai import OpenAI
return OpenAI(api_key=settings.openai_api_key)
elif provider == "ollama":
return None  # Ollama is called directly via httpx
raise ValueError(f"不支援的 LLM 提供者: {provider}")
def generate_summary(group, articles: list) -> str:
"""
Generate an AI summary
Args:
group: the group object (includes ai_background and ai_prompt)
articles: list of news articles
Returns:
the combined summary text
"""
if not articles:
return "無相關新聞可供摘要。"
# Concatenate the article contents
articles_text = ""
for i, article in enumerate(articles, 1):
articles_text += f"""
---
新聞 {i}：{article.title}
來源:{article.source.name if article.source else '未知'}
內容:{article.content[:1000] if article.content else article.summary or '無內容'}
---
"""
# Build the prompt
system_prompt = f"""你是一位專業的產業分析師,負責彙整每日新聞並產出精闘的綜合分析報告。
背景資訊:
{group.ai_background or '無特定背景資訊'}
摘要方向:
{group.ai_prompt or '請綜合分析以下新聞的重點、趨勢與潛在影響。'}
"""
user_prompt = f"""請根據以下 {len(articles)} 則新聞,產出一份繁體中文的綜合分析報告:
{articles_text}
請注意:
1. 使用繁體中文
2. 整合相關主題,避免逐條列舉
3. 突出重要趨勢與影響
4. 控制在 500 字以內
"""
provider = settings.llm_provider
try:
if provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=settings.gemini_api_key)
model = genai.GenerativeModel(settings.gemini_model or "gemini-1.5-pro")
response = model.generate_content(
f"{system_prompt}\n\n{user_prompt}",
generation_config={
"temperature": 0.7,
"max_output_tokens": 2048,
"top_p": 0.95,
"top_k": 40
}
)
return response.text
elif provider == "openai":
from openai import OpenAI
client = OpenAI(api_key=settings.openai_api_key)
response = client.chat.completions.create(
model=settings.openai_model or "gpt-4o",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
max_tokens=2048,
temperature=0.7
)
return response.choices[0].message.content
elif provider == "ollama":
response = httpx.post(
f"{settings.ollama_endpoint}/api/generate",
json={
"model": settings.ollama_model or "llama3",
"prompt": f"{system_prompt}\n\n{user_prompt}",
"stream": False,
"options": {
"temperature": 0.7,
"num_predict": 2048,
"top_p": 0.9,
"top_k": 40
}
},
timeout=120
)
return response.json().get("response", "")
except Exception as e:
return f"摘要產生失敗:{str(e)}"
def test_llm_connection(provider: str, model: str) -> dict:
"""
Test the LLM connection
Returns:
{"success": bool, "response_time_ms": int, "message": str}
"""
start_time = time.time()
try:
if provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=settings.gemini_api_key)
gen_model = genai.GenerativeModel(model)
response = gen_model.generate_content(
"Hello",
generation_config={"max_output_tokens": 10}
)
elapsed = int((time.time() - start_time) * 1000)
return {"success": True, "response_time_ms": elapsed}
elif provider == "openai":
from openai import OpenAI
client = OpenAI(api_key=settings.openai_api_key)
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": "Hello"}],
max_tokens=10
)
elapsed = int((time.time() - start_time) * 1000)
return {"success": True, "response_time_ms": elapsed}
elif provider == "ollama":
response = httpx.post(
f"{settings.ollama_endpoint}/api/generate",
json={"model": model, "prompt": "Hello", "stream": False},
timeout=30
)
elapsed = int((time.time() - start_time) * 1000)
if response.status_code == 200:
return {"success": True, "response_time_ms": elapsed}
return {"success": False, "message": f"HTTP {response.status_code}"}
return {"success": False, "message": f"不支援的提供者: {provider}"}
except Exception as e:
elapsed = int((time.time() - start_time) * 1000)
return {"success": False, "response_time_ms": elapsed, "message": str(e)}

app/services/notification_service.py Normal file

@@ -0,0 +1,203 @@
"""
Notification service module
Handles email delivery
"""
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import Optional
from html import escape
from sqlalchemy.orm import Session
import logging
from app.core.config import settings
from app.models import Report, Subscription, User, NotificationLog, NotificationStatus
logger = logging.getLogger(__name__)
def send_email(to_email: str, subject: str, html_content: str) -> bool:
"""
Send an email
Returns:
True if the message was sent successfully
"""
if not settings.smtp_host:
logger.warning("SMTP 未設定,跳過發送")
return False
try:
msg = MIMEMultipart("alternative")
msg["Subject"] = subject
msg["From"] = f"{settings.smtp_from_name} <{settings.smtp_from_email}>"
msg["To"] = to_email
html_part = MIMEText(html_content, "html", "utf-8")
msg.attach(html_part)
with smtplib.SMTP(settings.smtp_host, settings.smtp_port) as server:
server.starttls()
if settings.smtp_username and settings.smtp_password:
server.login(settings.smtp_username, settings.smtp_password)
server.sendmail(settings.smtp_from_email, to_email, msg.as_string())
return True
except Exception as e:
logger.error("Email 發送失敗", exc_info=True)
return False
def create_report_email_content(report: Report, base_url: str = "") -> str:
"""建立報告通知 Email 內容"""
summary = report.edited_summary or report.ai_summary or "無摘要內容"
# Keep only the first 500 characters of the summary
if len(summary) > 500:
summary = summary[:500] + "..."
# Escape HTML special characters to prevent XSS
safe_title = escape(report.title)
safe_group_name = escape(report.group.name)
safe_summary = escape(summary)
safe_base_url = escape(base_url)
html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; }}
.container {{ max-width: 600px; margin: 0 auto; padding: 20px; }}
.header {{ background: #4a6fa5; color: white; padding: 20px; text-align: center; }}
.content {{ padding: 20px; background: #f9f9f9; }}
.summary {{ background: white; padding: 15px; border-left: 4px solid #4a6fa5; margin: 15px 0; }}
.button {{ display: inline-block; padding: 12px 24px; background: #4a6fa5; color: white; text-decoration: none; border-radius: 4px; }}
.footer {{ text-align: center; padding: 20px; color: #666; font-size: 12px; }}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1 style="margin:0;">每日報導</h1>
</div>
<div class="content">
<h2>{safe_title}</h2>
<p>
<strong>群組:</strong>{safe_group_name}<br>
<strong>日期:</strong>{report.report_date}
</p>
<div class="summary">
<h3>摘要</h3>
<p>{safe_summary}</p>
</div>
<p style="text-align: center; margin-top: 30px;">
<a href="{safe_base_url}/reports/{report.id}" class="button">閱讀完整報告</a>
</p>
</div>
<div class="footer">
<p>此郵件由每日報導系統自動發送</p>
<p>如不想收到通知,請至系統調整訂閱設定</p>
</div>
</div>
</body>
</html>
"""
return html
def send_report_notifications(db: Session, report: Report) -> int:
"""
Send report notifications to subscribers
Returns:
number of emails sent successfully
"""
# Users subscribed to this group with email notifications enabled
subscriptions = db.query(Subscription).filter(
Subscription.group_id == report.group_id,
Subscription.email_notify == True
).all()
sent_count = 0
for sub in subscriptions:
user = db.query(User).filter(User.id == sub.user_id).first()
if not user or not user.email or not user.is_active:
continue
# Create a notification log entry
notification = NotificationLog(
user_id=user.id,
report_id=report.id,
notification_type="email",
subject=f"【每日報導】{report.title}",
content=report.edited_summary or report.ai_summary
)
db.add(notification)
# Send the email
html_content = create_report_email_content(report)
success = send_email(
user.email,
f"【每日報導】{report.title}",
html_content
)
if success:
notification.status = NotificationStatus.SENT
from datetime import datetime
notification.sent_at = datetime.utcnow()
sent_count += 1
else:
notification.status = NotificationStatus.FAILED
notification.error_message = "發送失敗"
db.commit()
return sent_count
def send_delay_notification(db: Session, report: Report) -> int:
"""
Send a delayed-publication notification
Returns:
number of emails sent successfully
"""
subscriptions = db.query(Subscription).filter(
Subscription.group_id == report.group_id,
Subscription.email_notify == True
).all()
sent_count = 0
for sub in subscriptions:
user = db.query(User).filter(User.id == sub.user_id).first()
if not user or not user.email or not user.is_active:
continue
# Escape HTML special characters to prevent XSS
safe_group_name = escape(report.group.name)
html_content = f"""
<html>
<body>
<h2>報告延遲通知</h2>
<p>您訂閱的「{safe_group_name}」今日報告延遲發布,敬請稍後。</p>
<p>造成不便,敬請見諒。</p>
</body>
</html>
"""
success = send_email(
user.email,
f"【每日報導】{report.group.name} 報告延遲通知",
html_content
)
if success:
sent_count += 1
return sent_count
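
send_report_notifications expects an open Session and a persisted Report with its group relationship available. A minimal, hypothetical re-send helper (SessionLocal and Report are the same imports used by the scheduler module below):

from app.db.session import SessionLocal
from app.models import Report
from app.services.notification_service import send_report_notifications

def resend_report_emails(report_id: int) -> int:
    """Hypothetical helper: re-send subscriber emails for a single report."""
    db = SessionLocal()
    try:
        report = db.query(Report).filter(Report.id == report_id).first()
        if report is None:
            return 0
        return send_report_notifications(db, report)
    finally:
        db.close()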

app/services/scheduler_service.py Normal file

@@ -0,0 +1,277 @@
"""
Scheduler service module
Handles the daily news crawl and report generation
"""
from datetime import datetime, date
from typing import List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
import logging
from app.db.session import SessionLocal
from app.core.config import settings
from app.models import (
NewsSource, NewsArticle, CrawlJob, CrawlStatus,
Group, Keyword, ArticleGroupMatch, Report, ReportArticle, ReportStatus
)
from app.services.crawler_service import get_crawler
from app.services.llm_service import generate_summary
from app.services.notification_service import send_delay_notification
logger = logging.getLogger(__name__)
scheduler = BackgroundScheduler()
def run_daily_crawl():
"""執行每日新聞抓取"""
logger.info("開始每日新聞抓取...")
db = SessionLocal()
try:
# All active news sources
sources = db.query(NewsSource).filter(NewsSource.is_active == True).all()
# All active keywords (deduplicated)
all_keywords = db.query(Keyword).filter(Keyword.is_active == True).all()
keywords_list = list(set([kw.keyword for kw in all_keywords]))
for source in sources:
logger.info(f"抓取來源: {source.name}")
# Create a crawl job record
job = CrawlJob(
source_id=source.id,
status=CrawlStatus.RUNNING,
scheduled_at=datetime.now(),
started_at=datetime.now()
)
db.add(job)
db.commit()
try:
# Get the crawler for this source
crawler = get_crawler(source.code)
# Fetch the article list
articles_data = crawler.get_article_list(keywords_list)
articles_count = 0
for article_data in articles_data:
# Skip articles that already exist
existing = db.query(NewsArticle).filter(
NewsArticle.source_id == source.id,
NewsArticle.url == article_data["url"]
).first()
if existing:
continue
# Fetch the full text
content = crawler.get_article_content(article_data["url"])
# Save the article
article = NewsArticle(
source_id=source.id,
title=article_data["title"],
url=article_data["url"],
content=content,
published_at=article_data.get("published_at"),
crawled_at=datetime.now()
)
db.add(article)
db.commit()
db.refresh(article)
# Match the article against group keywords
match_article_to_groups(db, article)
articles_count += 1
# Update the job status
job.status = CrawlStatus.COMPLETED
job.completed_at = datetime.now()
job.articles_count = articles_count
crawler.close()
except Exception as e:
job.status = CrawlStatus.FAILED
job.completed_at = datetime.now()
job.error_message = str(e)
job.retry_count += 1
logger.error(f"抓取失敗 (來源: {source.name})", exc_info=True)
db.commit()
# Generate today's reports
generate_daily_reports(db)
logger.info("每日新聞抓取完成")
except Exception as e:
logger.error("抓取過程發生錯誤", exc_info=True)
finally:
db.close()
def match_article_to_groups(db: Session, article: NewsArticle):
"""將文章匹配到群組"""
# All active groups and their keywords
groups = db.query(Group).filter(Group.is_active == True).all()
article_text = f"{article.title} {article.content or ''}"
for group in groups:
keywords = db.query(Keyword).filter(
Keyword.group_id == group.id,
Keyword.is_active == True
).all()
matched_keywords = []
for kw in keywords:
if kw.keyword.lower() in article_text.lower():
matched_keywords.append(kw.keyword)
if matched_keywords:
# Match score: percentage of the group's keywords that were hit
score = len(matched_keywords) / len(keywords) * 100 if keywords else 0
match = ArticleGroupMatch(
article_id=article.id,
group_id=group.id,
matched_keywords=matched_keywords,
match_score=score
)
db.add(match)
db.commit()
def generate_daily_reports(db: Session):
"""產生今日報告"""
logger.info("產生今日報告...")
today = date.today()
groups = db.query(Group).filter(Group.is_active == True).all()
for group in groups:
# Skip groups that already have a report for today
existing = db.query(Report).filter(
Report.group_id == group.id,
Report.report_date == today
).first()
if existing:
continue
# Articles matched to this group and crawled today
matches = db.query(ArticleGroupMatch).filter(
ArticleGroupMatch.group_id == group.id
).join(NewsArticle).filter(
NewsArticle.crawled_at >= datetime.combine(today, datetime.min.time())
).all()
if not matches:
continue
# Create the report
report = Report(
group_id=group.id,
title=f"{group.name}日報 - {today.strftime('%Y/%m/%d')}",
report_date=today,
status=ReportStatus.DRAFT
)
db.add(report)
db.commit()
db.refresh(report)
# Attach the matched articles
articles = []
for match in matches:
article = db.query(NewsArticle).filter(NewsArticle.id == match.article_id).first()
if article:
ra = ReportArticle(
report_id=report.id,
article_id=article.id,
is_included=True
)
db.add(ra)
articles.append(article)
db.commit()
# Generate the AI summary
if articles:
summary = generate_summary(group, articles)
report.ai_summary = summary
report.status = ReportStatus.PENDING
db.commit()
logger.info(f"已產生報告: {report.title} ({len(articles)} 篇文章)")
def check_publish_deadline():
"""檢查發布截止時間"""
db = SessionLocal()
try:
today = date.today()
# Today's reports that have not been published yet
pending_reports = db.query(Report).filter(
Report.report_date == today,
Report.status.in_([ReportStatus.DRAFT, ReportStatus.PENDING])
).all()
for report in pending_reports:
report.status = ReportStatus.DELAYED
send_delay_notification(db, report)
db.commit()
finally:
db.close()
def init_scheduler():
"""初始化排程器"""
# Parse the crawl schedule time (HH:MM)
crawl_time = settings.crawl_schedule_time.split(":")
crawl_hour = int(crawl_time[0])
crawl_minute = int(crawl_time[1])
deadline_time = "09:00".split(":") # 可從設定讀取
deadline_hour = int(deadline_time[0])
deadline_minute = int(deadline_time[1])
# Daily crawl job
scheduler.add_job(
run_daily_crawl,
CronTrigger(hour=crawl_hour, minute=crawl_minute),
id="daily_crawl",
replace_existing=True
)
# Publish-deadline check
scheduler.add_job(
check_publish_deadline,
CronTrigger(hour=deadline_hour, minute=deadline_minute),
id="check_deadline",
replace_existing=True
)
# Start the scheduler
if not scheduler.running:
scheduler.start()
logger.info(f"排程器已啟動: 每日 {settings.crawl_schedule_time} 抓取")
def shutdown_scheduler():
"""關閉排程器"""
if scheduler.running:
scheduler.shutdown()
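
init_scheduler and shutdown_scheduler are meant to be called once at application startup and shutdown. A sketch of that wiring, assuming a FastAPI entry point (FastAPI itself is not among the files shown in this commit):

from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.services import init_scheduler, shutdown_scheduler

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Registers the daily crawl and deadline-check cron jobs.
    init_scheduler()
    yield
    # Stops APScheduler cleanly so no jobs are left running.
    shutdown_scheduler()

app = FastAPI(lifespan=lifespan)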