Initial commit: Daily News App
Internal company news aggregation and analysis system
- Automated news crawling (Digitimes, Economic Daily News 經濟日報, Commercial Times 工商時報)
- AI-powered summarization (Gemini / OpenAI / Ollama)
- Group management and subscription notifications
- Python cache files cleaned up

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
app/services/__init__.py (new file, 19 lines added)
@@ -0,0 +1,19 @@
"""
Service modules.
"""
from app.services.llm_service import generate_summary, test_llm_connection
from app.services.notification_service import send_email, send_report_notifications
from app.services.crawler_service import get_crawler, BaseCrawler
from app.services.scheduler_service import init_scheduler, shutdown_scheduler, run_daily_crawl

__all__ = [
    "generate_summary",
    "test_llm_connection",
    "send_email",
    "send_report_notifications",
    "get_crawler",
    "BaseCrawler",
    "init_scheduler",
    "shutdown_scheduler",
    "run_daily_crawl",
]
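The package re-exports each service's entry points, so callers can pull everything from app.services directly; a minimal import sketch (consumer code assumed, not part of this commit):

# Assumed consumer code, not part of this commit.
from app.services import get_crawler, generate_summary, send_report_notifications, init_scheduler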
app/services/crawler_service.py (new file, 322 lines added)
@@ -0,0 +1,322 @@
"""
News crawler service module.
Supports Digitimes, Economic Daily News (經濟日報), and Commercial Times (工商時報).
"""
import time
import re
from datetime import datetime, date
from typing import Optional, List, Dict, Any
from abc import ABC, abstractmethod
import httpx
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
import logging

from app.core.config import settings

logger = logging.getLogger(__name__)


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self):
        self.session = httpx.Client(
            timeout=30,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )
        self.delay = settings.crawl_request_delay

    def _wait(self):
        """Pause between requests."""
        time.sleep(self.delay)

    @abstractmethod
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        pass

    @abstractmethod
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        pass

    def close(self):
        """Close the HTTP session."""
        self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (paid subscription)."""

    BASE_URL = "https://www.digitimes.com.tw"

    def __init__(self, username: str, password: str):
        super().__init__()
        self.username = username
        self.password = password
        self.is_logged_in = False

    def login(self) -> bool:
        """Log in to Digitimes."""
        try:
            # Fetch the login page first (establishes session cookies)
            login_page = self.session.get(f"{self.BASE_URL}/member/login.asp")

            # Submit the login request
            login_data = {
                "uid": self.username,
                "pwd": self.password,
                "remember": "1"
            }

            response = self.session.post(
                f"{self.BASE_URL}/member/login_check.asp",
                data=login_data,
                follow_redirects=True
            )

            # Judge login success from the response
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
            return self.is_logged_in

        except Exception as e:
            logger.error("Digitimes 登入失敗", exc_info=True)
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        if not self.is_logged_in:
            self.login()

        articles = []

        for keyword in keywords:
            self._wait()

            try:
                # Search endpoint
                search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                # Parse the search results
                for item in soup.select(".search-result-item, .news-item"):
                    title_elem = item.select_one("h3 a, .title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    # Extract the publication date
                    date_elem = item.select_one(".date, .time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
                            pass

                    # Keep only today's news
                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "digitimes"
                        })

            except Exception as e:
                logger.warning(f"Digitimes 抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        if not self.is_logged_in:
            self.login()

        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            # Try several content selectors
            content_selectors = [".article-body", ".content", "#article-content", ".main-content"]

            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Remove unwanted elements
                    for unwanted in content_elem.select("script, style, .ad, .advertisement"):
                        unwanted.decompose()
                    return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("Digitimes 內容抓取失敗", exc_info=True)
            return None


class UDNCrawler(BaseCrawler):
    """Economic Daily News (經濟日報) crawler."""

    BASE_URL = "https://money.udn.com"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".story-list__news, .news-item"):
                    title_elem = item.select_one("h3 a, .story-list__text a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    date_elem = item.select_one("time, .story-list__time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "udn"
                        })

            except Exception as e:
                logger.warning(f"經濟日報抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one("#story_body_content, .article-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("經濟日報內容抓取失敗", exc_info=True)
            return None


class CTEECrawler(BaseCrawler):
    """Commercial Times (工商時報) crawler."""

    BASE_URL = "https://ctee.com.tw"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the article list."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/?s={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".post-item, article.post"):
                    title_elem = item.select_one("h2 a, .post-title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")

                    date_elem = item.select_one("time, .post-date")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
                        try:
                            pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "ctee"
                        })

            except Exception as e:
                logger.warning(f"工商時報抓取失敗 (關鍵字: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the article content."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one(".entry-content, .post-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception as e:
            logger.warning("工商時報內容抓取失敗", exc_info=True)
            return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given source code."""
    if source_code == "digitimes":
        return DigitimesCrawler(
            settings.digitimes_username,
            settings.digitimes_password
        )
    elif source_code == "udn":
        return UDNCrawler()
    elif source_code == "ctee":
        return CTEECrawler()
    else:
        raise ValueError(f"不支援的新聞來源: {source_code}")
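For reference, a minimal sketch of driving the factory above by hand; the source code "udn" and the keywords are illustrative values, and the real orchestration lives in scheduler_service.py below:

# Assumed ad-hoc usage, not part of this commit; values are illustrative.
from app.services.crawler_service import get_crawler

crawler = get_crawler("udn")
try:
    # Fetch today's matching articles, then pull the full text of each one
    articles = crawler.get_article_list(["半導體", "AI"])
    for item in articles:
        body = crawler.get_article_content(item["url"])
        print(item["title"], item["url"], len(body or ""))
finally:
    crawler.close()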
app/services/llm_service.py (new file, 176 lines added)
@@ -0,0 +1,176 @@
"""
LLM service module.
Supports Google Gemini, OpenAI, and Ollama.
"""
import time
from typing import Optional
import httpx

from app.core.config import settings


def get_llm_client():
    """Return an LLM client for the configured provider."""
    provider = settings.llm_provider

    if provider == "gemini":
        import google.generativeai as genai
        genai.configure(api_key=settings.gemini_api_key)
        return genai
    elif provider == "openai":
        from openai import OpenAI
        return OpenAI(api_key=settings.openai_api_key)
    elif provider == "ollama":
        return None  # Called directly via httpx

    raise ValueError(f"不支援的 LLM 提供者: {provider}")


def generate_summary(group, articles: list) -> str:
    """
    Generate the AI summary.

    Args:
        group: Group object (provides ai_background and ai_prompt)
        articles: List of news articles

    Returns:
        The combined summary text
    """
    if not articles:
        return "無相關新聞可供摘要。"

    # Assemble the article contents
    articles_text = ""
    for i, article in enumerate(articles, 1):
        articles_text += f"""
---
新聞 {i}:{article.title}
來源:{article.source.name if article.source else '未知'}
內容:{article.content[:1000] if article.content else article.summary or '無內容'}
---
"""

    # Build the prompts
    system_prompt = f"""你是一位專業的產業分析師,負責彙整每日新聞並產出精闢的綜合分析報告。

背景資訊:
{group.ai_background or '無特定背景資訊'}

摘要方向:
{group.ai_prompt or '請綜合分析以下新聞的重點、趨勢與潛在影響。'}
"""

    user_prompt = f"""請根據以下 {len(articles)} 則新聞,產出一份繁體中文的綜合分析報告:

{articles_text}

請注意:
1. 使用繁體中文
2. 整合相關主題,避免逐條列舉
3. 突出重要趨勢與影響
4. 控制在 500 字以內
"""

    provider = settings.llm_provider

    try:
        if provider == "gemini":
            import google.generativeai as genai
            genai.configure(api_key=settings.gemini_api_key)
            model = genai.GenerativeModel(settings.gemini_model or "gemini-1.5-pro")
            response = model.generate_content(
                f"{system_prompt}\n\n{user_prompt}",
                generation_config={
                    "temperature": 0.7,
                    "max_output_tokens": 2048,
                    "top_p": 0.95,
                    "top_k": 40
                }
            )
            return response.text

        elif provider == "openai":
            from openai import OpenAI
            client = OpenAI(api_key=settings.openai_api_key)
            response = client.chat.completions.create(
                model=settings.openai_model or "gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=2048,
                temperature=0.7
            )
            return response.choices[0].message.content

        elif provider == "ollama":
            response = httpx.post(
                f"{settings.ollama_endpoint}/api/generate",
                json={
                    "model": settings.ollama_model or "llama3",
                    "prompt": f"{system_prompt}\n\n{user_prompt}",
                    "stream": False,
                    "options": {
                        "temperature": 0.7,
                        "num_predict": 2048,
                        "top_p": 0.9,
                        "top_k": 40
                    }
                },
                timeout=120
            )
            return response.json().get("response", "")

        # Unsupported provider: surface it through the failure message below
        raise ValueError(f"不支援的 LLM 提供者: {provider}")

    except Exception as e:
        return f"摘要產生失敗:{str(e)}"


def test_llm_connection(provider: str, model: str) -> dict:
    """
    Test the LLM connection.

    Returns:
        {"success": bool, "response_time_ms": int, "message": str}
    """
    start_time = time.time()

    try:
        if provider == "gemini":
            import google.generativeai as genai
            genai.configure(api_key=settings.gemini_api_key)
            gen_model = genai.GenerativeModel(model)
            response = gen_model.generate_content(
                "Hello",
                generation_config={"max_output_tokens": 10}
            )
            elapsed = int((time.time() - start_time) * 1000)
            return {"success": True, "response_time_ms": elapsed}

        elif provider == "openai":
            from openai import OpenAI
            client = OpenAI(api_key=settings.openai_api_key)
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=10
            )
            elapsed = int((time.time() - start_time) * 1000)
            return {"success": True, "response_time_ms": elapsed}

        elif provider == "ollama":
            response = httpx.post(
                f"{settings.ollama_endpoint}/api/generate",
                json={"model": model, "prompt": "Hello", "stream": False},
                timeout=30
            )
            elapsed = int((time.time() - start_time) * 1000)
            if response.status_code == 200:
                return {"success": True, "response_time_ms": elapsed}
            return {"success": False, "message": f"HTTP {response.status_code}"}

        return {"success": False, "message": f"不支援的提供者: {provider}"}

    except Exception as e:
        elapsed = int((time.time() - start_time) * 1000)
        return {"success": False, "response_time_ms": elapsed, "message": str(e)}
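A quick way to exercise the two entry points above from a shell; the provider and model names are illustrative and depend on your settings:

# Assumed ad-hoc usage, not part of this commit; provider/model are illustrative.
from app.services.llm_service import test_llm_connection

result = test_llm_connection("ollama", "llama3")
print(result)  # e.g. {"success": True, "response_time_ms": 412} when the endpoint answers

# generate_summary(group, articles) expects ORM objects; scheduler_service.py below shows the real call site.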
app/services/notification_service.py (new file, 203 lines added)
@@ -0,0 +1,203 @@
"""
Notification service module.
Handles email delivery.
"""
import smtplib
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import Optional
from html import escape
from sqlalchemy.orm import Session
import logging

from app.core.config import settings
from app.models import Report, Subscription, User, NotificationLog, NotificationStatus

logger = logging.getLogger(__name__)


def send_email(to_email: str, subject: str, html_content: str) -> bool:
    """
    Send an email.

    Returns:
        Whether the send succeeded
    """
    if not settings.smtp_host:
        logger.warning("SMTP 未設定,跳過發送")
        return False

    try:
        msg = MIMEMultipart("alternative")
        msg["Subject"] = subject
        msg["From"] = f"{settings.smtp_from_name} <{settings.smtp_from_email}>"
        msg["To"] = to_email

        html_part = MIMEText(html_content, "html", "utf-8")
        msg.attach(html_part)

        with smtplib.SMTP(settings.smtp_host, settings.smtp_port) as server:
            server.starttls()
            if settings.smtp_username and settings.smtp_password:
                server.login(settings.smtp_username, settings.smtp_password)
            server.sendmail(settings.smtp_from_email, to_email, msg.as_string())

        return True

    except Exception as e:
        logger.error("Email 發送失敗", exc_info=True)
        return False


def create_report_email_content(report: Report, base_url: str = "") -> str:
    """Build the report notification email body."""
    summary = report.edited_summary or report.ai_summary or "無摘要內容"

    # Truncate the summary to the first 500 characters
    if len(summary) > 500:
        summary = summary[:500] + "..."

    # Escape HTML special characters to prevent XSS
    safe_title = escape(report.title)
    safe_group_name = escape(report.group.name)
    safe_summary = escape(summary)
    safe_base_url = escape(base_url)

    html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; }}
.container {{ max-width: 600px; margin: 0 auto; padding: 20px; }}
.header {{ background: #4a6fa5; color: white; padding: 20px; text-align: center; }}
.content {{ padding: 20px; background: #f9f9f9; }}
.summary {{ background: white; padding: 15px; border-left: 4px solid #4a6fa5; margin: 15px 0; }}
.button {{ display: inline-block; padding: 12px 24px; background: #4a6fa5; color: white; text-decoration: none; border-radius: 4px; }}
.footer {{ text-align: center; padding: 20px; color: #666; font-size: 12px; }}
</style>
</head>
<body>
<div class="container">
    <div class="header">
        <h1 style="margin:0;">每日報導</h1>
    </div>
    <div class="content">
        <h2>{safe_title}</h2>
        <p>
            <strong>群組:</strong>{safe_group_name}<br>
            <strong>日期:</strong>{report.report_date}
        </p>
        <div class="summary">
            <h3>摘要</h3>
            <p>{safe_summary}</p>
        </div>
        <p style="text-align: center; margin-top: 30px;">
            <a href="{safe_base_url}/reports/{report.id}" class="button">閱讀完整報告</a>
        </p>
    </div>
    <div class="footer">
        <p>此郵件由每日報導系統自動發送</p>
        <p>如不想收到通知,請至系統調整訂閱設定</p>
    </div>
</div>
</body>
</html>
"""

    return html


def send_report_notifications(db: Session, report: Report) -> int:
    """
    Send report notifications to subscribers.

    Returns:
        Number of successful sends
    """
    # Users subscribed to this group with email notifications enabled
    subscriptions = db.query(Subscription).filter(
        Subscription.group_id == report.group_id,
        Subscription.email_notify == True
    ).all()

    sent_count = 0

    for sub in subscriptions:
        user = db.query(User).filter(User.id == sub.user_id).first()
        if not user or not user.email or not user.is_active:
            continue

        # Create the notification log entry
        notification = NotificationLog(
            user_id=user.id,
            report_id=report.id,
            notification_type="email",
            subject=f"【每日報導】{report.title}",
            content=report.edited_summary or report.ai_summary
        )
        db.add(notification)

        # Send the email
        html_content = create_report_email_content(report)
        success = send_email(
            user.email,
            f"【每日報導】{report.title}",
            html_content
        )

        if success:
            notification.status = NotificationStatus.SENT
            notification.sent_at = datetime.utcnow()
            sent_count += 1
        else:
            notification.status = NotificationStatus.FAILED
            notification.error_message = "發送失敗"

    db.commit()
    return sent_count


def send_delay_notification(db: Session, report: Report) -> int:
    """
    Send delayed-publication notifications.

    Returns:
        Number of successful sends
    """
    subscriptions = db.query(Subscription).filter(
        Subscription.group_id == report.group_id,
        Subscription.email_notify == True
    ).all()

    sent_count = 0

    for sub in subscriptions:
        user = db.query(User).filter(User.id == sub.user_id).first()
        if not user or not user.email or not user.is_active:
            continue

        # Escape HTML special characters to prevent XSS
        safe_group_name = escape(report.group.name)
        html_content = f"""
<html>
<body>
    <h2>報告延遲通知</h2>
    <p>您訂閱的「{safe_group_name}」今日報告延遲發布,敬請稍後。</p>
    <p>造成不便,敬請見諒。</p>
</body>
</html>
"""

        success = send_email(
            user.email,
            f"【每日報導】{report.group.name} 報告延遲通知",
            html_content
        )

        if success:
            sent_count += 1

    return sent_count
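A minimal sketch of re-sending notifications for an existing report, assuming a report row is already in the database (the report id is illustrative):

# Assumed ad-hoc usage, not part of this commit; the report id is illustrative.
from app.db.session import SessionLocal
from app.models import Report
from app.services.notification_service import send_report_notifications

db = SessionLocal()
try:
    report = db.query(Report).filter(Report.id == 1).first()
    if report:
        sent = send_report_notifications(db, report)
        print(f"sent {sent} notifications")
finally:
    db.close()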
app/services/scheduler_service.py (new file, 277 lines added)
@@ -0,0 +1,277 @@
"""
Scheduler service module.
Handles the daily news crawl and report generation.
"""
from datetime import datetime, date
from typing import List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
import logging

from app.db.session import SessionLocal
from app.core.config import settings
from app.models import (
    NewsSource, NewsArticle, CrawlJob, CrawlStatus,
    Group, Keyword, ArticleGroupMatch, Report, ReportArticle, ReportStatus
)
from app.services.crawler_service import get_crawler
from app.services.llm_service import generate_summary
from app.services.notification_service import send_delay_notification

logger = logging.getLogger(__name__)


scheduler = BackgroundScheduler()


def run_daily_crawl():
    """Run the daily news crawl."""
    logger.info("開始每日新聞抓取...")

    db = SessionLocal()

    try:
        # All active news sources
        sources = db.query(NewsSource).filter(NewsSource.is_active == True).all()

        # All active keywords, deduplicated
        all_keywords = db.query(Keyword).filter(Keyword.is_active == True).all()
        keywords_list = list(set([kw.keyword for kw in all_keywords]))

        for source in sources:
            logger.info(f"抓取來源: {source.name}")

            # Create the crawl job record
            job = CrawlJob(
                source_id=source.id,
                status=CrawlStatus.RUNNING,
                scheduled_at=datetime.now(),
                started_at=datetime.now()
            )
            db.add(job)
            db.commit()

            crawler = None
            try:
                # Get the crawler for this source
                crawler = get_crawler(source.code)

                # Fetch the article list
                articles_data = crawler.get_article_list(keywords_list)

                articles_count = 0
                for article_data in articles_data:
                    # Skip articles that already exist
                    existing = db.query(NewsArticle).filter(
                        NewsArticle.source_id == source.id,
                        NewsArticle.url == article_data["url"]
                    ).first()

                    if existing:
                        continue

                    # Fetch the full text
                    content = crawler.get_article_content(article_data["url"])

                    # Store the article
                    article = NewsArticle(
                        source_id=source.id,
                        title=article_data["title"],
                        url=article_data["url"],
                        content=content,
                        published_at=article_data.get("published_at"),
                        crawled_at=datetime.now()
                    )
                    db.add(article)
                    db.commit()
                    db.refresh(article)

                    # Keyword matching
                    match_article_to_groups(db, article)

                    articles_count += 1

                # Update the job status
                job.status = CrawlStatus.COMPLETED
                job.completed_at = datetime.now()
                job.articles_count = articles_count

            except Exception as e:
                job.status = CrawlStatus.FAILED
                job.completed_at = datetime.now()
                job.error_message = str(e)
                job.retry_count += 1
                logger.error(f"抓取失敗 (來源: {source.name})", exc_info=True)
            finally:
                # Release the HTTP session even when the crawl fails
                if crawler:
                    crawler.close()

            db.commit()

        # Generate today's reports
        generate_daily_reports(db)

        logger.info("每日新聞抓取完成")

    except Exception as e:
        logger.error("抓取過程發生錯誤", exc_info=True)
    finally:
        db.close()


def match_article_to_groups(db: Session, article: NewsArticle):
    """Match an article against the active groups."""
    # All active groups and their keywords
    groups = db.query(Group).filter(Group.is_active == True).all()

    article_text = f"{article.title} {article.content or ''}"

    for group in groups:
        keywords = db.query(Keyword).filter(
            Keyword.group_id == group.id,
            Keyword.is_active == True
        ).all()

        matched_keywords = []
        for kw in keywords:
            if kw.keyword.lower() in article_text.lower():
                matched_keywords.append(kw.keyword)

        if matched_keywords:
            # Compute the match score
            score = len(matched_keywords) / len(keywords) * 100 if keywords else 0

            match = ArticleGroupMatch(
                article_id=article.id,
                group_id=group.id,
                matched_keywords=matched_keywords,
                match_score=score
            )
            db.add(match)

    db.commit()


def generate_daily_reports(db: Session):
    """Generate today's reports."""
    logger.info("產生今日報告...")

    today = date.today()
    groups = db.query(Group).filter(Group.is_active == True).all()

    for group in groups:
        # Skip groups whose report for today already exists
        existing = db.query(Report).filter(
            Report.group_id == group.id,
            Report.report_date == today
        ).first()

        if existing:
            continue

        # Articles matched to this group today
        matches = db.query(ArticleGroupMatch).filter(
            ArticleGroupMatch.group_id == group.id
        ).join(NewsArticle).filter(
            NewsArticle.crawled_at >= datetime.combine(today, datetime.min.time())
        ).all()

        if not matches:
            continue

        # Create the report
        report = Report(
            group_id=group.id,
            title=f"{group.name}日報 - {today.strftime('%Y/%m/%d')}",
            report_date=today,
            status=ReportStatus.DRAFT
        )
        db.add(report)
        db.commit()
        db.refresh(report)

        # Link the articles
        articles = []
        for match in matches:
            article = db.query(NewsArticle).filter(NewsArticle.id == match.article_id).first()
            if article:
                ra = ReportArticle(
                    report_id=report.id,
                    article_id=article.id,
                    is_included=True
                )
                db.add(ra)
                articles.append(article)

        db.commit()

        # Generate the AI summary
        if articles:
            summary = generate_summary(group, articles)
            report.ai_summary = summary
            report.status = ReportStatus.PENDING
            db.commit()

        logger.info(f"已產生報告: {report.title} ({len(articles)} 篇文章)")


def check_publish_deadline():
    """Check the publication deadline."""
    db = SessionLocal()

    try:
        today = date.today()

        # Reports for today that have not been published yet
        pending_reports = db.query(Report).filter(
            Report.report_date == today,
            Report.status.in_([ReportStatus.DRAFT, ReportStatus.PENDING])
        ).all()

        for report in pending_reports:
            report.status = ReportStatus.DELAYED
            send_delay_notification(db, report)

        db.commit()

    finally:
        db.close()


def init_scheduler():
    """Initialize the scheduler."""
    # Parse the crawl schedule time
    crawl_time = settings.crawl_schedule_time.split(":")
    crawl_hour = int(crawl_time[0])
    crawl_minute = int(crawl_time[1])

    deadline_time = "09:00".split(":")  # could be read from settings
    deadline_hour = int(deadline_time[0])
    deadline_minute = int(deadline_time[1])

    # Daily crawl job
    scheduler.add_job(
        run_daily_crawl,
        CronTrigger(hour=crawl_hour, minute=crawl_minute),
        id="daily_crawl",
        replace_existing=True
    )

    # Publish-deadline check
    scheduler.add_job(
        check_publish_deadline,
        CronTrigger(hour=deadline_hour, minute=deadline_minute),
        id="check_deadline",
        replace_existing=True
    )

    # Start the scheduler
    if not scheduler.running:
        scheduler.start()

    logger.info(f"排程器已啟動: 每日 {settings.crawl_schedule_time} 抓取")


def shutdown_scheduler():
    """Shut down the scheduler."""
    if scheduler.running:
        scheduler.shutdown()
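The scheduler is intended to start and stop with the application; a sketch of wiring init_scheduler and shutdown_scheduler into a FastAPI lifespan handler (this wiring is assumed and not part of this commit):

# Assumed application wiring, not part of this commit.
from contextlib import asynccontextmanager
from fastapi import FastAPI

from app.services.scheduler_service import init_scheduler, shutdown_scheduler

@asynccontextmanager
async def lifespan(app: FastAPI):
    init_scheduler()        # registers the daily crawl and deadline-check cron jobs
    yield
    shutdown_scheduler()    # stops the APScheduler background thread

app = FastAPI(lifespan=lifespan)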