Initial commit: Daily News App

Internal corporate news aggregation and analysis system
- Automatic news crawling (Digitimes, 經濟日報, 工商時報)
- AI-powered summarization (OpenAI/Claude/Ollama)
- Group management and subscription notifications
- Cleaned up Python cache files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: donald
Date: 2025-12-03 23:53:24 +08:00
Commit: db0f0bbfe7
50 changed files with 11883 additions and 0 deletions


@@ -0,0 +1,322 @@
"""
新聞爬蟲服務模組
支援 Digitimes、經濟日報、工商時報
"""
import logging
import re
import time
from abc import ABC, abstractmethod
from datetime import datetime, date
from typing import Optional, List, Dict, Any

import httpx
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential

from app.core.config import settings

logger = logging.getLogger(__name__)


class BaseCrawler(ABC):
    """Base crawler class"""
def __init__(self):
self.session = httpx.Client(
timeout=30,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
)
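        # Seconds to pause between consecutive requests (read from configuration)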
self.delay = settings.crawl_request_delay
def _wait(self):
"""請求間隔"""
time.sleep(self.delay)
@abstractmethod
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
pass
@abstractmethod
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
pass
def close(self):
"""關閉連線"""
self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (paid subscription)"""
BASE_URL = "https://www.digitimes.com.tw"
def __init__(self, username: str, password: str):
super().__init__()
self.username = username
self.password = password
self.is_logged_in = False
def login(self) -> bool:
"""登入 Digitimes"""
try:
            # Fetch the login page first
login_page = self.session.get(f"{self.BASE_URL}/member/login.asp")
            # Submit the login request
login_data = {
"uid": self.username,
"pwd": self.password,
"remember": "1"
}
response = self.session.post(
f"{self.BASE_URL}/member/login_check.asp",
data=login_data,
follow_redirects=True
)
            # Heuristic login check based on the response (logout link present, or a 200 status)
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
return self.is_logged_in
except Exception as e:
            logger.error("Digitimes login failed", exc_info=True)
return False
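    # Retry up to 3 attempts with exponential backoff (2 to 10 second waits) on exceptions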
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
if not self.is_logged_in:
self.login()
articles = []
for keyword in keywords:
self._wait()
try:
                # Search API
search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
                # Parse the search results
for item in soup.select(".search-result-item, .news-item"):
title_elem = item.select_one("h3 a, .title a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
if not url.startswith("http"):
url = f"{self.BASE_URL}{url}"
                    # Extract the publication date
date_elem = item.select_one(".date, .time")
pub_date = None
if date_elem:
date_text = date_elem.get_text(strip=True)
try:
pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
pass
                    # Keep only today's news
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "digitimes"
})
except Exception as e:
                logger.warning(f"Digitimes fetch failed (keyword: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
if not self.is_logged_in:
self.login()
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
            # Try several possible content selectors
content_selectors = [".article-body", ".content", "#article-content", ".main-content"]
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
                    # Remove unwanted elements
for unwanted in content_elem.select("script, style, .ad, .advertisement"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
            logger.warning("Digitimes content fetch failed", exc_info=True)
return None


class UDNCrawler(BaseCrawler):
    """經濟日報 (Economic Daily News) crawler"""
BASE_URL = "https://money.udn.com"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
articles = []
for keyword in keywords:
self._wait()
try:
search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".story-list__news, .news-item"):
title_elem = item.select_one("h3 a, .story-list__text a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
if not url.startswith("http"):
url = f"{self.BASE_URL}{url}"
date_elem = item.select_one("time, .story-list__time")
pub_date = None
if date_elem:
date_text = date_elem.get_text(strip=True)
try:
pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
pass
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "udn"
})
except Exception as e:
                logger.warning(f"Economic Daily News fetch failed (keyword: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
content_elem = soup.select_one("#story_body_content, .article-content")
if content_elem:
for unwanted in content_elem.select("script, style, .ad"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
            logger.warning("Economic Daily News content fetch failed", exc_info=True)
return None


class CTEECrawler(BaseCrawler):
    """工商時報 (Commercial Times) crawler"""
BASE_URL = "https://ctee.com.tw"
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
"""取得文章列表"""
articles = []
for keyword in keywords:
self._wait()
try:
search_url = f"{self.BASE_URL}/?s={keyword}"
response = self.session.get(search_url)
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".post-item, article.post"):
title_elem = item.select_one("h2 a, .post-title a")
if not title_elem:
continue
title = title_elem.get_text(strip=True)
url = title_elem.get("href", "")
date_elem = item.select_one("time, .post-date")
pub_date = None
if date_elem:
date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
try:
pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
pass
if pub_date and pub_date.date() == date.today():
articles.append({
"title": title,
"url": url,
"published_at": pub_date,
"source": "ctee"
})
except Exception as e:
                logger.warning(f"Commercial Times fetch failed (keyword: {keyword})", exc_info=True)
return articles
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def get_article_content(self, url: str) -> Optional[str]:
"""取得文章內容"""
try:
self._wait()
response = self.session.get(url)
soup = BeautifulSoup(response.text, "lxml")
content_elem = soup.select_one(".entry-content, .post-content")
if content_elem:
for unwanted in content_elem.select("script, style, .ad"):
unwanted.decompose()
return content_elem.get_text(separator="\n", strip=True)
return None
except Exception as e:
            logger.warning("Commercial Times content fetch failed", exc_info=True)
return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given news source"""
if source_code == "digitimes":
return DigitimesCrawler(
settings.digitimes_username,
settings.digitimes_password
)
elif source_code == "udn":
return UDNCrawler()
elif source_code == "ctee":
return CTEECrawler()
else:
        raise ValueError(f"Unsupported news source: {source_code}")
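

# Minimal usage sketch, assuming settings are configured; the source code "udn"
# and the keyword below are illustrative examples only:
if __name__ == "__main__":
    crawler = get_crawler("udn")
    try:
        articles = crawler.get_article_list(["半導體"])  # example keyword: "semiconductor"
        for article in articles:
            content = crawler.get_article_content(article["url"])
            print(article["title"], article["url"], (content or "")[:80])
    finally:
        crawler.close()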