Initial commit: Daily News App
Internal corporate news aggregation and analysis system
- Automated news crawling (Digitimes, Economic Daily News, Commercial Times)
- AI-powered summaries (OpenAI/Claude/Ollama)
- Group management and subscription notifications
- Python cache files cleaned up

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
app/services/crawler_service.py: 322 lines added (new file)
@@ -0,0 +1,322 @@
"""
News crawler service module.

Supports Digitimes, Economic Daily News (money.udn.com), and Commercial Times (ctee.com.tw).
"""
import time
import logging
from datetime import datetime, date
from typing import Optional, List, Dict, Any
from abc import ABC, abstractmethod

import httpx
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential

from app.core.config import settings

logger = logging.getLogger(__name__)


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self):
        self.session = httpx.Client(
            timeout=30,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
        )
        self.delay = settings.crawl_request_delay

    def _wait(self):
        """Pause between requests."""
        time.sleep(self.delay)

    @abstractmethod
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the list of articles matching the given keywords."""

    @abstractmethod
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the full text of an article."""

    def close(self):
        """Close the HTTP session."""
        self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (paid subscription)."""

    BASE_URL = "https://www.digitimes.com.tw"

    def __init__(self, username: str, password: str):
        super().__init__()
        self.username = username
        self.password = password
        self.is_logged_in = False

    def login(self) -> bool:
        """Log in to Digitimes."""
        try:
            # Fetch the login page first (establishes session cookies)
            self.session.get(f"{self.BASE_URL}/member/login.asp")

            # Submit the login form
            login_data = {
                "uid": self.username,
                "pwd": self.password,
                "remember": "1"
            }

            response = self.session.post(
                f"{self.BASE_URL}/member/login_check.asp",
                data=login_data,
                follow_redirects=True
            )

            # Judge login success from the response
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
            return self.is_logged_in

        except Exception:
            logger.error("Digitimes login failed", exc_info=True)
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the list of articles."""
        if not self.is_logged_in:
            self.login()

        articles = []

        for keyword in keywords:
            self._wait()

            try:
                # Search endpoint
                search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                # Parse the search results
                for item in soup.select(".search-result-item, .news-item"):
                    title_elem = item.select_one("h3 a, .title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    # Extract the publication date
                    date_elem = item.select_one(".date, .time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
                            pass

                    # Keep only today's news
                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "digitimes"
                        })

            except Exception:
                logger.warning(f"Digitimes fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the full text of an article."""
        if not self.is_logged_in:
            self.login()

        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            # Try multiple content selectors
            content_selectors = [".article-body", ".content", "#article-content", ".main-content"]

            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Remove unwanted elements
                    for unwanted in content_elem.select("script, style, .ad, .advertisement"):
                        unwanted.decompose()
                    return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Digitimes content fetch failed", exc_info=True)
            return None


class UDNCrawler(BaseCrawler):
    """Economic Daily News (money.udn.com) crawler."""

    BASE_URL = "https://money.udn.com"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the list of articles."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".story-list__news, .news-item"):
                    title_elem = item.select_one("h3 a, .story-list__text a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    date_elem = item.select_one("time, .story-list__time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "udn"
                        })

            except Exception:
                logger.warning(f"Economic Daily News fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the full text of an article."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one("#story_body_content, .article-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Economic Daily News content fetch failed", exc_info=True)
            return None


class CTEECrawler(BaseCrawler):
    """Commercial Times (ctee.com.tw) crawler."""

    BASE_URL = "https://ctee.com.tw"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Fetch the list of articles."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/?s={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".post-item, article.post"):
                    title_elem = item.select_one("h2 a, .post-title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")

                    date_elem = item.select_one("time, .post-date")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
                        try:
                            pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "ctee"
                        })

            except Exception:
                logger.warning(f"Commercial Times fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Fetch the full text of an article."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one(".entry-content, .post-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Commercial Times content fetch failed", exc_info=True)
            return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given source code."""
    if source_code == "digitimes":
        return DigitimesCrawler(
            settings.digitimes_username,
            settings.digitimes_password
        )
    elif source_code == "udn":
        return UDNCrawler()
    elif source_code == "ctee":
        return CTEECrawler()
    else:
        raise ValueError(f"Unsupported news source: {source_code}")
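The crawlers above read `crawl_request_delay`, `digitimes_username`, and `digitimes_password` from `app.core.config.settings`, a module that is not part of this file. A minimal stdlib-only sketch of what it might look like, assuming the values come from environment variables; the field names are taken from this file, everything else is hypothetical:

# app/core/config.py -- hypothetical sketch, not the actual module from this commit
import os
from dataclasses import dataclass


@dataclass(frozen=True)
class Settings:
    # Seconds to sleep between consecutive crawler requests (assumed default)
    crawl_request_delay: float = float(os.getenv("CRAWL_REQUEST_DELAY", "2.0"))
    # Digitimes paid-subscription credentials (assumed to come from the environment)
    digitimes_username: str = os.getenv("DIGITIMES_USERNAME", "")
    digitimes_password: str = os.getenv("DIGITIMES_PASSWORD", "")


settings = Settings()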
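A minimal usage sketch of the `get_crawler` factory and the crawler interface defined above; the keyword list and the print statement are placeholders for the real scheduling and storage logic:

# Illustrative example -- keywords and output handling are placeholders
from app.services.crawler_service import get_crawler

keywords = ["半導體", "AI 伺服器"]  # hypothetical subscription keywords

crawler = get_crawler("udn")  # or "digitimes" / "ctee"
try:
    for article in crawler.get_article_list(keywords):
        # Each entry contains title, url, published_at, and source
        content = crawler.get_article_content(article["url"])
        print(article["title"], article["url"], (content or "")[:80])
finally:
    crawler.close()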