"""
|
|
新聞爬蟲服務模組
|
|
支援 Digitimes、經濟日報、工商時報
|
|
"""
|
|
import time
|
|
import re
|
|
from datetime import datetime, date
|
|
from typing import Optional, List, Dict, Any
|
|
from abc import ABC, abstractmethod
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
import logging
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class BaseCrawler(ABC):
    """Base class for all news crawlers."""

    def __init__(self):
        self.session = httpx.Client(
            timeout=30,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
        )
        self.delay = settings.crawl_request_delay

    def _wait(self):
        """Pause between requests to avoid hammering the source site."""
        time.sleep(self.delay)

    @abstractmethod
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return a list of article metadata dicts matching the keywords."""

    @abstractmethod
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the plain-text body of the article at the given URL."""

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (requires a paid subscription account)."""

    BASE_URL = "https://www.digitimes.com.tw"

    def __init__(self, username: str, password: str):
        super().__init__()
        self.username = username
        self.password = password
        self.is_logged_in = False

    def login(self) -> bool:
        """Log in to Digitimes with the configured member account."""
        try:
            # Fetch the login page first so the session picks up any cookies.
            self.session.get(f"{self.BASE_URL}/member/login.asp")

            # Submit the login form.
            login_data = {
                "uid": self.username,
                "pwd": self.password,
                "remember": "1",
            }
            response = self.session.post(
                f"{self.BASE_URL}/member/login_check.asp",
                data=login_data,
                follow_redirects=True,
            )

            # Heuristic success check: a logged-in page normally exposes a
            # logout link; any 200 response is also accepted as a fallback.
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
            return self.is_logged_in

        except Exception:
            logger.error("Digitimes login failed", exc_info=True)
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's Digitimes articles that match the given keywords."""
        if not self.is_logged_in:
            self.login()

        articles = []

        for keyword in keywords:
            self._wait()

            try:
                # Keyword search endpoint.
                search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                # Parse the search result items.
                for item in soup.select(".search-result-item, .news-item"):
                    title_elem = item.select_one("h3 a, .title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    # Extract the publication date.
                    date_elem = item.select_one(".date, .time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
                            pass

                    # Keep only articles published today.
                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "digitimes",
                        })

            except Exception:
                logger.warning(f"Digitimes article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        if not self.is_logged_in:
            self.login()

        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            # Try several candidate content selectors.
            content_selectors = [".article-body", ".content", "#article-content", ".main-content"]

            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles, and ad blocks before extracting text.
                    for unwanted in content_elem.select("script, style, .ad, .advertisement"):
                        unwanted.decompose()
                    return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Digitimes content fetch failed", exc_info=True)
            return None


class UDNCrawler(BaseCrawler):
    """Economic Daily News (UDN Money) crawler."""

    BASE_URL = "https://money.udn.com"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's UDN Money articles that match the given keywords."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".story-list__news, .news-item"):
                    title_elem = item.select_one("h3 a, .story-list__text a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    date_elem = item.select_one("time, .story-list__time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "udn",
                        })

            except Exception:
                logger.warning(f"UDN Money article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one("#story_body_content, .article-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("UDN Money content fetch failed", exc_info=True)
            return None


class CTEECrawler(BaseCrawler):
    """Commercial Times (CTEE) crawler."""

    BASE_URL = "https://ctee.com.tw"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's Commercial Times articles that match the given keywords."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/?s={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".post-item, article.post"):
                    title_elem = item.select_one("h2 a, .post-title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")

                    date_elem = item.select_one("time, .post-date")
                    pub_date = None
                    if date_elem:
                        # Prefer the machine-readable datetime attribute when present.
                        date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
                        try:
                            pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "ctee",
                        })

            except Exception:
                logger.warning(f"Commercial Times article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one(".entry-content, .post-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Commercial Times content fetch failed", exc_info=True)
            return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given source code."""
    if source_code == "digitimes":
        return DigitimesCrawler(
            settings.digitimes_username,
            settings.digitimes_password,
        )
    elif source_code == "udn":
        return UDNCrawler()
    elif source_code == "ctee":
        return CTEECrawler()
    else:
        raise ValueError(f"Unsupported news source: {source_code}")
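

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): fetch today's matching articles
    # from the public UDN Money source and print their metadata. The keyword
    # "半導體" below is an arbitrary example, not part of any configuration.
    logging.basicConfig(level=logging.INFO)

    crawler = get_crawler("udn")
    try:
        for article in crawler.get_article_list(["半導體"]):
            print(article["published_at"], article["title"], article["url"])
    finally:
        crawler.close()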