""" 新聞爬蟲服務模組 支援 Digitimes、經濟日報、工商時報 """ import time import re from datetime import datetime, date from typing import Optional, List, Dict, Any from abc import ABC, abstractmethod import httpx from bs4 import BeautifulSoup from tenacity import retry, stop_after_attempt, wait_exponential import logging from app.core.config import settings logger = logging.getLogger(__name__) class BaseCrawler(ABC): """爬蟲基礎類別""" def __init__(self): self.session = httpx.Client( timeout=30, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } ) self.delay = settings.crawl_request_delay def _wait(self): """請求間隔""" time.sleep(self.delay) @abstractmethod def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]: """取得文章列表""" pass @abstractmethod def get_article_content(self, url: str) -> Optional[str]: """取得文章內容""" pass def close(self): """關閉連線""" self.session.close() class DigitimesCrawler(BaseCrawler): """Digitimes 爬蟲(付費訂閱)""" BASE_URL = "https://www.digitimes.com.tw" def __init__(self, username: str, password: str): super().__init__() self.username = username self.password = password self.is_logged_in = False def login(self) -> bool: """登入 Digitimes""" try: # 取得登入頁面 login_page = self.session.get(f"{self.BASE_URL}/member/login.asp") # 發送登入請求 login_data = { "uid": self.username, "pwd": self.password, "remember": "1" } response = self.session.post( f"{self.BASE_URL}/member/login_check.asp", data=login_data, follow_redirects=True ) # 檢查是否登入成功(根據回應判斷) self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200 return self.is_logged_in except Exception as e: logger.error("Digitimes 登入失敗", exc_info=True) return False @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]: """取得文章列表""" if not self.is_logged_in: self.login() articles = [] for keyword in keywords: self._wait() try: # 搜尋 API search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}" response = self.session.get(search_url) soup = BeautifulSoup(response.text, "lxml") # 解析搜尋結果 for item in soup.select(".search-result-item, .news-item"): title_elem = item.select_one("h3 a, .title a") if not title_elem: continue title = title_elem.get_text(strip=True) url = title_elem.get("href", "") if not url.startswith("http"): url = f"{self.BASE_URL}{url}" # 取得日期 date_elem = item.select_one(".date, .time") pub_date = None if date_elem: date_text = date_elem.get_text(strip=True) try: pub_date = datetime.strptime(date_text, "%Y/%m/%d") except: pass # 只取今天的新聞 if pub_date and pub_date.date() == date.today(): articles.append({ "title": title, "url": url, "published_at": pub_date, "source": "digitimes" }) except Exception as e: logger.warning(f"Digitimes 抓取失敗 (關鍵字: {keyword})", exc_info=True) return articles @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_content(self, url: str) -> Optional[str]: """取得文章內容""" if not self.is_logged_in: self.login() try: self._wait() response = self.session.get(url) soup = BeautifulSoup(response.text, "lxml") # 嘗試多個內容選擇器 content_selectors = [".article-body", ".content", "#article-content", ".main-content"] for selector in content_selectors: content_elem = soup.select_one(selector) if content_elem: # 移除不需要的元素 for unwanted in content_elem.select("script, style, .ad, .advertisement"): unwanted.decompose() return content_elem.get_text(separator="\n", strip=True) return None except Exception as e: logger.warning("Digitimes 內容抓取失敗", exc_info=True) return None class UDNCrawler(BaseCrawler): """經濟日報爬蟲""" BASE_URL = "https://money.udn.com" @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]: """取得文章列表""" articles = [] for keyword in keywords: self._wait() try: search_url = f"{self.BASE_URL}/search/result/1/{keyword}" response = self.session.get(search_url) soup = BeautifulSoup(response.text, "lxml") for item in soup.select(".story-list__news, .news-item"): title_elem = item.select_one("h3 a, .story-list__text a") if not title_elem: continue title = title_elem.get_text(strip=True) url = title_elem.get("href", "") if not url.startswith("http"): url = f"{self.BASE_URL}{url}" date_elem = item.select_one("time, .story-list__time") pub_date = None if date_elem: date_text = date_elem.get_text(strip=True) try: pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d") except: pass if pub_date and pub_date.date() == date.today(): articles.append({ "title": title, "url": url, "published_at": pub_date, "source": "udn" }) except Exception as e: logger.warning(f"經濟日報抓取失敗 (關鍵字: {keyword})", exc_info=True) return articles @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_content(self, url: str) -> Optional[str]: """取得文章內容""" try: self._wait() response = self.session.get(url) soup = BeautifulSoup(response.text, "lxml") content_elem = soup.select_one("#story_body_content, .article-content") if content_elem: for unwanted in content_elem.select("script, style, .ad"): unwanted.decompose() return content_elem.get_text(separator="\n", strip=True) return None except Exception as e: logger.warning("經濟日報內容抓取失敗", exc_info=True) return None class CTEECrawler(BaseCrawler): """工商時報爬蟲""" BASE_URL = "https://ctee.com.tw" @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]: """取得文章列表""" articles = [] for keyword in keywords: self._wait() try: search_url = f"{self.BASE_URL}/?s={keyword}" response = self.session.get(search_url) soup = BeautifulSoup(response.text, "lxml") for item in soup.select(".post-item, article.post"): title_elem = item.select_one("h2 a, .post-title a") if not title_elem: continue title = title_elem.get_text(strip=True) url = title_elem.get("href", "") date_elem = item.select_one("time, .post-date") pub_date = None if date_elem: date_text = date_elem.get("datetime", date_elem.get_text(strip=True)) try: pub_date = datetime.fromisoformat(date_text[:10]) except: pass if pub_date and pub_date.date() == date.today(): articles.append({ "title": title, "url": url, "published_at": pub_date, "source": "ctee" }) except Exception as e: logger.warning(f"工商時報抓取失敗 (關鍵字: {keyword})", exc_info=True) return articles @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def get_article_content(self, url: str) -> Optional[str]: """取得文章內容""" try: self._wait() response = self.session.get(url) soup = BeautifulSoup(response.text, "lxml") content_elem = soup.select_one(".entry-content, .post-content") if content_elem: for unwanted in content_elem.select("script, style, .ad"): unwanted.decompose() return content_elem.get_text(separator="\n", strip=True) return None except Exception as e: logger.warning("工商時報內容抓取失敗", exc_info=True) return None def get_crawler(source_code: str) -> BaseCrawler: """取得對應的爬蟲實例""" if source_code == "digitimes": return DigitimesCrawler( settings.digitimes_username, settings.digitimes_password ) elif source_code == "udn": return UDNCrawler() elif source_code == "ctee": return CTEECrawler() else: raise ValueError(f"不支援的新聞來源: {source_code}")