"""
|
|
新聞爬蟲服務模組
|
|
支援 Digitimes、經濟日報、工商時報
|
|
"""
|
|
import time
|
|
import re
|
|
from datetime import datetime, date
|
|
from typing import Optional, List, Dict, Any
|
|
from abc import ABC, abstractmethod
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
import logging
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class BaseCrawler(ABC):
    """Base class for all news crawlers."""

    def __init__(self):
        self.session = httpx.Client(
            timeout=30,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
        )
        self.delay = settings.crawl_request_delay

    def _wait(self):
        """Pause between requests to avoid hammering the source site."""
        time.sleep(self.delay)

    @abstractmethod
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return a list of article metadata dicts matching the keywords."""

    @abstractmethod
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the plain-text body of the article at the given URL."""

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()


class DigitimesCrawler(BaseCrawler):
    """Digitimes crawler (requires a paid subscription account)."""

    BASE_URL = "https://www.digitimes.com.tw"

    def __init__(self, username: str, password: str):
        super().__init__()
        self.username = username
        self.password = password
        self.is_logged_in = False

    def login(self) -> bool:
        """Log in to Digitimes with the configured member account."""
        try:
            # Fetch the login page first so the session picks up any cookies.
            self.session.get(f"{self.BASE_URL}/member/login.asp")

            # Submit the login form.
            login_data = {
                "uid": self.username,
                "pwd": self.password,
                "remember": "1",
            }
            response = self.session.post(
                f"{self.BASE_URL}/member/login_check.asp",
                data=login_data,
                follow_redirects=True,
            )

            # Heuristic success check: a logged-in page normally exposes a
            # logout link; any 200 response is also accepted as a fallback.
            self.is_logged_in = "logout" in response.text.lower() or response.status_code == 200
            return self.is_logged_in

        except Exception:
            logger.error("Digitimes login failed", exc_info=True)
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's Digitimes articles that match the given keywords."""
        if not self.is_logged_in:
            self.login()

        articles = []

        for keyword in keywords:
            self._wait()

            try:
                # Keyword search endpoint.
                search_url = f"{self.BASE_URL}/search/search_result.asp?query={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                # Parse the search result items.
                for item in soup.select(".search-result-item, .news-item"):
                    title_elem = item.select_one("h3 a, .title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    # Extract the publication date.
                    date_elem = item.select_one(".date, .time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text, "%Y/%m/%d")
                        except ValueError:
                            pass

                    # Keep only articles published today.
                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "digitimes",
                        })

            except Exception:
                logger.warning(f"Digitimes article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        if not self.is_logged_in:
            self.login()

        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            # Try several candidate content selectors.
            content_selectors = [".article-body", ".content", "#article-content", ".main-content"]

            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Strip scripts, styles, and ad blocks before extracting text.
                    for unwanted in content_elem.select("script, style, .ad, .advertisement"):
                        unwanted.decompose()
                    return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Digitimes content fetch failed", exc_info=True)
            return None


class UDNCrawler(BaseCrawler):
    """Economic Daily News (UDN Money) crawler."""

    BASE_URL = "https://money.udn.com"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's UDN Money articles that match the given keywords."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/search/result/1/{keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".story-list__news, .news-item"):
                    title_elem = item.select_one("h3 a, .story-list__text a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")
                    if not url.startswith("http"):
                        url = f"{self.BASE_URL}{url}"

                    date_elem = item.select_one("time, .story-list__time")
                    pub_date = None
                    if date_elem:
                        date_text = date_elem.get_text(strip=True)
                        try:
                            pub_date = datetime.strptime(date_text[:10], "%Y-%m-%d")
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "udn",
                        })

            except Exception:
                logger.warning(f"UDN Money article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one("#story_body_content, .article-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("UDN Money content fetch failed", exc_info=True)
            return None


class CTEECrawler(BaseCrawler):
    """Commercial Times (CTEE) crawler."""

    BASE_URL = "https://ctee.com.tw"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_list(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Return today's Commercial Times articles that match the given keywords."""
        articles = []

        for keyword in keywords:
            self._wait()

            try:
                search_url = f"{self.BASE_URL}/?s={keyword}"
                response = self.session.get(search_url)
                soup = BeautifulSoup(response.text, "lxml")

                for item in soup.select(".post-item, article.post"):
                    title_elem = item.select_one("h2 a, .post-title a")
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)
                    url = title_elem.get("href", "")

                    date_elem = item.select_one("time, .post-date")
                    pub_date = None
                    if date_elem:
                        # Prefer the machine-readable datetime attribute when present.
                        date_text = date_elem.get("datetime", date_elem.get_text(strip=True))
                        try:
                            pub_date = datetime.fromisoformat(date_text[:10])
                        except ValueError:
                            pass

                    if pub_date and pub_date.date() == date.today():
                        articles.append({
                            "title": title,
                            "url": url,
                            "published_at": pub_date,
                            "source": "ctee",
                        })

            except Exception:
                logger.warning(f"Commercial Times article list fetch failed (keyword: {keyword})", exc_info=True)

        return articles

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    def get_article_content(self, url: str) -> Optional[str]:
        """Return the article body text, or None if it cannot be extracted."""
        try:
            self._wait()
            response = self.session.get(url)
            soup = BeautifulSoup(response.text, "lxml")

            content_elem = soup.select_one(".entry-content, .post-content")
            if content_elem:
                for unwanted in content_elem.select("script, style, .ad"):
                    unwanted.decompose()
                return content_elem.get_text(separator="\n", strip=True)

            return None

        except Exception:
            logger.warning("Commercial Times content fetch failed", exc_info=True)
            return None


def get_crawler(source_code: str) -> BaseCrawler:
    """Return the crawler instance for the given source code."""
    if source_code == "digitimes":
        return DigitimesCrawler(
            settings.digitimes_username,
            settings.digitimes_password,
        )
    elif source_code == "udn":
        return UDNCrawler()
    elif source_code == "ctee":
        return CTEECrawler()
    else:
        raise ValueError(f"Unsupported news source: {source_code}")
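

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): fetch today's matching articles
    # from the public UDN Money source and print their metadata. The keyword
    # "半導體" below is an arbitrary example, not part of any configuration.
    logging.basicConfig(level=logging.INFO)

    crawler = get_crawler("udn")
    try:
        for article in crawler.get_article_list(["半導體"]):
            print(article["published_at"], article["title"], article["url"])
    finally:
        crawler.close()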