Initial commit: HBR 文章爬蟲專案

- Scrapy 爬蟲框架，爬取 HBR 繁體中文文章 - Flask Web 應用程式，提供文章查詢介面 - SQL Server 資料庫整合 - 自動化排程與郵件通知功能 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 17:19:56 +08:00
commit f524713cb6
35 changed files with 6719 additions and 0 deletions
--- a/run_crawler.py
+++ b/run_crawler.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+HBR 爬蟲系統主啟動腳本
+整合爬蟲執行、資料庫儲存、CSV 匯出和郵件發送功能
+"""
+import os
+import sys
+import subprocess
+import logging
+from pathlib import Path
+from datetime import datetime
+
+# Configure logging: every record goes to stdout and to logs/crawler.log.
+# The directory is created before the handlers are built. os.makedirs()
+# returns None, so using it inside a conditional expression made the first
+# run (directory absent) fall back to a second console handler and skip the
+# log file entirely.
+os.makedirs('logs', exist_ok=True)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler('logs/crawler.log', encoding='utf-8'),
+    ],
+)
+logger = logging.getLogger(__name__)
+
+
+def check_scrapy_installed():
+    """Report whether the Scrapy package can be imported.
+
+    Returns:
+        True when ``import scrapy`` succeeds (its version is logged),
+        False when it raises ImportError (install instructions are logged).
+    """
+    try:
+        import scrapy
+    except ImportError:
+        install_hint = (
+            "✗ Scrapy 未安裝",
+            "請執行以下命令安裝依賴：",
+            "  pip install -r requirements.txt",
+        )
+        for line in install_hint:
+            logger.error(line)
+        return False
+
+    logger.info(f"✓ Scrapy 已安裝 (版本: {scrapy.__version__})")
+    return True
+
+
+def run_scrapy_crawler():
+    """Run the ``hbr`` spider in a child interpreter.
+
+    The spider is started with ``python -m scrapy crawl hbr`` from the
+    ``hbr_crawler`` directory located next to this file, and its output is
+    left attached to the console.
+
+    Returns:
+        True when the child process exits with status 0. False when Scrapy
+        is not importable, the crawler directory is missing, the process
+        cannot be started, or it exits with a non-zero status.
+    """
+    banner = "=" * 60
+    logger.info(banner)
+    logger.info("開始執行 HBR 爬蟲")
+    logger.info(banner)
+
+    # Bail out early when the dependency is missing.
+    if not check_scrapy_installed():
+        return False
+
+    # The Scrapy project lives in a sibling directory of this script.
+    crawler_dir = Path(__file__).parent / "hbr_crawler"
+    if not crawler_dir.exists():
+        logger.error(f"✗ 爬蟲目錄不存在: {crawler_dir}")
+        return False
+
+    # Launching through the current interpreter avoids depending on a
+    # `scrapy` executable being on PATH.
+    command = [sys.executable, "-m", "scrapy", "crawl", "hbr"]
+    try:
+        result = subprocess.run(
+            command,
+            cwd=str(crawler_dir),
+            capture_output=False,  # stream the child's output to the console
+            text=True,
+        )
+    except FileNotFoundError:
+        for line in (
+            "✗ 找不到 Python 或 Scrapy 模組",
+            "請確認：",
+            "  1. Python 已正確安裝",
+            "  2. 已執行: pip install -r requirements.txt",
+        ):
+            logger.error(line)
+        return False
+    except Exception as e:
+        logger.error(f"✗ 執行爬蟲時發生錯誤: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
+        return False
+
+    if result.returncode != 0:
+        logger.error(f"✗ 爬蟲執行失敗，退出碼: {result.returncode}")
+        return False
+
+    logger.info("✓ 爬蟲執行成功")
+    return True
+
+
+def send_email():
+    """Run ``send_mail.py`` in a child interpreter and relay its output.
+
+    The script is looked up next to this file and executed with that
+    directory as its working directory, so the outcome does not depend on
+    where this launcher was started from.
+
+    Returns:
+        True when the child process exits with status 0; False when it
+        exits with any other status or an exception is raised while
+        running it or printing its output.
+    """
+    logger.info("=" * 60)
+    logger.info("檢查郵件發送功能")
+    logger.info("=" * 60)
+
+    # An absolute path is required because the child resolves its arguments
+    # after the working directory has been switched.
+    project_dir = Path(__file__).resolve().parent
+
+    try:
+        result = subprocess.run(
+            [sys.executable, str(project_dir / "send_mail.py")],
+            cwd=str(project_dir),
+            capture_output=True,
+            text=True,
+            encoding='utf-8',
+            # Substitute undecodable bytes on every platform; otherwise a
+            # stray byte in the child's output raises UnicodeDecodeError
+            # and a successful send is reported as a failure.
+            errors='replace',
+        )
+
+        # Relay whatever the child printed on either stream.
+        if result.stdout:
+            print(result.stdout)
+
+        if result.stderr:
+            print(result.stderr)
+
+        return result.returncode == 0
+    except Exception as e:
+        logger.error(f"執行郵件發送時發生錯誤: {e}")
+        return False
+
+
+def check_csv_file():
+    """Check that ``hbr_articles.csv`` exists next to this script.
+
+    Returns:
+        True when the path exists (its size in bytes is logged), False
+        otherwise (a warning is logged).
+    """
+    csv_path = Path(__file__).parent / "hbr_articles.csv"
+
+    if not csv_path.exists():
+        logger.warning(f"✗ CSV 檔案不存在: {csv_path}")
+        return False
+
+    file_size = csv_path.stat().st_size
+    logger.info(f"[OK] CSV 檔案已產生: {csv_path} ({file_size:,} bytes)")
+    return True
+
+
+def main():
+    """Run the pipeline: crawl, check the CSV export, then try to send mail.
+
+    Returns:
+        0 when the crawl succeeded and the CSV file exists; 2 when the
+        crawl succeeded but the CSV file is missing. A failed crawl ends
+        the process through ``sys.exit(1)`` instead of returning.
+    """
+    separator = "=" * 60
+    start_time = datetime.now()
+    logger.info(f"爬蟲系統啟動時間: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Step 1: crawl. A failed crawl aborts the whole run.
+    crawler_success = run_scrapy_crawler()
+    if not crawler_success:
+        logger.error("爬蟲執行失敗，終止流程")
+        sys.exit(1)
+
+    # Step 2: a missing CSV is only a warning; the run continues.
+    csv_exists = check_csv_file()
+    if not csv_exists:
+        logger.warning("CSV 檔案未產生，可能沒有爬取到資料")
+
+    # Step 3: mail delivery is optional and never fails the run.
+    email_sent = send_email()
+
+    # Execution summary.
+    end_time = datetime.now()
+    duration = (end_time - start_time).total_seconds()
+    summary = (
+        separator,
+        "爬蟲系統執行完成",
+        separator,
+        "執行摘要：",
+        f"  [{'OK' if crawler_success else 'FAIL'}] 爬蟲執行: {'成功' if crawler_success else '失敗'}",
+        f"  [{'OK' if csv_exists else 'FAIL'}] CSV 檔案: {'已產生' if csv_exists else '未產生'}",
+        f"  [{'OK' if email_sent else 'SKIP'}] 郵件發送: {'已發送' if email_sent else '已跳過（未設定）'}",
+        f"  執行時間: {duration:.2f} 秒",
+        f"  完成時間: {end_time.strftime('%Y-%m-%d %H:%M:%S')}",
+    )
+    for line in summary:
+        logger.info(line)
+
+    # Repeat the CSV warning at the end so it is not lost in the crawl output.
+    if not csv_exists:
+        logger.warning("")
+        logger.warning("⚠️  注意：CSV 檔案未產生，請檢查爬蟲是否成功爬取到資料")
+
+    logger.info(separator)
+
+    # Exit status: 1 = crawl failed, 2 = crawl fine but no CSV (warning), 0 = all good.
+    if not crawler_success:
+        return 1
+    return 0 if csv_exists else 2
+
+
+def run_web_server():
+    """Import ``app`` from ``web_app`` and serve it on port 5000.
+
+    The call blocks inside ``app.run`` until the server stops. The server
+    is bound to ``0.0.0.0``, so it listens on every interface even though
+    the logged address mentions ``localhost``.
+
+    Raises:
+        SystemExit: with status 1 when ``web_app`` cannot be imported or
+            when importing or running the application raises an exception.
+    """
+    logger.info("=" * 60)
+    logger.info("啟動 Web 服務")
+    logger.info("=" * 60)
+
+    try:
+        try:
+            from web_app import app
+        except ImportError:
+            # ImportError is also raised when a dependency imported by
+            # web_app is missing, so the traceback is attached to show
+            # which module actually failed to load.
+            logger.error("無法匯入 web_app 模組，請確認 web_app.py 存在", exc_info=True)
+            sys.exit(1)
+
+        logger.info("Web 服務啟動成功")
+        logger.info("服務地址: http://localhost:5000")
+        logger.info("按 Ctrl+C 停止服務")
+        app.run(host='0.0.0.0', port=5000, debug=False)
+    except Exception as e:
+        # SystemExit and KeyboardInterrupt are not Exception subclasses, so
+        # the exit above and Ctrl+C both pass through this handler.
+        logger.error(f"啟動 Web 服務失敗: {e}", exc_info=True)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    # `--web` as the first argument starts the web server; anything else
+    # (including no argument) runs the crawler pipeline.
+    if sys.argv[1:2] != ['--web']:
+        try:
+            exit_code = main()
+        except KeyboardInterrupt:
+            logger.info("\n使用者中斷執行")
+            sys.exit(130)
+        except Exception as e:
+            logger.error(f"發生未預期的錯誤: {e}", exc_info=True)
+            sys.exit(1)
+        else:
+            sys.exit(exit_code)
+    else:
+        try:
+            run_web_server()
+        except KeyboardInterrupt:
+            logger.info("\nWeb 服務已停止")
+            sys.exit(0)
+