hbr-crawler/run_crawler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
HBR 爬蟲系統主啟動腳本
整合爬蟲執行、資料庫儲存、CSV 匯出和郵件發送功能
"""
import os
import sys
import subprocess
import logging
from pathlib import Path
from datetime import datetime

# Configure logging: console output plus a UTF-8 log file.
# Create the log directory first so the FileHandler can always be attached
# (the original conditional relied on os.makedirs() as a side effect, which
# returns None and silently dropped the file handler on the first run).
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('logs/crawler.log', encoding='utf-8'),
    ]
)
logger = logging.getLogger(__name__)


def check_scrapy_installed():
    """Check whether Scrapy is installed."""
    try:
        import scrapy
        logger.info(f"✓ Scrapy is installed (version: {scrapy.__version__})")
        return True
    except ImportError:
        logger.error("✗ Scrapy is not installed")
        logger.error("Install the dependencies with:")
        logger.error("    pip install -r requirements.txt")
        return False


def run_scrapy_crawler():
    """Run the Scrapy crawler."""
    logger.info("=" * 60)
    logger.info("Starting the HBR crawler")
    logger.info("=" * 60)

    # Make sure Scrapy is available before shelling out
    if not check_scrapy_installed():
        return False

    # Switch to the crawler project directory
    crawler_dir = Path(__file__).parent / "hbr_crawler"
    if not crawler_dir.exists():
        logger.error(f"✗ Crawler directory does not exist: {crawler_dir}")
        return False

    try:
        # Run Scrapy via "python -m scrapy" (more reliable than depending on
        # a scrapy executable being on PATH)
        result = subprocess.run(
            [sys.executable, "-m", "scrapy", "crawl", "hbr"],
            cwd=str(crawler_dir),
            capture_output=False,  # stream output straight to the console
            text=True
        )
        if result.returncode == 0:
            logger.info("✓ Crawler finished successfully")
            return True
        else:
            logger.error(f"✗ Crawler failed, exit code: {result.returncode}")
            return False
    except FileNotFoundError:
        logger.error("✗ Could not find Python or the Scrapy module")
        logger.error("Please check that:")
        logger.error("  1. Python is installed correctly")
        logger.error("  2. You have run: pip install -r requirements.txt")
        return False
    except Exception as e:
        logger.error(f"✗ Error while running the crawler: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False
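
# A minimal in-process alternative to the subprocess call above, assuming the
# Scrapy project settings are discoverable from this directory and the spider
# is registered as "hbr" (a sketch only, not part of the pipeline):
#
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     process = CrawlerProcess(get_project_settings())
#     process.crawl("hbr")
#     process.start()  # blocks until the crawl finishes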


def send_email():
    """Send the notification email (if configured)."""
    logger.info("=" * 60)
    logger.info("Checking the email notification step")
    logger.info("=" * 60)
    try:
        # Capture output as UTF-8; on Windows the console code page is often
        # not UTF-8, so replace undecodable bytes instead of raising
        if sys.platform == 'win32':
            result = subprocess.run(
                [sys.executable, "send_mail.py"],
                capture_output=True,
                text=True,
                encoding='utf-8',
                errors='replace'  # substitute bad bytes rather than failing
            )
        else:
            result = subprocess.run(
                [sys.executable, "send_mail.py"],
                capture_output=True,
                text=True,
                encoding='utf-8'
            )
        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print(result.stderr)
        return result.returncode == 0
    except Exception as e:
        logger.error(f"Error while sending the email: {e}")
        return False


def check_csv_file():
    """Check whether the CSV file was produced."""
    csv_path = Path(__file__).parent / "hbr_articles.csv"
    if csv_path.exists():
        file_size = csv_path.stat().st_size
        logger.info(f"✓ CSV file generated: {csv_path} ({file_size:,} bytes)")
        return True
    else:
        logger.warning(f"✗ CSV file not found: {csv_path}")
        return False
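
# The path above only gets verified here; the CSV itself is presumably written
# by the Scrapy project's feed export (FEEDS) or an item pipeline.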


def main():
    """Main entry point for the crawl pipeline."""
    start_time = datetime.now()
    logger.info(f"Crawler system started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Step 1: run the crawler
    crawler_success = run_scrapy_crawler()
    if not crawler_success:
        logger.error("Crawler failed; aborting the pipeline")
        sys.exit(1)

    # Step 2: check the CSV output
    csv_exists = check_csv_file()
    if not csv_exists:
        logger.warning("No CSV file was produced; the crawl may not have collected any data")

    # Step 3: send the notification email (optional)
    email_sent = send_email()

    # Done: print the run summary
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("Crawler system run complete")
    logger.info("=" * 60)
    logger.info("Run summary:")
    logger.info(f"  [{'OK' if crawler_success else 'FAIL'}] Crawler: {'succeeded' if crawler_success else 'failed'}")
    logger.info(f"  [{'OK' if csv_exists else 'FAIL'}] CSV file: {'generated' if csv_exists else 'not generated'}")
    # NOTE: a failed send is reported the same way as an unconfigured one
    logger.info(f"  [{'OK' if email_sent else 'SKIP'}] Email: {'sent' if email_sent else 'skipped (not configured)'}")
    logger.info(f"  Elapsed time: {duration:.2f} seconds")
    logger.info(f"  Finished at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Surface any caveats
    if not csv_exists:
        logger.warning("")
        logger.warning("⚠️ Note: no CSV file was generated; check whether the crawler actually scraped any data")
    logger.info("=" * 60)

    # Crawler succeeded but produced no CSV: return a warning code, not an error
    if crawler_success and not csv_exists:
        return 2  # warning exit code
    return 0 if crawler_success else 1


def run_web_server():
    """Start the web service."""
    logger.info("=" * 60)
    logger.info("Starting the web service")
    logger.info("=" * 60)
    try:
        from web_app import app
        logger.info("Web service starting")
        logger.info("Serving at: http://localhost:5000")
        logger.info("Press Ctrl+C to stop")
        app.run(host='0.0.0.0', port=5000, debug=False)
    except ImportError:
        logger.error("Could not import the web_app module; make sure web_app.py exists")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Failed to start the web service: {e}")
        sys.exit(1)
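
# Note: app.run() above uses Flask's built-in development server; for a
# production deployment, a WSGI server such as waitress or gunicorn would be
# the usual choice.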


if __name__ == '__main__':
    # With --web, start the web service instead of the crawler
    if len(sys.argv) > 1 and sys.argv[1] == '--web':
        try:
            run_web_server()
        except KeyboardInterrupt:
            logger.info("\nWeb service stopped")
            sys.exit(0)
    else:
        # Default: run the crawl pipeline
        try:
            sys.exit(main())
        except KeyboardInterrupt:
            logger.info("\nInterrupted by user")
            sys.exit(130)
        except Exception as e:
            logger.error(f"Unexpected error: {e}", exc_info=True)
            sys.exit(1)