- Scrapy crawler that scrapes HBR Traditional Chinese articles
- Flask web application providing an article query interface
- SQL Server database integration
- Automated scheduling and email notification

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
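Usage, taken directly from the `__main__` block of `run.py` below (no other flags are defined):

    python run.py        # default: crawl, CSV export, then email notification
    python run.py --web  # start the Flask query interface on http://localhost:5000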
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Main launcher script for the HBR crawler system.

Integrates crawler execution, database storage, CSV export,
and email notification.
"""
import os
import sys
import subprocess
import logging
from pathlib import Path
from datetime import datetime

# Set up logging to both the console and logs/crawler.log.
# Create the log directory up front so the FileHandler can open its file.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('logs/crawler.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)


def check_scrapy_installed():
    """Check whether Scrapy is installed."""
    try:
        import scrapy
        logger.info(f"✓ Scrapy is installed (version: {scrapy.__version__})")
        return True
    except ImportError:
        logger.error("✗ Scrapy is not installed")
        logger.error("Install the dependencies with:")
        logger.error("  pip install -r requirements.txt")
        return False

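# A lighter-weight check could probe for the package without importing it:
#
#     import importlib.util
#     scrapy_available = importlib.util.find_spec("scrapy") is not None
#
# The try/except import used above has the advantage of also reporting the
# installed version.
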
def run_scrapy_crawler():
    """Run the Scrapy crawler."""
    logger.info("=" * 60)
    logger.info("Starting the HBR crawler")
    logger.info("=" * 60)

    # Make sure Scrapy is available before shelling out
    if not check_scrapy_installed():
        return False

    # The Scrapy project lives in the hbr_crawler/ subdirectory
    crawler_dir = Path(__file__).parent / "hbr_crawler"

    if not crawler_dir.exists():
        logger.error(f"✗ Crawler directory does not exist: {crawler_dir}")
        return False

    try:
        # Run Scrapy as `python -m scrapy` (more reliable than calling the
        # `scrapy` executable, which may not be on PATH)
        result = subprocess.run(
            [sys.executable, "-m", "scrapy", "crawl", "hbr"],
            cwd=str(crawler_dir),
            capture_output=False,  # stream output straight to the console
            text=True
        )

        if result.returncode == 0:
            logger.info("✓ Crawler finished successfully")
            return True
        else:
            logger.error(f"✗ Crawler failed with exit code {result.returncode}")
            return False

    except FileNotFoundError:
        logger.error("✗ Could not find Python or the Scrapy module")
        logger.error("Please confirm that:")
        logger.error("  1. Python is installed correctly")
        logger.error("  2. You have run: pip install -r requirements.txt")
        return False
    except Exception as e:
        logger.error(f"✗ Error while running the crawler: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False

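# The crawl runs in a child process. An in-process alternative would be
# Scrapy's CrawlerProcess (a sketch, not what this script does):
#
#     from scrapy.crawler import CrawlerProcess
#     from scrapy.utils.project import get_project_settings
#
#     process = CrawlerProcess(get_project_settings())
#     process.crawl("hbr")
#     process.start()
#
# The subprocess approach keeps repeated runs simple, since the Twisted
# reactor cannot be restarted within a single process.
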
def send_email():
    """Send the notification email (if configured)."""
    logger.info("=" * 60)
    logger.info("Checking the email notification step")
    logger.info("=" * 60)

    try:
        if sys.platform == 'win32':
            # On Windows, replace undecodable bytes in the child's output
            # instead of failing on an encoding error
            result = subprocess.run(
                [sys.executable, "send_mail.py"],
                capture_output=True,
                text=True,
                encoding='utf-8',
                errors='replace'
            )
        else:
            result = subprocess.run(
                [sys.executable, "send_mail.py"],
                capture_output=True,
                text=True,
                encoding='utf-8'
            )

        if result.stdout:
            print(result.stdout)

        if result.stderr:
            print(result.stderr)

        return result.returncode == 0
    except Exception as e:
        logger.error(f"Error while sending email: {e}")
        return False

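# send_mail.py is a separate script in this repo and is not shown here. As a
# rough sketch only, such a script often boils down to smtplib plus
# email.message; every name below is hypothetical, not this repo's code:
#
#     import os
#     import smtplib
#     from email.message import EmailMessage
#
#     msg = EmailMessage()
#     msg["Subject"] = "HBR crawl results"
#     msg["From"] = os.environ["MAIL_FROM"]   # hypothetical env vars
#     msg["To"] = os.environ["MAIL_TO"]
#     msg.set_content("See the attached hbr_articles.csv")
#     with smtplib.SMTP(os.environ["SMTP_HOST"]) as server:
#         server.send_message(msg)
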
def check_csv_file():
    """Check whether the CSV file was produced."""
    csv_path = Path(__file__).parent / "hbr_articles.csv"

    if csv_path.exists():
        file_size = csv_path.stat().st_size
        logger.info(f"[OK] CSV file produced: {csv_path} ({file_size:,} bytes)")
        return True
    else:
        logger.warning(f"✗ CSV file not found: {csv_path}")
        return False

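# Note the path coupling: run_scrapy_crawler() executes with
# cwd=hbr_crawler/, yet the CSV is expected here, next to this script. That
# only works if the Scrapy project writes its feed one directory up,
# presumably via a feed-export setting along these lines (an assumption, not
# verified against the project's settings.py):
#
#     FEEDS = {"../hbr_articles.csv": {"format": "csv", "encoding": "utf-8"}}
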
def main():
    """Main entry point: crawl, verify the CSV, then send email."""
    start_time = datetime.now()
    logger.info(f"Crawler system started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Step 1: run the crawler
    crawler_success = run_scrapy_crawler()

    if not crawler_success:
        logger.error("Crawler failed; aborting")
        sys.exit(1)

    # Step 2: check that the CSV file was produced
    csv_exists = check_csv_file()

    if not csv_exists:
        logger.warning("No CSV file was produced; the crawl may not have scraped any data")

    # Step 3: send the notification email (optional)
    email_sent = send_email()

    # Done: print an execution summary
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("Crawler system run complete")
    logger.info("=" * 60)
    logger.info("Summary:")
    logger.info(f"  [{'OK' if crawler_success else 'FAIL'}] Crawl: {'succeeded' if crawler_success else 'failed'}")
    logger.info(f"  [{'OK' if csv_exists else 'FAIL'}] CSV file: {'produced' if csv_exists else 'missing'}")
    logger.info(f"  [{'OK' if email_sent else 'SKIP'}] Email: {'sent' if email_sent else 'skipped (not configured)'}")
    logger.info(f"  Duration: {duration:.2f} s")
    logger.info(f"  Finished at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Warn loudly when the crawl succeeded but produced nothing
    if not csv_exists:
        logger.warning("")
        logger.warning("⚠️ Note: no CSV file was produced; check whether the crawler actually scraped any data")

    logger.info("=" * 60)

    # Crawl succeeded but no CSV: return a warning code rather than an error
    if crawler_success and not csv_exists:
        return 2  # warning code
    return 0 if crawler_success else 1

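# Exit-code contract for whatever wraps this script (for example, the
# scheduled job mentioned in the project description):
#   0   crawl succeeded and the CSV was produced
#   1   crawl failed
#   2   crawl succeeded but no CSV was produced (warning, not an error)
#   130 interrupted by the user (Ctrl+C)
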
def run_web_server():
    """Start the web service."""
    logger.info("=" * 60)
    logger.info("Starting the web service")
    logger.info("=" * 60)

    try:
        from web_app import app
        logger.info("Web service starting")
        logger.info("Address: http://localhost:5000")
        logger.info("Press Ctrl+C to stop")
        app.run(host='0.0.0.0', port=5000, debug=False)
    except ImportError:
        logger.error("Could not import the web_app module; make sure web_app.py exists")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Failed to start the web service: {e}")
        sys.exit(1)

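# app.run() is Flask's built-in development server, which is fine for local
# use. For anything production-facing, the usual choice is a WSGI server
# (for example waitress or gunicorn) serving the same `app` object; this is
# a general note, not something this script configures. Also note that
# host='0.0.0.0' exposes the UI on every network interface, not only
# localhost.
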
if __name__ == '__main__':
    # `--web` starts the Flask query UI; the default runs the crawl pipeline
    if len(sys.argv) > 1 and sys.argv[1] == '--web':
        try:
            run_web_server()
        except KeyboardInterrupt:
            logger.info("\nWeb service stopped")
            sys.exit(0)
    else:
        # Default: run the crawler pipeline
        try:
            sys.exit(main())
        except KeyboardInterrupt:
            logger.info("\nInterrupted by user")
            sys.exit(130)
        except Exception as e:
            logger.error(f"Unexpected error: {e}", exc_info=True)
            sys.exit(1)