Initial commit: HBR article crawler project

- Scrapy-based crawler that fetches HBR Traditional Chinese articles
- Flask web application providing an article query interface
- SQL Server database integration
- Automated scheduling and email notifications

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 17:19:56 +08:00
commit f524713cb6
35 changed files with 6719 additions and 0 deletions

web_app.py (new file, 557 lines)

@@ -0,0 +1,557 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
HBR crawler system web service
Provides a query interface and statistics features
"""
import os
import sys
from pathlib import Path
from flask import Flask, render_template, request, jsonify
from datetime import datetime, timedelta
import logging
try:
from flask_cors import CORS
CORS_AVAILABLE = True
except ImportError:
CORS_AVAILABLE = False
# Add the project root to the Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from hbr_crawler.hbr_crawler.database import get_database_manager
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__,
template_folder='templates',
static_folder='static')
# Enable CORS (if cross-origin requests are needed)
if CORS_AVAILABLE:
CORS(app)
# Database manager (created lazily as a singleton)
db_manager = None
def get_db():
"""取得資料庫管理物件(單例模式)"""
global db_manager
if db_manager is None:
db_manager = get_database_manager()
return db_manager
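# Note: everything below assumes that get_database_manager() returns an object whose
# execute_query(sql, params=None, database=None) accepts MySQL-style %s placeholders
# and returns rows as a list of dicts. The SQL itself is MySQL/MariaDB-flavoured
# (GROUP_CONCAT, DATE_SUB, LIMIT/OFFSET), so adjust if a different backend is used.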
@app.route('/')
def index():
"""首頁"""
return render_template('index.html')
@app.route('/api/articles', methods=['GET'])
def get_articles():
"""取得文章列表 API"""
try:
db = get_db()
        # Read query parameters
page = int(request.args.get('page', 1))
per_page = int(request.args.get('per_page', 20))
category = request.args.get('category', '')
tag = request.args.get('tag', '')
start_date = request.args.get('start_date', '')
end_date = request.args.get('end_date', '')
keyword = request.args.get('keyword', '')
is_paywalled = request.args.get('is_paywalled', '')
language = request.args.get('language', '')
        # Build WHERE conditions
where_conditions = []
params = []
        # Build conditions against alias 'a' (the query may use a JOIN)
if category:
where_conditions.append("a.category = %s")
params.append(category)
if tag:
            # Filter tags through the article_tags junction table
where_conditions.append("t.name LIKE %s")
params.append(f'%{tag}%')
use_join = True
else:
use_join = False
if start_date:
where_conditions.append("a.publish_date >= %s")
params.append(start_date)
if end_date:
where_conditions.append("a.publish_date <= %s")
params.append(end_date)
if keyword:
where_conditions.append("(a.title LIKE %s OR a.summary LIKE %s OR a.content LIKE %s)")
params.extend([f'%{keyword}%', f'%{keyword}%', f'%{keyword}%'])
if is_paywalled != '':
where_conditions.append("a.is_paywalled = %s")
params.append(int(is_paywalled))
        # The language column does not exist yet, so this filter is skipped for now
        # if language:
        #     where_conditions.append("a.language = %s")
        #     params.append(language)
where_clause = " AND ".join(where_conditions) if where_conditions else "1=1"
        # Count total rows
if use_join:
count_query = f"""
SELECT COUNT(DISTINCT a.id) as count
FROM articles a
LEFT JOIN article_tags at ON a.id = at.article_id
LEFT JOIN tags t ON at.tag_id = t.id
WHERE {where_clause}
"""
else:
count_query = f"SELECT COUNT(*) as count FROM articles a WHERE {where_clause}"
count_params = tuple(params) if params else None
count_result = db.execute_query(count_query, count_params, database='db_A101')
total = count_result[0]['count'] if count_result and len(count_result) > 0 else 0
        # Fetch the article list
offset = (page - 1) * per_page
query_params = list(params)
query_params.extend([per_page, offset])
        # Query the article list (LEFT JOIN to collect tags)
if use_join:
query = f"""
SELECT a.id, a.title, a.url, a.author, a.publish_date, a.summary,
a.is_paywalled, a.category, a.crawled_at,
GROUP_CONCAT(DISTINCT t.name SEPARATOR ', ') as tags
FROM articles a
LEFT JOIN article_tags at ON a.id = at.article_id
LEFT JOIN tags t ON at.tag_id = t.id
WHERE {where_clause}
GROUP BY a.id, a.title, a.url, a.author, a.publish_date, a.summary,
a.is_paywalled, a.category, a.crawled_at
ORDER BY a.crawled_at DESC
LIMIT %s OFFSET %s
"""
else:
            # No tag filter: use a subquery to collect the tags
query = f"""
SELECT a.id, a.title, a.url, a.author, a.publish_date, a.summary,
a.is_paywalled, a.category, a.crawled_at,
(SELECT GROUP_CONCAT(t.name SEPARATOR ', ')
FROM article_tags at
INNER JOIN tags t ON at.tag_id = t.id
WHERE at.article_id = a.id) as tags
FROM articles a
WHERE {where_clause}
ORDER BY a.crawled_at DESC
LIMIT %s OFFSET %s
"""
articles = db.execute_query(query, tuple(query_params), database='db_A101')
        # Ensure articles is a list
if not articles:
articles = []
        # Add default values and normalize data formats for each article
for article in articles:
if 'tags' not in article or article['tags'] is None:
article['tags'] = ''
if 'language' not in article:
article['language'] = 'zh-TW'
            # Normalize date formats
if article.get('publish_date') and isinstance(article['publish_date'], datetime):
article['publish_date'] = article['publish_date'].strftime('%Y-%m-%d %H:%M:%S')
if article.get('crawled_at') and isinstance(article['crawled_at'], datetime):
article['crawled_at'] = article['crawled_at'].strftime('%Y-%m-%d %H:%M:%S')
logger.info(f"查詢到 {len(articles)} 篇文章,總數: {total}")
return jsonify({
'success': True,
'data': articles,
'pagination': {
'page': page,
'per_page': per_page,
'total': total,
'pages': (total + per_page - 1) // per_page
}
})
except Exception as e:
logger.error(f"取得文章列表失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/article/<int:article_id>', methods=['GET'])
def get_article(article_id):
"""取得單篇文章詳情"""
try:
db = get_db()
query = "SELECT * FROM articles WHERE id = %s"
result = db.execute_query(query, (article_id,), database='db_A101')
if result:
return jsonify({'success': True, 'data': result[0]})
else:
return jsonify({'success': False, 'error': '文章不存在'}), 404
except Exception as e:
logger.error(f"取得文章詳情失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/statistics', methods=['GET'])
def get_statistics():
"""取得統計資料"""
try:
db = get_db()
db_name = 'db_A101'
stats = {}
        # Total article count
total_result = db.execute_query("SELECT COUNT(*) as count FROM articles", database=db_name)
stats['total_articles'] = total_result[0]['count'] if total_result else 0
        # Paywalled vs. free article counts
paywall_result = db.execute_query(
"SELECT is_paywalled, COUNT(*) as count FROM articles GROUP BY is_paywalled",
database=db_name
)
stats['paywall'] = {row['is_paywalled']: row['count'] for row in paywall_result}
        # Category distribution (top 10)
category_result = db.execute_query(
"SELECT category, COUNT(*) as count FROM articles WHERE category IS NOT NULL AND category != '' GROUP BY category ORDER BY count DESC LIMIT 10",
database=db_name
)
stats['categories'] = [{'name': row['category'], 'count': row['count']} for row in category_result]
        # Author counts (top 10)
author_result = db.execute_query(
"SELECT author, COUNT(*) as count FROM articles WHERE author IS NOT NULL AND author != '' GROUP BY author ORDER BY count DESC LIMIT 10",
database=db_name
)
stats['authors'] = [{'name': row['author'], 'count': row['count']} for row in author_result]
        # Language distribution (note: the article list API above treats the language column as absent)
language_result = db.execute_query(
"SELECT language, COUNT(*) as count FROM articles GROUP BY language",
database=db_name
)
stats['languages'] = {row['language']: row['count'] for row in language_result}
        # Article count trend over the last 30 days
date_result = db.execute_query(
"""
SELECT DATE(crawled_at) as date, COUNT(*) as count
FROM articles
WHERE crawled_at >= DATE_SUB(NOW(), INTERVAL 30 DAY)
GROUP BY DATE(crawled_at)
ORDER BY date DESC
""",
database=db_name
)
stats['daily_trend'] = [{'date': str(row['date']), 'count': row['count']} for row in date_result]
return jsonify({'success': True, 'data': stats})
except Exception as e:
logger.error(f"取得統計資料失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/categories', methods=['GET'])
def get_categories():
"""取得所有分類列表"""
try:
db = get_db()
result = db.execute_query(
"SELECT DISTINCT category FROM articles WHERE category IS NOT NULL AND category != '' ORDER BY category",
database='db_A101'
)
categories = [row['category'] for row in result]
return jsonify({'success': True, 'data': categories})
except Exception as e:
logger.error(f"取得分類列表失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/tags', methods=['GET'])
def get_tags():
"""取得所有標籤列表"""
try:
db = get_db()
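        # NOTE: this reads a denormalized articles.tags column; the article list API
        # above collects tags from the tags / article_tags tables instead, so the two
        # may diverge depending on the actual schema.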
result = db.execute_query(
"SELECT DISTINCT tags FROM articles WHERE tags IS NOT NULL AND tags != ''",
database='db_A101'
)
        # Parse comma-separated tags
all_tags = set()
for row in result:
tags = [t.strip() for t in row['tags'].split(',') if t.strip()]
all_tags.update(tags)
return jsonify({'success': True, 'data': sorted(list(all_tags))})
except Exception as e:
logger.error(f"取得標籤列表失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/run-crawler', methods=['POST'])
def run_crawler():
"""手動觸發爬蟲執行"""
try:
import subprocess
result = subprocess.run(
[sys.executable, 'run_crawler.py'],
capture_output=True,
text=True,
            timeout=300  # 5-minute timeout
)
return jsonify({
'success': result.returncode == 0,
'output': result.stdout,
'error': result.stderr
})
except subprocess.TimeoutExpired:
return jsonify({'success': False, 'error': '爬蟲執行超時'}), 500
except Exception as e:
logger.error(f"執行爬蟲失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/save-crawler-config', methods=['POST'])
def save_crawler_config():
"""儲存爬蟲設定"""
try:
config = request.get_json()
        # Validate the payload
if not config:
return jsonify({'success': False, 'error': '設定資料為空'}), 400
        # Persist to a file (optional)
import json
config_file = Path(__file__).parent / 'crawler_config.json'
with open(config_file, 'w', encoding='utf-8') as f:
json.dump(config, f, ensure_ascii=False, indent=2)
logger.info(f"爬蟲設定已儲存: {config_file}")
return jsonify({'success': True, 'message': '設定已儲存'})
except Exception as e:
logger.error(f"儲存爬蟲設定失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/load-crawler-config', methods=['GET'])
def load_crawler_config():
"""載入爬蟲設定"""
try:
config_file = Path(__file__).parent / 'crawler_config.json'
if config_file.exists():
import json
with open(config_file, 'r', encoding='utf-8') as f:
config = json.load(f)
return jsonify({'success': True, 'data': config})
else:
            # Return default settings
default_config = {
'urls': [
'https://www.hbrtaiwan.com/',
'https://www.hbrtaiwan.com/topic/management',
'https://www.hbrtaiwan.com/topic/leadership',
'https://www.hbrtaiwan.com/topic/strategy',
'https://www.hbrtaiwan.com/topic/innovation',
'https://www.hbrtaiwan.com/topic/technology'
],
'downloadDelay': 1,
'maxDepth': 3,
'concurrentRequests': 16,
'skipPaywalled': True,
'followPagination': True,
'obeyRobotsTxt': True,
'articleListSelector': '.articleItem, article, .article-item, .post-item, .content-item',
'titleSelector': 'h1.articleTitle, h1.article-title, h1, .article-title, .post-title',
'authorSelector': '.authorName, .author, .byline, .writer, .author-name',
'contentSelector': '.articleContent, .article-content, .post-content, .content, .articleText'
}
return jsonify({'success': True, 'data': default_config})
except Exception as e:
logger.error(f"載入爬蟲設定失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/test-crawler-config', methods=['POST'])
def test_crawler_config():
"""測試爬蟲設定(僅測試第一個 URL"""
try:
config = request.get_json()
if not config or not config.get('urls') or len(config['urls']) == 0:
return jsonify({'success': False, 'error': '請至少提供一個起始 URL'}), 400
        # Use Scrapy to test the first URL
import subprocess
import tempfile
import json
        # Create a temporary config file (note: currently written but not passed on to the spider below)
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
json.dump(config, f, ensure_ascii=False, indent=2)
temp_config_file = f.name
try:
            # Run a test crawl (first URL only, depth 1)
            # Use the stats collector to obtain accurate statistics
test_result = subprocess.run(
[sys.executable, '-m', 'scrapy', 'crawl', 'hbr',
'-a', f'start_url={config["urls"][0]}',
'-a', 'test_mode=true',
'-s', 'LOG_LEVEL=INFO',
'-s', 'STATS_CLASS=scrapy.statscollectors.MemoryStatsCollector'],
cwd=str(Path(__file__).parent / 'hbr_crawler'),
capture_output=True,
text=True,
timeout=60,
encoding='utf-8',
errors='replace'
)
            # Parse the number of scraped articles from the output
articles_found = 0
output_lines = test_result.stdout.split('\n') if test_result.stdout else []
stderr_lines = test_result.stderr.split('\n') if test_result.stderr else []
all_lines = output_lines + stderr_lines
import re
            # Method 1: look for item_scraped_count in the Scrapy stats
            # Scrapy output format: 'item_scraped_count': 5 or 'items': 5
for line in all_lines:
                # Match 'item_scraped_count': <number> or 'items': <number>
match = re.search(r"['\"]?item_scraped_count['\"]?\s*[:=]\s*(\d+)", line, re.IGNORECASE)
if match:
articles_found = int(match.group(1))
break
                # Match 'items': <number> (in some Scrapy versions)
match = re.search(r"['\"]?items['\"]?\s*[:=]\s*(\d+)", line, re.IGNORECASE)
if match:
articles_found = int(match.group(1))
break
            # Method 2: look for "Scraped from" / "item_scraped" messages in the log
if articles_found == 0:
for line in all_lines:
                    # Scrapy log format: [hbr] DEBUG: Scraped from <200 https://...>
if 'Scraped from' in line or 'item_scraped' in line.lower():
articles_found += 1
            # Method 3: look for the count in the stats summary (format: "items": 5)
if articles_found == 0:
for line in all_lines:
                    # Match JSON-style stats: "items": 5 or 'items': 5
match = re.search(r"['\"]items['\"]\s*:\s*(\d+)", line, re.IGNORECASE)
if match:
articles_found = int(match.group(1))
break
            # Method 4: if the count is still 0, check for errors or warnings
if articles_found == 0:
has_error = False
error_lines = []
for line in all_lines:
if 'ERROR' in line.upper() or 'CRITICAL' in line.upper():
has_error = True
error_lines.append(line)
elif 'No module named' in line or 'ImportError' in line:
has_error = True
error_lines.append(line)
if has_error:
                    error_msg = '\n'.join(error_lines[:5])  # keep only the first 5 error lines
return jsonify({
'success': False,
                        'error': '爬蟲執行時發生錯誤',
'data': {
'articles_found': 0,
'output': test_result.stdout[:1000] if test_result.stdout else '',
'error': error_msg[:500],
'returncode': test_result.returncode
}
})
            # If articles were found, return success; if none were found but there was no error, it is likely a selector problem
if articles_found == 0:
                # Check whether the site was reached at all
has_connection = False
for line in all_lines:
if '200' in line or 'downloaded' in line.lower() or 'response' in line.lower():
has_connection = True
break
if has_connection:
return jsonify({
'success': True,
'data': {
'articles_found': 0,
'output': test_result.stdout[:1000] if test_result.stdout else '',
'error': '成功連接到網站,但未找到文章。可能是 CSS 選擇器不正確,或網站結構已變更。',
'returncode': test_result.returncode,
'warning': '未找到文章,請檢查 CSS 選擇器設定'
}
})
return jsonify({
'success': True,
'data': {
'articles_found': articles_found,
                    'output': test_result.stdout[:1000] if test_result.stdout else '',  # first 1000 characters only
'error': test_result.stderr[:500] if test_result.stderr else '',
'returncode': test_result.returncode
}
})
finally:
            # Clean up the temporary config file
try:
os.unlink(temp_config_file)
            except OSError:
                pass
except subprocess.TimeoutExpired:
return jsonify({'success': False, 'error': '測試超時'}), 500
except Exception as e:
logger.error(f"測試爬蟲設定失敗: {e}")
return jsonify({'success': False, 'error': str(e)}), 500
if __name__ == '__main__':
    # Create required directories
os.makedirs('templates', exist_ok=True)
os.makedirs('static', exist_ok=True)
    # Start the service (Flask dev server with debug=True; not intended for production)
print("=" * 60)
print("HBR 爬蟲系統 Web 服務")
print("=" * 60)
print("服務地址: http://localhost:5000")
print("按 Ctrl+C 停止服務")
print("=" * 60)
app.run(host='0.0.0.0', port=5000, debug=True)