Initial commit: HBR 文章爬蟲專案

- Scrapy 爬蟲框架,爬取 HBR 繁體中文文章
- Flask Web 應用程式,提供文章查詢介面
- SQL Server 資料庫整合
- 自動化排程與郵件通知功能

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-03 17:19:56 +08:00
commit f524713cb6
35 changed files with 6719 additions and 0 deletions

55
create_tables.sql Normal file
View File

@@ -0,0 +1,55 @@
-- HBR 爬蟲系統資料表結構
-- 資料庫: HBR_scraper
-- 建立日期: 2024-12-22
-- 1. 文章主表 (articles)
CREATE TABLE IF NOT EXISTS `articles` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '文章唯一識別碼',
`title` VARCHAR(500) NOT NULL COMMENT '文章標題',
`url` VARCHAR(1000) NOT NULL COMMENT '文章網址',
`author` VARCHAR(200) DEFAULT NULL COMMENT '作者名稱',
`publish_date` DATETIME DEFAULT NULL COMMENT '發布日期',
`summary` TEXT DEFAULT NULL COMMENT '文章摘要',
`is_paywalled` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否為付費文章 (1=是, 0=否)',
`category` VARCHAR(100) DEFAULT NULL COMMENT '文章分類',
`tags` VARCHAR(500) DEFAULT NULL COMMENT '標籤(逗號分隔字串,非正規化設計)',
`content` TEXT DEFAULT NULL COMMENT '文章內容(僅非付費文章)',
`language` VARCHAR(10) DEFAULT 'zh-TW' COMMENT '語言代碼zh-TW, en, ko',
`crawl_count` INT DEFAULT 1 COMMENT '爬取次數(用於追蹤歷史記錄)',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '資料建立時間',
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '資料更新時間',
`crawled_at` DATETIME DEFAULT NULL COMMENT '爬取時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_url` (`url`(255)),
KEY `idx_publish_date` (`publish_date`),
KEY `idx_category` (`category`),
KEY `idx_is_paywalled` (`is_paywalled`),
KEY `idx_crawled_at` (`crawled_at`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章主表';
-- 2. 標籤表 (tags)
CREATE TABLE IF NOT EXISTS `tags` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '標籤唯一識別碼',
`name` VARCHAR(100) NOT NULL COMMENT '標籤名稱',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_name` (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='標籤表';
-- 3. 文章標籤關聯表 (article_tags)
CREATE TABLE IF NOT EXISTS `article_tags` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '關聯唯一識別碼',
`article_id` BIGINT UNSIGNED NOT NULL COMMENT '文章 ID',
`tag_id` BIGINT UNSIGNED NOT NULL COMMENT '標籤 ID',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_article_tag` (`article_id`, `tag_id`),
KEY `idx_article_id` (`article_id`),
KEY `idx_tag_id` (`tag_id`),
CONSTRAINT `fk_article_tags_article` FOREIGN KEY (`article_id`)
REFERENCES `articles` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT `fk_article_tags_tag` FOREIGN KEY (`tag_id`)
REFERENCES `tags` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章標籤關聯表';