Initial commit: HBR 文章爬蟲專案
- Scrapy 爬蟲框架,爬取 HBR 繁體中文文章 - Flask Web 應用程式,提供文章查詢介面 - SQL Server 資料庫整合 - 自動化排程與郵件通知功能 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
55
create_tables.sql
Normal file
55
create_tables.sql
Normal file
@@ -0,0 +1,55 @@
|
||||
-- HBR 爬蟲系統資料表結構
|
||||
-- 資料庫: HBR_scraper
|
||||
-- 建立日期: 2024-12-22
|
||||
|
||||
-- 1. 文章主表 (articles)
|
||||
CREATE TABLE IF NOT EXISTS `articles` (
|
||||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '文章唯一識別碼',
|
||||
`title` VARCHAR(500) NOT NULL COMMENT '文章標題',
|
||||
`url` VARCHAR(1000) NOT NULL COMMENT '文章網址',
|
||||
`author` VARCHAR(200) DEFAULT NULL COMMENT '作者名稱',
|
||||
`publish_date` DATETIME DEFAULT NULL COMMENT '發布日期',
|
||||
`summary` TEXT DEFAULT NULL COMMENT '文章摘要',
|
||||
`is_paywalled` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否為付費文章 (1=是, 0=否)',
|
||||
`category` VARCHAR(100) DEFAULT NULL COMMENT '文章分類',
|
||||
`tags` VARCHAR(500) DEFAULT NULL COMMENT '標籤(逗號分隔字串,非正規化設計)',
|
||||
`content` TEXT DEFAULT NULL COMMENT '文章內容(僅非付費文章)',
|
||||
`language` VARCHAR(10) DEFAULT 'zh-TW' COMMENT '語言代碼(zh-TW, en, ko)',
|
||||
`crawl_count` INT DEFAULT 1 COMMENT '爬取次數(用於追蹤歷史記錄)',
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '資料建立時間',
|
||||
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '資料更新時間',
|
||||
`crawled_at` DATETIME DEFAULT NULL COMMENT '爬取時間',
|
||||
PRIMARY KEY (`id`),
|
||||
UNIQUE KEY `idx_url` (`url`(255)),
|
||||
KEY `idx_publish_date` (`publish_date`),
|
||||
KEY `idx_category` (`category`),
|
||||
KEY `idx_is_paywalled` (`is_paywalled`),
|
||||
KEY `idx_crawled_at` (`crawled_at`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章主表';
|
||||
|
||||
-- 2. 標籤表 (tags)
|
||||
CREATE TABLE IF NOT EXISTS `tags` (
|
||||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '標籤唯一識別碼',
|
||||
`name` VARCHAR(100) NOT NULL COMMENT '標籤名稱',
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
|
||||
PRIMARY KEY (`id`),
|
||||
UNIQUE KEY `idx_name` (`name`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='標籤表';
|
||||
|
||||
-- 3. 文章標籤關聯表 (article_tags)
|
||||
CREATE TABLE IF NOT EXISTS `article_tags` (
|
||||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '關聯唯一識別碼',
|
||||
`article_id` BIGINT UNSIGNED NOT NULL COMMENT '文章 ID',
|
||||
`tag_id` BIGINT UNSIGNED NOT NULL COMMENT '標籤 ID',
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
|
||||
PRIMARY KEY (`id`),
|
||||
UNIQUE KEY `idx_article_tag` (`article_id`, `tag_id`),
|
||||
KEY `idx_article_id` (`article_id`),
|
||||
KEY `idx_tag_id` (`tag_id`),
|
||||
CONSTRAINT `fk_article_tags_article` FOREIGN KEY (`article_id`)
|
||||
REFERENCES `articles` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
|
||||
CONSTRAINT `fk_article_tags_tag` FOREIGN KEY (`tag_id`)
|
||||
REFERENCES `tags` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章標籤關聯表';
|
||||
|
||||
|
||||
Reference in New Issue
Block a user