Files
hbr-crawler/create_tables.sql
DonaldFang 方士碩 f524713cb6 Initial commit: HBR 文章爬蟲專案
- Scrapy 爬蟲框架,爬取 HBR 繁體中文文章
- Flask Web 應用程式,提供文章查詢介面
- SQL Server 資料庫整合
- 自動化排程與郵件通知功能

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 17:19:56 +08:00

56 lines
2.9 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- HBR 爬蟲系統資料表結構
-- 資料庫: HBR_scraper
-- 建立日期: 2024-12-22
-- 1. 文章主表 (articles)
CREATE TABLE IF NOT EXISTS `articles` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '文章唯一識別碼',
`title` VARCHAR(500) NOT NULL COMMENT '文章標題',
`url` VARCHAR(1000) NOT NULL COMMENT '文章網址',
`author` VARCHAR(200) DEFAULT NULL COMMENT '作者名稱',
`publish_date` DATETIME DEFAULT NULL COMMENT '發布日期',
`summary` TEXT DEFAULT NULL COMMENT '文章摘要',
`is_paywalled` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否為付費文章 (1=是, 0=否)',
`category` VARCHAR(100) DEFAULT NULL COMMENT '文章分類',
`tags` VARCHAR(500) DEFAULT NULL COMMENT '標籤(逗號分隔字串,非正規化設計)',
`content` TEXT DEFAULT NULL COMMENT '文章內容(僅非付費文章)',
`language` VARCHAR(10) DEFAULT 'zh-TW' COMMENT '語言代碼zh-TW, en, ko',
`crawl_count` INT DEFAULT 1 COMMENT '爬取次數(用於追蹤歷史記錄)',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '資料建立時間',
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '資料更新時間',
`crawled_at` DATETIME DEFAULT NULL COMMENT '爬取時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_url` (`url`(255)),
KEY `idx_publish_date` (`publish_date`),
KEY `idx_category` (`category`),
KEY `idx_is_paywalled` (`is_paywalled`),
KEY `idx_crawled_at` (`crawled_at`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章主表';
-- 2. 標籤表 (tags)
CREATE TABLE IF NOT EXISTS `tags` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '標籤唯一識別碼',
`name` VARCHAR(100) NOT NULL COMMENT '標籤名稱',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_name` (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='標籤表';
-- 3. 文章標籤關聯表 (article_tags)
CREATE TABLE IF NOT EXISTS `article_tags` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '關聯唯一識別碼',
`article_id` BIGINT UNSIGNED NOT NULL COMMENT '文章 ID',
`tag_id` BIGINT UNSIGNED NOT NULL COMMENT '標籤 ID',
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_article_tag` (`article_id`, `tag_id`),
KEY `idx_article_id` (`article_id`),
KEY `idx_tag_id` (`tag_id`),
CONSTRAINT `fk_article_tags_article` FOREIGN KEY (`article_id`)
REFERENCES `articles` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT `fk_article_tags_tag` FOREIGN KEY (`tag_id`)
REFERENCES `tags` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章標籤關聯表';