- Scrapy 爬蟲框架,爬取 HBR 繁體中文文章 - Flask Web 應用程式,提供文章查詢介面 - SQL Server 資料庫整合 - 自動化排程與郵件通知功能 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
56 lines
2.9 KiB
SQL
56 lines
2.9 KiB
SQL
-- HBR 爬蟲系統資料表結構
|
||
-- 資料庫: HBR_scraper
|
||
-- 建立日期: 2024-12-22
|
||
|
||
-- 1. 文章主表 (articles)
|
||
CREATE TABLE IF NOT EXISTS `articles` (
|
||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '文章唯一識別碼',
|
||
`title` VARCHAR(500) NOT NULL COMMENT '文章標題',
|
||
`url` VARCHAR(1000) NOT NULL COMMENT '文章網址',
|
||
`author` VARCHAR(200) DEFAULT NULL COMMENT '作者名稱',
|
||
`publish_date` DATETIME DEFAULT NULL COMMENT '發布日期',
|
||
`summary` TEXT DEFAULT NULL COMMENT '文章摘要',
|
||
`is_paywalled` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否為付費文章 (1=是, 0=否)',
|
||
`category` VARCHAR(100) DEFAULT NULL COMMENT '文章分類',
|
||
`tags` VARCHAR(500) DEFAULT NULL COMMENT '標籤(逗號分隔字串,非正規化設計)',
|
||
`content` TEXT DEFAULT NULL COMMENT '文章內容(僅非付費文章)',
|
||
`language` VARCHAR(10) DEFAULT 'zh-TW' COMMENT '語言代碼(zh-TW, en, ko)',
|
||
`crawl_count` INT DEFAULT 1 COMMENT '爬取次數(用於追蹤歷史記錄)',
|
||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '資料建立時間',
|
||
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '資料更新時間',
|
||
`crawled_at` DATETIME DEFAULT NULL COMMENT '爬取時間',
|
||
PRIMARY KEY (`id`),
|
||
UNIQUE KEY `idx_url` (`url`(255)),
|
||
KEY `idx_publish_date` (`publish_date`),
|
||
KEY `idx_category` (`category`),
|
||
KEY `idx_is_paywalled` (`is_paywalled`),
|
||
KEY `idx_crawled_at` (`crawled_at`)
|
||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章主表';
|
||
|
||
-- 2. 標籤表 (tags)
|
||
CREATE TABLE IF NOT EXISTS `tags` (
|
||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '標籤唯一識別碼',
|
||
`name` VARCHAR(100) NOT NULL COMMENT '標籤名稱',
|
||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
|
||
PRIMARY KEY (`id`),
|
||
UNIQUE KEY `idx_name` (`name`)
|
||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='標籤表';
|
||
|
||
-- 3. 文章標籤關聯表 (article_tags)
|
||
CREATE TABLE IF NOT EXISTS `article_tags` (
|
||
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '關聯唯一識別碼',
|
||
`article_id` BIGINT UNSIGNED NOT NULL COMMENT '文章 ID',
|
||
`tag_id` BIGINT UNSIGNED NOT NULL COMMENT '標籤 ID',
|
||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '建立時間',
|
||
PRIMARY KEY (`id`),
|
||
UNIQUE KEY `idx_article_tag` (`article_id`, `tag_id`),
|
||
KEY `idx_article_id` (`article_id`),
|
||
KEY `idx_tag_id` (`tag_id`),
|
||
CONSTRAINT `fk_article_tags_article` FOREIGN KEY (`article_id`)
|
||
REFERENCES `articles` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
|
||
CONSTRAINT `fk_article_tags_tag` FOREIGN KEY (`tag_id`)
|
||
REFERENCES `tags` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
|
||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='文章標籤關聯表';
|
||
|
||
|