企業內部新聞彙整與分析系統 - 自動新聞抓取 (Digitimes, 經濟日報, 工商時報) - AI 智慧摘要 (OpenAI/Claude/Ollama) - 群組管理與訂閱通知 - 已清理 Python 快取檔案 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
101 lines
4.7 KiB
Python
101 lines
4.7 KiB
Python
"""
|
|
News source and article data models.
|
|
"""
|
|
import enum
from datetime import datetime, timezone
from typing import Optional, List

from sqlalchemy import String, Boolean, ForeignKey, Text, JSON, Enum as SQLEnum, UniqueConstraint, Index
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.db.session import Base
|
|
|
|
|
|
@enum.unique
class SourceType(str, enum.Enum):
    """Type of a news source.

    Members are also ``str`` instances, so each one compares equal to its raw
    string value. ``@enum.unique`` makes a duplicated value fail at import
    time instead of silently turning the later member into an alias.
    """

    SUBSCRIPTION = "subscription"
    PUBLIC = "public"
|
|
|
|
|
|
@enum.unique
class CrawlStatus(str, enum.Enum):
    """Status of a crawl job.

    Members are also ``str`` instances, so each one compares equal to its raw
    string value. ``@enum.unique`` makes a duplicated value fail at import
    time instead of silently turning the later member into an alias.
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
class NewsSource(Base):
    """News source table (``news_sources``).

    Each row describes one site that articles are collected from. A source
    owns a list of ``NewsArticle`` rows and a list of ``CrawlJob`` rows.

    Timestamps are filled in Python with naive UTC values. The defaults use
    ``datetime.now(timezone.utc).replace(tzinfo=None)`` rather than the
    deprecated ``datetime.utcnow``; the stored value is the same.
    """

    __tablename__ = "news_sources"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Source code (unique identifier string).
    code: Mapped[str] = mapped_column(
        String(30), unique=True, nullable=False, comment="來源代碼"
    )
    # Source display name.
    name: Mapped[str] = mapped_column(String(100), nullable=False, comment="來源名稱")
    # Base URL of the site.
    base_url: Mapped[str] = mapped_column(String(255), nullable=False, comment="網站基礎URL")
    # Source type.
    # NOTE(review): SQLAlchemy's Enum type persists member *names* unless
    # ``values_callable`` is supplied — confirm that is the intended storage.
    source_type: Mapped[SourceType] = mapped_column(
        SQLEnum(SourceType), nullable=False, comment="來源類型"
    )
    # Login account (optional).
    login_username: Mapped[Optional[str]] = mapped_column(String(100), comment="登入帳號")
    # Encrypted password (optional). Nothing in this model performs the
    # encryption; presumably the caller does — verify against the writer.
    login_password_encrypted: Mapped[Optional[str]] = mapped_column(
        String(255), comment="加密後密碼"
    )
    # Whether the source is enabled; defaults to True.
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, comment="是否啟用")
    # Crawler settings stored as JSON (optional).
    crawl_config: Mapped[Optional[dict]] = mapped_column(JSON, comment="爬蟲設定")
    # Naive UTC timestamps; see the class docstring for why utcnow is avoided.
    created_at: Mapped[datetime] = mapped_column(
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    updated_at: Mapped[datetime] = mapped_column(
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        onupdate=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )

    # Relationships
    articles: Mapped[List["NewsArticle"]] = relationship(back_populates="source")
    crawl_jobs: Mapped[List["CrawlJob"]] = relationship(back_populates="source")
|
|
|
|
|
|
class NewsArticle(Base):
    """News article table (``news_articles``).

    Each row is one article belonging to a ``NewsSource``. The pair
    ``(source_id, external_id)`` is unique, and both ``published_at`` and
    ``crawled_at`` are indexed.

    Timestamps are filled in Python with naive UTC values. The defaults use
    ``datetime.now(timezone.utc).replace(tzinfo=None)`` rather than the
    deprecated ``datetime.utcnow``; the stored value is the same.
    """

    __tablename__ = "news_articles"
    __table_args__ = (
        UniqueConstraint("source_id", "external_id", name="uk_source_external"),
        Index("idx_articles_published", "published_at"),
        Index("idx_articles_crawled", "crawled_at"),
    )

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Source ID (foreign key to news_sources.id).
    source_id: Mapped[int] = mapped_column(
        ForeignKey("news_sources.id"), nullable=False, comment="來源ID"
    )
    # External article ID (optional).
    external_id: Mapped[Optional[str]] = mapped_column(String(100), comment="外部文章ID")
    # Article title.
    title: Mapped[str] = mapped_column(String(500), nullable=False, comment="文章標題")
    # Full article text (optional).
    content: Mapped[Optional[str]] = mapped_column(Text, comment="文章全文")
    # Summary taken from the original article (optional).
    summary: Mapped[Optional[str]] = mapped_column(Text, comment="原文摘要")
    # Link to the original article.
    url: Mapped[str] = mapped_column(String(500), nullable=False, comment="原文連結")
    # Author (optional).
    author: Mapped[Optional[str]] = mapped_column(String(100), comment="作者")
    # Publication time (optional).
    published_at: Mapped[Optional[datetime]] = mapped_column(comment="發布時間")
    # Crawl time; naive UTC, set when the row is inserted.
    crawled_at: Mapped[datetime] = mapped_column(
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        comment="抓取時間",
    )
    created_at: Mapped[datetime] = mapped_column(
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    # Relationships
    source: Mapped["NewsSource"] = relationship(back_populates="articles")
    # Group matches are cascaded with "all, delete-orphan".
    group_matches: Mapped[List["ArticleGroupMatch"]] = relationship(
        back_populates="article", cascade="all, delete-orphan"
    )
    report_articles: Mapped[List["ReportArticle"]] = relationship(back_populates="article")
|
|
|
|
|
|
class CrawlJob(Base):
    """Crawl job record table (``crawl_jobs``).

    Each row records one crawl job for a ``NewsSource``. ``status`` and
    ``scheduled_at`` are indexed.

    ``created_at`` is filled in Python with a naive UTC value. The default
    uses ``datetime.now(timezone.utc).replace(tzinfo=None)`` rather than the
    deprecated ``datetime.utcnow``; the stored value is the same.
    """

    __tablename__ = "crawl_jobs"
    __table_args__ = (
        Index("idx_crawl_jobs_status", "status"),
        Index("idx_crawl_jobs_scheduled", "scheduled_at"),
    )

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Source ID (foreign key to news_sources.id).
    source_id: Mapped[int] = mapped_column(
        ForeignKey("news_sources.id"), nullable=False, comment="來源ID"
    )
    # Job status; new rows default to PENDING.
    status: Mapped[CrawlStatus] = mapped_column(
        SQLEnum(CrawlStatus), default=CrawlStatus.PENDING
    )
    # Scheduled time.
    scheduled_at: Mapped[datetime] = mapped_column(nullable=False, comment="排程時間")
    # Start time (optional).
    started_at: Mapped[Optional[datetime]] = mapped_column(comment="開始時間")
    # Completion time (optional).
    completed_at: Mapped[Optional[datetime]] = mapped_column(comment="完成時間")
    # Number of articles fetched; defaults to 0.
    articles_count: Mapped[int] = mapped_column(default=0, comment="抓取文章數")
    # Error message (optional).
    error_message: Mapped[Optional[str]] = mapped_column(Text, comment="錯誤訊息")
    # Retry count; defaults to 0.
    retry_count: Mapped[int] = mapped_column(default=0, comment="重試次數")
    created_at: Mapped[datetime] = mapped_column(
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    # Relationships
    source: Mapped["NewsSource"] = relationship(back_populates="crawl_jobs")
|
|
|
|
|
|
# Imported at the bottom of the module to avoid circular imports
|
|
from app.models.group import ArticleGroupMatch
|
|
from app.models.report import ReportArticle
|