refactor: complete V1 to V2 migration and remove legacy architecture

Remove all V1 architecture components and promote V2 to primary:
- Delete all paddle_ocr_* table models (export, ocr, translation, user)
- Delete legacy routers (auth, export, ocr, translation)
- Delete legacy schemas and services
- Promote user_v2.py to user.py as primary user model
- Update all imports and dependencies to use V2 models only
- Update main.py version to 2.0.0

Database changes:
- Fix SQLAlchemy reserved attribute name: rename audit_log.metadata to extra_data
- Add migration to drop all paddle_ocr_* tables
- Update alembic env to only import V2 models

Frontend fixes:
- Fix Select component exports in TaskHistoryPage.tsx
- Update to use simplified Select API with options prop
- Fix AxiosInstance TypeScript import syntax

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 21:27:39 +08:00
parent ad2b832fb6
commit fd98018ddd
34 changed files with 554 additions and 3787 deletions
--- a/backend/app/models/__init__.py
+++ b/backend/app/models/__init__.py
@@ -1,31 +1,20 @@
 """
-Tool_OCR - Database Models
+Tool_OCR - Database Models (V2)

-New schema with external API authentication and user task isolation.
+External API authentication with user task isolation.
 All tables use 'tool_ocr_' prefix for namespace separation.
 """

-# New models for external authentication system
-from app.models.user_v2 import User
+from app.models.user import User
 from app.models.task import Task, TaskFile, TaskStatus
 from app.models.session import Session
-
-# Legacy models (will be deprecated after migration)
-from app.models.ocr import OCRBatch, OCRFile, OCRResult
-from app.models.export import ExportRule
-from app.models.translation import TranslationConfig
+from app.models.audit_log import AuditLog

 __all__ = [
-    # New authentication and task models
    "User",
    "Task",
    "TaskFile",
    "TaskStatus",
    "Session",
-    # Legacy models (deprecated)
-    "OCRBatch",
-    "OCRFile",
-    "OCRResult",
-    "ExportRule",
-    "TranslationConfig",
+    "AuditLog",
 ]
--- a/backend/app/models/audit_log.py
+++ b/backend/app/models/audit_log.py
@@ -67,7 +67,7 @@ class AuditLog(Base):
        comment="1 for success, 0 for failure"
    )
    error_message = Column(Text, nullable=True, comment="Error details if failed")
-    metadata = Column(Text, nullable=True, comment="Additional JSON metadata")
+    extra_data = Column(Text, nullable=True, comment="Additional JSON metadata")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)

    # Relationships
@@ -90,6 +90,6 @@ class AuditLog(Base):
            "resource_id": self.resource_id,
            "success": bool(self.success),
            "error_message": self.error_message,
-            "metadata": self.metadata,
+            "extra_data": self.extra_data,
            "created_at": self.created_at.isoformat() if self.created_at else None
        }
--- a/backend/app/models/export.py
+++ b/backend/app/models/export.py
@@ -1,55 +0,0 @@
-"""
-Tool_OCR - Export Rule Model
-User-defined export rules and formatting configurations
-"""
-
-from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, JSON
-from sqlalchemy.orm import relationship
-from datetime import datetime
-
-from app.core.database import Base
-
-
-class ExportRule(Base):
-    """Export rule configuration for customized output formatting"""
-
-    __tablename__ = "paddle_ocr_export_rules"
-
-    id = Column(Integer, primary_key=True, index=True)
-    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
-    rule_name = Column(String(100), nullable=False)
-    description = Column(Text, nullable=True)
-
-    # Rule configuration stored as JSON
-    # {
-    #   "filters": {
-    #     "confidence_threshold": 0.8,
-    #     "filename_pattern": "invoice_*",
-    #     "language": "ch"
-    #   },
-    #   "formatting": {
-    #     "add_line_numbers": true,
-    #     "sort_by_position": true,
-    #     "group_by_filename": false
-    #   },
-    #   "export_options": {
-    #     "include_metadata": true,
-    #     "include_confidence": true,
-    #     "include_bounding_boxes": false
-    #   }
-    # }
-    config_json = Column(JSON, nullable=False)
-
-    # CSS template for PDF export (optional)
-    # Can reference predefined templates: "default", "academic", "business", "report"
-    # Or store custom CSS
-    css_template = Column(Text, nullable=True)
-
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
-
-    # Relationships
-    user = relationship("User", back_populates="export_rules")
-
-    def __repr__(self):
-        return f"<ExportRule(id={self.id}, name='{self.rule_name}', user_id={self.user_id})>"
--- a/backend/app/models/ocr.py
+++ b/backend/app/models/ocr.py
@@ -1,122 +0,0 @@
-"""
-Tool_OCR - OCR Models
-Database models for OCR batches, files, and results
-"""
-
-from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
-from sqlalchemy.orm import relationship
-from datetime import datetime
-import enum
-
-from app.core.database import Base
-
-
-class BatchStatus(str, enum.Enum):
-    """Batch processing status"""
-    PENDING = "pending"
-    PROCESSING = "processing"
-    COMPLETED = "completed"
-    PARTIAL = "partial"  # Some files failed
-    FAILED = "failed"
-
-
-class FileStatus(str, enum.Enum):
-    """Individual file processing status"""
-    PENDING = "pending"
-    PROCESSING = "processing"
-    COMPLETED = "completed"
-    FAILED = "failed"
-
-
-class OCRBatch(Base):
-    """OCR batch processing tracking"""
-
-    __tablename__ = "paddle_ocr_batches"
-
-    id = Column(Integer, primary_key=True, index=True)
-    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
-    batch_name = Column(String(255), nullable=True)
-    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)
-    total_files = Column(Integer, default=0, nullable=False)
-    completed_files = Column(Integer, default=0, nullable=False)
-    failed_files = Column(Integer, default=0, nullable=False)
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
-    started_at = Column(DateTime, nullable=True)
-    completed_at = Column(DateTime, nullable=True)
-
-    # Relationships
-    user = relationship("User", back_populates="ocr_batches")
-    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")
-
-    @property
-    def progress_percentage(self) -> float:
-        """Calculate progress percentage"""
-        if self.total_files == 0:
-            return 0.0
-        return (self.completed_files / self.total_files) * 100
-
-    def __repr__(self):
-        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
-
-
-class OCRFile(Base):
-    """Individual file in an OCR batch"""
-
-    __tablename__ = "paddle_ocr_files"
-
-    id = Column(Integer, primary_key=True, index=True)
-    batch_id = Column(Integer, ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"), nullable=False, index=True)
-    filename = Column(String(255), nullable=False)
-    original_filename = Column(String(255), nullable=False)
-    file_path = Column(String(512), nullable=False)
-    file_size = Column(Integer, nullable=False)  # Size in bytes
-    file_format = Column(String(20), nullable=False)  # png, jpg, pdf, etc.
-    status = Column(Enum(FileStatus), default=FileStatus.PENDING, nullable=False, index=True)
-    error_message = Column(Text, nullable=True)
-    retry_count = Column(Integer, default=0, nullable=False)  # Number of retry attempts
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    started_at = Column(DateTime, nullable=True)
-    completed_at = Column(DateTime, nullable=True)
-    processing_time = Column(Float, nullable=True)  # Processing time in seconds
-
-    # Relationships
-    batch = relationship("OCRBatch", back_populates="files")
-    result = relationship("OCRResult", back_populates="file", uselist=False, cascade="all, delete-orphan")
-
-    def __repr__(self):
-        return f"<OCRFile(id={self.id}, filename='{self.filename}', status='{self.status}')>"
-
-
-class OCRResult(Base):
-    """OCR processing result with structure and images"""
-
-    __tablename__ = "paddle_ocr_results"
-
-    id = Column(Integer, primary_key=True, index=True)
-    file_id = Column(Integer, ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"), unique=True, nullable=False, index=True)
-
-    # Output file paths
-    markdown_path = Column(String(512), nullable=True)  # Path to Markdown file
-    json_path = Column(String(512), nullable=True)  # Path to JSON file
-    images_dir = Column(String(512), nullable=True)  # Directory containing extracted images
-
-    # OCR metadata
-    detected_language = Column(String(20), nullable=True)  # ch, en, japan, korean
-    total_text_regions = Column(Integer, default=0, nullable=False)
-    average_confidence = Column(Float, nullable=True)
-
-    # Layout structure data (stored as JSON)
-    # Contains: layout elements (title, paragraph, table, image, formula), reading order, bounding boxes
-    layout_data = Column(JSON, nullable=True)
-
-    # Extracted images metadata (stored as JSON)
-    # Contains: list of {image_path, bbox, element_type}
-    images_metadata = Column(JSON, nullable=True)
-
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-
-    # Relationships
-    file = relationship("OCRFile", back_populates="result")
-
-    def __repr__(self):
-        return f"<OCRResult(id={self.id}, file_id={self.file_id}, language='{self.detected_language}')>"
--- a/backend/app/models/translation.py
+++ b/backend/app/models/translation.py
@@ -1,43 +0,0 @@
-"""
-Tool_OCR - Translation Config Model (RESERVED)
-Reserved for future translation feature implementation
-"""
-
-from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON
-from sqlalchemy.orm import relationship
-from datetime import datetime
-
-from app.core.database import Base
-
-
-class TranslationConfig(Base):
-    """
-    Translation configuration (RESERVED for future implementation)
-
-    This table is created but not actively used until translation feature is implemented.
-    """
-
-    __tablename__ = "paddle_ocr_translation_configs"
-
-    id = Column(Integer, primary_key=True, index=True)
-    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
-
-    source_lang = Column(String(20), nullable=False)  # ch, en, japan, korean, etc.
-    target_lang = Column(String(20), nullable=False)  # en, ch, japan, korean, etc.
-
-    # Translation engine type: "offline" (argostranslate), "ernie", "google", "deepl"
-    engine_type = Column(String(50), nullable=False, default="offline")
-
-    # Engine-specific configuration stored as JSON
-    # For offline (argostranslate): {"model_path": "/path/to/model"}
-    # For API-based: {"api_key": "xxx", "endpoint": "https://..."}
-    engine_config = Column(JSON, nullable=True)
-
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
-
-    # Relationships
-    user = relationship("User", back_populates="translation_configs")
-
-    def __repr__(self):
-        return f"<TranslationConfig(id={self.id}, {self.source_lang}->{self.target_lang}, engine='{self.engine_type}')>"
--- a/backend/app/models/user.py
+++ b/backend/app/models/user.py
@@ -1,6 +1,6 @@
 """
-Tool_OCR - User Model
-User authentication and management
+Tool_OCR - User Model v2.0
+External API authentication with simplified schema
 """

 from sqlalchemy import Column, Integer, String, DateTime, Boolean
@@ -11,24 +11,39 @@ from app.core.database import Base


 class User(Base):
-    """User model for JWT authentication"""
+    """
+    User model for external API authentication

-    __tablename__ = "paddle_ocr_users"
+    Uses email as primary identifier from Azure AD.
+    No password storage - authentication via external API only.
+    """

-    id = Column(Integer, primary_key=True, index=True)
-    username = Column(String(50), unique=True, nullable=False, index=True)
-    email = Column(String(100), unique=True, nullable=False, index=True)
-    password_hash = Column(String(255), nullable=False)
-    full_name = Column(String(100), nullable=True)
-    is_active = Column(Boolean, default=True, nullable=False)
-    is_admin = Column(Boolean, default=False, nullable=False)
+    __tablename__ = "tool_ocr_users"
+
+    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
+    email = Column(String(255), unique=True, nullable=False, index=True,
+                  comment="Primary identifier from Azure AD")
+    display_name = Column(String(255), nullable=True,
+                         comment="Display name from API response")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    last_login = Column(DateTime, nullable=True)
+    is_active = Column(Boolean, default=True, nullable=False, index=True)

    # Relationships
-    ocr_batches = relationship("OCRBatch", back_populates="user", cascade="all, delete-orphan")
-    export_rules = relationship("ExportRule", back_populates="user", cascade="all, delete-orphan")
-    translation_configs = relationship("TranslationConfig", back_populates="user", cascade="all, delete-orphan")
+    tasks = relationship("Task", back_populates="user", cascade="all, delete-orphan")
+    sessions = relationship("Session", back_populates="user", cascade="all, delete-orphan")
+    audit_logs = relationship("AuditLog", back_populates="user")

    def __repr__(self):
-        return f"<User(id={self.id}, username='{self.username}', email='{self.email}')>"
+        return f"<User(id={self.id}, email='{self.email}', display_name='{self.display_name}')>"
+
+    def to_dict(self):
+        """Convert user to dictionary"""
+        return {
+            "id": self.id,
+            "email": self.email,
+            "display_name": self.display_name,
+            "created_at": self.created_at.isoformat() if self.created_at else None,
+            "last_login": self.last_login.isoformat() if self.last_login else None,
+            "is_active": self.is_active
+        }
--- a/backend/app/models/user_v2.py
+++ b/backend/app/models/user_v2.py
@@ -1,49 +0,0 @@
-"""
-Tool_OCR - User Model v2.0
-External API authentication with simplified schema
-"""
-
-from sqlalchemy import Column, Integer, String, DateTime, Boolean
-from sqlalchemy.orm import relationship
-from datetime import datetime
-
-from app.core.database import Base
-
-
-class User(Base):
-    """
-    User model for external API authentication
-
-    Uses email as primary identifier from Azure AD.
-    No password storage - authentication via external API only.
-    """
-
-    __tablename__ = "tool_ocr_users"
-
-    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
-    email = Column(String(255), unique=True, nullable=False, index=True,
-                  comment="Primary identifier from Azure AD")
-    display_name = Column(String(255), nullable=True,
-                         comment="Display name from API response")
-    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
-    last_login = Column(DateTime, nullable=True)
-    is_active = Column(Boolean, default=True, nullable=False, index=True)
-
-    # Relationships
-    tasks = relationship("Task", back_populates="user", cascade="all, delete-orphan")
-    sessions = relationship("Session", back_populates="user", cascade="all, delete-orphan")
-    audit_logs = relationship("AuditLog", back_populates="user")
-
-    def __repr__(self):
-        return f"<User(id={self.id}, email='{self.email}', display_name='{self.display_name}')>"
-
-    def to_dict(self):
-        """Convert user to dictionary"""
-        return {
-            "id": self.id,
-            "email": self.email,
-            "display_name": self.display_name,
-            "created_at": self.created_at.isoformat() if self.created_at else None,
-            "last_login": self.last_login.isoformat() if self.last_login else None,
-            "is_active": self.is_active
-        }