refactor: complete V1 to V2 migration and remove legacy architecture

Remove all V1 architecture components and promote V2 to primary:
- Delete all paddle_ocr_* table models (export, ocr, translation, user)
- Delete legacy routers (auth, export, ocr, translation)
- Delete legacy schemas and services
- Promote user_v2.py to user.py as primary user model
- Update all imports and dependencies to use V2 models only
- Update main.py version to 2.0.0

Database changes:
- Fix SQLAlchemy reserved attribute name: rename audit_log.metadata to extra_data (`metadata` is reserved by the Declarative API)
- Add migration to drop all paddle_ocr_* tables
- Update alembic env to only import V2 models

Frontend fixes:
- Fix Select component exports in TaskHistoryPage.tsx
- Update to use simplified Select API with options prop
- Fix AxiosInstance TypeScript import syntax

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-14 21:27:39 +08:00
parent ad2b832fb6
commit fd98018ddd
34 changed files with 554 additions and 3787 deletions

View File

@@ -1,31 +1,20 @@
"""
Tool_OCR - Database Models
Tool_OCR - Database Models (V2)
New schema with external API authentication and user task isolation.
External API authentication with user task isolation.
All tables use 'tool_ocr_' prefix for namespace separation.
"""
# New models for external authentication system
from app.models.user_v2 import User
from app.models.user import User
from app.models.task import Task, TaskFile, TaskStatus
from app.models.session import Session
# Legacy models (will be deprecated after migration)
from app.models.ocr import OCRBatch, OCRFile, OCRResult
from app.models.export import ExportRule
from app.models.translation import TranslationConfig
from app.models.audit_log import AuditLog
__all__ = [
# New authentication and task models
"User",
"Task",
"TaskFile",
"TaskStatus",
"Session",
# Legacy models (deprecated)
"OCRBatch",
"OCRFile",
"OCRResult",
"ExportRule",
"TranslationConfig",
"AuditLog",
]

View File

@@ -67,7 +67,7 @@ class AuditLog(Base):
comment="1 for success, 0 for failure"
)
error_message = Column(Text, nullable=True, comment="Error details if failed")
metadata = Column(Text, nullable=True, comment="Additional JSON metadata")
extra_data = Column(Text, nullable=True, comment="Additional JSON metadata")
created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
# Relationships
@@ -90,6 +90,6 @@ class AuditLog(Base):
"resource_id": self.resource_id,
"success": bool(self.success),
"error_message": self.error_message,
"metadata": self.metadata,
"extra_data": self.extra_data,
"created_at": self.created_at.isoformat() if self.created_at else None
}

View File

@@ -1,55 +0,0 @@
"""
Tool_OCR - Export Rule Model
User-defined export rules and formatting configurations
"""
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class ExportRule(Base):
    """User-owned export rule controlling how output is filtered and formatted."""

    __tablename__ = "paddle_ocr_export_rules"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    rule_name = Column(String(100), nullable=False)
    description = Column(Text, nullable=True)

    # The rule itself is kept as a JSON document, for example:
    #
    #   {
    #     "filters": {
    #       "confidence_threshold": 0.8,
    #       "filename_pattern": "invoice_*",
    #       "language": "ch"
    #     },
    #     "formatting": {
    #       "add_line_numbers": true,
    #       "sort_by_position": true,
    #       "group_by_filename": false
    #     },
    #     "export_options": {
    #       "include_metadata": true,
    #       "include_confidence": true,
    #       "include_bounding_boxes": false
    #     }
    #   }
    config_json = Column(JSON, nullable=False)

    # Optional CSS used for PDF export: either the name of a predefined
    # template ("default", "academic", "business", "report") or custom CSS.
    css_template = Column(Text, nullable=True)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Owning user; mirrors the "export_rules" collection on User.
    user = relationship("User", back_populates="export_rules")

    def __repr__(self):
        """Return a short debug representation of the rule."""
        return f"<ExportRule(id={self.id}, name='{self.rule_name}', user_id={self.user_id})>"

View File

@@ -1,122 +0,0 @@
"""
Tool_OCR - OCR Models
Database models for OCR batches, files, and results
"""
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
import enum
from app.core.database import Base
class BatchStatus(str, enum.Enum):
    """Processing status of an OCR batch.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    # Batch finished, but some of its files failed.
    PARTIAL = "partial"
    FAILED = "failed"
class FileStatus(str, enum.Enum):
    """Processing status of a single file.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class OCRBatch(Base):
    """OCR batch processing tracking.

    Records the owning user, the batch status, per-file counters and the
    created/started/completed timestamps of one batch.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
    batch_name = Column(String(255), nullable=True)
    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    user = relationship("User", back_populates="ocr_batches")
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return completed files as a percentage of total files.

        Returns:
            ``0.0`` when ``total_files`` is zero or not yet populated,
            otherwise ``completed_files / total_files * 100``.
        """
        total = self.total_files
        # Column defaults are applied at INSERT time, so on an instance that
        # has not been flushed the counters may still be None. Treat "unset"
        # like zero instead of raising TypeError (this property is also used
        # by __repr__, which must never fail).
        if not total:
            return 0.0
        return ((self.completed_files or 0) / total) * 100

    def __repr__(self):
        """Return a short debug representation including current progress."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
class OCRFile(Base):
    """A single file that belongs to an OCR batch."""

    __tablename__ = "paddle_ocr_files"

    id = Column(Integer, primary_key=True, index=True)
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # File identity and location
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # Format such as png, jpg, pdf
    file_format = Column(String(20), nullable=False)

    # Processing state
    status = Column(
        Enum(FileStatus),
        nullable=False,
        index=True,
        default=FileStatus.PENDING,
    )
    error_message = Column(Text, nullable=True)
    # Number of retry attempts
    retry_count = Column(Integer, nullable=False, default=0)

    # Timing
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    # Processing time in seconds
    processing_time = Column(Float, nullable=True)

    # Parent batch, plus the one-to-one result row (uselist=False).
    batch = relationship("OCRBatch", back_populates="files")
    result = relationship(
        "OCRResult",
        back_populates="file",
        uselist=False,
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation of the file."""
        return f"<OCRFile(id={self.id}, filename='{self.filename}', status='{self.status}')>"
class OCRResult(Base):
    """Result of OCR processing for one file, including structure and images."""

    __tablename__ = "paddle_ocr_results"

    id = Column(Integer, primary_key=True, index=True)
    # unique=True: at most one result row per file.
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
        index=True,
    )

    # Where the generated outputs live
    markdown_path = Column(String(512), nullable=True)  # Markdown file
    json_path = Column(String(512), nullable=True)  # JSON file
    images_dir = Column(String(512), nullable=True)  # Directory of extracted images

    # OCR metadata
    detected_language = Column(String(20), nullable=True)  # ch, en, japan, korean
    total_text_regions = Column(Integer, nullable=False, default=0)
    average_confidence = Column(Float, nullable=True)

    # Layout structure as JSON: layout elements (title, paragraph, table,
    # image, formula), reading order, bounding boxes.
    layout_data = Column(JSON, nullable=True)

    # Extracted-image metadata as JSON: list of {image_path, bbox, element_type}.
    images_metadata = Column(JSON, nullable=True)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)

    # The file this result was produced from.
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a short debug representation of the result."""
        return f"<OCRResult(id={self.id}, file_id={self.file_id}, language='{self.detected_language}')>"

View File

@@ -1,43 +0,0 @@
"""
Tool_OCR - Translation Config Model (RESERVED)
Reserved for future translation feature implementation
"""
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class TranslationConfig(Base):
    """
    Per-user translation settings (RESERVED for a future feature).

    The table exists, but it is not actively used until the translation
    feature is implemented.
    """

    __tablename__ = "paddle_ocr_translation_configs"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # Language codes such as ch, en, japan, korean.
    source_lang = Column(String(20), nullable=False)
    target_lang = Column(String(20), nullable=False)

    # Which engine to use: "offline" (argostranslate), "ernie", "google", "deepl".
    engine_type = Column(String(50), default="offline", nullable=False)

    # Engine-specific settings as JSON, e.g.
    #   offline (argostranslate): {"model_path": "/path/to/model"}
    #   API-based:                {"api_key": "xxx", "endpoint": "https://..."}
    engine_config = Column(JSON, nullable=True)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Owning user; mirrors the "translation_configs" collection on User.
    user = relationship("User", back_populates="translation_configs")

    def __repr__(self):
        """Return a short debug representation of the configuration."""
        return f"<TranslationConfig(id={self.id}, {self.source_lang}->{self.target_lang}, engine='{self.engine_type}')>"

View File

@@ -1,6 +1,6 @@
"""
Tool_OCR - User Model
User authentication and management
Tool_OCR - User Model v2.0
External API authentication with simplified schema
"""
from sqlalchemy import Column, Integer, String, DateTime, Boolean
@@ -11,24 +11,39 @@ from app.core.database import Base
class User(Base):
"""User model for JWT authentication"""
"""
User model for external API authentication
__tablename__ = "paddle_ocr_users"
Uses email as primary identifier from Azure AD.
No password storage - authentication via external API only.
"""
id = Column(Integer, primary_key=True, index=True)
username = Column(String(50), unique=True, nullable=False, index=True)
email = Column(String(100), unique=True, nullable=False, index=True)
password_hash = Column(String(255), nullable=False)
full_name = Column(String(100), nullable=True)
is_active = Column(Boolean, default=True, nullable=False)
is_admin = Column(Boolean, default=False, nullable=False)
__tablename__ = "tool_ocr_users"
id = Column(Integer, primary_key=True, index=True, autoincrement=True)
email = Column(String(255), unique=True, nullable=False, index=True,
comment="Primary identifier from Azure AD")
display_name = Column(String(255), nullable=True,
comment="Display name from API response")
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
last_login = Column(DateTime, nullable=True)
is_active = Column(Boolean, default=True, nullable=False, index=True)
# Relationships
ocr_batches = relationship("OCRBatch", back_populates="user", cascade="all, delete-orphan")
export_rules = relationship("ExportRule", back_populates="user", cascade="all, delete-orphan")
translation_configs = relationship("TranslationConfig", back_populates="user", cascade="all, delete-orphan")
tasks = relationship("Task", back_populates="user", cascade="all, delete-orphan")
sessions = relationship("Session", back_populates="user", cascade="all, delete-orphan")
audit_logs = relationship("AuditLog", back_populates="user")
def __repr__(self):
return f"<User(id={self.id}, username='{self.username}', email='{self.email}')>"
return f"<User(id={self.id}, email='{self.email}', display_name='{self.display_name}')>"
def to_dict(self):
"""Convert user to dictionary"""
return {
"id": self.id,
"email": self.email,
"display_name": self.display_name,
"created_at": self.created_at.isoformat() if self.created_at else None,
"last_login": self.last_login.isoformat() if self.last_login else None,
"is_active": self.is_active
}

View File

@@ -1,49 +0,0 @@
"""
Tool_OCR - User Model v2.0
External API authentication with simplified schema
"""
from sqlalchemy import Column, Integer, String, DateTime, Boolean
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class User(Base):
    """
    User account authenticated through an external API.

    The email (from Azure AD) is the primary identifier. No password is
    stored here - authentication happens via the external API only.
    """

    __tablename__ = "tool_ocr_users"

    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
    email = Column(
        String(255),
        unique=True,
        nullable=False,
        index=True,
        comment="Primary identifier from Azure AD",
    )
    display_name = Column(
        String(255),
        nullable=True,
        comment="Display name from API response",
    )
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    last_login = Column(DateTime, nullable=True)
    is_active = Column(Boolean, nullable=False, index=True, default=True)

    # Tasks and sessions cascade with the user; audit logs have no cascade set.
    tasks = relationship("Task", back_populates="user", cascade="all, delete-orphan")
    sessions = relationship("Session", back_populates="user", cascade="all, delete-orphan")
    audit_logs = relationship("AuditLog", back_populates="user")

    def __repr__(self):
        """Return a short debug representation of the user."""
        return f"<User(id={self.id}, email='{self.email}', display_name='{self.display_name}')>"

    def to_dict(self):
        """Return the user as a plain dict, with timestamps as ISO strings or None."""
        created = self.created_at
        logged_in = self.last_login
        return {
            "id": self.id,
            "email": self.email,
            "display_name": self.display_name,
            "created_at": created.isoformat() if created else None,
            "last_login": logged_in.isoformat() if logged_in else None,
            "is_active": self.is_active,
        }