This commit is contained in:
beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
"""
Tool_OCR - Database Models
"""
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult
from app.models.export import ExportRule
from app.models.translation import TranslationConfig
# Public names of the models package: exactly the model classes imported
# above. This list is what `from <package> import *` exposes.
__all__ = [
    "User",
    "OCRBatch",
    "OCRFile",
    "OCRResult",
    "ExportRule",
    "TranslationConfig",
]

View File

@@ -0,0 +1,55 @@
"""
Tool_OCR - Export Rule Model
User-defined export rules and formatting configurations
"""
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class ExportRule(Base):
    """User-defined export rule for customized output formatting.

    Every rule is owned by one user (``user_id``), keeps its settings in
    ``config_json`` and may carry a CSS template for PDF export.
    """

    __tablename__ = "paddle_ocr_export_rules"

    # --- Identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    # The foreign key is declared with ON DELETE CASCADE towards the users table.
    user_id = Column(
        Integer,
        ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    rule_name = Column(String(100), nullable=False)
    description = Column(Text, nullable=True)

    # --- Rule payload -----------------------------------------------------
    # The configuration is stored as a JSON document, for example:
    # {
    #   "filters": {
    #     "confidence_threshold": 0.8,
    #     "filename_pattern": "invoice_*",
    #     "language": "ch"
    #   },
    #   "formatting": {
    #     "add_line_numbers": true,
    #     "sort_by_position": true,
    #     "group_by_filename": false
    #   },
    #   "export_options": {
    #     "include_metadata": true,
    #     "include_confidence": true,
    #     "include_bounding_boxes": false
    #   }
    # }
    config_json = Column(JSON, nullable=False)

    # Optional CSS template for PDF export. Either the name of a predefined
    # template ("default", "academic", "business", "report") or custom CSS.
    css_template = Column(Text, nullable=True)

    # --- Timestamps (datetime.utcnow, i.e. naive UTC values) --------------
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # --- Relationships ----------------------------------------------------
    user = relationship("User", back_populates="export_rules")

    def __repr__(self):
        """Return a short debug representation (id, rule name, owner id)."""
        return (
            f"<ExportRule(id={self.id}, "
            f"name='{self.rule_name}', "
            f"user_id={self.user_id})>"
        )

122
backend/app/models/ocr.py Normal file
View File

@@ -0,0 +1,122 @@
"""
Tool_OCR - OCR Models
Database models for OCR batches, files, and results
"""
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
import enum
from app.core.database import Base
class BatchStatus(str, enum.Enum):
    """Processing state of a whole OCR batch.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    # Some files failed
    PARTIAL = "partial"
    FAILED = "failed"
class FileStatus(str, enum.Enum):
    """Processing state of a single file.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class OCRBatch(Base):
    """OCR batch processing tracking.

    One row per batch: the owning user, an overall ``BatchStatus``, counters
    for total / completed / failed files, and lifecycle timestamps.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    # The foreign key is declared with ON DELETE CASCADE towards the users table.
    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
    batch_name = Column(String(255), nullable=True)
    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)

    # File counters; all default to 0 when the row is inserted.
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)

    # created_at is filled from datetime.utcnow (naive UTC); the other two
    # stay NULL until set by application code.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    user = relationship("User", back_populates="ocr_batches")
    # Files are owned by the batch: ORM deletes cascade and orphans are removed.
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return completed files as a percentage of total files.

        Returns:
            ``0.0`` when the batch has no files (or the counter is unset),
            otherwise ``completed_files / total_files * 100``.
        """
        # Column defaults are applied when the row is INSERTed, so an instance
        # that has not been flushed yet still holds None in the counters.
        # Treat None like 0 instead of raising TypeError on None / None.
        total = self.total_files or 0
        if total == 0:
            return 0.0
        done = self.completed_files or 0
        return (done / total) * 100

    def __repr__(self):
        """Return a short debug representation (id, status, progress)."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
class OCRFile(Base):
    """A single file belonging to an OCR batch.

    Holds the file's storage details, its ``FileStatus``, retry and error
    information, and timing data for the processing run.
    """

    __tablename__ = "paddle_ocr_files"

    # --- Identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    # The foreign key is declared with ON DELETE CASCADE towards the batches table.
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # --- File description -------------------------------------------------
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # png, jpg, pdf, etc.
    file_format = Column(String(20), nullable=False)

    # --- Processing state -------------------------------------------------
    status = Column(
        Enum(FileStatus),
        index=True,
        nullable=False,
        default=FileStatus.PENDING,
    )
    error_message = Column(Text, nullable=True)
    # Number of retry attempts
    retry_count = Column(Integer, nullable=False, default=0)

    # --- Timing -----------------------------------------------------------
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    # Processing time in seconds
    processing_time = Column(Float, nullable=True)

    # --- Relationships ----------------------------------------------------
    batch = relationship("OCRBatch", back_populates="files")
    # One-to-one (uselist=False); the result is owned by the file.
    result = relationship(
        "OCRResult",
        back_populates="file",
        uselist=False,
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation (id, filename, status)."""
        return (
            f"<OCRFile(id={self.id}, "
            f"filename='{self.filename}', "
            f"status='{self.status}')>"
        )
class OCRResult(Base):
    """OCR processing result with structure and images.

    Linked one-to-one to an ``OCRFile`` through the unique ``file_id`` column.
    """

    __tablename__ = "paddle_ocr_results"

    # --- Identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    # unique=True allows at most one result row per file.
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        unique=True,
        index=True,
        nullable=False,
    )

    # --- Output file paths ------------------------------------------------
    # Path to Markdown file
    markdown_path = Column(String(512), nullable=True)
    # Path to JSON file
    json_path = Column(String(512), nullable=True)
    # Directory containing extracted images
    images_dir = Column(String(512), nullable=True)

    # --- OCR metadata -----------------------------------------------------
    # ch, en, japan, korean
    detected_language = Column(String(20), nullable=True)
    total_text_regions = Column(Integer, nullable=False, default=0)
    average_confidence = Column(Float, nullable=True)

    # Layout structure data (stored as JSON). Contains: layout elements
    # (title, paragraph, table, image, formula), reading order, bounding boxes.
    layout_data = Column(JSON, nullable=True)

    # Extracted images metadata (stored as JSON).
    # Contains: list of {image_path, bbox, element_type}
    images_metadata = Column(JSON, nullable=True)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)

    # --- Relationships ----------------------------------------------------
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a short debug representation (id, file id, language)."""
        return (
            f"<OCRResult(id={self.id}, "
            f"file_id={self.file_id}, "
            f"language='{self.detected_language}')>"
        )

View File

@@ -0,0 +1,43 @@
"""
Tool_OCR - Translation Config Model (RESERVED)
Reserved for future translation feature implementation
"""
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class TranslationConfig(Base):
    """
    Translation configuration (RESERVED for future implementation).

    The table is created, but it is not actively used until the translation
    feature is implemented.
    """

    __tablename__ = "paddle_ocr_translation_configs"

    # --- Identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    # The foreign key is declared with ON DELETE CASCADE towards the users table.
    user_id = Column(
        Integer,
        ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # --- Language pair ----------------------------------------------------
    # ch, en, japan, korean, etc.
    source_lang = Column(String(20), nullable=False)
    # en, ch, japan, korean, etc.
    target_lang = Column(String(20), nullable=False)

    # --- Engine selection -------------------------------------------------
    # Translation engine type: "offline" (argostranslate), "ernie", "google", "deepl"
    engine_type = Column(String(50), default="offline", nullable=False)

    # Engine-specific configuration stored as JSON:
    #   offline (argostranslate): {"model_path": "/path/to/model"}
    #   API-based:                {"api_key": "xxx", "endpoint": "https://..."}
    engine_config = Column(JSON, nullable=True)

    # --- Timestamps (datetime.utcnow, i.e. naive UTC values) --------------
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # --- Relationships ----------------------------------------------------
    user = relationship("User", back_populates="translation_configs")

    def __repr__(self):
        """Return a short debug representation (id, language pair, engine)."""
        return (
            f"<TranslationConfig(id={self.id}, "
            f"{self.source_lang}->{self.target_lang}, "
            f"engine='{self.engine_type}')>"
        )

View File

@@ -0,0 +1,34 @@
"""
Tool_OCR - User Model
User authentication and management
"""
from sqlalchemy import Column, Integer, String, DateTime, Boolean
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class User(Base):
    """User model for JWT authentication.

    Owns OCR batches, export rules and translation configs; each of those
    relationships cascades ORM deletes and removes orphans.
    """

    __tablename__ = "paddle_ocr_users"

    # --- Identity ---------------------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    username = Column(String(50), unique=True, index=True, nullable=False)
    email = Column(String(100), unique=True, index=True, nullable=False)
    password_hash = Column(String(255), nullable=False)
    full_name = Column(String(100), nullable=True)

    # --- Flags ------------------------------------------------------------
    is_active = Column(Boolean, nullable=False, default=True)
    is_admin = Column(Boolean, nullable=False, default=False)

    # --- Timestamps (datetime.utcnow, i.e. naive UTC values) --------------
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # --- Relationships ----------------------------------------------------
    ocr_batches = relationship(
        "OCRBatch",
        back_populates="user",
        cascade="all, delete-orphan",
    )
    export_rules = relationship(
        "ExportRule",
        back_populates="user",
        cascade="all, delete-orphan",
    )
    translation_configs = relationship(
        "TranslationConfig",
        back_populates="user",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation (id, username, email)."""
        return (
            f"<User(id={self.id}, "
            f"username='{self.username}', "
            f"email='{self.email}')>"
        )