123 lines
4.7 KiB
Python
123 lines
4.7 KiB
Python
"""
|
|
Tool_OCR - OCR Models
|
|
Database models for OCR batches, files, and results
|
|
"""
|
|
|
|
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
|
|
from sqlalchemy.orm import relationship
|
|
from datetime import datetime
|
|
import enum
|
|
|
|
from app.core.database import Base
|
|
|
|
|
|
@enum.unique
class BatchStatus(str, enum.Enum):
    """Processing status of an OCR batch.

    Members mix in ``str``, so each one compares equal to its lowercase
    string value (e.g. ``BatchStatus.PENDING == "pending"``).

    ``enum.unique`` makes a duplicated value fail at import time instead of
    silently turning the later member into an alias of the earlier one.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some files failed
    FAILED = "failed"
|
|
|
|
|
|
@enum.unique
class FileStatus(str, enum.Enum):
    """Processing status of an individual file.

    Members mix in ``str``, so each one compares equal to its lowercase
    string value (e.g. ``FileStatus.FAILED == "failed"``).

    ``enum.unique`` makes a duplicated value fail at import time instead of
    silently turning the later member into an alias of the earlier one.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
class OCRBatch(Base):
    """OCR batch processing tracking.

    Maps to the ``paddle_ocr_batches`` table. Each batch belongs to one
    user (``user_id``) and owns a collection of ``OCRFile`` rows through
    the ``files`` relationship.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
    batch_name = Column(String(255), nullable=True)
    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)

    # File counters. The ``default=0`` values are column-level defaults, so
    # they are filled in when the row is inserted; until then the attributes
    # on a freshly constructed instance may still be None.
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)

    # Timestamps. ``created_at`` is populated from ``datetime.utcnow``;
    # ``started_at`` / ``completed_at`` stay NULL until set explicitly.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    user = relationship("User", back_populates="ocr_batches")
    # Files are cascaded with the batch; a file removed from the collection
    # is deleted as an orphan.
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return the share of completed files as a percentage.

        Computed as ``completed_files / total_files * 100``; failed files
        are not counted. Returns ``0.0`` when ``total_files`` is zero or not
        yet populated, and treats an unpopulated ``completed_files`` as zero,
        so the property (and ``__repr__``) is safe on an instance whose
        column defaults have not been applied yet.
        """
        total = self.total_files
        if not total:
            # Covers both 0 (empty batch) and None (defaults not applied).
            return 0.0
        return ((self.completed_files or 0) / total) * 100

    def __repr__(self):
        """Return a debug representation with id, status and progress."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
|
|
|
|
|
|
class OCRFile(Base):
    """A single file that belongs to an OCR batch.

    Maps to the ``paddle_ocr_files`` table. Every row points at its parent
    batch through ``batch_id`` and may have one ``OCRResult`` attached via
    the ``result`` relationship.
    """

    __tablename__ = "paddle_ocr_files"

    # --- Identity -----------------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- File description ---------------------------------------------
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # png, jpg, pdf, etc.
    file_format = Column(String(20), nullable=False)

    # --- Processing state ---------------------------------------------
    status = Column(
        Enum(FileStatus),
        default=FileStatus.PENDING,
        nullable=False,
        index=True,
    )
    error_message = Column(Text, nullable=True)
    # Number of retry attempts
    retry_count = Column(Integer, default=0, nullable=False)

    # --- Timing -------------------------------------------------------
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    # Processing time in seconds
    processing_time = Column(Float, nullable=True)

    # --- Relationships ------------------------------------------------
    batch = relationship("OCRBatch", back_populates="files")
    # uselist=False: at most one result object per file.
    result = relationship(
        "OCRResult",
        back_populates="file",
        uselist=False,
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a debug representation with id, filename and status."""
        return (
            f"<OCRFile(id={self.id}, filename='{self.filename}', "
            f"status='{self.status}')>"
        )
|
|
|
|
|
|
class OCRResult(Base):
    """Result of OCR processing for one file, including structure and images.

    Maps to the ``paddle_ocr_results`` table. ``file_id`` is unique, so a
    file has at most one result row.
    """

    __tablename__ = "paddle_ocr_results"

    id = Column(Integer, primary_key=True, index=True)
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
        index=True,
    )

    # --- Output file paths --------------------------------------------
    # Path to Markdown file
    markdown_path = Column(String(512), nullable=True)
    # Path to JSON file
    json_path = Column(String(512), nullable=True)
    # Directory containing extracted images
    images_dir = Column(String(512), nullable=True)

    # --- OCR metadata -------------------------------------------------
    # ch, en, japan, korean
    detected_language = Column(String(20), nullable=True)
    total_text_regions = Column(Integer, default=0, nullable=False)
    average_confidence = Column(Float, nullable=True)

    # --- Layout structure data (stored as JSON) -----------------------
    # Contains: layout elements (title, paragraph, table, image, formula),
    # reading order, bounding boxes
    layout_data = Column(JSON, nullable=True)

    # --- Extracted images metadata (stored as JSON) -------------------
    # Contains: list of {image_path, bbox, element_type}
    images_metadata = Column(JSON, nullable=True)

    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # --- Relationships ------------------------------------------------
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a debug representation with id, file id and language."""
        return (
            f"<OCRResult(id={self.id}, file_id={self.file_id}, "
            f"language='{self.detected_language}')>"
        )
|