""" Tool_OCR - OCR Models Database models for OCR batches, files, and results """ from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON from sqlalchemy.orm import relationship from datetime import datetime import enum from app.core.database import Base class BatchStatus(str, enum.Enum): """Batch processing status""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" PARTIAL = "partial" # Some files failed FAILED = "failed" class FileStatus(str, enum.Enum): """Individual file processing status""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" FAILED = "failed" class OCRBatch(Base): """OCR batch processing tracking""" __tablename__ = "paddle_ocr_batches" id = Column(Integer, primary_key=True, index=True) user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True) batch_name = Column(String(255), nullable=True) status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True) total_files = Column(Integer, default=0, nullable=False) completed_files = Column(Integer, default=0, nullable=False) failed_files = Column(Integer, default=0, nullable=False) created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True) started_at = Column(DateTime, nullable=True) completed_at = Column(DateTime, nullable=True) # Relationships user = relationship("User", back_populates="ocr_batches") files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan") @property def progress_percentage(self) -> float: """Calculate progress percentage""" if self.total_files == 0: return 0.0 return (self.completed_files / self.total_files) * 100 def __repr__(self): return f"" class OCRFile(Base): """Individual file in an OCR batch""" __tablename__ = "paddle_ocr_files" id = Column(Integer, primary_key=True, index=True) batch_id = Column(Integer, ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"), nullable=False, index=True) filename = Column(String(255), nullable=False) original_filename = Column(String(255), nullable=False) file_path = Column(String(512), nullable=False) file_size = Column(Integer, nullable=False) # Size in bytes file_format = Column(String(20), nullable=False) # png, jpg, pdf, etc. status = Column(Enum(FileStatus), default=FileStatus.PENDING, nullable=False, index=True) error_message = Column(Text, nullable=True) retry_count = Column(Integer, default=0, nullable=False) # Number of retry attempts created_at = Column(DateTime, default=datetime.utcnow, nullable=False) started_at = Column(DateTime, nullable=True) completed_at = Column(DateTime, nullable=True) processing_time = Column(Float, nullable=True) # Processing time in seconds # Relationships batch = relationship("OCRBatch", back_populates="files") result = relationship("OCRResult", back_populates="file", uselist=False, cascade="all, delete-orphan") def __repr__(self): return f"" class OCRResult(Base): """OCR processing result with structure and images""" __tablename__ = "paddle_ocr_results" id = Column(Integer, primary_key=True, index=True) file_id = Column(Integer, ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"), unique=True, nullable=False, index=True) # Output file paths markdown_path = Column(String(512), nullable=True) # Path to Markdown file json_path = Column(String(512), nullable=True) # Path to JSON file images_dir = Column(String(512), nullable=True) # Directory containing extracted images # OCR metadata detected_language = Column(String(20), nullable=True) # ch, en, japan, korean total_text_regions = Column(Integer, default=0, nullable=False) average_confidence = Column(Float, nullable=True) # Layout structure data (stored as JSON) # Contains: layout elements (title, paragraph, table, image, formula), reading order, bounding boxes layout_data = Column(JSON, nullable=True) # Extracted images metadata (stored as JSON) # Contains: list of {image_path, bbox, element_type} images_metadata = Column(JSON, nullable=True) created_at = Column(DateTime, default=datetime.utcnow, nullable=False) # Relationships file = relationship("OCRFile", back_populates="result") def __repr__(self): return f""