# OCR/backend/app/models/ocr.py

"""
Tool_OCR - OCR Models
Database models for OCR batches, files, and results
"""

from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
import enum

from app.core.database import Base


class BatchStatus(str, enum.Enum):
    """Processing states recorded for an OCR batch.

    The ``str`` mixin makes each member compare equal to its lowercase
    string value (for example ``BatchStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    # Some files failed
    PARTIAL = "partial"
    FAILED = "failed"


class FileStatus(str, enum.Enum):
    """Processing states recorded for a single file.

    The ``str`` mixin makes each member compare equal to its lowercase
    string value (for example ``FileStatus.FAILED == "failed"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class OCRBatch(Base):
    """OCR batch processing tracking.

    Mapped to the ``paddle_ocr_batches`` table. A batch belongs to one user
    (``user_id``), carries an overall ``status`` and keeps running counters
    of total, completed and failed files. Its ``files`` relationship uses
    ``cascade="all, delete-orphan"``, so child ``OCRFile`` rows are removed
    together with the batch when it is deleted through the ORM.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
    batch_name = Column(String(255), nullable=True)
    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)

    # File counters. The ``default=0`` values are applied by SQLAlchemy when
    # the row is INSERTed, so on an unflushed instance they may still be None.
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)

    # Timestamps. ``datetime.utcnow`` produces a naive UTC value.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    # "User" is resolved by name; it is not defined in this module.
    user = relationship("User", back_populates="ocr_batches")
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return completed files as a percentage of total files.

        Only ``completed_files`` counts towards progress; ``failed_files`` is
        not included. Returns ``0.0`` when ``total_files`` is 0 or still
        ``None`` (instance not flushed yet, so column defaults have not been
        applied); a ``None`` ``completed_files`` is treated as 0.
        """
        # Covers both 0 and None, which avoids a TypeError / ZeroDivisionError.
        if not self.total_files:
            return 0.0
        return ((self.completed_files or 0) / self.total_files) * 100

    def __repr__(self):
        """Return a debug representation with id, status and progress."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"


class OCRFile(Base):
    """One file belonging to an OCR batch.

    Mapped to the ``paddle_ocr_files`` table. Each row references its parent
    batch through ``batch_id`` and has at most one ``OCRResult``
    (the ``result`` relationship is declared with ``uselist=False``).
    """

    __tablename__ = "paddle_ocr_files"

    # Keys
    id = Column(Integer, primary_key=True, index=True)
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # File description
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # png, jpg, pdf, etc.
    file_format = Column(String(20), nullable=False)

    # Processing state
    status = Column(
        Enum(FileStatus),
        index=True,
        nullable=False,
        default=FileStatus.PENDING,
    )
    error_message = Column(Text, nullable=True)
    # Number of retry attempts
    retry_count = Column(Integer, nullable=False, default=0)

    # Timing
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    # Processing time in seconds
    processing_time = Column(Float, nullable=True)

    # Relationships
    batch = relationship("OCRBatch", back_populates="files")
    result = relationship(
        "OCRResult",
        back_populates="file",
        cascade="all, delete-orphan",
        uselist=False,
    )

    def __repr__(self):
        """Return a debug representation with id, filename and status."""
        return f"<OCRFile(id={self.id}, filename='{self.filename}', status='{self.status}')>"


class OCRResult(Base):
    """Result of OCR processing for one file, including structure and images.

    Mapped to the ``paddle_ocr_results`` table. ``file_id`` is declared
    unique, so there is at most one result row per ``OCRFile``.
    """

    __tablename__ = "paddle_ocr_results"

    # Keys
    id = Column(Integer, primary_key=True, index=True)
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
        unique=True,
    )

    # Output file paths
    # Path to Markdown file
    markdown_path = Column(String(512), nullable=True)
    # Path to JSON file
    json_path = Column(String(512), nullable=True)
    # Directory containing extracted images
    images_dir = Column(String(512), nullable=True)

    # OCR metadata
    # ch, en, japan, korean
    detected_language = Column(String(20), nullable=True)
    total_text_regions = Column(Integer, nullable=False, default=0)
    average_confidence = Column(Float, nullable=True)

    # Layout structure data, stored as JSON. Contains the layout elements
    # (title, paragraph, table, image, formula), reading order and
    # bounding boxes.
    layout_data = Column(JSON, nullable=True)

    # Extracted images metadata, stored as JSON. Contains a list of
    # {image_path, bbox, element_type} entries.
    images_metadata = Column(JSON, nullable=True)

    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)

    # Relationships
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a debug representation with id, file id and language."""
        return f"<OCRResult(id={self.id}, file_id={self.file_id}, language='{self.detected_language}')>"