Files
OCR/backend/app/models/ocr.py
beabigegg da700721fa first
2025-11-12 22:53:17 +08:00

123 lines
4.7 KiB
Python

"""
Tool_OCR - OCR Models
Database models for OCR batches, files, and results
"""
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
import enum
from app.core.database import Base
class BatchStatus(str, enum.Enum):
    """Lifecycle states of an OCR batch.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (e.g. ``BatchStatus.PENDING == "pending"``).
    """

    # Not started yet.
    PENDING = "pending"
    # Work on the batch is in progress.
    PROCESSING = "processing"
    # Finished.
    COMPLETED = "completed"
    # Finished, but some files failed.
    PARTIAL = "partial"
    # The batch failed.
    FAILED = "failed"
class FileStatus(str, enum.Enum):
    """Lifecycle states of a single file inside an OCR batch.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (e.g. ``FileStatus.FAILED == "failed"``).
    """

    # Not started yet.
    PENDING = "pending"
    # Work on the file is in progress.
    PROCESSING = "processing"
    # Finished.
    COMPLETED = "completed"
    # The file failed.
    FAILED = "failed"
class OCRBatch(Base):
    """OCR batch processing tracking.

    One row per batch in ``paddle_ocr_batches``. A batch belongs to a user
    (``paddle_ocr_users``), owns a collection of ``OCRFile`` rows, and keeps
    running counters of how many of its files completed or failed.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    # Owning user; the database removes the batch when the user row is deleted.
    user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
    batch_name = Column(String(255), nullable=True)
    status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)

    # File counters. The defaults below are applied by SQLAlchemy at INSERT
    # time, so an instance that has not been flushed yet may still hold None.
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)

    # Timestamps. ``created_at`` is filled with a naive UTC datetime.
    # NOTE(review): datetime.utcnow is deprecated as of Python 3.12 — consider
    # migrating together with the other models.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    user = relationship("User", back_populates="ocr_batches")
    # Files are deleted along with the batch (and when detached from it).
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return the share of completed files as a percentage.

        Returns:
            ``completed_files / total_files * 100``, or ``0.0`` when the
            batch has no files. Counters that are still ``None`` (instance
            not flushed yet, so column defaults are not applied) are treated
            as zero instead of raising ``TypeError``.
        """
        total = self.total_files
        if not total:
            # Covers both 0 and None; avoids division by zero / None.
            return 0.0
        completed = self.completed_files or 0
        return (completed / total) * 100

    def __repr__(self):
        """Return a short debug representation with id, status and progress."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
class OCRFile(Base):
    """A single file that belongs to an OCR batch.

    One row per file in ``paddle_ocr_files``. Each row records where the file
    lives, its processing status and timing, and links to at most one
    ``OCRResult``.
    """

    __tablename__ = "paddle_ocr_files"

    id = Column(Integer, primary_key=True, index=True)
    # Parent batch; the database removes the file row when the batch is deleted.
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # File identity and location.
    # NOTE(review): presumably ``filename`` is the stored name and
    # ``original_filename`` the uploaded one — confirm against the caller.
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    file_size = Column(Integer, nullable=False)  # bytes
    file_format = Column(String(20), nullable=False)  # e.g. png, jpg, pdf

    # Processing state.
    status = Column(
        Enum(FileStatus),
        default=FileStatus.PENDING,
        nullable=False,
        index=True,
    )
    error_message = Column(Text, nullable=True)
    retry_count = Column(Integer, default=0, nullable=False)  # retry attempts so far

    # Timing; ``created_at`` defaults to a naive UTC datetime.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    processing_time = Column(Float, nullable=True)  # seconds

    # Relationships
    batch = relationship("OCRBatch", back_populates="files")
    # One-to-one with the result; the result is deleted together with the file.
    result = relationship(
        "OCRResult",
        back_populates="file",
        uselist=False,
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation with id, filename and status."""
        return (
            f"<OCRFile(id={self.id}, "
            f"filename='{self.filename}', "
            f"status='{self.status}')>"
        )
class OCRResult(Base):
    """Outcome of OCR processing for one file.

    One row per processed file in ``paddle_ocr_results`` (``file_id`` is
    unique). Holds paths to the generated outputs, summary metadata, and the
    layout / image structures stored as JSON.
    """

    __tablename__ = "paddle_ocr_results"

    id = Column(Integer, primary_key=True, index=True)
    # Source file; unique, so a file has at most one result row.
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
        index=True,
    )

    # Output file paths
    markdown_path = Column(String(512), nullable=True)  # Markdown output
    json_path = Column(String(512), nullable=True)  # JSON output
    images_dir = Column(String(512), nullable=True)  # directory of extracted images

    # OCR metadata
    detected_language = Column(String(20), nullable=True)  # ch, en, japan, korean
    total_text_regions = Column(Integer, default=0, nullable=False)
    average_confidence = Column(Float, nullable=True)

    # Layout structure (JSON): layout elements (title, paragraph, table,
    # image, formula), reading order, bounding boxes.
    layout_data = Column(JSON, nullable=True)

    # Extracted images metadata (JSON): list of
    # {image_path, bbox, element_type}.
    images_metadata = Column(JSON, nullable=True)

    # Defaults to a naive UTC datetime.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # Relationships
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a short debug representation with id, file id and language."""
        return (
            f"<OCRResult(id={self.id}, "
            f"file_id={self.file_id}, "
            f"language='{self.detected_language}')>"
        )