first
This commit is contained in:
122
backend/app/models/ocr.py
Normal file
122
backend/app/models/ocr.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Tool_OCR - OCR Models
|
||||
Database models for OCR batches, files, and results
|
||||
"""
|
||||
|
||||
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
import enum
|
||||
|
||||
from app.core.database import Base
|
||||
|
||||
|
||||
@enum.unique
class BatchStatus(str, enum.Enum):
    """Processing status of an OCR batch.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (e.g. ``BatchStatus.PENDING == "pending"``).
    ``enum.unique`` rejects accidental duplicate values at import time
    instead of silently turning them into aliases.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some files failed
    FAILED = "failed"
|
||||
|
||||
|
||||
@enum.unique
class FileStatus(str, enum.Enum):
    """Processing status of an individual file.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (e.g. ``FileStatus.FAILED == "failed"``).
    ``enum.unique`` rejects accidental duplicate values at import time
    instead of silently turning them into aliases.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
||||
|
||||
|
||||
class OCRBatch(Base):
    """OCR batch processing tracking.

    One row per batch, stored in ``paddle_ocr_batches``. The row references
    its owner through ``user_id`` and keeps running counters
    (``total_files``, ``completed_files``, ``failed_files``) alongside the
    creation/start/completion timestamps.
    """

    __tablename__ = "paddle_ocr_batches"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    batch_name = Column(String(255), nullable=True)
    status = Column(
        Enum(BatchStatus),
        default=BatchStatus.PENDING,
        nullable=False,
        index=True,
    )
    total_files = Column(Integer, default=0, nullable=False)
    completed_files = Column(Integer, default=0, nullable=False)
    failed_files = Column(Integer, default=0, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)

    # Relationships
    user = relationship("User", back_populates="ocr_batches")
    # Files are owned by the batch: removing a file from the collection or
    # deleting the batch deletes the file rows as well.
    files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")

    @property
    def progress_percentage(self) -> float:
        """Return ``completed_files`` as a percentage of ``total_files``.

        Only ``completed_files`` is counted; ``failed_files`` does not
        contribute to the result.

        Returns:
            A percentage as a float, or ``0.0`` when ``total_files`` is zero
            or not yet populated.
        """
        # Column-level defaults are applied when the row is INSERTed, so on a
        # freshly constructed, unflushed instance the counters are still None.
        # Treat None like zero so the property (and __repr__, which uses it)
        # never raises TypeError.
        total = self.total_files
        if not total:
            return 0.0
        done = self.completed_files or 0
        return (done / total) * 100

    def __repr__(self):
        """Return a short debug representation with id, status and progress."""
        return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
|
||||
|
||||
|
||||
class OCRFile(Base):
    """A single file that belongs to an OCR batch.

    Rows live in ``paddle_ocr_files`` and point at their parent batch via
    ``batch_id``. Each file has at most one associated ``OCRResult``.
    """

    __tablename__ = "paddle_ocr_files"

    id = Column(Integer, primary_key=True, index=True)
    batch_id = Column(
        Integer,
        ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    filename = Column(String(255), nullable=False)
    original_filename = Column(String(255), nullable=False)
    file_path = Column(String(512), nullable=False)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # png, jpg, pdf, etc.
    file_format = Column(String(20), nullable=False)
    status = Column(
        Enum(FileStatus),
        default=FileStatus.PENDING,
        nullable=False,
        index=True,
    )
    error_message = Column(Text, nullable=True)
    # Number of retry attempts
    retry_count = Column(Integer, default=0, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    # Processing time in seconds
    processing_time = Column(Float, nullable=True)

    # Relationships
    batch = relationship("OCRBatch", back_populates="files")
    # One-to-one (uselist=False); the result row is deleted with its file.
    result = relationship(
        "OCRResult",
        back_populates="file",
        uselist=False,
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation with id, filename and status."""
        return (
            f"<OCRFile(id={self.id}, "
            f"filename='{self.filename}', "
            f"status='{self.status}')>"
        )
|
||||
|
||||
|
||||
class OCRResult(Base):
    """Result of OCR processing for one file, including structure and images.

    Rows live in ``paddle_ocr_results``. ``file_id`` is unique, so a file has
    at most one result row.
    """

    __tablename__ = "paddle_ocr_results"

    id = Column(Integer, primary_key=True, index=True)
    file_id = Column(
        Integer,
        ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
        index=True,
    )

    # Output file paths
    # Path to Markdown file
    markdown_path = Column(String(512), nullable=True)
    # Path to JSON file
    json_path = Column(String(512), nullable=True)
    # Directory containing extracted images
    images_dir = Column(String(512), nullable=True)

    # OCR metadata
    # ch, en, japan, korean
    detected_language = Column(String(20), nullable=True)
    total_text_regions = Column(Integer, default=0, nullable=False)
    average_confidence = Column(Float, nullable=True)

    # Layout structure data (stored as JSON)
    # Contains: layout elements (title, paragraph, table, image, formula),
    # reading order, bounding boxes
    layout_data = Column(JSON, nullable=True)

    # Extracted images metadata (stored as JSON)
    # Contains: list of {image_path, bbox, element_type}
    images_metadata = Column(JSON, nullable=True)

    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # Relationships
    file = relationship("OCRFile", back_populates="result")

    def __repr__(self):
        """Return a short debug representation with id, file id and language."""
        return (
            f"<OCRResult(id={self.id}, "
            f"file_id={self.file_id}, "
            f"language='{self.detected_language}')>"
        )
|
||||
Reference in New Issue
Block a user