- Force Office documents (PPTX, DOCX, XLSX) to use Direct track after LibreOffice conversion, since converted PDFs always have extractable text
- Fix PDF generator to not exclude text in image regions for Direct track, allowing text to render on top of background images (critical for PPT)
- Increase file_type column from VARCHAR(50) to VARCHAR(100) to support long MIME types like PPTX
- Remove reference to non-existent total_images metadata attribute

This significantly improves processing time for Office documents (from ~170s OCR to ~10s Direct) while preserving text quality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
127 lines
5.0 KiB
Python
127 lines
5.0 KiB
Python
"""
|
|
Tool_OCR - Task Model
|
|
OCR task management with user isolation
|
|
"""
|
|
|
|
from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, ForeignKey, Enum as SQLEnum
|
|
from sqlalchemy.orm import relationship
|
|
from datetime import datetime
|
|
import enum
|
|
|
|
from app.core.database import Base
|
|
|
|
|
|
class TaskStatus(str, enum.Enum):
    """
    Enumeration of the status values an OCR task can hold.

    Members mix in ``str``, so each member compares equal to its
    lowercase string value (e.g. ``TaskStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
class Task(Base):
    """
    OCR Task model with user association

    Each task belongs to a specific user (``user_id``) and stores the
    processing status, the paths of the generated result files, timing
    information and any error message.
    """

    __tablename__ = "tool_ocr_tasks"

    # --- Identity -------------------------------------------------------
    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
    user_id = Column(
        Integer,
        ForeignKey("tool_ocr_users.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="Foreign key to users table",
    )
    task_id = Column(
        String(255),
        unique=True,
        nullable=False,
        index=True,
        comment="Unique task identifier (UUID)",
    )

    # --- Input file description -----------------------------------------
    filename = Column(String(255), nullable=True, index=True)
    # 100 characters leaves room for long MIME type strings.
    file_type = Column(String(100), nullable=True)

    # --- Processing state -----------------------------------------------
    status = Column(
        SQLEnum(TaskStatus),
        default=TaskStatus.PENDING,
        nullable=False,
        index=True,
    )

    # --- Result artifacts -----------------------------------------------
    result_json_path = Column(
        String(500),
        nullable=True,
        comment="Path to JSON result file",
    )
    result_markdown_path = Column(
        String(500),
        nullable=True,
        comment="Path to Markdown result file",
    )
    result_pdf_path = Column(
        String(500),
        nullable=True,
        comment="Path to searchable PDF file",
    )
    error_message = Column(
        Text,
        nullable=True,
        comment="Error details if task failed",
    )
    processing_time_ms = Column(
        Integer,
        nullable=True,
        comment="Processing time in milliseconds",
    )

    # --- Timestamps -----------------------------------------------------
    # NOTE: datetime.utcnow produces naive datetimes (no tzinfo attached).
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )
    completed_at = Column(DateTime, nullable=True)

    file_deleted = Column(
        Boolean,
        default=False,
        nullable=False,
        comment="Track if files were auto-deleted",
    )

    # Relationships
    user = relationship("User", back_populates="tasks")
    # Child TaskFile rows are deleted together with the task / when orphaned.
    files = relationship("TaskFile", back_populates="task", cascade="all, delete-orphan")

    def __repr__(self):
        """Return a short debug representation of the task.

        Safe to call when ``status`` has not been populated yet: the same
        ``None`` guard used by ``to_dict`` is applied here, so ``repr()``
        never raises ``AttributeError`` on ``None.value``.
        """
        # status can still be None on an instance whose column default has
        # not been applied (e.g. constructed but not yet flushed).
        status = self.status.value if self.status else None
        return f"<Task(id={self.id}, task_id='{self.task_id}', status='{status}')>"

    def to_dict(self):
        """Convert task to dictionary

        Returns:
            dict: column values keyed by column name. ``status`` is the
            enum's string value (or ``None``), and the three timestamp
            fields are ISO-8601 strings (or ``None`` when unset).
        """
        return {
            "id": self.id,
            "task_id": self.task_id,
            "filename": self.filename,
            "file_type": self.file_type,
            "status": self.status.value if self.status else None,
            "result_json_path": self.result_json_path,
            "result_markdown_path": self.result_markdown_path,
            "result_pdf_path": self.result_pdf_path,
            "error_message": self.error_message,
            "processing_time_ms": self.processing_time_ms,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
            "file_deleted": self.file_deleted,
        }
|
|
|
|
|
|
class TaskFile(Base):
    """
    Task file model

    One row per file attached to a task: original name, stored path,
    size, MIME type and content hash.
    """

    __tablename__ = "tool_ocr_task_files"

    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
    # References the integer primary key of tool_ocr_tasks.
    task_id = Column(
        Integer,
        ForeignKey("tool_ocr_tasks.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="Foreign key to tasks table",
    )
    original_name = Column(String(255), nullable=True)
    stored_path = Column(
        String(500),
        nullable=True,
        comment="Actual file path on server",
    )
    file_size = Column(
        Integer,
        nullable=True,
        comment="File size in bytes",
    )
    mime_type = Column(String(100), nullable=True)
    file_hash = Column(
        String(64),
        nullable=True,
        index=True,
        comment="SHA256 hash for deduplication",
    )
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # Relationships
    task = relationship("Task", back_populates="files")

    def __repr__(self):
        """Return a short debug representation of the file record."""
        return f"<TaskFile(id={self.id}, task_id={self.task_id}, original_name='{self.original_name}')>"

    def to_dict(self):
        """Convert task file to dictionary

        ``created_at`` is emitted as an ISO-8601 string, or ``None`` when
        the attribute is unset; every other value is passed through as-is.
        """
        if self.created_at:
            created_iso = self.created_at.isoformat()
        else:
            created_iso = None

        return {
            "id": self.id,
            "task_id": self.task_id,
            "original_name": self.original_name,
            "stored_path": self.stored_path,
            "file_size": self.file_size,
            "mime_type": self.mime_type,
            "file_hash": self.file_hash,
            "created_at": created_iso,
        }
|