fix: improve Office document processing with Direct track
- Force Office documents (PPTX, DOCX, XLSX) to use Direct track after LibreOffice conversion, since converted PDFs always have extractable text
- Fix PDF generator to not exclude text in image regions for Direct track, allowing text to render on top of background images (critical for PPT)
- Increase file_type column from VARCHAR(50) to VARCHAR(100) to support long MIME types like PPTX
- Remove reference to non-existent total_images metadata attribute

This significantly improves processing time for Office documents (from ~170s OCR to ~10s Direct) while preserving text quality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
"""increase_file_type_column_length
|
||||
|
||||
Revision ID: e51c9a16ee16
|
||||
Revises: 4d37f412d37a
|
||||
Create Date: 2025-11-30 15:03:28.950186
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this migration's own ID and the ID of the
# migration it revises.
revision: str = 'e51c9a16ee16'
down_revision: Union[str, None] = '4d37f412d37a'

# No branch labels and no dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Widen ``tool_ocr_tasks.file_type`` from VARCHAR(50) to VARCHAR(100).

    The column's existing nullability (nullable) is passed through so the
    ALTER statement only changes the length.
    """
    previous_type = sa.String(50)
    widened_type = sa.String(100)

    op.alter_column(
        'tool_ocr_tasks',
        'file_type',
        type_=widened_type,
        existing_type=previous_type,
        existing_nullable=True,
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Narrow ``tool_ocr_tasks.file_type`` back from VARCHAR(100) to VARCHAR(50).

    The column's existing nullability (nullable) is passed through so the
    ALTER statement only changes the length.

    NOTE(review): rows holding values longer than 50 characters may make this
    ALTER fail or be truncated, depending on the database backend -- confirm
    before running a downgrade against populated data.
    """
    current_type = sa.String(100)
    narrowed_type = sa.String(50)

    op.alter_column(
        'tool_ocr_tasks',
        'file_type',
        type_=narrowed_type,
        existing_type=current_type,
        existing_nullable=True,
    )
|
||||
Reference in New Issue
Block a user