fix: improve Office document processing with Direct track

- Force Office documents (PPTX, DOCX, XLSX) to use Direct track after
  LibreOffice conversion, since converted PDFs always have extractable text
- Fix PDF generator to not exclude text in image regions for Direct track,
  allowing text to render on top of background images (critical for PPT)
- Increase file_type column from VARCHAR(50) to VARCHAR(100) to support
  long MIME types like PPTX
- Remove reference to non-existent total_images metadata attribute

This significantly improves processing time for Office documents
(from ~170s OCR to ~10s Direct) while preserving text quality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 16:22:04 +08:00
parent 6806fff1d5
commit 87dc97d951
5 changed files with 86 additions and 25 deletions
--- a/backend/alembic/versions/e51c9a16ee16_increase_file_type_column_length.py
+++ b/backend/alembic/versions/e51c9a16ee16_increase_file_type_column_length.py
@@ -0,0 +1,40 @@
 """increase_file_type_column_length
 Revision ID: e51c9a16ee16
 Revises: 4d37f412d37a
 Create Date: 2025-11-30 15:03:28.950186
 """
 from typing import Sequence, Union
 from alembic import op
 import sqlalchemy as sa
 # revision identifiers, used by Alembic.
 revision: str = 'e51c9a16ee16'
 down_revision: Union[str, None] = '4d37f412d37a'
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 def upgrade() -> None:
    """Widen the ``file_type`` column of ``tool_ocr_tasks``.

    Alters the column type from ``String(50)`` to ``String(100)``; the
    column is declared to Alembic as currently nullable.
    """
    # Describe the column as it exists before this revision runs.
    previous_type = sa.String(50)
    # Target type after the upgrade.
    widened_type = sa.String(100)
    op.alter_column(
        'tool_ocr_tasks',
        'file_type',
        existing_type=previous_type,
        type_=widened_type,
        existing_nullable=True,
    )
 def downgrade() -> None:
    """Narrow the ``file_type`` column of ``tool_ocr_tasks`` back.

    Alters the column type from ``String(100)`` to ``String(50)``; the
    column is declared to Alembic as currently nullable.

    NOTE(review): rows holding values longer than 50 characters may be
    truncated or rejected by the database when this runs -- confirm the
    behaviour for the target backend before downgrading.
    """
    # Describe the column as it exists after the upgrade was applied.
    widened_type = sa.String(100)
    # Type to restore on downgrade.
    previous_type = sa.String(50)
    op.alter_column(
        'tool_ocr_tasks',
        'file_type',
        existing_type=widened_type,
        type_=previous_type,
        existing_nullable=True,
    )
--- a/backend/app/models/task.py
+++ b/backend/app/models/task.py
@@ -36,7 +36,7 @@ class Task(Base):
    task_id = Column(String(255), unique=True, nullable=False, index=True,
                    comment="Unique task identifier (UUID)")
    filename = Column(String(255), nullable=True, index=True)
-    file_type = Column(String(50), nullable=True)
+    file_type = Column(String(100), nullable=True)
    status = Column(SQLEnum(TaskStatus), default=TaskStatus.PENDING, nullable=False,
                   index=True)
    result_json_path = Column(String(500), nullable=True,
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -1317,8 +1317,6 @@ class DirectExtractionEngine:
            doc.close()
            if images_added > 0:
                current_images = unified_doc.metadata.total_images or 0
                unified_doc.metadata.total_images = current_images + images_added
                logger.info(f"Added {images_added} inline image regions to document")
        except Exception as e:
--- a/backend/app/services/document_type_detector.py
+++ b/backend/app/services/document_type_detector.py
@@ -291,9 +291,14 @@ class DocumentTypeDetector:
        Strategy:
        1. Convert Office file to PDF using LibreOffice
-        2. Analyze the converted PDF for text extractability
+        2. LibreOffice always produces text-based PDFs (not scanned images)
-        3. Route to direct track if PDF has extractable text
+        3. Always use Direct track for successful conversions
        4. This significantly improves processing time (from >300s to ~2-5s)
        Note: LibreOffice conversion preserves text as extractable text layer,
        even for documents with complex backgrounds (PPT slides, etc.).
        The "mixed content" detection in PDF analysis is misleading for Office docs
        because it counts background images, not scanned text.
        """
        document_type = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)
        file_size = file_path.stat().st_size
@@ -318,32 +323,24 @@ class DocumentTypeDetector:
                pdf_path = converter.convert_to_pdf(file_path, temp_path)
                logger.info(f"Office document converted to PDF: {pdf_path.name}")
-                # Analyze the converted PDF for text extractability
+                # Analyze the converted PDF for metadata (but always use Direct track)
                pdf_recommendation = self._analyze_pdf(pdf_path)
                # Merge metadata
                merged_metadata = {**base_metadata, **pdf_recommendation.metadata}
                merged_metadata["converted_pdf_analyzed"] = True
-                # Determine final recommendation based on PDF analysis
+                # LibreOffice always produces text-based PDFs - use Direct track
-                if pdf_recommendation.track == "direct":
+                # Even "mixed content" PDFs from Office docs have extractable text
-                    # Converted PDF has extractable text - use direct track
+                # The images are backgrounds/decorations, not scanned content
-                    return ProcessingTrackRecommendation(
+                text_coverage = pdf_recommendation.metadata.get('text_coverage', 0)
-                        track="direct",
+                return ProcessingTrackRecommendation(
-                        confidence=pdf_recommendation.confidence * 0.95,  # Slightly lower confidence for converted files
+                    track="direct",
-                        reason=f"Office document converted to text-based PDF ({pdf_recommendation.metadata.get('text_coverage', 0):.0%} text coverage)",
+                    confidence=0.95,
-                        document_type=document_type,  # Keep original Office type
+                    reason=f"Office document converted to PDF (text coverage: {text_coverage:.0%}, using Direct track)",
-                        metadata=merged_metadata
+                    document_type=document_type,
-                    )
+                    metadata=merged_metadata
-                else:
+                )
                    # Converted PDF is image-based or mixed - use OCR track
                    return ProcessingTrackRecommendation(
                        track="ocr",
                        confidence=pdf_recommendation.confidence,
                        reason=f"Office document converted to image-based PDF, requires OCR",
                        document_type=document_type,  # Keep original Office type
                        metadata=merged_metadata
                    )
        except OfficeConverterError as e:
            logger.error(f"Office conversion failed: {e}")
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -854,6 +854,9 @@ class PDFGeneratorService:
                # FIX: Collect exclusion regions (tables, images) to prevent duplicate rendering
                regions_to_avoid = []
                # Calculate page area for background detection
                page_area = current_page_width * current_page_height
                for element in page.elements:
                    if element.type == ElementType.TABLE:
                        table_elements.append(element)
@@ -867,6 +870,29 @@ class PDFGeneratorService:
                        # Charts often have large bounding boxes that include text labels
                        # which should be rendered as selectable text on top
                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
                            # Check if this is Direct track (text from PDF text layer, not OCR)
                            is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or
                                        self.current_processing_track == ProcessingTrack.HYBRID)
                            if is_direct:
                                # Direct track: text is from PDF text layer, not OCR'd from images
                                # Don't exclude any images - text should be rendered on top
                                # This is critical for Office documents with background images
                                logger.debug(f"Direct track: not excluding {element.element_id} from text regions")
                                continue
                            # OCR track: Skip full-page background images from exclusion regions
                            # Smaller images that might contain OCR'd text should still be excluded
                            if element.bbox:
                                elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
                                coverage_ratio = elem_area / page_area if page_area > 0 else 0
                                # If image covers >70% of page, it's likely a background - don't exclude text
                                if coverage_ratio > 0.7:
                                    logger.debug(f"OCR track: skipping background image {element.element_id} from exclusion "
                                               f"(covers {coverage_ratio*100:.1f}% of page)")
                                    continue
                            regions_to_avoid.append(element)
                    elif element.type == ElementType.LIST_ITEM:
                        list_elements.append(element)