feat: implement Office document direct extraction (Section 2.4)

- Update DocumentTypeDetector._analyze_office to convert Office to PDF first
- Analyze converted PDF for text extractability before routing
- Route text-based Office documents to direct track (10x faster)
- Update OCR service to convert Office files for DirectExtractionEngine
- Add unit tests for Office → PDF → Direct extraction flow
- Handle conversion failures with fallback to OCR track

This optimization reduces Office document processing from >300s to ~2-5s for text-based documents by avoiding unnecessary OCR processing.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 12:20:50 +08:00
parent 0974fc3a54
commit ef335cf3af
4 changed files with 284 additions and 28 deletions
--- a/backend/app/services/document_type_detector.py
+++ b/backend/app/services/document_type_detector.py
@@ -13,6 +13,9 @@ from pathlib import Path
 from typing import Dict, Optional, Tuple, List
 from enum import Enum
 import statistics
+import tempfile
+
+from app.services.office_converter import OfficeConverter, OfficeConverterError

 logger = logging.getLogger(__name__)

@@ -284,29 +287,83 @@ class DocumentTypeDetector:

    def _analyze_office(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
        """
-        Analyze Office document.
+        Analyze Office document by converting to PDF first.

-        Currently routes all Office documents to OCR track.
-        Future enhancement: implement direct extraction for Office files.
+        Strategy:
+        1. Convert Office file to PDF using LibreOffice
+        2. Analyze the converted PDF for text extractability
+        3. Route to direct track if PDF has extractable text
+        4. This significantly improves processing time (from >300s to ~2-5s)
        """
        document_type = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)
        file_size = file_path.stat().st_size

-        metadata = {
+        base_metadata = {
            "mime_type": mime_type,
            "file_size": file_size,
-            "file_extension": file_path.suffix
+            "file_extension": file_path.suffix,
+            "original_document_type": document_type.value
        }

-        # TODO: In future, we could implement direct extraction for Office files
-        # using python-docx, openpyxl, python-pptx
-        return ProcessingTrackRecommendation(
-            track="ocr",
-            confidence=0.9,
-            reason="Office documents currently processed via OCR (direct extraction planned)",
-            document_type=document_type,
-            metadata=metadata
-        )
+        try:
+            # Initialize Office converter
+            converter = OfficeConverter()
+
+            # Create temporary directory for converted PDF
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_path = Path(temp_dir)
+
+                # Convert Office to PDF
+                logger.info(f"Converting Office document to PDF: {file_path.name}")
+                pdf_path = converter.convert_to_pdf(file_path, temp_path)
+                logger.info(f"Office document converted to PDF: {pdf_path.name}")
+
+                # Analyze the converted PDF for text extractability
+                pdf_recommendation = self._analyze_pdf(pdf_path)
+
+                # Merge metadata
+                merged_metadata = {**base_metadata, **pdf_recommendation.metadata}
+                merged_metadata["converted_pdf_analyzed"] = True
+
+                # Determine final recommendation based on PDF analysis
+                if pdf_recommendation.track == "direct":
+                    # Converted PDF has extractable text - use direct track
+                    return ProcessingTrackRecommendation(
+                        track="direct",
+                        confidence=pdf_recommendation.confidence * 0.95,  # Slightly lower confidence for converted files
+                        reason=f"Office document converted to text-based PDF ({pdf_recommendation.metadata.get('text_coverage', 0):.0%} text coverage)",
+                        document_type=document_type,  # Keep original Office type
+                        metadata=merged_metadata
+                    )
+                else:
+                    # Converted PDF is image-based or mixed - use OCR track
+                    return ProcessingTrackRecommendation(
+                        track="ocr",
+                        confidence=pdf_recommendation.confidence,
+                        reason=f"Office document converted to image-based PDF, requires OCR",
+                        document_type=document_type,  # Keep original Office type
+                        metadata=merged_metadata
+                    )
+
+        except OfficeConverterError as e:
+            logger.error(f"Office conversion failed: {e}")
+            # Fallback to OCR if conversion fails
+            return ProcessingTrackRecommendation(
+                track="ocr",
+                confidence=0.7,
+                reason=f"Office conversion failed ({str(e)}), using OCR as fallback",
+                document_type=document_type,
+                metadata=base_metadata
+            )
+        except Exception as e:
+            logger.error(f"Error analyzing Office document: {e}")
+            return ProcessingTrackRecommendation(
+                track="ocr",
+                confidence=0.5,
+                reason=f"Error during Office analysis: {str(e)}",
+                document_type=document_type,
+                metadata=base_metadata
+            )

    def _analyze_text(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
        """
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -1025,12 +1025,46 @@ class OCRService:
                logger.info(f"Reason: {recommendation.reason}")

            # Route to appropriate processing track
+            unified_doc = None
+
            if recommendation.track == "direct":
                # Use direct extraction for editable PDFs
                logger.info("Using DIRECT extraction track (PyMuPDF)")
-                unified_doc = self.direct_extraction_engine.extract(file_path, output_dir)
-                unified_doc.document_id = document_id
-            else:
+
+                # Check if file is Office document - needs conversion to PDF first
+                actual_file_path = file_path
+                temp_pdf_path = None
+
+                if self.office_converter.is_office_document(file_path):
+                    # Convert Office to PDF for direct extraction
+                    logger.info(f"Converting Office document to PDF for direct extraction: {file_path.name}")
+                    try:
+                        # Convert to output directory or file parent
+                        convert_dir = output_dir if output_dir else file_path.parent
+                        temp_pdf_path = self.office_converter.convert_to_pdf(file_path, convert_dir)
+                        actual_file_path = temp_pdf_path
+                        logger.info(f"Office document converted to PDF: {temp_pdf_path.name}")
+                    except OfficeConverterError as e:
+                        logger.error(f"Office conversion failed, falling back to OCR: {e}")
+                        # Fallback to OCR if conversion fails
+                        recommendation = ProcessingTrackRecommendation(
+                            track="ocr",
+                            confidence=0.7,
+                            reason=f"Office conversion failed ({str(e)}), using OCR as fallback",
+                            document_type=recommendation.document_type
+                        )
+
+                # Only proceed with direct extraction if track is still "direct"
+                if recommendation.track == "direct":
+                    unified_doc = self.direct_extraction_engine.extract(actual_file_path, output_dir)
+                    unified_doc.document_id = document_id
+
+                    # Update metadata with original filename if Office was converted
+                    if temp_pdf_path:
+                        unified_doc.metadata.original_filename = file_path.name
+
+            # Use OCR track (either by recommendation or fallback)
+            if recommendation.track == "ocr":
                # Use OCR for scanned documents, images, etc.
                logger.info("Using OCR track (PaddleOCR)")
                ocr_result = self.process_file_traditional(