diff --git a/backend/alembic/versions/e51c9a16ee16_increase_file_type_column_length.py b/backend/alembic/versions/e51c9a16ee16_increase_file_type_column_length.py new file mode 100644 index 0000000..298e5e7 --- /dev/null +++ b/backend/alembic/versions/e51c9a16ee16_increase_file_type_column_length.py @@ -0,0 +1,40 @@ +"""increase_file_type_column_length + +Revision ID: e51c9a16ee16 +Revises: 4d37f412d37a +Create Date: 2025-11-30 15:03:28.950186 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'e51c9a16ee16' +down_revision: Union[str, None] = '4d37f412d37a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.alter_column( + 'tool_ocr_tasks', + 'file_type', + existing_type=sa.String(50), + type_=sa.String(100), + existing_nullable=True + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.alter_column( + 'tool_ocr_tasks', + 'file_type', + existing_type=sa.String(100), + type_=sa.String(50), + existing_nullable=True + ) diff --git a/backend/app/models/task.py b/backend/app/models/task.py index 735ef02..cf78380 100644 --- a/backend/app/models/task.py +++ b/backend/app/models/task.py @@ -36,7 +36,7 @@ class Task(Base): task_id = Column(String(255), unique=True, nullable=False, index=True, comment="Unique task identifier (UUID)") filename = Column(String(255), nullable=True, index=True) - file_type = Column(String(50), nullable=True) + file_type = Column(String(100), nullable=True) status = Column(SQLEnum(TaskStatus), default=TaskStatus.PENDING, nullable=False, index=True) result_json_path = Column(String(500), nullable=True, diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 0aad35c..6ec39f0 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -1317,8 +1317,6 @@ class DirectExtractionEngine: doc.close() if images_added > 0: - current_images = unified_doc.metadata.total_images or 0 - unified_doc.metadata.total_images = current_images + images_added logger.info(f"Added {images_added} inline image regions to document") except Exception as e: diff --git a/backend/app/services/document_type_detector.py b/backend/app/services/document_type_detector.py index d13c710..f12266d 100644 --- a/backend/app/services/document_type_detector.py +++ b/backend/app/services/document_type_detector.py @@ -291,9 +291,14 @@ class DocumentTypeDetector: Strategy: 1. Convert Office file to PDF using LibreOffice - 2. Analyze the converted PDF for text extractability - 3. Route to direct track if PDF has extractable text + 2. LibreOffice always produces text-based PDFs (not scanned images) + 3. Always use Direct track for successful conversions 4. This significantly improves processing time (from >300s to ~2-5s) + + Note: LibreOffice conversion preserves text as extractable text layer, + even for documents with complex backgrounds (PPT slides, etc.). + The "mixed content" detection in PDF analysis is misleading for Office docs + because it counts background images, not scanned text. """ document_type = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN) file_size = file_path.stat().st_size @@ -318,32 +323,24 @@ class DocumentTypeDetector: pdf_path = converter.convert_to_pdf(file_path, temp_path) logger.info(f"Office document converted to PDF: {pdf_path.name}") - # Analyze the converted PDF for text extractability + # Analyze the converted PDF for metadata (but always use Direct track) pdf_recommendation = self._analyze_pdf(pdf_path) # Merge metadata merged_metadata = {**base_metadata, **pdf_recommendation.metadata} merged_metadata["converted_pdf_analyzed"] = True - # Determine final recommendation based on PDF analysis - if pdf_recommendation.track == "direct": - # Converted PDF has extractable text - use direct track - return ProcessingTrackRecommendation( - track="direct", - confidence=pdf_recommendation.confidence * 0.95, # Slightly lower confidence for converted files - reason=f"Office document converted to text-based PDF ({pdf_recommendation.metadata.get('text_coverage', 0):.0%} text coverage)", - document_type=document_type, # Keep original Office type - metadata=merged_metadata - ) - else: - # Converted PDF is image-based or mixed - use OCR track - return ProcessingTrackRecommendation( - track="ocr", - confidence=pdf_recommendation.confidence, - reason=f"Office document converted to image-based PDF, requires OCR", - document_type=document_type, # Keep original Office type - metadata=merged_metadata - ) + # LibreOffice always produces text-based PDFs - use Direct track + # Even "mixed content" PDFs from Office docs have extractable text + # The images are backgrounds/decorations, not scanned content + text_coverage = pdf_recommendation.metadata.get('text_coverage', 0) + return ProcessingTrackRecommendation( + track="direct", + confidence=0.95, + reason=f"Office document converted to PDF (text coverage: {text_coverage:.0%}, using Direct track)", + document_type=document_type, + metadata=merged_metadata + ) except OfficeConverterError as e: logger.error(f"Office conversion failed: {e}") diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 9e39643..fb4c076 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -854,6 +854,9 @@ class PDFGeneratorService: # FIX: Collect exclusion regions (tables, images) to prevent duplicate rendering regions_to_avoid = [] + # Calculate page area for background detection + page_area = current_page_width * current_page_height + for element in page.elements: if element.type == ElementType.TABLE: table_elements.append(element) @@ -867,6 +870,29 @@ class PDFGeneratorService: # Charts often have large bounding boxes that include text labels # which should be rendered as selectable text on top if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]: + # Check if this is Direct track (text from PDF text layer, not OCR) + is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or + self.current_processing_track == ProcessingTrack.HYBRID) + + if is_direct: + # Direct track: text is from PDF text layer, not OCR'd from images + # Don't exclude any images - text should be rendered on top + # This is critical for Office documents with background images + logger.debug(f"Direct track: not excluding {element.element_id} from text regions") + continue + + # OCR track: Skip full-page background images from exclusion regions + # Smaller images that might contain OCR'd text should still be excluded + if element.bbox: + elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0) + coverage_ratio = elem_area / page_area if page_area > 0 else 0 + + # If image covers >70% of page, it's likely a background - don't exclude text + if coverage_ratio > 0.7: + logger.debug(f"OCR track: skipping background image {element.element_id} from exclusion " + f"(covers {coverage_ratio*100:.1f}% of page)") + continue + regions_to_avoid.append(element) elif element.type == ElementType.LIST_ITEM: list_elements.append(element)