"""
Integration tests for dual-track document processing.

Tests cover:
- Routing logic between OCR and Direct tracks
- UnifiedDocument generation from both tracks
- Backward compatibility with legacy formats

NOTE(review): every test requests a ``temp_dir`` fixture that is not defined
in this module; it is presumably supplied by a shared ``conftest.py`` and
yields a writable ``pathlib.Path`` directory -- confirm.
"""

import contextlib
import io
import json
from pathlib import Path

import pytest
import fitz
from PIL import Image

from app.services.document_type_detector import DocumentTypeDetector, DocumentType
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.models.unified_document import (
    UnifiedDocument,
    ProcessingTrack,
    ElementType
)
from app.services.unified_document_exporter import UnifiedDocumentExporter, ExportFormat


# Layout constants shared by all synthetic PDFs built in this module.
_TEXT_ORIGIN = (50, 50)
_FONT_SIZE = 12
_IMAGE_SIZE = (400, 300)
_IMAGE_RECT = (50, 50, 550, 750)


@contextlib.contextmanager
def _new_pdf(pdf_path: Path):
    """Yield a new in-memory PDF, save it to ``pdf_path`` and always close it.

    The document is saved only when the ``with`` body completes without
    raising; it is closed in every case so a failing test cannot leak the
    document handle.
    """
    doc = fitz.open()
    try:
        yield doc
        doc.save(str(pdf_path))
    finally:
        doc.close()


def _add_text_page(doc, text: str) -> None:
    """Append a page to ``doc`` with ``text`` inserted at the shared origin."""
    page = doc.new_page()
    page.insert_text(_TEXT_ORIGIN, text, fontsize=_FONT_SIZE)


def _add_image_page(doc, color) -> None:
    """Append a page to ``doc`` holding only a solid-colour PNG (no text)."""
    buffer = io.BytesIO()
    Image.new('RGB', _IMAGE_SIZE, color=color).save(buffer, format='PNG')
    page = doc.new_page()
    page.insert_image(fitz.Rect(*_IMAGE_RECT), stream=buffer.getvalue())


def _write_text_pdf(pdf_path: Path, page_texts) -> Path:
    """Write a PDF with one text page per entry of ``page_texts``.

    Returns ``pdf_path`` so the call can be used inline.
    """
    with _new_pdf(pdf_path) as doc:
        for text in page_texts:
            _add_text_page(doc, text)
    return pdf_path


class TestDualTrackRouting:
    """Test routing logic between OCR and Direct tracks."""

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    # ===== Routing Tests =====

    def test_route_editable_pdf_to_direct(self, detector, temp_dir):
        """Test that editable PDFs are routed to direct track."""
        pdf_path = _write_text_pdf(
            temp_dir / "editable.pdf",
            [f"Page {i+1}: " + "Text content " * 20 for i in range(3)],
        )

        result = detector.detect(pdf_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9

    def test_route_scanned_pdf_to_ocr(self, detector, temp_dir):
        """Test that scanned PDFs are routed to OCR track."""
        pdf_path = temp_dir / "scanned.pdf"
        with _new_pdf(pdf_path) as doc:
            for _ in range(3):
                # Only images, no text
                _add_image_page(doc, (200, 200, 200))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED

    def test_route_image_to_ocr(self, detector, temp_dir):
        """Test that images are routed to OCR track."""
        img_path = temp_dir / "image.png"
        Image.new('RGB', (100, 100), color='red').save(str(img_path))

        result = detector.detect(img_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0

    def test_route_text_file_to_direct(self, detector, temp_dir):
        """Test that text files are routed to direct track."""
        txt_path = temp_dir / "text.txt"
        txt_path.write_text("This is a plain text file.\n" * 10)

        result = detector.detect(txt_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT

    def test_route_mixed_pdf_to_ocr(self, detector, temp_dir):
        """Test that mixed PDFs are routed to OCR track."""
        pdf_path = temp_dir / "mixed.pdf"
        with _new_pdf(pdf_path) as doc:
            # Page 1: Text
            _add_text_page(doc, "Text content " * 20)
            # Page 2: Image only
            _add_image_page(doc, (200, 200, 200))
            # Page 3: Image only
            _add_image_page(doc, (150, 150, 150))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED


class TestUnifiedDocumentGeneration:
    """Test UnifiedDocument generation from both tracks."""

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    # ===== Direct Track Generation =====

    def test_direct_track_generates_unified_document(self, direct_engine, temp_dir):
        """Test that direct track generates valid UnifiedDocument."""
        pdf_path = _write_text_pdf(temp_dir / "test.pdf", ["Test content"])

        result = direct_engine.extract(pdf_path)

        assert isinstance(result, UnifiedDocument)
        assert result.document_id is not None
        assert result.metadata is not None
        assert len(result.pages) == 1

    def test_unified_document_has_required_fields(self, direct_engine, temp_dir):
        """Test that UnifiedDocument has all required fields."""
        pdf_path = _write_text_pdf(temp_dir / "complete.pdf", ["Complete document"])

        result = direct_engine.extract(pdf_path)

        # Check metadata
        assert result.metadata.filename == "complete.pdf"
        assert result.metadata.file_type == "pdf"
        assert result.metadata.processing_track == ProcessingTrack.DIRECT
        assert result.metadata.processing_time >= 0

        # Check page structure
        page = result.pages[0]
        assert page.page_number == 1
        assert page.dimensions is not None
        assert page.elements is not None

    def test_elements_have_required_fields(self, direct_engine, temp_dir):
        """Test that elements have all required fields."""
        pdf_path = _write_text_pdf(temp_dir / "elements.pdf", ["Element test"])

        result = direct_engine.extract(pdf_path)

        # NOTE(review): this loop passes vacuously if extraction yields no
        # elements; consider asserting the list is non-empty once the engine's
        # contract for a one-line text page is confirmed.
        for element in result.pages[0].elements:
            assert element.element_id is not None
            assert element.type is not None
            assert element.bbox is not None


class TestUnifiedDocumentExport:
    """Test UnifiedDocument export functionality."""

    @pytest.fixture
    def exporter(self):
        """Create exporter."""
        return UnifiedDocumentExporter()

    @pytest.fixture
    def sample_document(self, temp_dir):
        """Create a sample UnifiedDocument."""
        engine = DirectExtractionEngine()
        pdf_path = _write_text_pdf(
            temp_dir / "sample.pdf", ["Sample document for export"]
        )
        return engine.extract(pdf_path)

    def test_export_to_json(self, exporter, sample_document, temp_dir):
        """Test export to JSON format."""
        output_path = temp_dir / "output.json"

        exporter.export_to_json(sample_document, output_path)

        assert output_path.exists()

        # Verify JSON is valid; read as UTF-8 rather than the platform default
        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)

        assert "document_id" in data
        assert "metadata" in data
        assert "pages" in data

    def test_export_to_markdown(self, exporter, sample_document, temp_dir):
        """Test export to Markdown format."""
        output_path = temp_dir / "output.md"

        exporter.export_to_markdown(sample_document, output_path)

        assert output_path.exists()

        content = output_path.read_text(encoding="utf-8")
        assert len(content) > 0

    def test_export_to_text(self, exporter, sample_document):
        """Test export to plain text."""
        text = exporter.export_to_text(sample_document)

        assert isinstance(text, str)
        assert len(text) > 0

    def test_export_legacy_format(self, exporter, sample_document, temp_dir):
        """Test export to legacy JSON format for backward compatibility."""
        output_path = temp_dir / "legacy.json"

        exporter.export_to_legacy_json(sample_document, output_path)

        assert output_path.exists()

        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)

        # Legacy format should have different structure
        assert isinstance(data, (dict, list))


class TestBackwardCompatibility:
    """Test backward compatibility with existing system."""

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    def test_document_can_be_serialized(self, direct_engine, temp_dir):
        """Test that UnifiedDocument can be serialized to dict."""
        pdf_path = _write_text_pdf(temp_dir / "serialize.pdf", ["Serializable"])

        result = direct_engine.extract(pdf_path)

        # Should be serializable
        doc_dict = result.to_dict()

        assert isinstance(doc_dict, dict)
        assert "document_id" in doc_dict
        assert "metadata" in doc_dict
        assert "pages" in doc_dict

    def test_element_types_are_strings(self, direct_engine, temp_dir):
        """Test that element types serialize to strings."""
        pdf_path = _write_text_pdf(temp_dir / "types.pdf", ["Test"])

        result = direct_engine.extract(pdf_path)
        doc_dict = result.to_dict()

        # NOTE(review): passes vacuously when no pages/elements are serialized.
        for page_data in doc_dict.get("pages", []):
            for element in page_data.get("elements", []):
                assert isinstance(element.get("type"), str)

    def test_processing_track_is_string(self, direct_engine, temp_dir):
        """Test that processing track serializes to string."""
        pdf_path = _write_text_pdf(temp_dir / "track.pdf", ["Track test"])

        result = direct_engine.extract(pdf_path)
        doc_dict = result.to_dict()

        track = doc_dict.get("metadata", {}).get("processing_track")
        assert isinstance(track, str)
        assert track in ["ocr", "direct", "hybrid", "auto"]


class TestCrossTrackConsistency:
    """Test consistency between OCR and Direct track outputs."""

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    def test_both_tracks_produce_unified_document(self, detector, direct_engine, temp_dir):
        """Test that both tracks produce UnifiedDocument format."""
        # Create editable PDF for direct track
        pdf_path = _write_text_pdf(
            temp_dir / "editable.pdf", ["Editable content " * 20]
        )

        # Detect and route
        detection = detector.detect(pdf_path)
        assert detection.track == "direct"

        # Extract
        result = direct_engine.extract(pdf_path)

        # Verify UnifiedDocument structure
        assert isinstance(result, UnifiedDocument)
        assert result.metadata.processing_track == ProcessingTrack.DIRECT

    def test_metadata_structure_consistent(self, direct_engine, temp_dir):
        """Test that metadata structure is consistent."""
        pdf_path = _write_text_pdf(temp_dir / "metadata.pdf", ["Metadata test"])

        result = direct_engine.extract(pdf_path)
        metadata = result.metadata

        # Required metadata fields
        assert hasattr(metadata, 'filename')
        assert hasattr(metadata, 'file_type')
        assert hasattr(metadata, 'file_size')
        assert hasattr(metadata, 'processing_track')
        assert hasattr(metadata, 'processing_time')
        assert hasattr(metadata, 'created_at')

    def test_element_structure_consistent(self, direct_engine, temp_dir):
        """Test that element structure is consistent."""
        pdf_path = _write_text_pdf(
            temp_dir / "elements.pdf", ["Element structure test"]
        )

        result = direct_engine.extract(pdf_path)

        # NOTE(review): passes vacuously if the page has no elements.
        for element in result.pages[0].elements:
            # Required element fields
            assert hasattr(element, 'element_id')
            assert hasattr(element, 'type')
            assert hasattr(element, 'content')
            assert hasattr(element, 'bbox')
            assert hasattr(element, 'confidence')


class TestEdgeCases:
    """Test edge cases in dual-track processing."""

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    def test_empty_pdf(self, direct_engine, temp_dir):
        """Test handling of empty PDF."""
        pdf_path = temp_dir / "empty.pdf"
        with _new_pdf(pdf_path) as doc:
            # A single blank page: no text, no images
            doc.new_page()

        result = direct_engine.extract(pdf_path)

        assert len(result.pages) == 1
        # May or may not have elements

    def test_very_small_file(self, detector, temp_dir):
        """Test handling of very small files."""
        small_file = temp_dir / "small.txt"
        small_file.write_text("Hi")

        result = detector.detect(small_file)

        assert result.track in ["direct", "ocr"]

    def test_special_characters_in_content(self, direct_engine, temp_dir):
        """Test handling of special characters."""
        special_text = "Special: © ® ™ € £ ¥ § ¶ • … — – '"
        pdf_path = _write_text_pdf(temp_dir / "special.pdf", [special_text])

        result = direct_engine.extract(pdf_path)

        # Should not crash
        assert len(result.pages) == 1

    def test_large_page_count(self, direct_engine, temp_dir):
        """Test handling of document with many pages."""
        pdf_path = _write_text_pdf(
            temp_dir / "many_pages.pdf",
            [f"Page {i+1}" for i in range(50)],
        )

        result = direct_engine.extract(pdf_path)

        assert len(result.pages) == 50