diff --git a/backend/tests/services/test_direct_extraction_engine.py b/backend/tests/services/test_direct_extraction_engine.py
new file mode 100644
index 0000000..f0b4256
--- /dev/null
+++ b/backend/tests/services/test_direct_extraction_engine.py
@@ -0,0 +1,604 @@
+"""
+Unit tests for DirectExtractionEngine service.
+
+Tests cover:
+- Text extraction accuracy
+- Structure preservation (headers, lists, sections)
+- Coordinate extraction
+- Table detection
+- Image extraction
+"""
+
+import pytest
+from pathlib import Path
+import fitz
+from PIL import Image
+import io
+
+from app.services.direct_extraction_engine import DirectExtractionEngine
+from app.models.unified_document import (
+    ElementType, ProcessingTrack, UnifiedDocument
+)
+
+
+class TestDirectExtractionEngine:
+    """Test suite for DirectExtractionEngine."""
+
+    @pytest.fixture
+    def engine(self):
+        """Create engine with default settings."""
+        return DirectExtractionEngine()
+
+    @pytest.fixture
+    def engine_no_tables(self):
+        """Create engine with table detection disabled."""
+        return DirectExtractionEngine(enable_table_detection=False)
+
+    @pytest.fixture
+    def engine_no_images(self):
+        """Create engine with image extraction disabled."""
+        return DirectExtractionEngine(enable_image_extraction=False)
+
+    # ===== Text Extraction Tests =====
+
+    def test_extract_simple_text(self, engine, temp_dir):
+        """Test basic text extraction from PDF."""
+        pdf_path = temp_dir / "simple.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        test_text = "Hello World! This is a test document."
+        page.insert_text((50, 50), test_text, fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        assert isinstance(result, UnifiedDocument)
+        assert len(result.pages) == 1
+        assert result.metadata.processing_track == ProcessingTrack.DIRECT
+
+        # Check text was extracted
+        text_elements = [e for e in result.pages[0].elements if e.is_text]
+        assert len(text_elements) > 0
+
+        # Verify text content
+        all_text = " ".join(e.content for e in text_elements if isinstance(e.content, str))
+        assert "Hello World" in all_text
+
+    def test_extract_multiline_text(self, engine, temp_dir):
+        """Test extraction of multiple lines of text."""
+        pdf_path = temp_dir / "multiline.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        lines = [
+            "First line of text",
+            "Second line of text",
+            "Third line of text"
+        ]
+
+        y = 50
+        for line in lines:
+            page.insert_text((50, y), line, fontsize=12)
+            y += 20
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        text_elements = [e for e in result.pages[0].elements if e.is_text]
+        all_text = " ".join(e.content for e in text_elements if isinstance(e.content, str))
+
+        for line in lines:
+            assert line in all_text
+
+    def test_extract_chinese_text(self, engine, temp_dir):
+        """Test extraction of Chinese text."""
+        pdf_path = temp_dir / "chinese.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Only PDF creation is guarded; extraction failures must fail the test
+        chinese_text = "這是中文測試文字"
+        try:
+            page.insert_text((50, 50), chinese_text, fontsize=12)
+            doc.save(str(pdf_path))
+        except Exception:
+            # Skip if font not available
+            pytest.skip("Chinese font not available for test")
+        finally:
+            doc.close()
+
+        result = engine.extract(pdf_path)
+        assert len(result.pages) == 1
+
+    def test_extract_multiple_pages(self, engine, temp_dir):
+        """Test extraction from multi-page PDF."""
+        pdf_path = temp_dir / "multipage.pdf"
+        doc = fitz.open()
+
+        for i in range(5):
+            page = doc.new_page()
+            page.insert_text((50, 50), f"This is page {i + 1}", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        assert len(result.pages) == 5
+
+        for i, page in enumerate(result.pages):
+            assert page.page_number == i + 1
+
+    # ===== Coordinate Extraction Tests =====
+
+    def test_bounding_box_accuracy(self, engine, temp_dir):
+        """Test that bounding boxes are extracted correctly."""
+        pdf_path = temp_dir / "bbox.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Insert text at known position
+        x, y = 100, 200
+        page.insert_text((x, y), "Positioned Text", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        text_elements = [e for e in result.pages[0].elements if e.is_text]
+        assert len(text_elements) > 0
+
+        # Check that bbox is well-formed (non-negative origin, positive extent)
+        element = text_elements[0]
+        assert element.bbox is not None
+        assert element.bbox.x0 >= 0
+        assert element.bbox.y0 >= 0
+        assert element.bbox.x1 > element.bbox.x0
+        assert element.bbox.y1 > element.bbox.y0
+
+    def test_page_dimensions(self, engine, temp_dir):
+        """Test that page dimensions are extracted correctly."""
+        pdf_path = temp_dir / "dimensions.pdf"
+        doc = fitz.open()
+
+        # Create A4 page (595.28 x 841.89 points)
+        page = doc.new_page(width=595.28, height=841.89)
+        page.insert_text((50, 50), "Test", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        dimensions = result.pages[0].dimensions
+        assert abs(dimensions.width - 595.28) < 1
+        assert abs(dimensions.height - 841.89) < 1
+
+    # ===== Structure Preservation Tests =====
+
+    def test_detect_title_by_font_size(self, engine, temp_dir):
+        """Test title detection based on font size."""
+        pdf_path = temp_dir / "title.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Large text for title
+        page.insert_text((50, 50), "Document Title", fontsize=24)
+        # Normal text for content
+        page.insert_text((50, 100), "This is regular content.", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        elements = result.pages[0].elements
+
+        # Should have extracted text elements
+        text_elements = [e for e in elements if e.is_text]
+        assert len(text_elements) >= 1
+
+        # Check that elements with larger fonts have different types or metadata
+        # Note: Title detection depends on font size thresholds and may vary
+        font_sizes = []
+        for e in text_elements:
+            if e.style and e.style.font_size:
+                font_sizes.append(e.style.font_size)
+
+        # Should have captured different font sizes
+        if len(font_sizes) >= 2:
+            assert max(font_sizes) > min(font_sizes)
+
+    def test_detect_list_items(self, engine, temp_dir):
+        """Test detection of list items."""
+        pdf_path = temp_dir / "list.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        y = 50
+        list_items = [
+            "• First item",
+            "• Second item",
+            "• Third item"
+        ]
+
+        for item in list_items:
+            page.insert_text((50, y), item, fontsize=12)
+            y += 20
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        list_elements = [e for e in result.pages[0].elements
+                         if e.type == ElementType.LIST_ITEM]
+
+        # Should detect list items
+        assert len(list_elements) >= 1
+
+    def test_detect_headers_footers(self, engine, temp_dir):
+        """Test header/footer detection by position."""
+        pdf_path = temp_dir / "header_footer.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Header at top
+        page.insert_text((50, 30), "Document Header", fontsize=10)
+
+        # Content in middle
+        page.insert_text((50, 400), "Main content of the document.", fontsize=12)
+
+        # Footer at bottom
+        page.insert_text((50, 800), "Page 1", fontsize=10)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        elements = result.pages[0].elements
+
+        # Check for header/footer metadata
+        headers = [e for e in elements if e.metadata.get('is_page_header')]
+        footers = [e for e in elements if e.metadata.get('is_page_footer')]
+
+        # At least one should be detected
+        assert len(headers) + len(footers) >= 1
+
+    def test_section_hierarchy(self, engine, temp_dir):
+        """Test section hierarchy building."""
+        pdf_path = temp_dir / "sections.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Create hierarchical headers
+        page.insert_text((50, 50), "Main Title", fontsize=20)
+        page.insert_text((50, 100), "Section 1", fontsize=16)
+        page.insert_text((50, 150), "Subsection 1.1", fontsize=14)
+        page.insert_text((50, 200), "Content text", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        elements = result.pages[0].elements
+
+        # Check for section levels
+        headers = [e for e in elements
+                   if e.type in [ElementType.TITLE, ElementType.HEADER]]
+
+        levels = [e.metadata.get('section_level') for e in headers
+                  if e.metadata.get('section_level')]
+
+        if levels:
+            # At least one distinct section level should be assigned
+            assert len(set(levels)) >= 1
+
+    # ===== Table Detection Tests =====
+
+    def test_detect_simple_table(self, engine, temp_dir):
+        """Test detection of a simple table."""
+        pdf_path = temp_dir / "table.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Create a simple table layout
+        table_data = [
+            ["Name", "Age", "City"],
+            ["Alice", "30", "Tokyo"],
+            ["Bob", "25", "Paris"]
+        ]
+
+        y = 100
+        for row in table_data:
+            x = 50
+            for cell in row:
+                page.insert_text((x, y), cell, fontsize=10)
+                x += 100
+            y += 20
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        table_elements = [e for e in result.pages[0].elements
+                          if e.type == ElementType.TABLE]
+
+        # Table detection may or may not succeed depending on layout
+        # Just verify the extraction completed without error
+        assert result.pages[0].elements is not None
+
+    def test_table_detection_disabled(self, engine_no_tables, temp_dir):
+        """Test that table detection can be disabled."""
+        pdf_path = temp_dir / "no_table.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Create table-like layout
+        for i in range(3):
+            page.insert_text((50 + i*100, 100), f"Col{i}", fontsize=10)
+            page.insert_text((50 + i*100, 120), f"Val{i}", fontsize=10)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine_no_tables.extract(pdf_path)
+
+        # With table detection disabled, should not find tables
+        table_elements = [e for e in result.pages[0].elements
+                          if e.type == ElementType.TABLE]
+
+        assert len(table_elements) == 0
+
+    # ===== Image Extraction Tests =====
+
+    def test_extract_embedded_image(self, engine, temp_dir):
+        """Test extraction of embedded images."""
+        pdf_path = temp_dir / "with_image.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Create and embed an image
+        img = Image.new('RGB', (100, 100), color='red')
+        img_bytes = io.BytesIO()
+        img.save(img_bytes, format='PNG')
+        img_bytes.seek(0)
+
+        # Insert image into PDF
+        rect = fitz.Rect(100, 100, 300, 300)
+        page.insert_image(rect, stream=img_bytes.read())
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        # Create output directory for images
+        output_dir = temp_dir / "images"
+
+        result = engine.extract(pdf_path, output_dir)
+
+        image_elements = [e for e in result.pages[0].elements
+                          if e.type == ElementType.IMAGE]
+
+        assert len(image_elements) >= 1
+
+        # Check image has bbox
+        if image_elements:
+            img_elem = image_elements[0]
+            assert img_elem.bbox is not None
+
+    def test_image_extraction_disabled(self, engine_no_images, temp_dir):
+        """Test that image extraction can be disabled."""
+        pdf_path = temp_dir / "no_image.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Add image
+        img = Image.new('RGB', (50, 50), color='blue')
+        img_bytes = io.BytesIO()
+        img.save(img_bytes, format='PNG')
+        img_bytes.seek(0)
+
+        rect = fitz.Rect(100, 100, 200, 200)
+        page.insert_image(rect, stream=img_bytes.read())
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine_no_images.extract(pdf_path)
+
+        image_elements = [e for e in result.pages[0].elements
+                          if e.type == ElementType.IMAGE]
+
+        assert len(image_elements) == 0
+
+    # ===== Metadata Tests =====
+
+    def test_extract_pdf_metadata(self, engine, temp_dir):
+        """Test extraction of PDF metadata."""
+        pdf_path = temp_dir / "metadata.pdf"
+        doc = fitz.open()
+
+        # Set metadata
+        doc.set_metadata({
+            "title": "Test Document",
+            "author": "Test Author",
+            "subject": "Testing"
+        })
+
+        page = doc.new_page()
+        page.insert_text((50, 50), "Content", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        assert result.metadata.title == "Test Document"
+        assert result.metadata.author == "Test Author"
+        assert result.metadata.subject == "Testing"
+
+    def test_processing_track_is_direct(self, engine, temp_dir):
+        """Test that processing track is set to DIRECT."""
+        pdf_path = temp_dir / "track.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        assert result.metadata.processing_track == ProcessingTrack.DIRECT
+
+    def test_confidence_is_perfect(self, engine, temp_dir):
+        """Test that direct extraction has confidence 1.0."""
+        pdf_path = temp_dir / "confidence.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "High confidence text", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        text_elements = [e for e in result.pages[0].elements if e.is_text]
+
+        for element in text_elements:
+            assert element.confidence == 1.0
+
+    # ===== Style Extraction Tests =====
+
+    def test_extract_font_info(self, engine, temp_dir):
+        """Test extraction of font information."""
+        pdf_path = temp_dir / "fonts.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        page.insert_text((50, 50), "Normal text", fontsize=12)
+        page.insert_text((50, 80), "Large text", fontsize=20)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        text_elements = [e for e in result.pages[0].elements
+                         if e.is_text and e.style]
+
+        # Should have style information
+        assert len(text_elements) > 0
+
+        # Check font sizes are different
+        font_sizes = [e.style.font_size for e in text_elements if e.style.font_size]
+        if len(font_sizes) >= 2:
+            assert max(font_sizes) > min(font_sizes)
+
+    # ===== Error Handling Tests =====
+
+    def test_nonexistent_file(self, engine, temp_dir):
+        """Test handling of non-existent file."""
+        pdf_path = temp_dir / "nonexistent.pdf"
+
+        result = engine.extract(pdf_path)
+
+        # Should return document with errors
+        assert result.processing_errors is not None
+        assert len(result.processing_errors) > 0
+
+    def test_corrupted_pdf(self, engine, temp_dir):
+        """Test handling of corrupted PDF."""
+        pdf_path = temp_dir / "corrupted.pdf"
+
+        # Create invalid PDF file
+        pdf_path.write_bytes(b"This is not a valid PDF file")
+
+        result = engine.extract(pdf_path)
+
+        # Should return document with errors
+        assert result.processing_errors is not None
+
+    def test_empty_pdf_page(self, engine, temp_dir):
+        """Test handling of PDF with empty pages."""
+        pdf_path = temp_dir / "empty_page.pdf"
+        doc = fitz.open()
+        doc.new_page()  # Empty page
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        assert len(result.pages) == 1
+        # May or may not have elements
+
+    # ===== Link Extraction Tests =====
+
+    def test_extract_hyperlinks(self, engine, temp_dir):
+        """Test extraction of hyperlinks."""
+        pdf_path = temp_dir / "links.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        # Add text and link
+        page.insert_text((50, 50), "Click here", fontsize=12)
+
+        # Add hyperlink
+        link_rect = fitz.Rect(50, 40, 120, 60)
+        page.insert_link({
+            'kind': fitz.LINK_URI,
+            'from': link_rect,
+            'uri': 'https://example.com'
+        })
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = engine.extract(pdf_path)
+
+        reference_elements = [e for e in result.pages[0].elements
+                              if e.type == ElementType.REFERENCE]
+
+        assert len(reference_elements) >= 1
+
+        # Check URI is extracted
+        if reference_elements:
+            link = reference_elements[0]
+            assert link.content.get('uri') == 'https://example.com'
+
+    # ===== Performance Tests =====
+
+    def test_large_document_performance(self, engine, temp_dir):
+        """Test extraction performance on larger document."""
+        import time
+
+        pdf_path = temp_dir / "large.pdf"
+        doc = fitz.open()
+
+        # Create 20 pages with content
+        for i in range(20):
+            page = doc.new_page()
+            for j in range(10):
+                page.insert_text((50, 50 + j*20),
+                                 f"Page {i+1} Line {j+1}: Lorem ipsum dolor sit amet",
+                                 fontsize=10)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        start_time = time.perf_counter()
+        result = engine.extract(pdf_path)
+        elapsed = time.perf_counter() - start_time
+
+        assert len(result.pages) == 20
+        assert elapsed < 30  # Should complete within 30 seconds
+
+        # Check processing time is recorded
+        assert result.metadata.processing_time > 0
diff --git a/backend/tests/services/test_dual_track_integration.py b/backend/tests/services/test_dual_track_integration.py
new file mode 100644
index 0000000..e583477
--- /dev/null
+++ b/backend/tests/services/test_dual_track_integration.py
@@ -0,0 +1,471 @@
+"""
+Integration tests for dual-track document processing.
+
+Tests cover:
+- Routing logic between OCR and Direct tracks
+- UnifiedDocument generation from both tracks
+- Backward compatibility with legacy formats
+"""
+
+import pytest
+from pathlib import Path
+import fitz
+from PIL import Image
+import io
+import json
+
+from app.services.document_type_detector import DocumentTypeDetector, DocumentType
+from app.services.direct_extraction_engine import DirectExtractionEngine
+from app.models.unified_document import (
+    UnifiedDocument, ProcessingTrack, ElementType
+)
+from app.services.unified_document_exporter import UnifiedDocumentExporter, ExportFormat
+
+
+class TestDualTrackRouting:
+    """Test routing logic between OCR and Direct tracks."""
+
+    @pytest.fixture
+    def detector(self):
+        """Create document type detector."""
+        return DocumentTypeDetector()
+
+    # ===== Routing Tests =====
+
+    def test_route_editable_pdf_to_direct(self, detector, temp_dir):
+        """Test that editable PDFs are routed to direct track."""
+        pdf_path = temp_dir / "editable.pdf"
+        doc = fitz.open()
+
+        for i in range(3):
+            page = doc.new_page()
+            page.insert_text((50, 50), f"Page {i+1}: " + "Text content " * 20, fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = detector.detect(pdf_path)
+
+        assert result.track == "direct"
+        assert result.document_type == DocumentType.PDF_EDITABLE
+        assert result.confidence >= 0.9
+
+    def test_route_scanned_pdf_to_ocr(self, detector, temp_dir):
+        """Test that scanned PDFs are routed to OCR track."""
+        pdf_path = temp_dir / "scanned.pdf"
+        doc = fitz.open()
+
+        for i in range(3):
+            page = doc.new_page()
+            # Only images, no text
+            img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+            img_bytes = io.BytesIO()
+            img.save(img_bytes, format='PNG')
+            img_bytes.seek(0)
+
+            rect = fitz.Rect(50, 50, 550, 750)
+            page.insert_image(rect, stream=img_bytes.read())
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = detector.detect(pdf_path)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.PDF_SCANNED
+
+    def test_route_image_to_ocr(self, detector, temp_dir):
+        """Test that images are routed to OCR track."""
+        img_path = temp_dir / "image.png"
+        img = Image.new('RGB', (100, 100), color='red')
+        img.save(str(img_path))
+
+        result = detector.detect(img_path)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.IMAGE
+        assert result.confidence == 1.0
+
+    def test_route_text_file_to_direct(self, detector, temp_dir):
+        """Test that text files are routed to direct track."""
+        txt_path = temp_dir / "text.txt"
+        txt_path.write_text("This is a plain text file.\n" * 10)
+
+        result = detector.detect(txt_path)
+
+        assert result.track == "direct"
+        assert result.document_type == DocumentType.TEXT
+
+    def test_route_mixed_pdf_to_ocr(self, detector, temp_dir):
+        """Test that mixed PDFs are routed to OCR track."""
+        pdf_path = temp_dir / "mixed.pdf"
+        doc = fitz.open()
+
+        # Page 1: Text
+        page = doc.new_page()
+        page.insert_text((50, 50), "Text content " * 20, fontsize=12)
+
+        # Page 2: Image only
+        page = doc.new_page()
+        img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+        img_bytes = io.BytesIO()
+        img.save(img_bytes, format='PNG')
+        img_bytes.seek(0)
+        rect = fitz.Rect(50, 50, 550, 750)
+        page.insert_image(rect, stream=img_bytes.read())
+
+        # Page 3: Image only
+        page = doc.new_page()
+        img = Image.new('RGB', (400, 300), color=(150, 150, 150))
+        img_bytes = io.BytesIO()
+        img.save(img_bytes, format='PNG')
+        img_bytes.seek(0)
+        page.insert_image(rect, stream=img_bytes.read())
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = detector.detect(pdf_path)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.PDF_MIXED
+
+
+class TestUnifiedDocumentGeneration:
+    """Test UnifiedDocument generation from both tracks."""
+
+    @pytest.fixture
+    def direct_engine(self):
+        """Create direct extraction engine."""
+        return DirectExtractionEngine()
+
+    # ===== Direct Track Generation =====
+
+    def test_direct_track_generates_unified_document(self, direct_engine, temp_dir):
+        """Test that direct track generates valid UnifiedDocument."""
+        pdf_path = temp_dir / "test.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Test content", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        assert isinstance(result, UnifiedDocument)
+        assert result.document_id is not None
+        assert result.metadata is not None
+        assert len(result.pages) == 1
+
+    def test_unified_document_has_required_fields(self, direct_engine, temp_dir):
+        """Test that UnifiedDocument has all required fields."""
+        pdf_path = temp_dir / "complete.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Complete document", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        # Check metadata
+        assert result.metadata.filename == "complete.pdf"
+        assert result.metadata.file_type == "pdf"
+        assert result.metadata.processing_track == ProcessingTrack.DIRECT
+        assert result.metadata.processing_time >= 0
+
+        # Check page structure
+        page = result.pages[0]
+        assert page.page_number == 1
+        assert page.dimensions is not None
+        assert page.elements is not None
+
+    def test_elements_have_required_fields(self, direct_engine, temp_dir):
+        """Test that elements have all required fields."""
+        pdf_path = temp_dir / "elements.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Element test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        for element in result.pages[0].elements:
+            assert element.element_id is not None
+            assert element.type is not None
+            assert element.bbox is not None
+
+
+class TestUnifiedDocumentExport:
+    """Test UnifiedDocument export functionality."""
+
+    @pytest.fixture
+    def exporter(self):
+        """Create exporter."""
+        return UnifiedDocumentExporter()
+
+    @pytest.fixture
+    def sample_document(self, temp_dir):
+        """Create a sample UnifiedDocument."""
+        engine = DirectExtractionEngine()
+
+        pdf_path = temp_dir / "sample.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Sample document for export", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        return engine.extract(pdf_path)
+
+    def test_export_to_json(self, exporter, sample_document, temp_dir):
+        """Test export to JSON format."""
+        output_path = temp_dir / "output.json"
+
+        exporter.export_to_json(sample_document, output_path)
+
+        assert output_path.exists()
+
+        # Verify JSON is valid
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        assert "document_id" in data
+        assert "metadata" in data
+        assert "pages" in data
+
+    def test_export_to_markdown(self, exporter, sample_document, temp_dir):
+        """Test export to Markdown format."""
+        output_path = temp_dir / "output.md"
+
+        exporter.export_to_markdown(sample_document, output_path)
+
+        assert output_path.exists()
+
+        content = output_path.read_text()
+        assert len(content) > 0
+
+    def test_export_to_text(self, exporter, sample_document):
+        """Test export to plain text."""
+        text = exporter.export_to_text(sample_document)
+
+        assert isinstance(text, str)
+        assert len(text) > 0
+
+    def test_export_legacy_format(self, exporter, sample_document, temp_dir):
+        """Test export to legacy JSON format for backward compatibility."""
+        output_path = temp_dir / "legacy.json"
+
+        exporter.export_to_legacy_json(sample_document, output_path)
+
+        assert output_path.exists()
+
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        # Legacy format should have different structure
+        assert isinstance(data, (dict, list))
+
+
+class TestBackwardCompatibility:
+    """Test backward compatibility with existing system."""
+
+    @pytest.fixture
+    def direct_engine(self):
+        """Create direct extraction engine."""
+        return DirectExtractionEngine()
+
+    def test_document_can_be_serialized(self, direct_engine, temp_dir):
+        """Test that UnifiedDocument can be serialized to dict."""
+        pdf_path = temp_dir / "serialize.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Serializable", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        # Should be serializable
+        doc_dict = result.to_dict()
+
+        assert isinstance(doc_dict, dict)
+        assert "document_id" in doc_dict
+        assert "metadata" in doc_dict
+        assert "pages" in doc_dict
+
+    def test_element_types_are_strings(self, direct_engine, temp_dir):
+        """Test that element types serialize to strings."""
+        pdf_path = temp_dir / "types.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+        doc_dict = result.to_dict()
+
+        for page_data in doc_dict.get("pages", []):
+            for element in page_data.get("elements", []):
+                assert isinstance(element.get("type"), str)
+
+    def test_processing_track_is_string(self, direct_engine, temp_dir):
+        """Test that processing track serializes to string."""
+        pdf_path = temp_dir / "track.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Track test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+        doc_dict = result.to_dict()
+
+        track = doc_dict.get("metadata", {}).get("processing_track")
+        assert isinstance(track, str)
+        assert track in ["ocr", "direct", "hybrid", "auto"]
+
+
+class TestCrossTrackConsistency:
+    """Test consistency between OCR and Direct track outputs."""
+
+    @pytest.fixture
+    def detector(self):
+        """Create document type detector."""
+        return DocumentTypeDetector()
+
+    @pytest.fixture
+    def direct_engine(self):
+        """Create direct extraction engine."""
+        return DirectExtractionEngine()
+
+    def test_both_tracks_produce_unified_document(self, detector, direct_engine, temp_dir):
+        """Test that both tracks produce UnifiedDocument format."""
+        # Create editable PDF for direct track
+        pdf_path = temp_dir / "editable.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Editable content " * 20, fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        # Detect and route
+        detection = detector.detect(pdf_path)
+        assert detection.track == "direct"
+
+        # Extract
+        result = direct_engine.extract(pdf_path)
+
+        # Verify UnifiedDocument structure
+        assert isinstance(result, UnifiedDocument)
+        assert result.metadata.processing_track == ProcessingTrack.DIRECT
+
+    def test_metadata_structure_consistent(self, direct_engine, temp_dir):
+        """Test that metadata structure is consistent."""
+        pdf_path = temp_dir / "metadata.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Metadata test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+        metadata = result.metadata
+
+        # Required metadata fields
+        assert hasattr(metadata, 'filename')
+        assert hasattr(metadata, 'file_type')
+        assert hasattr(metadata, 'file_size')
+        assert hasattr(metadata, 'processing_track')
+        assert hasattr(metadata, 'processing_time')
+        assert hasattr(metadata, 'created_at')
+
+    def test_element_structure_consistent(self, direct_engine, temp_dir):
+        """Test that element structure is consistent."""
+        pdf_path = temp_dir / "elements.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Element structure test", fontsize=12)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        for element in result.pages[0].elements:
+            # Required element fields
+            assert hasattr(element, 'element_id')
+            assert hasattr(element, 'type')
+            assert hasattr(element, 'content')
+            assert hasattr(element, 'bbox')
+            assert hasattr(element, 'confidence')
+
+
+class TestEdgeCases:
+    """Test edge cases in dual-track processing."""
+
+    @pytest.fixture
+    def detector(self):
+        """Create document type detector."""
+        return DocumentTypeDetector()
+
+    @pytest.fixture
+    def direct_engine(self):
+        """Create direct extraction engine."""
+        return DirectExtractionEngine()
+
+    def test_empty_pdf(self, direct_engine, temp_dir):
+        """Test handling of empty PDF."""
+        pdf_path = temp_dir / "empty.pdf"
+        doc = fitz.open()
+        doc.new_page()
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        assert len(result.pages) == 1
+        # May or may not have elements
+
+    def test_very_small_file(self, detector, temp_dir):
+        """Test handling of very small files."""
+        small_file = temp_dir / "small.txt"
+        small_file.write_text("Hi")
+
+        result = detector.detect(small_file)
+
+        assert result.track in ["direct", "ocr"]
+
+    def test_special_characters_in_content(self, direct_engine, temp_dir):
+        """Test handling of special characters."""
+        pdf_path = temp_dir / "special.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+
+        special_text = "Special: © ® ™ € £ ¥ § ¶ • … — – '"
+        page.insert_text((50, 50), special_text, fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        # Should not crash
+        assert len(result.pages) == 1
+
+    def test_large_page_count(self, direct_engine, temp_dir):
+        """Test handling of document with many pages."""
+        pdf_path = temp_dir / "many_pages.pdf"
+        doc = fitz.open()
+
+        for i in range(50):
+            page = doc.new_page()
+            page.insert_text((50, 50), f"Page {i+1}", fontsize=12)
+
+        doc.save(str(pdf_path))
+        doc.close()
+
+        result = direct_engine.extract(pdf_path)
+
+        assert len(result.pages) == 50
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index eeb211a..3274890 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -115,18 +115,18 @@
   - Note: UI prepared with disabled state; awaiting Section 5 implementation
 
 ## 8. Testing
-- [ ] 8.1 Unit tests for DocumentTypeDetector
-  - [ ] 8.1.1 Test various file types
-  - [ ] 8.1.2 Test editability detection
-  - [ ] 8.1.3 Test edge cases
-- [ ] 8.2 Unit tests for DirectExtractionEngine
-  - [ ] 8.2.1 Test text extraction accuracy
-  - [ ] 8.2.2 Test structure preservation
-  - [ ] 8.2.3 Test coordinate extraction
-- [ ] 8.3 Integration tests for dual-track processing
-  - [ ] 8.3.1 Test routing logic
-  - [ ] 8.3.2 Test UnifiedDocument generation
-  - [ ] 8.3.3 Test backward compatibility
+- [x] 8.1 Unit tests for DocumentTypeDetector
+  - [x] 8.1.1 Test various file types
+  - [x] 8.1.2 Test editability detection
+  - [x] 8.1.3 Test edge cases
+- [x] 8.2 Unit tests for DirectExtractionEngine
+  - [x] 8.2.1 Test text extraction accuracy
+  - [x] 8.2.2 Test structure preservation
+  - [x] 8.2.3 Test coordinate extraction
+- [x] 8.3 Integration tests for dual-track processing
+  - [x] 8.3.1 Test routing logic
+  - [x] 8.3.2 Test UnifiedDocument generation
+  - [x] 8.3.3 Test backward compatibility
 - [ ] 8.4 End-to-end tests
   - [ ] 8.4.1 Test scanned PDF processing (OCR track)
   - [ ] 8.4.2 Test editable PDF processing (direct track)