test: add unit and integration tests for dual-track processing
Add comprehensive test suite for DirectExtractionEngine and dual-track integration. All 65 tests pass covering text extraction, structure preservation, routing logic, and backward compatibility. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
471
backend/tests/services/test_dual_track_integration.py
Normal file
471
backend/tests/services/test_dual_track_integration.py
Normal file
@@ -0,0 +1,471 @@
|
||||
"""
|
||||
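Integration tests for dual-track document processing.

Tests cover:
- Routing logic between OCR and Direct tracks
- UnifiedDocument generation on the Direct track (no OCR engine is run here)
- Export of a UnifiedDocument to JSON, Markdown, plain text and legacy JSON
- Backward compatibility with legacy formats
- Edge cases (blank page, tiny file, special characters, many pages)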
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import fitz
|
||||
from PIL import Image
|
||||
import io
|
||||
import json
|
||||
|
||||
from app.services.document_type_detector import DocumentTypeDetector, DocumentType
|
||||
from app.services.direct_extraction_engine import DirectExtractionEngine
|
||||
from app.models.unified_document import (
|
||||
UnifiedDocument, ProcessingTrack, ElementType
|
||||
)
|
||||
from app.services.unified_document_exporter import UnifiedDocumentExporter, ExportFormat
|
||||
|
||||
|
||||
class TestDualTrackRouting:
    """Test routing logic between OCR and Direct tracks.

    Each test writes a small input file under the ``temp_dir`` fixture
    (provided outside this module's visible code), runs
    ``DocumentTypeDetector.detect`` on it and checks the ``track`` and
    ``document_type`` of the result (plus ``confidence`` where asserted).
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @staticmethod
    def _add_image_page(doc, color):
        """Append a page to *doc* holding one solid-colour PNG and no text."""
        page = doc.new_page()
        buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=color).save(buffer, format='PNG')
        page.insert_image(fitz.Rect(50, 50, 550, 750), stream=buffer.getvalue())

    # ===== Routing Tests =====

    def test_route_editable_pdf_to_direct(self, detector, temp_dir):
        """Test that editable PDFs are routed to direct track."""
        pdf_path = temp_dir / "editable.pdf"

        # The context manager closes the document even if building/saving fails.
        with fitz.open() as doc:
            for page_no in range(1, 4):
                page = doc.new_page()
                page.insert_text(
                    (50, 50),
                    f"Page {page_no}: " + "Text content " * 20,
                    fontsize=12,
                )
            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9

    def test_route_scanned_pdf_to_ocr(self, detector, temp_dir):
        """Test that scanned PDFs are routed to OCR track."""
        pdf_path = temp_dir / "scanned.pdf"

        with fitz.open() as doc:
            # Only images, no text
            for _ in range(3):
                self._add_image_page(doc, (200, 200, 200))
            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED

    def test_route_image_to_ocr(self, detector, temp_dir):
        """Test that images are routed to OCR track."""
        img_path = temp_dir / "image.png"
        Image.new('RGB', (100, 100), color='red').save(str(img_path))

        result = detector.detect(img_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0

    def test_route_text_file_to_direct(self, detector, temp_dir):
        """Test that text files are routed to direct track."""
        txt_path = temp_dir / "text.txt"
        txt_path.write_text("This is a plain text file.\n" * 10)

        result = detector.detect(txt_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT

    def test_route_mixed_pdf_to_ocr(self, detector, temp_dir):
        """Test that mixed PDFs are routed to OCR track."""
        pdf_path = temp_dir / "mixed.pdf"

        with fitz.open() as doc:
            # Page 1: Text
            page = doc.new_page()
            page.insert_text((50, 50), "Text content " * 20, fontsize=12)

            # Page 2: Image only
            self._add_image_page(doc, (200, 200, 200))

            # Page 3: Image only
            self._add_image_page(doc, (150, 150, 150))

            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED
|
||||
|
||||
|
||||
class TestUnifiedDocumentGeneration:
    """Test UnifiedDocument generation by the direct extraction engine.

    Every test writes a one-page text PDF under ``temp_dir``, runs
    ``DirectExtractionEngine.extract`` on it and inspects the result.
    """

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_text_pdf(pdf_path, text):
        """Save a one-page PDF with *text* inserted at (50, 50), 12pt."""
        # The context manager closes the document even if saving fails.
        with fitz.open() as doc:
            doc.new_page().insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    # ===== Direct Track Generation =====

    def test_direct_track_generates_unified_document(self, direct_engine, temp_dir):
        """Test that direct track generates valid UnifiedDocument."""
        pdf_path = temp_dir / "test.pdf"
        self._write_text_pdf(pdf_path, "Test content")

        result = direct_engine.extract(pdf_path)

        assert isinstance(result, UnifiedDocument)
        assert result.document_id is not None
        assert result.metadata is not None
        assert len(result.pages) == 1

    def test_unified_document_has_required_fields(self, direct_engine, temp_dir):
        """Test that UnifiedDocument has all required fields."""
        pdf_path = temp_dir / "complete.pdf"
        self._write_text_pdf(pdf_path, "Complete document")

        result = direct_engine.extract(pdf_path)

        # Check metadata
        metadata = result.metadata
        assert metadata.filename == "complete.pdf"
        assert metadata.file_type == "pdf"
        assert metadata.processing_track == ProcessingTrack.DIRECT
        assert metadata.processing_time >= 0

        # Check page structure
        first_page = result.pages[0]
        assert first_page.page_number == 1
        assert first_page.dimensions is not None
        assert first_page.elements is not None

    def test_elements_have_required_fields(self, direct_engine, temp_dir):
        """Test that elements have all required fields."""
        pdf_path = temp_dir / "elements.pdf"
        self._write_text_pdf(pdf_path, "Element test")

        result = direct_engine.extract(pdf_path)

        # NOTE(review): this loop passes vacuously if the engine returns no
        # elements for the page; consider also asserting the list is non-empty.
        for element in result.pages[0].elements:
            assert element.element_id is not None
            assert element.type is not None
            assert element.bbox is not None
|
||||
|
||||
|
||||
class TestUnifiedDocumentExport:
    """Test UnifiedDocument export functionality.

    A sample document is produced by ``DirectExtractionEngine`` from a
    one-page PDF and then passed to each ``UnifiedDocumentExporter`` method.
    """

    @pytest.fixture
    def exporter(self):
        """Create exporter."""
        return UnifiedDocumentExporter()

    @pytest.fixture
    def sample_document(self, temp_dir):
        """Create a sample UnifiedDocument."""
        pdf_path = temp_dir / "sample.pdf"

        # The context manager closes the document even if saving fails.
        with fitz.open() as doc:
            page = doc.new_page()
            page.insert_text((50, 50), "Sample document for export", fontsize=12)
            doc.save(str(pdf_path))

        return DirectExtractionEngine().extract(pdf_path)

    def test_export_to_json(self, exporter, sample_document, temp_dir):
        """Test export to JSON format."""
        output_path = temp_dir / "output.json"

        exporter.export_to_json(sample_document, output_path)

        assert output_path.exists()

        # Verify JSON is valid; decode explicitly as UTF-8 instead of relying
        # on the platform's default encoding.
        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)

        assert "document_id" in data
        assert "metadata" in data
        assert "pages" in data

    def test_export_to_markdown(self, exporter, sample_document, temp_dir):
        """Test export to Markdown format."""
        output_path = temp_dir / "output.md"

        exporter.export_to_markdown(sample_document, output_path)

        assert output_path.exists()

        content = output_path.read_text(encoding="utf-8")
        assert len(content) > 0

    def test_export_to_text(self, exporter, sample_document):
        """Test export to plain text."""
        text = exporter.export_to_text(sample_document)

        assert isinstance(text, str)
        assert len(text) > 0

    def test_export_legacy_format(self, exporter, sample_document, temp_dir):
        """Test export to legacy JSON format for backward compatibility."""
        output_path = temp_dir / "legacy.json"

        exporter.export_to_legacy_json(sample_document, output_path)

        assert output_path.exists()

        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)

        # Only the top-level JSON container type is checked here; the legacy
        # schema itself is not validated by this test.
        assert isinstance(data, (dict, list))
|
||||
|
||||
|
||||
class TestBackwardCompatibility:
    """Test backward compatibility with existing system.

    These tests check the plain-dict form returned by
    ``UnifiedDocument.to_dict()`` for a document extracted on the direct track.
    """

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_text_pdf(pdf_path, text):
        """Save a one-page PDF with *text* inserted at (50, 50), 12pt."""
        # The context manager closes the document even if saving fails.
        with fitz.open() as doc:
            doc.new_page().insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    def test_document_can_be_serialized(self, direct_engine, temp_dir):
        """Test that UnifiedDocument can be serialized to dict."""
        pdf_path = temp_dir / "serialize.pdf"
        self._write_text_pdf(pdf_path, "Serializable")

        result = direct_engine.extract(pdf_path)

        # Should be serializable
        doc_dict = result.to_dict()

        assert isinstance(doc_dict, dict)
        for key in ("document_id", "metadata", "pages"):
            assert key in doc_dict

    def test_element_types_are_strings(self, direct_engine, temp_dir):
        """Test that element types serialize to strings."""
        pdf_path = temp_dir / "types.pdf"
        self._write_text_pdf(pdf_path, "Test")

        doc_dict = direct_engine.extract(pdf_path).to_dict()

        # NOTE(review): passes vacuously when "pages" or "elements" is missing
        # or empty, because of the .get(..., []) defaults.
        for page_data in doc_dict.get("pages", []):
            for element in page_data.get("elements", []):
                assert isinstance(element.get("type"), str)

    def test_processing_track_is_string(self, direct_engine, temp_dir):
        """Test that processing track serializes to string."""
        pdf_path = temp_dir / "track.pdf"
        self._write_text_pdf(pdf_path, "Track test")

        doc_dict = direct_engine.extract(pdf_path).to_dict()

        track = doc_dict.get("metadata", {}).get("processing_track")
        assert isinstance(track, str)
        assert track in ["ocr", "direct", "hybrid", "auto"]
|
||||
|
||||
|
||||
class TestCrossTrackConsistency:
    """Test the output structure that both tracks are expected to share.

    Only the direct track is exercised in this class: no OCR engine is run,
    so the checks pin the attribute names on the direct track's output.
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_text_pdf(pdf_path, text):
        """Save a one-page PDF with *text* inserted at (50, 50), 12pt."""
        # The context manager closes the document even if saving fails.
        with fitz.open() as doc:
            doc.new_page().insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    def test_both_tracks_produce_unified_document(self, detector, direct_engine, temp_dir):
        """Test that a PDF routed to the direct track yields a UnifiedDocument.

        Despite the test name, only the direct track is run here.
        """
        # Create editable PDF for direct track
        pdf_path = temp_dir / "editable.pdf"
        self._write_text_pdf(pdf_path, "Editable content " * 20)

        # Detect and route
        detection = detector.detect(pdf_path)
        assert detection.track == "direct"

        # Extract
        result = direct_engine.extract(pdf_path)

        # Verify UnifiedDocument structure
        assert isinstance(result, UnifiedDocument)
        assert result.metadata.processing_track == ProcessingTrack.DIRECT

    def test_metadata_structure_consistent(self, direct_engine, temp_dir):
        """Test that metadata structure is consistent."""
        pdf_path = temp_dir / "metadata.pdf"
        self._write_text_pdf(pdf_path, "Metadata test")

        metadata = direct_engine.extract(pdf_path).metadata

        # Required metadata fields; the assertion message names the missing one.
        required = (
            'filename',
            'file_type',
            'file_size',
            'processing_track',
            'processing_time',
            'created_at',
        )
        for field_name in required:
            assert hasattr(metadata, field_name), field_name

    def test_element_structure_consistent(self, direct_engine, temp_dir):
        """Test that element structure is consistent."""
        pdf_path = temp_dir / "elements.pdf"
        self._write_text_pdf(pdf_path, "Element structure test")

        result = direct_engine.extract(pdf_path)

        # Required element fields
        required = ('element_id', 'type', 'content', 'bbox', 'confidence')

        # NOTE(review): passes vacuously if the page has no elements.
        for element in result.pages[0].elements:
            for field_name in required:
                assert hasattr(element, field_name), field_name
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Test edge cases in dual-track processing.

    These are smoke tests: they check that detection or extraction completes
    and that the page count is right, not the extracted content.
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    def test_empty_pdf(self, direct_engine, temp_dir):
        """Test handling of a PDF whose single page has no content."""
        pdf_path = temp_dir / "empty.pdf"

        # The context manager closes the document even if saving fails.
        with fitz.open() as doc:
            doc.new_page()
            doc.save(str(pdf_path))

        result = direct_engine.extract(pdf_path)

        # May or may not have elements
        assert len(result.pages) == 1

    def test_very_small_file(self, detector, temp_dir):
        """Test handling of very small files."""
        small_file = temp_dir / "small.txt"
        small_file.write_text("Hi")

        result = detector.detect(small_file)

        # Either track is accepted; the test only requires a valid routing.
        assert result.track in ["direct", "ocr"]

    def test_special_characters_in_content(self, direct_engine, temp_dir):
        """Test handling of special characters."""
        pdf_path = temp_dir / "special.pdf"
        special_text = "Special: © ® ™ € £ ¥ § ¶ • … — – '"

        with fitz.open() as doc:
            doc.new_page().insert_text((50, 50), special_text, fontsize=12)
            doc.save(str(pdf_path))

        result = direct_engine.extract(pdf_path)

        # Should not crash. The extracted text is not compared with
        # special_text, so character fidelity is not covered by this test.
        assert len(result.pages) == 1

    def test_large_page_count(self, direct_engine, temp_dir):
        """Test handling of document with many pages."""
        pdf_path = temp_dir / "many_pages.pdf"
        page_count = 50

        with fitz.open() as doc:
            for page_no in range(1, page_count + 1):
                doc.new_page().insert_text((50, 50), f"Page {page_no}", fontsize=12)
            doc.save(str(pdf_path))

        result = direct_engine.extract(pdf_path)

        assert len(result.pages) == page_count
|
||||
Reference in New Issue
Block a user