Files
OCR/backend/tests/services/test_dual_track_integration.py
egg c50a5e9d2b test: add unit and integration tests for dual-track processing
Add comprehensive test suite for DirectExtractionEngine and dual-track
integration. All 65 tests pass covering text extraction, structure
preservation, routing logic, and backward compatibility.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 12:50:44 +08:00

472 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Integration tests for dual-track document processing.
Tests cover:
- Routing logic between OCR and Direct tracks
- UnifiedDocument generation from both tracks
- Backward compatibility with legacy formats
"""
import pytest
from pathlib import Path
import fitz
from PIL import Image
import io
import json
from app.services.document_type_detector import DocumentTypeDetector, DocumentType
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.models.unified_document import (
UnifiedDocument, ProcessingTrack, ElementType
)
from app.services.unified_document_exporter import UnifiedDocumentExporter, ExportFormat
class TestDualTrackRouting:
    """Test routing logic between OCR and Direct tracks.

    Each test writes a small input file under ``temp_dir``, hands it to
    ``DocumentTypeDetector.detect`` and checks the ``track`` and
    ``document_type`` reported on the result.
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @staticmethod
    def _add_image_only_page(doc, gray):
        """Append a page to *doc* that holds one flat-gray PNG and no text.

        Args:
            doc: Open ``fitz`` document to extend.
            gray: Value used for all three RGB channels of the image.
        """
        page = doc.new_page()
        picture = Image.new('RGB', (400, 300), color=(gray, gray, gray))
        buffer = io.BytesIO()
        picture.save(buffer, format='PNG')
        page.insert_image(fitz.Rect(50, 50, 550, 750), stream=buffer.getvalue())

    # ===== Routing Tests =====

    def test_route_editable_pdf_to_direct(self, detector, temp_dir):
        """Test that editable PDFs are routed to direct track."""
        pdf_path = temp_dir / "editable.pdf"
        # The context manager closes the document even if building it fails.
        with fitz.open() as doc:
            for page_no in range(1, 4):
                page = doc.new_page()
                page.insert_text(
                    (50, 50),
                    f"Page {page_no}: " + "Text content " * 20,
                    fontsize=12,
                )
            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9

    def test_route_scanned_pdf_to_ocr(self, detector, temp_dir):
        """Test that scanned PDFs are routed to OCR track."""
        pdf_path = temp_dir / "scanned.pdf"
        with fitz.open() as doc:
            # Only images, no text
            for _ in range(3):
                self._add_image_only_page(doc, 200)
            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED

    def test_route_image_to_ocr(self, detector, temp_dir):
        """Test that images are routed to OCR track."""
        img_path = temp_dir / "image.png"
        Image.new('RGB', (100, 100), color='red').save(str(img_path))

        result = detector.detect(img_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0

    def test_route_text_file_to_direct(self, detector, temp_dir):
        """Test that text files are routed to direct track."""
        txt_path = temp_dir / "text.txt"
        txt_path.write_text("This is a plain text file.\n" * 10)

        result = detector.detect(txt_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT

    def test_route_mixed_pdf_to_ocr(self, detector, temp_dir):
        """Test that mixed PDFs are routed to OCR track."""
        pdf_path = temp_dir / "mixed.pdf"
        with fitz.open() as doc:
            # Page 1: Text
            page = doc.new_page()
            page.insert_text((50, 50), "Text content " * 20, fontsize=12)
            # Page 2: Image only
            self._add_image_only_page(doc, 200)
            # Page 3: Image only
            self._add_image_only_page(doc, 150)
            doc.save(str(pdf_path))

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED
class TestUnifiedDocumentGeneration:
    """Test UnifiedDocument generation from both tracks.

    Only the direct track (``DirectExtractionEngine``) is exercised by the
    tests in this class.
    """

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_single_page_pdf(pdf_path, text):
        """Save a one-page PDF at *pdf_path* containing *text*.

        The document is opened in a ``with`` block so it is closed even when
        inserting the text or saving raises.
        """
        with fitz.open() as doc:
            page = doc.new_page()
            page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    # ===== Direct Track Generation =====

    def test_direct_track_generates_unified_document(self, direct_engine, temp_dir):
        """Test that direct track generates valid UnifiedDocument."""
        pdf_path = temp_dir / "test.pdf"
        self._write_single_page_pdf(pdf_path, "Test content")

        result = direct_engine.extract(pdf_path)

        assert isinstance(result, UnifiedDocument)
        assert result.document_id is not None
        assert result.metadata is not None
        assert len(result.pages) == 1

    def test_unified_document_has_required_fields(self, direct_engine, temp_dir):
        """Test that UnifiedDocument has all required fields."""
        pdf_path = temp_dir / "complete.pdf"
        self._write_single_page_pdf(pdf_path, "Complete document")

        result = direct_engine.extract(pdf_path)

        # Check metadata
        metadata = result.metadata
        assert metadata.filename == "complete.pdf"
        assert metadata.file_type == "pdf"
        assert metadata.processing_track == ProcessingTrack.DIRECT
        assert metadata.processing_time >= 0

        # Check page structure
        first_page = result.pages[0]
        assert first_page.page_number == 1
        assert first_page.dimensions is not None
        assert first_page.elements is not None

    def test_elements_have_required_fields(self, direct_engine, temp_dir):
        """Test that elements have all required fields."""
        pdf_path = temp_dir / "elements.pdf"
        self._write_single_page_pdf(pdf_path, "Element test")

        result = direct_engine.extract(pdf_path)

        # NOTE(review): this loop passes vacuously when no elements are
        # extracted; consider asserting the list is non-empty.
        for element in result.pages[0].elements:
            assert element.element_id is not None
            assert element.type is not None
            assert element.bbox is not None
class TestUnifiedDocumentExport:
    """Test UnifiedDocument export functionality.

    Exported files are read back as UTF-8 explicitly so the checks do not
    depend on the platform's default text encoding.
    """

    @pytest.fixture
    def exporter(self):
        """Create exporter."""
        return UnifiedDocumentExporter()

    @pytest.fixture
    def sample_document(self, temp_dir):
        """Create a sample UnifiedDocument."""
        pdf_path = temp_dir / "sample.pdf"
        # The context manager closes the PDF even if writing it fails.
        with fitz.open() as doc:
            page = doc.new_page()
            page.insert_text((50, 50), "Sample document for export", fontsize=12)
            doc.save(str(pdf_path))
        return DirectExtractionEngine().extract(pdf_path)

    def test_export_to_json(self, exporter, sample_document, temp_dir):
        """Test export to JSON format."""
        output_path = temp_dir / "output.json"

        exporter.export_to_json(sample_document, output_path)

        assert output_path.exists()
        # Verify JSON is valid
        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)
        for key in ("document_id", "metadata", "pages"):
            assert key in data

    def test_export_to_markdown(self, exporter, sample_document, temp_dir):
        """Test export to Markdown format."""
        output_path = temp_dir / "output.md"

        exporter.export_to_markdown(sample_document, output_path)

        assert output_path.exists()
        content = output_path.read_text(encoding="utf-8")
        assert len(content) > 0

    def test_export_to_text(self, exporter, sample_document):
        """Test export to plain text."""
        text = exporter.export_to_text(sample_document)

        assert isinstance(text, str)
        assert len(text) > 0

    def test_export_legacy_format(self, exporter, sample_document, temp_dir):
        """Test export to legacy JSON format for backward compatibility."""
        output_path = temp_dir / "legacy.json"

        exporter.export_to_legacy_json(sample_document, output_path)

        assert output_path.exists()
        with open(output_path, encoding="utf-8") as f:
            data = json.load(f)
        # Legacy format should have different structure; only the top-level
        # container type is checked here.
        assert isinstance(data, (dict, list))
class TestBackwardCompatibility:
    """Test backward compatibility with existing system.

    The tests check the plain-``dict`` form returned by
    ``UnifiedDocument.to_dict()``.
    """

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_single_page_pdf(pdf_path, text):
        """Save a one-page PDF at *pdf_path* containing *text*.

        Uses a ``with`` block so the document is closed even on failure.
        """
        with fitz.open() as doc:
            page = doc.new_page()
            page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    def test_document_can_be_serialized(self, direct_engine, temp_dir):
        """Test that UnifiedDocument can be serialized to dict."""
        pdf_path = temp_dir / "serialize.pdf"
        self._write_single_page_pdf(pdf_path, "Serializable")

        result = direct_engine.extract(pdf_path)

        # Should be serializable
        doc_dict = result.to_dict()
        assert isinstance(doc_dict, dict)
        assert "document_id" in doc_dict
        assert "metadata" in doc_dict
        assert "pages" in doc_dict

    def test_element_types_are_strings(self, direct_engine, temp_dir):
        """Test that element types serialize to strings."""
        pdf_path = temp_dir / "types.pdf"
        self._write_single_page_pdf(pdf_path, "Test")

        doc_dict = direct_engine.extract(pdf_path).to_dict()

        # NOTE(review): missing "pages"/"elements" keys fall back to empty
        # lists, so this check passes vacuously in that case.
        for page_data in doc_dict.get("pages", []):
            for element in page_data.get("elements", []):
                assert isinstance(element.get("type"), str)

    def test_processing_track_is_string(self, direct_engine, temp_dir):
        """Test that processing track serializes to string."""
        pdf_path = temp_dir / "track.pdf"
        self._write_single_page_pdf(pdf_path, "Track test")

        doc_dict = direct_engine.extract(pdf_path).to_dict()

        track = doc_dict.get("metadata", {}).get("processing_track")
        assert isinstance(track, str)
        assert track in ("ocr", "direct", "hybrid", "auto")
class TestCrossTrackConsistency:
    """Test consistency between OCR and Direct track outputs.

    The visible tests only run the direct engine; they check the shape of its
    output (metadata and element attributes).
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_single_page_pdf(pdf_path, text):
        """Save a one-page PDF at *pdf_path* containing *text*.

        Uses a ``with`` block so the document is closed even on failure.
        """
        with fitz.open() as doc:
            page = doc.new_page()
            page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    def test_both_tracks_produce_unified_document(self, detector, direct_engine, temp_dir):
        """Test that both tracks produce UnifiedDocument format."""
        # Create editable PDF for direct track
        pdf_path = temp_dir / "editable.pdf"
        self._write_single_page_pdf(pdf_path, "Editable content " * 20)

        # Detect and route
        detection = detector.detect(pdf_path)
        assert detection.track == "direct"

        # Extract
        result = direct_engine.extract(pdf_path)

        # Verify UnifiedDocument structure
        assert isinstance(result, UnifiedDocument)
        assert result.metadata.processing_track == ProcessingTrack.DIRECT

    def test_metadata_structure_consistent(self, direct_engine, temp_dir):
        """Test that metadata structure is consistent."""
        pdf_path = temp_dir / "metadata.pdf"
        self._write_single_page_pdf(pdf_path, "Metadata test")

        metadata = direct_engine.extract(pdf_path).metadata

        # Required metadata fields
        required_fields = (
            'filename',
            'file_type',
            'file_size',
            'processing_track',
            'processing_time',
            'created_at',
        )
        for field_name in required_fields:
            assert hasattr(metadata, field_name), field_name

    def test_element_structure_consistent(self, direct_engine, temp_dir):
        """Test that element structure is consistent."""
        pdf_path = temp_dir / "elements.pdf"
        self._write_single_page_pdf(pdf_path, "Element structure test")

        result = direct_engine.extract(pdf_path)

        # Required element fields
        required_fields = ('element_id', 'type', 'content', 'bbox', 'confidence')
        # NOTE(review): passes vacuously if the first page has no elements.
        for element in result.pages[0].elements:
            for field_name in required_fields:
                assert hasattr(element, field_name), field_name
class TestEdgeCases:
    """Test edge cases in dual-track processing."""

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_pdf(pdf_path, page_texts):
        """Save a PDF at *pdf_path* with one page per entry of *page_texts*.

        Args:
            pdf_path: Destination path of the PDF.
            page_texts: Iterable of page contents; a ``None`` entry produces a
                page with nothing inserted.

        The document is opened in a ``with`` block so it is closed even when
        building or saving it raises.
        """
        with fitz.open() as doc:
            for text in page_texts:
                page = doc.new_page()
                if text is not None:
                    page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))

    def test_empty_pdf(self, direct_engine, temp_dir):
        """Test handling of empty PDF."""
        pdf_path = temp_dir / "empty.pdf"
        self._write_pdf(pdf_path, [None])

        result = direct_engine.extract(pdf_path)

        assert len(result.pages) == 1
        # May or may not have elements

    def test_very_small_file(self, detector, temp_dir):
        """Test handling of very small files."""
        small_file = temp_dir / "small.txt"
        small_file.write_text("Hi")

        result = detector.detect(small_file)

        assert result.track in ["direct", "ocr"]

    def test_special_characters_in_content(self, direct_engine, temp_dir):
        """Test handling of special characters."""
        pdf_path = temp_dir / "special.pdf"
        special_text = "Special: © ® ™ € £ ¥ § ¶ • … — '"
        self._write_pdf(pdf_path, [special_text])

        result = direct_engine.extract(pdf_path)

        # Should not crash
        assert len(result.pages) == 1

    def test_large_page_count(self, direct_engine, temp_dir):
        """Test handling of document with many pages."""
        pdf_path = temp_dir / "many_pages.pdf"
        self._write_pdf(pdf_path, [f"Page {page_no}" for page_no in range(1, 51)])

        result = direct_engine.extract(pdf_path)

        assert len(result.pages) == 50