# OCR/backend/tests/services/test_dual_track_integration.py

"""
Integration tests for dual-track document processing.

Tests cover:
- Routing logic between OCR and Direct tracks
- UnifiedDocument generation from both tracks
- Backward compatibility with legacy formats
"""

import pytest
from pathlib import Path
import fitz
from PIL import Image
import io
import json

from app.services.document_type_detector import DocumentTypeDetector, DocumentType
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.models.unified_document import (
    UnifiedDocument, ProcessingTrack, ElementType
)
from app.services.unified_document_exporter import UnifiedDocumentExporter, ExportFormat


class TestDualTrackRouting:
    """Test routing logic between OCR and Direct tracks.

    Each test writes a small synthetic file into ``temp_dir`` (a fixture that
    is not defined in this file), passes it to ``DocumentTypeDetector.detect``
    and checks the ``track`` / ``document_type`` / ``confidence`` reported.
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    # ===== Helpers =====
    # Names start with an underscore so pytest does not collect them as tests.

    @staticmethod
    def _add_text_page(doc, text):
        """Append a page to ``doc`` carrying ``text`` at (50, 50), 12pt."""
        page = doc.new_page()
        page.insert_text((50, 50), text, fontsize=12)

    @staticmethod
    def _add_image_page(doc, color):
        """Append a page to ``doc`` that holds only a solid-colour image.

        A 400x300 RGB image filled with ``color`` is encoded as PNG in memory
        and placed into the rectangle (50, 50, 550, 750); no text is inserted.
        """
        buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=color).save(buffer, format='PNG')

        page = doc.new_page()
        page.insert_image(fitz.Rect(50, 50, 550, 750), stream=buffer.getvalue())

    # ===== Routing Tests =====

    def test_route_editable_pdf_to_direct(self, detector, temp_dir):
        """Test that editable PDFs are routed to direct track."""
        pdf_path = temp_dir / "editable.pdf"

        doc = fitz.open()
        try:
            for i in range(3):
                self._add_text_page(doc, f"Page {i+1}: " + "Text content " * 20)
            doc.save(str(pdf_path))
        finally:
            # Release the document even if building or saving fails.
            doc.close()

        result = detector.detect(pdf_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9

    def test_route_scanned_pdf_to_ocr(self, detector, temp_dir):
        """Test that scanned PDFs are routed to OCR track."""
        pdf_path = temp_dir / "scanned.pdf"

        doc = fitz.open()
        try:
            # Three pages that contain only images, no text.
            for _ in range(3):
                self._add_image_page(doc, (200, 200, 200))
            doc.save(str(pdf_path))
        finally:
            doc.close()

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED

    def test_route_image_to_ocr(self, detector, temp_dir):
        """Test that images are routed to OCR track."""
        img_path = temp_dir / "image.png"
        Image.new('RGB', (100, 100), color='red').save(str(img_path))

        result = detector.detect(img_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0

    def test_route_text_file_to_direct(self, detector, temp_dir):
        """Test that text files are routed to direct track."""
        txt_path = temp_dir / "text.txt"
        # Explicit encoding keeps the fixture independent of the host locale.
        txt_path.write_text("This is a plain text file.\n" * 10, encoding="utf-8")

        result = detector.detect(txt_path)

        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT

    def test_route_mixed_pdf_to_ocr(self, detector, temp_dir):
        """Test that mixed PDFs are routed to OCR track."""
        pdf_path = temp_dir / "mixed.pdf"

        doc = fitz.open()
        try:
            # Page 1: text
            self._add_text_page(doc, "Text content " * 20)
            # Pages 2 and 3: image only
            self._add_image_page(doc, (200, 200, 200))
            self._add_image_page(doc, (150, 150, 150))
            doc.save(str(pdf_path))
        finally:
            doc.close()

        result = detector.detect(pdf_path)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED


class TestUnifiedDocumentGeneration:
    """Test UnifiedDocument generation from both tracks.

    Only the direct track is exercised in this class: every test feeds a
    one-page text PDF to ``DirectExtractionEngine.extract`` and inspects the
    returned ``UnifiedDocument``.
    """

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _make_text_pdf(pdf_path, text):
        """Write a one-page PDF containing ``text`` and return ``pdf_path``.

        The document handle is closed in ``finally`` so that a failing
        ``insert_text`` or ``save`` does not leave it open.
        """
        doc = fitz.open()
        try:
            page = doc.new_page()
            page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))
        finally:
            doc.close()
        return pdf_path

    # ===== Direct Track Generation =====

    def test_direct_track_generates_unified_document(self, direct_engine, temp_dir):
        """Test that direct track generates valid UnifiedDocument."""
        pdf_path = self._make_text_pdf(temp_dir / "test.pdf", "Test content")

        result = direct_engine.extract(pdf_path)

        assert isinstance(result, UnifiedDocument)
        assert result.document_id is not None
        assert result.metadata is not None
        assert len(result.pages) == 1

    def test_unified_document_has_required_fields(self, direct_engine, temp_dir):
        """Test that UnifiedDocument has all required fields."""
        pdf_path = self._make_text_pdf(temp_dir / "complete.pdf", "Complete document")

        result = direct_engine.extract(pdf_path)

        # Check metadata
        metadata = result.metadata
        assert metadata.filename == "complete.pdf"
        assert metadata.file_type == "pdf"
        assert metadata.processing_track == ProcessingTrack.DIRECT
        assert metadata.processing_time >= 0

        # Check page structure
        first_page = result.pages[0]
        assert first_page.page_number == 1
        assert first_page.dimensions is not None
        assert first_page.elements is not None

    def test_elements_have_required_fields(self, direct_engine, temp_dir):
        """Test that elements have all required fields."""
        pdf_path = self._make_text_pdf(temp_dir / "elements.pdf", "Element test")

        result = direct_engine.extract(pdf_path)

        elements = list(result.pages[0].elements)
        # Without this guard the loop below would pass vacuously if the
        # engine returned no elements for a page that does contain text.
        assert elements, "expected at least one element from a text page"

        for element in elements:
            assert element.element_id is not None
            assert element.type is not None
            assert element.bbox is not None


class TestUnifiedDocumentExport:
    """Exercise the output formats of ``UnifiedDocumentExporter``.

    All tests export the same ``sample_document`` and only check that the
    output exists, is non-empty and (for JSON) parses.
    """

    @pytest.fixture
    def exporter(self):
        """Provide a fresh ``UnifiedDocumentExporter``."""
        return UnifiedDocumentExporter()

    @pytest.fixture
    def sample_document(self, temp_dir):
        """Extract a document from a one-page PDF written into ``temp_dir``."""
        extraction_engine = DirectExtractionEngine()
        source_pdf = temp_dir / "sample.pdf"

        pdf = fitz.open()
        first_page = pdf.new_page()
        first_page.insert_text(
            (50, 50),
            "Sample document for export",
            fontsize=12,
        )
        pdf.save(str(source_pdf))
        pdf.close()

        return extraction_engine.extract(source_pdf)

    def test_export_to_json(self, exporter, sample_document, temp_dir):
        """JSON export writes a parseable file with the top-level keys."""
        target = temp_dir / "output.json"
        exporter.export_to_json(sample_document, target)
        assert target.exists()

        # The file must parse as JSON.
        with open(target) as handle:
            payload = json.load(handle)

        for key in ("document_id", "metadata", "pages"):
            assert key in payload

    def test_export_to_markdown(self, exporter, sample_document, temp_dir):
        """Markdown export writes a non-empty file."""
        target = temp_dir / "output.md"
        exporter.export_to_markdown(sample_document, target)
        assert target.exists()

        markdown = target.read_text()
        assert len(markdown) > 0

    def test_export_to_text(self, exporter, sample_document):
        """Plain-text export returns a non-empty string."""
        exported = exporter.export_to_text(sample_document)

        assert isinstance(exported, str)
        assert len(exported) > 0

    def test_export_legacy_format(self, exporter, sample_document, temp_dir):
        """Legacy JSON export (backward compatibility) writes parseable JSON."""
        target = temp_dir / "legacy.json"
        exporter.export_to_legacy_json(sample_document, target)
        assert target.exists()

        with open(target) as handle:
            legacy_payload = json.load(handle)

        # Only the top-level container type is checked here, not the schema.
        assert isinstance(legacy_payload, (dict, list))


class TestBackwardCompatibility:
    """Check that extracted documents serialize to plain Python types."""

    @pytest.fixture
    def direct_engine(self):
        """Provide a fresh ``DirectExtractionEngine``."""
        return DirectExtractionEngine()

    @staticmethod
    def _write_single_page_pdf(target, text):
        """Save a one-page PDF holding ``text`` at ``target``; return ``target``."""
        pdf = fitz.open()
        sheet = pdf.new_page()
        sheet.insert_text((50, 50), text, fontsize=12)
        pdf.save(str(target))
        pdf.close()
        return target

    def test_document_can_be_serialized(self, direct_engine, temp_dir):
        """``to_dict`` returns a dict exposing the three top-level keys."""
        source = self._write_single_page_pdf(temp_dir / "serialize.pdf", "Serializable")

        extracted = direct_engine.extract(source)

        # Should be serializable
        serialized = extracted.to_dict()

        assert isinstance(serialized, dict)
        for key in ("document_id", "metadata", "pages"):
            assert key in serialized

    def test_element_types_are_strings(self, direct_engine, temp_dir):
        """Every serialized element carries its type as a string."""
        source = self._write_single_page_pdf(temp_dir / "types.pdf", "Test")

        serialized = direct_engine.extract(source).to_dict()

        element_types = [
            element.get("type")
            for page_data in serialized.get("pages", [])
            for element in page_data.get("elements", [])
        ]
        assert all(isinstance(kind, str) for kind in element_types)

    def test_processing_track_is_string(self, direct_engine, temp_dir):
        """The serialized processing track is one of the known string values."""
        source = self._write_single_page_pdf(temp_dir / "track.pdf", "Track test")

        serialized = direct_engine.extract(source).to_dict()

        track = serialized.get("metadata", {}).get("processing_track")
        assert isinstance(track, str)
        assert track in ("ocr", "direct", "hybrid", "auto")


class TestCrossTrackConsistency:
    """Test consistency between OCR and Direct track outputs.

    NOTE(review): no OCR engine is instantiated in this class; every test
    runs the direct track only and checks the shape of its output. OCR-side
    coverage would have to be added separately.
    """

    @pytest.fixture
    def detector(self):
        """Create document type detector."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Create direct extraction engine."""
        return DirectExtractionEngine()

    @staticmethod
    def _make_text_pdf(pdf_path, text):
        """Write a one-page PDF containing ``text`` and return ``pdf_path``.

        The document handle is closed in ``finally`` so that a failing
        ``insert_text`` or ``save`` does not leave it open.
        """
        doc = fitz.open()
        try:
            page = doc.new_page()
            page.insert_text((50, 50), text, fontsize=12)
            doc.save(str(pdf_path))
        finally:
            doc.close()
        return pdf_path

    def test_both_tracks_produce_unified_document(self, detector, direct_engine, temp_dir):
        """Test that a detected direct-track PDF yields a UnifiedDocument.

        Despite the test name, only the direct track is exercised: the
        detector must route the editable PDF to "direct", and the direct
        engine must return a ``UnifiedDocument`` tagged with that track.
        """
        # Create editable PDF for direct track
        pdf_path = self._make_text_pdf(
            temp_dir / "editable.pdf", "Editable content " * 20
        )

        # Detect and route
        detection = detector.detect(pdf_path)
        assert detection.track == "direct"

        # Extract
        result = direct_engine.extract(pdf_path)

        # Verify UnifiedDocument structure
        assert isinstance(result, UnifiedDocument)
        assert result.metadata.processing_track == ProcessingTrack.DIRECT

    def test_metadata_structure_consistent(self, direct_engine, temp_dir):
        """Test that metadata structure is consistent."""
        pdf_path = self._make_text_pdf(temp_dir / "metadata.pdf", "Metadata test")

        metadata = direct_engine.extract(pdf_path).metadata

        # Required metadata fields
        required_fields = (
            'filename',
            'file_type',
            'file_size',
            'processing_track',
            'processing_time',
            'created_at',
        )
        for field_name in required_fields:
            assert hasattr(metadata, field_name), f"metadata lacks {field_name!r}"

    def test_element_structure_consistent(self, direct_engine, temp_dir):
        """Test that element structure is consistent."""
        pdf_path = self._make_text_pdf(
            temp_dir / "elements.pdf", "Element structure test"
        )

        result = direct_engine.extract(pdf_path)

        elements = list(result.pages[0].elements)
        # Without this guard the loop below would pass vacuously if the
        # engine returned no elements for a page that does contain text.
        assert elements, "expected at least one element from a text page"

        # Required element fields
        required_fields = ('element_id', 'type', 'content', 'bbox', 'confidence')
        for element in elements:
            for field_name in required_fields:
                assert hasattr(element, field_name), f"element lacks {field_name!r}"


class TestEdgeCases:
    """Boundary inputs for detection and direct extraction."""

    @pytest.fixture
    def detector(self):
        """Provide a fresh ``DocumentTypeDetector``."""
        return DocumentTypeDetector()

    @pytest.fixture
    def direct_engine(self):
        """Provide a fresh ``DirectExtractionEngine``."""
        return DirectExtractionEngine()

    def test_empty_pdf(self, direct_engine, temp_dir):
        """A PDF with one blank page still yields exactly one page."""
        blank_pdf = temp_dir / "empty.pdf"
        pdf = fitz.open()
        pdf.new_page()
        pdf.save(str(blank_pdf))
        pdf.close()

        extracted = direct_engine.extract(blank_pdf)

        # The page may or may not have elements; only the count is checked.
        assert len(extracted.pages) == 1

    def test_very_small_file(self, detector, temp_dir):
        """A two-character text file is routed to one of the known tracks."""
        tiny_file = temp_dir / "small.txt"
        tiny_file.write_text("Hi")

        detection = detector.detect(tiny_file)

        assert detection.track in ["direct", "ocr"]

    def test_special_characters_in_content(self, direct_engine, temp_dir):
        """Extraction does not crash on symbol and punctuation characters."""
        symbols_pdf = temp_dir / "special.pdf"
        special_text = "Special: © ® ™ € £ ¥ § ¶ • … — – '"

        pdf = fitz.open()
        sheet = pdf.new_page()
        sheet.insert_text((50, 50), special_text, fontsize=12)
        pdf.save(str(symbols_pdf))
        pdf.close()

        extracted = direct_engine.extract(symbols_pdf)

        # Should not crash
        assert len(extracted.pages) == 1

    def test_large_page_count(self, direct_engine, temp_dir):
        """A 50-page document is extracted with all 50 pages."""
        long_pdf = temp_dir / "many_pages.pdf"
        expected_pages = 50

        pdf = fitz.open()
        for i in range(expected_pages):
            sheet = pdf.new_page()
            sheet.insert_text((50, 50), f"Page {i+1}", fontsize=12)
        pdf.save(str(long_pdf))
        pdf.close()

        extracted = direct_engine.extract(long_pdf)

        assert len(extracted.pages) == expected_pages