test: add unit tests for DocumentTypeDetector

- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 12:14:59 +08:00
parent 1d0b63854a
commit 0fcb2492c9
6 changed files with 486 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -97,4 +97,6 @@ storage/results/*
 *.log
 __pycache__/
 *.bak
 # Ignore stray test_*.py scratch files at any depth, but keep the real suite under backend/tests/
 test_*.py
 !backend/tests/**/test_*.py
--- a/backend/tests/__init__.py
+++ b/backend/tests/__init__.py
@@ -0,0 +1,3 @@
 """
 Tool_OCR - Test Suite
 """
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -0,0 +1,176 @@
 """
 Pytest configuration and fixtures for Tool_OCR tests.
 """
 import pytest
 import tempfile
 import os
 from pathlib import Path
 # Put the backend directory (the parent of tests/) at the front of sys.path
 # so that the `app.*` imports used by the test modules resolve.
 import sys
 project_root = Path(__file__).parent.parent
 sys.path.insert(0, str(project_root))
@pytest.fixture
def temp_dir():
    """Yield a fresh temporary directory as a ``Path``.

    The directory and everything inside it are removed once the requesting
    test has finished.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        yield workdir
@pytest.fixture
def sample_text_file(temp_dir):
    """Write a ten-line plain-text file into ``temp_dir`` and return its path."""
    line = "This is a sample text file for testing purposes.\n"
    target = temp_dir / "sample.txt"
    target.write_text(line * 10)
    return target
@pytest.fixture
def sample_pdf_editable(temp_dir):
    """Build a three-page PDF in which every page has inserted text.

    Returns the path of the saved file inside ``temp_dir``.
    """
    import fitz

    target = temp_dir / "editable.pdf"
    pdf = fitz.open()
    for page_number in range(1, 4):
        # Each page repeats the same numbered sentence twenty times.
        line = f"This is page {page_number} of an editable PDF document.\n"
        page = pdf.new_page()
        page.insert_text((50, 50), line * 20, fontsize=12)
    pdf.save(str(target))
    pdf.close()
    return target
@pytest.fixture
def sample_pdf_scanned(temp_dir):
    """Build a three-page PDF that contains only images and no inserted text.

    Returns the path of the saved file inside ``temp_dir``.
    """
    import fitz
    from PIL import Image
    import io

    # Encode one flat grey PNG up front; every page embeds these same bytes.
    buffer = io.BytesIO()
    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(buffer, format='PNG')
    png_bytes = buffer.getvalue()

    target = temp_dir / "scanned.pdf"
    pdf = fitz.open()
    for _ in range(3):
        page = pdf.new_page()
        # Stretch the image over most of the page, like a scanned sheet.
        page.insert_image(fitz.Rect(50, 50, 550, 750), stream=png_bytes)
    pdf.save(str(target))
    pdf.close()
    return target
@pytest.fixture
def sample_pdf_mixed(temp_dir):
    """Build a three-page PDF: one text page followed by two image-only pages.

    Returns the path of the saved file inside ``temp_dir``.
    """
    import fitz
    from PIL import Image
    import io

    def grey_png(level):
        """Return the PNG bytes of a 400x300 image filled with one grey level."""
        buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=(level, level, level)).save(buffer, format='PNG')
        return buffer.getvalue()

    target = temp_dir / "mixed.pdf"
    image_area = fitz.Rect(50, 50, 550, 750)
    pdf = fitz.open()

    # Page 1: text only.
    text_page = pdf.new_page()
    text_page.insert_text((50, 50), "This is a text page.\n" * 20, fontsize=12)

    # Pages 2 and 3: a single image each, in two different greys.
    for level in (200, 150):
        image_page = pdf.new_page()
        image_page.insert_image(image_area, stream=grey_png(level))

    pdf.save(str(target))
    pdf.close()
    return target
@pytest.fixture
def sample_image_png(temp_dir):
    """Save a 100x100 solid red image as ``sample.png`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.png"
    Image.new('RGB', (100, 100), color='red').save(str(target))
    return target
@pytest.fixture
def sample_image_jpg(temp_dir):
    """Save a 100x100 solid blue image as ``sample.jpg`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.jpg"
    Image.new('RGB', (100, 100), color='blue').save(str(target))
    return target
@pytest.fixture
 def sample_docx(temp_dir):
    """Create a sample DOCX file (minimal valid structure).

    The file is a ZIP archive written with three members: the content-type
    manifest, the package relationships, and a main document part holding a
    single paragraph with the text "Test document". Returns the path of the
    archive inside ``temp_dir``.
    """
    from zipfile import ZipFile
    file_path = temp_dir / "sample.docx"
    # Create minimal DOCX structure
    with ZipFile(file_path, 'w') as zf:
        # [Content_Types].xml: declares the content type of each part in the package
        content_types = '''<?xml version="1.0" encoding="UTF-8"?>
 <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
 </Types>'''
        zf.writestr('[Content_Types].xml', content_types)
        # _rels/.rels: points the package at word/document.xml as its main document
        rels = '''<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
 </Relationships>'''
        zf.writestr('_rels/.rels', rels)
        # word/document.xml: the document body, one paragraph with one text run
        document = '''<?xml version="1.0" encoding="UTF-8"?>
 <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:r>
        <w:t>Test document</w:t>
      </w:r>
    </w:p>
  </w:body>
 </w:document>'''
        zf.writestr('word/document.xml', document)
    return file_path
--- a/backend/tests/services/__init__.py
+++ b/backend/tests/services/__init__.py
@@ -0,0 +1,3 @@
 """
 Tool_OCR - Services Tests
 """
--- a/backend/tests/services/test_document_type_detector.py
+++ b/backend/tests/services/test_document_type_detector.py
@@ -0,0 +1,300 @@
 """
 Unit tests for DocumentTypeDetector service.
 Tests cover:
 - Various file type detection (PDF, image, Office, text)
 - PDF editability detection
 - Edge cases (file not found, unknown types)
 """
 import pytest
 from pathlib import Path
 from app.services.document_type_detector import (
    DocumentTypeDetector,
    DocumentType,
    ProcessingTrackRecommendation
 )
 class TestDocumentTypeDetector:
    """Test suite for DocumentTypeDetector.

    Covers PDF classification (editable / scanned / mixed), image, Office and
    plain-text inputs, error cases, constructor settings, batch analysis and
    the shape of the returned recommendation object. Sample files come from
    the fixtures in ``conftest.py``; ad-hoc PDFs are built by
    ``_write_text_pdf``.
    """

    @pytest.fixture
    def detector(self):
        """Create a detector instance with default settings."""
        return DocumentTypeDetector()

    @pytest.fixture
    def strict_detector(self):
        """Create a detector with strict text requirements.

        Not requested by any test currently in this class; kept so that
        tests needing stricter thresholds can use it.
        """
        return DocumentTypeDetector(
            min_text_length=200,
            text_coverage_threshold=0.95
        )

    @staticmethod
    def _write_text_pdf(pdf_path, page_texts):
        """Write a PDF with one page per entry of ``page_texts``.

        Args:
            pdf_path: Destination path of the PDF file.
            page_texts: One item per page. A string is inserted at (50, 50)
                with the library's default font size; ``None`` (or an empty
                string) leaves the page blank.

        Returns:
            ``pdf_path``, so the call can be used inline.
        """
        import fitz

        doc = fitz.open()
        try:
            for text in page_texts:
                page = doc.new_page()
                if text:
                    page.insert_text((50, 50), text)
            doc.save(str(pdf_path))
        finally:
            # Release the document even when building or saving fails.
            doc.close()
        return pdf_path

    # ===== PDF Detection Tests =====

    def test_detect_editable_pdf(self, detector, sample_pdf_editable):
        """Test detection of editable PDF with extractable text."""
        result = detector.detect(sample_pdf_editable)
        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9
        assert "extractable text" in result.reason.lower()
        assert result.metadata.get("total_pages") == 3

    def test_detect_scanned_pdf(self, detector, sample_pdf_scanned):
        """Test detection of scanned PDF (image-only)."""
        result = detector.detect(sample_pdf_scanned)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED
        assert result.confidence >= 0.9
        assert "scanned" in result.reason.lower()

    def test_detect_mixed_pdf(self, detector, sample_pdf_mixed):
        """Test detection of mixed PDF (text + images)."""
        result = detector.detect(sample_pdf_mixed)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED
        assert result.confidence >= 0.5
        assert "mixed" in result.reason.lower()

    def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable):
        """Test that text coverage is calculated correctly."""
        result = detector.detect(sample_pdf_editable)
        metadata = result.metadata
        assert "text_coverage" in metadata
        assert metadata["text_coverage"] >= 0.9  # All pages have text

    def test_pdf_page_details(self, detector, sample_pdf_editable):
        """Test that page details are included in metadata."""
        result = detector.detect(sample_pdf_editable)
        metadata = result.metadata
        assert "page_details" in metadata
        # The fixture has 3 pages; fewer entries are expected only when the
        # detector is configured to sample fewer pages than that.
        assert len(metadata["page_details"]) == min(3, detector.sample_pages)
        for page_detail in metadata["page_details"]:
            assert "page" in page_detail
            assert "text_length" in page_detail
            assert "has_text" in page_detail
            assert "image_count" in page_detail

    # ===== Image Detection Tests =====

    def test_detect_png_image(self, detector, sample_image_png):
        """Test detection of PNG image file."""
        result = detector.detect(sample_image_png)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0
        assert "image" in result.reason.lower()
        assert result.metadata.get("mime_type") == "image/png"

    def test_detect_jpg_image(self, detector, sample_image_jpg):
        """Test detection of JPEG image file."""
        result = detector.detect(sample_image_jpg)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0
        assert result.metadata.get("file_extension") == ".jpg"

    # ===== Office Document Tests =====

    def test_detect_docx(self, detector, sample_docx):
        """Test detection of Word document."""
        result = detector.detect(sample_docx)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.OFFICE_WORD
        assert result.confidence >= 0.8
        assert "office" in result.reason.lower() or "ocr" in result.reason.lower()

    # ===== Text File Tests =====

    def test_detect_text_file(self, detector, sample_text_file):
        """Test detection of plain text file."""
        result = detector.detect(sample_text_file)
        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT
        assert result.confidence == 1.0
        assert "text" in result.reason.lower()

    # ===== Edge Case Tests =====

    def test_file_not_found(self, detector, temp_dir):
        """Test handling of non-existent file."""
        non_existent = temp_dir / "does_not_exist.pdf"
        result = detector.detect(non_existent)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.UNKNOWN
        assert result.confidence == 0.5
        assert "not found" in result.reason.lower()

    def test_unknown_file_type(self, detector, temp_dir):
        """Test handling of unknown file type."""
        # Create a file with an unrecognised extension and arbitrary bytes
        unknown_file = temp_dir / "unknown.xyz"
        unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05')
        result = detector.detect(unknown_file)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.UNKNOWN
        assert result.confidence <= 0.5

    def test_empty_pdf(self, detector, temp_dir):
        """Test handling of PDF with blank pages (no content)."""
        # One blank page with no content
        empty_pdf = self._write_text_pdf(temp_dir / "empty.pdf", [None])
        result = detector.detect(empty_pdf)
        # Blank PDF should be detected as scanned (no extractable text)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED
        assert result.metadata.get("total_pages") == 1

    # ===== Configuration Tests =====

    def test_custom_min_text_length(self, temp_dir):
        """Test that a lowered min_text_length lets a short-text page count as text."""
        # Single page holding only ~10 characters of text
        pdf_path = self._write_text_pdf(
            temp_dir / "minimal_text.pdf", ["Short text"]
        )
        # Default settings: run as a smoke check only; the classification of
        # such a short text under the default threshold is not asserted here.
        DocumentTypeDetector().detect(pdf_path)
        # Lenient detector: a 5-character threshold is below the page's text length
        lenient_detector = DocumentTypeDetector(min_text_length=5)
        result_lenient = lenient_detector.detect(pdf_path)
        # With very low threshold, it should find text
        assert result_lenient.document_type in [
            DocumentType.PDF_EDITABLE,
            DocumentType.PDF_MIXED
        ]

    def test_sample_pages_setting(self, temp_dir):
        """Test that sample_pages setting is respected."""
        # Create PDF with 10 pages
        pdf_path = self._write_text_pdf(
            temp_dir / "many_pages.pdf",
            [f"Page {i + 1} content\n" * 20 for i in range(10)],
        )
        # Detector that samples only 2 pages
        detector = DocumentTypeDetector(sample_pages=2)
        result = detector.detect(pdf_path)
        assert result.metadata["sampled_pages"] == 2
        assert result.metadata["total_pages"] == 10

    # ===== Batch Processing Tests =====

    def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
        """Test batch analysis of multiple files."""
        files = [sample_pdf_editable, sample_image_png, sample_text_file]
        results = detector.analyze_batch(files)
        assert len(results) == 3
        # Results are keyed by the string form of each input path
        assert str(sample_pdf_editable) in results
        assert str(sample_image_png) in results
        assert str(sample_text_file) in results

    def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
        """Test statistics calculation from batch results."""
        files = [sample_pdf_editable, sample_image_png, sample_text_file]
        results = detector.analyze_batch(files)
        stats = detector.get_statistics(results)
        assert stats["total"] == 3
        assert "by_track" in stats
        assert stats["by_track"]["ocr"] >= 1  # At least image
        assert stats["by_track"]["direct"] >= 1  # At least text
        assert "confidence" in stats
        assert stats["confidence"]["mean"] > 0

    def test_get_statistics_empty(self, detector):
        """Test statistics with empty results."""
        stats = detector.get_statistics({})
        assert stats == {"total": 0}

    # ===== Recommendation Object Tests =====

    def test_recommendation_to_dict(self, detector, sample_pdf_editable):
        """Test ProcessingTrackRecommendation.to_dict() method."""
        result = detector.detect(sample_pdf_editable)
        result_dict = result.to_dict()
        assert "recommended_track" in result_dict
        assert "confidence" in result_dict
        assert "reason" in result_dict
        assert "document_type" in result_dict
        assert "metadata" in result_dict
        assert result_dict["recommended_track"] == result.track
        assert result_dict["confidence"] == result.confidence

    def test_recommendation_metadata_types(self, detector, sample_pdf_editable):
        """Test that the recommendation's attributes have the expected types."""
        result = detector.detect(sample_pdf_editable)
        assert isinstance(result.track, str)
        assert isinstance(result.confidence, float)
        assert isinstance(result.reason, str)
        assert isinstance(result.document_type, DocumentType)
        assert isinstance(result.metadata, dict)

    # ===== Performance Tests =====

    def test_large_pdf_sampling(self, detector, temp_dir):
        """Test that large PDFs are sampled efficiently."""
        import time

        # Create a large PDF (20 pages)
        large_pdf = self._write_text_pdf(
            temp_dir / "large.pdf",
            [f"Page {i + 1}\n" * 50 for i in range(20)],
        )
        # Detection should be fast due to sampling. perf_counter() is
        # monotonic, so a system clock adjustment cannot skew the measurement.
        start_time = time.perf_counter()
        result = detector.detect(large_pdf)
        elapsed = time.perf_counter() - start_time
        assert elapsed < 5.0  # Should complete within 5 seconds
        assert result.metadata["sampled_pages"] <= detector.sample_pages
        assert result.metadata["total_pages"] == 20
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -5,7 +5,7 @@
  - [x] 1.1.1 Add PyMuPDF>=1.23.0
  - [x] 1.1.2 Add pdfplumber>=0.10.0
  - [x] 1.1.3 Add python-magic-bin>=0.4.14
-  - [ ] 1.1.4 Test dependency installation
+  - [x] 1.1.4 Test dependency installation
 - [x] 1.2 Create UnifiedDocument model in backend/app/models/
  - [x] 1.2.1 Define UnifiedDocument dataclass
  - [x] 1.2.2 Add DocumentElement model
@@ -17,7 +17,7 @@
  - [x] 1.3.2 Add PDF editability checking logic
  - [x] 1.3.3 Add Office document detection
  - [x] 1.3.4 Create routing logic to determine processing track
-  - [ ] 1.3.5 Add unit tests for detector
+  - [x] 1.3.5 Add unit tests for detector
 ## 2. Direct Extraction Track
 - [x] 2.1 Create DirectExtractionEngine service