From 0fcb2492c906662722b2621758a4c1e058b19d98 Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Wed, 19 Nov 2025 12:14:59 +0800
Subject: [PATCH] test: add unit tests for DocumentTypeDetector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .gitignore                                    |   2 +
 backend/tests/__init__.py                     |   3 +
 backend/tests/conftest.py                     | 176 ++++++++++
 backend/tests/services/__init__.py            |   3 +
 .../services/test_document_type_detector.py   | 300 ++++++++++++++++++
 .../dual-track-document-processing/tasks.md   |   4 +-
 6 files changed, 486 insertions(+), 2 deletions(-)
 create mode 100644 backend/tests/__init__.py
 create mode 100644 backend/tests/conftest.py
 create mode 100644 backend/tests/services/__init__.py
 create mode 100644 backend/tests/services/test_document_type_detector.py

diff --git a/.gitignore b/.gitignore
index 1bb1526..665b7bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,4 +97,6 @@ storage/results/*
 *.log
 __pycache__/
 *.bak
+# Ignore stray test_*.py scripts anywhere in the repo, but keep the real suite under backend/tests/
 test_*.py
+!backend/tests/**/test_*.py
diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py
new file mode 100644
index 0000000..514ed4e
--- /dev/null
+++ b/backend/tests/__init__.py
@@ -0,0 +1,3 @@
+"""
+Tool_OCR - Test Suite
+"""
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
new file mode 100644
index 0000000..fed0ab0
--- /dev/null
+++ b/backend/tests/conftest.py
@@ -0,0 +1,176 @@
+"""
+Pytest configuration and fixtures for Tool_OCR tests.
+"""
+
+import pytest
+import tempfile
+import os
+from pathlib import Path
+
+# Add the backend/ directory (parent of tests/) to sys.path so the `app` package is importable
+import sys
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+
+@pytest.fixture
+def temp_dir():
+    """Create a temporary directory for test files."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        yield Path(tmpdir)
+
+
+@pytest.fixture
+def sample_text_file(temp_dir):
+    """Create a sample text file."""
+    file_path = temp_dir / "sample.txt"
+    file_path.write_text("This is a sample text file for testing purposes.\n" * 10)
+    return file_path
+
+
+@pytest.fixture
+def sample_pdf_editable(temp_dir):
+    """Create a sample editable PDF with text content."""
+    import fitz
+
+    file_path = temp_dir / "editable.pdf"
+    doc = fitz.open()
+
+    # Create 3 pages with text
+    for i in range(3):
+        page = doc.new_page()
+        text = f"This is page {i + 1} of an editable PDF document.\n" * 20
+        page.insert_text((50, 50), text, fontsize=12)
+
+    doc.save(str(file_path))
+    doc.close()
+    return file_path
+
+
+@pytest.fixture
+def sample_pdf_scanned(temp_dir):
+    """Create a sample scanned PDF (image-only, no text)."""
+    import fitz
+    from PIL import Image
+    import io
+
+    file_path = temp_dir / "scanned.pdf"
+    doc = fitz.open()
+
+    # Create 3 pages with only images (simulating scanned pages)
+    for i in range(3):
+        page = doc.new_page()
+
+        # Create a simple image
+        img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+        img_bytes = io.BytesIO()
+        img.save(img_bytes, format='PNG')
+        img_bytes.seek(0)
+
+        # Insert image covering most of the page
+        rect = fitz.Rect(50, 50, 550, 750)
+        page.insert_image(rect, stream=img_bytes.read())
+
+    doc.save(str(file_path))
+    doc.close()
+    return file_path
+
+
+@pytest.fixture
+def sample_pdf_mixed(temp_dir):
+    """Create a sample mixed PDF (some pages with text, some with images)."""
+    import fitz
+    from PIL import Image
+    import io
+
+    file_path = temp_dir / "mixed.pdf"
+    doc = fitz.open()
+
+    # Page 1: Text
+    page = doc.new_page()
+    text = "This is a text page.\n" * 20
+    page.insert_text((50, 50), text, fontsize=12)
+
+    # Page 2: Image only
+    page = doc.new_page()
+    img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+    img_bytes = io.BytesIO()
+    img.save(img_bytes, format='PNG')
+    img_bytes.seek(0)
+    rect = fitz.Rect(50, 50, 550, 750)
+    page.insert_image(rect, stream=img_bytes.read())
+
+    # Page 3: Image only
+    page = doc.new_page()
+    img = Image.new('RGB', (400, 300), color=(150, 150, 150))
+    img_bytes = io.BytesIO()
+    img.save(img_bytes, format='PNG')
+    img_bytes.seek(0)
+    page.insert_image(rect, stream=img_bytes.read())
+
+    doc.save(str(file_path))
+    doc.close()
+    return file_path
+
+
+@pytest.fixture
+def sample_image_png(temp_dir):
+    """Create a sample PNG image."""
+    from PIL import Image
+
+    file_path = temp_dir / "sample.png"
+    img = Image.new('RGB', (100, 100), color='red')
+    img.save(str(file_path))
+    return file_path
+
+
+@pytest.fixture
+def sample_image_jpg(temp_dir):
+    """Create a sample JPEG image."""
+    from PIL import Image
+
+    file_path = temp_dir / "sample.jpg"
+    img = Image.new('RGB', (100, 100), color='blue')
+    img.save(str(file_path))
+    return file_path
+
+
+@pytest.fixture
+def sample_docx(temp_dir):
+    """Create a sample DOCX file (minimal valid structure)."""
+    from zipfile import ZipFile
+
+    file_path = temp_dir / "sample.docx"
+
+    # Create minimal DOCX structure
+    with ZipFile(file_path, 'w') as zf:
+        # [Content_Types].xml
+        content_types = '''<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>'''
+        zf.writestr('[Content_Types].xml', content_types)
+
+        # _rels/.rels
+        rels = '''<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>'''
+        zf.writestr('_rels/.rels', rels)
+
+        # word/document.xml
+        document = '''<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r>
+        <w:t>Test document</w:t>
+      </w:r>
+    </w:p>
+  </w:body>
+</w:document>'''
+        zf.writestr('word/document.xml', document)
+
+    return file_path
diff --git a/backend/tests/services/__init__.py b/backend/tests/services/__init__.py
new file mode 100644
index 0000000..473ef52
--- /dev/null
+++ b/backend/tests/services/__init__.py
@@ -0,0 +1,3 @@
+"""
+Tool_OCR - Services Tests
+"""
diff --git a/backend/tests/services/test_document_type_detector.py b/backend/tests/services/test_document_type_detector.py
new file mode 100644
index 0000000..47cba3c
--- /dev/null
+++ b/backend/tests/services/test_document_type_detector.py
@@ -0,0 +1,300 @@
+"""
+Unit tests for DocumentTypeDetector service.
+
+Tests cover:
+- Various file type detection (PDF, image, Office, text)
+- PDF editability detection
+- Edge cases (file not found, unknown types)
+"""
+
+import pytest
+from pathlib import Path
+
+from app.services.document_type_detector import (
+    DocumentTypeDetector,
+    DocumentType,
+    ProcessingTrackRecommendation
+)
+
+
+class TestDocumentTypeDetector:
+    """Test suite for DocumentTypeDetector."""
+
+    @pytest.fixture
+    def detector(self):
+        """Create a detector instance with default settings."""
+        return DocumentTypeDetector()
+
+    @pytest.fixture
+    def strict_detector(self):
+        """Create a detector with strict text requirements."""
+        return DocumentTypeDetector(
+            min_text_length=200,
+            text_coverage_threshold=0.95
+        )
+
+    # ===== PDF Detection Tests =====
+
+    def test_detect_editable_pdf(self, detector, sample_pdf_editable):
+        """Test detection of editable PDF with extractable text."""
+        result = detector.detect(sample_pdf_editable)
+
+        assert result.track == "direct"
+        assert result.document_type == DocumentType.PDF_EDITABLE
+        assert result.confidence >= 0.9
+        assert "extractable text" in result.reason.lower()
+        assert result.metadata.get("total_pages") == 3
+
+    def test_detect_scanned_pdf(self, detector, sample_pdf_scanned):
+        """Test detection of scanned PDF (image-only)."""
+        result = detector.detect(sample_pdf_scanned)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.PDF_SCANNED
+        assert result.confidence >= 0.9
+        assert "scanned" in result.reason.lower()
+
+    def test_detect_mixed_pdf(self, detector, sample_pdf_mixed):
+        """Test detection of mixed PDF (text + images)."""
+        result = detector.detect(sample_pdf_mixed)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.PDF_MIXED
+        assert result.confidence >= 0.5
+        assert "mixed" in result.reason.lower()
+
+    def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable):
+        """Test that text coverage is calculated correctly."""
+        result = detector.detect(sample_pdf_editable)
+
+        metadata = result.metadata
+        assert "text_coverage" in metadata
+        assert metadata["text_coverage"] >= 0.9  # All pages have text
+
+    def test_pdf_page_details(self, detector, sample_pdf_editable):
+        """Test that page details are included in metadata."""
+        result = detector.detect(sample_pdf_editable)
+
+        metadata = result.metadata
+        assert "page_details" in metadata
+        assert len(metadata["page_details"]) == min(3, detector.sample_pages)
+
+        for page_detail in metadata["page_details"]:
+            assert "page" in page_detail
+            assert "text_length" in page_detail
+            assert "has_text" in page_detail
+            assert "image_count" in page_detail
+
+    # ===== Image Detection Tests =====
+
+    def test_detect_png_image(self, detector, sample_image_png):
+        """Test detection of PNG image file."""
+        result = detector.detect(sample_image_png)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.IMAGE
+        assert result.confidence == 1.0
+        assert "image" in result.reason.lower()
+        assert result.metadata.get("mime_type") == "image/png"
+
+    def test_detect_jpg_image(self, detector, sample_image_jpg):
+        """Test detection of JPEG image file."""
+        result = detector.detect(sample_image_jpg)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.IMAGE
+        assert result.confidence == 1.0
+        assert result.metadata.get("file_extension") == ".jpg"
+
+    # ===== Office Document Tests =====
+
+    def test_detect_docx(self, detector, sample_docx):
+        """Test detection of Word document."""
+        result = detector.detect(sample_docx)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.OFFICE_WORD
+        assert result.confidence >= 0.8
+        assert "office" in result.reason.lower() or "ocr" in result.reason.lower()
+
+    # ===== Text File Tests =====
+
+    def test_detect_text_file(self, detector, sample_text_file):
+        """Test detection of plain text file."""
+        result = detector.detect(sample_text_file)
+
+        assert result.track == "direct"
+        assert result.document_type == DocumentType.TEXT
+        assert result.confidence == 1.0
+        assert "text" in result.reason.lower()
+
+    # ===== Edge Case Tests =====
+
+    def test_file_not_found(self, detector, temp_dir):
+        """Test handling of non-existent file."""
+        non_existent = temp_dir / "does_not_exist.pdf"
+        result = detector.detect(non_existent)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.UNKNOWN
+        assert result.confidence == 0.5
+        assert "not found" in result.reason.lower()
+
+    def test_unknown_file_type(self, detector, temp_dir):
+        """Test handling of unknown file type."""
+        # Create a file with unknown content
+        unknown_file = temp_dir / "unknown.xyz"
+        unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05')
+
+        result = detector.detect(unknown_file)
+
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.UNKNOWN
+        assert result.confidence <= 0.5
+
+    def test_empty_pdf(self, detector, temp_dir):
+        """Test handling of PDF with blank pages (no content)."""
+        import fitz
+
+        empty_pdf = temp_dir / "empty.pdf"
+        doc = fitz.open()
+        # Create a blank page with no content
+        doc.new_page()
+        doc.save(str(empty_pdf))
+        doc.close()
+
+        result = detector.detect(empty_pdf)
+
+        # Blank PDF should be detected as scanned (no extractable text)
+        assert result.track == "ocr"
+        assert result.document_type == DocumentType.PDF_SCANNED
+        assert result.metadata.get("total_pages") == 1
+
+    # ===== Configuration Tests =====
+
+    def test_custom_min_text_length(self, temp_dir):
+        """Test that custom min_text_length affects detection."""
+        import fitz
+
+        # Create PDF with minimal text
+        pdf_path = temp_dir / "minimal_text.pdf"
+        doc = fitz.open()
+        page = doc.new_page()
+        page.insert_text((50, 50), "Short text")  # Only ~10 chars
+        doc.save(str(pdf_path))
+        doc.close()
+
+        # Default detector (min_text_length=100)
+        default_detector = DocumentTypeDetector()
+        result_default = default_detector.detect(pdf_path)
+
+        # Lenient detector (min_text_length=5, below the ~10 chars on the page)
+        strict_detector = DocumentTypeDetector(min_text_length=5)
+        result_strict = strict_detector.detect(pdf_path)
+
+        # With very low threshold, it should find text
+        assert result_strict.document_type in [
+            DocumentType.PDF_EDITABLE,
+            DocumentType.PDF_MIXED
+        ]
+
+    def test_sample_pages_setting(self, temp_dir):
+        """Test that sample_pages setting is respected."""
+        import fitz
+
+        # Create PDF with 10 pages
+        pdf_path = temp_dir / "many_pages.pdf"
+        doc = fitz.open()
+        for i in range(10):
+            page = doc.new_page()
+            page.insert_text((50, 50), f"Page {i + 1} content\n" * 20)
+        doc.save(str(pdf_path))
+        doc.close()
+
+        # Detector that samples only 2 pages
+        detector = DocumentTypeDetector(sample_pages=2)
+        result = detector.detect(pdf_path)
+
+        assert result.metadata["sampled_pages"] == 2
+        assert result.metadata["total_pages"] == 10
+
+    # ===== Batch Processing Tests =====
+
+    def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
+        """Test batch analysis of multiple files."""
+        files = [sample_pdf_editable, sample_image_png, sample_text_file]
+        results = detector.analyze_batch(files)
+
+        assert len(results) == 3
+        assert str(sample_pdf_editable) in results
+        assert str(sample_image_png) in results
+        assert str(sample_text_file) in results
+
+    def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
+        """Test statistics calculation from batch results."""
+        files = [sample_pdf_editable, sample_image_png, sample_text_file]
+        results = detector.analyze_batch(files)
+        stats = detector.get_statistics(results)
+
+        assert stats["total"] == 3
+        assert "by_track" in stats
+        assert stats["by_track"]["ocr"] >= 1  # At least image
+        assert stats["by_track"]["direct"] >= 1  # At least text
+        assert "confidence" in stats
+        assert stats["confidence"]["mean"] > 0
+
+    def test_get_statistics_empty(self, detector):
+        """Test statistics with empty results."""
+        stats = detector.get_statistics({})
+        assert stats == {"total": 0}
+
+    # ===== Recommendation Object Tests =====
+
+    def test_recommendation_to_dict(self, detector, sample_pdf_editable):
+        """Test ProcessingTrackRecommendation.to_dict() method."""
+        result = detector.detect(sample_pdf_editable)
+        result_dict = result.to_dict()
+
+        assert "recommended_track" in result_dict
+        assert "confidence" in result_dict
+        assert "reason" in result_dict
+        assert "document_type" in result_dict
+        assert "metadata" in result_dict
+
+        assert result_dict["recommended_track"] == result.track
+        assert result_dict["confidence"] == result.confidence
+
+    def test_recommendation_metadata_types(self, detector, sample_pdf_editable):
+        """Test that metadata contains correct types."""
+        result = detector.detect(sample_pdf_editable)
+
+        assert isinstance(result.track, str)
+        assert isinstance(result.confidence, float)
+        assert isinstance(result.reason, str)
+        assert isinstance(result.document_type, DocumentType)
+        assert isinstance(result.metadata, dict)
+
+    # ===== Performance Tests =====
+
+    def test_large_pdf_sampling(self, detector, temp_dir):
+        """Test that large PDFs are sampled efficiently."""
+        import fitz
+        import time
+
+        # Create a large PDF (20 pages)
+        large_pdf = temp_dir / "large.pdf"
+        doc = fitz.open()
+        for i in range(20):
+            page = doc.new_page()
+            page.insert_text((50, 50), f"Page {i + 1}\n" * 50)
+        doc.save(str(large_pdf))
+        doc.close()
+
+        # Detection should be fast due to sampling
+        start_time = time.time()
+        result = detector.detect(large_pdf)
+        elapsed = time.time() - start_time
+
+        assert elapsed < 5.0  # Should complete within 5 seconds
+        assert result.metadata["sampled_pages"] <= detector.sample_pages
+        assert result.metadata["total_pages"] == 20
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index 6b5f8f3..3c20881 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -5,7 +5,7 @@
   - [x] 1.1.1 Add PyMuPDF>=1.23.0
   - [x] 1.1.2 Add pdfplumber>=0.10.0
   - [x] 1.1.3 Add python-magic-bin>=0.4.14
-  - [ ] 1.1.4 Test dependency installation
+  - [x] 1.1.4 Test dependency installation
 - [x] 1.2 Create UnifiedDocument model in backend/app/models/
   - [x] 1.2.1 Define UnifiedDocument dataclass
   - [x] 1.2.2 Add DocumentElement model
@@ -17,7 +17,7 @@
   - [x] 1.3.2 Add PDF editability checking logic
   - [x] 1.3.3 Add Office document detection
   - [x] 1.3.4 Create routing logic to determine processing track
-  - [ ] 1.3.5 Add unit tests for detector
+  - [x] 1.3.5 Add unit tests for detector
 
 ## 2. Direct Extraction Track
 - [x] 2.1 Create DirectExtractionEngine service