diff --git a/.gitignore b/.gitignore
index 1bb1526..665b7bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,4 +97,6 @@ storage/results/*
*.log
__pycache__/
*.bak
+# Ignore ad-hoc test_*.py scratch files in any directory (the pattern is unanchored), but keep the suite under backend/tests/
test_*.py
+!backend/tests/**/test_*.py
diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py
new file mode 100644
index 0000000..514ed4e
--- /dev/null
+++ b/backend/tests/__init__.py
@@ -0,0 +1,3 @@
+"""
+Tool_OCR - Test Suite
+"""
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
new file mode 100644
index 0000000..fed0ab0
--- /dev/null
+++ b/backend/tests/conftest.py
@@ -0,0 +1,176 @@
+"""
+Pytest configuration and fixtures for Tool_OCR tests.
+"""
+
+import pytest
+import tempfile
+import os
+from pathlib import Path
+
+# Put backend/ (the parent of this tests/ directory) first on sys.path so `app.*` imports resolve
+import sys
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+
+@pytest.fixture
+def temp_dir():
+ """Create a temporary directory for test files."""
+ with tempfile.TemporaryDirectory() as tmpdir:
+ yield Path(tmpdir)
+
+
+@pytest.fixture
+def sample_text_file(temp_dir):
+ """Create a sample text file."""
+ file_path = temp_dir / "sample.txt"
+ file_path.write_text("This is a sample text file for testing purposes.\n" * 10)
+ return file_path
+
+
+@pytest.fixture
+def sample_pdf_editable(temp_dir):
+ """Create a sample editable PDF with text content."""
+ import fitz
+
+ file_path = temp_dir / "editable.pdf"
+ doc = fitz.open()
+
+ # Create 3 pages with text
+ for i in range(3):
+ page = doc.new_page()
+ text = f"This is page {i + 1} of an editable PDF document.\n" * 20
+ page.insert_text((50, 50), text, fontsize=12)
+
+ doc.save(str(file_path))
+ doc.close()
+ return file_path
+
+
+@pytest.fixture
+def sample_pdf_scanned(temp_dir):
+ """Create a sample scanned PDF (image-only, no text)."""
+ import fitz
+ from PIL import Image
+ import io
+
+ file_path = temp_dir / "scanned.pdf"
+ doc = fitz.open()
+
+ # Create 3 pages with only images (simulating scanned pages)
+ for i in range(3):
+ page = doc.new_page()
+
+ # Create a simple image
+ img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+ img_bytes = io.BytesIO()
+ img.save(img_bytes, format='PNG')
+ img_bytes.seek(0)
+
+ # Insert image covering most of the page
+ rect = fitz.Rect(50, 50, 550, 750)
+ page.insert_image(rect, stream=img_bytes.read())
+
+ doc.save(str(file_path))
+ doc.close()
+ return file_path
+
+
+@pytest.fixture
+def sample_pdf_mixed(temp_dir):
+ """Create a sample mixed PDF (some pages with text, some with images)."""
+ import fitz
+ from PIL import Image
+ import io
+
+ file_path = temp_dir / "mixed.pdf"
+ doc = fitz.open()
+
+ # Page 1: Text
+ page = doc.new_page()
+ text = "This is a text page.\n" * 20
+ page.insert_text((50, 50), text, fontsize=12)
+
+ # Page 2: Image only
+ page = doc.new_page()
+ img = Image.new('RGB', (400, 300), color=(200, 200, 200))
+ img_bytes = io.BytesIO()
+ img.save(img_bytes, format='PNG')
+ img_bytes.seek(0)
+ rect = fitz.Rect(50, 50, 550, 750)
+ page.insert_image(rect, stream=img_bytes.read())
+
+ # Page 3: Image only
+ page = doc.new_page()
+ img = Image.new('RGB', (400, 300), color=(150, 150, 150))
+ img_bytes = io.BytesIO()
+ img.save(img_bytes, format='PNG')
+ img_bytes.seek(0)
+ page.insert_image(rect, stream=img_bytes.read())
+
+ doc.save(str(file_path))
+ doc.close()
+ return file_path
+
+
+@pytest.fixture
+def sample_image_png(temp_dir):
+ """Create a sample PNG image."""
+ from PIL import Image
+
+ file_path = temp_dir / "sample.png"
+ img = Image.new('RGB', (100, 100), color='red')
+ img.save(str(file_path))
+ return file_path
+
+
+@pytest.fixture
+def sample_image_jpg(temp_dir):
+ """Create a sample JPEG image."""
+ from PIL import Image
+
+ file_path = temp_dir / "sample.jpg"
+ img = Image.new('RGB', (100, 100), color='blue')
+ img.save(str(file_path))
+ return file_path
+
+
+@pytest.fixture
+def sample_docx(temp_dir):
+ """Create a sample DOCX file (minimal valid structure)."""
+ from zipfile import ZipFile
+
+ file_path = temp_dir / "sample.docx"
+
+ # Create minimal DOCX structure
+ with ZipFile(file_path, 'w') as zf:
+ # [Content_Types].xml
+ content_types = '''
+
+
+
+
+'''
+ zf.writestr('[Content_Types].xml', content_types)
+
+ # _rels/.rels
+ rels = '''
+
+
+'''
+ zf.writestr('_rels/.rels', rels)
+
+ # word/document.xml
+ document = '''
+
+
+
+
+ Test document
+
+
+
+'''
+ zf.writestr('word/document.xml', document)
+
+ return file_path
diff --git a/backend/tests/services/__init__.py b/backend/tests/services/__init__.py
new file mode 100644
index 0000000..473ef52
--- /dev/null
+++ b/backend/tests/services/__init__.py
@@ -0,0 +1,3 @@
+"""
+Tool_OCR - Services Tests
+"""
diff --git a/backend/tests/services/test_document_type_detector.py b/backend/tests/services/test_document_type_detector.py
new file mode 100644
index 0000000..47cba3c
--- /dev/null
+++ b/backend/tests/services/test_document_type_detector.py
@@ -0,0 +1,300 @@
+"""
+Unit tests for DocumentTypeDetector service.
+
+Tests cover:
+- Various file type detection (PDF, image, Office, text)
+- PDF editability detection
+- Edge cases (file not found, unknown types)
+"""
+
+import pytest
+from pathlib import Path
+
+from app.services.document_type_detector import (
+ DocumentTypeDetector,
+ DocumentType,
+ ProcessingTrackRecommendation
+)
+
+
+class TestDocumentTypeDetector:
+ """Test suite for DocumentTypeDetector."""
+
+    @pytest.fixture
+    def detector(self):
+        """Provide a DocumentTypeDetector constructed with no arguments (its defaults)."""
+        return DocumentTypeDetector()
+
+ @pytest.fixture
+ def strict_detector(self):
+ """Create a detector with strict text requirements."""
+ return DocumentTypeDetector(
+ min_text_length=200,
+ text_coverage_threshold=0.95
+ )
+
+ # ===== PDF Detection Tests =====
+
+ def test_detect_editable_pdf(self, detector, sample_pdf_editable):
+ """Test detection of editable PDF with extractable text."""
+ result = detector.detect(sample_pdf_editable)
+
+ assert result.track == "direct"
+ assert result.document_type == DocumentType.PDF_EDITABLE
+ assert result.confidence >= 0.9
+ assert "extractable text" in result.reason.lower()
+ assert result.metadata.get("total_pages") == 3
+
+ def test_detect_scanned_pdf(self, detector, sample_pdf_scanned):
+ """Test detection of scanned PDF (image-only)."""
+ result = detector.detect(sample_pdf_scanned)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.PDF_SCANNED
+ assert result.confidence >= 0.9
+ assert "scanned" in result.reason.lower()
+
+ def test_detect_mixed_pdf(self, detector, sample_pdf_mixed):
+ """Test detection of mixed PDF (text + images)."""
+ result = detector.detect(sample_pdf_mixed)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.PDF_MIXED
+ assert result.confidence >= 0.5
+ assert "mixed" in result.reason.lower()
+
+ def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable):
+ """Test that text coverage is calculated correctly."""
+ result = detector.detect(sample_pdf_editable)
+
+ metadata = result.metadata
+ assert "text_coverage" in metadata
+ assert metadata["text_coverage"] >= 0.9 # All pages have text
+
+ def test_pdf_page_details(self, detector, sample_pdf_editable):
+ """Test that page details are included in metadata."""
+ result = detector.detect(sample_pdf_editable)
+
+ metadata = result.metadata
+ assert "page_details" in metadata
+ assert len(metadata["page_details"]) == min(3, detector.sample_pages)
+
+ for page_detail in metadata["page_details"]:
+ assert "page" in page_detail
+ assert "text_length" in page_detail
+ assert "has_text" in page_detail
+ assert "image_count" in page_detail
+
+ # ===== Image Detection Tests =====
+
+ def test_detect_png_image(self, detector, sample_image_png):
+ """Test detection of PNG image file."""
+ result = detector.detect(sample_image_png)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.IMAGE
+ assert result.confidence == 1.0
+ assert "image" in result.reason.lower()
+ assert result.metadata.get("mime_type") == "image/png"
+
+ def test_detect_jpg_image(self, detector, sample_image_jpg):
+ """Test detection of JPEG image file."""
+ result = detector.detect(sample_image_jpg)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.IMAGE
+ assert result.confidence == 1.0
+ assert result.metadata.get("file_extension") == ".jpg"
+
+ # ===== Office Document Tests =====
+
+ def test_detect_docx(self, detector, sample_docx):
+ """Test detection of Word document."""
+ result = detector.detect(sample_docx)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.OFFICE_WORD
+ assert result.confidence >= 0.8
+ assert "office" in result.reason.lower() or "ocr" in result.reason.lower()
+
+ # ===== Text File Tests =====
+
+ def test_detect_text_file(self, detector, sample_text_file):
+ """Test detection of plain text file."""
+ result = detector.detect(sample_text_file)
+
+ assert result.track == "direct"
+ assert result.document_type == DocumentType.TEXT
+ assert result.confidence == 1.0
+ assert "text" in result.reason.lower()
+
+ # ===== Edge Case Tests =====
+
+ def test_file_not_found(self, detector, temp_dir):
+ """Test handling of non-existent file."""
+ non_existent = temp_dir / "does_not_exist.pdf"
+ result = detector.detect(non_existent)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.UNKNOWN
+ assert result.confidence == 0.5
+ assert "not found" in result.reason.lower()
+
+ def test_unknown_file_type(self, detector, temp_dir):
+ """Test handling of unknown file type."""
+ # Create a file with unknown content
+ unknown_file = temp_dir / "unknown.xyz"
+ unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05')
+
+ result = detector.detect(unknown_file)
+
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.UNKNOWN
+ assert result.confidence <= 0.5
+
+ def test_empty_pdf(self, detector, temp_dir):
+ """Test handling of PDF with blank pages (no content)."""
+ import fitz
+
+ empty_pdf = temp_dir / "empty.pdf"
+ doc = fitz.open()
+ # Create a blank page with no content
+ doc.new_page()
+ doc.save(str(empty_pdf))
+ doc.close()
+
+ result = detector.detect(empty_pdf)
+
+ # Blank PDF should be detected as scanned (no extractable text)
+ assert result.track == "ocr"
+ assert result.document_type == DocumentType.PDF_SCANNED
+ assert result.metadata.get("total_pages") == 1
+
+ # ===== Configuration Tests =====
+
+ def test_custom_min_text_length(self, temp_dir):
+ """Test that custom min_text_length affects detection."""
+ import fitz
+
+ # Create PDF with minimal text
+ pdf_path = temp_dir / "minimal_text.pdf"
+ doc = fitz.open()
+ page = doc.new_page()
+ page.insert_text((50, 50), "Short text") # Only ~10 chars
+ doc.save(str(pdf_path))
+ doc.close()
+
+ # Default detector (min_text_length=100)
+ default_detector = DocumentTypeDetector()
+ result_default = default_detector.detect(pdf_path)
+
+ # Strict detector (min_text_length=200)
+ strict_detector = DocumentTypeDetector(min_text_length=5)
+ result_strict = strict_detector.detect(pdf_path)
+
+ # With very low threshold, it should find text
+ assert result_strict.document_type in [
+ DocumentType.PDF_EDITABLE,
+ DocumentType.PDF_MIXED
+ ]
+
+ def test_sample_pages_setting(self, temp_dir):
+ """Test that sample_pages setting is respected."""
+ import fitz
+
+ # Create PDF with 10 pages
+ pdf_path = temp_dir / "many_pages.pdf"
+ doc = fitz.open()
+ for i in range(10):
+ page = doc.new_page()
+ page.insert_text((50, 50), f"Page {i + 1} content\n" * 20)
+ doc.save(str(pdf_path))
+ doc.close()
+
+ # Detector that samples only 2 pages
+ detector = DocumentTypeDetector(sample_pages=2)
+ result = detector.detect(pdf_path)
+
+ assert result.metadata["sampled_pages"] == 2
+ assert result.metadata["total_pages"] == 10
+
+ # ===== Batch Processing Tests =====
+
+ def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
+ """Test batch analysis of multiple files."""
+ files = [sample_pdf_editable, sample_image_png, sample_text_file]
+ results = detector.analyze_batch(files)
+
+ assert len(results) == 3
+ assert str(sample_pdf_editable) in results
+ assert str(sample_image_png) in results
+ assert str(sample_text_file) in results
+
+ def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
+ """Test statistics calculation from batch results."""
+ files = [sample_pdf_editable, sample_image_png, sample_text_file]
+ results = detector.analyze_batch(files)
+ stats = detector.get_statistics(results)
+
+ assert stats["total"] == 3
+ assert "by_track" in stats
+ assert stats["by_track"]["ocr"] >= 1 # At least image
+ assert stats["by_track"]["direct"] >= 1 # At least text
+ assert "confidence" in stats
+ assert stats["confidence"]["mean"] > 0
+
+ def test_get_statistics_empty(self, detector):
+ """Test statistics with empty results."""
+ stats = detector.get_statistics({})
+ assert stats == {"total": 0}
+
+ # ===== Recommendation Object Tests =====
+
+ def test_recommendation_to_dict(self, detector, sample_pdf_editable):
+ """Test ProcessingTrackRecommendation.to_dict() method."""
+ result = detector.detect(sample_pdf_editable)
+ result_dict = result.to_dict()
+
+ assert "recommended_track" in result_dict
+ assert "confidence" in result_dict
+ assert "reason" in result_dict
+ assert "document_type" in result_dict
+ assert "metadata" in result_dict
+
+ assert result_dict["recommended_track"] == result.track
+ assert result_dict["confidence"] == result.confidence
+
+ def test_recommendation_metadata_types(self, detector, sample_pdf_editable):
+ """Test that metadata contains correct types."""
+ result = detector.detect(sample_pdf_editable)
+
+ assert isinstance(result.track, str)
+ assert isinstance(result.confidence, float)
+ assert isinstance(result.reason, str)
+ assert isinstance(result.document_type, DocumentType)
+ assert isinstance(result.metadata, dict)
+
+ # ===== Performance Tests =====
+
+ def test_large_pdf_sampling(self, detector, temp_dir):
+ """Test that large PDFs are sampled efficiently."""
+ import fitz
+ import time
+
+ # Create a large PDF (20 pages)
+ large_pdf = temp_dir / "large.pdf"
+ doc = fitz.open()
+ for i in range(20):
+ page = doc.new_page()
+ page.insert_text((50, 50), f"Page {i + 1}\n" * 50)
+ doc.save(str(large_pdf))
+ doc.close()
+
+ # Detection should be fast due to sampling
+ start_time = time.time()
+ result = detector.detect(large_pdf)
+ elapsed = time.time() - start_time
+
+ assert elapsed < 5.0 # Should complete within 5 seconds
+ assert result.metadata["sampled_pages"] <= detector.sample_pages
+ assert result.metadata["total_pages"] == 20
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index 6b5f8f3..3c20881 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -5,7 +5,7 @@
- [x] 1.1.1 Add PyMuPDF>=1.23.0
- [x] 1.1.2 Add pdfplumber>=0.10.0
- [x] 1.1.3 Add python-magic-bin>=0.4.14
- - [ ] 1.1.4 Test dependency installation
+ - [x] 1.1.4 Test dependency installation
- [x] 1.2 Create UnifiedDocument model in backend/app/models/
- [x] 1.2.1 Define UnifiedDocument dataclass
- [x] 1.2.2 Add DocumentElement model
@@ -17,7 +17,7 @@
- [x] 1.3.2 Add PDF editability checking logic
- [x] 1.3.3 Add Office document detection
- [x] 1.3.4 Create routing logic to determine processing track
- - [ ] 1.3.5 Add unit tests for detector
+ - [x] 1.3.5 Add unit tests for detector
## 2. Direct Extraction Track
- [x] 2.1 Create DirectExtractionEngine service