test: add unit tests for DocumentTypeDetector

- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-19 12:14:59 +08:00
parent 1d0b63854a
commit 0fcb2492c9
6 changed files with 486 additions and 2 deletions

2
.gitignore vendored
View File

@@ -97,4 +97,6 @@ storage/results/*
*.log *.log
__pycache__/ __pycache__/
*.bak *.bak
# Ignore temporary test files in root, but allow backend/tests/
test_*.py test_*.py
!backend/tests/**/test_*.py

View File

@@ -0,0 +1,3 @@
"""
Tool_OCR - Test Suite
"""

176
backend/tests/conftest.py Normal file
View File

@@ -0,0 +1,176 @@
"""
Pytest configuration and fixtures for Tool_OCR tests.
"""
import pytest
import tempfile
import os
from pathlib import Path
# Add project root to path
import sys
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
@pytest.fixture
def temp_dir():
"""Create a temporary directory for test files."""
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)
@pytest.fixture
def sample_text_file(temp_dir):
"""Create a sample text file."""
file_path = temp_dir / "sample.txt"
file_path.write_text("This is a sample text file for testing purposes.\n" * 10)
return file_path
@pytest.fixture
def sample_pdf_editable(temp_dir):
"""Create a sample editable PDF with text content."""
import fitz
file_path = temp_dir / "editable.pdf"
doc = fitz.open()
# Create 3 pages with text
for i in range(3):
page = doc.new_page()
text = f"This is page {i + 1} of an editable PDF document.\n" * 20
page.insert_text((50, 50), text, fontsize=12)
doc.save(str(file_path))
doc.close()
return file_path
@pytest.fixture
def sample_pdf_scanned(temp_dir):
"""Create a sample scanned PDF (image-only, no text)."""
import fitz
from PIL import Image
import io
file_path = temp_dir / "scanned.pdf"
doc = fitz.open()
# Create 3 pages with only images (simulating scanned pages)
for i in range(3):
page = doc.new_page()
# Create a simple image
img = Image.new('RGB', (400, 300), color=(200, 200, 200))
img_bytes = io.BytesIO()
img.save(img_bytes, format='PNG')
img_bytes.seek(0)
# Insert image covering most of the page
rect = fitz.Rect(50, 50, 550, 750)
page.insert_image(rect, stream=img_bytes.read())
doc.save(str(file_path))
doc.close()
return file_path
@pytest.fixture
def sample_pdf_mixed(temp_dir):
"""Create a sample mixed PDF (some pages with text, some with images)."""
import fitz
from PIL import Image
import io
file_path = temp_dir / "mixed.pdf"
doc = fitz.open()
# Page 1: Text
page = doc.new_page()
text = "This is a text page.\n" * 20
page.insert_text((50, 50), text, fontsize=12)
# Page 2: Image only
page = doc.new_page()
img = Image.new('RGB', (400, 300), color=(200, 200, 200))
img_bytes = io.BytesIO()
img.save(img_bytes, format='PNG')
img_bytes.seek(0)
rect = fitz.Rect(50, 50, 550, 750)
page.insert_image(rect, stream=img_bytes.read())
# Page 3: Image only
page = doc.new_page()
img = Image.new('RGB', (400, 300), color=(150, 150, 150))
img_bytes = io.BytesIO()
img.save(img_bytes, format='PNG')
img_bytes.seek(0)
page.insert_image(rect, stream=img_bytes.read())
doc.save(str(file_path))
doc.close()
return file_path
@pytest.fixture
def sample_image_png(temp_dir):
"""Create a sample PNG image."""
from PIL import Image
file_path = temp_dir / "sample.png"
img = Image.new('RGB', (100, 100), color='red')
img.save(str(file_path))
return file_path
@pytest.fixture
def sample_image_jpg(temp_dir):
"""Create a sample JPEG image."""
from PIL import Image
file_path = temp_dir / "sample.jpg"
img = Image.new('RGB', (100, 100), color='blue')
img.save(str(file_path))
return file_path
@pytest.fixture
def sample_docx(temp_dir):
"""Create a sample DOCX file (minimal valid structure)."""
from zipfile import ZipFile
file_path = temp_dir / "sample.docx"
# Create minimal DOCX structure
with ZipFile(file_path, 'w') as zf:
# [Content_Types].xml
content_types = '''<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>'''
zf.writestr('[Content_Types].xml', content_types)
# _rels/.rels
rels = '''<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>'''
zf.writestr('_rels/.rels', rels)
# word/document.xml
document = '''<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Test document</w:t>
</w:r>
</w:p>
</w:body>
</w:document>'''
zf.writestr('word/document.xml', document)
return file_path

View File

@@ -0,0 +1,3 @@
"""
Tool_OCR - Services Tests
"""

View File

@@ -0,0 +1,300 @@
"""
Unit tests for DocumentTypeDetector service.
Tests cover:
- Various file type detection (PDF, image, Office, text)
- PDF editability detection
- Edge cases (file not found, unknown types)
"""
import pytest
from pathlib import Path
from app.services.document_type_detector import (
DocumentTypeDetector,
DocumentType,
ProcessingTrackRecommendation
)
class TestDocumentTypeDetector:
"""Test suite for DocumentTypeDetector."""
@pytest.fixture
def detector(self):
"""Create a detector instance with default settings."""
return DocumentTypeDetector()
@pytest.fixture
def strict_detector(self):
"""Create a detector with strict text requirements."""
return DocumentTypeDetector(
min_text_length=200,
text_coverage_threshold=0.95
)
# ===== PDF Detection Tests =====
def test_detect_editable_pdf(self, detector, sample_pdf_editable):
"""Test detection of editable PDF with extractable text."""
result = detector.detect(sample_pdf_editable)
assert result.track == "direct"
assert result.document_type == DocumentType.PDF_EDITABLE
assert result.confidence >= 0.9
assert "extractable text" in result.reason.lower()
assert result.metadata.get("total_pages") == 3
def test_detect_scanned_pdf(self, detector, sample_pdf_scanned):
"""Test detection of scanned PDF (image-only)."""
result = detector.detect(sample_pdf_scanned)
assert result.track == "ocr"
assert result.document_type == DocumentType.PDF_SCANNED
assert result.confidence >= 0.9
assert "scanned" in result.reason.lower()
def test_detect_mixed_pdf(self, detector, sample_pdf_mixed):
"""Test detection of mixed PDF (text + images)."""
result = detector.detect(sample_pdf_mixed)
assert result.track == "ocr"
assert result.document_type == DocumentType.PDF_MIXED
assert result.confidence >= 0.5
assert "mixed" in result.reason.lower()
def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable):
"""Test that text coverage is calculated correctly."""
result = detector.detect(sample_pdf_editable)
metadata = result.metadata
assert "text_coverage" in metadata
assert metadata["text_coverage"] >= 0.9 # All pages have text
def test_pdf_page_details(self, detector, sample_pdf_editable):
"""Test that page details are included in metadata."""
result = detector.detect(sample_pdf_editable)
metadata = result.metadata
assert "page_details" in metadata
assert len(metadata["page_details"]) == min(3, detector.sample_pages)
for page_detail in metadata["page_details"]:
assert "page" in page_detail
assert "text_length" in page_detail
assert "has_text" in page_detail
assert "image_count" in page_detail
# ===== Image Detection Tests =====
def test_detect_png_image(self, detector, sample_image_png):
"""Test detection of PNG image file."""
result = detector.detect(sample_image_png)
assert result.track == "ocr"
assert result.document_type == DocumentType.IMAGE
assert result.confidence == 1.0
assert "image" in result.reason.lower()
assert result.metadata.get("mime_type") == "image/png"
def test_detect_jpg_image(self, detector, sample_image_jpg):
"""Test detection of JPEG image file."""
result = detector.detect(sample_image_jpg)
assert result.track == "ocr"
assert result.document_type == DocumentType.IMAGE
assert result.confidence == 1.0
assert result.metadata.get("file_extension") == ".jpg"
# ===== Office Document Tests =====
def test_detect_docx(self, detector, sample_docx):
"""Test detection of Word document."""
result = detector.detect(sample_docx)
assert result.track == "ocr"
assert result.document_type == DocumentType.OFFICE_WORD
assert result.confidence >= 0.8
assert "office" in result.reason.lower() or "ocr" in result.reason.lower()
# ===== Text File Tests =====
def test_detect_text_file(self, detector, sample_text_file):
"""Test detection of plain text file."""
result = detector.detect(sample_text_file)
assert result.track == "direct"
assert result.document_type == DocumentType.TEXT
assert result.confidence == 1.0
assert "text" in result.reason.lower()
# ===== Edge Case Tests =====
def test_file_not_found(self, detector, temp_dir):
"""Test handling of non-existent file."""
non_existent = temp_dir / "does_not_exist.pdf"
result = detector.detect(non_existent)
assert result.track == "ocr"
assert result.document_type == DocumentType.UNKNOWN
assert result.confidence == 0.5
assert "not found" in result.reason.lower()
def test_unknown_file_type(self, detector, temp_dir):
"""Test handling of unknown file type."""
# Create a file with unknown content
unknown_file = temp_dir / "unknown.xyz"
unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05')
result = detector.detect(unknown_file)
assert result.track == "ocr"
assert result.document_type == DocumentType.UNKNOWN
assert result.confidence <= 0.5
def test_empty_pdf(self, detector, temp_dir):
"""Test handling of PDF with blank pages (no content)."""
import fitz
empty_pdf = temp_dir / "empty.pdf"
doc = fitz.open()
# Create a blank page with no content
doc.new_page()
doc.save(str(empty_pdf))
doc.close()
result = detector.detect(empty_pdf)
# Blank PDF should be detected as scanned (no extractable text)
assert result.track == "ocr"
assert result.document_type == DocumentType.PDF_SCANNED
assert result.metadata.get("total_pages") == 1
# ===== Configuration Tests =====
def test_custom_min_text_length(self, temp_dir):
"""Test that custom min_text_length affects detection."""
import fitz
# Create PDF with minimal text
pdf_path = temp_dir / "minimal_text.pdf"
doc = fitz.open()
page = doc.new_page()
page.insert_text((50, 50), "Short text") # Only ~10 chars
doc.save(str(pdf_path))
doc.close()
# Default detector (min_text_length=100)
default_detector = DocumentTypeDetector()
result_default = default_detector.detect(pdf_path)
# Strict detector (min_text_length=200)
strict_detector = DocumentTypeDetector(min_text_length=5)
result_strict = strict_detector.detect(pdf_path)
# With very low threshold, it should find text
assert result_strict.document_type in [
DocumentType.PDF_EDITABLE,
DocumentType.PDF_MIXED
]
def test_sample_pages_setting(self, temp_dir):
"""Test that sample_pages setting is respected."""
import fitz
# Create PDF with 10 pages
pdf_path = temp_dir / "many_pages.pdf"
doc = fitz.open()
for i in range(10):
page = doc.new_page()
page.insert_text((50, 50), f"Page {i + 1} content\n" * 20)
doc.save(str(pdf_path))
doc.close()
# Detector that samples only 2 pages
detector = DocumentTypeDetector(sample_pages=2)
result = detector.detect(pdf_path)
assert result.metadata["sampled_pages"] == 2
assert result.metadata["total_pages"] == 10
# ===== Batch Processing Tests =====
def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
"""Test batch analysis of multiple files."""
files = [sample_pdf_editable, sample_image_png, sample_text_file]
results = detector.analyze_batch(files)
assert len(results) == 3
assert str(sample_pdf_editable) in results
assert str(sample_image_png) in results
assert str(sample_text_file) in results
def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
"""Test statistics calculation from batch results."""
files = [sample_pdf_editable, sample_image_png, sample_text_file]
results = detector.analyze_batch(files)
stats = detector.get_statistics(results)
assert stats["total"] == 3
assert "by_track" in stats
assert stats["by_track"]["ocr"] >= 1 # At least image
assert stats["by_track"]["direct"] >= 1 # At least text
assert "confidence" in stats
assert stats["confidence"]["mean"] > 0
def test_get_statistics_empty(self, detector):
"""Test statistics with empty results."""
stats = detector.get_statistics({})
assert stats == {"total": 0}
# ===== Recommendation Object Tests =====
def test_recommendation_to_dict(self, detector, sample_pdf_editable):
"""Test ProcessingTrackRecommendation.to_dict() method."""
result = detector.detect(sample_pdf_editable)
result_dict = result.to_dict()
assert "recommended_track" in result_dict
assert "confidence" in result_dict
assert "reason" in result_dict
assert "document_type" in result_dict
assert "metadata" in result_dict
assert result_dict["recommended_track"] == result.track
assert result_dict["confidence"] == result.confidence
def test_recommendation_metadata_types(self, detector, sample_pdf_editable):
"""Test that metadata contains correct types."""
result = detector.detect(sample_pdf_editable)
assert isinstance(result.track, str)
assert isinstance(result.confidence, float)
assert isinstance(result.reason, str)
assert isinstance(result.document_type, DocumentType)
assert isinstance(result.metadata, dict)
# ===== Performance Tests =====
def test_large_pdf_sampling(self, detector, temp_dir):
"""Test that large PDFs are sampled efficiently."""
import fitz
import time
# Create a large PDF (20 pages)
large_pdf = temp_dir / "large.pdf"
doc = fitz.open()
for i in range(20):
page = doc.new_page()
page.insert_text((50, 50), f"Page {i + 1}\n" * 50)
doc.save(str(large_pdf))
doc.close()
# Detection should be fast due to sampling
start_time = time.time()
result = detector.detect(large_pdf)
elapsed = time.time() - start_time
assert elapsed < 5.0 # Should complete within 5 seconds
assert result.metadata["sampled_pages"] <= detector.sample_pages
assert result.metadata["total_pages"] == 20

View File

@@ -5,7 +5,7 @@
- [x] 1.1.1 Add PyMuPDF>=1.23.0 - [x] 1.1.1 Add PyMuPDF>=1.23.0
- [x] 1.1.2 Add pdfplumber>=0.10.0 - [x] 1.1.2 Add pdfplumber>=0.10.0
- [x] 1.1.3 Add python-magic-bin>=0.4.14 - [x] 1.1.3 Add python-magic-bin>=0.4.14
- [ ] 1.1.4 Test dependency installation - [x] 1.1.4 Test dependency installation
- [x] 1.2 Create UnifiedDocument model in backend/app/models/ - [x] 1.2 Create UnifiedDocument model in backend/app/models/
- [x] 1.2.1 Define UnifiedDocument dataclass - [x] 1.2.1 Define UnifiedDocument dataclass
- [x] 1.2.2 Add DocumentElement model - [x] 1.2.2 Add DocumentElement model
@@ -17,7 +17,7 @@
- [x] 1.3.2 Add PDF editability checking logic - [x] 1.3.2 Add PDF editability checking logic
- [x] 1.3.3 Add Office document detection - [x] 1.3.3 Add Office document detection
- [x] 1.3.4 Create routing logic to determine processing track - [x] 1.3.4 Create routing logic to determine processing track
- [ ] 1.3.5 Add unit tests for detector - [x] 1.3.5 Add unit tests for detector
## 2. Direct Extraction Track ## 2. Direct Extraction Track
- [x] 2.1 Create DirectExtractionEngine service - [x] 2.1 Create DirectExtractionEngine service