From 0fcb2492c906662722b2621758a4c1e058b19d98 Mon Sep 17 00:00:00 2001 From: egg Date: Wed, 19 Nov 2025 12:14:59 +0800 Subject: [PATCH] test: add unit tests for DocumentTypeDetector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create test directory structure for backend - Add pytest fixtures for test files (PDF, images, Office docs) - Add 20 unit tests covering: - PDF type detection (editable, scanned, mixed) - Image file detection (PNG, JPG) - Office document detection (DOCX) - Text file detection - Edge cases (file not found, unknown types) - Batch processing and statistics - Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .gitignore | 2 + backend/tests/__init__.py | 3 + backend/tests/conftest.py | 176 ++++++++++ backend/tests/services/__init__.py | 3 + .../services/test_document_type_detector.py | 300 ++++++++++++++++++ .../dual-track-document-processing/tasks.md | 4 +- 6 files changed, 486 insertions(+), 2 deletions(-) create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/conftest.py create mode 100644 backend/tests/services/__init__.py create mode 100644 backend/tests/services/test_document_type_detector.py diff --git a/.gitignore b/.gitignore index 1bb1526..665b7bb 100644 --- a/.gitignore +++ b/.gitignore @@ -97,4 +97,6 @@ storage/results/* *.log __pycache__/ *.bak +# Ignore temporary test files in root, but allow backend/tests/ test_*.py +!backend/tests/**/test_*.py diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..514ed4e --- /dev/null +++ b/backend/tests/__init__.py @@ -0,0 +1,3 @@ +""" +Tool_OCR - Test Suite +""" diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 0000000..fed0ab0 --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,176 @@ +""" +Pytest configuration and fixtures for Tool_OCR 
tests. +""" + +import pytest +import tempfile +import os +from pathlib import Path + +# Add project root to path +import sys +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + + +@pytest.fixture +def temp_dir(): + """Create a temporary directory for test files.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +@pytest.fixture +def sample_text_file(temp_dir): + """Create a sample text file.""" + file_path = temp_dir / "sample.txt" + file_path.write_text("This is a sample text file for testing purposes.\n" * 10) + return file_path + + +@pytest.fixture +def sample_pdf_editable(temp_dir): + """Create a sample editable PDF with text content.""" + import fitz + + file_path = temp_dir / "editable.pdf" + doc = fitz.open() + + # Create 3 pages with text + for i in range(3): + page = doc.new_page() + text = f"This is page {i + 1} of an editable PDF document.\n" * 20 + page.insert_text((50, 50), text, fontsize=12) + + doc.save(str(file_path)) + doc.close() + return file_path + + +@pytest.fixture +def sample_pdf_scanned(temp_dir): + """Create a sample scanned PDF (image-only, no text).""" + import fitz + from PIL import Image + import io + + file_path = temp_dir / "scanned.pdf" + doc = fitz.open() + + # Create 3 pages with only images (simulating scanned pages) + for i in range(3): + page = doc.new_page() + + # Create a simple image + img = Image.new('RGB', (400, 300), color=(200, 200, 200)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + img_bytes.seek(0) + + # Insert image covering most of the page + rect = fitz.Rect(50, 50, 550, 750) + page.insert_image(rect, stream=img_bytes.read()) + + doc.save(str(file_path)) + doc.close() + return file_path + + +@pytest.fixture +def sample_pdf_mixed(temp_dir): + """Create a sample mixed PDF (some pages with text, some with images).""" + import fitz + from PIL import Image + import io + + file_path = temp_dir / "mixed.pdf" + doc = fitz.open() + + # Page 1: 
Text + page = doc.new_page() + text = "This is a text page.\n" * 20 + page.insert_text((50, 50), text, fontsize=12) + + # Page 2: Image only + page = doc.new_page() + img = Image.new('RGB', (400, 300), color=(200, 200, 200)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + img_bytes.seek(0) + rect = fitz.Rect(50, 50, 550, 750) + page.insert_image(rect, stream=img_bytes.read()) + + # Page 3: Image only + page = doc.new_page() + img = Image.new('RGB', (400, 300), color=(150, 150, 150)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + img_bytes.seek(0) + page.insert_image(rect, stream=img_bytes.read()) + + doc.save(str(file_path)) + doc.close() + return file_path + + +@pytest.fixture +def sample_image_png(temp_dir): + """Create a sample PNG image.""" + from PIL import Image + + file_path = temp_dir / "sample.png" + img = Image.new('RGB', (100, 100), color='red') + img.save(str(file_path)) + return file_path + + +@pytest.fixture +def sample_image_jpg(temp_dir): + """Create a sample JPEG image.""" + from PIL import Image + + file_path = temp_dir / "sample.jpg" + img = Image.new('RGB', (100, 100), color='blue') + img.save(str(file_path)) + return file_path + + +@pytest.fixture +def sample_docx(temp_dir): + """Create a sample DOCX file (minimal valid structure).""" + from zipfile import ZipFile + + file_path = temp_dir / "sample.docx" + + # Create minimal DOCX structure + with ZipFile(file_path, 'w') as zf: + # [Content_Types].xml + content_types = ''' + + + + +''' + zf.writestr('[Content_Types].xml', content_types) + + # _rels/.rels + rels = ''' + + +''' + zf.writestr('_rels/.rels', rels) + + # word/document.xml + document = ''' + + + + + Test document + + + +''' + zf.writestr('word/document.xml', document) + + return file_path diff --git a/backend/tests/services/__init__.py b/backend/tests/services/__init__.py new file mode 100644 index 0000000..473ef52 --- /dev/null +++ b/backend/tests/services/__init__.py @@ -0,0 +1,3 @@ +""" +Tool_OCR 
- Services Tests +""" diff --git a/backend/tests/services/test_document_type_detector.py b/backend/tests/services/test_document_type_detector.py new file mode 100644 index 0000000..47cba3c --- /dev/null +++ b/backend/tests/services/test_document_type_detector.py @@ -0,0 +1,300 @@ +""" +Unit tests for DocumentTypeDetector service. + +Tests cover: +- Various file type detection (PDF, image, Office, text) +- PDF editability detection +- Edge cases (file not found, unknown types) +""" + +import pytest +from pathlib import Path + +from app.services.document_type_detector import ( + DocumentTypeDetector, + DocumentType, + ProcessingTrackRecommendation +) + + +class TestDocumentTypeDetector: + """Test suite for DocumentTypeDetector.""" + + @pytest.fixture + def detector(self): + """Create a detector instance with default settings.""" + return DocumentTypeDetector() + + @pytest.fixture + def strict_detector(self): + """Create a detector with strict text requirements.""" + return DocumentTypeDetector( + min_text_length=200, + text_coverage_threshold=0.95 + ) + + # ===== PDF Detection Tests ===== + + def test_detect_editable_pdf(self, detector, sample_pdf_editable): + """Test detection of editable PDF with extractable text.""" + result = detector.detect(sample_pdf_editable) + + assert result.track == "direct" + assert result.document_type == DocumentType.PDF_EDITABLE + assert result.confidence >= 0.9 + assert "extractable text" in result.reason.lower() + assert result.metadata.get("total_pages") == 3 + + def test_detect_scanned_pdf(self, detector, sample_pdf_scanned): + """Test detection of scanned PDF (image-only).""" + result = detector.detect(sample_pdf_scanned) + + assert result.track == "ocr" + assert result.document_type == DocumentType.PDF_SCANNED + assert result.confidence >= 0.9 + assert "scanned" in result.reason.lower() + + def test_detect_mixed_pdf(self, detector, sample_pdf_mixed): + """Test detection of mixed PDF (text + images).""" + result = 
detector.detect(sample_pdf_mixed) + + assert result.track == "ocr" + assert result.document_type == DocumentType.PDF_MIXED + assert result.confidence >= 0.5 + assert "mixed" in result.reason.lower() + + def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable): + """Test that text coverage is calculated correctly.""" + result = detector.detect(sample_pdf_editable) + + metadata = result.metadata + assert "text_coverage" in metadata + assert metadata["text_coverage"] >= 0.9 # All pages have text + + def test_pdf_page_details(self, detector, sample_pdf_editable): + """Test that page details are included in metadata.""" + result = detector.detect(sample_pdf_editable) + + metadata = result.metadata + assert "page_details" in metadata + assert len(metadata["page_details"]) == min(3, detector.sample_pages) + + for page_detail in metadata["page_details"]: + assert "page" in page_detail + assert "text_length" in page_detail + assert "has_text" in page_detail + assert "image_count" in page_detail + + # ===== Image Detection Tests ===== + + def test_detect_png_image(self, detector, sample_image_png): + """Test detection of PNG image file.""" + result = detector.detect(sample_image_png) + + assert result.track == "ocr" + assert result.document_type == DocumentType.IMAGE + assert result.confidence == 1.0 + assert "image" in result.reason.lower() + assert result.metadata.get("mime_type") == "image/png" + + def test_detect_jpg_image(self, detector, sample_image_jpg): + """Test detection of JPEG image file.""" + result = detector.detect(sample_image_jpg) + + assert result.track == "ocr" + assert result.document_type == DocumentType.IMAGE + assert result.confidence == 1.0 + assert result.metadata.get("file_extension") == ".jpg" + + # ===== Office Document Tests ===== + + def test_detect_docx(self, detector, sample_docx): + """Test detection of Word document.""" + result = detector.detect(sample_docx) + + assert result.track == "ocr" + assert result.document_type 
== DocumentType.OFFICE_WORD + assert result.confidence >= 0.8 + assert "office" in result.reason.lower() or "ocr" in result.reason.lower() + + # ===== Text File Tests ===== + + def test_detect_text_file(self, detector, sample_text_file): + """Test detection of plain text file.""" + result = detector.detect(sample_text_file) + + assert result.track == "direct" + assert result.document_type == DocumentType.TEXT + assert result.confidence == 1.0 + assert "text" in result.reason.lower() + + # ===== Edge Case Tests ===== + + def test_file_not_found(self, detector, temp_dir): + """Test handling of non-existent file.""" + non_existent = temp_dir / "does_not_exist.pdf" + result = detector.detect(non_existent) + + assert result.track == "ocr" + assert result.document_type == DocumentType.UNKNOWN + assert result.confidence == 0.5 + assert "not found" in result.reason.lower() + + def test_unknown_file_type(self, detector, temp_dir): + """Test handling of unknown file type.""" + # Create a file with unknown content + unknown_file = temp_dir / "unknown.xyz" + unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05') + + result = detector.detect(unknown_file) + + assert result.track == "ocr" + assert result.document_type == DocumentType.UNKNOWN + assert result.confidence <= 0.5 + + def test_empty_pdf(self, detector, temp_dir): + """Test handling of PDF with blank pages (no content).""" + import fitz + + empty_pdf = temp_dir / "empty.pdf" + doc = fitz.open() + # Create a blank page with no content + doc.new_page() + doc.save(str(empty_pdf)) + doc.close() + + result = detector.detect(empty_pdf) + + # Blank PDF should be detected as scanned (no extractable text) + assert result.track == "ocr" + assert result.document_type == DocumentType.PDF_SCANNED + assert result.metadata.get("total_pages") == 1 + + # ===== Configuration Tests ===== + + def test_custom_min_text_length(self, temp_dir): + """Test that custom min_text_length affects detection.""" + import fitz + + # Create PDF with 
minimal text + pdf_path = temp_dir / "minimal_text.pdf" + doc = fitz.open() + page = doc.new_page() + page.insert_text((50, 50), "Short text") # Only ~10 chars + doc.save(str(pdf_path)) + doc.close() + + # Default detector (min_text_length=100) + default_detector = DocumentTypeDetector() + result_default = default_detector.detect(pdf_path) + + # Lenient detector (min_text_length=5), so even the short text counts + strict_detector = DocumentTypeDetector(min_text_length=5) + result_strict = strict_detector.detect(pdf_path) + + # With very low threshold, it should find text + assert result_strict.document_type in [ + DocumentType.PDF_EDITABLE, + DocumentType.PDF_MIXED + ] + + def test_sample_pages_setting(self, temp_dir): + """Test that sample_pages setting is respected.""" + import fitz + + # Create PDF with 10 pages + pdf_path = temp_dir / "many_pages.pdf" + doc = fitz.open() + for i in range(10): + page = doc.new_page() + page.insert_text((50, 50), f"Page {i + 1} content\n" * 20) + doc.save(str(pdf_path)) + doc.close() + + # Detector that samples only 2 pages + detector = DocumentTypeDetector(sample_pages=2) + result = detector.detect(pdf_path) + + assert result.metadata["sampled_pages"] == 2 + assert result.metadata["total_pages"] == 10 + + # ===== Batch Processing Tests ===== + + def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file): + """Test batch analysis of multiple files.""" + files = [sample_pdf_editable, sample_image_png, sample_text_file] + results = detector.analyze_batch(files) + + assert len(results) == 3 + assert str(sample_pdf_editable) in results + assert str(sample_image_png) in results + assert str(sample_text_file) in results + + def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file): + """Test statistics calculation from batch results.""" + files = [sample_pdf_editable, sample_image_png, sample_text_file] + results = detector.analyze_batch(files) + stats = detector.get_statistics(results) +
assert stats["total"] == 3 + assert "by_track" in stats + assert stats["by_track"]["ocr"] >= 1 # At least image + assert stats["by_track"]["direct"] >= 1 # At least text + assert "confidence" in stats + assert stats["confidence"]["mean"] > 0 + + def test_get_statistics_empty(self, detector): + """Test statistics with empty results.""" + stats = detector.get_statistics({}) + assert stats == {"total": 0} + + # ===== Recommendation Object Tests ===== + + def test_recommendation_to_dict(self, detector, sample_pdf_editable): + """Test ProcessingTrackRecommendation.to_dict() method.""" + result = detector.detect(sample_pdf_editable) + result_dict = result.to_dict() + + assert "recommended_track" in result_dict + assert "confidence" in result_dict + assert "reason" in result_dict + assert "document_type" in result_dict + assert "metadata" in result_dict + + assert result_dict["recommended_track"] == result.track + assert result_dict["confidence"] == result.confidence + + def test_recommendation_metadata_types(self, detector, sample_pdf_editable): + """Test that metadata contains correct types.""" + result = detector.detect(sample_pdf_editable) + + assert isinstance(result.track, str) + assert isinstance(result.confidence, float) + assert isinstance(result.reason, str) + assert isinstance(result.document_type, DocumentType) + assert isinstance(result.metadata, dict) + + # ===== Performance Tests ===== + + def test_large_pdf_sampling(self, detector, temp_dir): + """Test that large PDFs are sampled efficiently.""" + import fitz + import time + + # Create a large PDF (20 pages) + large_pdf = temp_dir / "large.pdf" + doc = fitz.open() + for i in range(20): + page = doc.new_page() + page.insert_text((50, 50), f"Page {i + 1}\n" * 50) + doc.save(str(large_pdf)) + doc.close() + + # Detection should be fast due to sampling + start_time = time.time() + result = detector.detect(large_pdf) + elapsed = time.time() - start_time + + assert elapsed < 5.0 # Should complete within 5 seconds 
+ assert result.metadata["sampled_pages"] <= detector.sample_pages + assert result.metadata["total_pages"] == 20 diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md index 6b5f8f3..3c20881 100644 --- a/openspec/changes/dual-track-document-processing/tasks.md +++ b/openspec/changes/dual-track-document-processing/tasks.md @@ -5,7 +5,7 @@ - [x] 1.1.1 Add PyMuPDF>=1.23.0 - [x] 1.1.2 Add pdfplumber>=0.10.0 - [x] 1.1.3 Add python-magic-bin>=0.4.14 - - [ ] 1.1.4 Test dependency installation + - [x] 1.1.4 Test dependency installation - [x] 1.2 Create UnifiedDocument model in backend/app/models/ - [x] 1.2.1 Define UnifiedDocument dataclass - [x] 1.2.2 Add DocumentElement model @@ -17,7 +17,7 @@ - [x] 1.3.2 Add PDF editability checking logic - [x] 1.3.3 Add Office document detection - [x] 1.3.4 Create routing logic to determine processing track - - [ ] 1.3.5 Add unit tests for detector + - [x] 1.3.5 Add unit tests for detector ## 2. Direct Extraction Track - [x] 2.1 Create DirectExtractionEngine service