test: add unit tests for DocumentTypeDetector
- Create test directory structure for backend - Add pytest fixtures for test files (PDF, images, Office docs) - Add 20 unit tests covering: - PDF type detection (editable, scanned, mixed) - Image file detection (PNG, JPG) - Office document detection (DOCX) - Text file detection - Edge cases (file not found, unknown types) - Batch processing and statistics - Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -97,4 +97,6 @@ storage/results/*
|
|||||||
*.log
|
*.log
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.bak
|
*.bak
|
||||||
|
# Ignore stray test_*.py files at any depth (the pattern is not anchored to the root), but keep those under backend/tests/
|
||||||
test_*.py
|
test_*.py
|
||||||
|
!backend/tests/**/test_*.py
|
||||||
|
|||||||
3
backend/tests/__init__.py
Normal file
3
backend/tests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""
|
||||||
|
Tool_OCR - Test Suite
|
||||||
|
"""
|
||||||
176
backend/tests/conftest.py
Normal file
176
backend/tests/conftest.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""
|
||||||
|
Pytest configuration and fixtures for Tool_OCR tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add the backend root (the parent of tests/) to sys.path so tests can import the `app` package
|
||||||
|
import sys
|
||||||
|
project_root = Path(__file__).parent.parent
|
||||||
|
sys.path.insert(0, str(project_root))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def temp_dir():
    """Yield a fresh scratch directory as a ``Path``.

    The directory and everything inside it is deleted once the test that
    requested the fixture has finished.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield Path(scratch.name)
    finally:
        scratch.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_text_file(temp_dir):
    """Create a small plain-text file inside ``temp_dir``.

    The file contains the same ASCII sentence repeated ten times, one per
    line.

    Returns:
        Path to the written ``sample.txt``.
    """
    file_path = temp_dir / "sample.txt"
    # Pin the encoding so the write does not depend on the machine's locale.
    file_path.write_text(
        "This is a sample text file for testing purposes.\n" * 10,
        encoding="utf-8",
    )
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_pdf_editable(temp_dir):
    """Create a three-page PDF in which every page carries inserted text.

    Returns:
        Path to ``editable.pdf`` inside ``temp_dir``.
    """
    import fitz

    file_path = temp_dir / "editable.pdf"
    doc = fitz.open()
    try:
        # Create 3 pages with text
        for page_number in range(1, 4):
            page = doc.new_page()
            text = f"This is page {page_number} of an editable PDF document.\n" * 20
            page.insert_text((50, 50), text, fontsize=12)

        doc.save(str(file_path))
    finally:
        # Release the document even when building or saving it fails.
        doc.close()
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_pdf_scanned(temp_dir):
    """Create a three-page PDF that contains only images and no text.

    Each page receives the same solid grey PNG, which stands in for a
    scanned page.

    Returns:
        Path to ``scanned.pdf`` inside ``temp_dir``.
    """
    import fitz
    from PIL import Image
    import io

    file_path = temp_dir / "scanned.pdf"

    # All pages use an identical bitmap, so encode it once up front.
    buffer = io.BytesIO()
    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(buffer, format='PNG')
    png_bytes = buffer.getvalue()

    # Insert image covering most of the page
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Create 3 pages with only images (simulating scanned pages)
        for _ in range(3):
            page = doc.new_page()
            page.insert_image(rect, stream=png_bytes)

        doc.save(str(file_path))
    finally:
        # Release the document even when building or saving it fails.
        doc.close()
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_pdf_mixed(temp_dir):
    """Create a three-page PDF mixing one text page with two image-only pages.

    Page 1 carries inserted text; pages 2 and 3 each carry a single solid
    grey PNG (two different shades) and no text.

    Returns:
        Path to ``mixed.pdf`` inside ``temp_dir``.
    """
    import fitz
    from PIL import Image
    import io

    file_path = temp_dir / "mixed.pdf"
    # Placement rectangle shared by both image pages.
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Page 1: Text
        page = doc.new_page()
        text = "This is a text page.\n" * 20
        page.insert_text((50, 50), text, fontsize=12)

        # Pages 2 and 3: Image only, one grey shade per page
        for shade in ((200, 200, 200), (150, 150, 150)):
            page = doc.new_page()
            buffer = io.BytesIO()
            Image.new('RGB', (400, 300), color=shade).save(buffer, format='PNG')
            page.insert_image(rect, stream=buffer.getvalue())

        doc.save(str(file_path))
    finally:
        # Release the document even when building or saving it fails.
        doc.close()
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_image_png(temp_dir):
    """Write a 100x100 solid red image to ``sample.png`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.png"
    Image.new('RGB', (100, 100), color='red').save(str(target))
    return target
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_image_jpg(temp_dir):
    """Write a 100x100 solid blue image to ``sample.jpg`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.jpg"
    Image.new('RGB', (100, 100), color='blue').save(str(target))
    return target
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_docx(temp_dir):
    """Build a minimal DOCX package by hand and return its path.

    The archive holds the three parts written below: the content-types
    manifest, the package relationships, and a main document with a single
    paragraph reading "Test document".
    """
    from zipfile import ZipFile

    # Archive member name -> XML payload, in the order they are written.
    parts = (
        (
            '[Content_Types].xml',
            '''<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>''',
        ),
        (
            '_rels/.rels',
            '''<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>''',
        ),
        (
            'word/document.xml',
            '''<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Test document</w:t>
</w:r>
</w:p>
</w:body>
</w:document>''',
        ),
    )

    file_path = temp_dir / "sample.docx"
    with ZipFile(file_path, 'w') as archive:
        for member_name, xml_payload in parts:
            archive.writestr(member_name, xml_payload)

    return file_path
|
||||||
3
backend/tests/services/__init__.py
Normal file
3
backend/tests/services/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""
|
||||||
|
Tool_OCR - Services Tests
|
||||||
|
"""
|
||||||
300
backend/tests/services/test_document_type_detector.py
Normal file
300
backend/tests/services/test_document_type_detector.py
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for DocumentTypeDetector service.
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
- Various file type detection (PDF, image, Office, text)
|
||||||
|
- PDF editability detection
|
||||||
|
- Edge cases (file not found, unknown types)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from app.services.document_type_detector import (
|
||||||
|
DocumentTypeDetector,
|
||||||
|
DocumentType,
|
||||||
|
ProcessingTrackRecommendation
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentTypeDetector:
    """Test suite for DocumentTypeDetector.

    File fixtures (``temp_dir``, ``sample_pdf_*``, ``sample_image_*``,
    ``sample_docx``, ``sample_text_file``) are provided by conftest.py.
    """

    @staticmethod
    def _write_text_pdf(path, page_texts):
        """Write a PDF to *path* with one page per entry of *page_texts*.

        A ``None`` entry yields a blank page; any other entry is inserted as
        text at (50, 50) with the library's default font size. The document
        is closed even if building or saving it fails.
        """
        import fitz

        doc = fitz.open()
        try:
            for text in page_texts:
                page = doc.new_page()
                if text is not None:
                    page.insert_text((50, 50), text)
            doc.save(str(path))
        finally:
            doc.close()

    @pytest.fixture
    def detector(self):
        """Create a detector instance with default settings."""
        return DocumentTypeDetector()

    @pytest.fixture
    def strict_detector(self):
        """Create a detector with strict text requirements."""
        return DocumentTypeDetector(
            min_text_length=200,
            text_coverage_threshold=0.95
        )

    # ===== PDF Detection Tests =====

    def test_detect_editable_pdf(self, detector, sample_pdf_editable):
        """Test detection of editable PDF with extractable text."""
        result = detector.detect(sample_pdf_editable)

        assert result.track == "direct"
        assert result.document_type == DocumentType.PDF_EDITABLE
        assert result.confidence >= 0.9
        assert "extractable text" in result.reason.lower()
        assert result.metadata.get("total_pages") == 3

    def test_detect_scanned_pdf(self, detector, sample_pdf_scanned):
        """Test detection of scanned PDF (image-only)."""
        result = detector.detect(sample_pdf_scanned)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED
        assert result.confidence >= 0.9
        assert "scanned" in result.reason.lower()

    def test_detect_mixed_pdf(self, detector, sample_pdf_mixed):
        """Test detection of mixed PDF (text + images)."""
        result = detector.detect(sample_pdf_mixed)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_MIXED
        assert result.confidence >= 0.5
        assert "mixed" in result.reason.lower()

    def test_pdf_text_coverage_calculation(self, detector, sample_pdf_editable):
        """Test that text coverage is calculated correctly."""
        result = detector.detect(sample_pdf_editable)

        metadata = result.metadata
        assert "text_coverage" in metadata
        assert metadata["text_coverage"] >= 0.9  # All pages have text

    def test_pdf_page_details(self, detector, sample_pdf_editable):
        """Test that page details are included in metadata."""
        result = detector.detect(sample_pdf_editable)

        metadata = result.metadata
        assert "page_details" in metadata
        # The fixture has 3 pages; fewer are reported if the detector samples less.
        assert len(metadata["page_details"]) == min(3, detector.sample_pages)

        for page_detail in metadata["page_details"]:
            assert "page" in page_detail
            assert "text_length" in page_detail
            assert "has_text" in page_detail
            assert "image_count" in page_detail

    # ===== Image Detection Tests =====

    def test_detect_png_image(self, detector, sample_image_png):
        """Test detection of PNG image file."""
        result = detector.detect(sample_image_png)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0
        assert "image" in result.reason.lower()
        assert result.metadata.get("mime_type") == "image/png"

    def test_detect_jpg_image(self, detector, sample_image_jpg):
        """Test detection of JPEG image file."""
        result = detector.detect(sample_image_jpg)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.IMAGE
        assert result.confidence == 1.0
        assert result.metadata.get("file_extension") == ".jpg"

    # ===== Office Document Tests =====

    def test_detect_docx(self, detector, sample_docx):
        """Test detection of Word document."""
        result = detector.detect(sample_docx)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.OFFICE_WORD
        assert result.confidence >= 0.8
        assert "office" in result.reason.lower() or "ocr" in result.reason.lower()

    # ===== Text File Tests =====

    def test_detect_text_file(self, detector, sample_text_file):
        """Test detection of plain text file."""
        result = detector.detect(sample_text_file)

        assert result.track == "direct"
        assert result.document_type == DocumentType.TEXT
        assert result.confidence == 1.0
        assert "text" in result.reason.lower()

    # ===== Edge Case Tests =====

    def test_file_not_found(self, detector, temp_dir):
        """Test handling of non-existent file."""
        non_existent = temp_dir / "does_not_exist.pdf"
        result = detector.detect(non_existent)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.UNKNOWN
        assert result.confidence == 0.5
        assert "not found" in result.reason.lower()

    def test_unknown_file_type(self, detector, temp_dir):
        """Test handling of unknown file type."""
        # Create a file with unknown content
        unknown_file = temp_dir / "unknown.xyz"
        unknown_file.write_bytes(b'\x00\x01\x02\x03\x04\x05')

        result = detector.detect(unknown_file)

        assert result.track == "ocr"
        assert result.document_type == DocumentType.UNKNOWN
        assert result.confidence <= 0.5

    def test_empty_pdf(self, detector, temp_dir):
        """Test handling of PDF with blank pages (no content)."""
        empty_pdf = temp_dir / "empty.pdf"
        # A single blank page with no content
        self._write_text_pdf(empty_pdf, [None])

        result = detector.detect(empty_pdf)

        # Blank PDF should be detected as scanned (no extractable text)
        assert result.track == "ocr"
        assert result.document_type == DocumentType.PDF_SCANNED
        assert result.metadata.get("total_pages") == 1

    # ===== Configuration Tests =====

    def test_custom_min_text_length(self, temp_dir):
        """Test that a lowered min_text_length lets a short page count as text."""
        # Create PDF with minimal text
        pdf_path = temp_dir / "minimal_text.pdf"
        self._write_text_pdf(pdf_path, ["Short text"])  # Only ~10 chars

        # Default detector: exercised only to confirm detection does not raise.
        # NOTE(review): the result was never asserted in the original test;
        # consider asserting that it is not classified as PDF_EDITABLE once
        # the default threshold is confirmed against the implementation.
        DocumentTypeDetector().detect(pdf_path)

        # Lenient detector (min_text_length=5), below the ~10 chars on the page
        lenient_detector = DocumentTypeDetector(min_text_length=5)
        result_lenient = lenient_detector.detect(pdf_path)

        # With very low threshold, it should find text
        assert result_lenient.document_type in [
            DocumentType.PDF_EDITABLE,
            DocumentType.PDF_MIXED
        ]

    def test_sample_pages_setting(self, temp_dir):
        """Test that sample_pages setting is respected."""
        # Create PDF with 10 pages
        pdf_path = temp_dir / "many_pages.pdf"
        self._write_text_pdf(
            pdf_path,
            [f"Page {i + 1} content\n" * 20 for i in range(10)],
        )

        # Detector that samples only 2 pages
        detector = DocumentTypeDetector(sample_pages=2)
        result = detector.detect(pdf_path)

        assert result.metadata["sampled_pages"] == 2
        assert result.metadata["total_pages"] == 10

    # ===== Batch Processing Tests =====

    def test_analyze_batch(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
        """Test batch analysis of multiple files."""
        files = [sample_pdf_editable, sample_image_png, sample_text_file]
        results = detector.analyze_batch(files)

        assert len(results) == 3
        # Results are looked up by the string form of each path.
        assert str(sample_pdf_editable) in results
        assert str(sample_image_png) in results
        assert str(sample_text_file) in results

    def test_get_statistics(self, detector, sample_pdf_editable, sample_image_png, sample_text_file):
        """Test statistics calculation from batch results."""
        files = [sample_pdf_editable, sample_image_png, sample_text_file]
        results = detector.analyze_batch(files)
        stats = detector.get_statistics(results)

        assert stats["total"] == 3
        assert "by_track" in stats
        assert stats["by_track"]["ocr"] >= 1  # At least image
        assert stats["by_track"]["direct"] >= 1  # At least text
        assert "confidence" in stats
        assert stats["confidence"]["mean"] > 0

    def test_get_statistics_empty(self, detector):
        """Test statistics with empty results."""
        stats = detector.get_statistics({})
        assert stats == {"total": 0}

    # ===== Recommendation Object Tests =====

    def test_recommendation_to_dict(self, detector, sample_pdf_editable):
        """Test ProcessingTrackRecommendation.to_dict() method."""
        result = detector.detect(sample_pdf_editable)
        result_dict = result.to_dict()

        assert "recommended_track" in result_dict
        assert "confidence" in result_dict
        assert "reason" in result_dict
        assert "document_type" in result_dict
        assert "metadata" in result_dict

        assert result_dict["recommended_track"] == result.track
        assert result_dict["confidence"] == result.confidence

    def test_recommendation_metadata_types(self, detector, sample_pdf_editable):
        """Test that the recommendation's fields have the expected types."""
        result = detector.detect(sample_pdf_editable)

        assert isinstance(result.track, str)
        assert isinstance(result.confidence, float)
        assert isinstance(result.reason, str)
        assert isinstance(result.document_type, DocumentType)
        assert isinstance(result.metadata, dict)

    # ===== Performance Tests =====

    def test_large_pdf_sampling(self, detector, temp_dir):
        """Test that large PDFs are sampled efficiently."""
        import time

        # Create a large PDF (20 pages)
        large_pdf = temp_dir / "large.pdf"
        self._write_text_pdf(
            large_pdf,
            [f"Page {i + 1}\n" * 50 for i in range(20)],
        )

        # Detection should be fast due to sampling. Use the monotonic
        # high-resolution clock so system clock adjustments cannot skew
        # the measurement.
        started = time.perf_counter()
        result = detector.detect(large_pdf)
        elapsed = time.perf_counter() - started

        assert elapsed < 5.0  # Should complete within 5 seconds
        assert result.metadata["sampled_pages"] <= detector.sample_pages
        assert result.metadata["total_pages"] == 20
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
- [x] 1.1.1 Add PyMuPDF>=1.23.0
|
- [x] 1.1.1 Add PyMuPDF>=1.23.0
|
||||||
- [x] 1.1.2 Add pdfplumber>=0.10.0
|
- [x] 1.1.2 Add pdfplumber>=0.10.0
|
||||||
- [x] 1.1.3 Add python-magic-bin>=0.4.14
|
- [x] 1.1.3 Add python-magic-bin>=0.4.14
|
||||||
- [ ] 1.1.4 Test dependency installation
|
- [x] 1.1.4 Test dependency installation
|
||||||
- [x] 1.2 Create UnifiedDocument model in backend/app/models/
|
- [x] 1.2 Create UnifiedDocument model in backend/app/models/
|
||||||
- [x] 1.2.1 Define UnifiedDocument dataclass
|
- [x] 1.2.1 Define UnifiedDocument dataclass
|
||||||
- [x] 1.2.2 Add DocumentElement model
|
- [x] 1.2.2 Add DocumentElement model
|
||||||
@@ -17,7 +17,7 @@
|
|||||||
- [x] 1.3.2 Add PDF editability checking logic
|
- [x] 1.3.2 Add PDF editability checking logic
|
||||||
- [x] 1.3.3 Add Office document detection
|
- [x] 1.3.3 Add Office document detection
|
||||||
- [x] 1.3.4 Create routing logic to determine processing track
|
- [x] 1.3.4 Create routing logic to determine processing track
|
||||||
- [ ] 1.3.5 Add unit tests for detector
|
- [x] 1.3.5 Add unit tests for detector
|
||||||
|
|
||||||
## 2. Direct Extraction Track
|
## 2. Direct Extraction Track
|
||||||
- [x] 2.1 Create DirectExtractionEngine service
|
- [x] 2.1 Create DirectExtractionEngine service
|
||||||
|
|||||||
Reference in New Issue
Block a user