test: add unit tests for DocumentTypeDetector
- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
176
backend/tests/conftest.py
Normal file
176
backend/tests/conftest.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""
|
||||
Pytest configuration and fixtures for Tool_OCR tests.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Make the directory that contains tests/ importable: it is pushed to the
# front of sys.path so it wins over any other entry with the same module names.
import sys

_tests_dir = Path(__file__).parent
project_root = _tests_dir.parent
sys.path.insert(0, str(project_root))
|
||||
|
||||
|
||||
@pytest.fixture
def temp_dir():
    """Yield a fresh scratch directory as a ``Path``.

    The directory comes from ``tempfile.TemporaryDirectory`` and is cleaned
    up, together with anything written into it, after the fixture finishes.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield Path(scratch.name)
    finally:
        # Same teardown the context-manager form performs on exit.
        scratch.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_file(temp_dir):
    """Write ``sample.txt`` (one sentence repeated on ten lines) and return its path."""
    sentence = "This is a sample text file for testing purposes.\n"
    target = temp_dir / "sample.txt"
    target.write_text(sentence * 10)
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def sample_pdf_editable(temp_dir):
    """Create a sample editable PDF with text content.

    Builds a three-page document in which every page receives only inserted
    text (no images), then saves it as ``editable.pdf``.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path of the saved ``editable.pdf``.
    """
    import fitz

    file_path = temp_dir / "editable.pdf"
    doc = fitz.open()
    try:
        # Create 3 pages with text
        for page_number in range(1, 4):
            page = doc.new_page()
            text = f"This is page {page_number} of an editable PDF document.\n" * 20
            page.insert_text((50, 50), text, fontsize=12)

        doc.save(str(file_path))
    finally:
        # Close the document even when page creation or saving raises, so a
        # failing fixture does not leak the open handle.
        doc.close()
    return file_path
|
||||
|
||||
|
||||
@pytest.fixture
def sample_pdf_scanned(temp_dir):
    """Create a sample scanned PDF (image-only, no text).

    Builds a three-page document in which every page receives a single
    inserted image and no text, then saves it as ``scanned.pdf``.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path of the saved ``scanned.pdf``.
    """
    import fitz
    from PIL import Image
    import io

    file_path = temp_dir / "scanned.pdf"

    # Every page uses the same solid grey picture, so encode it once instead
    # of rebuilding an identical PNG on each iteration.
    png_buffer = io.BytesIO()
    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(png_buffer, format='PNG')
    png_bytes = png_buffer.getvalue()

    # Image placement covering most of the page
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Create 3 pages with only images (simulating scanned pages)
        for _ in range(3):
            page = doc.new_page()
            page.insert_image(rect, stream=png_bytes)

        doc.save(str(file_path))
    finally:
        # Close the document even when page creation or saving raises, so a
        # failing fixture does not leak the open handle.
        doc.close()
    return file_path
|
||||
|
||||
|
||||
@pytest.fixture
def sample_pdf_mixed(temp_dir):
    """Create a sample mixed PDF (some pages with text, some with images).

    Page 1 receives only inserted text; pages 2 and 3 each receive a single
    inserted image and no text. The result is saved as ``mixed.pdf``.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path of the saved ``mixed.pdf``.
    """
    import fitz
    from PIL import Image
    import io

    def _solid_png(color):
        """Return PNG-encoded bytes of a 400x300 RGB image filled with ``color``."""
        buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=color).save(buffer, format='PNG')
        return buffer.getvalue()

    file_path = temp_dir / "mixed.pdf"
    # Image placement shared by both image-only pages.
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Page 1: Text
        page = doc.new_page()
        text = "This is a text page.\n" * 20
        page.insert_text((50, 50), text, fontsize=12)

        # Pages 2 and 3: Image only (two different grey levels)
        for color in ((200, 200, 200), (150, 150, 150)):
            page = doc.new_page()
            page.insert_image(rect, stream=_solid_png(color))

        doc.save(str(file_path))
    finally:
        # Close the document even when page creation or saving raises, so a
        # failing fixture does not leak the open handle.
        doc.close()
    return file_path
|
||||
|
||||
|
||||
@pytest.fixture
def sample_image_png(temp_dir):
    """Save a 100x100 solid red RGB image as ``sample.png`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.png"
    Image.new('RGB', (100, 100), color='red').save(str(target))
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def sample_image_jpg(temp_dir):
    """Save a 100x100 solid blue RGB image as ``sample.jpg`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.jpg"
    Image.new('RGB', (100, 100), color='blue').save(str(target))
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def sample_docx(temp_dir):
    """Write a minimal DOCX package to ``sample.docx`` and return its path.

    The archive holds three members, written in this order: the content-type
    manifest, the package relationships, and a main document part containing
    one paragraph with the text "Test document".
    """
    from zipfile import ZipFile

    # (archive member name, XML lines) in the order they are written.
    members = (
        (
            '[Content_Types].xml',
            (
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">',
                '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>',
                '<Default Extension="xml" ContentType="application/xml"/>',
                '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>',
                '</Types>',
            ),
        ),
        (
            '_rels/.rels',
            (
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">',
                '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>',
                '</Relationships>',
            ),
        ),
        (
            'word/document.xml',
            (
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">',
                '<w:body>',
                '<w:p>',
                '<w:r>',
                '<w:t>Test document</w:t>',
                '</w:r>',
                '</w:p>',
                '</w:body>',
                '</w:document>',
            ),
        ),
    )

    archive_path = temp_dir / "sample.docx"
    with ZipFile(archive_path, 'w') as archive:
        for member_name, xml_lines in members:
            archive.writestr(member_name, "\n".join(xml_lines))

    return archive_path
|
||||
Reference in New Issue
Block a user