- Create test directory structure for backend - Add pytest fixtures for test files (PDF, images, Office docs) - Add 20 unit tests covering: - PDF type detection (editable, scanned, mixed) - Image file detection (PNG, JPG) - Office document detection (DOCX) - Text file detection - Edge cases (file not found, unknown types) - Batch processing and statistics - Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
177 lines
4.8 KiB
Python
177 lines
4.8 KiB
Python
"""
|
|
Pytest configuration and fixtures for Tool_OCR tests.
|
|
"""
|
|
|
|
import pytest
|
|
import tempfile
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import sys

# Put the directory two levels above this file at the front of sys.path.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
@pytest.fixture
def temp_dir():
    """Yield a fresh temporary directory as a ``Path``.

    The directory is created with ``tempfile.TemporaryDirectory`` and is
    removed, together with its contents, once the fixture is torn down.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = Path(scratch_dir)
        yield scratch_path
|
|
|
|
|
|
@pytest.fixture
def sample_text_file(temp_dir):
    """Write ``sample.txt`` into ``temp_dir`` and return its path.

    The file holds the same sentence repeated on ten lines.
    """
    line = "This is a sample text file for testing purposes.\n"
    target = temp_dir / "sample.txt"
    target.write_text(line * 10)
    return target
|
|
|
|
|
|
@pytest.fixture
def sample_pdf_editable(temp_dir):
    """Create a three-page PDF whose pages contain only inserted text.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path to the generated ``editable.pdf``.
    """
    import fitz

    file_path = temp_dir / "editable.pdf"
    doc = fitz.open()
    try:
        # Create 3 pages with text
        for page_number in range(1, 4):
            page = doc.new_page()
            text = f"This is page {page_number} of an editable PDF document.\n" * 20
            page.insert_text((50, 50), text, fontsize=12)

        doc.save(str(file_path))
    finally:
        # Release the document even if page creation or saving fails.
        doc.close()
    return file_path
|
|
|
|
|
|
@pytest.fixture
def sample_pdf_scanned(temp_dir):
    """Create a three-page PDF whose pages contain only an image (no text).

    Each page gets the same flat grey 400x300 PNG, stretched over the
    rectangle (50, 50)-(550, 750), to simulate scanned pages.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path to the generated ``scanned.pdf``.
    """
    import fitz
    from PIL import Image
    import io

    file_path = temp_dir / "scanned.pdf"

    # The picture is identical on every page, so encode it once up front
    # instead of rebuilding it on each loop iteration.
    png_buffer = io.BytesIO()
    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(png_buffer, format='PNG')
    png_data = png_buffer.getvalue()

    # Image placement covering most of the page
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Create 3 pages with only images (simulating scanned pages)
        for _ in range(3):
            page = doc.new_page()
            page.insert_image(rect, stream=png_data)

        doc.save(str(file_path))
    finally:
        # Release the document even if page creation or saving fails.
        doc.close()
    return file_path
|
|
|
|
|
|
@pytest.fixture
def sample_pdf_mixed(temp_dir):
    """Create a three-page PDF mixing one text page with two image-only pages.

    Page 1 contains inserted text; pages 2 and 3 each contain a single flat
    grey PNG (grey levels 200 and 150) and no text.

    Args:
        temp_dir: Directory (``Path``) the PDF is written into.

    Returns:
        Path to the generated ``mixed.pdf``.
    """
    import fitz
    from PIL import Image
    import io

    def grey_png(level):
        """Return PNG bytes of a 400x300 image filled with grey ``level``."""
        png_buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=(level, level, level)).save(png_buffer, format='PNG')
        return png_buffer.getvalue()

    file_path = temp_dir / "mixed.pdf"
    # Both image pages place their picture in the same rectangle.
    rect = fitz.Rect(50, 50, 550, 750)

    doc = fitz.open()
    try:
        # Page 1: Text
        page = doc.new_page()
        text = "This is a text page.\n" * 20
        page.insert_text((50, 50), text, fontsize=12)

        # Pages 2 and 3: Image only
        for level in (200, 150):
            page = doc.new_page()
            page.insert_image(rect, stream=grey_png(level))

        doc.save(str(file_path))
    finally:
        # Release the document even if page creation or saving fails.
        doc.close()
    return file_path
|
|
|
|
|
|
@pytest.fixture
def sample_image_png(temp_dir):
    """Save a 100x100 solid red RGB image as ``sample.png`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.png"
    Image.new('RGB', (100, 100), color='red').save(str(target))
    return target
|
|
|
|
|
|
@pytest.fixture
def sample_image_jpg(temp_dir):
    """Save a 100x100 solid blue RGB image as ``sample.jpg`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.jpg"
    Image.new('RGB', (100, 100), color='blue').save(str(target))
    return target
|
|
|
|
|
|
@pytest.fixture
def sample_docx(temp_dir):
    """Write a minimal DOCX package to ``sample.docx`` and return its path.

    The archive contains three members, written in this order:
    ``[Content_Types].xml``, ``_rels/.rels`` and ``word/document.xml``
    (a single paragraph reading "Test document").
    """
    from zipfile import ZipFile

    # [Content_Types].xml
    content_types_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>'''

    # _rels/.rels
    package_rels_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>'''

    # word/document.xml
    document_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Test document</w:t>
</w:r>
</w:p>
</w:body>
</w:document>'''

    # Archive member name -> content, in the order the members are written.
    members = (
        ('[Content_Types].xml', content_types_xml),
        ('_rels/.rels', package_rels_xml),
        ('word/document.xml', document_xml),
    )

    target = temp_dir / "sample.docx"

    # Create minimal DOCX structure
    with ZipFile(target, 'w') as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)

    return target
|