test: add unit tests for DocumentTypeDetector

- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-19 12:14:59 +08:00
parent 1d0b63854a
commit 0fcb2492c9
6 changed files with 486 additions and 2 deletions

176
backend/tests/conftest.py Normal file
View File

@@ -0,0 +1,176 @@
"""
Pytest configuration and fixtures for Tool_OCR tests.
"""
import pytest
import tempfile
import os
from pathlib import Path
# Make the project importable from the tests: prepend the parent of the
# directory that contains this conftest.py to sys.path. Inserting at index 0
# gives that directory priority over every other entry already on the path.
import sys
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
@pytest.fixture
def temp_dir():
    """Yield a fresh temporary directory as a ``Path``.

    The directory is created with ``tempfile.TemporaryDirectory`` and is
    removed, together with its contents, when the fixture is torn down.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_path = Path(scratch)
        yield scratch_path
@pytest.fixture
def sample_text_file(temp_dir):
    """Write ``sample.txt`` (the same sentence on ten lines) and return its path."""
    line = "This is a sample text file for testing purposes.\n"
    target = temp_dir / "sample.txt"
    target.write_text(line * 10)
    return target
@pytest.fixture
def sample_pdf_editable(temp_dir):
    """Create a three-page PDF whose pages contain only inserted text.

    Args:
        temp_dir: Directory fixture in which ``editable.pdf`` is written.

    Returns:
        Path of the saved PDF file.
    """
    import fitz

    file_path = temp_dir / "editable.pdf"

    # Use the document as a context manager so it is closed even when page
    # creation or saving raises; the previous explicit close() was skipped
    # on any exception and leaked the open document.
    with fitz.open() as doc:
        for page_number in range(1, 4):
            page = doc.new_page()
            text = f"This is page {page_number} of an editable PDF document.\n" * 20
            page.insert_text((50, 50), text, fontsize=12)
        doc.save(str(file_path))

    return file_path
@pytest.fixture
def sample_pdf_scanned(temp_dir):
    """Create a three-page PDF that contains images only (no text layer).

    Each page receives the same flat grey 400x300 PNG, which simulates a
    scanned document for the detector under test.

    Args:
        temp_dir: Directory fixture in which ``scanned.pdf`` is written.

    Returns:
        Path of the saved PDF file.
    """
    import fitz
    from PIL import Image
    import io

    file_path = temp_dir / "scanned.pdf"

    # The picture is identical on every page, so encode it once instead of
    # rebuilding and re-encoding it on each loop iteration.
    buffer = io.BytesIO()
    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(buffer, format='PNG')
    png_data = buffer.getvalue()

    # Target rectangle for the image on each page.
    rect = fitz.Rect(50, 50, 550, 750)

    # Context manager guarantees the document is closed even if inserting
    # an image or saving fails.
    with fitz.open() as doc:
        for _ in range(3):
            page = doc.new_page()
            page.insert_image(rect, stream=png_data)
        doc.save(str(file_path))

    return file_path
@pytest.fixture
def sample_pdf_mixed(temp_dir):
    """Create a three-page PDF mixing one text page with two image-only pages.

    Page 1 holds inserted text; pages 2 and 3 each hold a single flat grey
    PNG (grey levels 200 and 150 respectively) and no text.

    Args:
        temp_dir: Directory fixture in which ``mixed.pdf`` is written.

    Returns:
        Path of the saved PDF file.
    """
    import fitz
    from PIL import Image
    import io

    def _gray_png(level):
        """Return PNG bytes of a 400x300 image filled with one grey level."""
        buffer = io.BytesIO()
        Image.new('RGB', (400, 300), color=(level, level, level)).save(buffer, format='PNG')
        return buffer.getvalue()

    file_path = temp_dir / "mixed.pdf"
    # Both image pages place their picture in the same rectangle.
    rect = fitz.Rect(50, 50, 550, 750)

    # Context manager guarantees the document is closed even if building a
    # page or saving fails.
    with fitz.open() as doc:
        # Page 1: text only
        page = doc.new_page()
        page.insert_text((50, 50), "This is a text page.\n" * 20, fontsize=12)

        # Pages 2 and 3: image only
        for level in (200, 150):
            page = doc.new_page()
            page.insert_image(rect, stream=_gray_png(level))

        doc.save(str(file_path))

    return file_path
@pytest.fixture
def sample_image_png(temp_dir):
    """Save a 100x100 solid red image as ``sample.png`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.png"
    Image.new('RGB', (100, 100), color='red').save(str(target))
    return target
@pytest.fixture
def sample_image_jpg(temp_dir):
    """Save a 100x100 solid blue image as ``sample.jpg`` and return its path."""
    from PIL import Image

    target = temp_dir / "sample.jpg"
    Image.new('RGB', (100, 100), color='blue').save(str(target))
    return target
@pytest.fixture
def sample_docx(temp_dir):
    """Build a minimal DOCX package as ``sample.docx`` and return its path.

    The archive holds three members, written in this order: the content-type
    manifest, the package relationships, and a main document part with one
    paragraph reading "Test document".
    """
    from zipfile import ZipFile

    # [Content_Types].xml
    content_types_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>'''

    # _rels/.rels
    package_rels_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>'''

    # word/document.xml
    document_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Test document</w:t>
</w:r>
</w:p>
</w:body>
</w:document>'''

    members = (
        ('[Content_Types].xml', content_types_xml),
        ('_rels/.rels', package_rels_xml),
        ('word/document.xml', document_xml),
    )

    target = temp_dir / "sample.docx"
    with ZipFile(target, 'w') as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)
    return target