test: add unit tests for DocumentTypeDetector

- Create test directory structure for backend
- Add pytest fixtures for test files (PDF, images, Office docs)
- Add 20 unit tests covering:
  - PDF type detection (editable, scanned, mixed)
  - Image file detection (PNG, JPG)
  - Office document detection (DOCX)
  - Text file detection
  - Edge cases (file not found, unknown types)
  - Batch processing and statistics
- Mark tasks 1.1.4 and 1.3.5 as completed in tasks.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 12:14:59 +08:00
parent 1d0b63854a
commit 0fcb2492c9
6 changed files with 486 additions and 2 deletions
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -0,0 +1,176 @@
+"""
+Pytest configuration and fixtures for Tool_OCR tests.
+"""
+
+import pytest
+import tempfile
+import os
+from pathlib import Path
+
+# Put the directory above tests/ (backend/) first on sys.path so tests can import from it
+import sys
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+
+@pytest.fixture
+def temp_dir():
+    """Yield a scratch directory as a Path; it is deleted once the test ends."""
+    with tempfile.TemporaryDirectory() as scratch:
+        yield Path(scratch)
+
+
+@pytest.fixture
+def sample_text_file(temp_dir):
+    """Write a ten-line plain-text file into temp_dir and return its path."""
+    target = temp_dir / "sample.txt"
+    target.write_text("This is a sample text file for testing purposes.\n" * 10)
+    return target
+
+
+@pytest.fixture
+def sample_pdf_editable(temp_dir):
+    """Create a three-page PDF whose pages carry inserted text; return its path."""
+    import fitz
+
+    file_path = temp_dir / "editable.pdf"
+    doc = fitz.open()
+    try:
+        for page_number in range(1, 4):
+            page = doc.new_page()
+            text = f"This is page {page_number} of an editable PDF document.\n" * 20
+            page.insert_text((50, 50), text, fontsize=12)
+        doc.save(str(file_path))
+    finally:
+        # Close on success and on failure so the document handle never leaks.
+        doc.close()
+    return file_path
+
+
+@pytest.fixture
+def sample_pdf_scanned(temp_dir):
+    """Create a three-page, image-only PDF (no inserted text) and return its path.
+
+    The document is closed even if building or saving it fails.
+    """
+    import fitz
+    from PIL import Image
+    import io
+
+    # Every page shows the same flat grey picture, so encode the PNG once.
+    png_buffer = io.BytesIO()
+    Image.new('RGB', (400, 300), color=(200, 200, 200)).save(png_buffer, format='PNG')
+    png_data = png_buffer.getvalue()
+
+    file_path = temp_dir / "scanned.pdf"
+    rect = fitz.Rect(50, 50, 550, 750)
+    doc = fitz.open()
+    try:
+        for _ in range(3):
+            # NOTE(review): insert_image keeps the aspect ratio by default, so the
+            # 400x300 picture may not fill all of rect - confirm coverage suffices.
+            doc.new_page().insert_image(rect, stream=png_data)
+        doc.save(str(file_path))
+    finally:
+        doc.close()
+    return file_path
+
+
+@pytest.fixture
+def sample_pdf_mixed(temp_dir):
+    """Create a three-page PDF (one text page, two image-only pages); return its path.
+
+    The document is closed even if building or saving it fails.
+    """
+    import fitz
+    from PIL import Image
+    import io
+
+    def _solid_png(grey):
+        """Return PNG bytes for a 400x300 image filled with one grey level."""
+        buffer = io.BytesIO()
+        Image.new('RGB', (400, 300), color=(grey, grey, grey)).save(buffer, format='PNG')
+        return buffer.getvalue()
+
+    file_path = temp_dir / "mixed.pdf"
+    rect = fitz.Rect(50, 50, 550, 750)
+    doc = fitz.open()
+    try:
+        # Page 1: text only
+        page = doc.new_page()
+        page.insert_text((50, 50), "This is a text page.\n" * 20, fontsize=12)
+
+        # Pages 2 and 3: image only, with different grey levels
+        for grey in (200, 150):
+            page = doc.new_page()
+            page.insert_image(rect, stream=_solid_png(grey))
+
+        doc.save(str(file_path))
+    finally:
+        # Runs on success and on failure, so the handle is never leaked.
+        doc.close()
+
+    return file_path
+
+
+@pytest.fixture
+def sample_image_png(temp_dir):
+    """Save a 100x100 solid red image as sample.png and return its path."""
+    from PIL import Image
+
+    png_path = temp_dir / "sample.png"
+    # No explicit format is passed; Pillow derives it from the file suffix.
+    Image.new('RGB', (100, 100), color='red').save(str(png_path))
+    return png_path
+
+
+@pytest.fixture
+def sample_image_jpg(temp_dir):
+    """Save a 100x100 solid blue image as sample.jpg and return its path."""
+    from PIL import Image
+
+    jpg_path = temp_dir / "sample.jpg"
+    # No explicit format is passed; Pillow derives it from the file suffix.
+    Image.new('RGB', (100, 100), color='blue').save(str(jpg_path))
+    return jpg_path
+
+
+@pytest.fixture
+def sample_docx(temp_dir):
+    """Build a minimal DOCX-shaped ZIP with three XML parts and return its path."""
+    from zipfile import ZipFile
+
+    # Manifest that declares the content type of each part.
+    content_types_xml = '''<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>'''
+
+    # Root relationships: points at word/document.xml as the office document.
+    root_rels_xml = '''<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>'''
+
+    # Document body: a single paragraph holding a single text run.
+    document_xml = '''<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r>
+        <w:t>Test document</w:t>
+      </w:r>
+    </w:p>
+  </w:body>
+</w:document>'''
+
+    docx_path = temp_dir / "sample.docx"
+    # No compression argument is given, so ZipFile stores the entries as-is.
+    with ZipFile(docx_path, 'w') as archive:
+        archive.writestr('[Content_Types].xml', content_types_xml)
+        archive.writestr('_rels/.rels', root_rels_xml)
+        archive.writestr('word/document.xml', document_xml)
+
+    return docx_path