# OCR/demo_docs/office_tests/create_docx.py

#!/usr/bin/env python3
import zipfile
from pathlib import Path

# Build a minimal DOCX test fixture containing mixed Chinese/English text.
#
# A DOCX file is a ZIP archive of XML parts. Only the three parts below are
# written: the content-type map, the package relationships, and the main
# document body. No word/styles.xml part is written, so the "Heading1" and
# "Heading2" style ids referenced by the body are not defined in the package.

# Default destination used when no path is given on the command line.
DEFAULT_OUTPUT_PATH = Path('/Users/egg/Projects/Tool_OCR/demo_docs/office_tests/test_document.docx')

# [Content_Types].xml: maps part names / extensions to their MIME types.
CONTENT_TYPES_XML = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
    <Default Extension="xml" ContentType="application/xml"/>
    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>'''

# _rels/.rels: points the package at word/document.xml as its main document.
PACKAGE_RELS_XML = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>'''

# word/document.xml: the body text, with Chinese and English content.
DOCUMENT_XML = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
    <w:body>
        <w:p>
            <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
            <w:r><w:t>Office Document OCR Test</w:t></w:r>
        </w:p>
        <w:p>
            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
            <w:r><w:t>測試文件說明</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>本系統現已支援以下 Office 格式：</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>• Microsoft Word: DOC, DOCX</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>• Microsoft PowerPoint: PPT, PPTX</w:t></w:r>
        </w:p>
        <w:p>
            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
            <w:r><w:t>處理流程</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>Office 文件的處理流程如下：</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>1. 使用 LibreOffice 將 Office 文件轉換為 PDF</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>2. 將 PDF 轉換為圖片（每頁一張）</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>3. 使用 PaddleOCR 處理每張圖片</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>4. 合併所有頁面的 OCR 結果</w:t></w:r>
        </w:p>
        <w:p>
            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
            <w:r><w:t>中英混合測試</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>This is a test for mixed Chinese and English OCR recognition.</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>測試中英文混合識別能力：1234567890</w:t></w:r>
        </w:p>
        <w:p>
            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
            <w:r><w:t>Technical Information</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>System Version: Tool_OCR v1.0</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>Conversion Engine: LibreOffice Headless</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>OCR Engine: PaddleOCR</w:t></w:r>
        </w:p>
        <w:p>
            <w:r><w:t>Token Validity: 24 hours (1440 minutes)</w:t></w:r>
        </w:p>
    </w:body>
</w:document>'''

# Archive member name -> XML payload, in the order the parts are written.
DOCX_PARTS = (
    ('[Content_Types].xml', CONTENT_TYPES_XML),
    ('_rels/.rels', PACKAGE_RELS_XML),
    ('word/document.xml', DOCUMENT_XML),
)


def create_docx(output_path: "str | Path" = DEFAULT_OUTPUT_PATH) -> Path:
    """Write the minimal test DOCX package to *output_path*.

    Args:
        output_path: Destination file. Any existing file is overwritten.
            Missing parent directories are created.

    Returns:
        The destination as a ``Path``.

    Raises:
        OSError: If the directory or file cannot be created.
    """
    target = Path(output_path)
    # Create the destination directory so the script also works on a
    # machine where the fixture folder does not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(target, 'w', zipfile.ZIP_DEFLATED) as docx:
        for member_name, xml_text in DOCX_PARTS:
            # writestr() encodes str payloads as UTF-8, matching the
            # encoding declared in each XML prolog.
            docx.writestr(member_name, xml_text)
    return target


def main(argv=None) -> None:
    """Create the DOCX and report its location and size.

    Args:
        argv: Optional argument list (without the program name). When
            omitted, ``sys.argv[1:]`` is used. The first argument, if
            present, overrides the default output path.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    output_path = Path(args[0]) if args else DEFAULT_OUTPUT_PATH

    create_docx(output_path)
    print(f"Created DOCX file: {output_path}")
    print(f"File size: {output_path.stat().st_size} bytes")


if __name__ == "__main__":
    main()