first

2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions
--- a/demo_docs/office_tests/create_docx.py
+++ b/demo_docs/office_tests/create_docx.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+import zipfile
+from pathlib import Path
+
+# Build a minimal DOCX (a ZIP package of XML parts) next to this script rather
+# than at a machine-specific absolute path, so it runs from any checkout location.
+output_path = Path(__file__).resolve().parent / 'test_document.docx'
+
+with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as docx:
+    # [Content_Types].xml: declares the content type of each part in the package
+    content_types = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+    <Default Extension="xml" ContentType="application/xml"/>
+    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>'''
+    docx.writestr('[Content_Types].xml', content_types)
+
+    # _rels/.rels: package-level relationship that targets word/document.xml
+    rels = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>'''
+    docx.writestr('_rels/.rels', rels)
+
+    # word/document.xml: headings plus mixed Chinese/English paragraphs for OCR testing
+    document = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+    <w:body>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
+            <w:r><w:t>Office Document OCR Test</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>測試文件說明</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>本系統現已支援以下 Office 格式：</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>• Microsoft Word: DOC, DOCX</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>• Microsoft PowerPoint: PPT, PPTX</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>處理流程</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Office 文件的處理流程如下：</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>1. 使用 LibreOffice 將 Office 文件轉換為 PDF</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>2. 將 PDF 轉換為圖片（每頁一張）</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>3. 使用 PaddleOCR 處理每張圖片</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>4. 合併所有頁面的 OCR 結果</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>中英混合測試</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>This is a test for mixed Chinese and English OCR recognition.</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>測試中英文混合識別能力：1234567890</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>Technical Information</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>System Version: Tool_OCR v1.0</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Conversion Engine: LibreOffice Headless</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>OCR Engine: PaddleOCR</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Token Validity: 24 hours (1440 minutes)</w:t></w:r>
+        </w:p>
+    </w:body>
+</w:document>'''
+    docx.writestr('word/document.xml', document)
+
+print(f"Created DOCX file: {output_path}")
+print(f"File size: {output_path.stat().st_size} bytes")