first

2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions
--- a/demo_docs/basic/chinese_simple.png
+++ b/demo_docs/basic/chinese_simple.png
--- a/demo_docs/basic/chinese_traditional.png
+++ b/demo_docs/basic/chinese_traditional.png
--- a/demo_docs/basic/english.png
+++ b/demo_docs/basic/english.png
--- a/demo_docs/layout/document.png
+++ b/demo_docs/layout/document.png
--- a/(附件二)具體事蹟簡報格式(最佳創新獎).pdf
+++ b/(附件二)具體事蹟簡報格式(最佳創新獎).pdf
--- a/demo_docs/mixed/Workflow使用分析.pdf
+++ b/demo_docs/mixed/Workflow使用分析.pdf
--- a/demo_docs/office_tests/create_docx.py
+++ b/demo_docs/office_tests/create_docx.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+import zipfile
+from pathlib import Path
+
+# Write the DOCX fixture next to this script rather than into one developer's home directory
+output_path = Path(__file__).resolve().parent / 'test_document.docx'
+
+# A DOCX is a ZIP archive of XML parts; this script writes three: [Content_Types].xml, _rels/.rels and word/document.xml
+with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as docx:
+    # [Content_Types].xml: declares the content type of .rels/.xml parts and of /word/document.xml
+    content_types = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+    <Default Extension="xml" ContentType="application/xml"/>
+    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>'''
+    docx.writestr('[Content_Types].xml', content_types)
+
+    # _rels/.rels: package relationship rId1, pointing at word/document.xml as the office document
+    rels = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>'''
+    docx.writestr('_rels/.rels', rels)
+
+    # word/document.xml: headings (Heading1/Heading2 style names) and paragraphs in Chinese and English; NOTE(review): no styles part is written here — confirm consumers tolerate that
+    document = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+    <w:body>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
+            <w:r><w:t>Office Document OCR Test</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>測試文件說明</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>本系統現已支援以下 Office 格式：</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>• Microsoft Word: DOC, DOCX</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>• Microsoft PowerPoint: PPT, PPTX</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>處理流程</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Office 文件的處理流程如下：</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>1. 使用 LibreOffice 將 Office 文件轉換為 PDF</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>2. 將 PDF 轉換為圖片（每頁一張）</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>3. 使用 PaddleOCR 處理每張圖片</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>4. 合併所有頁面的 OCR 結果</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>中英混合測試</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>This is a test for mixed Chinese and English OCR recognition.</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>測試中英文混合識別能力：1234567890</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+            <w:r><w:t>Technical Information</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>System Version: Tool_OCR v1.0</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Conversion Engine: LibreOffice Headless</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>OCR Engine: PaddleOCR</w:t></w:r>
+        </w:p>
+        <w:p>
+            <w:r><w:t>Token Validity: 24 hours (1440 minutes)</w:t></w:r>
+        </w:p>
+    </w:body>
+</w:document>'''
+    docx.writestr('word/document.xml', document)
+
+print(f"Created DOCX file: {output_path}")
+print(f"File size: {output_path.stat().st_size} bytes")
--- a/demo_docs/office_tests/test_document.docx
+++ b/demo_docs/office_tests/test_document.docx
--- a/demo_docs/office_tests/test_document.html
+++ b/demo_docs/office_tests/test_document.html
@@ -0,0 +1,64 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Office Document OCR Test</title>
+</head>
+<body>
+    <h1>Office Document OCR Test</h1>
+
+    <h2>測試文件說明</h2>
+    <p>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</p>
+    <p>本系統現已支援以下 Office 格式：</p>
+    <ul>
+        <li>Microsoft Word: DOC, DOCX</li>
+        <li>Microsoft PowerPoint: PPT, PPTX</li>
+    </ul>
+
+    <h2>處理流程</h2>
+    <p>Office 文件的處理流程如下：</p>
+    <ol>
+        <li>使用 LibreOffice 將 Office 文件轉換為 PDF</li>
+        <li>將 PDF 轉換為圖片（每頁一張）</li>
+        <li>使用 PaddleOCR 處理每張圖片</li>
+        <li>合併所有頁面的 OCR 結果</li>
+    </ol>
+
+    <h2>測試數據表格</h2>
+    <table border="1" cellpadding="5">
+        <tr>
+            <th>格式</th>
+            <th>副檔名</th>
+            <th>支援狀態</th>
+        </tr>
+        <tr>
+            <td>Word 新版</td>
+            <td>.docx</td>
+            <td>✓ 支援</td>
+        </tr>
+        <tr>
+            <td>Word 舊版</td>
+            <td>.doc</td>
+            <td>✓ 支援</td>
+        </tr>
+        <tr>
+            <td>PowerPoint 新版</td>
+            <td>.pptx</td>
+            <td>✓ 支援</td>
+        </tr>
+        <tr>
+            <td>PowerPoint 舊版</td>
+            <td>.ppt</td>
+            <td>✓ 支援</td>
+        </tr>
+    </table>
+
+    <h2>中英混合測試</h2>
+    <p>This is a test for mixed Chinese and English OCR recognition.</p>
+    <p>測試中英文混合識別能力：1234567890</p>
+
+    <h2>特殊字符測試</h2>
+    <p>符號測試：!@#$%^&amp;*()_+-=[]{}|;:',.&lt;&gt;?/</p>
+    <p>數學符號：± × ÷ √ ∞ ≈ ≠ ≤ ≥</p>
+</body>
+</html>
--- a/demo_docs/office_tests/test_office_upload.py
+++ b/demo_docs/office_tests/test_office_upload.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+Test script for Office document processing
+"""
+import json
+import requests
+from pathlib import Path
+import time
+
+API_BASE = "http://localhost:12010/api/v1"  # URL prefix for every endpoint this script calls
+USERNAME = "admin"  # sent to /auth/login by login()
+PASSWORD = "admin123"  # sent to /auth/login by login(); NOTE(review): presumably a local dev credential — confirm
+
+def login(timeout=30):
+    """Log in with USERNAME/PASSWORD and return the JWT access token.
+
+    timeout (seconds) is passed to requests; HTTP error statuses raise via raise_for_status().
+    """
+    print("Step 1: Logging in...")
+    credentials = {"username": USERNAME, "password": PASSWORD}
+    response = requests.post(f"{API_BASE}/auth/login", json=credentials, timeout=timeout)
+    response.raise_for_status()
+    data = response.json()
+    token = data["access_token"]
+    print(f"✓ Login successful. Token expires in: {data['expires_in']} seconds ({data['expires_in']//3600} hours)")
+    return token
+
+def upload_file(token, file_path, timeout=30):
+    """Upload file_path as the "Office Document Test" batch and return result['id'].
+
+    token goes in a Bearer header; timeout (seconds) goes to requests; the DOCX MIME type is always declared.
+    """
+    print(f"\nStep 2: Uploading file: {file_path.name}...")
+    auth = {"Authorization": f"Bearer {token}"}
+    with open(file_path, 'rb') as f:
+        files = {'files': (file_path.name, f, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
+        response = requests.post(f"{API_BASE}/upload", headers=auth, files=files,
+                                 data={"batch_name": "Office Document Test"}, timeout=timeout)
+    response.raise_for_status()
+    result = response.json()
+    print(f"✓ File uploaded and batch created:")
+    print(f"  Batch ID: {result['id']}")
+    print(f"  Total files: {result['total_files']}")
+    print(f"  Status: {result['status']}")
+    return result['id']
+
+def trigger_ocr(token, batch_id, timeout=30):
+    """Ask the API to start OCR on batch_id (lang "ch", layout detection on); returns nothing.
+
+    token goes in a Bearer header; timeout (seconds) goes to requests; HTTP errors raise via raise_for_status().
+    """
+    print(f"\nStep 3: Triggering OCR processing...")
+    response = requests.post(
+        f"{API_BASE}/ocr/process",
+        headers={"Authorization": f"Bearer {token}"},
+        json={"batch_id": batch_id, "lang": "ch", "detect_layout": True},
+        timeout=timeout,
+    )
+    response.raise_for_status()
+    result = response.json()
+    print(f"✓ OCR processing started")
+    print(f"  Message: {result['message']}")
+    print(f"  Total files: {result['total_files']}")
+
+def check_status(token, batch_id, max_wait=120, poll_interval=5, timeout=30):
+    """Poll the batch status until it is 'completed' or 'failed', or max_wait is used up.
+
+    max_wait/poll_interval count seconds slept between polls; timeout goes to each request.
+    Returns the status payload on completion or failure, or None once max_wait is used up.
+    """
+    print(f"\nStep 4: Checking processing status...")
+    auth = {"Authorization": f"Bearer {token}"}
+    waited = 0
+    while waited < max_wait:
+        response = requests.get(f"{API_BASE}/batch/{batch_id}/status", headers=auth, timeout=timeout)
+        response.raise_for_status()
+        data = response.json()
+        batch_status = data['batch']['status']
+        progress = data['batch']['progress_percentage']
+        # An empty or missing file list must not abort the poll with IndexError/KeyError.
+        file_data = (data.get('files') or [{}])[0]
+        file_status = file_data.get('status', 'unknown')
+        print(f"  Batch status: {batch_status}, Progress: {progress}%, File status: {file_status}")
+
+        if batch_status == 'completed':
+            print(f"\n✓ Processing completed!")
+            # The key may be present but null, which the :.2f format cannot handle.
+            if file_data.get('processing_time') is not None:
+                print(f"  Processing time: {file_data['processing_time']:.2f} seconds")
+            return data
+        if batch_status == 'failed':
+            print(f"\n✗ Processing failed!")
+            print(f"  Error: {file_data.get('error_message', 'Unknown error')}")
+            return data
+
+        time.sleep(poll_interval)
+        waited += poll_interval
+
+    print(f"\n⚠ Timeout waiting for processing (waited {waited}s)")
+    return None
+
+def get_result(token, file_id, timeout=30):
+    """Fetch the OCR result for file_id, print a summary, and return the response payload.
+
+    token goes in a Bearer header; timeout (seconds) goes to requests; HTTP errors raise via
+    raise_for_status(). A 'markdown_path' in the result is opened as a local file, best effort.
+    """
+    print(f"\nStep 5: Getting OCR result...")
+    auth = {"Authorization": f"Bearer {token}"}
+    response = requests.get(f"{API_BASE}/ocr/result/{file_id}", headers=auth, timeout=timeout)
+    response.raise_for_status()
+    data = response.json()
+    file_info = data['file']
+    result = data.get('result')
+
+    print(f"✓ OCR Result retrieved:")
+    print(f"  File: {file_info['original_filename']}")
+    print(f"  Status: {file_info['status']}")
+
+    if result:
+        print(f"  Language: {result.get('detected_language', 'N/A')}")
+        print(f"  Total text regions: {result.get('total_text_regions', 0)}")
+        # `or 0` also covers a key that is present but null, which :.2% cannot format.
+        print(f"  Average confidence: {(result.get('average_confidence') or 0):.2%}")
+        # Preview the markdown output; a read failure is reported, not raised.
+        if result.get('markdown_path'):
+            try:
+                with open(result['markdown_path'], 'r', encoding='utf-8') as f:
+                    markdown_content = f.read()
+                print(f"\n  Markdown preview (first 300 chars):")
+                print(f"  {'-'*60}")
+                print(f"  {markdown_content[:300]}...")
+                print(f"  {'-'*60}")
+            except Exception as e:
+                print(f"  Could not read markdown file: {e}")
+    else:
+        print(f"  No OCR result available yet")
+    return data
+
+def main():
+    """Run the end-to-end upload/OCR test; return True only if the batch completes."""
+    try:
+        # Resolve the fixture next to this script instead of one developer's home directory.
+        test_file = Path(__file__).resolve().parent / 'test_document.docx'
+        if not test_file.exists():
+            print(f"✗ Test file not found: {test_file}")
+            return False
+
+        print("="*70)
+        print("Office Document Processing Test")
+        print("="*70)
+        print(f"Test file: {test_file.name} ({test_file.stat().st_size} bytes)")
+        print("="*70)
+
+        token = login()
+        batch_id = upload_file(token, test_file)
+        trigger_ocr(token, batch_id)
+        status_data = check_status(token, batch_id)
+
+        if status_data and status_data['batch']['status'] == 'completed':
+            get_result(token, status_data['files'][0]['id'])
+            print("\n" + "="*70)
+            print("✓ TEST PASSED: Office document processing successful!")
+            print("="*70)
+            return True
+        print("\n" + "="*70)
+        print("✗ TEST FAILED: Processing did not complete successfully")
+        print("="*70)
+        return False
+    except Exception as e:
+        print(f"\n✗ TEST ERROR: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    # Turn the outcome into the exit status so shells and CI can detect a failure.
+    raise SystemExit(0 if main() else 1)
--- a/demo_docs/tables/simple_table.png
+++ b/demo_docs/tables/simple_table.png
--- a/demo_docs/tables/截圖
+++ b/demo_docs/tables/截圖
--- a/demo_docs/tables/截圖
+++ b/demo_docs/tables/截圖