first
BIN
demo_docs/basic/chinese_simple.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
demo_docs/basic/chinese_traditional.png
Normal file
|
After Width: | Height: | Size: 20 KiB |
BIN
demo_docs/basic/english.png
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
demo_docs/layout/document.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
demo_docs/mixed/4. (附件二)具體事蹟簡報格式(最佳創新獎).pdf
Normal file
BIN
demo_docs/mixed/Workflow使用分析.pdf
Normal file
100
demo_docs/office_tests/create_docx.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
"""Create a minimal DOCX fixture for the Office-document OCR tests.

A DOCX file is a ZIP archive of XML parts. This script writes the three
parts defined below (content types, package relationships and the main
document body with mixed Chinese/English text) into ``test_document.docx``
in the same directory as this script.
"""
import zipfile
from pathlib import Path

# Create a minimal DOCX file.
# The path is resolved relative to this script instead of a machine-specific
# absolute path, so the fixture can be regenerated from any checkout.
output_path = Path(__file__).resolve().parent / 'test_document.docx'

# [Content_Types].xml
content_types = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>'''

# _rels/.rels
rels = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>'''

# word/document.xml with Chinese and English content
document = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Office Document OCR Test</w:t></w:r>
</w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>測試文件說明</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>本系統現已支援以下 Office 格式:</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>• Microsoft Word: DOC, DOCX</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>• Microsoft PowerPoint: PPT, PPTX</w:t></w:r>
</w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>處理流程</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Office 文件的處理流程如下:</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>1. 使用 LibreOffice 將 Office 文件轉換為 PDF</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>2. 將 PDF 轉換為圖片(每頁一張)</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>3. 使用 PaddleOCR 處理每張圖片</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>4. 合併所有頁面的 OCR 結果</w:t></w:r>
</w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>中英混合測試</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>This is a test for mixed Chinese and English OCR recognition.</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>測試中英文混合識別能力:1234567890</w:t></w:r>
</w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Technical Information</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>System Version: Tool_OCR v1.0</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Conversion Engine: LibreOffice Headless</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>OCR Engine: PaddleOCR</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Token Validity: 24 hours (1440 minutes)</w:t></w:r>
</w:p>
</w:body>
</w:document>'''

# DOCX is a ZIP file containing XML files; writestr() encodes each str
# payload as UTF-8, matching the encoding declared in the XML prologs.
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as docx:
    docx.writestr('[Content_Types].xml', content_types)
    docx.writestr('_rels/.rels', rels)
    docx.writestr('word/document.xml', document)

print(f"Created DOCX file: {output_path}")
print(f"File size: {output_path.stat().st_size} bytes")
|
||||
BIN
demo_docs/office_tests/test_document.docx
Normal file
64
demo_docs/office_tests/test_document.html
Normal file
@@ -0,0 +1,64 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Office Document OCR Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Office Document OCR Test</h1>
|
||||
|
||||
<h2>測試文件說明</h2>
|
||||
<p>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</p>
|
||||
<p>本系統現已支援以下 Office 格式:</p>
|
||||
<ul>
|
||||
<li>Microsoft Word: DOC, DOCX</li>
|
||||
<li>Microsoft PowerPoint: PPT, PPTX</li>
|
||||
</ul>
|
||||
|
||||
<h2>處理流程</h2>
|
||||
<p>Office 文件的處理流程如下:</p>
|
||||
<ol>
|
||||
<li>使用 LibreOffice 將 Office 文件轉換為 PDF</li>
|
||||
<li>將 PDF 轉換為圖片(每頁一張)</li>
|
||||
<li>使用 PaddleOCR 處理每張圖片</li>
|
||||
<li>合併所有頁面的 OCR 結果</li>
|
||||
</ol>
|
||||
|
||||
<h2>測試數據表格</h2>
|
||||
<table border="1" cellpadding="5">
|
||||
<tr>
|
||||
<th>格式</th>
|
||||
<th>副檔名</th>
|
||||
<th>支援狀態</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Word 新版</td>
|
||||
<td>.docx</td>
|
||||
<td>✓ 支援</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Word 舊版</td>
|
||||
<td>.doc</td>
|
||||
<td>✓ 支援</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>PowerPoint 新版</td>
|
||||
<td>.pptx</td>
|
||||
<td>✓ 支援</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>PowerPoint 舊版</td>
|
||||
<td>.ppt</td>
|
||||
<td>✓ 支援</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h2>中英混合測試</h2>
|
||||
<p>This is a test for mixed Chinese and English OCR recognition.</p>
|
||||
<p>測試中英文混合識別能力:1234567890</p>
|
||||
|
||||
<h2>特殊字符測試</h2>
|
||||
<p>符號測試:!@#$%^&amp;*()_+-=[]{}|;:',.&lt;&gt;?/</p>
|
||||
<p>數學符號:± × ÷ √ ∞ ≈ ≠ ≤ ≥</p>
|
||||
</body>
|
||||
</html>
|
||||
178
demo_docs/office_tests/test_office_upload.py
Normal file
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for Office document processing
|
||||
"""
|
||||
import json
import os
import time
from pathlib import Path

import requests
|
||||
|
||||
# Connection settings for the API under test. Each value can be overridden
# through an environment variable so the endpoint and credentials do not have
# to be edited into the script; the defaults are the original hard-coded
# values.
API_BASE = os.environ.get("TOOL_OCR_API_BASE", "http://localhost:12010/api/v1")
USERNAME = os.environ.get("TOOL_OCR_USERNAME", "admin")
PASSWORD = os.environ.get("TOOL_OCR_PASSWORD", "admin123")
|
||||
|
||||
def login():
    """Log in and return the JWT access token.

    Posts USERNAME and PASSWORD as JSON to ``{API_BASE}/auth/login`` and
    prints the token lifetime reported in the response.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        KeyError: If the response lacks ``access_token`` or ``expires_in``.
    """
    print("Step 1: Logging in...")
    response = requests.post(
        f"{API_BASE}/auth/login",
        json={"username": USERNAME, "password": PASSWORD},
        # requests waits forever by default; fail fast on a hung server.
        timeout=30,
    )
    response.raise_for_status()

    data = response.json()
    token = data["access_token"]
    print(f"✓ Login successful. Token expires in: {data['expires_in']} seconds ({data['expires_in']//3600} hours)")
    return token
|
||||
|
||||
def upload_file(token, file_path):
    """Upload a file and create a batch.

    Sends ``file_path`` as a multipart upload to ``{API_BASE}/upload`` with
    the batch name "Office Document Test" and prints the batch summary from
    the response.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        file_path: Path-like object with a ``name`` attribute; the file is
            always sent with the DOCX MIME type.

    Returns:
        The ``id`` field of the JSON response (the batch ID).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    print(f"\nStep 2: Uploading file: {file_path.name}...")
    with open(file_path, 'rb') as f:
        files = {'files': (file_path.name, f, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
        # The request must be sent while the file handle is still open.
        response = requests.post(
            f"{API_BASE}/upload",
            headers={"Authorization": f"Bearer {token}"},
            files=files,
            data={"batch_name": "Office Document Test"},
            # requests waits forever by default; bound the upload instead.
            timeout=60,
        )
    response.raise_for_status()
    result = response.json()
    print("✓ File uploaded and batch created:")
    print(f" Batch ID: {result['id']}")
    print(f" Total files: {result['total_files']}")
    print(f" Status: {result['status']}")
    return result['id']
|
||||
|
||||
def trigger_ocr(token, batch_id):
    """Trigger OCR processing for a batch.

    Posts the batch ID to ``{API_BASE}/ocr/process`` with language ``"ch"``
    and layout detection enabled, then prints the ``message`` and
    ``total_files`` fields of the response.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        batch_id: ID of the batch to process.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    print("\nStep 3: Triggering OCR processing...")
    response = requests.post(
        f"{API_BASE}/ocr/process",
        headers={"Authorization": f"Bearer {token}"},
        json={
            "batch_id": batch_id,
            "lang": "ch",
            "detect_layout": True,
        },
        # requests waits forever by default; fail fast on a hung server.
        timeout=30,
    )
    response.raise_for_status()
    result = response.json()
    print("✓ OCR processing started")
    print(f" Message: {result['message']}")
    print(f" Total files: {result['total_files']}")
|
||||
|
||||
def check_status(token, batch_id, *, max_wait=120, poll_interval=5):
    """Poll the batch status until it completes, fails or times out.

    Queries ``{API_BASE}/batch/{batch_id}/status`` every ``poll_interval``
    seconds and prints the batch status, progress and first file's status.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        batch_id: ID of the batch to poll.
        max_wait: Maximum total sleep time in seconds before giving up.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The status response dict once the batch status is ``'completed'`` or
        ``'failed'``, or ``None`` if ``max_wait`` is reached first.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a single poll does not answer within the timeout.
    """
    print("\nStep 4: Checking processing status...")
    waited = 0

    while waited < max_wait:
        response = requests.get(
            f"{API_BASE}/batch/{batch_id}/status",
            headers={"Authorization": f"Bearer {token}"},
            # Without a timeout one hung request would defeat max_wait.
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        batch_status = data['batch']['status']
        progress = data['batch']['progress_percentage']
        # Tolerate a response with no file entries instead of raising
        # IndexError on files[0].
        files = data.get('files') or []
        first_file = files[0] if files else {}
        file_status = first_file.get('status', 'N/A')

        print(f" Batch status: {batch_status}, Progress: {progress}%, File status: {file_status}")

        if batch_status == 'completed':
            print("\n✓ Processing completed!")
            if 'processing_time' in first_file:
                print(f" Processing time: {first_file['processing_time']:.2f} seconds")
            return data
        if batch_status == 'failed':
            print("\n✗ Processing failed!")
            print(f" Error: {first_file.get('error_message', 'Unknown error')}")
            return data

        time.sleep(poll_interval)
        waited += poll_interval

    print(f"\n⚠ Timeout waiting for processing (waited {waited}s)")
    return None
|
||||
|
||||
def get_result(token, file_id):
    """Fetch and print the OCR result for one file.

    Requests ``{API_BASE}/ocr/result/{file_id}`` and prints the file name,
    status and, when a result is present, its language, region count and
    average confidence. If the result carries a ``markdown_path``, the first
    300 characters of that file are printed as a preview.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        file_id: ID of the file whose result is requested.

    Returns:
        The full JSON response as a dict.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    print("\nStep 5: Getting OCR result...")
    response = requests.get(
        f"{API_BASE}/ocr/result/{file_id}",
        headers={"Authorization": f"Bearer {token}"},
        # requests waits forever by default; fail fast on a hung server.
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    file_info = data['file']
    result = data.get('result')

    print("✓ OCR Result retrieved:")
    print(f" File: {file_info['original_filename']}")
    print(f" Status: {file_info['status']}")

    if result:
        print(f" Language: {result.get('detected_language', 'N/A')}")
        print(f" Total text regions: {result.get('total_text_regions', 0)}")
        print(f" Average confidence: {result.get('average_confidence', 0):.2%}")

        # Read markdown file if available.
        # NOTE(review): markdown_path is opened on the local filesystem, so
        # this presumably assumes the script runs on the same machine as the
        # server -- confirm.
        if result.get('markdown_path'):
            try:
                with open(result['markdown_path'], 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                print("\n Markdown preview (first 300 chars):")
                print(f" {'-'*60}")
                print(f" {markdown_content[:300]}...")
                print(f" {'-'*60}")
            except Exception as e:
                # Deliberately broad: the preview is best-effort and must
                # not fail the test run.
                print(f" Could not read markdown file: {e}")
    else:
        print(" No OCR result available yet")

    return data
|
||||
|
||||
def main():
    """Run the end-to-end Office document test and print a verdict.

    Logs in, uploads ``test_document.docx`` from this script's directory,
    triggers OCR, polls for completion and, on success, fetches the result.
    Any exception is caught, reported as a test error and its traceback
    printed; nothing is returned or re-raised.
    """
    try:
        # Test file: resolved next to this script rather than through a
        # machine-specific absolute path, so the test runs from any checkout.
        test_file = Path(__file__).resolve().parent / 'test_document.docx'

        if not test_file.exists():
            print(f"✗ Test file not found: {test_file}")
            return

        print("="*70)
        print("Office Document Processing Test")
        print("="*70)
        print(f"Test file: {test_file.name} ({test_file.stat().st_size} bytes)")
        print("="*70)

        # Run test
        token = login()
        batch_id = upload_file(token, test_file)
        trigger_ocr(token, batch_id)
        status_data = check_status(token, batch_id)

        if status_data and status_data['batch']['status'] == 'completed':
            file_id = status_data['files'][0]['id']
            # Called for its printed report; the return value is not needed.
            get_result(token, file_id)
            print("\n" + "="*70)
            print("✓ TEST PASSED: Office document processing successful!")
            print("="*70)
        else:
            print("\n" + "="*70)
            print("✗ TEST FAILED: Processing did not complete successfully")
            print("="*70)

    except Exception as e:
        # Top-level boundary of the script: report everything, then stop.
        print(f"\n✗ TEST ERROR: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
|
||||
BIN
demo_docs/tables/simple_table.png
Normal file
|
After Width: | Height: | Size: 23 KiB |
BIN
demo_docs/tables/截圖 2025-11-12 上午10.33.12.png
Normal file
|
After Width: | Height: | Size: 288 KiB |
BIN
demo_docs/tables/截圖 2025-11-12 上午10.34.33.png
Normal file
|
After Width: | Height: | Size: 518 KiB |