chore: project cleanup and prepare for dual-track processing refactor

- Removed all test files and directories
- Deleted outdated documentation (will be rewritten)
- Cleaned up temporary files, logs, and uploads
- Archived 5 completed OpenSpec proposals
- Created new dual-track-document-processing proposal with complete OpenSpec structure
  - Dual-track architecture: OCR track (PaddleOCR) + Direct track (PyMuPDF)
  - UnifiedDocument model for consistent output
  - Support for structure-preserving translation
- Updated .gitignore to prevent future test/temp files

This is a major cleanup preparing for the complete refactoring of the document processing pipeline.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 20:02:31 +08:00
parent 0edc56b03f
commit cd3cbea49d
64 changed files with 3573 additions and 8190 deletions
--- a/demo_docs/office_tests/create_docx.py
+++ b/demo_docs/office_tests/create_docx.py
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-import zipfile
-from pathlib import Path
-
-# Create a minimal DOCX file
-output_path = Path('/Users/egg/Projects/Tool_OCR/demo_docs/office_tests/test_document.docx')
-
-# DOCX is a ZIP file containing XML files
-with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as docx:
-    # [Content_Types].xml
-    content_types = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
-    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
-    <Default Extension="xml" ContentType="application/xml"/>
-    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
-</Types>'''
-    docx.writestr('[Content_Types].xml', content_types)
-
-    # _rels/.rels
-    rels = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
-    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
-</Relationships>'''
-    docx.writestr('_rels/.rels', rels)
-
-    # word/document.xml with Chinese and English content
-    document = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
-    <w:body>
-        <w:p>
-            <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
-            <w:r><w:t>Office Document OCR Test</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
-            <w:r><w:t>測試文件說明</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>本系統現已支援以下 Office 格式：</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>• Microsoft Word: DOC, DOCX</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>• Microsoft PowerPoint: PPT, PPTX</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
-            <w:r><w:t>處理流程</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>Office 文件的處理流程如下：</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>1. 使用 LibreOffice 將 Office 文件轉換為 PDF</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>2. 將 PDF 轉換為圖片（每頁一張）</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>3. 使用 PaddleOCR 處理每張圖片</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>4. 合併所有頁面的 OCR 結果</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
-            <w:r><w:t>中英混合測試</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>This is a test for mixed Chinese and English OCR recognition.</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>測試中英文混合識別能力：1234567890</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
-            <w:r><w:t>Technical Information</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>System Version: Tool_OCR v1.0</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>Conversion Engine: LibreOffice Headless</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>OCR Engine: PaddleOCR</w:t></w:r>
-        </w:p>
-        <w:p>
-            <w:r><w:t>Token Validity: 24 hours (1440 minutes)</w:t></w:r>
-        </w:p>
-    </w:body>
-</w:document>'''
-    docx.writestr('word/document.xml', document)
-
-print(f"Created DOCX file: {output_path}")
-print(f"File size: {output_path.stat().st_size} bytes")
--- a/demo_docs/office_tests/test_document.docx
+++ b/demo_docs/office_tests/test_document.docx
--- a/demo_docs/office_tests/test_document.html
+++ b/demo_docs/office_tests/test_document.html
@@ -1,64 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <meta charset="UTF-8">
-    <title>Office Document OCR Test</title>
-</head>
-<body>
-    <h1>Office Document OCR Test</h1>
-
-    <h2>測試文件說明</h2>
-    <p>這是一個用於測試 Tool_OCR 系統 Office 文件支援功能的測試文件。</p>
-    <p>本系統現已支援以下 Office 格式：</p>
-    <ul>
-        <li>Microsoft Word: DOC, DOCX</li>
-        <li>Microsoft PowerPoint: PPT, PPTX</li>
-    </ul>
-
-    <h2>處理流程</h2>
-    <p>Office 文件的處理流程如下：</p>
-    <ol>
-        <li>使用 LibreOffice 將 Office 文件轉換為 PDF</li>
-        <li>將 PDF 轉換為圖片（每頁一張）</li>
-        <li>使用 PaddleOCR 處理每張圖片</li>
-        <li>合併所有頁面的 OCR 結果</li>
-    </ol>
-
-    <h2>測試數據表格</h2>
-    <table border="1" cellpadding="5">
-        <tr>
-            <th>格式</th>
-            <th>副檔名</th>
-            <th>支援狀態</th>
-        </tr>
-        <tr>
-            <td>Word 新版</td>
-            <td>.docx</td>
-            <td>✓ 支援</td>
-        </tr>
-        <tr>
-            <td>Word 舊版</td>
-            <td>.doc</td>
-            <td>✓ 支援</td>
-        </tr>
-        <tr>
-            <td>PowerPoint 新版</td>
-            <td>.pptx</td>
-            <td>✓ 支援</td>
-        </tr>
-        <tr>
-            <td>PowerPoint 舊版</td>
-            <td>.ppt</td>
-            <td>✓ 支援</td>
-        </tr>
-    </table>
-
-    <h2>中英混合測試</h2>
-    <p>This is a test for mixed Chinese and English OCR recognition.</p>
-    <p>測試中英文混合識別能力：1234567890</p>
-
-    <h2>特殊字符測試</h2>
-    <p>符號測試：!@#$%^&*()_+-=[]{}|;:',.<>?/</p>
-    <p>數學符號：± × ÷ √ ∞ ≈ ≠ ≤ ≥</p>
-</body>
-</html>
--- a/demo_docs/office_tests/test_office_upload.py
+++ b/demo_docs/office_tests/test_office_upload.py
@@ -1,178 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test script for Office document processing
-"""
-import json
-import requests
-from pathlib import Path
-import time
-
-API_BASE = "http://localhost:12010/api/v1"
-USERNAME = "admin"
-PASSWORD = "admin123"
-
-def login():
-    """Login and get JWT token"""
-    print("Step 1: Logging in...")
-    response = requests.post(
-        f"{API_BASE}/auth/login",
-        json={"username": USERNAME, "password": PASSWORD}
-    )
-    response.raise_for_status()
-
-    data = response.json()
-    token = data["access_token"]
-    print(f"✓ Login successful. Token expires in: {data['expires_in']} seconds ({data['expires_in']//3600} hours)")
-    return token
-
-def upload_file(token, file_path):
-    """Upload file and create batch"""
-    print(f"\nStep 2: Uploading file: {file_path.name}...")
-    with open(file_path, 'rb') as f:
-        files = {'files': (file_path.name, f, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')}
-        response = requests.post(
-            f"{API_BASE}/upload",
-            headers={"Authorization": f"Bearer {token}"},
-            files=files,
-            data={"batch_name": "Office Document Test"}
-        )
-    response.raise_for_status()
-    result = response.json()
-    print(f"✓ File uploaded and batch created:")
-    print(f"  Batch ID: {result['id']}")
-    print(f"  Total files: {result['total_files']}")
-    print(f"  Status: {result['status']}")
-    return result['id']
-
-def trigger_ocr(token, batch_id):
-    """Trigger OCR processing"""
-    print(f"\nStep 3: Triggering OCR processing...")
-    response = requests.post(
-        f"{API_BASE}/ocr/process",
-        headers={"Authorization": f"Bearer {token}"},
-        json={
-            "batch_id": batch_id,
-            "lang": "ch",
-            "detect_layout": True
-        }
-    )
-    response.raise_for_status()
-    result = response.json()
-    print(f"✓ OCR processing started")
-    print(f"  Message: {result['message']}")
-    print(f"  Total files: {result['total_files']}")
-
-def check_status(token, batch_id):
-    """Check processing status"""
-    print(f"\nStep 4: Checking processing status...")
-    max_wait = 120  # 120 seconds max
-    waited = 0
-
-    while waited < max_wait:
-        response = requests.get(
-            f"{API_BASE}/batch/{batch_id}/status",
-            headers={"Authorization": f"Bearer {token}"}
-        )
-        response.raise_for_status()
-        data = response.json()
-
-        batch_status = data['batch']['status']
-        progress = data['batch']['progress_percentage']
-        file_status = data['files'][0]['status']
-
-        print(f"  Batch status: {batch_status}, Progress: {progress}%, File status: {file_status}")
-
-        if batch_status == 'completed':
-            print(f"\n✓ Processing completed!")
-            file_data = data['files'][0]
-            if 'processing_time' in file_data:
-                print(f"  Processing time: {file_data['processing_time']:.2f} seconds")
-            return data
-        elif batch_status == 'failed':
-            print(f"\n✗ Processing failed!")
-            print(f"  Error: {data['files'][0].get('error_message', 'Unknown error')}")
-            return data
-
-        time.sleep(5)
-        waited += 5
-
-    print(f"\n⚠ Timeout waiting for processing (waited {waited}s)")
-    return None
-
-def get_result(token, file_id):
-    """Get OCR result"""
-    print(f"\nStep 5: Getting OCR result...")
-    response = requests.get(
-        f"{API_BASE}/ocr/result/{file_id}",
-        headers={"Authorization": f"Bearer {token}"}
-    )
-    response.raise_for_status()
-    data = response.json()
-
-    file_info = data['file']
-    result = data.get('result')
-
-    print(f"✓ OCR Result retrieved:")
-    print(f"  File: {file_info['original_filename']}")
-    print(f"  Status: {file_info['status']}")
-
-    if result:
-        print(f"  Language: {result.get('detected_language', 'N/A')}")
-        print(f"  Total text regions: {result.get('total_text_regions', 0)}")
-        print(f"  Average confidence: {result.get('average_confidence', 0):.2%}")
-
-        # Read markdown file if available
-        if result.get('markdown_path'):
-            try:
-                with open(result['markdown_path'], 'r', encoding='utf-8') as f:
-                    markdown_content = f.read()
-                print(f"\n  Markdown preview (first 300 chars):")
-                print(f"  {'-'*60}")
-                print(f"  {markdown_content[:300]}...")
-                print(f"  {'-'*60}")
-            except Exception as e:
-                print(f"  Could not read markdown file: {e}")
-    else:
-        print(f"  No OCR result available yet")
-
-    return data
-
-def main():
-    try:
-        # Test file
-        test_file = Path('/Users/egg/Projects/Tool_OCR/demo_docs/office_tests/test_document.docx')
-
-        if not test_file.exists():
-            print(f"✗ Test file not found: {test_file}")
-            return
-
-        print("="*70)
-        print("Office Document Processing Test")
-        print("="*70)
-        print(f"Test file: {test_file.name} ({test_file.stat().st_size} bytes)")
-        print("="*70)
-
-        # Run test
-        token = login()
-        batch_id = upload_file(token, test_file)
-        trigger_ocr(token, batch_id)
-        status_data = check_status(token, batch_id)
-
-        if status_data and status_data['batch']['status'] == 'completed':
-            file_id = status_data['files'][0]['id']
-            result = get_result(token, file_id)
-            print("\n" + "="*70)
-            print("✓ TEST PASSED: Office document processing successful!")
-            print("="*70)
-        else:
-            print("\n" + "="*70)
-            print("✗ TEST FAILED: Processing did not complete successfully")
-            print("="*70)
-
-    except Exception as e:
-        print(f"\n✗ TEST ERROR: {str(e)}")
-        import traceback
-        traceback.print_exc()
-
-if __name__ == "__main__":
-    main()