From 08adf3d01d8beadf3c186f6a111c2181118181d7 Mon Sep 17 00:00:00 2001 From: egg Date: Wed, 3 Dec 2025 10:10:28 +0800 Subject: [PATCH] feat: add translated PDF format selection (layout/reflow) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add generate_translated_layout_pdf() method for layout-preserving translated PDFs - Add generate_translated_pdf() method for reflow translated PDFs - Update translate router to accept format parameter (layout/reflow) - Update frontend with dropdown to select translated PDF format - Fix reflow PDF table cell extraction from content dict - Add embedded images handling in reflow PDF tables - Archive improve-translated-text-fitting openspec proposal 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- README.md | 348 ++----- backend/app/routers/tasks.py | 54 +- backend/app/routers/translate.py | 43 +- backend/app/services/pdf_generator_service.py | 708 +++++++++++++- docs/API.md | 919 ++---------------- docs/architecture-overview.md | 1 + docs/commit-history-report.md | 31 - docs/project-risk-assessment.md | 24 - frontend/src/pages/TaskDetailPage.tsx | 67 +- frontend/src/services/apiV2.ts | 24 +- .../improve-translated-text-fitting/design.md | 167 ++++ .../proposal.md | 41 + .../specs/result-export/spec.md | 137 +++ .../improve-translated-text-fitting/tasks.md | 30 + requirements.txt | 12 +- 15 files changed, 1384 insertions(+), 1222 deletions(-) delete mode 100644 docs/commit-history-report.md delete mode 100644 docs/project-risk-assessment.md create mode 100644 openspec/changes/improve-translated-text-fitting/design.md create mode 100644 openspec/changes/improve-translated-text-fitting/proposal.md create mode 100644 openspec/changes/improve-translated-text-fitting/specs/result-export/spec.md create mode 100644 openspec/changes/improve-translated-text-fitting/tasks.md diff --git a/README.md b/README.md index 5f47c8b..398fb7a 100644 --- a/README.md +++ b/README.md @@ -1,270 +1,82 @@ # Tool_OCR -**OCR Batch Processing System with Structure Extraction** - -A web-based solution to extract text, images, and document structure from multiple files efficiently using PaddleOCR-VL. - -## Features - -- 🔍 **Multi-Language OCR**: Support for 109 languages (Chinese, English, Japanese, Korean, etc.) 
-- 📄 **Document Structure Analysis**: Intelligent layout analysis with PP-StructureV3 -- 🖼️ **Image Extraction**: Preserve document images alongside text content -- 📑 **Batch Processing**: Process multiple files concurrently with progress tracking -- 📤 **Multiple Export Formats**: TXT, JSON, Excel, Markdown with images, searchable PDF -- 📋 **Office Documents**: DOC, DOCX, PPT, PPTX support via LibreOffice conversion -- 🚀 **GPU Acceleration**: Automatic CUDA GPU detection with graceful CPU fallback -- 🔧 **Flexible Configuration**: Rule-based output formatting -- 🌐 **Translation Ready**: Reserved architecture for future translation features - -## Tech Stack - -### Backend -- **Framework**: FastAPI 0.115.0 -- **OCR Engine**: PaddleOCR 3.0+ with PaddleOCR-VL and PP-StructureV3 -- **Deep Learning**: PaddlePaddle 3.2.1+ (GPU/CPU support) -- **Database**: MySQL via SQLAlchemy -- **PDF Generation**: Pandoc + WeasyPrint -- **Image Processing**: OpenCV, Pillow, pdf2image -- **Office Conversion**: LibreOffice (headless mode) - -### Frontend -- **Framework**: React 19 with TypeScript -- **Build Tool**: Vite 7 -- **Styling**: Tailwind CSS v4 + shadcn/ui -- **State Management**: React Query + Zustand -- **HTTP Client**: Axios - -## Prerequisites - -- **OS**: WSL2 Ubuntu 24.04 -- **Python**: 3.12+ -- **Node.js**: 24.x LTS -- **MySQL**: External database server (provided) -- **GPU** (Optional): NVIDIA GPU with CUDA 11.8+ for hardware acceleration - - PaddlePaddle 3.2.1+ requires CUDA 11.8, 12.3, or 12.6+ - - WSL2 users: Ensure NVIDIA CUDA drivers are installed - -## Quick Start - -### 1. Automated Setup (Recommended) - -```bash -# Run automated setup script -./setup_dev_env.sh -``` - -This script automatically: -- Detects NVIDIA GPU and CUDA version (if available) -- Installs Python development tools (pip, venv, build-essential) -- Installs system dependencies (pandoc, LibreOffice, fonts, etc.) -- Installs Node.js (via nvm) -- Installs PaddlePaddle 3.2.1+ GPU version (if GPU detected) or CPU version -- Configures WSL CUDA library paths (for WSL2 GPU users) -- Installs other Python packages (PaddleOCR, PaddleX, etc.) -- Installs frontend dependencies -- Verifies GPU functionality and chart recognition API availability - -### 2. Initialize Database - -```bash -source venv/bin/activate -cd backend -alembic upgrade head -python create_test_user.py -cd .. -``` - -Default test user: -- Username: `admin` -- Password: `admin123` - -### 3. Start Development Servers - -**Backend (Terminal 1):** -```bash -./start_backend.sh -``` - -**Frontend (Terminal 2):** -```bash -./start_frontend.sh -``` - -### 4. 
Access Application - -- **Frontend**: http://localhost:5173 -- **API Docs**: http://localhost:8000/docs -- **Health Check**: http://localhost:8000/health - -## Project Structure - -``` -Tool_OCR/ -├── backend/ # FastAPI backend -│ ├── app/ -│ │ ├── api/v1/ # API endpoints -│ │ ├── core/ # Configuration, database -│ │ ├── models/ # Database models -│ │ ├── services/ # Business logic -│ │ └── main.py # Application entry point -│ ├── alembic/ # Database migrations -│ └── tests/ # Test suite -├── frontend/ # React frontend -│ ├── src/ -│ │ ├── components/ # UI components -│ │ ├── pages/ # Page components -│ │ ├── services/ # API services -│ │ └── stores/ # State management -│ └── public/ # Static assets -├── .env.local # Local development config -├── setup_dev_env.sh # Environment setup script -├── start_backend.sh # Backend startup script -└── start_frontend.sh # Frontend startup script -``` - -## Configuration - -Main config file: `.env.local` - -```bash -# Database -MYSQL_HOST=mysql.theaken.com -MYSQL_PORT=33306 - -# Application ports -BACKEND_PORT=8000 -FRONTEND_PORT=5173 - -# Token expiration (minutes) -ACCESS_TOKEN_EXPIRE_MINUTES=1440 # 24 hours - -# Supported file formats -ALLOWED_EXTENSIONS=png,jpg,jpeg,pdf,bmp,tiff,doc,docx,ppt,pptx - -# OCR settings -OCR_LANGUAGES=ch,en,japan,korean -MAX_OCR_WORKERS=4 - -# GPU acceleration (optional) -FORCE_CPU_MODE=false # Set to true to disable GPU even if available -GPU_MEMORY_FRACTION=0.8 # Fraction of GPU memory to use (0.0-1.0) -GPU_DEVICE_ID=0 # GPU device ID to use (0 for primary GPU) -``` - -### GPU Acceleration - -The system automatically detects and utilizes NVIDIA GPU hardware when available: - -- **Auto-detection**: Setup script detects GPU and installs appropriate PaddlePaddle version -- **Graceful fallback**: If GPU is unavailable or fails, system automatically uses CPU mode -- **Performance**: GPU acceleration provides 3-10x speedup for OCR processing -- **Configuration**: Control GPU usage via `.env.local` environment variables -- **WSL2 CUDA Setup**: For WSL2 users, CUDA library paths are automatically configured in `~/.bashrc` - -**Chart Recognition**: Requires PaddlePaddle 3.2.0+ for full PP-StructureV3 chart recognition capabilities (chart type detection, data extraction, axis/legend parsing). The setup script installs PaddlePaddle 3.2.1+ which includes all required APIs. - -Check GPU status and chart recognition availability at: http://localhost:8000/health - -## API Endpoints - -### Authentication -- `POST /api/v1/auth/login` - User login - -### File Management -- `POST /api/v1/upload` - Upload files -- `POST /api/v1/ocr/process` - Start OCR processing -- `GET /api/v1/batch/{id}/status` - Get batch status - -### Results & Export -- `GET /api/v1/ocr/result/{id}` - Get OCR result -- `GET /api/v1/export/pdf/{id}` - Export as PDF - -Full API documentation: http://localhost:8000/docs - -## Supported File Formats - -- **Images**: PNG, JPG, JPEG, BMP, TIFF -- **Documents**: PDF -- **Office**: DOC, DOCX, PPT, PPTX - -Office files are automatically converted to PDF before OCR processing. 
-
-## Development
-
-### Backend
-
-```bash
-source venv/bin/activate
-cd backend
-
-# Run tests
-pytest
-
-# Database migration
-alembic revision --autogenerate -m "description"
-alembic upgrade head
-
-# Code formatting
-black app/
-```
-
-### Frontend
-
-```bash
-cd frontend
-
-# Development server
-npm run dev
-
-# Build for production
-npm run build
-
-# Lint code
-npm run lint
-```
-
-## OpenSpec Workflow
-
-This project follows OpenSpec for specification-driven development:
-
-```bash
-# View current changes
-openspec list
-
-# Validate specifications
-openspec validate add-ocr-batch-processing
-
-# View implementation tasks
-cat openspec/changes/add-ocr-batch-processing/tasks.md
-```
-
-## Roadmap
-
-- [x] **Phase 0**: Environment setup
-- [x] **Phase 1**: Core OCR backend (~98% complete)
-- [x] **Phase 2**: Frontend development (~92% complete)
-- [ ] **Phase 3**: Testing & optimization
-- [ ] **Phase 4**: Deployment automation
-- [ ] **Phase 5**: Translation feature (future)
-
-## Documentation
-
-- Development specs: [openspec/project.md](openspec/project.md)
-- Implementation status: [openspec/changes/add-ocr-batch-processing/STATUS.md](openspec/changes/add-ocr-batch-processing/STATUS.md)
-- Agent instructions: [openspec/AGENTS.md](openspec/AGENTS.md)
-
-## License
-
-Internal project use
-
-## Notes
-
-- First OCR run will download PaddleOCR models (~900MB)
-- Token expiration is set to 24 hours by default
-- Office conversion requires LibreOffice (installed via setup script)
-- Development environment: WSL2 Ubuntu 24.04 with Python venv
-- **GPU acceleration**: Automatically detected and enabled if NVIDIA GPU with CUDA 11.8+ is available
-- **PaddlePaddle version**: System uses PaddlePaddle 3.2.1+ which includes full chart recognition support
-- **WSL GPU support**: WSL2 CUDA library paths (`/usr/lib/wsl/lib`) are automatically configured in `~/.bashrc`
-- **Chart recognition**: Fully enabled with PP-StructureV3 for chart type detection, data extraction, and structure analysis
-- GPU status and chart recognition availability can be checked via `/health` API endpoint
+A multi-language batch OCR and layout-restoration tool. It offers a dual-track pipeline (direct extraction and deep OCR), PP-StructureV3 structure analysis, and JSON/Markdown/layout-preserving PDF export, with a React frontend for task tracking and downloads.
+
+## Highlights
+- Dual-track processing: DocumentTypeDetector picks Direct (PyMuPDF extraction) or OCR (PaddleOCR + PP-StructureV3), backfilling images via a hybrid pass when needed.
+- Unified output: both OCR and Direct results are converted into a UnifiedDocument, then exported as JSON/Markdown/layout-preserving PDF with metadata written back.
+- Resource control: OCRServicePool, MemoryGuard, and a prediction semaphore govern GPU/CPU load, with automatic model unloading and CPU fallback.
+- Tasks and permissions: JWT authentication, an external login API, task history/statistics, and admin audit routes.
+- Frontend experience: React + Vite + shadcn/ui with task polling, result preview, downloads, a settings page, and an admin panel.
+- Internationalization: a translation pipeline (translation_service) is reserved and can plug into Dify or offline models.
+
+## Architecture Overview
+- **Backend (FastAPI)**
+  - `app/main.py`: lifespan initializes the service pool, memory manager, CORS, and /health; upload endpoint `/api/v2/upload`.
+  - `routers/`: `auth.py` login, `tasks.py` task start/download/metadata, `admin.py` auditing, `translate.py` translated output.
+  - `services/`: `ocr_service.py` dual-track processing, `document_type_detector.py` track selection, `direct_extraction_engine.py` direct extraction, `pp_structure_enhanced.py` layout analysis, `ocr_to_unified_converter.py` and `unified_document_exporter.py` export, `pdf_generator_service.py` layout-preserving PDF, `service_pool.py`/`memory_manager.py` resource management.
+  - `models/`, `schemas/`: SQLAlchemy models and Pydantic schemas; `core/config.py` consolidates environment settings.
+- **Frontend (React 18 + Vite)**
+  - `src/pages`: Login, Upload, Processing, Results, Export, TaskHistory/TaskDetail, Settings, AdminDashboard, AuditLogs.
+  - `src/services`: API client + React Query; `src/store`: task/user state; `src/components`: shared UI.
+  - PDF preview uses react-pdf; i18n is managed in `src/i18n`.
+- **Processing Flow Summary** (a usage sketch follows this list)
+  1. `/api/v2/upload` stores the file under `backend/uploads` and creates a Task.
+  2. `/api/v2/tasks/{id}/start` triggers dual-track processing (optionally with `pp_structure_params`).
+  3. Direct/OCR produces a UnifiedDocument; `_result.json`, `_output.md`, and a layout-preserving PDF are exported to `backend/storage/results//`, and metadata is recorded in the DB.
+  4. `/api/v2/tasks/{id}/download/{json|markdown|pdf|unified}` and `/metadata` provide downloads and statistics.
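+
+A minimal end-to-end sketch of this flow in Python (illustrative only; the `admin`/`admin123` test account and the `demo_docs/` sample file are development defaults, so adjust them to your environment):
+
+```python
+import time
+import requests
+
+BASE = "http://localhost:8000/api/v2"
+
+# Login, then upload a sample file
+token = requests.post(f"{BASE}/auth/login",
+                      json={"username": "admin", "password": "admin123"}).json()["access_token"]
+headers = {"Authorization": f"Bearer {token}"}
+with open("demo_docs/edit.pdf", "rb") as f:
+    task_id = requests.post(f"{BASE}/upload", headers=headers, files={"file": f}).json()["task_id"]
+
+# Start dual-track processing, poll until done, then download the JSON export
+requests.post(f"{BASE}/tasks/{task_id}/start", headers=headers,
+              json={"use_dual_track": True, "language": "ch"})
+while requests.get(f"{BASE}/tasks/{task_id}", headers=headers).json()["status"] not in ("completed", "failed"):
+    time.sleep(2)
+with open("result.json", "wb") as out:
+    out.write(requests.get(f"{BASE}/tasks/{task_id}/download/json", headers=headers).content)
+```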
+
+## Repository Layout
+- `backend/app/`: FastAPI code (core, routers, services, schemas, models, main.py).
+- `backend/tests/`: test suites
+  - `api/` API mock/integration tests, `services/` core logic, `e2e/` requires a running backend and a test account, `performance/` measurements, `archived/` legacy cases.
+  - Test assets use the sample files in `demo_docs/` (gitignored, never uploaded).
+- `backend/uploads`, `backend/storage`, `backend/logs`, `backend/models/`: runtime input/output/model/log directories, created automatically at startup and pinned under the backend directory.
+- `frontend/`: React application code and configuration (vite.config.ts, eslint.config.js, etc.).
+- `docs/`: API/architecture/risk notes.
+- `openspec/`: specification files and change records.
+
+## Environment Setup
+- Requirements: Python 3.10+, Node 18+/20+, MySQL (or a compatible endpoint), optional NVIDIA GPU (CUDA 11.8+/12.x).
+- One-step script: `./setup_dev_env.sh` (supports `--cpu-only`, `--skip-db`).
+- Manual:
+  1. `python3 -m venv venv && source venv/bin/activate`
+  2. `pip install -r requirements.txt`
+  3. `cp .env.example .env.local` and fill in DB/auth/path settings (ports default to 8000/5173)
+  4. `cd frontend && npm install`
+
+## Development Startup
+- Backend (defaults to `BACKEND_PORT=8000` from `.env`; the config default is 12010, overridden by environment variables):
+  ```bash
+  source venv/bin/activate
+  cd backend
+  uvicorn app.main:app --reload --host 0.0.0.0 --port ${BACKEND_PORT:-8000}
+  # API docs: http://localhost:${BACKEND_PORT:-8000}/docs
+  ```
+  `Settings` normalizes the `uploads`/`storage`/`logs`/`models` paths under `backend/`, so stray folders are not created when running from a different working directory.
+- Frontend:
+  ```bash
+  cd frontend
+  npm run dev -- --host --port ${FRONTEND_PORT:-5173}
+  # http://localhost:${FRONTEND_PORT:-5173}
+  ```
+- Alternatively, manage background processes with `./start.sh backend|frontend|--stop|--status` (PIDs are kept in `.pid/`).
+
+## Testing
+- Unit/integration: `pytest backend/tests -m "not e2e"` (as needed).
+- API mock tests: `pytest backend/tests/api` (relies only on stubbed dependencies/SQLite).
+- E2E: start the backend and prepare a test account first; defaults to `http://localhost:8000/api/v2` and uses the sample files in `demo_docs/`.
+- Performance/archived cases: `backend/tests/performance` and `backend/tests/archived` can be run selectively.
+
+## Artifacts and Cleanup
+- Runtime inputs/outputs live in `backend/uploads`, `backend/storage/results|json|markdown|exports`, and `backend/logs`; model caches are in `backend/models/`.
+- Redundant `node_modules/`, `venv/`, the old `pp_demo/`, and sample uploads/outputs/logs have been removed. To clean up again:
+  ```bash
+  rm -rf backend/uploads/* backend/storage/results/* backend/logs/*.log .pytest_cache backend/.pytest_cache
+  ```
+  The directories are rebuilt automatically at startup.
+
+## Reference Docs
+- `docs/architecture-overview.md`: dual-track flow and component notes
+- `docs/API.md`: main API surface
+- `openspec/`: system specs and historical changes
diff --git a/backend/app/routers/tasks.py b/backend/app/routers/tasks.py
index e9b78c9..96f1e22 100644
--- a/backend/app/routers/tasks.py
+++ b/backend/app/routers/tasks.py
@@ -645,16 +645,22 @@ async def download_markdown(
 @router.get("/{task_id}/download/pdf", summary="Download PDF result")
 async def download_pdf(
     task_id: str,
+    format: Optional[str] = Query(
+        None,
+        description="PDF format: 'layout' (default) preserves original coordinates, 'reflow' provides flowing text with consistent font sizes"
+    ),
     db: Session = Depends(get_db),
     current_user: User = Depends(get_current_user)
 ):
     """
-    Download task result as layout-preserving PDF file
+    Download task result as PDF file
 
     - **task_id**: Task UUID
+    - **format**: Optional format parameter
+      - `layout` (default): Preserves original document layout and coordinates
+      - `reflow`: Flowing text with consistent font sizes for better readability
 
-    Returns a PDF that preserves the original document layout using OCR results.
-    The PDF is generated from OCR JSON data and cached for subsequent requests.
+ Returns a PDF generated from OCR JSON data. The PDF is cached for subsequent requests. """ from pathlib import Path from app.services.pdf_generator_service import pdf_generator_service @@ -679,12 +685,15 @@ async def download_pdf( detail="Task is not completed yet. Please wait for OCR processing to finish." ) - # Check if PDF path is stored in database - if task.result_pdf_path and Path(task.result_pdf_path).exists(): + # Determine format (default to layout) + use_reflow = format and format.lower() == "reflow" + + # Check if PDF path is stored in database (only for layout format, as reflow is always generated) + if not use_reflow and task.result_pdf_path and Path(task.result_pdf_path).exists(): pdf_path = Path(task.result_pdf_path) logger.info(f"Using pre-generated PDF from database: {pdf_path.name}") else: - # Fallback: Try to generate PDF on-demand + # Generate PDF on-demand result_dir = Path(settings.result_dir) / task_id # Use stored JSON path or construct it @@ -700,13 +709,14 @@ async def download_pdf( ) json_path = json_files[0] - # Construct PDF path based on JSON filename - pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf" + # Construct PDF path based on JSON filename and format + format_suffix = "_reflow" if use_reflow else "_layout" + pdf_filename = json_path.stem.replace("_result", format_suffix) + ".pdf" pdf_path = result_dir / pdf_filename # Generate PDF if it doesn't exist if not pdf_path.exists(): - logger.info(f"Generating layout-preserving PDF for task {task_id}") + logger.info(f"Generating {'reflow' if use_reflow else 'layout-preserving'} PDF for task {task_id}") # Get source file path if available source_file = None @@ -714,12 +724,20 @@ async def download_pdf( if task_file and task_file.stored_path and Path(task_file.stored_path).exists(): source_file = Path(task_file.stored_path) - # Generate PDF - success = pdf_generator_service.generate_layout_pdf( - json_path=json_path, - output_path=pdf_path, - source_file_path=source_file - ) + # Generate PDF based on format + if use_reflow: + # For reflow, pass result_dir as source_file_path (contains extracted images) + success = pdf_generator_service.generate_reflow_pdf( + json_path=json_path, + output_path=pdf_path, + source_file_path=result_dir + ) + else: + success = pdf_generator_service.generate_layout_pdf( + json_path=json_path, + output_path=pdf_path, + source_file_path=source_file + ) if not success: raise HTTPException( @@ -743,8 +761,10 @@ async def download_pdf( detail=error_msg ) - # Return file - filename = f"{task.filename or task_id}_result.pdf" + # Return file with format indication in filename + base_name = task.filename or task_id + format_suffix = "_reflow" if use_reflow else "_layout" + filename = f"{base_name}{format_suffix}.pdf" return FileResponse( path=str(pdf_path), filename=filename, diff --git a/backend/app/routers/translate.py b/backend/app/routers/translate.py index 69e75b2..a9df2c6 100644 --- a/backend/app/routers/translate.py +++ b/backend/app/routers/translate.py @@ -507,16 +507,18 @@ async def delete_translation( async def download_translated_pdf( task_id: str, lang: str = Query(..., description="Target language code"), + format: str = Query("reflow", description="PDF format: 'layout' or 'reflow'"), db: Session = Depends(get_db), current_user: User = Depends(get_current_user) ): """ - Download a translated PDF with layout preservation. + Download a translated PDF. 
- **task_id**: Task UUID - **lang**: Target language code (e.g., 'en', 'ja') + - **format**: PDF format - 'layout' (preserves positions with text wrapping) or 'reflow' (flowing layout) - Returns PDF file with translated content preserving original layout. + Returns PDF file with translated content. """ from app.services.pdf_generator_service import pdf_generator_service from app.services.translation_service import list_available_translations @@ -587,26 +589,37 @@ async def download_translated_pdf( detail="Invalid translation file format" ) + # Validate format parameter + use_layout = format.lower() == 'layout' + # Generate translated PDF to temp file - output_filename = f"{task_id}_translated_{lang}.pdf" + format_suffix = '_layout' if use_layout else '_reflow' + output_filename = f"{task_id}_translated_{lang}{format_suffix}.pdf" with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file: output_path = Path(tmp_file.name) try: - # Get source file path for images if available - source_file_path = None - if task.files and len(task.files) > 0: - stored_path = task.files[0].stored_path - if stored_path and Path(stored_path).exists(): - source_file_path = Path(stored_path) + # Use result_dir as image source (contains extracted images) + image_dir = result_json_path.parent - success = pdf_generator_service.generate_translated_pdf( - result_json_path=result_json_path, - translation_json_path=translation_file, - output_path=output_path, - source_file_path=source_file_path - ) + # Choose PDF generation method based on format + if use_layout: + # Layout mode: preserve original positions with text wrapping + success = pdf_generator_service.generate_translated_layout_pdf( + result_json_path=result_json_path, + translation_json_path=translation_file, + output_path=output_path, + source_file_path=image_dir + ) + else: + # Reflow mode: flowing layout + success = pdf_generator_service.generate_translated_pdf( + result_json_path=result_json_path, + translation_json_path=translation_file, + output_path=output_path, + source_file_path=image_dir + ) if not success: raise HTTPException( diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 90b5314..402c2a4 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -15,7 +15,8 @@ from reportlab.lib.units import mm from reportlab.pdfgen import canvas from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont -from reportlab.platypus import Table, TableStyle +from reportlab.platypus import Table, TableStyle, SimpleDocTemplate, Spacer +from reportlab.platypus import Image as PlatypusImage from reportlab.lib import colors from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT from reportlab.platypus import Paragraph @@ -3601,6 +3602,387 @@ class PDFGeneratorService: except Exception as e: logger.error(f"Failed to draw image element {element.element_id}: {e}") + # ============================================================================ + # Reflow Layout PDF Generation + # ============================================================================ + + def _get_elements_in_reading_order(self, page_data: Dict) -> List[Dict]: + """ + Get elements sorted by reading order. 
+ + For OCR track: Uses explicit 'reading_order' array from JSON + For Direct track: Uses implicit element list order (PyMuPDF sort=True) + + Args: + page_data: Page dictionary containing 'elements' and optionally 'reading_order' + + Returns: + List of elements in proper reading order + """ + elements = page_data.get('elements', []) + reading_order = page_data.get('reading_order') + + if reading_order and isinstance(reading_order, list): + # OCR track: use explicit reading order + ordered = [] + for idx in reading_order: + if isinstance(idx, int) and 0 <= idx < len(elements): + ordered.append(elements[idx]) + # Add any elements not in reading_order at the end + ordered_indices = set(reading_order) + for i, elem in enumerate(elements): + if i not in ordered_indices: + ordered.append(elem) + return ordered + else: + # Direct track: elements already in reading order from PyMuPDF + return elements + + def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]: + """Create consistent styles for reflow PDF generation.""" + base_styles = getSampleStyleSheet() + font_name = self.font_name if self.font_registered else 'Helvetica' + + styles = { + 'Title': ParagraphStyle( + 'ReflowTitle', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=18, + leading=22, + spaceAfter=12, + textColor=colors.black, + ), + 'Heading1': ParagraphStyle( + 'ReflowH1', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=16, + leading=20, + spaceAfter=10, + spaceBefore=12, + textColor=colors.black, + ), + 'Heading2': ParagraphStyle( + 'ReflowH2', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=14, + leading=18, + spaceAfter=8, + spaceBefore=10, + textColor=colors.black, + ), + 'Body': ParagraphStyle( + 'ReflowBody', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=12, + leading=16, + spaceAfter=6, + textColor=colors.black, + ), + 'TableCell': ParagraphStyle( + 'ReflowTableCell', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=10, + leading=13, + textColor=colors.black, + ), + 'Caption': ParagraphStyle( + 'ReflowCaption', + parent=base_styles['Normal'], + fontName=font_name, + fontSize=10, + leading=13, + spaceAfter=8, + textColor=colors.gray, + ), + } + return styles + + def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]: + """ + Create a Platypus Table for reflow mode. 
+
+        Args:
+            table_data: Table element dictionary with 'rows' or 'cells'
+            styles: Style dictionary
+
+        Returns:
+            Platypus Table object or None
+        """
+        try:
+            # Get content - cells might be inside 'content' dict
+            content = table_data.get('content', {})
+            if isinstance(content, dict):
+                rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else []
+                cells = content.get('cells', [])
+            else:
+                rows_data = table_data.get('rows', [])
+                cells = table_data.get('cells', [])
+
+            if not rows_data and cells:
+                # Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
+                row_map = {}
+                for cell in cells:
+                    row_idx = cell.get('row', cell.get('row_index', 0))
+                    if row_idx not in row_map:
+                        row_map[row_idx] = []
+                    row_map[row_idx].append(cell)
+                # Sort and create rows
+                rows_data = []
+                for row_idx in sorted(row_map.keys()):
+                    row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
+                    rows_data.append({'cells': row_cells})
+
+            if not rows_data:
+                return None
+
+            # Build table data
+            data = []
+            for row in rows_data:
+                row_data = []
+                row_cells = row.get('cells', [])
+                for cell in row_cells:
+                    # Support both 'text' and 'content' keys
+                    text = cell.get('text', cell.get('content', ''))
+                    if not isinstance(text, str):
+                        text = str(text) if text else ''
+                    # Escape HTML special characters
+                    text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                    row_data.append(Paragraph(text, styles['TableCell']))
+                if row_data:
+                    data.append(row_data)
+
+            if not data:
+                return None
+
+            # Create table
+            table = Table(data)
+            table.setStyle(TableStyle([
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
+                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
+                ('LEFTPADDING', (0, 0), (-1, -1), 6),
+                ('RIGHTPADDING', (0, 0), (-1, -1), 6),
+                ('TOPPADDING', (0, 0), (-1, -1), 4),
+                ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
+                ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row
+            ]))
+            return table
+
+        except Exception as e:
+            logger.error(f"Failed to create reflow table: {e}")
+            return None
+
+    def _embed_image_reflow(
+        self,
+        element: Dict,
+        result_dir: Path,
+        max_width: float = 450
+    ) -> Optional[PlatypusImage]:
+        """
+        Embed an image for reflow mode.
+
+        Args:
+            element: Image element dictionary
+            result_dir: Directory containing images
+            max_width: Maximum width in points
+
+        Returns:
+            Platypus Image object or None
+        """
+        try:
+            # Get image path - check multiple possible locations
+            img_path_str = element.get('image_path', element.get('path', ''))
+
+            # Also check content.saved_path (Direct track format)
+            if not img_path_str:
+                content = element.get('content', {})
+                if isinstance(content, dict):
+                    img_path_str = content.get('saved_path', content.get('path', ''))
+
+            if not img_path_str:
+                return None
+
+            img_path = result_dir / img_path_str
+            if not img_path.exists():
+                # Try just the filename
+                img_path = result_dir / Path(img_path_str).name
+                if not img_path.exists():
+                    logger.warning(f"Image not found for reflow: {img_path_str}")
+                    return None
+
+            # Create Platypus Image
+            img = PlatypusImage(str(img_path))
+
+            # Scale to fit page width if necessary
+            if img.drawWidth > max_width:
+                ratio = max_width / img.drawWidth
+                img.drawWidth = max_width
+                img.drawHeight *= ratio
+
+            return img
+
+        except Exception as e:
+            logger.error(f"Failed to embed image for reflow: {e}")
+            return None
+
+    def generate_reflow_pdf(
+        self,
+        json_path: Path,
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate reflow layout PDF from OCR/Direct JSON data.
+
+        This creates a flowing document with consistent font sizes,
+        proper reading order, and inline tables/images.
+
+        Args:
+            json_path: Path to result JSON file (UnifiedDocument format)
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file (for images)
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            # Load JSON data
+            logger.info(f"Generating reflow PDF from: {json_path}")
+            with open(json_path, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+
+            # Get styles
+            styles = self._get_reflow_styles()
+
+            # Build document content
+            story = []
+            # Use source_file_path if provided (for translated PDFs where JSON is in temp dir)
+            # Otherwise use json_path.parent (for regular reflow PDFs)
+            if source_file_path and source_file_path.is_dir():
+                result_dir = source_file_path
+            elif source_file_path and source_file_path.is_file():
+                result_dir = source_file_path.parent
+            else:
+                result_dir = json_path.parent
+
+            # Process each page
+            pages = json_data.get('pages', [])
+            for page_idx, page_data in enumerate(pages):
+                if page_idx > 0:
+                    # Add vertical space between pages (a soft separator, not a true page break)
+                    story.append(Spacer(1, 30))
+
+                # Get elements in reading order
+                elements = self._get_elements_in_reading_order(page_data)
+
+                for elem in elements:
+                    elem_type = elem.get('type', elem.get('element_type', 'text'))
+                    content = elem.get('content', elem.get('text', ''))
+
+                    # Types that can have dict content (handled specially)
+                    dict_content_types = ('table', 'Table', 'image', 'figure', 'Image', 'Figure', 'chart', 'Chart')
+
+                    # Ensure content is a string for text elements
+                    if isinstance(content, dict):
+                        # Tables, images, charts have dict content - handled by their respective methods
+                        if elem_type not in dict_content_types:
+                            # Skip other elements with dict content
+                            continue
+                    elif not isinstance(content, str):
+                        content = str(content) if content else ''
+
+                    if elem_type in ('table', 'Table'):
+                        # Handle table
+                        table = self._create_reflow_table(elem, styles)
+                        if table:
+                            story.append(table)
+                            story.append(Spacer(1, 12))
+
+                        # Handle embedded images in table (from metadata)
+                        metadata = elem.get('metadata', {})
+                        embedded_images = metadata.get('embedded_images', [])
+                        for emb_img in embedded_images:
+                            img_path_str = emb_img.get('saved_path', '')
+                            if img_path_str:
+                                img_path = result_dir / img_path_str
+                                if not img_path.exists():
+                                    img_path = result_dir / Path(img_path_str).name
+                                if img_path.exists():
+                                    try:
+                                        img = PlatypusImage(str(img_path))
+                                        # Scale to fit page width if necessary
+                                        max_width = 450
+                                        if img.drawWidth > max_width:
+                                            ratio = max_width / img.drawWidth
+                                            img.drawWidth = max_width
+                                            img.drawHeight *= ratio
+                                        story.append(img)
+                                        story.append(Spacer(1, 8))
+                                        logger.info(f"Embedded table image in reflow: {img_path.name}")
+                                    except Exception as e:
+                                        logger.warning(f"Failed to embed table image: {e}")
+
+                    elif elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
+                        # Handle image/chart
+                        img = self._embed_image_reflow(elem, result_dir)
+                        if img:
+                            story.append(img)
+                            story.append(Spacer(1, 8))
+
+                    elif elem_type in ('title', 'Title'):
+                        # Title text
+                        if content:
+                            content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                            story.append(Paragraph(content, styles['Title']))
+
+                    elif elem_type in ('section_header', 'SectionHeader', 'h1', 'H1'):
+                        # Heading 1
+                        if content:
+                            content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                            story.append(Paragraph(content, styles['Heading1']))
+
+                    elif elem_type in ('h2', 'H2', 'Heading2'):
+                        # Heading 2
+                        if content:
+                            content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                            story.append(Paragraph(content, styles['Heading2']))
+
+                    else:
+                        # Body text (default)
+                        if content:
+                            content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                            story.append(Paragraph(content, styles['Body']))
+
+            if not story:
+                logger.warning("No content to generate reflow PDF")
+                return False
+
+            # Create PDF document
+            doc = SimpleDocTemplate(
+                str(output_path),
+                pagesize=A4,
+                leftMargin=50,
+                rightMargin=50,
+                topMargin=50,
+                bottomMargin=50
+            )
+
+            # Build PDF
+            doc.build(story)
+
+            logger.info(f"Generated reflow PDF: {output_path} ({output_path.stat().st_size} bytes)")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to generate reflow PDF: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
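+
+    # Illustrative usage of the reflow generator (not invoked anywhere in this
+    # module; the paths below are hypothetical placeholders):
+    #   pdf_generator_service.generate_reflow_pdf(
+    #       json_path=Path("backend/storage/results/<task_id>/<name>_result.json"),
+    #       output_path=Path("backend/storage/results/<task_id>/<name>_reflow.pdf"),
+    #   )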
+
     def generate_translated_pdf(
         self,
         result_json_path: Path,
@@ -3609,7 +3991,7 @@ class PDFGeneratorService:
         source_file_path: Optional[Path] = None
     ) -> bool:
         """
-        Generate layout-preserving PDF with translated content.
+        Generate reflow layout PDF with translated content.
This method loads the original result JSON and translation JSON, merges them to replace original content with translations, and @@ -3660,7 +4042,7 @@ class PDFGeneratorService: f"target_lang={target_lang}" ) - # Write translated JSON to a temporary file and use existing generate_layout_pdf + # Write translated JSON to a temporary file and use reflow PDF generation with tempfile.NamedTemporaryFile( mode='w', suffix='_translated.json', @@ -3671,11 +4053,12 @@ class PDFGeneratorService: tmp_path = Path(tmp_file.name) try: - # Use existing PDF generation with translated content - success = self.generate_layout_pdf( + # Use reflow PDF generation for better translated content display + # Pass result_json_path.parent as image directory (not the temp file's parent) + success = self.generate_reflow_pdf( json_path=tmp_path, output_path=output_path, - source_file_path=source_file_path + source_file_path=result_json_path.parent # Contains extracted images ) return success finally: @@ -3695,6 +4078,319 @@ class PDFGeneratorService: traceback.print_exc() return False + def generate_translated_layout_pdf( + self, + result_json_path: Path, + translation_json_path: Path, + output_path: Path, + source_file_path: Optional[Path] = None + ) -> bool: + """ + Generate layout-preserving PDF with translated content. + + This method creates a PDF that maintains the original document layout + while displaying translated text. Key features: + - Text wraps within original bounding boxes (no font shrinking) + - Tables adapt to translated content + - Images and other elements remain at original positions + - Font size is kept readable (minimum 10pt) + + Args: + result_json_path: Path to original result JSON file + translation_json_path: Path to translation JSON file + output_path: Path to save generated translated PDF + source_file_path: Optional path for image directory + + Returns: + True if successful, False otherwise + """ + import tempfile + from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle + from reportlab.platypus import Paragraph, Frame + from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT + + try: + # Import apply_translations from translation service + from app.services.translation_service import apply_translations + + # Load original result JSON + logger.info(f"Loading result JSON for layout PDF: {result_json_path}") + with open(result_json_path, 'r', encoding='utf-8') as f: + result_json = json.load(f) + + # Load translation JSON + logger.info(f"Loading translation JSON: {translation_json_path}") + with open(translation_json_path, 'r', encoding='utf-8') as f: + translation_json = json.load(f) + + # Extract translations dict + translations = translation_json.get('translations', {}) + if not translations: + logger.warning("No translations found, falling back to original layout PDF") + return self.generate_layout_pdf( + json_path=result_json_path, + output_path=output_path, + source_file_path=source_file_path + ) + + # Apply translations to result JSON + translated_doc = apply_translations(result_json, translations) + + target_lang = translation_json.get('target_lang', 'unknown') + logger.info( + f"Generating translated layout PDF: {len(translations)} translations, " + f"target_lang={target_lang}" + ) + + # Determine image directory + if source_file_path and source_file_path.is_dir(): + image_dir = source_file_path + elif source_file_path and source_file_path.is_file(): + image_dir = source_file_path.parent + else: + image_dir = result_json_path.parent + + # Create PDF canvas + from 
reportlab.pdfgen import canvas
+
+            # Get page dimensions from first page
+            pages = translated_doc.get('pages', [])
+            if not pages:
+                logger.error("No pages in document")
+                return False
+
+            first_page = pages[0]
+            dims = first_page.get('dimensions', {})
+            page_width = dims.get('width', 595.32)
+            page_height = dims.get('height', 841.92)
+
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
+
+            # Create paragraph styles for text wrapping
+            base_style = ParagraphStyle(
+                'TranslatedBase',
+                fontName=self.font_name if self.font_registered else 'Helvetica',
+                fontSize=10,
+                leading=12,
+                wordWrap='CJK',  # Support CJK word wrapping
+            )
+
+            # Process each page
+            for page_idx, page_data in enumerate(pages):
+                logger.info(f"Processing translated layout page {page_idx + 1}/{len(pages)}")
+
+                # Get current page dimensions
+                dims = page_data.get('dimensions', {})
+                current_page_width = dims.get('width', page_width)
+                current_page_height = dims.get('height', page_height)
+
+                if page_idx > 0:
+                    pdf_canvas.showPage()
+
+                pdf_canvas.setPageSize((current_page_width, current_page_height))
+
+                # Process elements
+                elements = page_data.get('elements', [])
+                for elem in elements:
+                    elem_type = elem.get('type', 'text')
+                    content = elem.get('content', '')
+                    bbox = elem.get('bbox', {})
+
+                    if not bbox:
+                        continue
+
+                    x0 = bbox.get('x0', 0)
+                    y0 = bbox.get('y0', 0)
+                    x1 = bbox.get('x1', 0)
+                    y1 = bbox.get('y1', 0)
+                    box_width = x1 - x0
+                    box_height = y1 - y0
+
+                    if box_width <= 0 or box_height <= 0:
+                        continue
+
+                    # Handle different element types
+                    if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
+                        # Draw image
+                        img = self._embed_image_reflow(elem, image_dir)
+                        if img:
+                            # Convert to PDF coordinates
+                            pdf_y = current_page_height - y1
+                            # Scale image to fit bbox
+                            scale = min(box_width / img.drawWidth, box_height / img.drawHeight)
+                            img.drawWidth *= scale
+                            img.drawHeight *= scale
+                            img.drawOn(pdf_canvas, x0, pdf_y)
+
+                    elif elem_type in ('table', 'Table'):
+                        # Draw table with wrapping
+                        self._draw_translated_table(
+                            pdf_canvas, elem, current_page_height, image_dir
+                        )
+
+                    elif isinstance(content, str) and content.strip():
+                        # Text element - use Paragraph for word wrapping
+                        # Escape special characters
+                        safe_content = content.replace('&', '&amp;')
+                        safe_content = safe_content.replace('<', '&lt;')
+                        safe_content = safe_content.replace('>', '&gt;')
+                        # Replace newlines with <br/>
+                        safe_content = safe_content.replace('\n', '<br/>')
+
+                        # Calculate font size from bbox height, but keep minimum 10pt
+                        font_size = max(box_height * 0.7, 10)
+                        font_size = min(font_size, 24)  # Cap at 24pt
+
+                        # Create style for this element
+                        elem_style = ParagraphStyle(
+                            f'elem_{id(elem)}',
+                            parent=base_style,
+                            fontSize=font_size,
+                            leading=font_size * 1.2,
+                        )
+
+                        # Create paragraph
+                        para = Paragraph(safe_content, elem_style)
+
+                        # Calculate available width and height
+                        available_width = box_width
+                        available_height = box_height * 2  # Allow overflow
+
+                        # Wrap the paragraph
+                        para_width, para_height = para.wrap(available_width, available_height)
+
+                        # Convert to PDF coordinates (y from bottom)
+                        pdf_y = current_page_height - y0 - para_height
+
+                        # Draw the paragraph
+                        para.drawOn(pdf_canvas, x0, pdf_y)
+
+            # Save PDF
+            pdf_canvas.save()
+            logger.info(f"Translated layout PDF saved to {output_path}")
+            return True
+
+        except FileNotFoundError as e:
+            logger.error(f"File not found: {e}")
+            return False
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON: {e}")
+            return False
+        except Exception as e:
+            logger.error(f"Failed to generate translated layout PDF: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
+    def _draw_translated_table(
+        self,
+        pdf_canvas,
+        elem: Dict,
+        page_height: float,
+        image_dir: Path
+    ):
+        """
+        Draw a table with translated content using Platypus Table.
+
+        Supports adaptive column widths and text wrapping within cells.
+
+        Args:
+            pdf_canvas: ReportLab canvas
+            elem: Table element dict
+            page_height: Page height for coordinate transformation
+            image_dir: Directory containing images
+        """
+        from reportlab.platypus import Table, TableStyle, Paragraph
+        from reportlab.lib.styles import ParagraphStyle
+        from reportlab.lib import colors
+
+        try:
+            content = elem.get('content', {})
+            bbox = elem.get('bbox', {})
+
+            if not bbox:
+                return
+
+            x0 = bbox.get('x0', 0)
+            y0 = bbox.get('y0', 0)
+            x1 = bbox.get('x1', 0)
+            y1 = bbox.get('y1', 0)
+            table_width = x1 - x0
+            table_height = y1 - y0
+
+            # Parse table content
+            if isinstance(content, dict):
+                rows = content.get('rows', [])
+                cells = content.get('cells', [])
+            else:
+                return
+
+            if not rows and not cells:
+                return
+
+            # Build table data
+            table_data = []
+
+            if rows:
+                for row in rows:
+                    row_cells = row if isinstance(row, list) else row.get('cells', [])
+                    row_data = []
+                    for cell in row_cells:
+                        if isinstance(cell, str):
+                            cell_text = cell
+                        elif isinstance(cell, dict):
+                            cell_text = cell.get('content', cell.get('text', ''))
+                        else:
+                            cell_text = str(cell) if cell else ''
+
+                        # Create paragraph for text wrapping
+                        safe_text = str(cell_text).replace('&', '&amp;')
+                        safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
+
+                        cell_style = ParagraphStyle(
+                            f'cell_{id(cell)}',
+                            fontName=self.font_name if self.font_registered else 'Helvetica',
+                            fontSize=9,
+                            leading=11,
+                            wordWrap='CJK',
+                        )
+                        para = Paragraph(safe_text, cell_style)
+                        row_data.append(para)
+
+                    if row_data:
+                        table_data.append(row_data)
+
+            if not table_data:
+                return
+
+            # Calculate column widths
+            num_cols = max(len(row) for row in table_data) if table_data else 1
+            col_width = table_width / num_cols if num_cols > 0 else table_width
+
+            # Create table
+            table = Table(table_data, colWidths=[col_width] * num_cols)
+
+            # Apply table style
+            table.setStyle(TableStyle([
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
+                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
+                ('LEFTPADDING', (0, 0), (-1, -1), 4),
+                ('RIGHTPADDING', (0, 0), (-1, -1), 4),
+                ('TOPPADDING', (0, 0), (-1, -1), 2),
+ ('BOTTOMPADDING', (0, 0), (-1, -1), 2), + ])) + + # Wrap and draw table + t_width, t_height = table.wrap(table_width, table_height * 2) + + # Convert to PDF coordinates + pdf_y = page_height - y0 - t_height + + table.drawOn(pdf_canvas, x0, pdf_y) + + except Exception as e: + logger.error(f"Failed to draw translated table: {e}") + # Singleton instance pdf_generator_service = PDFGeneratorService() diff --git a/docs/API.md b/docs/API.md index b6f3efe..c56fe86 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1,842 +1,97 @@ -# Tool_OCR V2 API Documentation - -## Overview - -Tool_OCR V2 provides a comprehensive OCR service with dual-track document processing. The API supports intelligent routing between OCR track (for scanned documents) and Direct Extraction track (for editable PDFs and Office documents). - -**Base URL**: `http://localhost:8000/api/v2` - -**Authentication**: Bearer token (JWT) - ---- - -## Table of Contents - -1. [Authentication](#authentication) -2. [Task Management](#task-management) -3. [Document Processing](#document-processing) -4. [Document Analysis](#document-analysis) -5. [File Downloads](#file-downloads) -6. [Processing Tracks](#processing-tracks) -7. [Response Models](#response-models) -8. [Error Handling](#error-handling) - ---- - -## Authentication - -All endpoints require authentication via Bearer token. - -### Headers -```http -Authorization: Bearer -``` - -### Login -```http -POST /api/auth/login -Content-Type: application/json - -{ - "email": "user@example.com", - "password": "password123" -} -``` - -**Response**: -```json -{ - "access_token": "eyJhbGc...", - "token_type": "bearer", - "user": { - "id": 1, - "email": "user@example.com", - "username": "user" - } -} -``` - ---- - -## Task Management - -### Create Task - -Create a new OCR processing task by uploading a document. - -```http -POST /tasks/ -Content-Type: multipart/form-data -``` - -**Request Body**: -- `file` (required): Document file to process - - Supported formats: PDF, PNG, JPG, JPEG, GIF, BMP, TIFF, DOCX, PPTX, XLSX -- `language` (optional): OCR language code (default: 'ch') - - Options: 'ch', 'en', 'japan', 'korean', etc. -- `detect_layout` (optional): Enable layout detection (default: true) -- `force_track` (optional): Force specific processing track - - Options: 'ocr', 'direct', 'auto' (default: 'auto') - -**Response** `201 Created`: -```json -{ - "task_id": "550e8400-e29b-41d4-a716-446655440000", - "filename": "document.pdf", - "status": "pending", - "language": "ch", - "created_at": "2025-11-20T10:00:00Z" -} -``` - -**Processing Track Selection**: -- `auto` (default): Automatically select optimal track based on document analysis - - Editable PDFs → Direct track (faster, ~1-2s/page) - - Scanned documents/images → OCR track (slower, ~2-5s/page) - - Office documents → Convert to PDF, then route based on content -- `ocr`: Force OCR processing (PaddleOCR PP-StructureV3) -- `direct`: Force direct extraction (PyMuPDF) - only for editable PDFs - ---- - -### List Tasks - -Get a paginated list of user's tasks with filtering. 
- -```http -GET /tasks/?status={status}&filename={search}&skip={skip}&limit={limit} -``` - -**Query Parameters**: -- `status` (optional): Filter by task status - - Options: `pending`, `processing`, `completed`, `failed` -- `filename` (optional): Search by filename (partial match) -- `skip` (optional): Pagination offset (default: 0) -- `limit` (optional): Page size (default: 10, max: 100) - -**Response** `200 OK`: -```json -{ - "tasks": [ - { - "task_id": "550e8400-e29b-41d4-a716-446655440000", - "filename": "document.pdf", - "status": "completed", - "language": "ch", - "processing_track": "direct", - "processing_time": 1.14, - "created_at": "2025-11-20T10:00:00Z", - "completed_at": "2025-11-20T10:00:02Z" - } - ], - "total": 42, - "skip": 0, - "limit": 10 -} -``` - ---- - -### Get Task Details - -Retrieve detailed information about a specific task. - -```http -GET /tasks/{task_id} -``` - -**Response** `200 OK`: -```json -{ - "task_id": "550e8400-e29b-41d4-a716-446655440000", - "filename": "document.pdf", - "status": "completed", - "language": "ch", - "processing_track": "direct", - "document_type": "pdf_editable", - "processing_time": 1.14, - "page_count": 3, - "element_count": 51, - "character_count": 10592, - "confidence": 0.95, - "created_at": "2025-11-20T10:00:00Z", - "completed_at": "2025-11-20T10:00:02Z", - "result_files": { - "json": "/tasks/550e8400-e29b-41d4-a716-446655440000/download/json", - "markdown": "/tasks/550e8400-e29b-41d4-a716-446655440000/download/markdown", - "pdf": "/tasks/550e8400-e29b-41d4-a716-446655440000/download/pdf" - }, - "metadata": { - "file_size": 524288, - "mime_type": "application/pdf", - "text_coverage": 0.95, - "processing_track_reason": "PDF has extractable text on 100% of sampled pages" - } -} -``` - -**New Fields** (Dual-Track): -- `processing_track`: Track used for processing (`ocr`, `direct`, or `null`) -- `document_type`: Detected document type - - `pdf_editable`: Editable PDF with text - - `pdf_scanned`: Scanned/image-based PDF - - `pdf_mixed`: Mixed content PDF - - `image`: Image file - - `office_word`, `office_excel`, `office_ppt`: Office documents -- `page_count`: Number of pages extracted -- `element_count`: Total elements (text, tables, images) extracted -- `character_count`: Total characters extracted -- `metadata.text_coverage`: Percentage of pages with extractable text (0.0-1.0) -- `metadata.processing_track_reason`: Explanation of track selection - ---- - -### Get Task Statistics - -Get aggregated statistics for user's tasks. - -```http -GET /tasks/stats -``` - -**Response** `200 OK`: -```json -{ - "total_tasks": 150, - "by_status": { - "pending": 5, - "processing": 3, - "completed": 140, - "failed": 2 - }, - "by_processing_track": { - "ocr": 80, - "direct": 60, - "unknown": 10 - }, - "total_pages_processed": 4250, - "average_processing_time": 3.5, - "success_rate": 0.987 -} -``` - ---- - -### Delete Task - -Delete a task and all associated files. - -```http -DELETE /tasks/{task_id} -``` - -**Response** `204 No Content` - ---- - -## Document Processing - -### Processing Workflow - -1. **Upload Document** → `POST /tasks/` → Returns `task_id` -2. **Background Processing** → Task status changes to `processing` -3. **Complete** → Task status changes to `completed` or `failed` -4. 
**Download Results** → Use download endpoints - -### Track Selection Flow - -``` -Document Upload - ↓ -Document Type Detection - ↓ - ┌──────────────┐ - │ Auto Routing │ - └──────┬───────┘ - ↓ - ┌────┴─────┐ - ↓ ↓ - [Direct] [OCR] - ↓ ↓ - PyMuPDF PaddleOCR - ↓ ↓ - UnifiedDocument - ↓ - Export (JSON/MD/PDF) -``` - -**Direct Track** (Fast - 1-2s/page): -- Editable PDFs with extractable text -- Office documents (converted to text-based PDF) -- Uses PyMuPDF for direct text extraction -- Preserves exact layout and fonts - -**OCR Track** (Slower - 2-5s/page): -- Scanned PDFs and images -- Documents without extractable text -- Uses PaddleOCR PP-StructureV3 -- Handles complex layouts with 23 element types - ---- - -## Document Analysis - -### Analyze Document Type - -Analyze a document to determine optimal processing track **before** processing. - -**NEW ENDPOINT** - -```http -POST /tasks/{task_id}/analyze -``` - -**Response** `200 OK`: -```json -{ - "task_id": "550e8400-e29b-41d4-a716-446655440000", - "filename": "document.pdf", - "analysis": { - "recommended_track": "direct", - "confidence": 0.95, - "reason": "PDF has extractable text on 100% of sampled pages", - "document_type": "pdf_editable", - "metadata": { - "total_pages": 3, - "sampled_pages": 3, - "text_coverage": 1.0, - "mime_type": "application/pdf", - "file_size": 524288, - "page_details": [ - { - "page": 1, - "text_length": 3520, - "has_text": true, - "image_count": 2, - "image_coverage": 0.15 - } - ] - } - } -} -``` - -**Use Case**: -- Preview processing track before starting -- Validate document type for batch processing -- Provide user feedback on processing method - ---- - -### Get Processing Metadata - -Get detailed metadata about how a document was processed. - -**NEW ENDPOINT** - -```http -GET /tasks/{task_id}/metadata -``` - -**Response** `200 OK`: -```json -{ - "task_id": "550e8400-e29b-41d4-a716-446655440000", - "processing_track": "direct", - "document_type": "pdf_editable", - "confidence": 0.95, - "reason": "PDF has extractable text on 100% of sampled pages", - "statistics": { - "page_count": 3, - "element_count": 51, - "total_tables": 2, - "total_images": 3, - "element_type_counts": { - "text": 45, - "table": 2, - "image": 3, - "header": 1 - }, - "text_stats": { - "total_characters": 10592, - "total_words": 1842, - "average_confidence": 1.0 - } - }, - "processing_info": { - "processing_time": 1.14, - "track_description": "PyMuPDF Direct Extraction - Used for editable PDFs", - "schema_version": "1.0.0" - }, - "file_metadata": { - "filename": "document.pdf", - "file_size": 524288, - "mime_type": "application/pdf", - "created_at": "2025-11-20T10:00:00Z" - } -} -``` - ---- - -## File Downloads - -### Download JSON Result - -Download structured JSON output with full document structure. 
- -```http -GET /tasks/{task_id}/download/json -``` - -**Response** `200 OK`: -- Content-Type: `application/json` -- Content-Disposition: `attachment; filename="{filename}_result.json"` - -**JSON Structure**: -```json -{ - "schema_version": "1.0.0", - "document_id": "d8bea84d-a4ea-4455-b219-243624b5518e", - "export_timestamp": "2025-11-20T10:00:02Z", - "metadata": { - "filename": "document.pdf", - "file_type": ".pdf", - "file_size": 524288, - "created_at": "2025-11-20T10:00:00Z", - "processing_track": "direct", - "processing_time": 1.14, - "language": "ch", - "processing_info": { - "track_description": "PyMuPDF Direct Extraction", - "schema_version": "1.0.0", - "export_format": "unified_document_v1" - } - }, - "pages": [ - { - "page_number": 1, - "dimensions": { - "width": 595.32, - "height": 841.92 - }, - "elements": [ - { - "element_id": "text_1_0", - "type": "text", - "bbox": { - "x0": 72.0, - "y0": 72.0, - "x1": 200.0, - "y1": 90.0 - }, - "content": "Document Title", - "confidence": 1.0, - "style": { - "font": "Helvetica-Bold", - "size": 18.0 - } - } - ] - } - ], - "statistics": { - "page_count": 3, - "total_elements": 51, - "total_tables": 2, - "total_images": 3, - "element_type_counts": { - "text": 45, - "table": 2, - "image": 3, - "header": 1 - }, - "text_stats": { - "total_characters": 10592, - "total_words": 1842, - "average_confidence": 1.0 - } - } -} -``` - -**Element Types**: -- `text`: Text blocks -- `header`: Headers (H1-H6) -- `paragraph`: Paragraphs -- `list`: Lists -- `table`: Tables with cell structure -- `image`: Images with position -- `figure`: Figures with captions -- `footer`: Page footers - ---- - -### Download Markdown Result - -Download Markdown formatted output. - -```http -GET /tasks/{task_id}/download/markdown -``` - -**Response** `200 OK`: -- Content-Type: `text/markdown` -- Content-Disposition: `attachment; filename="{filename}_output.md"` - -**Example Output**: -```markdown -# Document Title - -This is the extracted content from the document. - -## Section 1 - -Content of section 1... - -| Column 1 | Column 2 | -|----------|----------| -| Data 1 | Data 2 | - -![Image](imgs/img_in_image_box_100_200_500_600.jpg) -``` - ---- - -### Download Layout-Preserving PDF - -Download reconstructed PDF with layout preservation. - -```http -GET /tasks/{task_id}/download/pdf -``` - -**Response** `200 OK`: -- Content-Type: `application/pdf` -- Content-Disposition: `attachment; filename="{filename}_layout.pdf"` - -**Features**: -- Preserves original layout and coordinates -- Maintains text positioning -- Includes extracted images -- Renders tables with proper structure - ---- - -## Processing Tracks - -### Track Comparison - -| Feature | OCR Track | Direct Track | -|---------|-----------|--------------| -| **Speed** | 2-5 seconds/page | 0.5-1 second/page | -| **Best For** | Scanned documents, images | Editable PDFs, Office docs | -| **Technology** | PaddleOCR PP-StructureV3 | PyMuPDF | -| **Accuracy** | 92-98% (content-dependent) | 100% (text is extracted, not recognized) | -| **Layout Preservation** | Good (23 element types) | Excellent (exact coordinates) | -| **GPU Required** | Yes (8GB recommended) | No | -| **Supported Formats** | PDF, PNG, JPG, TIFF, etc. 
| PDF (with text), converted Office docs | - -### Processing Track Enum - -```python -class ProcessingTrackEnum(str, Enum): - AUTO = "auto" # Automatic selection (default) - OCR = "ocr" # Force OCR processing - DIRECT = "direct" # Force direct extraction -``` - -### Document Type Enum - -```python -class DocumentType(str, Enum): - PDF_EDITABLE = "pdf_editable" # PDF with extractable text - PDF_SCANNED = "pdf_scanned" # Scanned/image-based PDF - PDF_MIXED = "pdf_mixed" # Mixed content PDF - IMAGE = "image" # Image files - OFFICE_WORD = "office_word" # Word documents - OFFICE_EXCEL = "office_excel" # Excel spreadsheets - OFFICE_POWERPOINT = "office_ppt" # PowerPoint presentations - TEXT = "text" # Plain text files - UNKNOWN = "unknown" # Unknown format -``` - ---- - -## Response Models - -### TaskResponse - -```typescript -interface TaskResponse { - task_id: string; - filename: string; - status: "pending" | "processing" | "completed" | "failed"; - language: string; - processing_track?: "ocr" | "direct" | null; - created_at: string; // ISO 8601 - completed_at?: string | null; -} -``` - -### TaskDetailResponse - -Extends `TaskResponse` with: -```typescript -interface TaskDetailResponse extends TaskResponse { - document_type?: string; - processing_time?: number; // seconds - page_count?: number; - element_count?: number; - character_count?: number; - confidence?: number; // 0.0-1.0 - result_files?: { - json?: string; - markdown?: string; - pdf?: string; - }; - metadata?: { - file_size?: number; - mime_type?: string; - text_coverage?: number; // 0.0-1.0 - processing_track_reason?: string; - [key: string]: any; - }; -} -``` - -### DocumentAnalysisResponse - -```typescript -interface DocumentAnalysisResponse { - task_id: string; - filename: string; - analysis: { - recommended_track: "ocr" | "direct"; - confidence: number; // 0.0-1.0 - reason: string; - document_type: string; - metadata: { - total_pages?: number; - sampled_pages?: number; - text_coverage?: number; - mime_type?: string; - file_size?: number; - page_details?: Array<{ - page: number; - text_length: number; - has_text: boolean; - image_count: number; - image_coverage: number; - }>; - }; - }; -} -``` - -### ProcessingMetadata - -```typescript -interface ProcessingMetadata { - task_id: string; - processing_track: "ocr" | "direct"; - document_type: string; - confidence: number; - reason: string; - statistics: { - page_count: number; - element_count: number; - total_tables: number; - total_images: number; - element_type_counts: { - [type: string]: number; - }; - text_stats: { - total_characters: number; - total_words: number; - average_confidence: number | null; - }; - }; - processing_info: { - processing_time: number; - track_description: string; - schema_version: string; - }; - file_metadata: { - filename: string; - file_size: number; - mime_type: string; - created_at: string; - }; -} -``` - ---- - -## Error Handling - -### HTTP Status Codes - -- `200 OK`: Successful request -- `201 Created`: Resource created successfully -- `204 No Content`: Successful deletion -- `400 Bad Request`: Invalid request parameters -- `401 Unauthorized`: Missing or invalid authentication -- `403 Forbidden`: Insufficient permissions -- `404 Not Found`: Resource not found -- `422 Unprocessable Entity`: Validation error -- `500 Internal Server Error`: Server error - -### Error Response Format - -```json -{ - "detail": "Error message describing the issue", - "error_code": "ERROR_CODE", - "timestamp": "2025-11-20T10:00:00Z" -} -``` - -### Common Errors - 
### Common Errors

**Invalid File Format**:
```json
{
  "detail": "Unsupported file format. Supported: PDF, PNG, JPG, DOCX, PPTX, XLSX",
  "error_code": "INVALID_FILE_FORMAT"
}
```

**Task Not Found**:
```json
{
  "detail": "Task not found or access denied",
  "error_code": "TASK_NOT_FOUND"
}
```

**Processing Failed**:
```json
{
  "detail": "OCR processing failed: GPU memory insufficient",
  "error_code": "PROCESSING_FAILED"
}
```

**File Too Large**:
```json
{
  "detail": "File size exceeds maximum limit of 50MB",
  "error_code": "FILE_TOO_LARGE"
}
```

---

## Usage Examples

### Example 1: Auto-Route Processing

Upload a document and let the system choose the optimal track:

+# Tool_OCR V2 API (current state)
+
+Base URL: `http://localhost:${BACKEND_PORT:-8000}/api/v2`
+Authentication: all business endpoints require a Bearer token (JWT).
+
+## Authentication
+- `POST /auth/login`: { username, password } → `access_token`, `expires_in`, `user`.
+- `POST /auth/logout`: optionally pass `session_id`; without it, all sessions are logged out.
+- `GET /auth/me`: current user information.
+- `GET /auth/sessions`: list active login sessions.
+- `POST /auth/refresh`: refresh the access token.
+
+## Task Workflow Summary
+1) Upload a file → `POST /upload` (multipart file) to obtain a `task_id`.
+2) Start processing → `POST /tasks/{task_id}/start` (ProcessingOptions controls dual track, force_track, layout/preprocessing/table detection).
+3) Poll status and metadata → `GET /tasks/{task_id}`, `/metadata`.
+4) Download results → `/download/json | /markdown | /pdf | /unified`.
+5) Advanced: call `/analyze` first to see the recommended track; `/preview/preprocessing` returns before/after preprocessing previews.
+
+## Core Endpoints
+- `POST /upload`
+  - Form field: `file` (required); the extension is validated against an allowlist.
+  - Returns: `task_id`, `filename`, `file_size`, `file_type`, `status` (pending).
+- `POST /tasks/`
+  - Creates task metadata only (no file); normally not needed.
+- `POST /tasks/{task_id}/start`
+  - Body `ProcessingOptions`: `use_dual_track` (default true), `force_track` (ocr|direct), `language` (default ch), `layout_model` (chinese|default|cdla), `preprocessing_mode` (auto|manual|disabled) plus `preprocessing_config`, `table_detection`.
+- `POST /tasks/{task_id}/cancel`, `POST /tasks/{task_id}/retry`.
+- `GET /tasks`
+  - Query parameters: `status` (pending|processing|completed|failed), `filename`, `date_from`/`date_to`, `page`, `page_size`, `order_by`, `order_desc`.
+- `GET /tasks/{task_id}`: details plus file paths, processing track, and statistics.
+- `GET /tasks/stats`: task statistics for the current user.
+- `POST /tasks/{task_id}/analyze`: pre-analyze the document and return the recommended track, confidence, document type, and sampling statistics.
+- `GET /tasks/{task_id}/metadata`: statistics and description of the processing result.
+- Downloads:
+  - `GET /tasks/{task_id}/download/json`
+  - `GET /tasks/{task_id}/download/markdown`
+  - `GET /tasks/{task_id}/download/pdf` (generated on the fly if no PDF exists yet)
+  - `GET /tasks/{task_id}/download/unified` (UnifiedDocument JSON)
+- Preprocessing preview:
+  - `POST /tasks/{task_id}/preview/preprocessing` (body: page/mode/config)
+  - `GET /tasks/{task_id}/preview/image?type=original|preprocessed&page=1`
+
+## Translation (requires completed OCR)
+Prefix: `/translate`
+- `POST /{task_id}`: start a translation; body `{ target_lang, source_lang }`; returns 202. If the translation already exists, it returns Completed immediately.
+- `GET /{task_id}/status`: translation progress.
+- `GET /{task_id}/result?lang=xx`: translated JSON.
+- `GET /{task_id}/translations`: list generated translations.
+- `DELETE /{task_id}/translations/{lang}`: delete a translation.
+- `POST /{task_id}/pdf?lang=xx`: download the translated layout-preserving PDF.
+
+## Admin (administrator required)
+Prefix: `/admin`
+- `GET /stats`: system-level statistics.
+- `GET /users`, `GET /users/top`.
+- `GET /audit-logs`, `GET /audit-logs/user/{user_id}/summary`.
+
+## Health Checks
+- `/health`: service status plus GPU/memory management information.
+- `/`: brief API entry-point description.
+
+## Response Structure Summary
+- Common task-response fields: `task_id`, `status`, `processing_track`, `document_type`, `processing_time_ms`, `page_count`, `element_count`, `file_size`, `mime_type`, `result_json_path`, etc.
+- All download endpoints respond with a file (Content-Disposition with a filename).
+- Error format: `{ "detail": "...", "error_code": "...", "timestamp": "..." }` (some errors carry only `detail`).
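Since `POST /translate/{task_id}` returns 202 and runs asynchronously, a client needs to poll before fetching results. A minimal polling sketch; the `status`/`completed`/`failed` field names and values are assumptions, since the payload of `GET /{task_id}/status` is not spelled out above:

```python
import time
import requests

BASE = "http://localhost:8000/api/v2"
HEADERS = {"Authorization": "Bearer <TOKEN>"}

def translate_and_wait(task_id: str, target_lang: str = "en",
                       poll_seconds: float = 2.0, timeout: float = 600.0) -> dict:
    """Start translation for a completed OCR task and poll until it finishes."""
    resp = requests.post(f"{BASE}/translate/{task_id}", headers=HEADERS,
                         json={"target_lang": target_lang, "source_lang": "auto"},
                         timeout=30)
    resp.raise_for_status()  # 202 Accepted, or Completed if it already exists
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = requests.get(f"{BASE}/translate/{task_id}/status",
                              headers=HEADERS, timeout=30).json()
        # "status", "completed", "failed" are assumed names, not documented above.
        if status.get("status") in ("completed", "failed"):
            return status
        time.sleep(poll_seconds)
    raise TimeoutError(f"translation for {task_id} did not finish in {timeout}s")
```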
+
+## Usage Examples
+Upload and start:
 ```bash
-# 1. Upload document
-curl -X POST "http://localhost:8000/api/v2/tasks/" \
+# Upload
+curl -X POST "http://localhost:8000/api/v2/upload" \
   -H "Authorization: Bearer $TOKEN" \
-  -F "file=@document.pdf" \
-  -F "language=ch"
+  -F "file=@demo_docs/edit.pdf"
 
-# Response: {"task_id": "550e8400..."}
-
-# 2. Check status
-curl -X GET "http://localhost:8000/api/v2/tasks/550e8400..." \
-  -H "Authorization: Bearer $TOKEN"
-
-# 3. Download results (when completed)
-curl -X GET "http://localhost:8000/api/v2/tasks/550e8400.../download/json" \
+# Start processing (force_track=ocr as an example)
+curl -X POST "http://localhost:8000/api/v2/tasks/$TASK_ID/start" \
   -H "Authorization: Bearer $TOKEN" \
-  -o result.json
+  -H "Content-Type: application/json" \
+  -d '{"force_track":"ocr","language":"ch"}'
+
+# Query and download
+curl -X GET "http://localhost:8000/api/v2/tasks/$TASK_ID/metadata" -H "Authorization: Bearer $TOKEN"
+curl -L "http://localhost:8000/api/v2/tasks/$TASK_ID/download/json" -H "Authorization: Bearer $TOKEN" -o result.json
 ```
 
-### Example 2: Analyze Before Processing
-
-Analyze document type before processing:
-
+Translate and download the translated PDF:
 ```bash
-# 1. Upload document
-curl -X POST "http://localhost:8000/api/v2/tasks/" \
+curl -X POST "http://localhost:8000/api/v2/translate/$TASK_ID" \
   -H "Authorization: Bearer $TOKEN" \
-  -F "file=@document.pdf"
+  -H "Content-Type: application/json" \
+  -d '{"target_lang":"en","source_lang":"auto"}'
 
-# Response: {"task_id": "550e8400..."}
-
-# 2. Analyze document (NEW)
-curl -X POST "http://localhost:8000/api/v2/tasks/550e8400.../analyze" \
-  -H "Authorization: Bearer $TOKEN"
-
-# Response shows recommended track and confidence
-
-# 3. Start processing (automatic based on analysis)
-# Processing happens in background after upload
+curl -X GET "http://localhost:8000/api/v2/translate/$TASK_ID/status" -H "Authorization: Bearer $TOKEN"
+curl -L "http://localhost:8000/api/v2/translate/$TASK_ID/pdf?lang=en" \
+  -H "Authorization: Bearer $TOKEN" -o translated.pdf
 ```
-
-### Example 3: Force Specific Track
-
-Force OCR processing for an editable PDF:
-
-```bash
-curl -X POST "http://localhost:8000/api/v2/tasks/" \
-  -H "Authorization: Bearer $TOKEN" \
-  -F "file=@document.pdf" \
-  -F "force_track=ocr"
-```
-
-### Example 4: Get Processing Metadata
-
-Get detailed processing information:
-
-```bash
-curl -X GET "http://localhost:8000/api/v2/tasks/550e8400.../metadata" \
-  -H "Authorization: Bearer $TOKEN"
-```
-
----
-
-## Version History
-
-### V2.0.0 (2025-11-20) - Dual-Track Processing
-
-**New Features**:
-- ✨ Dual-track processing (OCR + Direct Extraction)
-- ✨ Automatic document type detection
-- ✨ Office document support (Word, PowerPoint, Excel)
-- ✨ Processing track metadata
-- ✨ Enhanced layout analysis (23 element types)
-- ✨ GPU memory management
-
-**New Endpoints**:
-- `POST /tasks/{task_id}/analyze` - Analyze document type
-- `GET /tasks/{task_id}/metadata` - Get processing metadata
-
-**Enhanced Endpoints**:
-- `POST /tasks/` - Added `force_track` parameter
-- `GET /tasks/{task_id}` - Added `processing_track`, `document_type`, element counts
-- All download endpoints now include processing track information
-
-**Performance Improvements**:
-- 10x faster processing for editable PDFs (1-2s vs 10-20s per page)
-- Optimized GPU memory usage for RTX 4060 8GB
-- Office documents: 2-5s vs >300s (60x improvement)
-
----
-
-## Support
-
-For issues, questions, or feature requests:
-- GitHub Issues: https://github.com/your-repo/Tool_OCR/issues
-- Documentation: https://your-docs-site.com
-- API Status: http://localhost:8000/health
-
----
-
-*Generated by Tool_OCR V2.0.0 - Dual-Track Document Processing*
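The headline change in this patch lets clients choose between a layout-preserving and a reflow rendering of the translated PDF. A minimal sketch of how that might be called; passing `format` as a query parameter is an assumption inferred from the `downloadTranslatedPdf(taskId, lang, format)` signature in the frontend diff below, not a documented contract:

```python
import requests

BASE = "http://localhost:8000/api/v2"
HEADERS = {"Authorization": "Bearer <TOKEN>"}

def download_translated_pdf(task_id: str, lang: str = "en",
                            fmt: str = "reflow") -> bytes:
    """Fetch a translated PDF as either 'layout' or 'reflow'.

    The `format` query parameter is an assumption: this patch only states
    that the translate router accepts a layout/reflow choice.
    """
    resp = requests.post(f"{BASE}/translate/{task_id}/pdf",
                         params={"lang": lang, "format": fmt},
                         headers=HEADERS, timeout=120)
    resp.raise_for_status()
    return resp.content
```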
diff --git a/docs/architecture-overview.md b/docs/architecture-overview.md
index 2f33784..239b01e 100644
--- a/docs/architecture-overview.md
+++ b/docs/architecture-overview.md
@@ -10,6 +10,7 @@
 - **OCR parsing**: PaddleOCR + `PPStructureEnhanced` extract 23 element types; `OCRToUnifiedConverter` converts the result into the unified `UnifiedDocument` format.
 - **Export/presentation**: `UnifiedDocumentExporter` produces JSON/Markdown; `pdf_generator_service.py` generates the layout-preserving PDF; the frontend fetches results via `/api/v2/tasks/{id}/download/*`.
 - **Resource control**: `memory_manager.py` (MemoryGuard, prediction semaphore, model lifecycle) and `service_pool.py` (`OCRService` pool) prevent repeated model loading and GPU exhaustion.
+- **Translation and preview**: `translation_service` provides asynchronous translation for completed tasks (`/api/v2/translate/*`); `layout_preprocessing_service` provides preprocessing previews and quality metrics (`/preview/preprocessing` → `/preview/image`).
 
 ## Processing Flow (task level)
 1. **Upload**: `POST /api/v2/upload` creates the Task and writes the file to `uploads/` (with SHA256 and file info).
diff --git a/docs/commit-history-report.md b/docs/commit-history-report.md
deleted file mode 100644
index 723095a..0000000
--- a/docs/commit-history-report.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Tool_OCR Commit History Review (2025-11-12 ~ 2025-11-26)
-
-This report covers all 97 commits from `git log`: development context, milestones, test/quality signals, and follow-up risks. Commit-type counts: 35 `feat` / 37 `fix` / 9 `chore` / 5 `test` / 4 `docs` / 2 `refactor`, concentrated in the intensive development on 2025-11-18, 11-19, 11-20, and 11-24.
-
-## Timeline and Milestones
-- **Early foundation and frontend modernization (11-12~11-13)**: `21bc2f9`, `57cf912` moved the frontend to Tailwind v4 and a professional UI; `0f81d5e` single-container Dockerization; `d7e6473` WSL Ubuntu dev environment.
-- **GPU acceleration and compatibility (11-14)**: `6452797` proposal + `7536f43` GPU OCR implementation; `d80d60f`/`3694411`/`80c091b` fixed Paddle 3.x APIs and install sources; `b048f2d` paused chart recognition to avoid an API gap.
-- **External Auth V2 and admin backend (11-14~11-16)**: `28e419f`~`fd98018` delivered external authentication V2, table prefixes, and removal of the V1 architecture; `8f94191` added admin/audit/token checks; `90fca50`/`6bb5b76` brought all 18/18 tests to green.
-- **V2 UI integration and first layout-preserving PDF (11-16~11-18)**: frontend and backend fully switched to the V2 API (from `ad5c8be` on); `fa1abcd` added the layout-preserving PDF plus repeated coordinate/overlap fixes (`d33f605`~`0edc56b`); logging hardened (`d99d37d`).
-- **Dual-track processing architecture (11-18~11-20)**: `2d50c12` + `82139c8` introduced the OCR/Direct dual track and UnifiedDocument; `a3a6fbe`/`ab89a40`/`ecdce96` completed conversion, JSON export, and PDF support; `1d0b638` backend API, `c2288ba` frontend support, `c50a5e9` unit/integration tests; `0974fc3` E2E repair, `ef335cf` Office direct extraction, `b997f93`/`9f449e8` GPU memory management and documentation, `2ecd022` E2E tests completed.
-- **PDF layout restoration plan (proposed 11-20, implementation peak 11-24)**: after the `cf894b0` proposal, `0aff468` Phase 1 image/table fixes and `3fc32bc` Phase 2 style preservation; `77fe4cc`/`ad879d4`/`75c194f` and others completed alignment, lists, span-level rendering, and multi-column support; a run of `93bd9f5`~`3358d97` fixed positioning/overlap/missing images; `4325d02` cleaned up the project and archived the proposal.
-- **PP-Structure V3 tuning (11-25)**: `a659e7a` improved structure preservation for complex figures; `2312b4c` made `pp_structure` parameters adjustable from the frontend, with tests; `0999898` corrected multi-page PDF coordinates.
-- **Memory management and hybrid image extraction (11-25~11-26)**: `ba8ddf2` proposal; `1afdb82` landed hybrid image extraction plus memory management; the `b997f93` series covered GPU release and optional torch, introducing ModelManager, ServicePool, and MemoryGuard (see `openspec/changes/archive/2025-11-26-enhance-memory-management`); `a227311` archived the proposal with only 75/80 tasks complete (documentation remains); subsequent fixes (`79cffe6`~`fa9b542`) addressed PDF regressions and text rendering; `6e050eb` is the latest OCR-track table format/cropping fix.
-
-## Quality and Test Signals
-- V2 API tests passed 18/18 on 11-16 (`6bb5b76`), establishing initial confidence.
-- The dual-track rollout added unit/integration/E2E tests (`0fcb249`, `c50a5e9`, `2ecd022`), but the later PDF layout restoration relied heavily on manual verification; Phase 4 testing is still incomplete (see below).
-- Memory management changes came with 57+18+10 test files (task 8.1 done), but the documentation gap may hurt handover and tuning.
-- The dense run of PDF fixes on 11-24 shows iterative bug-fixing; regression coverage should be added (especially tables, multi-column, lists, and cross-track PDFs).
-
-## Outstanding Items and Risks
-- **Memory management documentation gap**: `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` leaves Section 8.2 unfinished (architecture notes, tuning guide, troubleshooting, monitoring, migration guide), which may hurt deployment operability.
-- **Insufficient verification of PDF layout restoration**: the same change's Phase 4 tests, performance work, docs, and multi-document validation are all unchecked; current quality depends on manual testing.
-- **Recent fixes cluster around PDF and tables** (`79cffe6`, `5c561f4`, `19bd5fd`, `fa9b542`, `6e050eb`), showing the Direct/OCR-track PDF paths remain fragile; without automated regression they are likely to regress again.
-- **Main branch state**: `main` is 1 commit ahead of `origin/main` (`6e050eb`); confirm CI/tests before pushing.
-
-## Recommended Next Actions
-1) Complete the memory management docs (architecture, tuning, troubleshooting, Prometheus monitoring guide) and add a sanity check.
-2) Build a minimal regression set for PDF layout restoration: multi-column documents, Direct/OCR-track files with charts/tables, and mixed lists/spans.
-3) Add tests around `processing_track` routing and UnifiedDocument/PDF generation edge cases (LOGO/unknown elements, cross-page tables, mixed OCR/Direct images).
-4) Run the existing unit/integration/E2E tests before pushing, and script the scenarios added in the last two weeks to reduce regression risk.
diff --git a/docs/project-risk-assessment.md b/docs/project-risk-assessment.md
deleted file mode 100644
index 62544db..0000000
--- a/docs/project-risk-assessment.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Project Risk & Issue Outlook
-
-This document collects the project's foreseeable issues, latent issues, and suggested remediation directions (ordered by risk and feasibility). Sources: `git log` (97 commits, 2025-11-12~11-26), `docs/architecture-overview.md`, `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md`, etc.
-
-## Foreseeable Issues
-- **Memory management documentation gap**: Section 8.2 of `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` is unfinished; ModelManager/ServicePool/MemoryGuard lack a tuning and incident runbook, making deployment or scale-out error-prone. Direction: complete the architecture notes, tuning guide, troubleshooting, and monitoring examples (Prometheus metrics and alert thresholds).
-- **High regression risk in PDF generation**: layout preservation and table/image rendering have been fixed repeatedly since `fa1abcd` (e.g. `d33f605`→`92e326b`, `108784a`→`3358d97`, `6e050eb`), indicating missing automated regression. Direction: build a minimal regression set (multi-column text, charts/tables, mixed lists/spans) with golden PDF/JSON comparison covering both Direct and OCR tracks.
-- **Latest OCR table-format fix has no regression coverage**: `6e050eb` fixed the OCR-track table data format and cropping without corresponding tests. Direction: add integration tests for OCR-track table parsing and PDF output, kept consistent with frontend download/display.
-- **PP-Structure parameter tuning may strain resources**: the frontend can now adjust `pp_structure_params` (`2312b4c`); without guards this may amplify GPU/memory pressure. Direction: whitelist and cap the hyperparameters on the backend and include them in MemoryGuard estimation.
-- **Chart capability toggling lacks validation**: `b048f2d` disabled it, `7e12f16` re-enabled it, with no coverage or performance data. Direction: add health checks and A/B data collection for enabling/disabling the chart model.
-
-## Latent Issues
-- **UnifiedDocument structure drift risk**: both tracks share the output, which has changed repeatedly (lists, spans, multi-column, LOGO elements) without structural validation or schema locking; this can desynchronize the frontend, exporter, and PDF generator. Direction: define a JSON Schema or pydantic validation and establish contract tests.
-- **Long-run behavior of the service pool and memory guard is unverified**: unit/integration tests exist, but there is no long soak/stress coverage (GPU memory fragmentation, model unload/reload, signal handling). Direction: add a 24h soak test with memory-trend alerts and verify SIGTERM/SIGINT cleanup.
-- **Low observability of the LibreOffice conversion chain**: Office direct extraction and PDF conversion (`ef335cf`) depend on system LibreOffice, with no failure monitoring or retry strategy. Direction: add metrics/alerts for the conversion stage plus fallback/retry.
-- **No frontend/backend API contract checks**: repeated V1→V2 migrations and new parameters (`pp_structure_params`, etc.) are covered only by E2E, with no type or contract checks. Direction: add OpenAPI contract tests or generated type validation (align a ts-sdk with the FastAPI schema).
-- **Edge cases in hybrid image extraction/saving**: the Direct/OCR hybrid extraction and the `_save_image` implementation have been fixed repeatedly and still lack defenses against None/missing file paths. Direction: strengthen assertions and fallbacks for PDF generation with missing or absent images.
-
-## Suggested Fixes and Direction
-1) **Complete the memory management docs and template config**: add a MemoryGuard/ServicePool tuning and troubleshooting guide under `docs/`, with a sample `.env` and Prometheus rules, matching the tasks 8.2 checklist.
-2) **Build a PDF/UnifiedDocument regression suite**: collect representative samples (multi-column, tables, lists, images/LOGO, OCR/Direct tracks), generate golden JSON/PDF, compare in CI, and add tests for the `6e050eb` table paths.
-3) **Add UnifiedDocument schema validation**: define a schema (pydantic/JSON Schema) and validate before export/PDF generation; generate frontend types from OpenAPI to prevent drift.
-4) **Guard PP-Structure parameters and estimate resources**: implement backend whitelists/caps and MemoryGuard estimation to avoid GPU OOM from free-form frontend tuning; add rejection/degradation feedback.
-5) **Long-run stability and conversion observability**: add a soak/stress pipeline tracking GPU/CPU/memory fragmentation; add metrics, retries, and error-classified alerts for the LibreOffice conversion stage.
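As a sketch of the schema-locking direction recommended in the risk items above, a minimal pydantic model covering the unified JSON export documented earlier in this patch; it validates only a subset of the documented fields, and pydantic v2 is assumed:

```python
import json
from pydantic import BaseModel, Field

class BBox(BaseModel):
    x0: float
    y0: float
    x1: float
    y1: float

class Element(BaseModel):
    element_id: str
    type: str                      # "text", "table", "image", ...
    bbox: BBox
    content: str | None = None
    confidence: float | None = Field(default=None, ge=0.0, le=1.0)

class Page(BaseModel):
    page_number: int
    dimensions: dict[str, float]   # {"width": ..., "height": ...}
    elements: list[Element]

class UnifiedDocumentModel(BaseModel):
    """Validates the export shape before PDF generation or download."""
    schema_version: str
    document_id: str
    pages: list[Page]

# Usage sketch:
# UnifiedDocumentModel.model_validate(json.loads(exported_bytes))
```

Note that pydantic ignores fields it does not model by default, so the export's `metadata` and `statistics` sections would pass through unvalidated here; tightening that is precisely the contract decision the recommendation calls for.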
diff --git a/frontend/src/pages/TaskDetailPage.tsx b/frontend/src/pages/TaskDetailPage.tsx
index 470dbf6..e5d3972 100644
--- a/frontend/src/pages/TaskDetailPage.tsx
+++ b/frontend/src/pages/TaskDetailPage.tsx
@@ -178,13 +178,31 @@ export default function TaskDetailPage() {
     }
   }
 
-  const handleDownloadPDF = async () => {
+  const handleDownloadLayoutPDF = async () => {
     if (!taskId) return
     try {
-      await apiClientV2.downloadPDF(taskId)
+      await apiClientV2.downloadPDF(taskId, 'layout')
       toast({
         title: t('export.exportSuccess'),
-        description: 'PDF 已下載',
+        description: '版面 PDF 已下載',
+        variant: 'success',
+      })
+    } catch (error: any) {
+      toast({
+        title: t('export.exportError'),
+        description: error.response?.data?.detail || t('errors.networkError'),
+        variant: 'destructive',
+      })
+    }
+  }
+
+  const handleDownloadReflowPDF = async () => {
+    if (!taskId) return
+    try {
+      await apiClientV2.downloadPDF(taskId, 'reflow')
+      toast({
+        title: t('export.exportSuccess'),
+        description: '流式 PDF 已下載',
         variant: 'success',
       })
     } catch (error: any) {
@@ -328,13 +346,14 @@ export default function TaskDetailPage() {
     }
   }
 
-  const handleDownloadTranslatedPdf = async (lang: string) => {
+  const handleDownloadTranslatedPdf = async (lang: string, format: 'layout' | 'reflow' = 'reflow') => {
     if (!taskId) return
     try {
-      await apiClientV2.downloadTranslatedPdf(taskId, lang)
+      await apiClientV2.downloadTranslatedPdf(taskId, lang, format)
+      const formatLabel = format === 'layout' ? '版面' : '流式'
       toast({
         title: '下載成功',
-        description: `翻譯 PDF (${lang}) 已下載`,
+        description: `翻譯 ${formatLabel} PDF (${lang}) 已下載`,
         variant: 'success',
       })
     } catch (error: any) {
@@ -513,7 +532,7 @@ export default function TaskDetailPage() {
[JSX hunk not recoverable from this copy: the extraction stripped the markup, leaving only the one-line -/+ change markers.]
@@ -542,7 +565,6 @@ export default function TaskDetailPage() {
[JSX hunk not recoverable: the 文件翻譯 (document translation) card drops its "MADLAD-400" badge; the surrounding markup was stripped in extraction.]
@@ -624,15 +646,22 @@ export default function TaskDetailPage() {
[JSX hunk not recoverable: the export area around the "JSON" button grows by several lines, presumably the layout/reflow selection UI this patch adds; the markup was stripped in extraction.]