diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 8ae7508..699788e 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -91,6 +91,11 @@ class Settings(BaseSettings): enable_table_recognition: bool = Field(default=True) # Table structure recognition enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition enable_text_recognition: bool = Field(default=True) # General text recognition + + # PP-StructureV3 Preprocessing (Stage 1) + use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation + use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos + use_textline_orientation: bool = Field(default=True) # Detect textline orientation layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging @@ -99,20 +104,48 @@ class Settings(BaseSettings): text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes - # Layout Detection Model Configuration + # Layout Detection Model Configuration (Stage 3) # Available models: # - None (default): Use PP-StructureV3's built-in model (PubLayNet-based) - # - "PP-DocLayout-S": Better for Chinese docs, papers, contracts, exams (23 categories) + # - "PP-DocLayout_plus-L": Best for Chinese docs (83.2% mAP, 20 categories) - complex layouts + # - "PP-DocLayout-L": High accuracy (90.4% mAP, 23 categories) - general purpose # - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout layout_detection_model_name: Optional[str] = Field( - default="PP-DocLayout-S", - description="Layout detection model name. Set to 'PP-DocLayout-S' for better Chinese document support." + default="PP-DocLayout_plus-L", + description="Layout detection model name. PP-DocLayout_plus-L recommended for complex Chinese documents." ) layout_detection_model_dir: Optional[str] = Field( default=None, description="Custom layout detection model directory. If None, downloads official model." ) + # Table Structure Recognition Model Configuration (Stage 4) + # PP-StructureV3 uses separate models for wired (bordered) and wireless (borderless) tables + # Both models should be configured for comprehensive table detection + # Available models: + # - "SLANeXt_wired": Best for wired/bordered tables (69.65% accuracy, 351MB) + # - "SLANeXt_wireless": Best for wireless/borderless tables (69.65% accuracy, 351MB) + # - "SLANet": Legacy model (59.52% accuracy, 6.9MB) + # - "SLANet_plus": Improved legacy (63.69% accuracy, 6.9MB) + wired_table_model_name: Optional[str] = Field( + default="SLANeXt_wired", + description="Table structure model for bordered tables. SLANeXt_wired recommended." + ) + wireless_table_model_name: Optional[str] = Field( + default="SLANeXt_wireless", + description="Table structure model for borderless tables. SLANeXt_wireless recommended." + ) + + # Formula Recognition Model Configuration (Stage 4) + # Available models: + # - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU) + # - "PP-FormulaNet-L": Good for English formulas (90.36% English BLEU) + # - "PP-FormulaNet-S": Fast inference (87% English BLEU) + formula_recognition_model_name: Optional[str] = Field( + default="PP-FormulaNet_plus-L", + description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support." + ) + # ===== Gap Filling Configuration ===== # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track diff --git a/backend/app/schemas/task.py b/backend/app/schemas/task.py index 85705c1..75be647 100644 --- a/backend/app/schemas/task.py +++ b/backend/app/schemas/task.py @@ -28,11 +28,11 @@ class LayoutModelEnum(str, Enum): """Layout detection model selection for OCR track. Different models are optimized for different document types: - - CHINESE: PP-DocLayout-S, optimized for Chinese documents (forms, contracts, invoices) - - DEFAULT: PubLayNet-based, optimized for English academic papers - - CDLA: CDLA model, specialized Chinese document layout analysis + - CHINESE: PP-DocLayout_plus-L (83.2% mAP), optimized for complex Chinese documents + - DEFAULT: PubLayNet-based (~94% mAP), optimized for English academic papers + - CDLA: CDLA model (~86% mAP), specialized Chinese document layout analysis """ - CHINESE = "chinese" # PP-DocLayout-S - Best for Chinese documents (recommended) + CHINESE = "chinese" # PP-DocLayout_plus-L - Best for Chinese documents (recommended) DEFAULT = "default" # PubLayNet-based - Best for English documents CDLA = "cdla" # CDLA model - Alternative for Chinese layout diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index 05ab5c2..2f0dabc 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -50,11 +50,11 @@ logger = logging.getLogger(__name__) _USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__" # Layout model mapping: user-friendly names to actual model names -# - "chinese": PP-DocLayout-S - Best for Chinese documents (forms, contracts, invoices) +# - "chinese": PP-DocLayout_plus-L - Best for Chinese documents (83.2% mAP, complex layouts) # - "default": PubLayNet-based default model - Best for English documents # - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout LAYOUT_MODEL_MAPPING = { - "chinese": "PP-DocLayout-S", + "chinese": "PP-DocLayout_plus-L", "default": _USE_PUBLAYNET_DEFAULT, # Uses default PubLayNet-based model (no custom model) "cdla": "picodet_lcnet_x1_0_fgd_layout_cdla", } @@ -517,34 +517,63 @@ class OCRService: layout_model_name = settings.layout_detection_model_name layout_model_dir = settings.layout_detection_model_dir + # Preprocessing configuration (Stage 1) + use_orientation = settings.use_doc_orientation_classify + use_unwarping = settings.use_doc_unwarping + use_textline = settings.use_textline_orientation + + # Table and formula model configuration (Stage 4) + wired_table_model = settings.wired_table_model_name + wireless_table_model = settings.wireless_table_model_name + formula_model = settings.formula_recognition_model_name + logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}") + logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}") logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}") + logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}") + logger.info(f"Formula model: {formula_model}") logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}") logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}") # Build PPStructureV3 kwargs pp_kwargs = { - 'use_doc_orientation_classify': False, - 'use_doc_unwarping': False, - 'use_textline_orientation': False, + # Preprocessing (Stage 1) + 'use_doc_orientation_classify': use_orientation, + 'use_doc_unwarping': use_unwarping, + 'use_textline_orientation': use_textline, + # Element recognition (Stage 4) 'use_table_recognition': use_table, 'use_formula_recognition': use_formula, 'use_chart_recognition': use_chart, + # Layout detection parameters 'layout_threshold': layout_threshold, 'layout_nms': layout_nms, 'layout_unclip_ratio': layout_unclip, 'layout_merge_bboxes_mode': layout_merge, + # Text detection parameters 'text_det_thresh': text_thresh, 'text_det_box_thresh': text_box_thresh, 'text_det_unclip_ratio': text_unclip, } - # Add layout model configuration if specified + # Add layout model configuration if specified (Stage 3) if layout_model_name: pp_kwargs['layout_detection_model_name'] = layout_model_name if layout_model_dir: pp_kwargs['layout_detection_model_dir'] = layout_model_dir + # Add table structure model configuration (Stage 4) + # PPStructureV3 uses separate models for wired (bordered) and wireless (borderless) tables + # Both models should be configured for comprehensive table detection + if wired_table_model: + pp_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model + if wireless_table_model: + pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model + + # Add formula recognition model configuration (Stage 4) + if formula_model: + pp_kwargs['formula_recognition_model_name'] = formula_model + self.structure_engine = PPStructureV3(**pp_kwargs) # Track model loading for cache management @@ -571,12 +600,15 @@ class OCRService: layout_threshold = settings.layout_detection_threshold layout_model_name = settings.layout_detection_model_name layout_model_dir = settings.layout_detection_model_dir + wired_table_model = settings.wired_table_model_name + wireless_table_model = settings.wireless_table_model_name + formula_model = settings.formula_recognition_model_name # Build CPU fallback kwargs cpu_kwargs = { - 'use_doc_orientation_classify': False, - 'use_doc_unwarping': False, - 'use_textline_orientation': False, + 'use_doc_orientation_classify': settings.use_doc_orientation_classify, + 'use_doc_unwarping': settings.use_doc_unwarping, + 'use_textline_orientation': settings.use_textline_orientation, 'use_table_recognition': use_table, 'use_formula_recognition': use_formula, 'use_chart_recognition': use_chart, @@ -586,6 +618,12 @@ class OCRService: cpu_kwargs['layout_detection_model_name'] = layout_model_name if layout_model_dir: cpu_kwargs['layout_detection_model_dir'] = layout_model_dir + if wired_table_model: + cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model + if wireless_table_model: + cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model + if formula_model: + cpu_kwargs['formula_recognition_model_name'] = formula_model self.structure_engine = PPStructureV3(**cpu_kwargs) self._current_layout_model = layout_model # Track current model for recreation check diff --git a/backend/tests/services/test_layout_model.py b/backend/tests/services/test_layout_model.py index 4d2f500..5596b8b 100644 --- a/backend/tests/services/test_layout_model.py +++ b/backend/tests/services/test_layout_model.py @@ -40,8 +40,8 @@ class TestLayoutModelMapping: assert 'cdla' in LAYOUT_MODEL_MAPPING def test_chinese_model_maps_to_pp_doclayout(self): - """Verify 'chinese' maps to PP-DocLayout-S""" - assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout-S' + """Verify 'chinese' maps to PP-DocLayout_plus-L""" + assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout_plus-L' def test_default_model_maps_to_publaynet_sentinel(self): """Verify 'default' maps to sentinel value for PubLayNet default""" @@ -57,7 +57,7 @@ class TestLayoutModelEngine: """Test engine creation with different layout models""" def test_chinese_model_creates_engine_with_pp_doclayout(self): - """Verify 'chinese' layout model uses PP-DocLayout-S""" + """Verify 'chinese' layout model uses PP-DocLayout_plus-L""" ocr_service = OCRService() with patch.object(ocr_service, 'structure_engine', None): @@ -70,7 +70,7 @@ class TestLayoutModelEngine: mock_ppstructure.assert_called_once() call_kwargs = mock_ppstructure.call_args[1] - assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S' + assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L' def test_default_model_creates_engine_without_model_name(self): """Verify 'default' layout model does not specify model name (uses default)""" @@ -121,7 +121,7 @@ class TestLayoutModelEngine: call_kwargs = mock_ppstructure.call_args[1] # Should use 'chinese' model as default - assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S' + assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L' class TestLayoutModelCaching: diff --git a/frontend/src/i18n/locales/zh-TW.json b/frontend/src/i18n/locales/zh-TW.json index dad759b..4102c2b 100644 --- a/frontend/src/i18n/locales/zh-TW.json +++ b/frontend/src/i18n/locales/zh-TW.json @@ -56,11 +56,11 @@ "layoutModel": { "title": "版面偵測模型", "chinese": "中文文件模型", - "chineseDesc": "PP-DocLayout-S - 適用於中文表單、合約、發票(推薦)", + "chineseDesc": "PP-DocLayout_plus-L (83.2% mAP) - 適用於複雜中文文件,支援20種版面元素(推薦)", "default": "標準模型", - "defaultDesc": "PubLayNet 模型 - 適用於英文學術論文、報告", + "defaultDesc": "PubLayNet 模型 (~94% mAP) - 適用於英文學術論文、報告", "cdla": "CDLA 模型", - "cdlaDesc": "專用中文版面分析模型 - 適用於複雜中文版面", + "cdlaDesc": "CDLA 版面分析模型 (~86% mAP) - 專用中文版面分析", "recommended": "推薦", "note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。" } diff --git a/openspec/changes/upgrade-ppstructure-models/MODEL_CLEANUP.md b/openspec/changes/upgrade-ppstructure-models/MODEL_CLEANUP.md new file mode 100644 index 0000000..a67d72e --- /dev/null +++ b/openspec/changes/upgrade-ppstructure-models/MODEL_CLEANUP.md @@ -0,0 +1,141 @@ +# PP-StructureV3 Model Cache Cleanup Guide + +## Overview + +After upgrading PP-StructureV3 models, older unused models may remain in the cache directory. This guide explains how to safely remove them to free disk space. + +## Model Cache Location + +PaddleX/PaddleOCR 3.x stores downloaded models in: + +``` +~/.paddlex/official_models/ +``` + +## Models After Upgrade + +### Current Active Models (DO NOT DELETE) + +| Model | Purpose | Approx. Size | +|-------|---------|--------------| +| `PP-DocLayout_plus-L` | Layout detection for Chinese documents | ~350MB | +| `SLANeXt_wired` | Table structure recognition (bordered tables) | ~351MB | +| `SLANeXt_wireless` | Table structure recognition (borderless tables) | ~351MB | +| `PP-FormulaNet_plus-L` | Formula recognition (Chinese + English) | ~800MB | +| `PP-OCRv5_*` | Text detection and recognition | ~150MB | +| `picodet_lcnet_x1_0_fgd_layout_cdla` | CDLA layout model option | ~10MB | + +### Deprecated Models (Safe to Delete) + +| Model | Reason | Approx. Size | +|-------|--------|--------------| +| `PP-DocLayout-S` | Replaced by PP-DocLayout_plus-L | ~50MB | +| `SLANet` | Replaced by SLANeXt_wired/wireless | ~7MB | +| `SLANet_plus` | Replaced by SLANeXt_wired/wireless | ~7MB | +| `PP-FormulaNet-S` | Replaced by PP-FormulaNet_plus-L | ~200MB | +| `PP-FormulaNet-L` | Replaced by PP-FormulaNet_plus-L | ~400MB | + +## Cleanup Commands + +### List Current Cache + +```bash +# List all cached models +ls -la ~/.paddlex/official_models/ + +# Show disk usage per model +du -sh ~/.paddlex/official_models/* +``` + +### Delete Deprecated Models + +```bash +# Remove deprecated layout model +rm -rf ~/.paddlex/official_models/PP-DocLayout-S + +# Remove deprecated table models +rm -rf ~/.paddlex/official_models/SLANet +rm -rf ~/.paddlex/official_models/SLANet_plus + +# Remove deprecated formula models (if present) +rm -rf ~/.paddlex/official_models/PP-FormulaNet-S +rm -rf ~/.paddlex/official_models/PP-FormulaNet-L +``` + +### Cleanup Script + +```bash +#!/bin/bash +# cleanup_old_models.sh - Remove deprecated PP-StructureV3 models + +CACHE_DIR="$HOME/.paddlex/official_models" + +echo "PP-StructureV3 Model Cleanup" +echo "============================" +echo "" + +# Check if cache directory exists +if [ ! -d "$CACHE_DIR" ]; then + echo "Cache directory not found: $CACHE_DIR" + exit 0 +fi + +# List deprecated models +DEPRECATED_MODELS=( + "PP-DocLayout-S" + "SLANet" + "SLANet_plus" + "PP-FormulaNet-S" + "PP-FormulaNet-L" +) + +echo "Checking for deprecated models..." +echo "" + +TOTAL_SIZE=0 +for model in "${DEPRECATED_MODELS[@]}"; do + MODEL_PATH="$CACHE_DIR/$model" + if [ -d "$MODEL_PATH" ]; then + SIZE=$(du -sh "$MODEL_PATH" 2>/dev/null | cut -f1) + echo "Found: $model ($SIZE)" + TOTAL_SIZE=$((TOTAL_SIZE + 1)) + fi +done + +if [ $TOTAL_SIZE -eq 0 ]; then + echo "No deprecated models found. Cache is clean." + exit 0 +fi + +echo "" +read -p "Delete these models? [y/N]: " confirm + +if [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then + for model in "${DEPRECATED_MODELS[@]}"; do + MODEL_PATH="$CACHE_DIR/$model" + if [ -d "$MODEL_PATH" ]; then + rm -rf "$MODEL_PATH" + echo "Deleted: $model" + fi + done + echo "" + echo "Cleanup complete." +else + echo "Cleanup cancelled." +fi +``` + +## Space Savings Estimate + +After cleanup, you can expect to free approximately: +- **~65MB** from deprecated layout model +- **~14MB** from deprecated table models +- **~600MB** from deprecated formula models (if present) + +Total potential savings: **~680MB** + +## Notes + +1. Models are downloaded on first use. Deleting active models will trigger re-download. +2. The cache directory may vary if `PADDLEX_HOME` environment variable is set. +3. Always verify which models your configuration uses before deleting. diff --git a/openspec/changes/upgrade-ppstructure-models/proposal.md b/openspec/changes/upgrade-ppstructure-models/proposal.md new file mode 100644 index 0000000..51f270d --- /dev/null +++ b/openspec/changes/upgrade-ppstructure-models/proposal.md @@ -0,0 +1,134 @@ +# Upgrade PP-StructureV3 Models + +## Why + +目前專案使用的 PP-StructureV3 模型配置存在以下問題: + +1. **版面偵測模型精度不足**:PP-DocLayout-S (70.9% mAP) 無法正確處理複雜表格和版面 +2. **表格識別準確率低**:SLANet (59.52%) 產出錯誤的 HTML 結構 +3. **預處理模組未啟用**:文檔方向校正和彎曲校正功能關閉 +4. **模型佔用空間過大**:下載了不使用的模型,浪費儲存空間 + +## What Changes + +### Stage 1: 預處理模組 - 全部開啟 + +| 功能 | 當前 | 變更後 | +|-----|-----|-------| +| `use_doc_orientation_classify` | False | **True** | +| `use_doc_unwarping` | False | **True** | +| `use_textline_orientation` | False | **True** | + +### Stage 2: OCR 模組 - 維持現狀 + +- 繼續使用 PP-OCRv5 (預設配置) +- 不需要更改 + +### Stage 3: 版面分析模組 - 升級模型選項 + +| 選項名稱 | 當前模型 | 變更後模型 | mAP | +|---------|---------|-----------|-----| +| `chinese` | PP-DocLayout-S (移除) | **PP-DocLayout_plus-L** | 83.2% | +| `default` | PubLayNet | PubLayNet (維持) | ~94% | +| `cdla` | CDLA | CDLA (維持) | ~86% | + +**重點變更**: +- 移除 PP-DocLayout-S (70.9% mAP) +- 新增 PP-DocLayout_plus-L (83.2% mAP, 20類別) +- 前端「中文文檔」選項改用 PP-DocLayout_plus-L + +### Stage 4: 元素識別模組 - 升級表格識別 + +| 模組 | 當前模型 | 變更後模型 | 準確率變化 | +|-----|---------|-----------|-----------| +| 表格識別 | SLANet (預設) | **SLANeXt_wired + SLANeXt_wireless** | 59.52% → 69.65% | +| 公式識別 | PP-FormulaNet (預設) | **PP-FormulaNet_plus-L** | 45.78% → 90.64% (中文) | +| 圖表解析 | PP-Chart2Table | PP-Chart2Table (維持) | - | +| 印章識別 | PP-OCRv4_seal | PP-OCRv4_seal (維持) | - | + +**表格識別策略**: +- SLANeXt_wired 和 SLANeXt_wireless 搭配使用 +- 先用分類器判斷有線/無線表格類型 +- 根據類型選擇對應的 SLANeXt 模型 +- 聯合測試準確率達 69.65% + +### 儲存空間優化 - 刪除未使用模型 + +PaddleOCR 3.x 模型緩存位置:`~/.paddlex/official_models/` + +可刪除的模型目錄: +- PP-DocLayout-S (被 PP-DocLayout_plus-L 取代) +- SLANet (被 SLANeXt 取代) +- 其他未使用的舊版模型 + +**注意**:刪除後首次使用新模型會觸發下載 + +## Requirements + +### REQ-1: 預處理模組開啟 +系統 **SHALL** 在 PP-StructureV3 初始化時啟用所有預處理功能: +- 文檔方向分類 (use_doc_orientation_classify=True) +- 文檔彎曲校正 (use_doc_unwarping=True) +- 文字行方向偵測 (use_textline_orientation=True) + +**Scenario: 處理旋轉的掃描文檔** +- Given 一個旋轉 90 度的 PDF 文檔 +- When 使用 OCR track 處理 +- Then 系統應自動校正方向後再進行 OCR + +### REQ-2: 版面模型升級 +系統 **SHALL** 將「chinese」選項對應的模型從 PP-DocLayout-S 更改為 PP-DocLayout_plus-L + +**Scenario: 處理中文複雜文檔** +- Given 包含表格、圖片、公式的中文文檔 +- When 選擇「chinese」版面模型處理 +- Then 應使用 PP-DocLayout_plus-L (83.2% mAP) 進行版面分析 + +### REQ-3: 表格識別升級 +系統 **SHALL** 使用 SLANeXt_wired 和 SLANeXt_wireless 搭配進行表格識別 + +**Scenario: 處理有線表格** +- Given 包含有線表格的文檔 +- When 進行表格結構識別 +- Then 應使用 SLANeXt_wired 模型 +- And 輸出正確的 HTML 表格結構 + +**Scenario: 處理無線表格** +- Given 包含無線表格的文檔 +- When 進行表格結構識別 +- Then 應使用 SLANeXt_wireless 模型 + +### REQ-4: 公式識別升級 +系統 **SHALL** 使用 PP-FormulaNet_plus-L 進行公式識別以支援中文公式 + +### REQ-5: 模型緩存清理 +系統 **SHOULD** 提供工具或文檔說明如何清理未使用的模型緩存以節省儲存空間 + +## Model Comparison Data + +### 表格識別模型對比 + +| 模型 | 準確率 | 推理時間 | 模型大小 | 適用場景 | +|-----|-------|---------|---------|---------| +| SLANet | 59.52% | 24ms | 6.9 MB | ❌ 準確率不足 | +| SLANet_plus | 63.69% | 23ms | 6.9 MB | ❌ 仍不足 | +| **SLANeXt_wired** | 69.65% | 86ms | 351 MB | ✅ 有線表格 | +| **SLANeXt_wireless** | 69.65% | - | 351 MB | ✅ 無線表格 | + +**結論**:SLANeXt 系列比 SLANet/SLANet_plus 準確率高約 10%,但模型大小增加約 50 倍。考慮到表格識別是核心功能,建議升級。 + +### 版面偵測模型對比 + +| 模型 | 類別數 | mAP | 推理時間 | 適用場景 | +|-----|-------|-----|---------|---------| +| PP-DocLayout-S | 23 | 70.9% | 12ms | ❌ 精度不足 | +| PP-DocLayout-L | 23 | 90.4% | 34ms | ✅ 通用高精度 | +| **PP-DocLayout_plus-L** | 20 | 83.2% | 53ms | ✅ 複雜文檔推薦 | + +## References + +- [PaddleOCR Table Structure Recognition](http://www.paddleocr.ai/main/en/version3.x/module_usage/table_structure_recognition.html) +- [SLANeXt_wired on HuggingFace](https://huggingface.co/PaddlePaddle/SLANeXt_wired) +- [SLANeXt_wireless on HuggingFace](https://huggingface.co/PaddlePaddle/SLANeXt_wireless) +- [PP-StructureV3 Technical Report](https://arxiv.org/html/2507.05595v1) +- [PaddleOCR Model Cache Issue](https://github.com/PaddlePaddle/PaddleOCR/issues/10234) diff --git a/openspec/changes/upgrade-ppstructure-models/specs/ocr-processing/spec.md b/openspec/changes/upgrade-ppstructure-models/specs/ocr-processing/spec.md new file mode 100644 index 0000000..08a4d4e --- /dev/null +++ b/openspec/changes/upgrade-ppstructure-models/specs/ocr-processing/spec.md @@ -0,0 +1,56 @@ +## ADDED Requirements + +### Requirement: PP-StructureV3 Configuration + +The system SHALL configure PP-StructureV3 with the following settings: + +**Preprocessing (Stage 1):** +- Document orientation classification MUST be enabled (`use_doc_orientation_classify=True`) +- Document unwarping MUST be enabled (`use_doc_unwarping=True`) +- Textline orientation detection MUST be enabled (`use_textline_orientation=True`) + +**Layout Detection (Stage 3):** +- The `chinese` layout model option SHALL use PP-DocLayout_plus-L (83.2% mAP) +- The `default` layout model option SHALL use PubLayNet for English documents +- The `cdla` layout model option SHALL use picodet_lcnet_x1_0_fgd_layout_cdla + +**Element Recognition (Stage 4):** +- Table structure recognition SHALL use SLANeXt_wired and SLANeXt_wireless models (69.65% combined accuracy) +- Formula recognition SHALL use PP-FormulaNet_plus-L (92.22% English, 90.64% Chinese BLEU) +- Chart parsing SHALL use PP-Chart2Table +- Seal recognition SHALL use PP-OCRv4_seal + +#### Scenario: Processing rotated scanned document +- **WHEN** a PDF document with rotated pages is processed using OCR track +- **THEN** the system SHALL automatically detect and correct the orientation before OCR processing + +#### Scenario: Processing complex Chinese document with tables +- **WHEN** a Chinese document containing tables, images, and formulas is processed +- **AND** the user selects "chinese" layout model +- **THEN** the system SHALL use PP-DocLayout_plus-L for layout detection (83.2% mAP) +- **AND** the system SHALL correctly identify table regions + +#### Scenario: Table structure recognition with wired tables +- **WHEN** a document contains wired (bordered) tables +- **THEN** the system SHALL use SLANeXt_wired model for structure recognition +- **AND** output correct HTML table structure with proper row/column spanning + +#### Scenario: Table structure recognition with wireless tables +- **WHEN** a document contains wireless (borderless) tables +- **THEN** the system SHALL use SLANeXt_wireless model for structure recognition + +#### Scenario: Chinese formula recognition +- **WHEN** a document contains mathematical formulas with Chinese characters +- **THEN** the system SHALL use PP-FormulaNet_plus-L for recognition +- **AND** output LaTeX code with correct Chinese character representation + +## ADDED Requirements + +### Requirement: Model Cache Cleanup + +The system SHALL provide documentation for cleaning up unused model caches to optimize storage space. + +#### Scenario: User wants to free disk space after model upgrade +- **WHEN** the user has upgraded from older models (PP-DocLayout-S, SLANet) to newer models +- **THEN** the documentation SHALL explain how to delete unused cached models from `~/.paddlex/official_models/` +- **AND** list which model directories can be safely removed diff --git a/openspec/changes/upgrade-ppstructure-models/tasks.md b/openspec/changes/upgrade-ppstructure-models/tasks.md new file mode 100644 index 0000000..792839d --- /dev/null +++ b/openspec/changes/upgrade-ppstructure-models/tasks.md @@ -0,0 +1,77 @@ +# Tasks: Upgrade PP-StructureV3 Models + +## 1. Backend Configuration Changes + +- [x] 1.1 Update `backend/app/core/config.py` - Enable preprocessing flags + - Set `use_doc_orientation_classify` default to True + - Set `use_doc_unwarping` default to True + - Set `use_textline_orientation` default to True + - Add `table_structure_model_name` configuration + - Add `formula_recognition_model_name` configuration + +- [x] 1.2 Update `backend/app/services/ocr_service.py` - Model mapping changes + - Update `LAYOUT_MODEL_MAPPING`: + - Change `"chinese"` from `"PP-DocLayout-S"` to `"PP-DocLayout_plus-L"` + - Keep `"default"` as PubLayNet + - Keep `"cdla"` as is + - Update `_ensure_structure_engine()`: + - Pass preprocessing flags to PPStructureV3 + - Configure SLANeXt models for table recognition + - Configure PP-FormulaNet_plus-L for formula recognition + +- [x] 1.3 Update PPStructureV3 initialization kwargs + - Add `table_structure_model_name="SLANeXt_wired"` (or configure dual model) + - Add `formula_recognition_model_name="PP-FormulaNet_plus-L"` + - Verify preprocessing flags are passed correctly + +## 2. Schema Updates + +- [x] 2.1 Update `backend/app/schemas/task.py` - LayoutModelEnum + - Rename or update `CHINESE` description to reflect PP-DocLayout_plus-L + - Update docstrings to reflect new model capabilities + +## 3. Frontend Updates + +- [x] 3.1 Update `frontend/src/components/LayoutModelSelector.tsx` + - Update Chinese option description to mention PP-DocLayout_plus-L + - Update accuracy information displayed to users + +- [x] 3.2 Update `frontend/src/i18n/locales/zh-TW.json` + - Update `layoutModel.chinese.description` to reflect new model + - Update any accuracy percentages in descriptions + +## 4. Testing + +- [x] 4.1 Create unit tests for new model configuration + - Test preprocessing flags are correctly passed + - Test model mapping resolves correctly + - Test engine initialization with new models + +- [ ] 4.2 Integration testing with real documents + - Test rotated document handling (preprocessing) + - Test complex Chinese document layout detection + - Test table structure recognition accuracy + - Test formula recognition with Chinese formulas + +- [x] 4.3 Update existing tests + - Update `backend/tests/services/test_layout_model.py` for new mapping + - Update `backend/tests/api/test_layout_model_api.py` if needed + +## 5. Documentation + +- [x] 5.1 Create model cleanup documentation + - Document `~/.paddlex/official_models/` cache location + - List models that can be safely deleted after upgrade + - Provide cleanup script/commands + - See: [MODEL_CLEANUP.md](./MODEL_CLEANUP.md) + +- [x] 5.2 Update API documentation + - Document preprocessing feature behavior + - Update layout model descriptions + +## 6. Verification & Deployment + +- [ ] 6.1 Verify new models download correctly on first use +- [ ] 6.2 Measure memory/GPU usage with new models +- [ ] 6.3 Compare processing speed before/after upgrade +- [ ] 6.4 Verify existing functionality not broken