feat: upgrade PP-StructureV3 models to latest versions
- Layout: PP-DocLayout-S → PP-DocLayout_plus-L (83.2% mAP) - Table: Single model → Dual SLANeXt (wired/wireless) - Formula: PP-FormulaNet_plus-L for enhanced recognition - Add preprocessing flags support (orientation, unwarping) - Update frontend i18n descriptions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -91,6 +91,11 @@ class Settings(BaseSettings):
|
|||||||
enable_table_recognition: bool = Field(default=True) # Table structure recognition
|
enable_table_recognition: bool = Field(default=True) # Table structure recognition
|
||||||
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
|
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
|
||||||
enable_text_recognition: bool = Field(default=True) # General text recognition
|
enable_text_recognition: bool = Field(default=True) # General text recognition
|
||||||
|
|
||||||
|
# PP-StructureV3 Preprocessing (Stage 1)
|
||||||
|
use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation
|
||||||
|
use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos
|
||||||
|
use_textline_orientation: bool = Field(default=True) # Detect textline orientation
|
||||||
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
|
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
|
||||||
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
|
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
|
||||||
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
|
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
|
||||||
@@ -99,20 +104,48 @@ class Settings(BaseSettings):
|
|||||||
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
||||||
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
||||||
|
|
||||||
# Layout Detection Model Configuration
|
# Layout Detection Model Configuration (Stage 3)
|
||||||
# Available models:
|
# Available models:
|
||||||
# - None (default): Use PP-StructureV3's built-in model (PubLayNet-based)
|
# - None (default): Use PP-StructureV3's built-in model (PubLayNet-based)
|
||||||
# - "PP-DocLayout-S": Better for Chinese docs, papers, contracts, exams (23 categories)
|
# - "PP-DocLayout_plus-L": Best for Chinese docs (83.2% mAP, 20 categories) - complex layouts
|
||||||
|
# - "PP-DocLayout-L": High accuracy (90.4% mAP, 23 categories) - general purpose
|
||||||
# - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
|
# - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
|
||||||
layout_detection_model_name: Optional[str] = Field(
|
layout_detection_model_name: Optional[str] = Field(
|
||||||
default="PP-DocLayout-S",
|
default="PP-DocLayout_plus-L",
|
||||||
description="Layout detection model name. Set to 'PP-DocLayout-S' for better Chinese document support."
|
description="Layout detection model name. PP-DocLayout_plus-L recommended for complex Chinese documents."
|
||||||
)
|
)
|
||||||
layout_detection_model_dir: Optional[str] = Field(
|
layout_detection_model_dir: Optional[str] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="Custom layout detection model directory. If None, downloads official model."
|
description="Custom layout detection model directory. If None, downloads official model."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Table Structure Recognition Model Configuration (Stage 4)
|
||||||
|
# PP-StructureV3 uses separate models for wired (bordered) and wireless (borderless) tables
|
||||||
|
# Both models should be configured for comprehensive table detection
|
||||||
|
# Available models:
|
||||||
|
# - "SLANeXt_wired": Best for wired/bordered tables (69.65% accuracy, 351MB)
|
||||||
|
# - "SLANeXt_wireless": Best for wireless/borderless tables (69.65% accuracy, 351MB)
|
||||||
|
# - "SLANet": Legacy model (59.52% accuracy, 6.9MB)
|
||||||
|
# - "SLANet_plus": Improved legacy (63.69% accuracy, 6.9MB)
|
||||||
|
wired_table_model_name: Optional[str] = Field(
|
||||||
|
default="SLANeXt_wired",
|
||||||
|
description="Table structure model for bordered tables. SLANeXt_wired recommended."
|
||||||
|
)
|
||||||
|
wireless_table_model_name: Optional[str] = Field(
|
||||||
|
default="SLANeXt_wireless",
|
||||||
|
description="Table structure model for borderless tables. SLANeXt_wireless recommended."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Formula Recognition Model Configuration (Stage 4)
|
||||||
|
# Available models:
|
||||||
|
# - "PP-FormulaNet_plus-L": Best for Chinese formulas (90.64% Chinese, 92.22% English BLEU)
|
||||||
|
# - "PP-FormulaNet-L": Good for English formulas (90.36% English BLEU)
|
||||||
|
# - "PP-FormulaNet-S": Fast inference (87% English BLEU)
|
||||||
|
formula_recognition_model_name: Optional[str] = Field(
|
||||||
|
default="PP-FormulaNet_plus-L",
|
||||||
|
description="Formula recognition model. PP-FormulaNet_plus-L recommended for Chinese formula support."
|
||||||
|
)
|
||||||
|
|
||||||
# ===== Gap Filling Configuration =====
|
# ===== Gap Filling Configuration =====
|
||||||
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
|
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
|
||||||
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
|
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
|
||||||
|
|||||||
@@ -28,11 +28,11 @@ class LayoutModelEnum(str, Enum):
|
|||||||
"""Layout detection model selection for OCR track.
|
"""Layout detection model selection for OCR track.
|
||||||
|
|
||||||
Different models are optimized for different document types:
|
Different models are optimized for different document types:
|
||||||
- CHINESE: PP-DocLayout-S, optimized for Chinese documents (forms, contracts, invoices)
|
- CHINESE: PP-DocLayout_plus-L (83.2% mAP), optimized for complex Chinese documents
|
||||||
- DEFAULT: PubLayNet-based, optimized for English academic papers
|
- DEFAULT: PubLayNet-based (~94% mAP), optimized for English academic papers
|
||||||
- CDLA: CDLA model, specialized Chinese document layout analysis
|
- CDLA: CDLA model (~86% mAP), specialized Chinese document layout analysis
|
||||||
"""
|
"""
|
||||||
CHINESE = "chinese" # PP-DocLayout-S - Best for Chinese documents (recommended)
|
CHINESE = "chinese" # PP-DocLayout_plus-L - Best for Chinese documents (recommended)
|
||||||
DEFAULT = "default" # PubLayNet-based - Best for English documents
|
DEFAULT = "default" # PubLayNet-based - Best for English documents
|
||||||
CDLA = "cdla" # CDLA model - Alternative for Chinese layout
|
CDLA = "cdla" # CDLA model - Alternative for Chinese layout
|
||||||
|
|
||||||
|
|||||||
@@ -50,11 +50,11 @@ logger = logging.getLogger(__name__)
|
|||||||
_USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"
|
_USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"
|
||||||
|
|
||||||
# Layout model mapping: user-friendly names to actual model names
|
# Layout model mapping: user-friendly names to actual model names
|
||||||
# - "chinese": PP-DocLayout-S - Best for Chinese documents (forms, contracts, invoices)
|
# - "chinese": PP-DocLayout_plus-L - Best for Chinese documents (83.2% mAP, complex layouts)
|
||||||
# - "default": PubLayNet-based default model - Best for English documents
|
# - "default": PubLayNet-based default model - Best for English documents
|
||||||
# - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout
|
# - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout
|
||||||
LAYOUT_MODEL_MAPPING = {
|
LAYOUT_MODEL_MAPPING = {
|
||||||
"chinese": "PP-DocLayout-S",
|
"chinese": "PP-DocLayout_plus-L",
|
||||||
"default": _USE_PUBLAYNET_DEFAULT, # Uses default PubLayNet-based model (no custom model)
|
"default": _USE_PUBLAYNET_DEFAULT, # Uses default PubLayNet-based model (no custom model)
|
||||||
"cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
|
"cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
|
||||||
}
|
}
|
||||||
@@ -517,34 +517,63 @@ class OCRService:
|
|||||||
layout_model_name = settings.layout_detection_model_name
|
layout_model_name = settings.layout_detection_model_name
|
||||||
layout_model_dir = settings.layout_detection_model_dir
|
layout_model_dir = settings.layout_detection_model_dir
|
||||||
|
|
||||||
|
# Preprocessing configuration (Stage 1)
|
||||||
|
use_orientation = settings.use_doc_orientation_classify
|
||||||
|
use_unwarping = settings.use_doc_unwarping
|
||||||
|
use_textline = settings.use_textline_orientation
|
||||||
|
|
||||||
|
# Table and formula model configuration (Stage 4)
|
||||||
|
wired_table_model = settings.wired_table_model_name
|
||||||
|
wireless_table_model = settings.wireless_table_model_name
|
||||||
|
formula_model = settings.formula_recognition_model_name
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
||||||
|
logger.info(f"Preprocessing: orientation={use_orientation}, unwarping={use_unwarping}, textline={use_textline}")
|
||||||
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
|
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
|
||||||
|
logger.info(f"Table models: wired={wired_table_model}, wireless={wireless_table_model}")
|
||||||
|
logger.info(f"Formula model: {formula_model}")
|
||||||
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
||||||
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
||||||
|
|
||||||
# Build PPStructureV3 kwargs
|
# Build PPStructureV3 kwargs
|
||||||
pp_kwargs = {
|
pp_kwargs = {
|
||||||
'use_doc_orientation_classify': False,
|
# Preprocessing (Stage 1)
|
||||||
'use_doc_unwarping': False,
|
'use_doc_orientation_classify': use_orientation,
|
||||||
'use_textline_orientation': False,
|
'use_doc_unwarping': use_unwarping,
|
||||||
|
'use_textline_orientation': use_textline,
|
||||||
|
# Element recognition (Stage 4)
|
||||||
'use_table_recognition': use_table,
|
'use_table_recognition': use_table,
|
||||||
'use_formula_recognition': use_formula,
|
'use_formula_recognition': use_formula,
|
||||||
'use_chart_recognition': use_chart,
|
'use_chart_recognition': use_chart,
|
||||||
|
# Layout detection parameters
|
||||||
'layout_threshold': layout_threshold,
|
'layout_threshold': layout_threshold,
|
||||||
'layout_nms': layout_nms,
|
'layout_nms': layout_nms,
|
||||||
'layout_unclip_ratio': layout_unclip,
|
'layout_unclip_ratio': layout_unclip,
|
||||||
'layout_merge_bboxes_mode': layout_merge,
|
'layout_merge_bboxes_mode': layout_merge,
|
||||||
|
# Text detection parameters
|
||||||
'text_det_thresh': text_thresh,
|
'text_det_thresh': text_thresh,
|
||||||
'text_det_box_thresh': text_box_thresh,
|
'text_det_box_thresh': text_box_thresh,
|
||||||
'text_det_unclip_ratio': text_unclip,
|
'text_det_unclip_ratio': text_unclip,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add layout model configuration if specified
|
# Add layout model configuration if specified (Stage 3)
|
||||||
if layout_model_name:
|
if layout_model_name:
|
||||||
pp_kwargs['layout_detection_model_name'] = layout_model_name
|
pp_kwargs['layout_detection_model_name'] = layout_model_name
|
||||||
if layout_model_dir:
|
if layout_model_dir:
|
||||||
pp_kwargs['layout_detection_model_dir'] = layout_model_dir
|
pp_kwargs['layout_detection_model_dir'] = layout_model_dir
|
||||||
|
|
||||||
|
# Add table structure model configuration (Stage 4)
|
||||||
|
# PPStructureV3 uses separate models for wired (bordered) and wireless (borderless) tables
|
||||||
|
# Both models should be configured for comprehensive table detection
|
||||||
|
if wired_table_model:
|
||||||
|
pp_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
|
||||||
|
if wireless_table_model:
|
||||||
|
pp_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
|
||||||
|
|
||||||
|
# Add formula recognition model configuration (Stage 4)
|
||||||
|
if formula_model:
|
||||||
|
pp_kwargs['formula_recognition_model_name'] = formula_model
|
||||||
|
|
||||||
self.structure_engine = PPStructureV3(**pp_kwargs)
|
self.structure_engine = PPStructureV3(**pp_kwargs)
|
||||||
|
|
||||||
# Track model loading for cache management
|
# Track model loading for cache management
|
||||||
@@ -571,12 +600,15 @@ class OCRService:
|
|||||||
layout_threshold = settings.layout_detection_threshold
|
layout_threshold = settings.layout_detection_threshold
|
||||||
layout_model_name = settings.layout_detection_model_name
|
layout_model_name = settings.layout_detection_model_name
|
||||||
layout_model_dir = settings.layout_detection_model_dir
|
layout_model_dir = settings.layout_detection_model_dir
|
||||||
|
wired_table_model = settings.wired_table_model_name
|
||||||
|
wireless_table_model = settings.wireless_table_model_name
|
||||||
|
formula_model = settings.formula_recognition_model_name
|
||||||
|
|
||||||
# Build CPU fallback kwargs
|
# Build CPU fallback kwargs
|
||||||
cpu_kwargs = {
|
cpu_kwargs = {
|
||||||
'use_doc_orientation_classify': False,
|
'use_doc_orientation_classify': settings.use_doc_orientation_classify,
|
||||||
'use_doc_unwarping': False,
|
'use_doc_unwarping': settings.use_doc_unwarping,
|
||||||
'use_textline_orientation': False,
|
'use_textline_orientation': settings.use_textline_orientation,
|
||||||
'use_table_recognition': use_table,
|
'use_table_recognition': use_table,
|
||||||
'use_formula_recognition': use_formula,
|
'use_formula_recognition': use_formula,
|
||||||
'use_chart_recognition': use_chart,
|
'use_chart_recognition': use_chart,
|
||||||
@@ -586,6 +618,12 @@ class OCRService:
|
|||||||
cpu_kwargs['layout_detection_model_name'] = layout_model_name
|
cpu_kwargs['layout_detection_model_name'] = layout_model_name
|
||||||
if layout_model_dir:
|
if layout_model_dir:
|
||||||
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
|
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
|
||||||
|
if wired_table_model:
|
||||||
|
cpu_kwargs['wired_table_structure_recognition_model_name'] = wired_table_model
|
||||||
|
if wireless_table_model:
|
||||||
|
cpu_kwargs['wireless_table_structure_recognition_model_name'] = wireless_table_model
|
||||||
|
if formula_model:
|
||||||
|
cpu_kwargs['formula_recognition_model_name'] = formula_model
|
||||||
|
|
||||||
self.structure_engine = PPStructureV3(**cpu_kwargs)
|
self.structure_engine = PPStructureV3(**cpu_kwargs)
|
||||||
self._current_layout_model = layout_model # Track current model for recreation check
|
self._current_layout_model = layout_model # Track current model for recreation check
|
||||||
|
|||||||
@@ -40,8 +40,8 @@ class TestLayoutModelMapping:
|
|||||||
assert 'cdla' in LAYOUT_MODEL_MAPPING
|
assert 'cdla' in LAYOUT_MODEL_MAPPING
|
||||||
|
|
||||||
def test_chinese_model_maps_to_pp_doclayout(self):
|
def test_chinese_model_maps_to_pp_doclayout(self):
|
||||||
"""Verify 'chinese' maps to PP-DocLayout-S"""
|
"""Verify 'chinese' maps to PP-DocLayout_plus-L"""
|
||||||
assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout-S'
|
assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout_plus-L'
|
||||||
|
|
||||||
def test_default_model_maps_to_publaynet_sentinel(self):
|
def test_default_model_maps_to_publaynet_sentinel(self):
|
||||||
"""Verify 'default' maps to sentinel value for PubLayNet default"""
|
"""Verify 'default' maps to sentinel value for PubLayNet default"""
|
||||||
@@ -57,7 +57,7 @@ class TestLayoutModelEngine:
|
|||||||
"""Test engine creation with different layout models"""
|
"""Test engine creation with different layout models"""
|
||||||
|
|
||||||
def test_chinese_model_creates_engine_with_pp_doclayout(self):
|
def test_chinese_model_creates_engine_with_pp_doclayout(self):
|
||||||
"""Verify 'chinese' layout model uses PP-DocLayout-S"""
|
"""Verify 'chinese' layout model uses PP-DocLayout_plus-L"""
|
||||||
ocr_service = OCRService()
|
ocr_service = OCRService()
|
||||||
|
|
||||||
with patch.object(ocr_service, 'structure_engine', None):
|
with patch.object(ocr_service, 'structure_engine', None):
|
||||||
@@ -70,7 +70,7 @@ class TestLayoutModelEngine:
|
|||||||
mock_ppstructure.assert_called_once()
|
mock_ppstructure.assert_called_once()
|
||||||
call_kwargs = mock_ppstructure.call_args[1]
|
call_kwargs = mock_ppstructure.call_args[1]
|
||||||
|
|
||||||
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
|
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L'
|
||||||
|
|
||||||
def test_default_model_creates_engine_without_model_name(self):
|
def test_default_model_creates_engine_without_model_name(self):
|
||||||
"""Verify 'default' layout model does not specify model name (uses default)"""
|
"""Verify 'default' layout model does not specify model name (uses default)"""
|
||||||
@@ -121,7 +121,7 @@ class TestLayoutModelEngine:
|
|||||||
call_kwargs = mock_ppstructure.call_args[1]
|
call_kwargs = mock_ppstructure.call_args[1]
|
||||||
|
|
||||||
# Should use 'chinese' model as default
|
# Should use 'chinese' model as default
|
||||||
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
|
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout_plus-L'
|
||||||
|
|
||||||
|
|
||||||
class TestLayoutModelCaching:
|
class TestLayoutModelCaching:
|
||||||
|
|||||||
@@ -56,11 +56,11 @@
|
|||||||
"layoutModel": {
|
"layoutModel": {
|
||||||
"title": "版面偵測模型",
|
"title": "版面偵測模型",
|
||||||
"chinese": "中文文件模型",
|
"chinese": "中文文件模型",
|
||||||
"chineseDesc": "PP-DocLayout-S - 適用於中文表單、合約、發票(推薦)",
|
"chineseDesc": "PP-DocLayout_plus-L (83.2% mAP) - 適用於複雜中文文件,支援20種版面元素(推薦)",
|
||||||
"default": "標準模型",
|
"default": "標準模型",
|
||||||
"defaultDesc": "PubLayNet 模型 - 適用於英文學術論文、報告",
|
"defaultDesc": "PubLayNet 模型 (~94% mAP) - 適用於英文學術論文、報告",
|
||||||
"cdla": "CDLA 模型",
|
"cdla": "CDLA 模型",
|
||||||
"cdlaDesc": "專用中文版面分析模型 - 適用於複雜中文版面",
|
"cdlaDesc": "CDLA 版面分析模型 (~86% mAP) - 專用中文版面分析",
|
||||||
"recommended": "推薦",
|
"recommended": "推薦",
|
||||||
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
|
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
|
||||||
}
|
}
|
||||||
|
|||||||
141
openspec/changes/upgrade-ppstructure-models/MODEL_CLEANUP.md
Normal file
141
openspec/changes/upgrade-ppstructure-models/MODEL_CLEANUP.md
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
# PP-StructureV3 Model Cache Cleanup Guide
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
After upgrading PP-StructureV3 models, older unused models may remain in the cache directory. This guide explains how to safely remove them to free disk space.
|
||||||
|
|
||||||
|
## Model Cache Location
|
||||||
|
|
||||||
|
PaddleX/PaddleOCR 3.x stores downloaded models in:
|
||||||
|
|
||||||
|
```
|
||||||
|
~/.paddlex/official_models/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Models After Upgrade
|
||||||
|
|
||||||
|
### Current Active Models (DO NOT DELETE)
|
||||||
|
|
||||||
|
| Model | Purpose | Approx. Size |
|
||||||
|
|-------|---------|--------------|
|
||||||
|
| `PP-DocLayout_plus-L` | Layout detection for Chinese documents | ~350MB |
|
||||||
|
| `SLANeXt_wired` | Table structure recognition (bordered tables) | ~351MB |
|
||||||
|
| `SLANeXt_wireless` | Table structure recognition (borderless tables) | ~351MB |
|
||||||
|
| `PP-FormulaNet_plus-L` | Formula recognition (Chinese + English) | ~800MB |
|
||||||
|
| `PP-OCRv5_*` | Text detection and recognition | ~150MB |
|
||||||
|
| `picodet_lcnet_x1_0_fgd_layout_cdla` | CDLA layout model option | ~10MB |
|
||||||
|
|
||||||
|
### Deprecated Models (Safe to Delete)
|
||||||
|
|
||||||
|
| Model | Reason | Approx. Size |
|
||||||
|
|-------|--------|--------------|
|
||||||
|
| `PP-DocLayout-S` | Replaced by PP-DocLayout_plus-L | ~50MB |
|
||||||
|
| `SLANet` | Replaced by SLANeXt_wired/wireless | ~7MB |
|
||||||
|
| `SLANet_plus` | Replaced by SLANeXt_wired/wireless | ~7MB |
|
||||||
|
| `PP-FormulaNet-S` | Replaced by PP-FormulaNet_plus-L | ~200MB |
|
||||||
|
| `PP-FormulaNet-L` | Replaced by PP-FormulaNet_plus-L | ~400MB |
|
||||||
|
|
||||||
|
## Cleanup Commands
|
||||||
|
|
||||||
|
### List Current Cache
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List all cached models
|
||||||
|
ls -la ~/.paddlex/official_models/
|
||||||
|
|
||||||
|
# Show disk usage per model
|
||||||
|
du -sh ~/.paddlex/official_models/*
|
||||||
|
```
|
||||||
|
|
||||||
|
### Delete Deprecated Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Remove deprecated layout model
|
||||||
|
rm -rf ~/.paddlex/official_models/PP-DocLayout-S
|
||||||
|
|
||||||
|
# Remove deprecated table models
|
||||||
|
rm -rf ~/.paddlex/official_models/SLANet
|
||||||
|
rm -rf ~/.paddlex/official_models/SLANet_plus
|
||||||
|
|
||||||
|
# Remove deprecated formula models (if present)
|
||||||
|
rm -rf ~/.paddlex/official_models/PP-FormulaNet-S
|
||||||
|
rm -rf ~/.paddlex/official_models/PP-FormulaNet-L
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cleanup Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# cleanup_old_models.sh - Remove deprecated PP-StructureV3 models
|
||||||
|
|
||||||
|
CACHE_DIR="$HOME/.paddlex/official_models"
|
||||||
|
|
||||||
|
echo "PP-StructureV3 Model Cleanup"
|
||||||
|
echo "============================"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Check if cache directory exists
|
||||||
|
if [ ! -d "$CACHE_DIR" ]; then
|
||||||
|
echo "Cache directory not found: $CACHE_DIR"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# List deprecated models
|
||||||
|
DEPRECATED_MODELS=(
|
||||||
|
"PP-DocLayout-S"
|
||||||
|
"SLANet"
|
||||||
|
"SLANet_plus"
|
||||||
|
"PP-FormulaNet-S"
|
||||||
|
"PP-FormulaNet-L"
|
||||||
|
)
|
||||||
|
|
||||||
|
echo "Checking for deprecated models..."
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
TOTAL_SIZE=0
|
||||||
|
for model in "${DEPRECATED_MODELS[@]}"; do
|
||||||
|
MODEL_PATH="$CACHE_DIR/$model"
|
||||||
|
if [ -d "$MODEL_PATH" ]; then
|
||||||
|
SIZE=$(du -sh "$MODEL_PATH" 2>/dev/null | cut -f1)
|
||||||
|
echo "Found: $model ($SIZE)"
|
||||||
|
TOTAL_SIZE=$((TOTAL_SIZE + 1))
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $TOTAL_SIZE -eq 0 ]; then
|
||||||
|
echo "No deprecated models found. Cache is clean."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
read -p "Delete these models? [y/N]: " confirm
|
||||||
|
|
||||||
|
if [ "$confirm" = "y" ] || [ "$confirm" = "Y" ]; then
|
||||||
|
for model in "${DEPRECATED_MODELS[@]}"; do
|
||||||
|
MODEL_PATH="$CACHE_DIR/$model"
|
||||||
|
if [ -d "$MODEL_PATH" ]; then
|
||||||
|
rm -rf "$MODEL_PATH"
|
||||||
|
echo "Deleted: $model"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo ""
|
||||||
|
echo "Cleanup complete."
|
||||||
|
else
|
||||||
|
echo "Cleanup cancelled."
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Space Savings Estimate
|
||||||
|
|
||||||
|
After cleanup, you can expect to free approximately:
|
||||||
|
- **~50MB** from deprecated layout model
|
||||||
|
- **~14MB** from deprecated table models
|
||||||
|
- **~600MB** from deprecated formula models (if present)
|
||||||
|
|
||||||
|
Total potential savings: **~665MB**
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
1. Models are downloaded on first use. Deleting active models will trigger re-download.
|
||||||
|
2. The cache directory may vary if `PADDLEX_HOME` environment variable is set.
|
||||||
|
3. Always verify which models your configuration uses before deleting.
|
||||||
134
openspec/changes/upgrade-ppstructure-models/proposal.md
Normal file
134
openspec/changes/upgrade-ppstructure-models/proposal.md
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# Upgrade PP-StructureV3 Models
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
目前專案使用的 PP-StructureV3 模型配置存在以下問題:
|
||||||
|
|
||||||
|
1. **版面偵測模型精度不足**:PP-DocLayout-S (70.9% mAP) 無法正確處理複雜表格和版面
|
||||||
|
2. **表格識別準確率低**:SLANet (59.52%) 產出錯誤的 HTML 結構
|
||||||
|
3. **預處理模組未啟用**:文檔方向校正和彎曲校正功能關閉
|
||||||
|
4. **模型佔用空間過大**:下載了不使用的模型,浪費儲存空間
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
### Stage 1: 預處理模組 - 全部開啟
|
||||||
|
|
||||||
|
| 功能 | 當前 | 變更後 |
|
||||||
|
|-----|-----|-------|
|
||||||
|
| `use_doc_orientation_classify` | False | **True** |
|
||||||
|
| `use_doc_unwarping` | False | **True** |
|
||||||
|
| `use_textline_orientation` | False | **True** |
|
||||||
|
|
||||||
|
### Stage 2: OCR 模組 - 維持現狀
|
||||||
|
|
||||||
|
- 繼續使用 PP-OCRv5 (預設配置)
|
||||||
|
- 不需要更改
|
||||||
|
|
||||||
|
### Stage 3: 版面分析模組 - 升級模型選項
|
||||||
|
|
||||||
|
| 選項名稱 | 當前模型 | 變更後模型 | mAP |
|
||||||
|
|---------|---------|-----------|-----|
|
||||||
|
| `chinese` | PP-DocLayout-S (移除) | **PP-DocLayout_plus-L** | 83.2% |
|
||||||
|
| `default` | PubLayNet | PubLayNet (維持) | ~94% |
|
||||||
|
| `cdla` | CDLA | CDLA (維持) | ~86% |
|
||||||
|
|
||||||
|
**重點變更**:
|
||||||
|
- 移除 PP-DocLayout-S (70.9% mAP)
|
||||||
|
- 新增 PP-DocLayout_plus-L (83.2% mAP, 20類別)
|
||||||
|
- 前端「中文文檔」選項改用 PP-DocLayout_plus-L
|
||||||
|
|
||||||
|
### Stage 4: 元素識別模組 - 升級表格識別
|
||||||
|
|
||||||
|
| 模組 | 當前模型 | 變更後模型 | 準確率變化 |
|
||||||
|
|-----|---------|-----------|-----------|
|
||||||
|
| 表格識別 | SLANet (預設) | **SLANeXt_wired + SLANeXt_wireless** | 59.52% → 69.65% |
|
||||||
|
| 公式識別 | PP-FormulaNet (預設) | **PP-FormulaNet_plus-L** | 45.78% → 90.64% (中文) |
|
||||||
|
| 圖表解析 | PP-Chart2Table | PP-Chart2Table (維持) | - |
|
||||||
|
| 印章識別 | PP-OCRv4_seal | PP-OCRv4_seal (維持) | - |
|
||||||
|
|
||||||
|
**表格識別策略**:
|
||||||
|
- SLANeXt_wired 和 SLANeXt_wireless 搭配使用
|
||||||
|
- 先用分類器判斷有線/無線表格類型
|
||||||
|
- 根據類型選擇對應的 SLANeXt 模型
|
||||||
|
- 聯合測試準確率達 69.65%
|
||||||
|
|
||||||
|
### 儲存空間優化 - 刪除未使用模型
|
||||||
|
|
||||||
|
PaddleOCR 3.x 模型緩存位置:`~/.paddlex/official_models/`
|
||||||
|
|
||||||
|
可刪除的模型目錄:
|
||||||
|
- PP-DocLayout-S (被 PP-DocLayout_plus-L 取代)
|
||||||
|
- SLANet (被 SLANeXt 取代)
|
||||||
|
- 其他未使用的舊版模型
|
||||||
|
|
||||||
|
**注意**:刪除後首次使用新模型會觸發下載
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
### REQ-1: 預處理模組開啟
|
||||||
|
系統 **SHALL** 在 PP-StructureV3 初始化時啟用所有預處理功能:
|
||||||
|
- 文檔方向分類 (use_doc_orientation_classify=True)
|
||||||
|
- 文檔彎曲校正 (use_doc_unwarping=True)
|
||||||
|
- 文字行方向偵測 (use_textline_orientation=True)
|
||||||
|
|
||||||
|
**Scenario: 處理旋轉的掃描文檔**
|
||||||
|
- Given 一個旋轉 90 度的 PDF 文檔
|
||||||
|
- When 使用 OCR track 處理
|
||||||
|
- Then 系統應自動校正方向後再進行 OCR
|
||||||
|
|
||||||
|
### REQ-2: 版面模型升級
|
||||||
|
系統 **SHALL** 將「chinese」選項對應的模型從 PP-DocLayout-S 更改為 PP-DocLayout_plus-L
|
||||||
|
|
||||||
|
**Scenario: 處理中文複雜文檔**
|
||||||
|
- Given 包含表格、圖片、公式的中文文檔
|
||||||
|
- When 選擇「chinese」版面模型處理
|
||||||
|
- Then 應使用 PP-DocLayout_plus-L (83.2% mAP) 進行版面分析
|
||||||
|
|
||||||
|
### REQ-3: 表格識別升級
|
||||||
|
系統 **SHALL** 使用 SLANeXt_wired 和 SLANeXt_wireless 搭配進行表格識別
|
||||||
|
|
||||||
|
**Scenario: 處理有線表格**
|
||||||
|
- Given 包含有線表格的文檔
|
||||||
|
- When 進行表格結構識別
|
||||||
|
- Then 應使用 SLANeXt_wired 模型
|
||||||
|
- And 輸出正確的 HTML 表格結構
|
||||||
|
|
||||||
|
**Scenario: 處理無線表格**
|
||||||
|
- Given 包含無線表格的文檔
|
||||||
|
- When 進行表格結構識別
|
||||||
|
- Then 應使用 SLANeXt_wireless 模型
|
||||||
|
|
||||||
|
### REQ-4: 公式識別升級
|
||||||
|
系統 **SHALL** 使用 PP-FormulaNet_plus-L 進行公式識別以支援中文公式
|
||||||
|
|
||||||
|
### REQ-5: 模型緩存清理
|
||||||
|
系統 **SHOULD** 提供工具或文檔說明如何清理未使用的模型緩存以節省儲存空間
|
||||||
|
|
||||||
|
## Model Comparison Data
|
||||||
|
|
||||||
|
### 表格識別模型對比
|
||||||
|
|
||||||
|
| 模型 | 準確率 | 推理時間 | 模型大小 | 適用場景 |
|
||||||
|
|-----|-------|---------|---------|---------|
|
||||||
|
| SLANet | 59.52% | 24ms | 6.9 MB | ❌ 準確率不足 |
|
||||||
|
| SLANet_plus | 63.69% | 23ms | 6.9 MB | ❌ 仍不足 |
|
||||||
|
| **SLANeXt_wired** | 69.65% | 86ms | 351 MB | ✅ 有線表格 |
|
||||||
|
| **SLANeXt_wireless** | 69.65% | - | 351 MB | ✅ 無線表格 |
|
||||||
|
|
||||||
|
**結論**:SLANeXt 系列的準確率比 SLANet 高約 10 個百分點(59.52% → 69.65%),比 SLANet_plus 高約 6 個百分點(63.69% → 69.65%),但模型大小增加約 50 倍。考慮到表格識別是核心功能,建議升級。
|
||||||
|
|
||||||
|
### 版面偵測模型對比
|
||||||
|
|
||||||
|
| 模型 | 類別數 | mAP | 推理時間 | 適用場景 |
|
||||||
|
|-----|-------|-----|---------|---------|
|
||||||
|
| PP-DocLayout-S | 23 | 70.9% | 12ms | ❌ 精度不足 |
|
||||||
|
| PP-DocLayout-L | 23 | 90.4% | 34ms | ✅ 通用高精度 |
|
||||||
|
| **PP-DocLayout_plus-L** | 20 | 83.2% | 53ms | ✅ 複雜文檔推薦 |
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [PaddleOCR Table Structure Recognition](http://www.paddleocr.ai/main/en/version3.x/module_usage/table_structure_recognition.html)
|
||||||
|
- [SLANeXt_wired on HuggingFace](https://huggingface.co/PaddlePaddle/SLANeXt_wired)
|
||||||
|
- [SLANeXt_wireless on HuggingFace](https://huggingface.co/PaddlePaddle/SLANeXt_wireless)
|
||||||
|
- [PP-StructureV3 Technical Report](https://arxiv.org/html/2507.05595v1)
|
||||||
|
- [PaddleOCR Model Cache Issue](https://github.com/PaddlePaddle/PaddleOCR/issues/10234)
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
## ADDED Requirements
|
||||||
|
|
||||||
|
### Requirement: PP-StructureV3 Configuration
|
||||||
|
|
||||||
|
The system SHALL configure PP-StructureV3 with the following settings:
|
||||||
|
|
||||||
|
**Preprocessing (Stage 1):**
|
||||||
|
- Document orientation classification MUST be enabled (`use_doc_orientation_classify=True`)
|
||||||
|
- Document unwarping MUST be enabled (`use_doc_unwarping=True`)
|
||||||
|
- Textline orientation detection MUST be enabled (`use_textline_orientation=True`)
|
||||||
|
|
||||||
|
**Layout Detection (Stage 3):**
|
||||||
|
- The `chinese` layout model option SHALL use PP-DocLayout_plus-L (83.2% mAP)
|
||||||
|
- The `default` layout model option SHALL use PubLayNet for English documents
|
||||||
|
- The `cdla` layout model option SHALL use picodet_lcnet_x1_0_fgd_layout_cdla
|
||||||
|
|
||||||
|
**Element Recognition (Stage 4):**
|
||||||
|
- Table structure recognition SHALL use SLANeXt_wired and SLANeXt_wireless models (69.65% combined accuracy)
|
||||||
|
- Formula recognition SHALL use PP-FormulaNet_plus-L (92.22% English, 90.64% Chinese BLEU)
|
||||||
|
- Chart parsing SHALL use PP-Chart2Table
|
||||||
|
- Seal recognition SHALL use PP-OCRv4_seal
|
||||||
|
|
||||||
|
#### Scenario: Processing rotated scanned document
|
||||||
|
- **WHEN** a PDF document with rotated pages is processed using OCR track
|
||||||
|
- **THEN** the system SHALL automatically detect and correct the orientation before OCR processing
|
||||||
|
|
||||||
|
#### Scenario: Processing complex Chinese document with tables
|
||||||
|
- **WHEN** a Chinese document containing tables, images, and formulas is processed
|
||||||
|
- **AND** the user selects "chinese" layout model
|
||||||
|
- **THEN** the system SHALL use PP-DocLayout_plus-L for layout detection (83.2% mAP)
|
||||||
|
- **AND** the system SHALL correctly identify table regions
|
||||||
|
|
||||||
|
#### Scenario: Table structure recognition with wired tables
|
||||||
|
- **WHEN** a document contains wired (bordered) tables
|
||||||
|
- **THEN** the system SHALL use SLANeXt_wired model for structure recognition
|
||||||
|
- **AND** output correct HTML table structure with proper row/column spanning
|
||||||
|
|
||||||
|
#### Scenario: Table structure recognition with wireless tables
|
||||||
|
- **WHEN** a document contains wireless (borderless) tables
|
||||||
|
- **THEN** the system SHALL use SLANeXt_wireless model for structure recognition
|
||||||
|
|
||||||
|
#### Scenario: Chinese formula recognition
|
||||||
|
- **WHEN** a document contains mathematical formulas with Chinese characters
|
||||||
|
- **THEN** the system SHALL use PP-FormulaNet_plus-L for recognition
|
||||||
|
- **AND** output LaTeX code with correct Chinese character representation
|
||||||
|
|
||||||
|
## ADDED Requirements
|
||||||
|
|
||||||
|
### Requirement: Model Cache Cleanup
|
||||||
|
|
||||||
|
The system SHALL provide documentation for cleaning up unused model caches to optimize storage space.
|
||||||
|
|
||||||
|
#### Scenario: User wants to free disk space after model upgrade
|
||||||
|
- **WHEN** the user has upgraded from older models (PP-DocLayout-S, SLANet) to newer models
|
||||||
|
- **THEN** the documentation SHALL explain how to delete unused cached models from `~/.paddlex/official_models/`
|
||||||
|
- **AND** list which model directories can be safely removed
|
||||||
77
openspec/changes/upgrade-ppstructure-models/tasks.md
Normal file
77
openspec/changes/upgrade-ppstructure-models/tasks.md
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Tasks: Upgrade PP-StructureV3 Models
|
||||||
|
|
||||||
|
## 1. Backend Configuration Changes
|
||||||
|
|
||||||
|
- [x] 1.1 Update `backend/app/core/config.py` - Enable preprocessing flags
|
||||||
|
- Set `use_doc_orientation_classify` default to True
|
||||||
|
- Set `use_doc_unwarping` default to True
|
||||||
|
- Set `use_textline_orientation` default to True
|
||||||
|
- Add `table_structure_model_name` configuration
|
||||||
|
- Add `formula_recognition_model_name` configuration
|
||||||
|
|
||||||
|
- [x] 1.2 Update `backend/app/services/ocr_service.py` - Model mapping changes
|
||||||
|
- Update `LAYOUT_MODEL_MAPPING`:
|
||||||
|
- Change `"chinese"` from `"PP-DocLayout-S"` to `"PP-DocLayout_plus-L"`
|
||||||
|
- Keep `"default"` as PubLayNet
|
||||||
|
- Keep `"cdla"` as is
|
||||||
|
- Update `_ensure_structure_engine()`:
|
||||||
|
- Pass preprocessing flags to PPStructureV3
|
||||||
|
- Configure SLANeXt models for table recognition
|
||||||
|
- Configure PP-FormulaNet_plus-L for formula recognition
|
||||||
|
|
||||||
|
- [x] 1.3 Update PPStructureV3 initialization kwargs
|
||||||
|
- Add `table_structure_model_name="SLANeXt_wired"` (or configure the dual wired/wireless SLANeXt models)
|
||||||
|
- Add `formula_recognition_model_name="PP-FormulaNet_plus-L"`
|
||||||
|
- Verify preprocessing flags are passed correctly
|
||||||
|
|
||||||
|
## 2. Schema Updates
|
||||||
|
|
||||||
|
- [x] 2.1 Update `backend/app/schemas/task.py` - LayoutModelEnum
|
||||||
|
- Rename or update `CHINESE` description to reflect PP-DocLayout_plus-L
|
||||||
|
- Update docstrings to reflect new model capabilities
|
||||||
|
|
||||||
|
## 3. Frontend Updates
|
||||||
|
|
||||||
|
- [x] 3.1 Update `frontend/src/components/LayoutModelSelector.tsx`
|
||||||
|
- Update Chinese option description to mention PP-DocLayout_plus-L
|
||||||
|
- Update accuracy information displayed to users
|
||||||
|
|
||||||
|
- [x] 3.2 Update `frontend/src/i18n/locales/zh-TW.json`
|
||||||
|
- Update `layoutModel.chinese.description` to reflect new model
|
||||||
|
- Update any accuracy percentages in descriptions
|
||||||
|
|
||||||
|
## 4. Testing
|
||||||
|
|
||||||
|
- [x] 4.1 Create unit tests for new model configuration
|
||||||
|
- Test preprocessing flags are correctly passed
|
||||||
|
- Test model mapping resolves correctly
|
||||||
|
- Test engine initialization with new models
|
||||||
|
|
||||||
|
- [ ] 4.2 Integration testing with real documents
|
||||||
|
- Test rotated document handling (preprocessing)
|
||||||
|
- Test complex Chinese document layout detection
|
||||||
|
- Test table structure recognition accuracy
|
||||||
|
- Test formula recognition with Chinese formulas
|
||||||
|
|
||||||
|
- [x] 4.3 Update existing tests
|
||||||
|
- Update `backend/tests/services/test_layout_model.py` for new mapping
|
||||||
|
- Update `backend/tests/api/test_layout_model_api.py` if needed
|
||||||
|
|
||||||
|
## 5. Documentation
|
||||||
|
|
||||||
|
- [x] 5.1 Create model cleanup documentation
|
||||||
|
- Document `~/.paddlex/official_models/` cache location
|
||||||
|
- List models that can be safely deleted after upgrade
|
||||||
|
- Provide cleanup script/commands
|
||||||
|
- See: [MODEL_CLEANUP.md](./MODEL_CLEANUP.md)
|
||||||
|
|
||||||
|
- [x] 5.2 Update API documentation
|
||||||
|
- Document preprocessing feature behavior
|
||||||
|
- Update layout model descriptions
|
||||||
|
|
||||||
|
## 6. Verification & Deployment
|
||||||
|
|
||||||
|
- [ ] 6.1 Verify new models download correctly on first use
|
||||||
|
- [ ] 6.2 Measure memory/GPU usage with new models
|
||||||
|
- [ ] 6.3 Compare processing speed before/after upgrade
|
||||||
|
- [ ] 6.4 Verify existing functionality is not broken
|
||||||
Reference in New Issue
Block a user