feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla (picodet_lcnet_x1_0_fgd_layout_cdla)
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -46,6 +46,19 @@ except ImportError as e:
logger = logging.getLogger(__name__)
# Marker stored in LAYOUT_MODEL_MAPPING for the "default" choice: it stands for
# "pass no explicit layout model name" so the PubLayNet-based default is used.
_USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"

# Maps the user-facing layout model option to the model name it resolves to:
#   "chinese" -> PP-DocLayout-S (Chinese documents: forms, contracts, invoices)
#   "default" -> sentinel above (PubLayNet-based default, English documents)
#   "cdla"    -> picodet_lcnet_x1_0_fgd_layout_cdla (alternative Chinese layout)
LAYOUT_MODEL_MAPPING = {
    "chinese": "PP-DocLayout-S",
    "default": _USE_PUBLAYNET_DEFAULT,
    "cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
}
class OCRService:
"""
@@ -436,77 +449,45 @@ class OCRService:
return self.ocr_engines[lang]
def _ensure_structure_engine(self, custom_params: Optional[Dict[str, any]] = None) -> PPStructureV3:
def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
"""
Get or create PP-Structure engine for layout analysis with GPU support.
Supports custom parameters that override default settings.
Supports layout model selection for different document types.
Args:
custom_params: Optional dictionary of custom PP-StructureV3 parameters.
If provided, creates a new engine instance (not cached).
Supported keys: layout_detection_threshold, layout_nms_threshold,
layout_merge_bboxes_mode, layout_unclip_ratio, text_det_thresh,
text_det_box_thresh, text_det_unclip_ratio
layout_model: Layout detection model selection:
- "chinese": PP-DocLayout-S (best for Chinese documents)
- "default": PubLayNet-based (best for English documents)
- "cdla": CDLA model (alternative for Chinese layout)
- None: Use config default
Returns:
PPStructure engine instance
"""
# If custom params provided, create a new engine instance (don't use cache)
if custom_params:
logger.info(f"Creating PP-StructureV3 engine with custom parameters (GPU: {self.use_gpu})")
logger.info(f"Custom params: {custom_params}")
# Resolve layout model name from user-friendly name
resolved_model_name = None
use_publaynet_default = False # Flag to explicitly use PubLayNet default (no model param)
try:
# Base configuration from settings
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
if layout_model:
resolved_model_name = LAYOUT_MODEL_MAPPING.get(layout_model)
if layout_model not in LAYOUT_MODEL_MAPPING:
logger.warning(f"Unknown layout model '{layout_model}', using config default")
resolved_model_name = settings.layout_detection_model_name
elif resolved_model_name == _USE_PUBLAYNET_DEFAULT:
# User explicitly selected "default" - use PubLayNet without custom model
use_publaynet_default = True
resolved_model_name = None
logger.info(f"Using layout model: {layout_model} -> PubLayNet default (no custom model)")
else:
logger.info(f"Using layout model: {layout_model} -> {resolved_model_name}")
# Parameter priority: custom > settings default
layout_threshold = custom_params.get('layout_detection_threshold', settings.layout_detection_threshold)
layout_nms = custom_params.get('layout_nms_threshold', settings.layout_nms_threshold)
layout_merge = custom_params.get('layout_merge_bboxes_mode', settings.layout_merge_mode)
layout_unclip = custom_params.get('layout_unclip_ratio', settings.layout_unclip_ratio)
text_thresh = custom_params.get('text_det_thresh', settings.text_det_thresh)
text_box_thresh = custom_params.get('text_det_box_thresh', settings.text_det_box_thresh)
text_unclip = custom_params.get('text_det_unclip_ratio', settings.text_det_unclip_ratio)
# Check if we need to recreate the engine due to different model
current_model = getattr(self, '_current_layout_model', None)
if self.structure_engine is not None and layout_model and layout_model != current_model:
logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
self.structure_engine = None # Force recreation
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
# Create temporary engine with custom params (not cached)
custom_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip,
layout_merge_bboxes_mode=layout_merge,
text_det_thresh=text_thresh,
text_det_box_thresh=text_box_thresh,
text_det_unclip_ratio=text_unclip,
)
logger.info(f"PP-StructureV3 engine with custom params ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
# Check GPU memory after loading
if self.use_gpu and settings.enable_memory_optimization:
self._check_gpu_memory_usage()
return custom_engine
except Exception as e:
logger.error(f"Failed to create PP-StructureV3 engine with custom params: {e}")
# Fall back to default cached engine
logger.warning("Falling back to default cached engine")
custom_params = None # Clear custom params to use cached engine
# Use cached default engine
# Use cached engine or create new one
if self.structure_engine is None:
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
@@ -524,28 +505,51 @@ class OCRService:
text_box_thresh = settings.text_det_box_thresh
text_unclip = settings.text_det_unclip_ratio
# Layout model configuration:
# - If use_publaynet_default: don't specify any model (use PubLayNet default)
# - If resolved_model_name: use the specified model
# - Otherwise: use config default
if use_publaynet_default:
layout_model_name = None # Explicitly no model = PubLayNet default
elif resolved_model_name:
layout_model_name = resolved_model_name
else:
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip,
layout_merge_bboxes_mode=layout_merge, # Use 'small' to minimize merging
text_det_thresh=text_thresh,
text_det_box_thresh=text_box_thresh,
text_det_unclip_ratio=text_unclip,
)
# Build PPStructureV3 kwargs
pp_kwargs = {
'use_doc_orientation_classify': False,
'use_doc_unwarping': False,
'use_textline_orientation': False,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
'layout_nms': layout_nms,
'layout_unclip_ratio': layout_unclip,
'layout_merge_bboxes_mode': layout_merge,
'text_det_thresh': text_thresh,
'text_det_box_thresh': text_box_thresh,
'text_det_unclip_ratio': text_unclip,
}
# Add layout model configuration if specified
if layout_model_name:
pp_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
pp_kwargs['layout_detection_model_dir'] = layout_model_dir
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
self._model_last_used['structure'] = datetime.now()
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
@@ -565,17 +569,27 @@ class OCRService:
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
layout_threshold = settings.layout_detection_threshold
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
)
logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")
# Build CPU fallback kwargs
cpu_kwargs = {
'use_doc_orientation_classify': False,
'use_doc_unwarping': False,
'use_textline_orientation': False,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
}
if layout_model_name:
cpu_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
else:
raise
@@ -813,7 +827,7 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
current_page: int = 0,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -825,7 +839,7 @@ class OCRService:
confidence_threshold: Minimum confidence threshold (uses default if None)
output_dir: Optional output directory for saving extracted images
current_page: Current page number (0-based) for multi-page documents
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Dictionary with OCR results and metadata
@@ -894,7 +908,7 @@ class OCRService:
confidence_threshold=confidence_threshold,
output_dir=output_dir,
current_page=page_num - 1, # Convert to 0-based page number for layout data
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Accumulate results
@@ -1040,7 +1054,7 @@ class OCRService:
image_path,
output_dir=output_dir,
current_page=current_page,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Generate Markdown
@@ -1078,6 +1092,38 @@ class OCRService:
'height': ocr_height
}]
# Generate PP-StructureV3 debug outputs if enabled
if settings.pp_structure_debug_enabled and output_dir:
try:
from app.services.pp_structure_debug import PPStructureDebug
debug_service = PPStructureDebug(output_dir)
# Save raw results as JSON
debug_service.save_raw_results(
pp_structure_results={
'elements': layout_data.get('elements', []),
'total_elements': layout_data.get('total_elements', 0),
'element_types': layout_data.get('element_types', {}),
'reading_order': layout_data.get('reading_order', []),
'enhanced': True,
'has_parsing_res_list': True
},
raw_ocr_regions=text_regions,
filename_prefix=image_path.stem
)
# Generate visualization if enabled
if settings.pp_structure_debug_visualization:
debug_service.generate_visualization(
image_path=image_path,
pp_structure_elements=layout_data.get('elements', []),
raw_ocr_regions=text_regions,
filename_prefix=image_path.stem
)
logger.info(f"Generated PP-StructureV3 debug outputs for {image_path.name}")
except Exception as debug_error:
logger.warning(f"Failed to generate debug outputs: {debug_error}")
logger.info(
f"OCR completed: {image_path.name} - "
f"{len(text_regions)} regions, "
@@ -1164,7 +1210,7 @@ class OCRService:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1173,7 +1219,7 @@ class OCRService:
image_path: Path to image file
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
current_page: Current page number (0-based) for multi-page documents
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Tuple of (layout_data, images_metadata)
@@ -1191,7 +1237,7 @@ class OCRService:
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
)
structure_engine = self._ensure_structure_engine(pp_structure_params)
structure_engine = self._ensure_structure_engine(layout_model)
# Try enhanced processing first
try:
@@ -1425,7 +1471,7 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
force_track: Optional[str] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Union[UnifiedDocument, Dict]:
"""
Process document using dual-track approach.
@@ -1437,7 +1483,7 @@ class OCRService:
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory for extracted images
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
Returns:
UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1445,7 +1491,7 @@ class OCRService:
if not self.dual_track_enabled:
# Fallback to traditional OCR processing
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
start_time = datetime.now()
@@ -1517,7 +1563,7 @@ class OCRService:
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, pp_structure_params=pp_structure_params
output_dir=output_dir, layout_model=layout_model
)
# Convert OCR result to extract images
@@ -1550,7 +1596,7 @@ class OCRService:
# Use OCR for scanned documents, images, etc.
logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
# Convert OCR result to UnifiedDocument using the converter
@@ -1580,7 +1626,7 @@ class OCRService:
logger.error(f"Error in dual-track processing: {e}")
# Fallback to traditional OCR
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
def _merge_ocr_images_into_direct(
@@ -1659,7 +1705,7 @@ class OCRService:
detect_layout: bool = True,
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Dict:
"""
Traditional OCR processing (legacy method).
@@ -1670,7 +1716,7 @@ class OCRService:
detect_layout: Whether to perform layout analysis
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Dictionary with OCR results in legacy format
@@ -1683,7 +1729,7 @@ class OCRService:
all_results = []
for i, image_path in enumerate(image_paths):
result = self.process_image(
image_path, lang, detect_layout, confidence_threshold, output_dir, i, pp_structure_params
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
)
all_results.append(result)
@@ -1699,7 +1745,7 @@ class OCRService:
else:
# Single image or other file
return self.process_image(
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
)
def _combine_results(self, results: List[Dict]) -> Dict:
@@ -1784,7 +1830,7 @@ class OCRService:
output_dir: Optional[Path] = None,
use_dual_track: bool = True,
force_track: Optional[str] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -1797,7 +1843,7 @@ class OCRService:
output_dir: Optional output directory
use_dual_track: Whether to use dual-track processing (default True)
force_track: Force specific track ("ocr" or "direct")
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -1809,12 +1855,12 @@ class OCRService:
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
)
else:
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
def process_legacy(