feat: simplify layout model selection and archive proposals

Changes:
- Replace the PP-Structure 7-slider parameter UI with a simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks completed)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -68,7 +68,7 @@ def process_task_ocr(
use_dual_track: bool = True,
force_track: Optional[str] = None,
language: str = 'ch',
pp_structure_params: Optional[dict] = None
layout_model: Optional[str] = "chinese"
):
"""
Background task to process OCR for a task with dual-track support.
@@ -84,7 +84,7 @@ def process_task_ocr(
use_dual_track: Enable dual-track processing
force_track: Force specific track ('ocr' or 'direct')
language: OCR language code
pp_structure_params: Optional custom PP-StructureV3 parameters (dict)
layout_model: Layout detection model ('chinese', 'default', 'cdla')
"""
from app.core.database import SessionLocal
from app.models.task import Task
@@ -143,7 +143,7 @@ def process_task_ocr(
output_dir=result_dir,
use_dual_track=use_dual_track,
force_track=force_track,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
else:
# Fall back to traditional processing (no force_track support)
@@ -152,7 +152,7 @@ def process_task_ocr(
lang=language,
detect_layout=True,
output_dir=result_dir,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Calculate processing time
@@ -717,14 +717,14 @@ async def start_task(
current_user: User = Depends(get_current_user)
):
"""
Start processing a pending task with dual-track support and optional PP-StructureV3 parameter tuning
Start processing a pending task with dual-track support and layout model selection
- **task_id**: Task UUID
- **options**: Processing options (in request body):
- **use_dual_track**: Enable intelligent track selection (default: true)
- **force_track**: Force specific processing track ('ocr' or 'direct')
- **language**: OCR language code (default: 'ch')
- **pp_structure_params**: Fine-tuning parameters for PP-StructureV3 (OCR track only)
- **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
"""
try:
# Parse processing options with defaults
@@ -735,11 +735,9 @@ async def start_task(
force_track = options.force_track.value if options.force_track else None
language = options.language
# Extract and convert PP-StructureV3 parameters to dict
pp_structure_params = None
if options.pp_structure_params:
pp_structure_params = options.pp_structure_params.model_dump(exclude_none=True)
logger.info(f"Using custom PP-StructureV3 parameters: {pp_structure_params}")
# Extract layout model (default to 'chinese' for best Chinese document support)
layout_model = options.layout_model.value if options.layout_model else "chinese"
logger.info(f"Using layout model: {layout_model}")
# Get task details
task = task_service.get_task_by_id(
@@ -777,7 +775,7 @@ async def start_task(
status=TaskStatus.PROCESSING
)
# Start OCR processing in background with dual-track parameters and custom PP-StructureV3 params
# Start OCR processing in background with dual-track parameters and layout model
background_tasks.add_task(
process_task_ocr,
task_id=task_id,
@@ -787,13 +785,11 @@ async def start_task(
use_dual_track=use_dual_track,
force_track=force_track,
language=language,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
if pp_structure_params:
logger.info(f"Custom PP-StructureV3 params: {pp_structure_params}")
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}")
return task
except HTTPException: