feat: simplify layout model selection and archive proposals
Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -68,7 +68,7 @@ def process_task_ocr(
|
||||
use_dual_track: bool = True,
|
||||
force_track: Optional[str] = None,
|
||||
language: str = 'ch',
|
||||
pp_structure_params: Optional[dict] = None
|
||||
layout_model: Optional[str] = "chinese"
|
||||
):
|
||||
"""
|
||||
Background task to process OCR for a task with dual-track support.
|
||||
@@ -84,7 +84,7 @@ def process_task_ocr(
|
||||
use_dual_track: Enable dual-track processing
|
||||
force_track: Force specific track ('ocr' or 'direct')
|
||||
language: OCR language code
|
||||
pp_structure_params: Optional custom PP-StructureV3 parameters (dict)
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
"""
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.task import Task
|
||||
@@ -143,7 +143,7 @@ def process_task_ocr(
|
||||
output_dir=result_dir,
|
||||
use_dual_track=use_dual_track,
|
||||
force_track=force_track,
|
||||
pp_structure_params=pp_structure_params
|
||||
layout_model=layout_model
|
||||
)
|
||||
else:
|
||||
# Fall back to traditional processing (no force_track support)
|
||||
@@ -152,7 +152,7 @@ def process_task_ocr(
|
||||
lang=language,
|
||||
detect_layout=True,
|
||||
output_dir=result_dir,
|
||||
pp_structure_params=pp_structure_params
|
||||
layout_model=layout_model
|
||||
)
|
||||
|
||||
# Calculate processing time
|
||||
@@ -717,14 +717,14 @@ async def start_task(
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Start processing a pending task with dual-track support and optional PP-StructureV3 parameter tuning
|
||||
Start processing a pending task with dual-track support and layout model selection
|
||||
|
||||
- **task_id**: Task UUID
|
||||
- **options**: Processing options (in request body):
|
||||
- **use_dual_track**: Enable intelligent track selection (default: true)
|
||||
- **force_track**: Force specific processing track ('ocr' or 'direct')
|
||||
- **language**: OCR language code (default: 'ch')
|
||||
- **pp_structure_params**: Fine-tuning parameters for PP-StructureV3 (OCR track only)
|
||||
- **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
|
||||
"""
|
||||
try:
|
||||
# Parse processing options with defaults
|
||||
@@ -735,11 +735,9 @@ async def start_task(
|
||||
force_track = options.force_track.value if options.force_track else None
|
||||
language = options.language
|
||||
|
||||
# Extract and convert PP-StructureV3 parameters to dict
|
||||
pp_structure_params = None
|
||||
if options.pp_structure_params:
|
||||
pp_structure_params = options.pp_structure_params.model_dump(exclude_none=True)
|
||||
logger.info(f"Using custom PP-StructureV3 parameters: {pp_structure_params}")
|
||||
# Extract layout model (default to 'chinese' for best Chinese document support)
|
||||
layout_model = options.layout_model.value if options.layout_model else "chinese"
|
||||
logger.info(f"Using layout model: {layout_model}")
|
||||
|
||||
# Get task details
|
||||
task = task_service.get_task_by_id(
|
||||
@@ -777,7 +775,7 @@ async def start_task(
|
||||
status=TaskStatus.PROCESSING
|
||||
)
|
||||
|
||||
# Start OCR processing in background with dual-track parameters and custom PP-StructureV3 params
|
||||
# Start OCR processing in background with dual-track parameters and layout model
|
||||
background_tasks.add_task(
|
||||
process_task_ocr,
|
||||
task_id=task_id,
|
||||
@@ -787,13 +785,11 @@ async def start_task(
|
||||
use_dual_track=use_dual_track,
|
||||
force_track=force_track,
|
||||
language=language,
|
||||
pp_structure_params=pp_structure_params
|
||||
layout_model=layout_model
|
||||
)
|
||||
|
||||
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
|
||||
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
|
||||
if pp_structure_params:
|
||||
logger.info(f"Custom PP-StructureV3 params: {pp_structure_params}")
|
||||
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}")
|
||||
return task
|
||||
|
||||
except HTTPException:
|
||||
|
||||
Reference in New Issue
Block a user