feat: enhance layout preprocessing and add unify-image-scaling proposal

Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Improve PP-Structure enhanced processing: scale bboxes back to original image coordinates

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 09:23:19 +08:00
parent 86bbea6fbf
commit dda9621e17
17 changed files with 826 additions and 104 deletions

View File

@@ -7,10 +7,14 @@ This module provides enhanced PP-StructureV3 processing that extracts all
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from typing import Dict, List, Optional, Tuple, Any, TYPE_CHECKING
import json
import gc
# Import ScalingInfo for type checking (avoid circular imports at runtime)
if TYPE_CHECKING:
from app.services.layout_preprocessing_service import ScalingInfo
# Optional torch import for additional GPU memory management
try:
import torch
@@ -81,7 +85,8 @@ class PPStructureEnhanced:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -93,10 +98,13 @@ class PPStructureEnhanced:
preprocessed_image: Optional preprocessed PIL Image for layout detection.
If provided, this is used for PP-Structure prediction,
but original image_path is still used for cropping images.
scaling_info: Optional ScalingInfo from preprocessing. If the image was scaled
for layout detection, all bbox coordinates are scaled back
to the original image's coordinates so that cropping is correct.
Returns:
Dictionary with complete structure information including:
- elements: List of all detected elements with types and bbox
- elements: List of all detected elements with types and bbox (in original coords)
- reading_order: Reading order indices
- images: Extracted images with metadata
- tables: Extracted tables with structure
@@ -184,7 +192,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path
parsing_res_list, current_page, output_dir, image_path, scaling_info
)
all_elements.extend(elements)
@@ -247,13 +255,15 @@ class PPStructureEnhanced:
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
source_image_path: Optional[Path] = None,
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
Args:
parsing_res_list: List of parsed elements from PP-StructureV3
scaling_info: Scaling information for bbox coordinate restoration
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
@@ -285,11 +295,28 @@ class PPStructureEnhanced:
# Ensure bbox has 4 values
if len(layout_bbox) >= 4:
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
bbox = list(layout_bbox[:4]) # [x1, y1, x2, y2]
else:
bbox = [0, 0, 0, 0] # Default if bbox missing
logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")
# Scale bbox back to original image coordinates if image was scaled
# This is critical for proper cropping from original high-resolution image
if scaling_info and scaling_info.was_scaled and bbox != [0, 0, 0, 0]:
scale_factor = scaling_info.scale_factor
bbox = [
bbox[0] * scale_factor, # x1
bbox[1] * scale_factor, # y1
bbox[2] * scale_factor, # x2
bbox[3] * scale_factor # y2
]
if idx == 0: # Log only for first element to avoid spam
logger.info(
f"Scaled bbox to original coords: "
f"{[round(x, 1) for x in layout_bbox[:4]]} -> {[round(x, 1) for x in bbox]} "
f"(factor={scale_factor:.3f})"
)
# Extract content (check multiple possible keys)
content = (
item.get('content', '') or