feat: enhance layout preprocessing and unify image scaling proposal
Backend changes:
- Add image scaling configuration for PP-Structure processing
- Enhance layout preprocessing service with scaling support
- Update OCR service with improved memory management
- Add PP-Structure enhanced processing improvements

Frontend changes:
- Update preprocessing settings UI
- Fix processing page layout and state management
- Update API types for new parameters

Proposals:
- Archive add-layout-preprocessing proposal (completed)
- Add unify-image-scaling proposal for consistent coordinate handling

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -7,10 +7,14 @@ This module provides enhanced PP-StructureV3 processing that extracts all
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
from typing import Dict, List, Optional, Tuple, Any, TYPE_CHECKING
|
||||
import json
|
||||
import gc
|
||||
|
||||
# Import ScalingInfo for type checking (avoid circular imports at runtime)
|
||||
if TYPE_CHECKING:
|
||||
from app.services.layout_preprocessing_service import ScalingInfo
|
||||
|
||||
# Optional torch import for additional GPU memory management
|
||||
try:
|
||||
import torch
|
||||
@@ -81,7 +85,8 @@ class PPStructureEnhanced:
|
||||
image_path: Path,
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0,
|
||||
preprocessed_image: Optional[Image.Image] = None
|
||||
preprocessed_image: Optional[Image.Image] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze document with full PP-StructureV3 capabilities.
|
||||
@@ -93,10 +98,13 @@ class PPStructureEnhanced:
|
||||
preprocessed_image: Optional preprocessed PIL Image for layout detection.
|
||||
If provided, this is used for PP-Structure prediction,
|
||||
but original image_path is still used for cropping images.
|
||||
scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
|
||||
for layout detection, all bbox coordinates will be scaled back
|
||||
to original image coordinates for proper cropping.
|
||||
|
||||
Returns:
|
||||
Dictionary with complete structure information including:
|
||||
- elements: List of all detected elements with types and bbox
|
||||
- elements: List of all detected elements with types and bbox (in original coords)
|
||||
- reading_order: Reading order indices
|
||||
- images: Extracted images with metadata
|
||||
- tables: Extracted tables with structure
|
||||
@@ -184,7 +192,7 @@ class PPStructureEnhanced:
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
parsing_res_list, current_page, output_dir, image_path
|
||||
parsing_res_list, current_page, output_dir, image_path, scaling_info
|
||||
)
|
||||
all_elements.extend(elements)
|
||||
|
||||
@@ -247,13 +255,15 @@ class PPStructureEnhanced:
|
||||
parsing_res_list: List[Dict],
|
||||
current_page: int,
|
||||
output_dir: Optional[Path],
|
||||
source_image_path: Optional[Path] = None
|
||||
source_image_path: Optional[Path] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process parsing_res_list to extract all elements.
|
||||
|
||||
Args:
|
||||
parsing_res_list: List of parsed elements from PP-StructureV3
|
||||
scaling_info: Scaling information for bbox coordinate restoration
|
||||
current_page: Current page number
|
||||
output_dir: Optional output directory
|
||||
source_image_path: Path to source image for cropping image regions
|
||||
@@ -285,11 +295,28 @@ class PPStructureEnhanced:
|
||||
|
||||
# Ensure bbox has 4 values
|
||||
if len(layout_bbox) >= 4:
|
||||
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
|
||||
bbox = list(layout_bbox[:4]) # [x1, y1, x2, y2]
|
||||
else:
|
||||
bbox = [0, 0, 0, 0] # Default if bbox missing
|
||||
logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")
|
||||
|
||||
# Scale bbox back to original image coordinates if image was scaled
|
||||
# This is critical for proper cropping from original high-resolution image
|
||||
if scaling_info and scaling_info.was_scaled and bbox != [0, 0, 0, 0]:
|
||||
scale_factor = scaling_info.scale_factor
|
||||
bbox = [
|
||||
bbox[0] * scale_factor, # x1
|
||||
bbox[1] * scale_factor, # y1
|
||||
bbox[2] * scale_factor, # x2
|
||||
bbox[3] * scale_factor # y2
|
||||
]
|
||||
if idx == 0: # Log only for first element to avoid spam
|
||||
logger.info(
|
||||
f"Scaled bbox to original coords: "
|
||||
f"{[round(x, 1) for x in layout_bbox[:4]]} -> {[round(x, 1) for x in bbox]} "
|
||||
f"(factor={scale_factor:.3f})"
|
||||
)
|
||||
|
||||
# Extract content (check multiple possible keys)
|
||||
content = (
|
||||
item.get('content', '') or
|
||||
|
||||
Reference in New Issue
Block a user