fix: OCR track table data format and image cropping
Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -167,7 +167,7 @@ class PPStructureEnhanced:
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
parsing_res_list, current_page, output_dir
|
||||
parsing_res_list, current_page, output_dir, image_path
|
||||
)
|
||||
all_elements.extend(elements)
|
||||
|
||||
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
|
||||
self,
|
||||
parsing_res_list: List[Dict],
|
||||
current_page: int,
|
||||
output_dir: Optional[Path]
|
||||
output_dir: Optional[Path],
|
||||
source_image_path: Optional[Path] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process parsing_res_list to extract all elements.
|
||||
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
|
||||
parsing_res_list: List of parsed elements from PP-StructureV3
|
||||
current_page: Current page number
|
||||
output_dir: Optional output directory
|
||||
source_image_path: Path to source image for cropping image regions
|
||||
|
||||
Returns:
|
||||
List of processed elements with normalized structure
|
||||
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
|
||||
element['img_path'] = item['img_path'] # Keep original for reference
|
||||
else:
|
||||
logger.warning(f"Failed to save image for element {element['element_id']}")
|
||||
# Crop image from source if no img_path but source image is available
|
||||
elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
|
||||
cropped_path = self._crop_and_save_image(
|
||||
source_image_path, bbox, output_dir, element['element_id']
|
||||
)
|
||||
if cropped_path:
|
||||
element['saved_path'] = cropped_path
|
||||
element['img_path'] = cropped_path
|
||||
logger.info(f"Cropped and saved image region for {element['element_id']}")
|
||||
else:
|
||||
logger.warning(f"Failed to crop image for element {element['element_id']}")
|
||||
|
||||
# Add any additional metadata
|
||||
if 'metadata' in item:
|
||||
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
|
||||
img_obj.save(str(img_path))
|
||||
logger.info(f"Saved image to {img_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to save PIL image: {e}")
|
||||
logger.warning(f"Failed to save PIL image: {e}")
|
||||
|
||||
def _crop_and_save_image(
    self,
    source_image_path: Path,
    bbox: List[float],
    output_dir: Path,
    element_id: str
) -> Optional[str]:
    """
    Crop a region out of the source image and save it as a PNG.

    The bounding box is expanded outwards to whole pixels (floor for the
    top-left corner, ceil for the bottom-right corner) and then clamped to
    the image bounds, so fractional coordinates never shave a pixel off the
    region. The output directory is created if it does not exist yet.

    Args:
        source_image_path: Path to the source image
        bbox: Bounding box [x1, y1, x2, y2] in source-image pixels;
            only the first four values are used
        output_dir: Output directory for saving cropped image
        element_id: Element ID; the file is written as "<element_id>.png"

    Returns:
        The bare filename (not a full path) of the saved image, or None
        when the clamped box is empty or any step fails. Failures are
        logged, never raised.
    """
    try:
        import math

        from PIL import Image

        # Round outwards so a fractional bbox keeps every pixel it touches.
        # float() first so numeric strings are still accepted, as with int().
        left, top, right, bottom = (float(v) for v in bbox[:4])
        x1, y1 = math.floor(left), math.floor(top)
        x2, y2 = math.ceil(right), math.ceil(bottom)

        with Image.open(source_image_path) as img:
            # Clamp the box to the image so crop() never pads with black.
            img_width, img_height = img.size
            x1 = max(0, min(x1, img_width))
            x2 = max(0, min(x2, img_width))
            y1 = max(0, min(y1, img_height))
            y2 = max(0, min(y2, img_height))

            if x2 <= x1 or y2 <= y1:
                logger.warning(f"Invalid bbox for cropping: {bbox}")
                return None

            cropped = img.crop((x1, y1, x2, y2))

            # The PNG writer rejects these colour modes (typical for CMYK
            # JPEG scans); convert instead of losing the image.
            if cropped.mode in ("CMYK", "YCbCr", "HSV"):
                cropped = cropped.convert("RGB")

            # Save directly into output_dir (no subdirectory), creating it
            # on demand so a fresh result directory does not fail the save.
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)
            image_filename = f"{element_id}.png"
            img_path = target_dir / image_filename
            cropped.save(str(img_path), "PNG")

            # Only the filename is returned; the caller stores it as
            # saved_path / img_path on the element.
            logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
            return image_filename

    except Exception as e:
        # Best-effort: a failed crop must not abort processing of the page.
        logger.error(f"Failed to crop and save image for {element_id}: {e}")
        return None
|
||||
Reference in New Issue
Block a user