fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions

View File

@@ -167,7 +167,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir
parsing_res_list, current_page, output_dir, image_path
)
all_elements.extend(elements)
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
self,
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path]
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
Returns:
List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
element['img_path'] = item['img_path'] # Keep original for reference
else:
logger.warning(f"Failed to save image for element {element['element_id']}")
# Crop image from source if no img_path but source image is available
elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
cropped_path = self._crop_and_save_image(
source_image_path, bbox, output_dir, element['element_id']
)
if cropped_path:
element['saved_path'] = cropped_path
element['img_path'] = cropped_path
logger.info(f"Cropped and saved image region for {element['element_id']}")
else:
logger.warning(f"Failed to crop image for element {element['element_id']}")
# Add any additional metadata
if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
img_obj.save(str(img_path))
logger.info(f"Saved image to {img_path}")
except Exception as e:
logger.warning(f"Failed to save PIL image: {e}")
logger.warning(f"Failed to save PIL image: {e}")
def _crop_and_save_image(
    self,
    source_image_path: Path,
    bbox: List[float],
    output_dir: Path,
    element_id: str
) -> Optional[str]:
    """
    Crop an image region from the source image and save it as a PNG.

    The bounding box is truncated to integer pixel coordinates and clamped
    to the source image bounds before cropping. The crop is written to
    ``output_dir / f"{element_id}.png"``; the output directory is created
    if it does not exist yet.

    Args:
        source_image_path: Path to the source image
        bbox: Bounding box [x1, y1, x2, y2] in source-image pixels
        output_dir: Output directory for saving cropped image
        element_id: Element ID, used as the output file stem

    Returns:
        Relative filename (not full path) of the saved image, or None when
        the bbox is malformed/empty after clamping or when any step fails.
        Returning only the filename is intended to be consistent with
        Direct Track, which stores "filename.png" that gets joined with
        result_dir by pdf_generator_service.
    """
    # Modes the Pillow PNG writer accepts; anything else (e.g. CMYK or
    # YCbCr from JPEG scans) must be converted before saving.
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

    try:
        from PIL import Image

        # Reject malformed boxes explicitly instead of relying on the
        # unpacking error being swallowed by the catch-all below.
        if bbox is None or len(bbox) < 4:
            logger.warning(f"Invalid bbox for cropping: {bbox}")
            return None

        # Ensure bbox values are integers
        x1, y1, x2, y2 = [int(v) for v in bbox[:4]]

        with Image.open(source_image_path) as img:
            # Clamp the box to the image so crop() never pads with black
            img_width, img_height = img.size
            x1 = max(0, min(x1, img_width))
            x2 = max(0, min(x2, img_width))
            y1 = max(0, min(y1, img_height))
            y2 = max(0, min(y2, img_height))

            if x2 <= x1 or y2 <= y1:
                logger.warning(f"Invalid bbox for cropping: {bbox}")
                return None

            cropped = img.crop((x1, y1, x2, y2))

            # PNG cannot store every Pillow mode; normalise to RGB so the
            # region is not dropped just because of the source colour space.
            if cropped.mode not in png_modes:
                cropped = cropped.convert("RGB")

            # Save directly to output directory (no subdirectory),
            # consistent with Direct Track which saves to output_dir directly.
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)
            image_filename = f"{element_id}.png"
            img_path = target_dir / image_filename
            cropped.save(str(img_path), "PNG")

        # Return just the filename (relative to result_dir);
        # PDF generator will join with result_dir to get full path.
        logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
        return image_filename

    except Exception as e:
        # Best-effort: a failed crop must not abort processing of the page.
        logger.error(f"Failed to crop and save image for {element_id}: {e}")
        return None