fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -167,7 +167,7 @@ class PPStructureEnhanced:
                # Process parsing_res_list if found
                if parsing_res_list:
                    elements = self._process_parsing_res_list(
-                        parsing_res_list, current_page, output_dir
+                        parsing_res_list, current_page, output_dir, image_path
                    )
                    all_elements.extend(elements)

@@ -229,7 +229,8 @@ class PPStructureEnhanced:
        self,
        parsing_res_list: List[Dict],
        current_page: int,
-        output_dir: Optional[Path]
+        output_dir: Optional[Path],
+        source_image_path: Optional[Path] = None
    ) -> List[Dict[str, Any]]:
        """
        Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
            parsing_res_list: List of parsed elements from PP-StructureV3
            current_page: Current page number
            output_dir: Optional output directory
+            source_image_path: Path to source image for cropping image regions

        Returns:
            List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
                        element['img_path'] = item['img_path']  # Keep original for reference
                    else:
                        logger.warning(f"Failed to save image for element {element['element_id']}")
+                # Crop image from source if no img_path but source image is available
+                elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
+                    cropped_path = self._crop_and_save_image(
+                        source_image_path, bbox, output_dir, element['element_id']
+                    )
+                    if cropped_path:
+                        element['saved_path'] = cropped_path
+                        element['img_path'] = cropped_path
+                        logger.info(f"Cropped and saved image region for {element['element_id']}")
+                    else:
+                        logger.warning(f"Failed to crop image for element {element['element_id']}")

            # Add any additional metadata
            if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
            img_obj.save(str(img_path))
            logger.info(f"Saved image to {img_path}")
        except Exception as e:
-            logger.warning(f"Failed to save PIL image: {e}")
+            logger.warning(f"Failed to save PIL image: {e}")
+
+    def _crop_and_save_image(
+        self,
+        source_image_path: Path,
+        bbox: List[float],
+        output_dir: Path,
+        element_id: str
+    ) -> Optional[str]:
+        """
+        Crop a bbox region from the source image and save it as a PNG in output_dir.
+
+        Args:
+            source_image_path: Path to the source image, opened with PIL
+            bbox: Bounding box [x1, y1, x2, y2]; truncated to int, clamped to image size
+            output_dir: Directory the cropped PNG is written into (no subdirectory)
+            element_id: Element ID; the file is named "{element_id}.png"
+
+        Returns:
+            Filename only (not the full path) of the saved PNG, or None when the
+            clamped bbox is empty or any step raises. Direct Track likewise stores
+            "filename.png", which pdf_generator_service joins with result_dir.
+        """
+        try:
+            from PIL import Image
+
+            # Open source image; the context manager closes the file on exit
+            with Image.open(source_image_path) as img:
+                # Truncate the first four bbox values to integer pixel coordinates
+                x1, y1, x2, y2 = [int(v) for v in bbox[:4]]
+
+                # Clamp bbox to the image bounds
+                img_width, img_height = img.size
+                x1 = max(0, min(x1, img_width))
+                x2 = max(0, min(x2, img_width))
+                y1 = max(0, min(y1, img_height))
+                y2 = max(0, min(y2, img_height))
+
+                if x2 <= x1 or y2 <= y1:  # empty or inverted region after clamping
+                    logger.warning(f"Invalid bbox for cropping: {bbox}")
+                    return None
+
+                # Crop the clamped region
+                cropped = img.crop((x1, y1, x2, y2))
+
+                # Save directly to output directory (no subdirectory),
+                # consistent with Direct Track which saves to output_dir directly
+                image_filename = f"{element_id}.png"
+                img_path = output_dir / image_filename
+                cropped.save(str(img_path), "PNG")
+
+                # Return just the filename (relative to result_dir);
+                # PDF generator will join with result_dir to get full path
+                logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
+                return image_filename
+
+        except Exception as e:  # best-effort: any failure is logged and reported as None
+            logger.error(f"Failed to crop and save image for {element_id}: {e}")
+            return None