fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issue: OCR track images are restored, but they still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions

View File

@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
element_type = elem_data.get('type', ElementType.TEXT)
if isinstance(element_type, str):
# Convert string to ElementType if needed
element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT
# ElementType is a str-based enum, so we can construct from value (lowercase)
try:
element_type = ElementType(element_type)
except ValueError:
# If value doesn't match, try member name (uppercase)
element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
# Content-based reclassification: detect HTML tables in text content
content_str = elem_data.get('content', '')
if isinstance(content_str, str) and '<table' in content_str.lower():
if element_type == ElementType.TEXT:
logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
element_type = ElementType.TABLE
# Prepare content based on element type
if element_type == ElementType.TABLE:
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
return None
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
"""Extract table data from element."""
"""
Extract table data from element using BeautifulSoup for robust HTML parsing.
This method produces TableData objects with fully populated cells arrays,
matching the format produced by DirectExtractionEngine for consistency.
"""
try:
html = elem_data.get('html', '')
extracted_text = elem_data.get('extracted_text', '')
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
html = content
logger.debug("Using content field as HTML table source")
# Try to parse HTML to get rows and columns
rows = 0
# Return None if no HTML table content
if not html or '<table' not in html.lower():
if extracted_text:
# Return minimal TableData with just caption if we have text
return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
return None
# Parse HTML table using BeautifulSoup
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if not table:
logger.warning("No <table> element found in HTML")
return self._fallback_table_data(html, extracted_text)
cells = []
headers = []
rows = table.find_all('tr')
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
cells.append(TableCell(
row=row_idx,
col=col_idx,
row_span=rowspan,
col_span=colspan,
content=cell_content
))
# Collect headers from <th> elements or first row
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
)
except ImportError:
logger.warning("BeautifulSoup not available, using fallback parsing")
return self._fallback_table_data(html, extracted_text)
except Exception as e:
logger.warning(f"Failed to extract table data: {e}")
return None
def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
"""
Fallback table parsing when BeautifulSoup is not available.
Returns basic TableData with row/col counts only (no cells).
"""
try:
rows = html.count('<tr')
cols = 0
cells = []
if rows > 0:
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
if html:
# Simple HTML parsing (could be enhanced with BeautifulSoup)
rows = html.count('<tr')
if rows > 0:
# Estimate columns from first row
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
# Return None if no valid table data found
if rows == 0 and cols == 0 and not extracted_text:
return None
# Note: TableData uses 'cols' not 'columns'
# HTML content can be stored as caption or in element metadata
return TableData(
rows=rows,
cols=cols,
cells=cells,
cells=[], # Empty cells in fallback mode
caption=extracted_text if extracted_text else None
)
except:
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
min_distance = float('inf')
for target in targets:
# Caption should be below the target
if target.bbox.y2 <= caption.bbox.y1:
distance = caption.bbox.y1 - target.bbox.y2
# Caption should be below the target (y1 is bottom in BoundingBox)
if target.bbox.y1 <= caption.bbox.y0:
distance = caption.bbox.y0 - target.bbox.y1
if distance < min_distance:
min_distance = distance
best_target = target
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
else:
prev_item = list_items[i-1]
# Check if items are consecutive (similar x position, reasonable y gap)
x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20
y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30
x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30
if x_aligned and y_consecutive:
current_group.append(item)
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
if i + 1 < len(headers):
next_header_y = headers[i + 1].bbox.y1
# Find all elements between headers
# Find all elements between headers (y0=top, y1=bottom)
content_elements = [
e for e in elements
if (e.bbox.y1 > header.bbox.y2 and
e.bbox.y1 < next_header_y and
if (e.bbox.y0 > header.bbox.y1 and
e.bbox.y0 < next_header_y and
e.type not in [ElementType.HEADER, ElementType.TITLE])
]

View File

@@ -167,7 +167,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir
parsing_res_list, current_page, output_dir, image_path
)
all_elements.extend(elements)
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
self,
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path]
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
Returns:
List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
element['img_path'] = item['img_path'] # Keep original for reference
else:
logger.warning(f"Failed to save image for element {element['element_id']}")
# Crop image from source if no img_path but source image is available
elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
cropped_path = self._crop_and_save_image(
source_image_path, bbox, output_dir, element['element_id']
)
if cropped_path:
element['saved_path'] = cropped_path
element['img_path'] = cropped_path
logger.info(f"Cropped and saved image region for {element['element_id']}")
else:
logger.warning(f"Failed to crop image for element {element['element_id']}")
# Add any additional metadata
if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
img_obj.save(str(img_path))
logger.info(f"Saved image to {img_path}")
except Exception as e:
logger.warning(f"Failed to save PIL image: {e}")
logger.warning(f"Failed to save PIL image: {e}")
def _crop_and_save_image(
    self,
    source_image_path: Path,
    bbox: List[float],
    output_dir: Path,
    element_id: str
) -> Optional[str]:
    """
    Crop a bounding-box region out of the source page image and persist it.

    Args:
        source_image_path: Path to the full source image.
        bbox: Region coordinates as [x1, y1, x2, y2].
        output_dir: Directory the cropped PNG is written into.
        element_id: Used as the output file's base name.

    Returns:
        The bare filename of the saved crop (relative, so the PDF generator
        can join it with result_dir later — same convention as the Direct
        Track), or None when the bbox is degenerate or any step fails.
    """
    try:
        from PIL import Image

        with Image.open(source_image_path) as page_img:
            width, height = page_img.size

            # Coerce coordinates to int and clamp them to the image bounds.
            coords = [int(v) for v in bbox[:4]]
            x1, x2 = (max(0, min(c, width)) for c in (coords[0], coords[2]))
            y1, y2 = (max(0, min(c, height)) for c in (coords[1], coords[3]))

            # A zero- or negative-area region cannot be cropped.
            if x2 <= x1 or y2 <= y1:
                logger.warning(f"Invalid bbox for cropping: {bbox}")
                return None

            region = page_img.crop((x1, y1, x2, y2))

            # Save flat into output_dir (no subdirectory), mirroring how the
            # Direct Track writes its images.
            filename = f"{element_id}.png"
            img_path = output_dir / filename
            region.save(str(img_path), "PNG")

            logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
            return filename
    except Exception as e:
        logger.error(f"Failed to crop and save image for {element_id}: {e}")
        return None