fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
            element_type = elem_data.get('type', ElementType.TEXT)
            if isinstance(element_type, str):
                # Convert string to ElementType if needed
-                element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT
+                # ElementType is a str-based enum, so we can construct from value (lowercase)
+                try:
+                    element_type = ElementType(element_type)
+                except ValueError:
+                    # If value doesn't match, try member name (uppercase)
+                    element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
+
+            # Content-based reclassification: detect HTML tables in text content
+            content_str = elem_data.get('content', '')
+            if isinstance(content_str, str) and '<table' in content_str.lower():
+                if element_type == ElementType.TEXT:
+                    logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
+                    element_type = ElementType.TABLE

            # Prepare content based on element type
            if element_type == ElementType.TABLE:
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
            return None

    def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
-        """Extract table data from element."""
+        """
+        Extract table data from element using BeautifulSoup for robust HTML parsing.
+
+        This method produces TableData objects with fully populated cells arrays,
+        matching the format produced by DirectExtractionEngine for consistency.
+        """
        try:
            html = elem_data.get('html', '')
            extracted_text = elem_data.get('extracted_text', '')
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
                    html = content
                    logger.debug("Using content field as HTML table source")

-            # Try to parse HTML to get rows and columns
-            rows = 0
+            # Return None if no HTML table content
+            if not html or '<table' not in html.lower():
+                if extracted_text:
+                    # Return minimal TableData with just caption if we have text
+                    return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
+                return None
+
+            # Parse HTML table using BeautifulSoup
+            try:
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(html, 'html.parser')
+                table = soup.find('table')
+
+                if not table:
+                    logger.warning("No <table> element found in HTML")
+                    return self._fallback_table_data(html, extracted_text)
+
+                cells = []
+                headers = []
+                rows = table.find_all('tr')
+
+                # Track column positions by advancing col_idx by each cell's colspan.
+                # Simplified approach: rowspan is recorded on the cell but does not offset cells in later rows - complex spanning may need enhancement
+                for row_idx, row in enumerate(rows):
+                    row_cells = row.find_all(['td', 'th'])
+                    col_idx = 0
+
+                    for cell in row_cells:
+                        cell_content = cell.get_text(strip=True)
+                        rowspan = int(cell.get('rowspan', 1))
+                        colspan = int(cell.get('colspan', 1))
+
+                        cells.append(TableCell(
+                            row=row_idx,
+                            col=col_idx,
+                            row_span=rowspan,
+                            col_span=colspan,
+                            content=cell_content
+                        ))
+
+                        # Collect headers from <th> elements or first row
+                        if cell.name == 'th' or row_idx == 0:
+                            headers.append(cell_content)
+
+                        # Advance column index by colspan
+                        col_idx += colspan
+
+                # Calculate actual dimensions
+                num_rows = len(rows)
+                num_cols = max(
+                    sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
+                    for row in rows
+                ) if rows else 0
+
+                logger.debug(
+                    f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
+                )
+
+                return TableData(
+                    rows=num_rows,
+                    cols=num_cols,
+                    cells=cells,
+                    headers=headers if headers else None,
+                    caption=extracted_text if extracted_text else None
+                )
+
+            except ImportError:
+                logger.warning("BeautifulSoup not available, using fallback parsing")
+                return self._fallback_table_data(html, extracted_text)
+
+        except Exception as e:
+            logger.warning(f"Failed to extract table data: {e}")
+            return None
+
+    def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
+        """
+        Fallback table parsing used when BeautifulSoup is not available or finds no <table> element.
+        Returns basic TableData with row/col counts only (no cells).
+        """
+        try:
+            rows = html.count('<tr')
            cols = 0
-            cells = []
+            if rows > 0:
+                first_row_end = html.find('</tr>')
+                if first_row_end > 0:
+                    first_row = html[:first_row_end]
+                    cols = first_row.count('<td') + first_row.count('<th')

-            if html:
-                # Simple HTML parsing (could be enhanced with BeautifulSoup)
-                rows = html.count('<tr')
-                if rows > 0:
-                    # Estimate columns from first row
-                    first_row_end = html.find('</tr>')
-                    if first_row_end > 0:
-                        first_row = html[:first_row_end]
-                        cols = first_row.count('<td') + first_row.count('<th')
-
-            # Return None if no valid table data found
            if rows == 0 and cols == 0 and not extracted_text:
                return None

-            # Note: TableData uses 'cols' not 'columns'
-            # HTML content can be stored as caption or in element metadata
            return TableData(
                rows=rows,
                cols=cols,
-                cells=cells,
+                cells=[],  # Empty cells in fallback mode
                caption=extracted_text if extracted_text else None
            )
        except:
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
            min_distance = float('inf')

            for target in targets:
-                # Caption should be below the target
-                if target.bbox.y2 <= caption.bbox.y1:
-                    distance = caption.bbox.y1 - target.bbox.y2
+                # Caption should be below the target (y1 is bottom in BoundingBox)
+                if target.bbox.y1 <= caption.bbox.y0:
+                    distance = caption.bbox.y0 - target.bbox.y1
                    if distance < min_distance:
                        min_distance = distance
                        best_target = target
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
            else:
                prev_item = list_items[i-1]
                # Check if items are consecutive (similar x position, reasonable y gap)
-                x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20
-                y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30
+                x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
+                y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30

                if x_aligned and y_consecutive:
                    current_group.append(item)
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
            if i + 1 < len(headers):
                next_header_y = headers[i + 1].bbox.y1

-            # Find all elements between headers
+            # Find all elements between headers (y0=top, y1=bottom)
            content_elements = [
                e for e in elements
-                if (e.bbox.y1 > header.bbox.y2 and
-                    e.bbox.y1 < next_header_y and
+                if (e.bbox.y0 > header.bbox.y1 and
+                    e.bbox.y0 < next_header_y and
                    e.type not in [ElementType.HEADER, ElementType.TITLE])
            ]