chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -28,9 +28,11 @@ from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
from app.services.table_content_rebuilder import TableContentRebuilder
logger = logging.getLogger(__name__)
@@ -91,7 +93,8 @@ class PPStructureEnhanced:
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None,
save_visualization: bool = False,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -110,6 +113,8 @@ class PPStructureEnhanced:
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
use_cv_table_detection: If True, use CV-based line detection for wired tables
instead of ML-based cell detection (RT-DETR-L)
raw_ocr_regions: Optional list of raw OCR text regions for table content
rebuilding. Used when PP-StructureV3's table HTML is incorrect.
Returns:
Dictionary with complete structure information including:
@@ -222,6 +227,7 @@ class PPStructureEnhanced:
# Extract table_res_list which contains cell_box_list
layout_det_res = None
overall_ocr_res = None
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
@@ -235,13 +241,20 @@ class PPStructureEnhanced:
layout_det_res = result_dict['layout_det_res']
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
# Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference)
if 'overall_ocr_res' in result_dict:
overall_ocr_res = result_dict['overall_ocr_res']
ocr_count = len(overall_ocr_res.get('rec_texts', []))
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables
raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding
)
all_elements.extend(elements)
@@ -289,6 +302,15 @@ class PPStructureEnhanced:
if visualization_dir:
result['visualization_dir'] = str(visualization_dir)
# Add overall_ocr_res for gap filling (converted to standard format)
# This allows gap_filling_service to use PP-StructureV3's internal OCR
# instead of running a separate Raw OCR inference
if overall_ocr_res:
result['overall_ocr_res'] = self._convert_overall_ocr_to_regions(
overall_ocr_res, scaling_info
)
logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res")
return result
except Exception as e:
@@ -327,7 +349,8 @@ class PPStructureEnhanced:
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None,
layout_det_res: Optional[Dict] = None,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -341,6 +364,7 @@ class PPStructureEnhanced:
table_res_list: Optional list of table results containing cell_box_list
layout_det_res: Optional layout detection result for Image-in-Table processing
use_cv_table_detection: If True, use CV line detection for wired tables
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
List of processed elements with normalized structure
@@ -415,6 +439,11 @@ class PPStructureEnhanced:
mapped_type = ElementType.TABLE
html_table_content = content # Store for later use
# Strip LaTeX math formatting from text content (PP-Structure formula detection)
if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]:
if '$' in content and '\\' in content:
content = self._strip_latex_math(content)
# Create element
element = {
'element_id': f"pp3_{current_page}_{idx}",
@@ -468,18 +497,84 @@ class PPStructureEnhanced:
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
break
# If no HTML match, use first available table_res with cell_box_list
# If no HTML match, find best matching table_res by bbox overlap
if not cell_boxes_extracted:
best_match = None
best_overlap = 0.0
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
# Remove used table_res to avoid reuse
table_res_list.remove(tbl_res)
break
if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']:
continue
# Get table_res bbox from its cell_box_list
cell_boxes_temp = tbl_res['cell_box_list']
if not cell_boxes_temp:
continue
# Calculate bounding box of all cells
tbl_x1 = min(cb[0] for cb in cell_boxes_temp)
tbl_y1 = min(cb[1] for cb in cell_boxes_temp)
tbl_x2 = max(cb[2] for cb in cell_boxes_temp)
tbl_y2 = max(cb[3] for cb in cell_boxes_temp)
# Calculate IoU (Intersection over Union) with element bbox
# bbox is [x1, y1, x2, y2]
elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Intersection
inter_x1 = max(tbl_x1, elem_x1)
inter_y1 = max(tbl_y1, elem_y1)
inter_x2 = min(tbl_x2, elem_x2)
inter_y2 = min(tbl_y2, elem_y2)
if inter_x1 < inter_x2 and inter_y1 < inter_y2:
inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1)
tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1)
# Use overlap ratio with element bbox (how much of element is covered)
overlap_ratio = inter_area / elem_area if elem_area > 0 else 0
if overlap_ratio > best_overlap:
best_overlap = overlap_ratio
best_match = tbl_res
# Use best match if overlap is significant (>10%)
if best_match and best_overlap > 0.1:
cell_boxes = best_match['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})")
# Extract pred_html if not already set
if not html_content and 'pred_html' in best_match:
html_content = best_match['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)")
# Remove used table_res to avoid reuse
table_res_list.remove(best_match)
elif table_res_list:
# Fallback to first available if no bbox match found
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)")
# Extract pred_html if not already set
if not html_content and 'pred_html' in tbl_res:
html_content = tbl_res['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)")
table_res_list.remove(tbl_res)
break
if not cell_boxes_extracted and 'boxes' in res_data:
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
@@ -558,6 +653,42 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# 4. Table content rebuilding from raw OCR regions
# When cell_boxes have boundary issues, rebuild table content from raw OCR
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
rebuilder = TableContentRebuilder()
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
element['cell_boxes'],
bbox,
element.get('html', '')
)
if should_rebuild:
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
cell_boxes=element['cell_boxes'],
table_bbox=bbox,
raw_ocr_regions=raw_ocr_regions,
original_html=element.get('html', '')
)
if rebuilt_table:
# Update element with rebuilt content
element['html'] = rebuilt_table['html']
element['rebuilt_table'] = rebuilt_table
element['rebuild_stats'] = rebuild_stats
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
logger.info(
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
f"with {len(rebuilt_table['cells'])} cells"
)
else:
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
element['rebuild_stats'] = rebuild_stats
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
@@ -587,6 +718,21 @@ class PPStructureEnhanced:
elements.append(element)
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
# Apply cell validation to filter over-detected tables
if settings.cell_validation_enabled:
cell_validator = CellValidationEngine(CellValidationConfig(
max_cell_density=settings.cell_validation_max_density,
min_avg_cell_area=settings.cell_validation_min_cell_area,
min_cell_height=settings.cell_validation_min_cell_height,
enabled=True
))
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
if validation_stats['reclassified_tables'] > 0:
logger.info(
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
f"tables reclassified as TEXT due to over-detection"
)
return elements
def _embed_images_in_table(
@@ -911,18 +1057,145 @@ class PPStructureEnhanced:
type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
return type_counts
def _convert_overall_ocr_to_regions(
self,
overall_ocr_res: Dict[str, Any],
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Convert PP-StructureV3's overall_ocr_res to standard OCR region format.
This allows gap_filling_service to use PP-StructureV3's internal OCR results
instead of running a separate Raw OCR inference, saving approximately 50%
of total inference time.
The overall_ocr_res structure:
- dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
- rec_texts: List of recognized text strings
- rec_scores: List of confidence scores
Args:
overall_ocr_res: Dictionary containing OCR results from PP-StructureV3
scaling_info: Optional scaling info for coordinate restoration
Returns:
List of OCR region dictionaries in standard format:
[{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...]
"""
regions = []
dt_polys = overall_ocr_res.get('dt_polys', [])
rec_texts = overall_ocr_res.get('rec_texts', [])
rec_scores = overall_ocr_res.get('rec_scores', [])
# Ensure all lists have the same length
num_regions = min(len(dt_polys), len(rec_texts))
if len(rec_scores) < num_regions:
# Pad with default confidence if scores are missing
rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores))
for i in range(num_regions):
text = rec_texts[i]
if not text or not text.strip():
continue
poly = dt_polys[i]
confidence = rec_scores[i] if i < len(rec_scores) else 0.9
# Apply scaling restoration if needed
if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0:
scale = scaling_info.scale_factor
poly = [[pt[0] / scale, pt[1] / scale] for pt in poly]
regions.append({
'text': text,
'bbox': poly, # Keep polygon format for compatibility
'confidence': confidence
})
return regions
def _extract_text_from_html(self, html: str) -> str:
    """
    Extract plain text from HTML content.

    Prefers BeautifulSoup when available; otherwise falls back to a
    simple regex-based tag stripper. Any LaTeX math notation left over
    from PP-Structure formula detection is converted to plain text at
    the end via _strip_latex_math.

    Args:
        html: HTML fragment (typically a table) to flatten to text.

    Returns:
        Whitespace-normalized plain text with LaTeX math stripped.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    except Exception:
        # Fallback: just remove HTML tags.
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; ImportError and parse errors still fall here.
        import re
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Strip LaTeX math formatting if present
    return self._strip_latex_math(text)
def _strip_latex_math(self, text: str) -> str:
"""
Convert LaTeX math notation to plain text.
PP-StructureV3 outputs formulas in LaTeX format like:
$N\\cdot m\\times8.851=|b\\cdot|$
This converts them to readable plain text.
"""
import re
if not text or '$' not in text:
return text
# Remove $...$ delimiters but keep content
text = re.sub(r'\$([^$]+)\$', r'\1', text)
# Convert common LaTeX math commands to plain text
replacements = [
(r'\\cdot', '·'), # Multiplication dot
(r'\\times', '×'), # Multiplication sign
(r'\\div', '÷'), # Division sign
(r'\\pm', '±'), # Plus-minus
(r'\\leq', ''), # Less than or equal
(r'\\geq', ''), # Greater than or equal
(r'\\neq', ''), # Not equal
(r'\\approx', ''), # Approximately equal
(r'\\circ', '°'), # Degree symbol
(r'\\degree', '°'), # Degree symbol
(r'\\alpha', 'α'),
(r'\\beta', 'β'),
(r'\\gamma', 'γ'),
(r'\\delta', 'δ'),
(r'\\mu', 'μ'),
(r'\\Omega', 'Ω'),
(r'\\infty', ''),
(r'\^\\{2\\}', '²'), # Superscript 2
(r'\^\\{3\\}', '³'), # Superscript 3
(r'\^2', '²'),
(r'\^3', '³'),
(r'_\\{([^}]+)\\}', r'_\1'), # Subscript
(r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text
(r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text
(r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text
(r'\\text\{([^}]+)\}', r'\1'), # Text mode
(r'\\left', ''),
(r'\\right', ''),
(r'\\[|]', '|'), # Pipe symbols
(r'\\ ', ' '), # Escaped space
(r'\\,', ' '), # Thin space
(r'\\;', ' '), # Medium space
(r'\\quad', ' '), # Quad space
(r'\\qquad', ' '), # Double quad space
]
for pattern, replacement in replacements:
text = re.sub(pattern, replacement, text)
# Clean up any remaining backslashes followed by letters (unknown commands)
text = re.sub(r'\\[a-zA-Z]+', '', text)
# Clean up multiple spaces
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _extract_bbox_from_filename(self, filename: str) -> List[int]:
"""Extract bbox from filename if it contains coordinate information."""