feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -3,6 +3,9 @@ OCR to UnifiedDocument Converter

 Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
 all structure information and metadata.
+
+Includes gap filling support to supplement PP-StructureV3 output with raw OCR
+regions when significant content loss is detected.
 """

 import logging
@@ -16,10 +19,165 @@ from app.models.unified_document import (
    BoundingBox, StyleInfo, TableData, ElementType,
    ProcessingTrack, TableCell, Dimensions
 )
+from app.services.gap_filling_service import GapFillingService

 logger = logging.getLogger(__name__)


+def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Remove empty columns from a table dictionary.
+
+    A column is considered empty if ALL cells in that column have content that is
+    empty or whitespace-only (using .strip() to determine emptiness).
+
+    This function:
+    1. Identifies columns where every cell's content is empty/whitespace
+    2. Removes identified empty columns
+    3. Updates cols/columns value
+    4. Recalculates each cell's col index
+    5. Adjusts col_span when spans cross removed columns
+    6. Removes cells entirely when their complete span falls within removed columns
+    7. Preserves original bbox (no layout drift)
+
+    Args:
+        table_dict: Table dictionary with keys: rows, cols/columns, cells
+
+    Returns:
+        Cleaned table dictionary with empty columns removed
+    """
+    cells = table_dict.get('cells', [])
+    if not cells:
+        return table_dict
+
+    # Get original column count
+    original_cols = table_dict.get('cols', table_dict.get('columns', 0))
+    if original_cols == 0:
+        # Calculate from cells if not provided
+        max_col = 0
+        for cell in cells:
+            cell_col = cell.get('col', 0) if isinstance(cell, dict) else getattr(cell, 'col', 0)
+            cell_span = cell.get('col_span', 1) if isinstance(cell, dict) else getattr(cell, 'col_span', 1)
+            max_col = max(max_col, cell_col + cell_span)
+        original_cols = max_col
+
+    if original_cols == 0:
+        return table_dict
+
+    # Build a map: column_index -> list of cell contents
+    # Cells with col_span > 1 contribute their content to every column they span
+    column_contents: Dict[int, List[str]] = {i: [] for i in range(original_cols)}
+
+    for cell in cells:
+        if isinstance(cell, dict):
+            col = cell.get('col', 0)
+            col_span = cell.get('col_span', 1)
+            content = cell.get('content', '')
+        else:
+            col = getattr(cell, 'col', 0)
+            col_span = getattr(cell, 'col_span', 1)
+            content = getattr(cell, 'content', '')
+
+        # Mark content for each column this cell spans
+        for c in range(col, min(col + col_span, original_cols)):
+            if c in column_contents:
+                column_contents[c].append(str(content).strip() if content else '')
+
+    # Identify empty columns (all content is empty/whitespace)
+    empty_columns = set()
+    for col_idx, contents in column_contents.items():
+        # A column is empty if ALL cells in it have empty content
+        # Note: If a column has no cells at all, it's considered empty
+        if all(c == '' for c in contents):
+            empty_columns.add(col_idx)
+
+    if not empty_columns:
+        # No empty columns to remove, just ensure cols is set
+        result = dict(table_dict)
+        if result.get('cols', result.get('columns', 0)) == 0:
+            result['cols'] = original_cols
+            if 'columns' in result:
+                result['columns'] = original_cols
+        return result
+
+    logger.debug(f"Removing empty columns: {sorted(empty_columns)} from table with {original_cols} cols")
+
+    # Build column mapping: old_col -> new_col (or None if removed)
+    col_mapping: Dict[int, Optional[int]] = {}
+    new_col = 0
+    for old_col in range(original_cols):
+        if old_col in empty_columns:
+            col_mapping[old_col] = None
+        else:
+            col_mapping[old_col] = new_col
+            new_col += 1
+
+    new_cols = new_col
+
+    # Process cells
+    new_cells = []
+    for cell in cells:
+        if isinstance(cell, dict):
+            old_col = cell.get('col', 0)
+            old_col_span = cell.get('col_span', 1)
+        else:
+            old_col = getattr(cell, 'col', 0)
+            old_col_span = getattr(cell, 'col_span', 1)
+
+        # Calculate new col and col_span
+        # Find the first and last non-removed columns in this cell's span
+        new_start_col = None
+        new_end_col = None
+
+        for c in range(old_col, min(old_col + old_col_span, original_cols)):
+            mapped = col_mapping.get(c)
+            if mapped is not None:
+                if new_start_col is None:
+                    new_start_col = mapped
+                new_end_col = mapped
+
+        # If entire span falls within removed columns, skip this cell
+        if new_start_col is None:
+            logger.debug(f"Removing cell at row={cell.get('row', 0) if isinstance(cell, dict) else cell.row}, "
+                        f"col={old_col} (entire span in removed columns)")
+            continue
+
+        new_col_span = new_end_col - new_start_col + 1
+
+        # Create new cell
+        if isinstance(cell, dict):
+            new_cell = dict(cell)
+            new_cell['col'] = new_start_col
+            new_cell['col_span'] = new_col_span
+        else:
+            # Handle TableCell objects
+            new_cell = {
+                'row': cell.row,
+                'col': new_start_col,
+                'row_span': cell.row_span,
+                'col_span': new_col_span,
+                'content': cell.content
+            }
+            if hasattr(cell, 'bbox') and cell.bbox:
+                new_cell['bbox'] = cell.bbox
+            if hasattr(cell, 'style') and cell.style:
+                new_cell['style'] = cell.style
+
+        new_cells.append(new_cell)
+
+    # Build result
+    result = dict(table_dict)
+    result['cells'] = new_cells
+    result['cols'] = new_cols
+    if 'columns' in result:
+        result['columns'] = new_cols
+
+    logger.info(f"Trimmed table: {original_cols} -> {new_cols} columns, "
+               f"{len(cells)} -> {len(new_cells)} cells")
+
+    return result
+
+
 class OCRToUnifiedConverter:
    """
    Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -30,11 +188,19 @@ class OCRToUnifiedConverter:
    - Multi-page document assembly
    - Metadata preservation
    - Structure relationship mapping
+    - Gap filling with raw OCR regions (when PP-StructureV3 misses content)
    """

-    def __init__(self):
-        """Initialize the converter."""
+    def __init__(self, enable_gap_filling: bool = True):
+        """
+        Initialize the converter.
+
+        Args:
+            enable_gap_filling: Whether to enable gap filling with raw OCR regions
+        """
        self.element_counter = 0
+        self.gap_filling_service = GapFillingService() if enable_gap_filling else None
+        self.gap_filling_stats: Dict[str, Any] = {}

    def convert(
        self,
@@ -120,13 +286,21 @@ class OCRToUnifiedConverter:
        Extract pages from OCR results.

        Handles both enhanced PP-StructureV3 results (with parsing_res_list)
-        and traditional markdown results.
+        and traditional markdown results. Applies gap filling when enabled.
        """
        pages = []

+        # Extract raw OCR text regions for gap filling
+        raw_text_regions = ocr_results.get('text_regions', [])
+        ocr_dimensions = ocr_results.get('ocr_dimensions', {})
+
        # Check if we have enhanced results from PPStructureEnhanced
        if 'enhanced_results' in ocr_results:
-            pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
+            pages = self._extract_from_enhanced_results(
+                ocr_results['enhanced_results'],
+                raw_text_regions=raw_text_regions,
+                ocr_dimensions=ocr_dimensions
+            )
        # Check for traditional OCR results with text_regions at top level (from process_file_traditional)
        elif 'text_regions' in ocr_results:
            pages = self._extract_from_traditional_ocr(ocr_results)
@@ -143,9 +317,21 @@ class OCRToUnifiedConverter:

    def _extract_from_enhanced_results(
        self,
-        enhanced_results: List[Dict[str, Any]]
+        enhanced_results: List[Dict[str, Any]],
+        raw_text_regions: Optional[List[Dict[str, Any]]] = None,
+        ocr_dimensions: Optional[Dict[str, Any]] = None
    ) -> List[Page]:
-        """Extract pages from enhanced PP-StructureV3 results."""
+        """
+        Extract pages from enhanced PP-StructureV3 results.
+
+        Applies gap filling when enabled to supplement PP-StructureV3 output
+        with raw OCR regions that were not detected by the layout model.
+
+        Args:
+            enhanced_results: PP-StructureV3 enhanced results
+            raw_text_regions: Raw OCR text regions for gap filling
+            ocr_dimensions: OCR image dimensions for coordinate alignment
+        """
        pages = []

        for page_idx, page_result in enumerate(enhanced_results):
@@ -158,15 +344,52 @@ class OCRToUnifiedConverter:
                    if element:
                        elements.append(element)

+            # Get page dimensions
+            pp_dimensions = Dimensions(
+                width=page_result.get('width', 0),
+                height=page_result.get('height', 0)
+            )
+
+            # Apply gap filling if enabled and raw regions available
+            if self.gap_filling_service and raw_text_regions:
+                # Filter raw regions for current page
+                page_raw_regions = [
+                    r for r in raw_text_regions
+                    if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
+                ]
+
+                if page_raw_regions:
+                    supplemented, stats = self.gap_filling_service.fill_gaps(
+                        raw_ocr_regions=page_raw_regions,
+                        pp_structure_elements=elements,
+                        page_number=page_idx + 1,
+                        ocr_dimensions=ocr_dimensions,
+                        pp_dimensions=pp_dimensions
+                    )
+
+                    # Store statistics
+                    self.gap_filling_stats[f'page_{page_idx + 1}'] = stats
+
+                    if supplemented:
+                        logger.info(
+                            f"Page {page_idx + 1}: Gap filling added {len(supplemented)} elements "
+                            f"(coverage: {stats.get('coverage_ratio', 0):.2%})"
+                        )
+                        elements.extend(supplemented)
+
+                        # Recalculate reading order for combined elements
+                        reading_order = self.gap_filling_service.recalculate_reading_order(elements)
+                        page_result['reading_order'] = reading_order
+
            # Create page
            page = Page(
                page_number=page_idx + 1,
-                dimensions=Dimensions(
-                    width=page_result.get('width', 0),
-                    height=page_result.get('height', 0)
-                ),
+                dimensions=pp_dimensions,
                elements=elements,
-                metadata={'reading_order': page_result.get('reading_order', [])}
+                metadata={
+                    'reading_order': page_result.get('reading_order', []),
+                    'gap_filling': self.gap_filling_stats.get(f'page_{page_idx + 1}', {})
+                }
            )

            pages.append(page)
@@ -500,6 +723,9 @@ class OCRToUnifiedConverter:
    ) -> Optional[DocumentElement]:
        """Convert table data to DocumentElement."""
        try:
+            # Clean up empty columns before building TableData
+            table_dict = trim_empty_columns(table_dict)
+
            # Extract bbox
            bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
            bbox = BoundingBox(
@@ -587,14 +813,22 @@ class OCRToUnifiedConverter:
                cells = []
                headers = []
                rows = table.find_all('tr')
+                num_rows = len(rows)

-                # Track actual column positions accounting for rowspan/colspan
-                # This is a simplified approach - complex spanning may need enhancement
+                # Occupancy grid: occupied[row][col] = True for every grid slot covered by a
+                # cell (including its rowspan/colspan); the column count is derived from it below
+                occupied: Dict[int, Dict[int, bool]] = {r: {} for r in range(num_rows)}
+
+                # Parse all cells with proper rowspan/colspan handling
                for row_idx, row in enumerate(rows):
                    row_cells = row.find_all(['td', 'th'])
                    col_idx = 0

                    for cell in row_cells:
+                        # Skip columns that are occupied by rowspan from previous rows
+                        while occupied[row_idx].get(col_idx, False):
+                            col_idx += 1
+
                        cell_content = cell.get_text(strip=True)
                        rowspan = int(cell.get('rowspan', 1))
                        colspan = int(cell.get('colspan', 1))
@@ -611,26 +845,66 @@ class OCRToUnifiedConverter:
                        if cell.name == 'th' or row_idx == 0:
                            headers.append(cell_content)

+                        # Mark cells as occupied for rowspan/colspan
+                        for r in range(row_idx, min(row_idx + rowspan, num_rows)):
+                            for c in range(col_idx, col_idx + colspan):
+                                if r not in occupied:
+                                    occupied[r] = {}
+                                occupied[r][c] = True
+
                        # Advance column index by colspan
                        col_idx += colspan

-                # Calculate actual dimensions
-                num_rows = len(rows)
-                num_cols = max(
-                    sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
-                    for row in rows
-                ) if rows else 0
+                # Calculate actual column count from occupied cells
+                num_cols = 0
+                for r in range(num_rows):
+                    if occupied[r]:
+                        max_col_in_row = max(occupied[r].keys()) + 1
+                        num_cols = max(num_cols, max_col_in_row)

                logger.debug(
                    f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
                )

+                # Build table dict for cleanup
+                table_dict = {
+                    'rows': num_rows,
+                    'cols': num_cols,
+                    'cells': [
+                        {
+                            'row': c.row,
+                            'col': c.col,
+                            'row_span': c.row_span,
+                            'col_span': c.col_span,
+                            'content': c.content
+                        }
+                        for c in cells
+                    ],
+                    'headers': headers if headers else None,
+                    'caption': extracted_text if extracted_text else None
+                }
+
+                # Clean up empty columns
+                table_dict = trim_empty_columns(table_dict)
+
+                # Convert cleaned cells back to TableCell objects
+                cleaned_cells = [
+                    TableCell(
+                        row=c['row'],
+                        col=c['col'],
+                        row_span=c.get('row_span', 1),
+                        col_span=c.get('col_span', 1),
+                        content=c.get('content', '')
+                    )
+                    for c in table_dict.get('cells', [])
+                ]
+
                return TableData(
-                    rows=num_rows,
-                    cols=num_cols,
-                    cells=cells,
-                    headers=headers if headers else None,
-                    caption=extracted_text if extracted_text else None
+                    rows=table_dict.get('rows', num_rows),
+                    cols=table_dict.get('cols', num_cols),
+                    cells=cleaned_cells,
+                    headers=table_dict.get('headers'),
+                    caption=table_dict.get('caption')
                )

            except ImportError: