feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -3,6 +3,9 @@ OCR to UnifiedDocument Converter
Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
all structure information and metadata.
Includes gap filling support to supplement PP-StructureV3 output with raw OCR
regions when significant content loss is detected.
"""
import logging
@@ -16,10 +19,165 @@ from app.models.unified_document import (
BoundingBox, StyleInfo, TableData, ElementType,
ProcessingTrack, TableCell, Dimensions
)
from app.services.gap_filling_service import GapFillingService
logger = logging.getLogger(__name__)
def _cell_field(cell: Any, name: str, default: Any) -> Any:
    """Return field ``name`` of a dict- or attribute-style cell (``default`` if absent/None)."""
    if isinstance(cell, dict):
        value = cell.get(name)
    else:
        value = getattr(cell, name, None)
    return default if value is None else value


def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Remove empty columns from a table dictionary.

    A column is empty when every cell covering it (including cells that reach
    it through ``col_span``) has empty or whitespace-only content. A column
    that no cell covers at all is also treated as empty.

    Processing steps:
        1. Determine the column count as the larger of the declared
           ``cols``/``columns`` value and the right-most column any cell
           reaches, so cells lying beyond an under-declared count are kept.
        2. Find the empty columns and build an old -> new index mapping.
        3. Re-index each cell's ``col`` and shrink ``col_span`` where the span
           crosses removed columns.
        4. Drop cells whose whole span lies in removed columns.
        5. Update ``cols`` (and ``columns`` when that key is present).

    The table-level ``bbox`` and every other key are copied through untouched.
    The input dictionary and its cells are never mutated.

    Args:
        table_dict: Table dictionary with keys ``rows``, ``cols``/``columns``
            and ``cells``. Cells may be dicts or objects exposing ``row``,
            ``col``, ``row_span``, ``col_span``, ``content`` (and optionally
            ``bbox``/``style``) attributes.

    Returns:
        ``table_dict`` itself when it has no cells; otherwise a shallow copy.
        When columns were removed, the copy holds a new ``cells`` list of
        dicts; when nothing was removed, the original cells are kept as-is.
    """
    # Same logger object as a module-level getLogger(__name__) would return.
    log = logging.getLogger(__name__)

    cells = table_dict.get('cells', [])
    if not cells:
        return table_dict

    # (start column, span) per cell; a missing/None/zero span counts as 1 so
    # that every cell occupies at least its own column.
    spans = [
        (_cell_field(cell, 'col', 0), max(_cell_field(cell, 'col_span', 1), 1))
        for cell in cells
    ]

    declared_cols = table_dict.get('cols') or table_dict.get('columns') or 0
    if not isinstance(declared_cols, int):
        declared_cols = 0

    # Never trust a declared count that is smaller than what the cells use:
    # otherwise cells past it would be dropped as soon as any column is removed.
    original_cols = max(declared_cols, max(col + span for col, span in spans))
    if original_cols <= 0:
        return table_dict

    # A cell's content counts for every column it spans, not just the first.
    column_has_content = [False] * original_cols
    for cell, (col, span) in zip(cells, spans):
        content = _cell_field(cell, 'content', '')
        text = str(content).strip() if content else ''
        if not text:
            continue
        for c in range(max(col, 0), min(col + span, original_cols)):
            column_has_content[c] = True

    empty_columns = {
        idx for idx, has_content in enumerate(column_has_content)
        if not has_content
    }

    if not empty_columns:
        # Nothing to remove; only make sure the column count is recorded.
        result = dict(table_dict)
        if not declared_cols or 'cols' in result:
            result['cols'] = original_cols
        if 'columns' in result:
            result['columns'] = original_cols
        return result

    log.debug(
        "Removing empty columns: %s from table with %s cols",
        sorted(empty_columns), original_cols
    )

    # old column index -> new column index, for surviving columns only
    col_mapping: Dict[int, int] = {}
    for old_col in range(original_cols):
        if old_col not in empty_columns:
            col_mapping[old_col] = len(col_mapping)
    new_cols = len(col_mapping)

    new_cells: List[Dict[str, Any]] = []
    for cell, (col, span) in zip(cells, spans):
        # New indices of the surviving columns inside this cell's span.
        # Surviving columns are renumbered consecutively, so the new span
        # is simply how many of them remain.
        kept = [
            col_mapping[c]
            for c in range(max(col, 0), min(col + span, original_cols))
            if c in col_mapping
        ]
        if not kept:
            log.debug(
                "Removing cell at row=%s, col=%s (entire span in removed columns)",
                _cell_field(cell, 'row', 0), col
            )
            continue

        if isinstance(cell, dict):
            new_cell = dict(cell)
            new_cell['col'] = kept[0]
            new_cell['col_span'] = len(kept)
        else:
            # Attribute-style cells (e.g. TableCell) are emitted as dicts.
            new_cell = {
                'row': _cell_field(cell, 'row', 0),
                'col': kept[0],
                'row_span': _cell_field(cell, 'row_span', 1),
                'col_span': len(kept),
                'content': _cell_field(cell, 'content', '')
            }
            bbox = getattr(cell, 'bbox', None)
            if bbox:
                new_cell['bbox'] = bbox
            style = getattr(cell, 'style', None)
            if style:
                new_cell['style'] = style
        new_cells.append(new_cell)

    result = dict(table_dict)
    result['cells'] = new_cells
    result['cols'] = new_cols
    if 'columns' in result:
        result['columns'] = new_cols

    log.info(
        "Trimmed table: %s -> %s columns, %s -> %s cells",
        original_cols, new_cols, len(cells), len(new_cells)
    )
    return result
class OCRToUnifiedConverter:
"""
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -30,11 +188,19 @@ class OCRToUnifiedConverter:
- Multi-page document assembly
- Metadata preservation
- Structure relationship mapping
- Gap filling with raw OCR regions (when PP-StructureV3 misses content)
"""
def __init__(self):
"""Initialize the converter."""
def __init__(self, enable_gap_filling: bool = True):
    """
    Set up the converter's per-instance state.

    Args:
        enable_gap_filling: When truthy, a GapFillingService is created and
            kept on the instance; otherwise the service attribute is None.
    """
    # Counter starts at zero for every new converter.
    self.element_counter = 0
    # Gap filling statistics, stored under 'page_<n>' keys as pages are extracted.
    self.gap_filling_stats: Dict[str, Any] = {}
    if enable_gap_filling:
        self.gap_filling_service = GapFillingService()
    else:
        self.gap_filling_service = None
def convert(
self,
@@ -120,13 +286,21 @@ class OCRToUnifiedConverter:
Extract pages from OCR results.
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
and traditional markdown results.
and traditional markdown results. Applies gap filling when enabled.
"""
pages = []
# Extract raw OCR text regions for gap filling
raw_text_regions = ocr_results.get('text_regions', [])
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
# Check if we have enhanced results from PPStructureEnhanced
if 'enhanced_results' in ocr_results:
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
pages = self._extract_from_enhanced_results(
ocr_results['enhanced_results'],
raw_text_regions=raw_text_regions,
ocr_dimensions=ocr_dimensions
)
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
elif 'text_regions' in ocr_results:
pages = self._extract_from_traditional_ocr(ocr_results)
@@ -143,9 +317,21 @@ class OCRToUnifiedConverter:
def _extract_from_enhanced_results(
self,
enhanced_results: List[Dict[str, Any]]
enhanced_results: List[Dict[str, Any]],
raw_text_regions: Optional[List[Dict[str, Any]]] = None,
ocr_dimensions: Optional[Dict[str, Any]] = None
) -> List[Page]:
"""Extract pages from enhanced PP-StructureV3 results."""
"""
Extract pages from enhanced PP-StructureV3 results.
Applies gap filling when enabled to supplement PP-StructureV3 output
with raw OCR regions that were not detected by the layout model.
Args:
enhanced_results: PP-StructureV3 enhanced results
raw_text_regions: Raw OCR text regions for gap filling
ocr_dimensions: OCR image dimensions for coordinate alignment
"""
pages = []
for page_idx, page_result in enumerate(enhanced_results):
@@ -158,15 +344,52 @@ class OCRToUnifiedConverter:
if element:
elements.append(element)
# Get page dimensions
pp_dimensions = Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
if page_raw_regions:
supplemented, stats = self.gap_filling_service.fill_gaps(
raw_ocr_regions=page_raw_regions,
pp_structure_elements=elements,
page_number=page_idx + 1,
ocr_dimensions=ocr_dimensions,
pp_dimensions=pp_dimensions
)
# Store statistics
self.gap_filling_stats[f'page_{page_idx + 1}'] = stats
if supplemented:
logger.info(
f"Page {page_idx + 1}: Gap filling added {len(supplemented)} elements "
f"(coverage: {stats.get('coverage_ratio', 0):.2%})"
)
elements.extend(supplemented)
# Recalculate reading order for combined elements
reading_order = self.gap_filling_service.recalculate_reading_order(elements)
page_result['reading_order'] = reading_order
# Create page
page = Page(
page_number=page_idx + 1,
dimensions=Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
),
dimensions=pp_dimensions,
elements=elements,
metadata={'reading_order': page_result.get('reading_order', [])}
metadata={
'reading_order': page_result.get('reading_order', []),
'gap_filling': self.gap_filling_stats.get(f'page_{page_idx + 1}', {})
}
)
pages.append(page)
@@ -500,6 +723,9 @@ class OCRToUnifiedConverter:
) -> Optional[DocumentElement]:
"""Convert table data to DocumentElement."""
try:
# Clean up empty columns before building TableData
table_dict = trim_empty_columns(table_dict)
# Extract bbox
bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
bbox = BoundingBox(
@@ -587,14 +813,22 @@ class OCRToUnifiedConverter:
cells = []
headers = []
rows = table.find_all('tr')
num_rows = len(rows)
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
# First pass: calculate total columns by finding max column extent
# Track cells that span multiple rows: occupied[row][col] = True
occupied: Dict[int, Dict[int, bool]] = {r: {} for r in range(num_rows)}
# Parse all cells with proper rowspan/colspan handling
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
# Skip columns that are occupied by rowspan from previous rows
while occupied[row_idx].get(col_idx, False):
col_idx += 1
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
@@ -611,26 +845,66 @@ class OCRToUnifiedConverter:
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Mark cells as occupied for rowspan/colspan
for r in range(row_idx, min(row_idx + rowspan, num_rows)):
for c in range(col_idx, col_idx + colspan):
if r not in occupied:
occupied[r] = {}
occupied[r][c] = True
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
# Calculate actual column count from occupied cells
num_cols = 0
for r in range(num_rows):
if occupied[r]:
max_col_in_row = max(occupied[r].keys()) + 1
num_cols = max(num_cols, max_col_in_row)
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
# Build table dict for cleanup
table_dict = {
'rows': num_rows,
'cols': num_cols,
'cells': [
{
'row': c.row,
'col': c.col,
'row_span': c.row_span,
'col_span': c.col_span,
'content': c.content
}
for c in cells
],
'headers': headers if headers else None,
'caption': extracted_text if extracted_text else None
}
# Clean up empty columns
table_dict = trim_empty_columns(table_dict)
# Convert cleaned cells back to TableCell objects
cleaned_cells = [
TableCell(
row=c['row'],
col=c['col'],
row_span=c.get('row_span', 1),
col_span=c.get('col_span', 1),
content=c.get('content', '')
)
for c in table_dict.get('cells', [])
]
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
rows=table_dict.get('rows', num_rows),
cols=table_dict.get('cols', num_cols),
cells=cleaned_cells,
headers=table_dict.get('headers'),
caption=table_dict.get('caption')
)
except ImportError: