feat: simplify layout model selection and archive proposals
Changes: - Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector - Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla - Add LayoutModelSelector component and zh-TW translations - Fix "default" model behavior with sentinel value for PubLayNet - Add gap filling service for OCR track coverage improvement - Add PP-Structure debug utilities - Archive completed/incomplete proposals: - add-ocr-track-gap-filling (complete) - fix-ocr-track-table-rendering (incomplete) - simplify-ppstructure-model-selection (22/25 tasks) - Add new layout model tests, archive old PP-Structure param tests - Update OpenSpec ocr-processing spec with layout model requirements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,9 @@ OCR to UnifiedDocument Converter
|
||||
|
||||
Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
|
||||
all structure information and metadata.
|
||||
|
||||
Includes gap filling support to supplement PP-StructureV3 output with raw OCR
|
||||
regions when significant content loss is detected.
|
||||
"""
|
||||
|
||||
import logging
|
||||
@@ -16,10 +19,165 @@ from app.models.unified_document import (
|
||||
BoundingBox, StyleInfo, TableData, ElementType,
|
||||
ProcessingTrack, TableCell, Dimensions
|
||||
)
|
||||
from app.services.gap_filling_service import GapFillingService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Remove empty columns from a table dictionary.

    A column is considered empty if ALL cells covering that column have content
    that is empty or whitespace-only (using .strip() to determine emptiness).
    Numeric content (including 0) always counts as real content. A cell with
    col_span > 1 contributes its content to every column it spans, so a column
    under a non-empty spanning cell is never removed. A column inside the table
    width that no cell covers is treated as empty.

    This function:
    1. Identifies columns where every cell's content is empty/whitespace
    2. Removes identified empty columns
    3. Updates cols/columns value
    4. Recalculates each cell's col index
    5. Adjusts col_span when spans cross removed columns
    6. Removes cells entirely when their complete span falls within removed columns
    7. Preserves original bbox (no layout drift)

    The input dictionary and its cells are never mutated: the result is a
    shallow copy holding new cell dictionaries. When there is nothing to
    inspect (no cells, or a column count of zero) the input object itself is
    returned.

    Args:
        table_dict: Table dictionary with keys: rows, cols/columns, cells.
            Cells may be dicts or objects exposing row/col/row_span/col_span/
            content attributes. A missing, None or zero column count is
            derived from the cells.

    Returns:
        Cleaned table dictionary with empty columns removed
    """
    cells = table_dict.get('cells', [])
    if not cells:
        return table_dict

    # Declared width. None is normalised to 0 so that it takes the same
    # "derive from the cells" path as a missing/zero value instead of
    # reaching range(None) further down.
    original_cols = table_dict.get('cols', table_dict.get('columns', 0)) or 0
    if original_cols == 0:
        extents = [
            _cell_field(cell, 'col', 0) + _cell_field(cell, 'col_span', 1)
            for cell in cells
        ]
        original_cols = max(0, *extents)

    if original_cols == 0:
        return table_dict

    # Build a map: column_index -> list of cell contents.
    # A spanning cell records its content under every column it covers.
    column_contents: Dict[int, List[str]] = {i: [] for i in range(original_cols)}

    for cell in cells:
        col = _cell_field(cell, 'col', 0)
        col_span = _cell_field(cell, 'col_span', 1)
        text = _cell_text(_cell_field(cell, 'content', ''))

        for c in range(col, min(col + col_span, original_cols)):
            # Guards against negative column indices.
            if c in column_contents:
                column_contents[c].append(text)

    # Identify empty columns (all content is empty/whitespace).
    # Note: all() over an empty list is True, so a column with no cells at
    # all is considered empty.
    empty_columns = {
        col_idx
        for col_idx, contents in column_contents.items()
        if all(text == '' for text in contents)
    }

    if not empty_columns:
        # No empty columns to remove, just ensure cols is set
        result = dict(table_dict)
        if not result.get('cols', result.get('columns', 0)):
            result['cols'] = original_cols
            if 'columns' in result:
                result['columns'] = original_cols
        return result

    logger.debug(f"Removing empty columns: {sorted(empty_columns)} from table with {original_cols} cols")

    # Build column mapping: old_col -> new_col (or None if removed)
    col_mapping: Dict[int, Optional[int]] = {}
    new_cols = 0
    for old_col in range(original_cols):
        if old_col in empty_columns:
            col_mapping[old_col] = None
        else:
            col_mapping[old_col] = new_cols
            new_cols += 1

    # Process cells
    new_cells = []
    for cell in cells:
        old_col = _cell_field(cell, 'col', 0)
        old_col_span = _cell_field(cell, 'col_span', 1)

        # New indices of the columns in this cell's span that survive.
        surviving = [
            col_mapping[c]
            for c in range(old_col, min(old_col + old_col_span, original_cols))
            if col_mapping.get(c) is not None
        ]

        # If entire span falls within removed columns, skip this cell
        if not surviving:
            logger.debug(f"Removing cell at row={_cell_field(cell, 'row', 0)}, "
                         f"col={old_col} (entire span in removed columns)")
            continue

        new_start_col = surviving[0]
        new_col_span = surviving[-1] - new_start_col + 1

        if isinstance(cell, dict):
            # Copy so the caller's cell dict keeps its original col/col_span.
            new_cell = dict(cell)
            new_cell['col'] = new_start_col
            new_cell['col_span'] = new_col_span
        else:
            # Handle TableCell objects: emit a plain dict with the same fields.
            new_cell = {
                'row': cell.row,
                'col': new_start_col,
                'row_span': cell.row_span,
                'col_span': new_col_span,
                'content': cell.content
            }
            # bbox/style are carried over untouched when present.
            if hasattr(cell, 'bbox') and cell.bbox:
                new_cell['bbox'] = cell.bbox
            if hasattr(cell, 'style') and cell.style:
                new_cell['style'] = cell.style

        new_cells.append(new_cell)

    # Build result
    result = dict(table_dict)
    result['cells'] = new_cells
    result['cols'] = new_cols
    if 'columns' in result:
        result['columns'] = new_cols

    logger.info(f"Trimmed table: {original_cols} -> {new_cols} columns, "
                f"{len(cells)} -> {len(new_cells)} cells")

    return result


def _cell_field(cell: Any, name: str, default: Any) -> Any:
    """Read ``name`` from a cell that is either a dict or an attribute object."""
    if isinstance(cell, dict):
        return cell.get(name, default)
    return getattr(cell, name, default)


def _cell_text(content: Any) -> str:
    """Return the stripped text used to decide whether a cell is blank."""
    # Numbers are real content even when falsy (0, 0.0); bool is excluded
    # because it is an int subclass and keeps the generic falsy handling.
    if isinstance(content, (int, float)) and not isinstance(content, bool):
        return str(content)
    return str(content).strip() if content else ''
|
||||
|
||||
|
||||
class OCRToUnifiedConverter:
|
||||
"""
|
||||
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
|
||||
@@ -30,11 +188,19 @@ class OCRToUnifiedConverter:
|
||||
- Multi-page document assembly
|
||||
- Metadata preservation
|
||||
- Structure relationship mapping
|
||||
- Gap filling with raw OCR regions (when PP-StructureV3 misses content)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the converter."""
|
||||
def __init__(self, enable_gap_filling: bool = True):
    """
    Set up a fresh converter.

    Args:
        enable_gap_filling: When true, a GapFillingService is created and
            kept on the instance; when false, ``gap_filling_service`` is
            None.
    """
    # Counter starts at zero for every new converter.
    self.element_counter = 0

    if enable_gap_filling:
        self.gap_filling_service = GapFillingService()
    else:
        self.gap_filling_service = None

    # Starts empty; holds gap filling statistics.
    self.gap_filling_stats: Dict[str, Any] = {}
|
||||
|
||||
def convert(
|
||||
self,
|
||||
@@ -120,13 +286,21 @@ class OCRToUnifiedConverter:
|
||||
Extract pages from OCR results.
|
||||
|
||||
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
|
||||
and traditional markdown results.
|
||||
and traditional markdown results. Applies gap filling when enabled.
|
||||
"""
|
||||
pages = []
|
||||
|
||||
# Extract raw OCR text regions for gap filling
|
||||
raw_text_regions = ocr_results.get('text_regions', [])
|
||||
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
|
||||
|
||||
# Check if we have enhanced results from PPStructureEnhanced
|
||||
if 'enhanced_results' in ocr_results:
|
||||
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
|
||||
pages = self._extract_from_enhanced_results(
|
||||
ocr_results['enhanced_results'],
|
||||
raw_text_regions=raw_text_regions,
|
||||
ocr_dimensions=ocr_dimensions
|
||||
)
|
||||
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
|
||||
elif 'text_regions' in ocr_results:
|
||||
pages = self._extract_from_traditional_ocr(ocr_results)
|
||||
@@ -143,9 +317,21 @@ class OCRToUnifiedConverter:
|
||||
|
||||
def _extract_from_enhanced_results(
|
||||
self,
|
||||
enhanced_results: List[Dict[str, Any]]
|
||||
enhanced_results: List[Dict[str, Any]],
|
||||
raw_text_regions: Optional[List[Dict[str, Any]]] = None,
|
||||
ocr_dimensions: Optional[Dict[str, Any]] = None
|
||||
) -> List[Page]:
|
||||
"""Extract pages from enhanced PP-StructureV3 results."""
|
||||
"""
|
||||
Extract pages from enhanced PP-StructureV3 results.
|
||||
|
||||
Applies gap filling when enabled to supplement PP-StructureV3 output
|
||||
with raw OCR regions that were not detected by the layout model.
|
||||
|
||||
Args:
|
||||
enhanced_results: PP-StructureV3 enhanced results
|
||||
raw_text_regions: Raw OCR text regions for gap filling
|
||||
ocr_dimensions: OCR image dimensions for coordinate alignment
|
||||
"""
|
||||
pages = []
|
||||
|
||||
for page_idx, page_result in enumerate(enhanced_results):
|
||||
@@ -158,15 +344,52 @@ class OCRToUnifiedConverter:
|
||||
if element:
|
||||
elements.append(element)
|
||||
|
||||
# Get page dimensions
|
||||
pp_dimensions = Dimensions(
|
||||
width=page_result.get('width', 0),
|
||||
height=page_result.get('height', 0)
|
||||
)
|
||||
|
||||
# Apply gap filling if enabled and raw regions available
|
||||
if self.gap_filling_service and raw_text_regions:
|
||||
# Filter raw regions for current page
|
||||
page_raw_regions = [
|
||||
r for r in raw_text_regions
|
||||
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
|
||||
]
|
||||
|
||||
if page_raw_regions:
|
||||
supplemented, stats = self.gap_filling_service.fill_gaps(
|
||||
raw_ocr_regions=page_raw_regions,
|
||||
pp_structure_elements=elements,
|
||||
page_number=page_idx + 1,
|
||||
ocr_dimensions=ocr_dimensions,
|
||||
pp_dimensions=pp_dimensions
|
||||
)
|
||||
|
||||
# Store statistics
|
||||
self.gap_filling_stats[f'page_{page_idx + 1}'] = stats
|
||||
|
||||
if supplemented:
|
||||
logger.info(
|
||||
f"Page {page_idx + 1}: Gap filling added {len(supplemented)} elements "
|
||||
f"(coverage: {stats.get('coverage_ratio', 0):.2%})"
|
||||
)
|
||||
elements.extend(supplemented)
|
||||
|
||||
# Recalculate reading order for combined elements
|
||||
reading_order = self.gap_filling_service.recalculate_reading_order(elements)
|
||||
page_result['reading_order'] = reading_order
|
||||
|
||||
# Create page
|
||||
page = Page(
|
||||
page_number=page_idx + 1,
|
||||
dimensions=Dimensions(
|
||||
width=page_result.get('width', 0),
|
||||
height=page_result.get('height', 0)
|
||||
),
|
||||
dimensions=pp_dimensions,
|
||||
elements=elements,
|
||||
metadata={'reading_order': page_result.get('reading_order', [])}
|
||||
metadata={
|
||||
'reading_order': page_result.get('reading_order', []),
|
||||
'gap_filling': self.gap_filling_stats.get(f'page_{page_idx + 1}', {})
|
||||
}
|
||||
)
|
||||
|
||||
pages.append(page)
|
||||
@@ -500,6 +723,9 @@ class OCRToUnifiedConverter:
|
||||
) -> Optional[DocumentElement]:
|
||||
"""Convert table data to DocumentElement."""
|
||||
try:
|
||||
# Clean up empty columns before building TableData
|
||||
table_dict = trim_empty_columns(table_dict)
|
||||
|
||||
# Extract bbox
|
||||
bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
|
||||
bbox = BoundingBox(
|
||||
@@ -587,14 +813,22 @@ class OCRToUnifiedConverter:
|
||||
cells = []
|
||||
headers = []
|
||||
rows = table.find_all('tr')
|
||||
num_rows = len(rows)
|
||||
|
||||
# Track actual column positions accounting for rowspan/colspan
|
||||
# This is a simplified approach - complex spanning may need enhancement
|
||||
# First pass: calculate total columns by finding max column extent
|
||||
# Track cells that span multiple rows: occupied[row][col] = True
|
||||
occupied: Dict[int, Dict[int, bool]] = {r: {} for r in range(num_rows)}
|
||||
|
||||
# Parse all cells with proper rowspan/colspan handling
|
||||
for row_idx, row in enumerate(rows):
|
||||
row_cells = row.find_all(['td', 'th'])
|
||||
col_idx = 0
|
||||
|
||||
for cell in row_cells:
|
||||
# Skip columns that are occupied by rowspan from previous rows
|
||||
while occupied[row_idx].get(col_idx, False):
|
||||
col_idx += 1
|
||||
|
||||
cell_content = cell.get_text(strip=True)
|
||||
rowspan = int(cell.get('rowspan', 1))
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
@@ -611,26 +845,66 @@ class OCRToUnifiedConverter:
|
||||
if cell.name == 'th' or row_idx == 0:
|
||||
headers.append(cell_content)
|
||||
|
||||
# Mark cells as occupied for rowspan/colspan
|
||||
for r in range(row_idx, min(row_idx + rowspan, num_rows)):
|
||||
for c in range(col_idx, col_idx + colspan):
|
||||
if r not in occupied:
|
||||
occupied[r] = {}
|
||||
occupied[r][c] = True
|
||||
|
||||
# Advance column index by colspan
|
||||
col_idx += colspan
|
||||
|
||||
# Calculate actual dimensions
|
||||
num_rows = len(rows)
|
||||
num_cols = max(
|
||||
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
|
||||
for row in rows
|
||||
) if rows else 0
|
||||
# Calculate actual column count from occupied cells
|
||||
num_cols = 0
|
||||
for r in range(num_rows):
|
||||
if occupied[r]:
|
||||
max_col_in_row = max(occupied[r].keys()) + 1
|
||||
num_cols = max(num_cols, max_col_in_row)
|
||||
|
||||
logger.debug(
|
||||
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
|
||||
)
|
||||
|
||||
# Build table dict for cleanup
|
||||
table_dict = {
|
||||
'rows': num_rows,
|
||||
'cols': num_cols,
|
||||
'cells': [
|
||||
{
|
||||
'row': c.row,
|
||||
'col': c.col,
|
||||
'row_span': c.row_span,
|
||||
'col_span': c.col_span,
|
||||
'content': c.content
|
||||
}
|
||||
for c in cells
|
||||
],
|
||||
'headers': headers if headers else None,
|
||||
'caption': extracted_text if extracted_text else None
|
||||
}
|
||||
|
||||
# Clean up empty columns
|
||||
table_dict = trim_empty_columns(table_dict)
|
||||
|
||||
# Convert cleaned cells back to TableCell objects
|
||||
cleaned_cells = [
|
||||
TableCell(
|
||||
row=c['row'],
|
||||
col=c['col'],
|
||||
row_span=c.get('row_span', 1),
|
||||
col_span=c.get('col_span', 1),
|
||||
content=c.get('content', '')
|
||||
)
|
||||
for c in table_dict.get('cells', [])
|
||||
]
|
||||
|
||||
return TableData(
|
||||
rows=num_rows,
|
||||
cols=num_cols,
|
||||
cells=cells,
|
||||
headers=headers if headers else None,
|
||||
caption=extracted_text if extracted_text else None
|
||||
rows=table_dict.get('rows', num_rows),
|
||||
cols=table_dict.get('cols', num_cols),
|
||||
cells=cleaned_cells,
|
||||
headers=table_dict.get('headers'),
|
||||
caption=table_dict.get('caption')
|
||||
)
|
||||
|
||||
except ImportError:
|
||||
|
||||
Reference in New Issue
Block a user