test
This commit is contained in:
@@ -178,6 +178,114 @@ def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
|
||||
return result
|
||||
|
||||
|
||||
def validate_cell_boxes(
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    page_width: float,
    page_height: float,
    tolerance: float = 5.0
) -> Dict[str, Any]:
    """
    Validate cell_boxes coordinates against page boundaries.

    PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
    page boundaries. This function reports such boxes and also produces a
    clamped copy of every box that lies within the page.

    A box is counted as invalid when it has fewer than 4 values, when any
    coordinate lies outside the page by more than ``tolerance``, or when its
    coordinates are inverted (x0 > x1 or y0 > y1).

    Args:
        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
        table_bbox: Table bounding box [x0, y0, x1, y1]. Only used to give
            context in the warning log message; cells are not checked
            against it.
        page_width: Page width in pixels
        page_height: Page height in pixels
        tolerance: Allowed tolerance for boundary checks (pixels). It only
            affects validity; clamping always uses the exact page bounds.

    Returns:
        Dict with:
        - valid: bool - whether all cell_boxes are valid
        - invalid_count: int - number of invalid cell_boxes
        - clamped_boxes: List - cell_boxes clamped to [0, page_width] x
          [0, page_height], one entry per input box (malformed boxes
          become [0, 0, 0, 0])
        - issues: List[str] - description of issues found
        - needs_fallback: bool - True when more than 50% of the boxes
          are invalid
    """
    if not cell_boxes:
        # Keep the same key set as the normal return path so callers can
        # read 'needs_fallback' without special-casing empty input.
        return {
            'valid': True,
            'invalid_count': 0,
            'clamped_boxes': [],
            'issues': [],
            'needs_fallback': False,
        }

    issues = []
    invalid_count = 0
    clamped_boxes = []

    # Page boundaries with tolerance
    min_x = -tolerance
    min_y = -tolerance
    max_x = page_width + tolerance
    max_y = page_height + tolerance

    for idx, box in enumerate(cell_boxes):
        if not box or len(box) < 4:
            issues.append(f"Cell {idx}: Invalid box format")
            invalid_count += 1
            # Placeholder keeps clamped_boxes index-aligned with cell_boxes
            clamped_boxes.append([0, 0, 0, 0])
            continue

        x0, y0, x1, y1 = box[:4]
        cell_issues = []

        # Check if coordinates exceed page boundaries (beyond tolerance)
        if x0 < min_x:
            cell_issues.append(f"x0={x0:.1f} < 0")
        if y0 < min_y:
            cell_issues.append(f"y0={y0:.1f} < 0")
        if x1 > max_x:
            cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
        if y1 > max_y:
            cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")

        # Check for inverted coordinates
        if x0 > x1:
            cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
        if y0 > y1:
            cell_issues.append(f"y0={y0:.1f} > y1={y1:.1f}")

        # Any recorded issue makes the box invalid
        if cell_issues:
            invalid_count += 1
            issues.append(f"Cell {idx}: {', '.join(cell_issues)}")

        # Clamp to the exact page boundaries (tolerance is not applied here)
        clamped_box = [
            max(0, min(x0, page_width)),
            max(0, min(y0, page_height)),
            max(0, min(x1, page_width)),
            max(0, min(y1, page_height))
        ]

        # Ensure proper ordering after clamping
        if clamped_box[0] > clamped_box[2]:
            clamped_box[0], clamped_box[2] = clamped_box[2], clamped_box[0]
        if clamped_box[1] > clamped_box[3]:
            clamped_box[1], clamped_box[3] = clamped_box[3], clamped_box[1]

        clamped_boxes.append(clamped_box)

    if invalid_count > 0:
        logger.warning(
            f"Cell boxes validation: {invalid_count}/{len(cell_boxes)} invalid. "
            f"Page: {page_width:.0f}x{page_height:.0f}, Table bbox: {table_bbox}"
        )

    return {
        'valid': invalid_count == 0,
        'invalid_count': invalid_count,
        'clamped_boxes': clamped_boxes,
        'issues': issues,
        'needs_fallback': invalid_count > len(cell_boxes) * 0.5  # >50% invalid = needs fallback
    }
|
||||
|
||||
|
||||
class OCRToUnifiedConverter:
|
||||
"""
|
||||
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
|
||||
@@ -337,19 +445,22 @@ class OCRToUnifiedConverter:
|
||||
for page_idx, page_result in enumerate(enhanced_results):
|
||||
elements = []
|
||||
|
||||
# Get page dimensions first (needed for element conversion)
|
||||
page_width = page_result.get('width', 0)
|
||||
page_height = page_result.get('height', 0)
|
||||
pp_dimensions = Dimensions(width=page_width, height=page_height)
|
||||
|
||||
# Process elements from parsing_res_list
|
||||
if 'elements' in page_result:
|
||||
for elem_data in page_result['elements']:
|
||||
element = self._convert_pp3_element(elem_data, page_idx)
|
||||
element = self._convert_pp3_element(
|
||||
elem_data, page_idx,
|
||||
page_width=page_width,
|
||||
page_height=page_height
|
||||
)
|
||||
if element:
|
||||
elements.append(element)
|
||||
|
||||
# Get page dimensions
|
||||
pp_dimensions = Dimensions(
|
||||
width=page_result.get('width', 0),
|
||||
height=page_result.get('height', 0)
|
||||
)
|
||||
|
||||
# Apply gap filling if enabled and raw regions available
|
||||
if self.gap_filling_service and raw_text_regions:
|
||||
# Filter raw regions for current page
|
||||
@@ -556,9 +667,19 @@ class OCRToUnifiedConverter:
|
||||
def _convert_pp3_element(
|
||||
self,
|
||||
elem_data: Dict[str, Any],
|
||||
page_idx: int
|
||||
page_idx: int,
|
||||
page_width: float = 0,
|
||||
page_height: float = 0
|
||||
) -> Optional[DocumentElement]:
|
||||
"""Convert PP-StructureV3 element to DocumentElement."""
|
||||
"""
|
||||
Convert PP-StructureV3 element to DocumentElement.
|
||||
|
||||
Args:
|
||||
elem_data: Element data from PP-StructureV3
|
||||
page_idx: Page index (0-based)
|
||||
page_width: Page width for coordinate validation
|
||||
page_height: Page height for coordinate validation
|
||||
"""
|
||||
try:
|
||||
# Extract bbox
|
||||
bbox_data = elem_data.get('bbox', [0, 0, 0, 0])
|
||||
@@ -597,18 +718,67 @@ class OCRToUnifiedConverter:
|
||||
# Preserve cell_boxes and embedded_images in metadata for PDF generation
|
||||
# These are extracted by PP-StructureV3 and provide accurate cell positioning
|
||||
if 'cell_boxes' in elem_data:
|
||||
elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
|
||||
elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
|
||||
cell_boxes = elem_data['cell_boxes']
|
||||
elem_data.setdefault('metadata', {})['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
|
||||
|
||||
# Validate cell_boxes coordinates if page dimensions are available
|
||||
if page_width > 0 and page_height > 0:
|
||||
validation = validate_cell_boxes(
|
||||
cell_boxes=cell_boxes,
|
||||
table_bbox=bbox_data,
|
||||
page_width=page_width,
|
||||
page_height=page_height
|
||||
)
|
||||
|
||||
if not validation['valid']:
|
||||
elem_data['metadata']['cell_boxes_validation'] = {
|
||||
'valid': False,
|
||||
'invalid_count': validation['invalid_count'],
|
||||
'total_count': len(cell_boxes),
|
||||
'needs_fallback': validation['needs_fallback']
|
||||
}
|
||||
# Use clamped boxes instead of invalid ones
|
||||
elem_data['metadata']['cell_boxes'] = validation['clamped_boxes']
|
||||
elem_data['metadata']['cell_boxes_original'] = cell_boxes
|
||||
|
||||
if validation['needs_fallback']:
|
||||
logger.warning(
|
||||
f"Table {elem_data.get('element_id')}: "
|
||||
f"{validation['invalid_count']}/{len(cell_boxes)} cell_boxes invalid, "
|
||||
f"fallback recommended"
|
||||
)
|
||||
else:
|
||||
elem_data['metadata']['cell_boxes'] = cell_boxes
|
||||
elem_data['metadata']['cell_boxes_validation'] = {'valid': True}
|
||||
else:
|
||||
# No page dimensions available, store as-is
|
||||
elem_data['metadata']['cell_boxes'] = cell_boxes
|
||||
|
||||
if 'embedded_images' in elem_data:
|
||||
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
|
||||
elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
|
||||
# For images, use metadata dict as content
|
||||
elif element_type in [
|
||||
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
|
||||
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
|
||||
]:
|
||||
# For all visual elements, use metadata dict as content
|
||||
# Priority: saved_path > img_path (PP-StructureV3 uses saved_path)
|
||||
image_path = (
|
||||
elem_data.get('saved_path') or
|
||||
elem_data.get('img_path') or
|
||||
''
|
||||
)
|
||||
content = {
|
||||
'path': elem_data.get('img_path', ''),
|
||||
'saved_path': image_path, # Preserve original path key
|
||||
'path': image_path, # For backward compatibility
|
||||
'width': elem_data.get('width', 0),
|
||||
'height': elem_data.get('height', 0),
|
||||
'format': elem_data.get('format', 'unknown')
|
||||
}
|
||||
if not image_path:
|
||||
logger.warning(
|
||||
f"Visual element {element_type.value} missing image path: "
|
||||
f"saved_path={elem_data.get('saved_path')}, img_path={elem_data.get('img_path')}"
|
||||
)
|
||||
else:
|
||||
content = elem_data.get('content', '')
|
||||
|
||||
@@ -1139,10 +1309,18 @@ class OCRToUnifiedConverter:
|
||||
for page_idx, page_data in enumerate(pages_data):
|
||||
elements = []
|
||||
|
||||
# Get page dimensions first
|
||||
page_width = page_data.get('width', 0)
|
||||
page_height = page_data.get('height', 0)
|
||||
|
||||
# Process each element in the page
|
||||
if 'elements' in page_data:
|
||||
for elem_data in page_data['elements']:
|
||||
element = self._convert_pp3_element(elem_data, page_idx)
|
||||
element = self._convert_pp3_element(
|
||||
elem_data, page_idx,
|
||||
page_width=page_width,
|
||||
page_height=page_height
|
||||
)
|
||||
if element:
|
||||
elements.append(element)
|
||||
|
||||
@@ -1150,8 +1328,8 @@ class OCRToUnifiedConverter:
|
||||
page = Page(
|
||||
page_number=page_idx + 1,
|
||||
dimensions=Dimensions(
|
||||
width=page_data.get('width', 0),
|
||||
height=page_data.get('height', 0)
|
||||
width=page_width,
|
||||
height=page_height
|
||||
),
|
||||
elements=elements,
|
||||
metadata={'reading_order': self._calculate_reading_order(elements)}
|
||||
|
||||
Reference in New Issue
Block a user