This commit is contained in:
egg
2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -178,6 +178,114 @@ def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
return result
def validate_cell_boxes(
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    page_width: float,
    page_height: float,
    tolerance: float = 5.0
) -> Dict[str, Any]:
    """
    Validate cell_boxes coordinates against page boundaries.

    PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
    page boundaries. This function validates each box, reports the issues
    found and produces a clamped copy of every box.

    Args:
        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
        table_bbox: Table bounding box [x0, y0, x1, y1]; only used in the
            warning log message, it does not take part in validation
        page_width: Page width in pixels
        page_height: Page height in pixels
        tolerance: Allowed tolerance for boundary checks (pixels)

    Returns:
        Dict with (the same keys on every return path):
        - valid: bool - whether all cell_boxes are valid
        - invalid_count: int - number of invalid cell_boxes
        - clamped_boxes: List - cell_boxes clamped to [0, page_width] x
          [0, page_height], one entry per input box (malformed boxes
          become [0, 0, 0, 0])
        - issues: List[str] - description of issues found
        - needs_fallback: bool - True when more than 50% of the boxes
          are invalid
    """
    # Function-scope import so the block does not depend on the module's
    # import list.
    import math

    if not cell_boxes:
        # Keep the result shape identical to the non-empty path so callers
        # can read 'needs_fallback' unconditionally.
        return {
            'valid': True,
            'invalid_count': 0,
            'clamped_boxes': [],
            'issues': [],
            'needs_fallback': False,
        }

    issues: List[str] = []
    invalid_count = 0
    clamped_boxes: List[List[float]] = []

    # Page boundaries with tolerance
    min_x = -tolerance
    min_y = -tolerance
    max_x = page_width + tolerance
    max_y = page_height + tolerance

    for idx, box in enumerate(cell_boxes):
        if not box or len(box) < 4:
            issues.append(f"Cell {idx}: Invalid box format")
            invalid_count += 1
            clamped_boxes.append([0, 0, 0, 0])
            continue

        x0, y0, x1, y1 = box[:4]

        # Non-numeric coordinates would raise in the comparisons below and
        # NaN would silently pass every check (all NaN comparisons are
        # False), so treat both as a malformed box.
        try:
            coords_usable = all(math.isfinite(v) for v in (x0, y0, x1, y1))
        except TypeError:
            coords_usable = False
        if not coords_usable:
            issues.append(f"Cell {idx}: Invalid box format")
            invalid_count += 1
            clamped_boxes.append([0, 0, 0, 0])
            continue

        is_valid = True
        cell_issues = []

        # Check if coordinates exceed page boundaries (beyond tolerance)
        if x0 < min_x:
            cell_issues.append(f"x0={x0:.1f} < 0")
            is_valid = False
        if y0 < min_y:
            cell_issues.append(f"y0={y0:.1f} < 0")
            is_valid = False
        if x1 > max_x:
            cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
            is_valid = False
        if y1 > max_y:
            cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
            is_valid = False

        # Check for inverted coordinates
        if x0 > x1:
            cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
            is_valid = False
        if y0 > y1:
            cell_issues.append(f"y0={y0:.1f} > y1={y1:.1f}")
            is_valid = False

        if not is_valid:
            invalid_count += 1
            issues.append(f"Cell {idx}: {', '.join(cell_issues)}")

        # Clamp to the exact page rectangle (tolerance only affects the
        # valid/invalid verdict, not the clamped output).
        clamped_box = [
            max(0, min(x0, page_width)),
            max(0, min(y0, page_height)),
            max(0, min(x1, page_width)),
            max(0, min(y1, page_height))
        ]

        # Ensure proper ordering after clamping
        if clamped_box[0] > clamped_box[2]:
            clamped_box[0], clamped_box[2] = clamped_box[2], clamped_box[0]
        if clamped_box[1] > clamped_box[3]:
            clamped_box[1], clamped_box[3] = clamped_box[3], clamped_box[1]

        clamped_boxes.append(clamped_box)

    if invalid_count > 0:
        logger.warning(
            f"Cell boxes validation: {invalid_count}/{len(cell_boxes)} invalid. "
            f"Page: {page_width:.0f}x{page_height:.0f}, Table bbox: {table_bbox}"
        )

    return {
        'valid': invalid_count == 0,
        'invalid_count': invalid_count,
        'clamped_boxes': clamped_boxes,
        'issues': issues,
        'needs_fallback': invalid_count > len(cell_boxes) * 0.5  # >50% invalid = needs fallback
    }
class OCRToUnifiedConverter:
"""
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -337,19 +445,22 @@ class OCRToUnifiedConverter:
for page_idx, page_result in enumerate(enhanced_results):
elements = []
# Get page dimensions first (needed for element conversion)
page_width = page_result.get('width', 0)
page_height = page_result.get('height', 0)
pp_dimensions = Dimensions(width=page_width, height=page_height)
# Process elements from parsing_res_list
if 'elements' in page_result:
for elem_data in page_result['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
# Get page dimensions
pp_dimensions = Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
@@ -556,9 +667,19 @@ class OCRToUnifiedConverter:
def _convert_pp3_element(
self,
elem_data: Dict[str, Any],
page_idx: int
page_idx: int,
page_width: float = 0,
page_height: float = 0
) -> Optional[DocumentElement]:
"""Convert PP-StructureV3 element to DocumentElement."""
"""
Convert PP-StructureV3 element to DocumentElement.
Args:
elem_data: Element data from PP-StructureV3
page_idx: Page index (0-based)
page_width: Page width for coordinate validation
page_height: Page height for coordinate validation
"""
try:
# Extract bbox
bbox_data = elem_data.get('bbox', [0, 0, 0, 0])
@@ -597,18 +718,67 @@ class OCRToUnifiedConverter:
# Preserve cell_boxes and embedded_images in metadata for PDF generation
# These are extracted by PP-StructureV3 and provide accurate cell positioning
if 'cell_boxes' in elem_data:
elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
cell_boxes = elem_data['cell_boxes']
elem_data.setdefault('metadata', {})['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
# Validate cell_boxes coordinates if page dimensions are available
if page_width > 0 and page_height > 0:
validation = validate_cell_boxes(
cell_boxes=cell_boxes,
table_bbox=bbox_data,
page_width=page_width,
page_height=page_height
)
if not validation['valid']:
elem_data['metadata']['cell_boxes_validation'] = {
'valid': False,
'invalid_count': validation['invalid_count'],
'total_count': len(cell_boxes),
'needs_fallback': validation['needs_fallback']
}
# Use clamped boxes instead of invalid ones
elem_data['metadata']['cell_boxes'] = validation['clamped_boxes']
elem_data['metadata']['cell_boxes_original'] = cell_boxes
if validation['needs_fallback']:
logger.warning(
f"Table {elem_data.get('element_id')}: "
f"{validation['invalid_count']}/{len(cell_boxes)} cell_boxes invalid, "
f"fallback recommended"
)
else:
elem_data['metadata']['cell_boxes'] = cell_boxes
elem_data['metadata']['cell_boxes_validation'] = {'valid': True}
else:
# No page dimensions available, store as-is
elem_data['metadata']['cell_boxes'] = cell_boxes
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
# For images, use metadata dict as content
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# For all visual elements, use metadata dict as content
# Priority: saved_path > img_path (PP-StructureV3 uses saved_path)
image_path = (
elem_data.get('saved_path') or
elem_data.get('img_path') or
''
)
content = {
'path': elem_data.get('img_path', ''),
'saved_path': image_path, # Preserve original path key
'path': image_path, # For backward compatibility
'width': elem_data.get('width', 0),
'height': elem_data.get('height', 0),
'format': elem_data.get('format', 'unknown')
}
if not image_path:
logger.warning(
f"Visual element {element_type.value} missing image path: "
f"saved_path={elem_data.get('saved_path')}, img_path={elem_data.get('img_path')}"
)
else:
content = elem_data.get('content', '')
@@ -1139,10 +1309,18 @@ class OCRToUnifiedConverter:
for page_idx, page_data in enumerate(pages_data):
elements = []
# Get page dimensions first
page_width = page_data.get('width', 0)
page_height = page_data.get('height', 0)
# Process each element in the page
if 'elements' in page_data:
for elem_data in page_data['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
@@ -1150,8 +1328,8 @@ class OCRToUnifiedConverter:
page = Page(
page_number=page_idx + 1,
dimensions=Dimensions(
width=page_data.get('width', 0),
height=page_data.get('height', 0)
width=page_width,
height=page_height
),
elements=elements,
metadata={'reading_order': self._calculate_reading_order(elements)}

View File

@@ -3371,18 +3371,21 @@ class PDFGeneratorService:
"rows": 6,
"cols": 2,
"cells": [
{"row": 0, "col": 0, "content": "..."},
{"row": 0, "col": 0, "content": "...", "row_span": 1, "col_span": 2},
{"row": 0, "col": 1, "content": "..."},
...
]
}
Returns format compatible with HTMLTableParser output:
Returns format compatible with HTMLTableParser output (with colspan/rowspan/col):
[
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 0
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 1
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
...
]
Note: This returns actual cells per row with their absolute column positions.
The table renderer uses 'col' to place cells correctly in the grid.
"""
try:
num_rows = content.get('rows', 0)
@@ -3392,21 +3395,39 @@ class PDFGeneratorService:
if not cells or num_rows == 0 or num_cols == 0:
return []
# Initialize rows structure
rows_data = []
for _ in range(num_rows):
rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
# Fill in cell content
# Group cells by row
cells_by_row = {}
for cell in cells:
row_idx = cell.get('row', 0)
col_idx = cell.get('col', 0)
cell_content = cell.get('content', '')
if row_idx not in cells_by_row:
cells_by_row[row_idx] = []
cells_by_row[row_idx].append(cell)
if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
# Sort cells within each row by column
for row_idx in cells_by_row:
cells_by_row[row_idx].sort(key=lambda c: c.get('col', 0))
logger.debug(f"Built {num_rows} rows from cells dict")
# Build rows structure with colspan/rowspan info and absolute col position
rows_data = []
for row_idx in range(num_rows):
row_cells = []
if row_idx in cells_by_row:
for cell in cells_by_row[row_idx]:
cell_content = cell.get('content', '')
row_span = cell.get('row_span', 1) or 1
col_span = cell.get('col_span', 1) or 1
col_idx = cell.get('col', 0)
row_cells.append({
'text': str(cell_content) if cell_content else '',
'rowspan': row_span,
'colspan': col_span,
'col': col_idx # Absolute column position
})
rows_data.append({'cells': row_cells})
logger.debug(f"Built {num_rows} rows from cells dict with span info")
return rows_data
except Exception as e:
@@ -3471,19 +3492,115 @@ class PDFGeneratorService:
table_width = bbox.x1 - bbox.x0
table_height = bbox.y1 - bbox.y0
# Build table data for ReportLab
table_content = []
for row in rows:
row_data = [cell['text'].strip() for cell in row['cells']]
table_content.append(row_data)
# Create table
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
# Determine number of rows and columns for cell_boxes calculation
# Determine grid size from rows structure
# Note: rows may have 'col' attribute for absolute positioning (from Direct extraction)
# or may be sequential (from HTML parsing)
num_rows = len(rows)
max_cols = max(len(row['cells']) for row in rows) if rows else 0
# Check if cells have absolute column positions
has_absolute_cols = any(
'col' in cell
for row in rows
for cell in row['cells']
)
# Calculate actual number of columns
max_cols = 0
if has_absolute_cols:
# Use absolute col positions + colspan to find max column
for row in rows:
for cell in row['cells']:
col = cell.get('col', 0)
colspan = cell.get('colspan', 1)
max_cols = max(max_cols, col + colspan)
else:
# Sequential cells: sum up colspans
for row in rows:
col_pos = 0
for cell in row['cells']:
colspan = cell.get('colspan', 1)
col_pos += colspan
max_cols = max(max_cols, col_pos)
# Build table data for ReportLab with proper grid structure
# ReportLab needs a full grid with placeholders for spanned cells
# and SPAN commands to merge them
table_content = []
span_commands = []
covered = set() # Track cells covered by spans
# First pass: mark covered cells and collect SPAN commands
for row_idx, row in enumerate(rows):
if has_absolute_cols:
# Use absolute column positions
for cell in row['cells']:
col_pos = cell.get('col', 0)
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
# Mark cells covered by this span
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
# Add SPAN command for ReportLab
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
else:
# Sequential positioning
col_pos = 0
for cell in row['cells']:
while (row_idx, col_pos) in covered:
col_pos += 1
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
col_pos += colspan
# Second pass: build content grid
for row_idx in range(num_rows):
row_data = [''] * max_cols
if row_idx < len(rows):
if has_absolute_cols:
# Place cells at their absolute positions
for cell in rows[row_idx]['cells']:
col_pos = cell.get('col', 0)
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
else:
# Sequential placement
col_pos = 0
for cell in rows[row_idx]['cells']:
while col_pos < max_cols and (row_idx, col_pos) in covered:
col_pos += 1
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
colspan = cell.get('colspan', 1)
col_pos += colspan
table_content.append(row_data)
logger.debug(f"Built table grid: {num_rows} rows × {max_cols} cols, {len(span_commands)} span commands (absolute_cols={has_absolute_cols})")
# Use original column widths from extraction if available
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
@@ -3517,7 +3634,7 @@ class PDFGeneratorService:
# Apply style with minimal padding to reduce table extension
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
font_for_table = self.font_name if self.font_registered else 'Helvetica'
style = TableStyle([
style_commands = [
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
('FONTNAME', (0, 0), (-1, -1), font_for_table),
('FONTSIZE', (0, 0), (-1, -1), 8),
@@ -3529,7 +3646,13 @@ class PDFGeneratorService:
('BOTTOMPADDING', (0, 0), (-1, -1), 0),
('LEFTPADDING', (0, 0), (-1, -1), 1),
('RIGHTPADDING', (0, 0), (-1, -1), 1),
])
]
# Add span commands for merged cells
style_commands.extend(span_commands)
if span_commands:
logger.info(f"Applied {len(span_commands)} SPAN commands for merged cells")
style = TableStyle(style_commands)
t.setStyle(style)
# Use canvas scaling as fallback to fit table within bbox
@@ -4350,33 +4473,100 @@ class PDFGeneratorService:
# Replace newlines with <br/>
safe_content = safe_content.replace('\n', '<br/>')
# Calculate font size from bbox height, but keep minimum 10pt
font_size = max(box_height * 0.7, 10)
font_size = min(font_size, 24) # Cap at 24pt
# Get original font size from style info
style_info = elem.get('style', {})
original_font_size = style_info.get('font_size', 12.0)
# Create style for this element
elem_style = ParagraphStyle(
f'elem_{id(elem)}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.2,
# Detect vertical text (Y-axis labels, etc.)
# Vertical text has aspect_ratio (height/width) > 2 and multiple characters
is_vertical_text = (
box_height > box_width * 2 and
len(content.strip()) > 1
)
# Create paragraph
para = Paragraph(safe_content, elem_style)
if is_vertical_text:
# For vertical text, use original font size and rotate
font_size = min(original_font_size, box_width * 0.9)
font_size = max(font_size, 6) # Minimum 6pt
# Calculate available width and height
available_width = box_width
available_height = box_height * 2 # Allow overflow
# Save canvas state for rotation
pdf_canvas.saveState()
# Wrap the paragraph
para_width, para_height = para.wrap(available_width, available_height)
# Convert to PDF coordinates
pdf_y_center = current_page_height - (y0 + y1) / 2
x_center = (x0 + x1) / 2
# Convert to PDF coordinates (y from bottom)
pdf_y = current_page_height - y0 - para_height
# Translate to center, rotate, translate back
pdf_canvas.translate(x_center, pdf_y_center)
pdf_canvas.rotate(90)
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Set font and draw text centered
pdf_canvas.setFont(
self.font_name if self.font_registered else 'Helvetica',
font_size
)
# Draw text at origin (since we translated to center)
text_width = pdf_canvas.stringWidth(
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'),
self.font_name if self.font_registered else 'Helvetica',
font_size
)
pdf_canvas.drawString(-text_width / 2, -font_size / 3,
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'))
pdf_canvas.restoreState()
else:
# For horizontal text, dynamically fit text within bbox
# Start with original font size and reduce until text fits
MIN_FONT_SIZE = 6
MAX_FONT_SIZE = 14
if original_font_size > 0:
start_font_size = min(original_font_size, MAX_FONT_SIZE)
else:
start_font_size = min(box_height * 0.7, MAX_FONT_SIZE)
font_size = max(start_font_size, MIN_FONT_SIZE)
# Try progressively smaller font sizes until text fits
para = None
para_height = box_height + 1 # Start with height > box to enter loop
while font_size >= MIN_FONT_SIZE and para_height > box_height:
elem_style = ParagraphStyle(
f'elem_{id(elem)}_{font_size}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15, # Tighter leading
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
if para_height <= box_height:
break # Text fits!
font_size -= 0.5 # Reduce font size and try again
# Ensure minimum font size
if font_size < MIN_FONT_SIZE:
font_size = MIN_FONT_SIZE
elem_style = ParagraphStyle(
f'elem_{id(elem)}_min',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15,
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
# Convert to PDF coordinates (y from bottom)
# Clip to bbox height to prevent overflow
actual_height = min(para_height, box_height)
pdf_y = current_page_height - y0 - actual_height
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Save PDF
pdf_canvas.save()
@@ -4451,13 +4641,47 @@ class PDFGeneratorService:
pdf_y_bottom = page_height - ty1
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
# Step 2: Draw cell borders using cell_boxes
# Step 2: Get or calculate cell boxes
cell_boxes = metadata.get('cell_boxes', [])
if cell_boxes:
# Normalize cell boxes for grid alignment
if hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# If no cell_boxes, calculate from column_widths and row_heights
if not cell_boxes:
column_widths = metadata.get('column_widths', [])
row_heights = metadata.get('row_heights', [])
if column_widths and row_heights:
# Calculate cell positions from widths and heights
cell_boxes = []
rows = content.get('rows', len(row_heights)) if isinstance(content, dict) else len(row_heights)
cols = content.get('cols', len(column_widths)) if isinstance(content, dict) else len(column_widths)
# Calculate cumulative positions
x_positions = [tx0]
for w in column_widths[:cols]:
x_positions.append(x_positions[-1] + w)
y_positions = [ty0]
for h in row_heights[:rows]:
y_positions.append(y_positions[-1] + h)
# Create cell boxes for each cell (row-major order)
for row_idx in range(rows):
for col_idx in range(cols):
if col_idx < len(x_positions) - 1 and row_idx < len(y_positions) - 1:
cx0 = x_positions[col_idx]
cy0 = y_positions[row_idx]
cx1 = x_positions[col_idx + 1]
cy1 = y_positions[row_idx + 1]
cell_boxes.append([cx0, cy0, cx1, cy1])
logger.debug(f"Calculated {len(cell_boxes)} cell boxes from {cols} cols x {rows} rows")
# Normalize cell boxes for grid alignment
if cell_boxes and hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# Draw cell borders
if cell_boxes:
pdf_canvas.setLineWidth(0.5)
for box in cell_boxes:
if len(box) >= 4:

View File

@@ -558,8 +558,8 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# Special handling for images/figures/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
if 'img_path' in item and output_dir:
saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])