egg
2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions

File diff suppressed because it is too large


@@ -178,6 +178,114 @@ def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
return result
def validate_cell_boxes(
cell_boxes: List[List[float]],
table_bbox: List[float],
page_width: float,
page_height: float,
tolerance: float = 5.0
) -> Dict[str, Any]:
"""
Validate cell_boxes coordinates against page boundaries and table bbox.
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
page boundaries. This function validates and reports issues.
Args:
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
table_bbox: Table bounding box [x0, y0, x1, y1]
page_width: Page width in pixels
page_height: Page height in pixels
tolerance: Allowed tolerance for boundary checks (pixels)
Returns:
Dict with:
- valid: bool - whether all cell_boxes are valid
- invalid_count: int - number of invalid cell_boxes
- clamped_boxes: List - cell_boxes clamped to valid boundaries
- issues: List[str] - descriptions of issues found
- needs_fallback: bool - True when more than 50% of cell_boxes are invalid
"""
if not cell_boxes:
return {'valid': True, 'invalid_count': 0, 'clamped_boxes': [], 'issues': []}
issues = []
invalid_count = 0
clamped_boxes = []
# Page boundaries with tolerance
min_x = -tolerance
min_y = -tolerance
max_x = page_width + tolerance
max_y = page_height + tolerance
for idx, box in enumerate(cell_boxes):
if not box or len(box) < 4:
issues.append(f"Cell {idx}: Invalid box format")
invalid_count += 1
clamped_boxes.append([0, 0, 0, 0])
continue
x0, y0, x1, y1 = box[:4]
is_valid = True
cell_issues = []
# Check if coordinates exceed page boundaries
if x0 < min_x:
cell_issues.append(f"x0={x0:.1f} < 0")
is_valid = False
if y0 < min_y:
cell_issues.append(f"y0={y0:.1f} < 0")
is_valid = False
if x1 > max_x:
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
is_valid = False
if y1 > max_y:
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
is_valid = False
# Check for inverted coordinates
if x0 > x1:
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
is_valid = False
if y0 > y1:
cell_issues.append(f"y0={y0:.1f} > y1={y1:.1f}")
is_valid = False
if not is_valid:
invalid_count += 1
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
# Clamp to valid boundaries
clamped_box = [
max(0, min(x0, page_width)),
max(0, min(y0, page_height)),
max(0, min(x1, page_width)),
max(0, min(y1, page_height))
]
# Ensure proper ordering after clamping
if clamped_box[0] > clamped_box[2]:
clamped_box[0], clamped_box[2] = clamped_box[2], clamped_box[0]
if clamped_box[1] > clamped_box[3]:
clamped_box[1], clamped_box[3] = clamped_box[3], clamped_box[1]
clamped_boxes.append(clamped_box)
if invalid_count > 0:
logger.warning(
f"Cell boxes validation: {invalid_count}/{len(cell_boxes)} invalid. "
f"Page: {page_width:.0f}x{page_height:.0f}, Table bbox: {table_bbox}"
)
return {
'valid': invalid_count == 0,
'invalid_count': invalid_count,
'clamped_boxes': clamped_boxes,
'issues': issues,
'needs_fallback': invalid_count > len(cell_boxes) * 0.5 # >50% invalid = needs fallback
}
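A minimal usage sketch of the validator (the numbers are hypothetical, assuming an 800x600 pixel page where the second box extends past the right edge):

# Hypothetical input: the second box ends at x1=860 on an 800 px wide page.
cell_boxes = [[10, 10, 200, 50], [790, 10, 860, 50]]
report = validate_cell_boxes(
    cell_boxes=cell_boxes,
    table_bbox=[10, 10, 860, 50],
    page_width=800,
    page_height=600,
)
# report['valid'] is False and report['invalid_count'] == 1;
# the offending box is clamped to [790, 10, 800, 50].
# Only 1 of 2 boxes is invalid (not more than 50%), so
# report['needs_fallback'] is False.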
class OCRToUnifiedConverter:
"""
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -337,19 +445,22 @@ class OCRToUnifiedConverter:
for page_idx, page_result in enumerate(enhanced_results):
elements = []
# Get page dimensions first (needed for element conversion)
page_width = page_result.get('width', 0)
page_height = page_result.get('height', 0)
pp_dimensions = Dimensions(width=page_width, height=page_height)
# Process elements from parsing_res_list
if 'elements' in page_result:
for elem_data in page_result['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
# Get page dimensions
pp_dimensions = Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
@@ -556,9 +667,19 @@ class OCRToUnifiedConverter:
def _convert_pp3_element(
self,
elem_data: Dict[str, Any],
page_idx: int
page_idx: int,
page_width: float = 0,
page_height: float = 0
) -> Optional[DocumentElement]:
"""Convert PP-StructureV3 element to DocumentElement."""
"""
Convert PP-StructureV3 element to DocumentElement.
Args:
elem_data: Element data from PP-StructureV3
page_idx: Page index (0-based)
page_width: Page width for coordinate validation
page_height: Page height for coordinate validation
"""
try:
# Extract bbox
bbox_data = elem_data.get('bbox', [0, 0, 0, 0])
@@ -597,18 +718,67 @@ class OCRToUnifiedConverter:
# Preserve cell_boxes and embedded_images in metadata for PDF generation
# These are extracted by PP-StructureV3 and provide accurate cell positioning
if 'cell_boxes' in elem_data:
elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
cell_boxes = elem_data['cell_boxes']
elem_data.setdefault('metadata', {})['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
# Validate cell_boxes coordinates if page dimensions are available
if page_width > 0 and page_height > 0:
validation = validate_cell_boxes(
cell_boxes=cell_boxes,
table_bbox=bbox_data,
page_width=page_width,
page_height=page_height
)
if not validation['valid']:
elem_data['metadata']['cell_boxes_validation'] = {
'valid': False,
'invalid_count': validation['invalid_count'],
'total_count': len(cell_boxes),
'needs_fallback': validation['needs_fallback']
}
# Use clamped boxes instead of invalid ones
elem_data['metadata']['cell_boxes'] = validation['clamped_boxes']
elem_data['metadata']['cell_boxes_original'] = cell_boxes
if validation['needs_fallback']:
logger.warning(
f"Table {elem_data.get('element_id')}: "
f"{validation['invalid_count']}/{len(cell_boxes)} cell_boxes invalid, "
f"fallback recommended"
)
else:
elem_data['metadata']['cell_boxes'] = cell_boxes
elem_data['metadata']['cell_boxes_validation'] = {'valid': True}
else:
# No page dimensions available, store as-is
elem_data['metadata']['cell_boxes'] = cell_boxes
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
# For images, use metadata dict as content
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# For all visual elements, use metadata dict as content
# Priority: saved_path > img_path (PP-StructureV3 uses saved_path)
image_path = (
elem_data.get('saved_path') or
elem_data.get('img_path') or
''
)
content = {
'path': elem_data.get('img_path', ''),
'saved_path': image_path, # Preserve original path key
'path': image_path, # For backward compatibility
'width': elem_data.get('width', 0),
'height': elem_data.get('height', 0),
'format': elem_data.get('format', 'unknown')
}
if not image_path:
logger.warning(
f"Visual element {element_type.value} missing image path: "
f"saved_path={elem_data.get('saved_path')}, img_path={elem_data.get('img_path')}"
)
else:
content = elem_data.get('content', '')
@@ -1139,10 +1309,18 @@ class OCRToUnifiedConverter:
for page_idx, page_data in enumerate(pages_data):
elements = []
# Get page dimensions first
page_width = page_data.get('width', 0)
page_height = page_data.get('height', 0)
# Process each element in the page
if 'elements' in page_data:
for elem_data in page_data['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
@@ -1150,8 +1328,8 @@ class OCRToUnifiedConverter:
page = Page(
page_number=page_idx + 1,
dimensions=Dimensions(
width=page_data.get('width', 0),
height=page_data.get('height', 0)
width=page_width,
height=page_height
),
elements=elements,
metadata={'reading_order': self._calculate_reading_order(elements)}


@@ -3371,18 +3371,21 @@ class PDFGeneratorService:
"rows": 6,
"cols": 2,
"cells": [
{"row": 0, "col": 0, "content": "..."},
{"row": 0, "col": 0, "content": "...", "row_span": 1, "col_span": 2},
{"row": 0, "col": 1, "content": "..."},
...
]
}
Returns format compatible with HTMLTableParser output:
Returns format compatible with HTMLTableParser output (with colspan/rowspan/col):
[
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 0
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 1
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
...
]
Note: This returns actual cells per row with their absolute column positions.
The table renderer uses 'col' to place cells correctly in the grid.
"""
try:
num_rows = content.get('rows', 0)
@@ -3392,21 +3395,39 @@ class PDFGeneratorService:
if not cells or num_rows == 0 or num_cols == 0:
return []
# Initialize rows structure
rows_data = []
for _ in range(num_rows):
rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
# Fill in cell content
# Group cells by row
cells_by_row = {}
for cell in cells:
row_idx = cell.get('row', 0)
col_idx = cell.get('col', 0)
cell_content = cell.get('content', '')
if row_idx not in cells_by_row:
cells_by_row[row_idx] = []
cells_by_row[row_idx].append(cell)
if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
# Sort cells within each row by column
for row_idx in cells_by_row:
cells_by_row[row_idx].sort(key=lambda c: c.get('col', 0))
logger.debug(f"Built {num_rows} rows from cells dict")
# Build rows structure with colspan/rowspan info and absolute col position
rows_data = []
for row_idx in range(num_rows):
row_cells = []
if row_idx in cells_by_row:
for cell in cells_by_row[row_idx]:
cell_content = cell.get('content', '')
row_span = cell.get('row_span', 1) or 1
col_span = cell.get('col_span', 1) or 1
col_idx = cell.get('col', 0)
row_cells.append({
'text': str(cell_content) if cell_content else '',
'rowspan': row_span,
'colspan': col_span,
'col': col_idx # Absolute column position
})
rows_data.append({'cells': row_cells})
logger.debug(f"Built {num_rows} rows from cells dict with span info")
return rows_data
except Exception as e:
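As a hedged illustration of the mapping this hunk implements, a hypothetical cells dict and the row structure the new logic should produce:

# Hypothetical input: a content dict with one header cell spanning two columns.
content = {
    'rows': 2,
    'cols': 2,
    'cells': [
        {'row': 0, 'col': 0, 'content': 'Header', 'row_span': 1, 'col_span': 2},
        {'row': 1, 'col': 0, 'content': 'A'},
        {'row': 1, 'col': 1, 'content': 'B'},
    ],
}
# Expected rows_data: cells grouped by row, sorted by 'col', spans defaulting to 1.
expected = [
    {'cells': [{'text': 'Header', 'rowspan': 1, 'colspan': 2, 'col': 0}]},
    {'cells': [{'text': 'A', 'rowspan': 1, 'colspan': 1, 'col': 0},
               {'text': 'B', 'rowspan': 1, 'colspan': 1, 'col': 1}]},
]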
@@ -3471,19 +3492,115 @@ class PDFGeneratorService:
table_width = bbox.x1 - bbox.x0
table_height = bbox.y1 - bbox.y0
# Build table data for ReportLab
table_content = []
for row in rows:
row_data = [cell['text'].strip() for cell in row['cells']]
table_content.append(row_data)
# Create table
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
# Determine number of rows and columns for cell_boxes calculation
# Determine grid size from rows structure
# Note: rows may have 'col' attribute for absolute positioning (from Direct extraction)
# or may be sequential (from HTML parsing)
num_rows = len(rows)
max_cols = max(len(row['cells']) for row in rows) if rows else 0
# Check if cells have absolute column positions
has_absolute_cols = any(
'col' in cell
for row in rows
for cell in row['cells']
)
# Calculate actual number of columns
max_cols = 0
if has_absolute_cols:
# Use absolute col positions + colspan to find max column
for row in rows:
for cell in row['cells']:
col = cell.get('col', 0)
colspan = cell.get('colspan', 1)
max_cols = max(max_cols, col + colspan)
else:
# Sequential cells: sum up colspans
for row in rows:
col_pos = 0
for cell in row['cells']:
colspan = cell.get('colspan', 1)
col_pos += colspan
max_cols = max(max_cols, col_pos)
# Build table data for ReportLab with proper grid structure
# ReportLab needs a full grid with placeholders for spanned cells
# and SPAN commands to merge them
table_content = []
span_commands = []
covered = set() # Track cells covered by spans
# First pass: mark covered cells and collect SPAN commands
for row_idx, row in enumerate(rows):
if has_absolute_cols:
# Use absolute column positions
for cell in row['cells']:
col_pos = cell.get('col', 0)
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
# Mark cells covered by this span
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
# Add SPAN command for ReportLab
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
else:
# Sequential positioning
col_pos = 0
for cell in row['cells']:
while (row_idx, col_pos) in covered:
col_pos += 1
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
col_pos += colspan
# Second pass: build content grid
for row_idx in range(num_rows):
row_data = [''] * max_cols
if row_idx < len(rows):
if has_absolute_cols:
# Place cells at their absolute positions
for cell in rows[row_idx]['cells']:
col_pos = cell.get('col', 0)
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
else:
# Sequential placement
col_pos = 0
for cell in rows[row_idx]['cells']:
while col_pos < max_cols and (row_idx, col_pos) in covered:
col_pos += 1
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
colspan = cell.get('colspan', 1)
col_pos += colspan
table_content.append(row_data)
logger.debug(f"Built table grid: {num_rows} rows × {max_cols} cols, {len(span_commands)} span commands (absolute_cols={has_absolute_cols})")
# Use original column widths from extraction if available
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
@@ -3517,7 +3634,7 @@ class PDFGeneratorService:
# Apply style with minimal padding to reduce table extension
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
font_for_table = self.font_name if self.font_registered else 'Helvetica'
style = TableStyle([
style_commands = [
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
('FONTNAME', (0, 0), (-1, -1), font_for_table),
('FONTSIZE', (0, 0), (-1, -1), 8),
@@ -3529,7 +3646,13 @@ class PDFGeneratorService:
('BOTTOMPADDING', (0, 0), (-1, -1), 0),
('LEFTPADDING', (0, 0), (-1, -1), 1),
('RIGHTPADDING', (0, 0), (-1, -1), 1),
])
]
# Add span commands for merged cells
style_commands.extend(span_commands)
if span_commands:
logger.info(f"Applied {len(span_commands)} SPAN commands for merged cells")
style = TableStyle(style_commands)
t.setStyle(style)
# Use canvas scaling as fallback to fit table within bbox
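For reference, a self-contained ReportLab sketch of the grid-plus-SPAN technique the code above builds (the data and output filename are illustrative, not taken from this diff):

from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle

# A 2x2 grid where the first row is a single merged cell; the covered
# position (col 1, row 0) holds an empty placeholder string.
data = [
    ['Header', ''],
    ['A', 'B'],
]
style = TableStyle([
    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
    ('SPAN', (0, 0), (1, 0)),  # merge (col 0, row 0) through (col 1, row 0)
])
SimpleDocTemplate('span_demo.pdf', pagesize=A4).build([Table(data, style=style)])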
@@ -4350,33 +4473,100 @@ class PDFGeneratorService:
# Replace newlines with <br/>
safe_content = safe_content.replace('\n', '<br/>')
# Calculate font size from bbox height, but keep minimum 10pt
font_size = max(box_height * 0.7, 10)
font_size = min(font_size, 24) # Cap at 24pt
# Get original font size from style info
style_info = elem.get('style', {})
original_font_size = style_info.get('font_size', 12.0)
# Create style for this element
elem_style = ParagraphStyle(
f'elem_{id(elem)}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.2,
# Detect vertical text (Y-axis labels, etc.)
# Vertical text has aspect_ratio (height/width) > 2 and multiple characters
is_vertical_text = (
box_height > box_width * 2 and
len(content.strip()) > 1
)
# Create paragraph
para = Paragraph(safe_content, elem_style)
if is_vertical_text:
# For vertical text, use original font size and rotate
font_size = min(original_font_size, box_width * 0.9)
font_size = max(font_size, 6) # Minimum 6pt
# Calculate available width and height
available_width = box_width
available_height = box_height * 2 # Allow overflow
# Save canvas state for rotation
pdf_canvas.saveState()
# Wrap the paragraph
para_width, para_height = para.wrap(available_width, available_height)
# Convert to PDF coordinates
pdf_y_center = current_page_height - (y0 + y1) / 2
x_center = (x0 + x1) / 2
# Convert to PDF coordinates (y from bottom)
pdf_y = current_page_height - y0 - para_height
# Translate to center, rotate, translate back
pdf_canvas.translate(x_center, pdf_y_center)
pdf_canvas.rotate(90)
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Set font and draw text centered
pdf_canvas.setFont(
self.font_name if self.font_registered else 'Helvetica',
font_size
)
# Draw text at origin (since we translated to center)
text_width = pdf_canvas.stringWidth(
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'),
self.font_name if self.font_registered else 'Helvetica',
font_size
)
pdf_canvas.drawString(-text_width / 2, -font_size / 3,
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'))
pdf_canvas.restoreState()
else:
# For horizontal text, dynamically fit text within bbox
# Start with original font size and reduce until text fits
MIN_FONT_SIZE = 6
MAX_FONT_SIZE = 14
if original_font_size > 0:
start_font_size = min(original_font_size, MAX_FONT_SIZE)
else:
start_font_size = min(box_height * 0.7, MAX_FONT_SIZE)
font_size = max(start_font_size, MIN_FONT_SIZE)
# Try progressively smaller font sizes until text fits
para = None
para_height = box_height + 1 # Start with height > box to enter loop
while font_size >= MIN_FONT_SIZE and para_height > box_height:
elem_style = ParagraphStyle(
f'elem_{id(elem)}_{font_size}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15, # Tighter leading
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
if para_height <= box_height:
break # Text fits!
font_size -= 0.5 # Reduce font size and try again
# Ensure minimum font size
if font_size < MIN_FONT_SIZE:
font_size = MIN_FONT_SIZE
elem_style = ParagraphStyle(
f'elem_{id(elem)}_min',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15,
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
# Convert to PDF coordinates (y from bottom)
# Clip to bbox height to prevent overflow
actual_height = min(para_height, box_height)
pdf_y = current_page_height - y0 - actual_height
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Save PDF
pdf_canvas.save()
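A standalone sketch of the canvas rotation used for vertical labels above, under hypothetical coordinates and the built-in Helvetica font (the real code also unescapes HTML entities before measuring the string):

from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

c = canvas.Canvas('vertical_text_demo.pdf', pagesize=A4)
text = 'Y-axis label'
font_name, font_size = 'Helvetica', 8
x_center, y_center = 100, 400  # center of the tall, narrow bbox in PDF coordinates

c.saveState()
c.translate(x_center, y_center)   # move the origin to the bbox center
c.rotate(90)                      # rotate the coordinate system 90 degrees
c.setFont(font_name, font_size)
text_width = c.stringWidth(text, font_name, font_size)
c.drawString(-text_width / 2, -font_size / 3, text)  # roughly center on the origin
c.restoreState()
c.save()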
@@ -4451,13 +4641,47 @@ class PDFGeneratorService:
pdf_y_bottom = page_height - ty1
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
# Step 2: Draw cell borders using cell_boxes
# Step 2: Get or calculate cell boxes
cell_boxes = metadata.get('cell_boxes', [])
if cell_boxes:
# Normalize cell boxes for grid alignment
if hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# If no cell_boxes, calculate from column_widths and row_heights
if not cell_boxes:
column_widths = metadata.get('column_widths', [])
row_heights = metadata.get('row_heights', [])
if column_widths and row_heights:
# Calculate cell positions from widths and heights
cell_boxes = []
rows = content.get('rows', len(row_heights)) if isinstance(content, dict) else len(row_heights)
cols = content.get('cols', len(column_widths)) if isinstance(content, dict) else len(column_widths)
# Calculate cumulative positions
x_positions = [tx0]
for w in column_widths[:cols]:
x_positions.append(x_positions[-1] + w)
y_positions = [ty0]
for h in row_heights[:rows]:
y_positions.append(y_positions[-1] + h)
# Create cell boxes for each cell (row-major order)
for row_idx in range(rows):
for col_idx in range(cols):
if col_idx < len(x_positions) - 1 and row_idx < len(y_positions) - 1:
cx0 = x_positions[col_idx]
cy0 = y_positions[row_idx]
cx1 = x_positions[col_idx + 1]
cy1 = y_positions[row_idx + 1]
cell_boxes.append([cx0, cy0, cx1, cy1])
logger.debug(f"Calculated {len(cell_boxes)} cell boxes from {cols} cols x {rows} rows")
# Normalize cell boxes for grid alignment
if cell_boxes and hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# Draw cell borders
if cell_boxes:
pdf_canvas.setLineWidth(0.5)
for box in cell_boxes:
if len(box) >= 4:
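A small worked illustration of the cumulative-position fallback above (the widths, heights, and origin are hypothetical):

# Hypothetical 2x2 table anchored at (tx0, ty0) = (10, 20).
tx0, ty0 = 10, 20
column_widths = [50, 100]
row_heights = [30, 30]

x_positions = [tx0]                # becomes [10, 60, 160]
for w in column_widths:
    x_positions.append(x_positions[-1] + w)
y_positions = [ty0]                # becomes [20, 50, 80]
for h in row_heights:
    y_positions.append(y_positions[-1] + h)

# Row-major cell boxes, as in the fallback:
# [[10, 20, 60, 50], [60, 20, 160, 50], [10, 50, 60, 80], [60, 50, 160, 80]]
cell_boxes = [
    [x_positions[c], y_positions[r], x_positions[c + 1], y_positions[r + 1]]
    for r in range(len(row_heights))
    for c in range(len(column_widths))
]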


@@ -558,8 +558,8 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# Special handling for images/figures/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
# Special handling for images/figures/charts/diagrams/stamps/logos (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
if 'img_path' in item and output_dir:
saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])


@@ -0,0 +1,43 @@
"""Debug PyMuPDF table.cells structure"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
doc = fitz.open(str(pdf_path))
page = doc[0]
tables = page.find_tables()
for idx, table in enumerate(tables.tables):
data = table.extract()
num_rows = len(data)
num_cols = max(len(row) for row in data) if data else 0
print(f"Table {idx}:")
print(f" table.extract() dimensions: {num_rows} rows x {num_cols} cols")
print(f" Expected positions: {num_rows * num_cols}")
cell_rects = getattr(table, 'cells', None)
if cell_rects:
print(f" table.cells length: {len(cell_rects)}")
none_count = sum(1 for c in cell_rects if c is None)
actual_count = sum(1 for c in cell_rects if c is not None)
print(f" None cells: {none_count}")
print(f" Actual cells: {actual_count}")
# Check if cell_rects matches grid size
if len(cell_rects) != num_rows * num_cols:
print(f" WARNING: cell_rects length ({len(cell_rects)}) != grid size ({num_rows * num_cols})")
# Show first few cells
print(f" First 5 cells: {cell_rects[:5]}")
else:
print(f" table.cells: NOT AVAILABLE")
# Check row_count and col_count
print(f" table.row_count: {getattr(table, 'row_count', 'N/A')}")
print(f" table.col_count: {getattr(table, 'col_count', 'N/A')}")
doc.close()


@@ -0,0 +1,48 @@
"""Debug PyMuPDF table structure - find merge info"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
doc = fitz.open(str(pdf_path))
page = doc[0]
tables = page.find_tables()
for idx, table in enumerate(tables.tables):
print(f"\nTable {idx}:")
# Check all available attributes
print(f" Available attributes: {[a for a in dir(table) if not a.startswith('_')]}")
# Try to get header info
if hasattr(table, 'header'):
print(f" header: {table.header}")
# Check for cells info
cell_rects = table.cells
print(f" cells count: {len(cell_rects)}")
# Get the extracted data
data = table.extract()
print(f" extract() shape: {len(data)} x {max(len(r) for r in data)}")
# Check if there's a way to map cells to grid positions
# Look at the pandas output which might have merge info
try:
df = table.to_pandas()
print(f" pandas shape: {df.shape}")
except Exception as e:
print(f" pandas error: {e}")
# Check the TableRow objects if available
if hasattr(table, 'rows'):
rows = table.rows
print(f" rows: {len(rows)}")
for ri, row in enumerate(rows[:3]): # first 3 rows
print(f" row {ri}: {len(row.cells)} cells")
for ci, cell in enumerate(row.cells[:5]): # first 5 cells
print(f" cell {ci}: bbox={cell}")
doc.close()


@@ -0,0 +1,111 @@
"""
Generate test PDF to verify Phase 1 fixes
"""
import sys
import os
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.pdf_generator_service import PDFGeneratorService
from app.services.unified_document_exporter import UnifiedDocumentExporter
def generate_test_pdf(input_pdf: str, output_dir: Path):
"""Generate test PDF using Direct Track extraction"""
input_path = Path(input_pdf)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Processing: {input_path.name}")
print(f"Output dir: {output_dir}")
# Step 1: Extract with Direct Track
engine = DirectExtractionEngine(
enable_table_detection=True,
enable_image_extraction=True,
min_image_area=200.0, # Filter tiny images
enable_whiteout_detection=True,
enable_content_sanitization=True
)
unified_doc = engine.extract(input_path, output_dir=output_dir)
# Print extraction stats
print(f"\n=== Extraction Results ===")
print(f"Document ID: {unified_doc.document_id}")
print(f"Pages: {len(unified_doc.pages)}")
table_count = 0
image_count = 0
merged_cells = 0
total_cells = 0
for page in unified_doc.pages:
for elem in page.elements:
if elem.type.value == 'table':
table_count += 1
if elem.content and hasattr(elem.content, 'cells'):
total_cells += len(elem.content.cells)
for cell in elem.content.cells:
if cell.row_span > 1 or cell.col_span > 1:
merged_cells += 1
elif elem.type.value == 'image':
image_count += 1
print(f"Tables: {table_count}")
print(f" - Total cells: {total_cells}")
print(f" - Merged cells: {merged_cells}")
print(f"Images: {image_count}")
# Step 2: Export to JSON
exporter = UnifiedDocumentExporter()
json_path = output_dir / f"{input_path.stem}_result.json"
exporter.export_to_json(unified_doc, json_path)
print(f"\nJSON saved: {json_path}")
# Step 3: Generate layout PDF
pdf_generator = PDFGeneratorService()
pdf_path = output_dir / f"{input_path.stem}_layout.pdf"
try:
pdf_generator.generate_from_unified_document(
unified_doc=unified_doc,
output_path=pdf_path,
source_file_path=input_path
)
print(f"PDF saved: {pdf_path}")
return pdf_path
except Exception as e:
print(f"PDF generation error: {e}")
import traceback
traceback.print_exc()
return None
if __name__ == "__main__":
# Test with edit3.pdf (has complex tables with merging)
demo_docs = Path(__file__).parent.parent.parent / "demo_docs"
output_base = Path(__file__).parent.parent / "storage" / "test_phase1"
# Process edit3.pdf
edit3_pdf = demo_docs / "edit3.pdf"
if edit3_pdf.exists():
output_dir = output_base / "edit3"
result = generate_test_pdf(str(edit3_pdf), output_dir)
if result:
print(f"\n✓ Test PDF generated: {result}")
# Also process edit.pdf for comparison
edit_pdf = demo_docs / "edit.pdf"
if edit_pdf.exists():
output_dir = output_base / "edit"
result = generate_test_pdf(str(edit_pdf), output_dir)
if result:
print(f"\n✓ Test PDF generated: {result}")
print(f"\n=== Output Location ===")
print(f"{output_base}")


@@ -0,0 +1,285 @@
"""
Phase 1 Bug Fixes Verification Tests
Tests for:
1.1 Direct Track table cell merging
1.2 OCR Track image path preservation
1.3 Cell boxes coordinate validation
1.4 Tiny decoration image filtering
1.5 Covering image removal
"""
import sys
import os
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.ocr_to_unified_converter import validate_cell_boxes
from app.models.unified_document import TableCell
def test_1_1_table_cell_merging():
"""Test 1.1.5: Verify edit3.pdf returns correct merged cells"""
print("\n" + "="*60)
print("TEST 1.1: Direct Track Table Cell Merging")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return False
doc = fitz.open(str(pdf_path))
total_cells = 0
merged_cells = 0
for page_num, page in enumerate(doc):
tables = page.find_tables()
for table_idx, table in enumerate(tables.tables):
data = table.extract()
cell_rects = getattr(table, 'cells', None)
if cell_rects:
num_rows = len(data)
num_cols = max(len(row) for row in data) if data else 0
# Count actual cells (non-None)
actual_cells = sum(1 for c in cell_rects if c is not None)
none_cells = sum(1 for c in cell_rects if c is None)
print(f" Page {page_num}, Table {table_idx}:")
print(f" Grid size: {num_rows} x {num_cols} = {num_rows * num_cols} positions")
print(f" Actual cells: {actual_cells}")
print(f" Merged positions (None): {none_cells}")
total_cells += actual_cells
if none_cells > 0:
merged_cells += 1
doc.close()
print(f"\n Total actual cells across all tables: {total_cells}")
print(f" Tables with merging: {merged_cells}")
# According to PLAN.md, edit3.pdf should have 83 cells (not 204)
# The presence of None values indicates merging is detected
if total_cells > 0 and total_cells < 204:
print(" RESULT: PASS - Cell merging detected correctly")
return True
elif total_cells == 204:
print(" RESULT: FAIL - All cells treated as 1x1 (no merging detected)")
return False
else:
print(f" RESULT: INCONCLUSIVE - {total_cells} cells found")
return None
def test_1_3_cell_boxes_validation():
"""Test 1.3: Verify cell_boxes coordinate validation"""
print("\n" + "="*60)
print("TEST 1.3: Cell Boxes Coordinate Validation")
print("="*60)
# Test case 1: Valid coordinates
valid_boxes = [
[10, 10, 100, 50],
[100, 10, 200, 50],
[10, 50, 200, 100]
]
result = validate_cell_boxes(valid_boxes, [0, 0, 300, 200], 300, 200)
print(f" Valid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
assert result['valid'], "Valid boxes should pass validation"
# Test case 2: Out of bounds coordinates
invalid_boxes = [
[-10, 10, 100, 50], # x0 < 0
[10, 10, 400, 50], # x1 > page_width
[10, 10, 100, 300] # y1 > page_height
]
result = validate_cell_boxes(invalid_boxes, [0, 0, 300, 200], 300, 200)
print(f" Invalid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
assert not result['valid'], "Invalid boxes should fail validation"
assert result['invalid_count'] == 3, "Should detect 3 invalid boxes"
# Test case 3: Clamping
assert len(result['clamped_boxes']) == 3, "Should return clamped boxes"
clamped = result['clamped_boxes'][0]
assert clamped[0] >= 0, "Clamped x0 should be >= 0"
print(" RESULT: PASS - Coordinate validation works correctly")
return True
def test_1_4_tiny_image_filtering():
"""Test 1.4: Verify tiny decoration image filtering"""
print("\n" + "="*60)
print("TEST 1.4: Tiny Decoration Image Filtering")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
doc = fitz.open(str(pdf_path))
tiny_count = 0
normal_count = 0
min_area = 200 # Same threshold as in DirectExtractionEngine
for page_num, page in enumerate(doc):
images = page.get_images()
for img in images:
xref = img[0]
rects = page.get_image_rects(xref)
if rects:
rect = rects[0]
area = (rect.x1 - rect.x0) * (rect.y1 - rect.y0)
if area < min_area:
tiny_count += 1
print(f" Page {page_num}: Tiny image xref={xref}, area={area:.1f} px²")
else:
normal_count += 1
doc.close()
print(f"\n Tiny images (< {min_area} px²): {tiny_count}")
print(f" Normal images: {normal_count}")
if tiny_count > 0:
print(" RESULT: PASS - Tiny images detected, will be filtered")
return True
else:
print(" RESULT: INFO - No tiny images found in test file")
return None
def test_1_5_covering_image_detection():
"""Test 1.5: Verify covering image detection"""
print("\n" + "="*60)
print("TEST 1.5: Covering Image Detection")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
engine = DirectExtractionEngine(
enable_whiteout_detection=True,
whiteout_iou_threshold=0.8
)
doc = fitz.open(str(pdf_path))
total_covering = 0
for page_num, page in enumerate(doc):
result = engine._preprocess_page(page, page_num, doc)
covering_images = result.get('covering_images', [])
if covering_images:
print(f" Page {page_num}: {len(covering_images)} covering images detected")
for img in covering_images[:3]: # Show first 3
print(f" - xref={img.get('xref')}, type={img.get('color_type')}, "
f"bbox={[round(x, 1) for x in img.get('bbox', [])]}")
total_covering += len(covering_images)
doc.close()
print(f"\n Total covering images detected: {total_covering}")
if total_covering > 0:
print(" RESULT: PASS - Covering images detected, will be filtered")
return True
else:
print(" RESULT: INFO - No covering images found in test file")
return None
def test_direct_extraction_full():
"""Full integration test for Direct Track extraction"""
print("\n" + "="*60)
print("INTEGRATION TEST: Direct Track Full Extraction")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
engine = DirectExtractionEngine(
enable_table_detection=True,
enable_image_extraction=True,
min_image_area=200.0,
enable_whiteout_detection=True
)
try:
result = engine.extract(pdf_path) # Pass Path object, not string
# Count elements
table_count = 0
image_count = 0
merged_table_count = 0
for page in result.pages:
for elem in page.elements:
if elem.type.value == 'table':
table_count += 1
if elem.content and hasattr(elem.content, 'cells'):
# Check for merged cells
for cell in elem.content.cells:
if cell.row_span > 1 or cell.col_span > 1:
merged_table_count += 1
break
elif elem.type.value == 'image':
image_count += 1
print(f" Document ID: {result.document_id}")
print(f" Pages: {len(result.pages)}")
print(f" Tables: {table_count} (with merging: {merged_table_count})")
print(f" Images: {image_count}")
print(" RESULT: PASS - Extraction completed successfully")
return True
except Exception as e:
print(f" RESULT: FAIL - {e}")
import traceback
traceback.print_exc()
return False
if __name__ == "__main__":
print("="*60)
print("Phase 1 Bug Fixes Verification Tests")
print("="*60)
results = {}
# Run tests
results['1.1_table_merging'] = test_1_1_table_cell_merging()
results['1.3_coord_validation'] = test_1_3_cell_boxes_validation()
results['1.4_tiny_filtering'] = test_1_4_tiny_image_filtering()
results['1.5_covering_detection'] = test_1_5_covering_image_detection()
results['integration'] = test_direct_extraction_full()
# Summary
print("\n" + "="*60)
print("TEST SUMMARY")
print("="*60)
for test_name, result in results.items():
status = "PASS" if result is True else "FAIL" if result is False else "SKIP/INFO"
print(f" {test_name}: {status}")
passed = sum(1 for r in results.values() if r is True)
failed = sum(1 for r in results.values() if r is False)
skipped = sum(1 for r in results.values() if r is None)
print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped/info")