chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -189,7 +189,7 @@ def validate_cell_boxes(
|
||||
Validate cell_boxes coordinates against page boundaries and table bbox.
|
||||
|
||||
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
|
||||
page boundaries. This function validates and reports issues.
|
||||
page boundaries or table bbox. This function validates and clamps to valid boundaries.
|
||||
|
||||
Args:
|
||||
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
|
||||
@@ -213,10 +213,22 @@ def validate_cell_boxes(
|
||||
clamped_boxes = []
|
||||
|
||||
# Page boundaries with tolerance
|
||||
min_x = -tolerance
|
||||
min_y = -tolerance
|
||||
max_x = page_width + tolerance
|
||||
max_y = page_height + tolerance
|
||||
page_min_x = -tolerance
|
||||
page_min_y = -tolerance
|
||||
page_max_x = page_width + tolerance
|
||||
page_max_y = page_height + tolerance
|
||||
|
||||
# Table boundaries with tolerance (prefer clamping to table bbox)
|
||||
table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x
|
||||
table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y
|
||||
table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x
|
||||
table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y
|
||||
|
||||
# For clamping, use the intersection of page and expanded table bbox
|
||||
clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0
|
||||
clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0
|
||||
clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width
|
||||
clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height
|
||||
|
||||
for idx, box in enumerate(cell_boxes):
|
||||
if not box or len(box) < 4:
|
||||
@@ -230,19 +242,38 @@ def validate_cell_boxes(
|
||||
cell_issues = []
|
||||
|
||||
# Check if coordinates exceed page boundaries
|
||||
if x0 < min_x:
|
||||
if x0 < page_min_x:
|
||||
cell_issues.append(f"x0={x0:.1f} < 0")
|
||||
is_valid = False
|
||||
if y0 < min_y:
|
||||
if y0 < page_min_y:
|
||||
cell_issues.append(f"y0={y0:.1f} < 0")
|
||||
is_valid = False
|
||||
if x1 > max_x:
|
||||
if x1 > page_max_x:
|
||||
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
|
||||
is_valid = False
|
||||
if y1 > max_y:
|
||||
if y1 > page_max_y:
|
||||
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
|
||||
is_valid = False
|
||||
|
||||
# Check if coordinates significantly exceed table bbox (by more than max(tolerance, 20% of table height), applied on both axes)
|
||||
if len(table_bbox) >= 4:
|
||||
table_w = table_bbox[2] - table_bbox[0]
|
||||
table_h = table_bbox[3] - table_bbox[1]
|
||||
expand_tolerance = max(tolerance, table_h * 0.2) # 20% of table height
|
||||
|
||||
if y0 < table_bbox[1] - expand_tolerance:
|
||||
cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})")
|
||||
is_valid = False
|
||||
if y1 > table_bbox[3] + expand_tolerance:
|
||||
cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})")
|
||||
is_valid = False
|
||||
if x0 < table_bbox[0] - expand_tolerance:
|
||||
cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})")
|
||||
is_valid = False
|
||||
if x1 > table_bbox[2] + expand_tolerance:
|
||||
cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})")
|
||||
is_valid = False
|
||||
|
||||
# Check for inverted coordinates
|
||||
if x0 > x1:
|
||||
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
|
||||
@@ -255,12 +286,12 @@ def validate_cell_boxes(
|
||||
invalid_count += 1
|
||||
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
|
||||
|
||||
# Clamp to valid boundaries
|
||||
# Clamp to valid boundaries (table bbox with some tolerance)
|
||||
clamped_box = [
|
||||
max(0, min(x0, page_width)),
|
||||
max(0, min(y0, page_height)),
|
||||
max(0, min(x1, page_width)),
|
||||
max(0, min(y1, page_height))
|
||||
max(clamp_min_x, min(x0, clamp_max_x)),
|
||||
max(clamp_min_y, min(y0, clamp_max_y)),
|
||||
max(clamp_min_x, min(x1, clamp_max_x)),
|
||||
max(clamp_min_y, min(y1, clamp_max_y))
|
||||
]
|
||||
|
||||
# Ensure proper ordering after clamping
|
||||
@@ -395,10 +426,15 @@ class OCRToUnifiedConverter:
|
||||
|
||||
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
|
||||
and traditional markdown results. Applies gap filling when enabled.
|
||||
|
||||
Gap filling can use either:
|
||||
1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
|
||||
2. Separate raw OCR text_regions (fallback)
|
||||
"""
|
||||
pages = []
|
||||
|
||||
# Extract raw OCR text regions for gap filling
|
||||
# Prefer overall_ocr_res from PP-StructureV3 when available
|
||||
raw_text_regions = ocr_results.get('text_regions', [])
|
||||
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
|
||||
|
||||
@@ -461,13 +497,22 @@ class OCRToUnifiedConverter:
|
||||
if element:
|
||||
elements.append(element)
|
||||
|
||||
# Apply gap filling if enabled and raw regions available
|
||||
if self.gap_filling_service and raw_text_regions:
|
||||
# Filter raw regions for current page
|
||||
page_raw_regions = [
|
||||
r for r in raw_text_regions
|
||||
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
|
||||
]
|
||||
# Apply gap filling if enabled
|
||||
# Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR
|
||||
if self.gap_filling_service:
|
||||
# Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
|
||||
page_raw_regions = page_result.get('overall_ocr_res', [])
|
||||
|
||||
if page_raw_regions:
|
||||
logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)")
|
||||
elif raw_text_regions:
|
||||
# Fallback to separate raw OCR regions
|
||||
page_raw_regions = [
|
||||
r for r in raw_text_regions
|
||||
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
|
||||
]
|
||||
if page_raw_regions:
|
||||
logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)")
|
||||
|
||||
if page_raw_regions:
|
||||
supplemented, stats = self.gap_filling_service.fill_gaps(
|
||||
@@ -711,8 +756,33 @@ class OCRToUnifiedConverter:
|
||||
# Prepare content based on element type
|
||||
if element_type == ElementType.TABLE:
|
||||
# For tables, use TableData as content
|
||||
# Pass cell_boxes for accurate cell positioning
|
||||
table_data = self._extract_table_data(elem_data)
|
||||
# Priority: rebuilt_table > HTML parsing
|
||||
# rebuilt_table contains clean cells without empty padding
|
||||
if 'rebuilt_table' in elem_data:
|
||||
rebuilt = elem_data['rebuilt_table']
|
||||
# Use rebuilt cells directly - they don't include empty cells
|
||||
rebuilt_cells = rebuilt.get('cells', [])
|
||||
from app.models.unified_document import TableCell
|
||||
table_cells = [
|
||||
TableCell(
|
||||
row=c.get('row', 0),
|
||||
col=c.get('col', 0),
|
||||
row_span=c.get('row_span', 1),
|
||||
col_span=c.get('col_span', 1),
|
||||
content=c.get('content', '')
|
||||
)
|
||||
for c in rebuilt_cells
|
||||
]
|
||||
table_data = TableData(
|
||||
rows=rebuilt.get('rows', 0),
|
||||
cols=rebuilt.get('cols', 0),
|
||||
cells=table_cells,
|
||||
caption=elem_data.get('extracted_text')
|
||||
)
|
||||
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)")
|
||||
else:
|
||||
# Fallback to HTML parsing for non-rebuilt tables
|
||||
table_data = self._extract_table_data(elem_data)
|
||||
content = table_data if table_data else elem_data.get('content', '')
|
||||
|
||||
# Preserve cell_boxes and embedded_images in metadata for PDF generation
|
||||
@@ -756,6 +826,18 @@ class OCRToUnifiedConverter:
|
||||
|
||||
if 'embedded_images' in elem_data:
|
||||
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
|
||||
|
||||
# Pass through rebuild information for tables that were rebuilt
|
||||
# This tells the PDF renderer to use HTML content instead of cell_boxes
|
||||
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}")
|
||||
if 'rebuild_stats' in elem_data:
|
||||
elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats']
|
||||
elem_data['metadata']['was_rebuilt'] = True
|
||||
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True")
|
||||
|
||||
if 'rebuilt_table' in elem_data:
|
||||
elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table']
|
||||
|
||||
elif element_type in [
|
||||
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
|
||||
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
|
||||
|
||||
Reference in New Issue
Block a user