chore: backup before code cleanup

Backup commit before executing the remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -189,7 +189,7 @@ def validate_cell_boxes(
Validate cell_boxes coordinates against page boundaries and table bbox.
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
page boundaries. This function validates and reports issues.
page boundaries or table bbox. This function validates and clamps to valid boundaries.
Args:
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
@@ -213,10 +213,22 @@ def validate_cell_boxes(
clamped_boxes = []
# Page boundaries with tolerance
min_x = -tolerance
min_y = -tolerance
max_x = page_width + tolerance
max_y = page_height + tolerance
page_min_x = -tolerance
page_min_y = -tolerance
page_max_x = page_width + tolerance
page_max_y = page_height + tolerance
# Table boundaries with tolerance (prefer clamping to table bbox)
table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x
table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y
table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x
table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y
# For clamping, use the intersection of page and expanded table bbox
clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0
clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0
clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width
clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height
for idx, box in enumerate(cell_boxes):
if not box or len(box) < 4:
@@ -230,19 +242,38 @@ def validate_cell_boxes(
cell_issues = []
# Check if coordinates exceed page boundaries
if x0 < min_x:
if x0 < page_min_x:
cell_issues.append(f"x0={x0:.1f} < 0")
is_valid = False
if y0 < min_y:
if y0 < page_min_y:
cell_issues.append(f"y0={y0:.1f} < 0")
is_valid = False
if x1 > max_x:
if x1 > page_max_x:
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
is_valid = False
if y1 > max_y:
if y1 > page_max_y:
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
is_valid = False
# Check if coordinates significantly exceed table bbox (more than 20% of table size)
if len(table_bbox) >= 4:
table_w = table_bbox[2] - table_bbox[0]
table_h = table_bbox[3] - table_bbox[1]
expand_tolerance = max(tolerance, table_h * 0.2) # 20% of table height
if y0 < table_bbox[1] - expand_tolerance:
cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})")
is_valid = False
if y1 > table_bbox[3] + expand_tolerance:
cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})")
is_valid = False
if x0 < table_bbox[0] - expand_tolerance:
cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})")
is_valid = False
if x1 > table_bbox[2] + expand_tolerance:
cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})")
is_valid = False
# Check for inverted coordinates
if x0 > x1:
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
@@ -255,12 +286,12 @@ def validate_cell_boxes(
invalid_count += 1
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
# Clamp to valid boundaries
# Clamp to valid boundaries (table bbox with some tolerance)
clamped_box = [
max(0, min(x0, page_width)),
max(0, min(y0, page_height)),
max(0, min(x1, page_width)),
max(0, min(y1, page_height))
max(clamp_min_x, min(x0, clamp_max_x)),
max(clamp_min_y, min(y0, clamp_max_y)),
max(clamp_min_x, min(x1, clamp_max_x)),
max(clamp_min_y, min(y1, clamp_max_y))
]
# Ensure proper ordering after clamping
@@ -395,10 +426,15 @@ class OCRToUnifiedConverter:
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
and traditional markdown results. Applies gap filling when enabled.
Gap filling can use either:
1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
2. Separate raw OCR text_regions (fallback)
"""
pages = []
# Extract raw OCR text regions for gap filling
# Prefer overall_ocr_res from PP-StructureV3 when available
raw_text_regions = ocr_results.get('text_regions', [])
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
@@ -461,13 +497,22 @@ class OCRToUnifiedConverter:
if element:
elements.append(element)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
# Apply gap filling if enabled
# Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR
if self.gap_filling_service:
# Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
page_raw_regions = page_result.get('overall_ocr_res', [])
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)")
elif raw_text_regions:
# Fallback to separate raw OCR regions
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)")
if page_raw_regions:
supplemented, stats = self.gap_filling_service.fill_gaps(
@@ -711,8 +756,33 @@ class OCRToUnifiedConverter:
# Prepare content based on element type
if element_type == ElementType.TABLE:
# For tables, use TableData as content
# Pass cell_boxes for accurate cell positioning
table_data = self._extract_table_data(elem_data)
# Priority: rebuilt_table > HTML parsing
# rebuilt_table contains clean cells without empty padding
if 'rebuilt_table' in elem_data:
rebuilt = elem_data['rebuilt_table']
# Use rebuilt cells directly - they don't include empty cells
rebuilt_cells = rebuilt.get('cells', [])
from app.models.unified_document import TableCell
table_cells = [
TableCell(
row=c.get('row', 0),
col=c.get('col', 0),
row_span=c.get('row_span', 1),
col_span=c.get('col_span', 1),
content=c.get('content', '')
)
for c in rebuilt_cells
]
table_data = TableData(
rows=rebuilt.get('rows', 0),
cols=rebuilt.get('cols', 0),
cells=table_cells,
caption=elem_data.get('extracted_text')
)
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)")
else:
# Fallback to HTML parsing for non-rebuilt tables
table_data = self._extract_table_data(elem_data)
content = table_data if table_data else elem_data.get('content', '')
# Preserve cell_boxes and embedded_images in metadata for PDF generation
@@ -756,6 +826,18 @@ class OCRToUnifiedConverter:
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
# Pass through rebuild information for tables that were rebuilt
# This tells the PDF renderer to use HTML content instead of cell_boxes
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}")
if 'rebuild_stats' in elem_data:
elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats']
elem_data['metadata']['was_rebuilt'] = True
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True")
if 'rebuilt_table' in elem_data:
elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table']
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP