chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -0,0 +1,583 @@
"""
Cell Validation Engine
Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.
Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re
logger = logging.getLogger(__name__)
@dataclass
class CellValidationConfig:
    """Configuration for cell validation thresholds.

    Defaults follow the empirical ranges described in the module docstring;
    tables falling outside these bounds are treated as over-detected.
    """
    # Tables with more cells per 10,000 px² than this are rejected.
    max_cell_density: float = 3.0  # cells per 10,000 px²
    # Tables whose average per-cell area falls below this are rejected.
    min_avg_cell_area: float = 3000.0  # px² per cell
    # Tables whose height divided by cell count falls below this are rejected.
    min_cell_height: float = 10.0  # px per cell row
    # Master switch; when False, every table passes validation unchanged.
    enabled: bool = True
@dataclass
class TableValidationResult:
    """Result of table validation.

    Returned by CellValidationEngine.validate_table for every table element.
    """
    # True when the table passed all metric and content checks.
    is_valid: bool
    # The original (unmodified) table element that was validated.
    table_element: Dict[str, Any]
    # Human-readable explanation; always set on failure, sometimes on pass.
    reason: Optional[str] = None
    # Metrics from calculate_table_metrics; None when metrics were not computed.
    metrics: Optional[Dict[str, float]] = None
class CellValidationEngine:
    """
    Validates table elements from PP-StructureV3 output.

    Over-detected tables are identified by abnormal metrics (cell density,
    average cell area, average cell height) plus content heuristics, and are
    reclassified as TEXT elements while preserving content.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        # Fall back to the default thresholds when no config is supplied.
        self.config = config or CellValidationConfig()
def calculate_table_metrics(
self,
bbox: List[float],
cell_boxes: List[List[float]]
) -> Dict[str, float]:
"""
Calculate validation metrics for a table.
Args:
bbox: Table bounding box [x0, y0, x1, y1]
cell_boxes: List of cell bounding boxes
Returns:
Dictionary with calculated metrics
"""
if len(bbox) < 4:
return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
cell_count = len(cell_boxes)
if cell_count == 0:
return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
# Calculate table dimensions
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
# Cell density: cells per 10,000 px²
cell_density = (cell_count / table_area) * 10000
# Average cell area
avg_cell_area = table_area / cell_count
# Average cell height (table height / cell count)
avg_cell_height = table_height / cell_count
return {
"cell_count": cell_count,
"table_width": table_width,
"table_height": table_height,
"table_area": table_area,
"cell_density": cell_density,
"avg_cell_area": avg_cell_area,
"avg_cell_height": avg_cell_height
}
def validate_table(
self,
element: Dict[str, Any]
) -> TableValidationResult:
"""
Validate a single table element.
Args:
element: Table element from PP-StructureV3 output
Returns:
TableValidationResult with validation status and metrics
"""
if not self.config.enabled:
return TableValidationResult(is_valid=True, table_element=element)
# Extract bbox and cell_boxes
bbox = element.get("bbox", [])
cell_boxes = element.get("cell_boxes", [])
# Tables without cells pass validation (structure-only tables)
if not cell_boxes:
return TableValidationResult(
is_valid=True,
table_element=element,
reason="No cells to validate"
)
# Calculate metrics
metrics = self.calculate_table_metrics(bbox, cell_boxes)
# Check cell density
if metrics["cell_density"] > self.config.max_cell_density:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}",
metrics=metrics
)
# Check average cell area
if metrics["avg_cell_area"] < self.config.min_avg_cell_area:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²",
metrics=metrics
)
# Check cell height
if metrics["avg_cell_height"] < self.config.min_cell_height:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px",
metrics=metrics
)
# Content-based validation: check if content looks like prose vs tabular data
content_check = self._validate_table_content(element)
if not content_check["is_tabular"]:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=content_check["reason"],
metrics=metrics
)
return TableValidationResult(
is_valid=True,
table_element=element,
metrics=metrics
)
def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate table content to detect false positive tables.
Checks:
1. Sparsity: text coverage ratio (text area / table area)
2. Header: does table have proper header structure
3. Key-Value: for 2-col tables, is it a key-value list or random layout
4. Prose: are cells containing long prose text
Returns:
Dict with is_tabular (bool) and reason (str)
"""
html_content = element.get("content", "")
bbox = element.get("bbox", [])
cell_boxes = element.get("cell_boxes", [])
if not html_content or '<table' not in html_content.lower():
return {"is_tabular": True, "reason": "no_html_content"}
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
if not table:
return {"is_tabular": True, "reason": "no_table_element"}
rows = table.find_all('tr')
if not rows:
return {"is_tabular": True, "reason": "no_rows"}
# Extract cell contents with row structure
row_data = []
all_cells = []
for row_idx, row in enumerate(rows):
cells = row.find_all(['td', 'th'])
row_cells = []
for cell in cells:
text = cell.get_text(strip=True)
colspan = int(cell.get('colspan', 1))
is_header = cell.name == 'th'
cell_info = {
"text": text,
"length": len(text),
"colspan": colspan,
"is_header": is_header,
"row": row_idx
}
row_cells.append(cell_info)
all_cells.append(cell_info)
row_data.append(row_cells)
if not all_cells:
return {"is_tabular": True, "reason": "no_cells"}
num_rows = len(row_data)
num_cols = max(len(r) for r in row_data) if row_data else 0
# === Check 1: Sparsity (text coverage) ===
sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
if not sparsity_result["is_valid"]:
return {"is_tabular": False, "reason": sparsity_result["reason"]}
# === Check 2: Header structure ===
header_result = self._check_header_structure(row_data, num_cols)
if not header_result["has_header"] and num_rows > 3:
# Large table without header is suspicious
logger.debug(f"Table has no header structure with {num_rows} rows")
# === Check 3: Key-Value pattern for 2-column tables ===
if num_cols == 2:
kv_result = self._check_key_value_pattern(row_data)
if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
# High confidence key-value list - keep as table but log
logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
# Random 2-column layout, not a real table
return {
"is_tabular": False,
"reason": f"random_two_column_layout (not key-value)"
}
# === Check 4: Prose content ===
long_cells = [c for c in all_cells if c["length"] > 80]
prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
if prose_ratio > 0.3:
return {
"is_tabular": False,
"reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
}
# === Check 5: Section header as table ===
if num_rows <= 2 and num_cols <= 2:
first_row = row_data[0] if row_data else []
if len(first_row) == 1:
text = first_row[0]["text"]
if text.isupper() and len(text) < 50:
return {
"is_tabular": False,
"reason": f"section_header_only ({text[:30]})"
}
return {"is_tabular": True, "reason": "content_valid"}
except Exception as e:
logger.warning(f"Content validation failed: {e}")
return {"is_tabular": True, "reason": f"validation_error: {e}"}
def _check_sparsity(
self,
bbox: List[float],
cell_boxes: List[List[float]],
all_cells: List[Dict]
) -> Dict[str, Any]:
"""
Check text coverage ratio (sparsity).
Two-column layouts have large empty gaps in the middle.
Real tables have more uniform cell distribution.
"""
if len(bbox) < 4:
return {"is_valid": True, "reason": "no_bbox"}
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"is_valid": True, "reason": "invalid_area"}
# Calculate text area from cell_boxes
if cell_boxes:
text_area = 0
for cb in cell_boxes:
if len(cb) >= 4:
w = abs(cb[2] - cb[0])
h = abs(cb[3] - cb[1])
text_area += w * h
coverage = text_area / table_area
else:
# Estimate from cell content length
total_chars = sum(c["length"] for c in all_cells)
# Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
estimated_text_area = total_chars * 96
coverage = min(estimated_text_area / table_area, 1.0)
# Very sparse table (< 15% coverage) is suspicious
if coverage < 0.15:
return {
"is_valid": False,
"reason": f"sparse_content (coverage={coverage:.1%})"
}
return {"is_valid": True, "coverage": coverage}
def _check_header_structure(
self,
row_data: List[List[Dict]],
num_cols: int
) -> Dict[str, Any]:
"""
Check if table has proper header structure.
Real tables usually have:
- First row with <th> elements
- Or first row with different content pattern (labels vs values)
"""
if not row_data:
return {"has_header": False}
first_row = row_data[0]
# Check for <th> elements
th_count = sum(1 for c in first_row if c.get("is_header", False))
if th_count > 0 and th_count >= len(first_row) * 0.5:
return {"has_header": True, "type": "th_elements"}
# Check for header-like content (short, distinct from body)
if len(row_data) > 1:
first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
body_rows = row_data[1:]
body_cells = [c for row in body_rows for c in row]
body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0
# Header row should be shorter (labels) than body (data)
if first_row_avg_len < body_avg_len * 0.7:
return {"has_header": True, "type": "short_labels"}
return {"has_header": False}
def _check_key_value_pattern(
self,
row_data: List[List[Dict]]
) -> Dict[str, Any]:
"""
For 2-column tables, check if it's a key-value list.
Key-value characteristics:
- Left column: short labels (< 30 chars)
- Right column: values (can be longer)
- Consistent pattern across rows
Random layout characteristics:
- Both columns have similar length distribution
- No clear label-value relationship
"""
if not row_data:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
left_lengths = []
right_lengths = []
kv_rows = 0
total_rows = 0
for row in row_data:
if len(row) != 2:
continue
total_rows += 1
left = row[0]
right = row[1]
left_lengths.append(left["length"])
right_lengths.append(right["length"])
# Key-value pattern: left is short label, right is value
if left["length"] < 40 and left["length"] < right["length"] * 2:
kv_rows += 1
if total_rows == 0:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
kv_ratio = kv_rows / total_rows
avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0
# High KV ratio and left column is shorter = key-value list
if kv_ratio > 0.6 and avg_left < avg_right:
return {
"is_kv_list": True,
"is_random_layout": False,
"confidence": kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
# Similar lengths on both sides = random layout
if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
# Both columns have similar content length
return {
"is_kv_list": False,
"is_random_layout": True,
"confidence": 1 - kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
return {
"is_kv_list": False,
"is_random_layout": False,
"confidence": 0,
"avg_left": avg_left,
"avg_right": avg_right
}
def extract_text_from_table_html(self, html_content: str) -> str:
"""
Extract plain text from table HTML content.
Args:
html_content: HTML string containing table structure
Returns:
Plain text extracted from table cells
"""
if not html_content:
return ""
try:
class TableTextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.text_parts = []
self.in_cell = False
def handle_starttag(self, tag, attrs):
if tag in ('td', 'th'):
self.in_cell = True
def handle_endtag(self, tag):
if tag in ('td', 'th'):
self.in_cell = False
def handle_data(self, data):
if self.in_cell:
stripped = data.strip()
if stripped:
self.text_parts.append(stripped)
parser = TableTextExtractor()
parser.feed(html_content)
return ' '.join(parser.text_parts)
except Exception as e:
logger.warning(f"Failed to parse table HTML: {e}")
# Fallback: strip HTML tags with regex
text = re.sub(r'<[^>]+>', ' ', html_content)
text = re.sub(r'\s+', ' ', text).strip()
return text
def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert an over-detected table element to a TEXT element.
Args:
element: Table element to reclassify
Returns:
New TEXT element with preserved content
"""
# Extract text content from HTML
html_content = element.get("content", "")
text_content = self.extract_text_from_table_html(html_content)
# Create new TEXT element
text_element = {
"element_id": element.get("element_id", ""),
"type": "text",
"original_type": "table_reclassified", # Mark as reclassified
"content": text_content,
"page": element.get("page", 0),
"bbox": element.get("bbox", []),
"index": element.get("index", 0),
"confidence": element.get("confidence", 1.0),
"reclassified_from": "table",
"reclassification_reason": "over_detection"
}
return text_element
def validate_and_filter_elements(
self,
elements: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""
Validate all elements and filter/reclassify over-detected tables.
Args:
elements: List of elements from PP-StructureV3 output
Returns:
Tuple of (filtered_elements, statistics)
"""
filtered_elements = []
stats = {
"total_tables": 0,
"valid_tables": 0,
"reclassified_tables": 0,
"reclassification_details": []
}
for element in elements:
if element.get("type") != "table":
# Non-table elements pass through unchanged
filtered_elements.append(element)
continue
stats["total_tables"] += 1
# Validate table
result = self.validate_table(element)
if result.is_valid:
stats["valid_tables"] += 1
filtered_elements.append(element)
else:
# Reclassify as TEXT
stats["reclassified_tables"] += 1
text_element = self.reclassify_as_text(element)
filtered_elements.append(text_element)
stats["reclassification_details"].append({
"element_id": element.get("element_id"),
"reason": result.reason,
"metrics": result.metrics
})
logger.info(
f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
)
# Re-sort by reading order (y0 then x0)
filtered_elements = self._sort_by_reading_order(filtered_elements)
return filtered_elements, stats
def _sort_by_reading_order(
self,
elements: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Sort elements by reading order (top-to-bottom, left-to-right)."""
def sort_key(elem):
bbox = elem.get("bbox", [0, 0, 0, 0])
if isinstance(bbox, dict):
y0 = bbox.get("y0", 0)
x0 = bbox.get("x0", 0)
elif isinstance(bbox, list) and len(bbox) >= 2:
x0, y0 = bbox[0], bbox[1]
else:
y0, x0 = 0, 0
return (y0, x0)
return sorted(elements, key=sort_key)

View File

@@ -83,12 +83,34 @@ class TextRegion:
return ((x0 + x1) / 2, (y0 + y1) / 2)
# Element type to IoA threshold mapping
# TABLE needs strict filtering (low threshold) to prevent duplicate content
# FIGURE allows more text through (high threshold) to preserve axis labels, legends
# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors
ELEMENT_TYPE_IOA_THRESHOLDS = {
ElementType.TABLE: 'table',
ElementType.FIGURE: 'figure',
ElementType.IMAGE: 'figure',
ElementType.CHART: 'figure',
ElementType.DIAGRAM: 'figure',
}
class GapFillingService:
"""
Service for detecting and filling gaps in PP-StructureV3 output.
This service uses IoA (Intersection over Area) algorithm for coverage detection,
which correctly measures "small box contained in large box" relationship.
Key improvements over IoU:
- IoA = intersection_area / ocr_box_area (non-symmetric)
- Better for detecting if OCR text is covered by larger layout regions
- Different thresholds per element type (TEXT, TABLE, FIGURE)
- Optional boundary shrinking to reduce edge duplicates
This service:
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA
2. Identifies uncovered raw OCR regions
3. Supplements uncovered regions as TEXT elements
4. Deduplicates against existing PP-StructureV3 TEXT elements
@@ -98,9 +120,12 @@ class GapFillingService:
def __init__(
self,
coverage_threshold: float = None,
iou_threshold: float = None,
confidence_threshold: float = None,
dedup_iou_threshold: float = None,
ioa_threshold_text: float = None,
ioa_threshold_table: float = None,
ioa_threshold_figure: float = None,
dedup_ioa_threshold: float = None,
shrink_pixels: int = None,
enabled: bool = None
):
"""
@@ -108,27 +133,48 @@ class GapFillingService:
Args:
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
iou_threshold: IoU threshold for coverage detection (default: 0.15)
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6)
ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1)
ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8)
dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5)
shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1)
enabled: Whether gap filling is enabled (default: True)
"""
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
settings, 'gap_filling_coverage_threshold', 0.7
)
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
settings, 'gap_filling_iou_threshold', 0.15
)
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
settings, 'gap_filling_confidence_threshold', 0.3
)
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
settings, 'gap_filling_dedup_iou_threshold', 0.5
# IoA thresholds per element type
self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr(
settings, 'gap_filling_ioa_threshold_text', 0.6
)
self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr(
settings, 'gap_filling_ioa_threshold_table', 0.1
)
self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr(
settings, 'gap_filling_ioa_threshold_figure', 0.8
)
self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr(
settings, 'gap_filling_dedup_ioa_threshold', 0.5
)
# Boundary shrinking
self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr(
settings, 'gap_filling_shrink_pixels', 1
)
self.enabled = enabled if enabled is not None else getattr(
settings, 'gap_filling_enabled', True
)
# Legacy compatibility
self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15)
self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5)
def should_activate(
self,
raw_ocr_regions: List[TextRegion],
@@ -209,21 +255,83 @@ class GapFillingService:
logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
return uncovered
def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float:
"""
Get the IoA threshold for a specific element type.
Different element types have different thresholds:
- TABLE: 0.1 (strict, prevents duplicate table content)
- FIGURE/IMAGE: 0.8 (preserves text inside figures)
- TEXT/others: 0.6 (tolerates boundary errors)
Args:
element_type: The element type to get threshold for
Returns:
IoA threshold value
"""
threshold_type = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text')
if threshold_type == 'table':
return self.ioa_threshold_table
elif threshold_type == 'figure':
return self.ioa_threshold_figure
else:
return self.ioa_threshold_text
def _shrink_bbox(
self,
bbox: Tuple[float, float, float, float],
pixels: int
) -> Tuple[float, float, float, float]:
"""
Shrink a bounding box inward by the specified number of pixels.
This reduces false "uncovered" detection at region boundaries.
Args:
bbox: Original bbox (x0, y0, x1, y1)
pixels: Number of pixels to shrink on each side
Returns:
Shrunk bbox (x0, y0, x1, y1)
"""
x0, y0, x1, y1 = bbox
# Ensure we don't shrink to negative width/height
width = x1 - x0
height = y1 - y0
max_shrink = min(width / 2, height / 2, pixels)
return (
x0 + max_shrink,
y0 + max_shrink,
x1 - max_shrink,
y1 - max_shrink
)
def _is_region_covered(
self,
region: TextRegion,
pp_structure_elements: List[DocumentElement],
skip_table_coverage: bool = True
skip_table_coverage: bool = False
) -> bool:
"""
Check if a raw OCR region is covered by any PP-StructureV3 element.
Uses IoA (Intersection over Area) instead of IoU for better coverage detection.
IoA = intersection_area / ocr_box_area
This correctly measures "OCR box is contained in layout region".
Different element types use different IoA thresholds:
- TABLE: 0.1 (strict, any overlap means covered)
- FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels)
- TEXT/others: 0.6 (tolerate boundary errors)
Args:
region: Raw OCR text region
pp_structure_elements: List of PP-StructureV3 elements
skip_table_coverage: If True, don't consider TABLE elements as covering
(allows raw OCR text inside tables to pass through
for layered rendering)
skip_table_coverage: If True, don't consider TABLE elements as covering.
Default is False - TABLE elements DO cover regions
to prevent duplicate rendering of table cell content.
Returns:
True if the region is covered
@@ -231,10 +339,13 @@ class GapFillingService:
center_x, center_y = region.center
region_bbox = region.normalized_bbox
# Apply boundary shrinking to reduce edge duplicates
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
for element in pp_structure_elements:
# Skip TABLE elements when checking coverage
# This allows raw OCR text inside tables to be preserved
# PDF generator will render: table borders + raw text positions
# Check TABLE elements for coverage (default behavior)
# This prevents gap_fill from adding duplicate text inside table areas
if skip_table_coverage and element.type == ElementType.TABLE:
continue
@@ -247,9 +358,11 @@ class GapFillingService:
if self._point_in_bbox(center_x, center_y, elem_bbox):
return True
# Check 2: IoU exceeds threshold
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.iou_threshold:
# Check 2: IoA exceeds element-type-specific threshold
# IoA = intersection_area / ocr_box_area
ioa = self._calculate_ioa(region_bbox, elem_bbox)
threshold = self._get_ioa_threshold_for_element(element.type)
if ioa > threshold:
return True
return False
@@ -262,6 +375,9 @@ class GapFillingService:
"""
Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.
Uses IoA (Intersection over Area) for deduplication to correctly detect
when an OCR region is already covered by an existing TEXT element.
Args:
uncovered_regions: List of uncovered raw OCR regions
pp_structure_elements: List of PP-StructureV3 elements
@@ -278,6 +394,11 @@ class GapFillingService:
deduplicated = []
for region in uncovered_regions:
region_bbox = region.normalized_bbox
# Apply boundary shrinking for deduplication as well
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
is_duplicate = False
for element in text_elements:
@@ -286,10 +407,11 @@ class GapFillingService:
element.bbox.x1, element.bbox.y1
)
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.dedup_iou_threshold:
# Use IoA for deduplication
ioa = self._calculate_ioa(region_bbox, elem_bbox)
if ioa > self.dedup_ioa_threshold:
logger.debug(
f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'"
)
is_duplicate = True
break
@@ -622,6 +744,52 @@ class GapFillingService:
x0, y0, x1, y1 = bbox
return x0 <= x <= x1 and y0 <= y <= y1
@staticmethod
def _calculate_ioa(
ocr_bbox: Tuple[float, float, float, float],
layout_bbox: Tuple[float, float, float, float]
) -> float:
"""
Calculate Intersection over Area (IoA) of OCR bbox relative to layout bbox.
IoA = intersection_area / ocr_box_area
This is the recommended algorithm for detecting if an OCR text region
is contained within a larger layout region. Unlike IoU which is symmetric,
IoA correctly measures "how much of the OCR box is inside the layout region".
Example:
- OCR box: 100x20 pixels (small text line)
- Layout box: 500x800 pixels (large paragraph region)
- IoU would be very small (~0.005) even if OCR is fully inside layout
- IoA would be 1.0 if OCR is fully inside layout, which is correct
Args:
ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller
layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger
Returns:
IoA value between 0 and 1
"""
# Calculate intersection
x0 = max(ocr_bbox[0], layout_bbox[0])
y0 = max(ocr_bbox[1], layout_bbox[1])
x1 = min(ocr_bbox[2], layout_bbox[2])
y1 = min(ocr_bbox[3], layout_bbox[3])
if x1 <= x0 or y1 <= y0:
return 0.0
intersection = (x1 - x0) * (y1 - y0)
# Calculate OCR box area (denominator for IoA)
ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
if ocr_area <= 0:
return 0.0
return intersection / ocr_area
@staticmethod
def _calculate_iou(
bbox1: Tuple[float, float, float, float],
@@ -630,6 +798,9 @@ class GapFillingService:
"""
Calculate Intersection over Union (IoU) of two bboxes.
Note: This method is kept for backward compatibility.
For coverage detection, use _calculate_ioa() instead.
Args:
bbox1: First bbox (x0, y0, x1, y1)
bbox2: Second bbox (x0, y0, x1, y1)

View File

@@ -6,7 +6,7 @@ Supports both PaddleOCR (for scanned documents) and direct extraction (for edita
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from datetime import datetime
import uuid
import gc # For garbage collection
@@ -446,6 +446,47 @@ class OCRService:
except Exception as e:
logger.warning(f"Failed to clear GPU cache: {e}")
def _apply_ocr_config(self, ocr_config: 'OCRConfig'):
"""
Apply OCR configuration from preset or custom settings.
This modifies the runtime settings used by PP-Structure.
Args:
ocr_config: OCRConfig object with processing settings
"""
logger.info(f"Applying OCR config: {ocr_config.model_dump()}")
# Store the config for use in PP-Structure initialization
self._runtime_ocr_config = ocr_config
# Apply table parsing mode settings
# These will be used when initializing PP-StructureV3
settings.table_parsing_mode = ocr_config.table_parsing_mode.value if hasattr(ocr_config.table_parsing_mode, 'value') else ocr_config.table_parsing_mode
# Apply preprocessing settings
settings.use_doc_orientation_classify = ocr_config.use_doc_orientation_classify
settings.use_doc_unwarping = ocr_config.use_doc_unwarping
settings.use_textline_orientation = ocr_config.use_textline_orientation
# Apply recognition module settings
settings.enable_chart_recognition = ocr_config.enable_chart_recognition
settings.enable_formula_recognition = ocr_config.enable_formula_recognition
settings.enable_seal_recognition = ocr_config.enable_seal_recognition
settings.enable_region_detection = ocr_config.enable_region_detection
# Apply layout threshold if specified
if ocr_config.layout_threshold is not None:
settings.layout_detection_threshold = ocr_config.layout_threshold
if ocr_config.layout_nms_threshold is not None:
settings.layout_nms_threshold = ocr_config.layout_nms_threshold
# Invalidate existing structure engine to force re-initialization with new settings
if self.structure_engine is not None:
logger.info("Invalidating PP-StructureV3 engine to apply new OCR config")
self._unload_structure_engine()
logger.info(f"OCR config applied: table_parsing_mode={settings.table_parsing_mode}")
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
"""
Get or create OCR engine for specified language with GPU support
@@ -615,6 +656,39 @@ class OCRService:
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
# Apply table_parsing_mode settings
# This is the KEY configuration to prevent "cell explosion" on datasheet-type documents
table_parsing_mode = settings.table_parsing_mode
logger.info(f"Table parsing mode: {table_parsing_mode}")
if table_parsing_mode == "disabled":
# 方案A: 完全關閉 table recognition
use_table = False
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
logger.info("Table parsing DISABLED - no cell segmentation")
elif table_parsing_mode == "classification_only":
# 方案C: 只做 table classification不做 cell segmentation
use_table = False # Don't parse table structure
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
# Keep table_cls_model to identify table regions
logger.info("Table parsing CLASSIFICATION_ONLY - regions identified but no cell parsing")
elif table_parsing_mode == "conservative":
# 方案B: 保守模式 - 只禁用 wireless tables (aggressive)
# 注意:不要修改 layout_threshold它會影響所有元素偵測不只是表格
wireless_table_model = None
wireless_cell_det_model = None
logger.info(f"Table parsing CONSERVATIVE - wireless disabled (layout_threshold unchanged)")
# else: "full" mode - use all default settings (aggressive)
# Apply table detection config overrides for individual table types
if table_detection_config:
if not table_detection_config.enable_wired_table:
@@ -1343,6 +1417,7 @@ class OCRService:
if detect_layout:
# Pass current_page to analyze_layout for correct page numbering
# Also pass text_regions for table content rebuilding
layout_data, images_metadata = self.analyze_layout(
image_path,
output_dir=output_dir,
@@ -1350,7 +1425,8 @@ class OCRService:
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config,
table_detection_config=table_detection_config
table_detection_config=table_detection_config,
raw_ocr_regions=text_regions # For table content rebuilding
)
# Generate Markdown
@@ -1379,6 +1455,12 @@ class OCRService:
# If layout data is enhanced, add enhanced results for converter
if layout_data and layout_data.get('enhanced'):
# Debug: check if table elements have rebuild_stats
for elem in layout_data.get('elements', []):
if elem.get('type') == 'table':
has_rebuild = 'rebuild_stats' in elem
logger.info(f"[OCR_SERVICE] Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}, keys={list(elem.keys())[:10]}")
result['enhanced_results'] = [{
'elements': layout_data.get('elements', []),
'reading_order': layout_data.get('reading_order', []),
@@ -1509,7 +1591,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1522,6 +1605,7 @@ class OCRService:
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
Tuple of (layout_data, images_metadata)
@@ -1607,7 +1691,8 @@ class OCRService:
preprocessed_image=preprocessed_image,
scaling_info=scaling_info,
save_visualization=True, # Save layout detection visualization images
use_cv_table_detection=use_cv_table_detection
use_cv_table_detection=use_cv_table_detection,
raw_ocr_regions=raw_ocr_regions # For table content rebuilding
)
if result.get('has_parsing_res_list'):
@@ -2225,7 +2310,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
ocr_config: Optional['OCRConfig'] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -2242,11 +2328,16 @@ class OCRService:
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
ocr_config: OCR processing config from preset or custom settings
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
Dict with legacy format otherwise
"""
# Apply OCR config to settings if provided
if ocr_config:
self._apply_ocr_config(ocr_config)
# Use dual-track processing if:
# 1. use_dual_track is True (auto-detection), OR
# 2. force_track is specified (explicit track selection)

View File

@@ -189,7 +189,7 @@ def validate_cell_boxes(
Validate cell_boxes coordinates against page boundaries and table bbox.
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
page boundaries. This function validates and reports issues.
page boundaries or table bbox. This function validates and clamps to valid boundaries.
Args:
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
@@ -213,10 +213,22 @@ def validate_cell_boxes(
clamped_boxes = []
# Page boundaries with tolerance
min_x = -tolerance
min_y = -tolerance
max_x = page_width + tolerance
max_y = page_height + tolerance
page_min_x = -tolerance
page_min_y = -tolerance
page_max_x = page_width + tolerance
page_max_y = page_height + tolerance
# Table boundaries with tolerance (prefer clamping to table bbox)
table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x
table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y
table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x
table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y
# For clamping, use the intersection of page and expanded table bbox
clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0
clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0
clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width
clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height
for idx, box in enumerate(cell_boxes):
if not box or len(box) < 4:
@@ -230,19 +242,38 @@ def validate_cell_boxes(
cell_issues = []
# Check if coordinates exceed page boundaries
if x0 < min_x:
if x0 < page_min_x:
cell_issues.append(f"x0={x0:.1f} < 0")
is_valid = False
if y0 < min_y:
if y0 < page_min_y:
cell_issues.append(f"y0={y0:.1f} < 0")
is_valid = False
if x1 > max_x:
if x1 > page_max_x:
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
is_valid = False
if y1 > max_y:
if y1 > page_max_y:
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
is_valid = False
# Check if coordinates significantly exceed table bbox (more than 20% of table size)
if len(table_bbox) >= 4:
table_w = table_bbox[2] - table_bbox[0]
table_h = table_bbox[3] - table_bbox[1]
expand_tolerance = max(tolerance, table_h * 0.2) # 20% of table height
if y0 < table_bbox[1] - expand_tolerance:
cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})")
is_valid = False
if y1 > table_bbox[3] + expand_tolerance:
cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})")
is_valid = False
if x0 < table_bbox[0] - expand_tolerance:
cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})")
is_valid = False
if x1 > table_bbox[2] + expand_tolerance:
cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})")
is_valid = False
# Check for inverted coordinates
if x0 > x1:
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
@@ -255,12 +286,12 @@ def validate_cell_boxes(
invalid_count += 1
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
# Clamp to valid boundaries
# Clamp to valid boundaries (table bbox with some tolerance)
clamped_box = [
max(0, min(x0, page_width)),
max(0, min(y0, page_height)),
max(0, min(x1, page_width)),
max(0, min(y1, page_height))
max(clamp_min_x, min(x0, clamp_max_x)),
max(clamp_min_y, min(y0, clamp_max_y)),
max(clamp_min_x, min(x1, clamp_max_x)),
max(clamp_min_y, min(y1, clamp_max_y))
]
# Ensure proper ordering after clamping
@@ -395,10 +426,15 @@ class OCRToUnifiedConverter:
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
and traditional markdown results. Applies gap filling when enabled.
Gap filling can use either:
1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
2. Separate raw OCR text_regions (fallback)
"""
pages = []
# Extract raw OCR text regions for gap filling
# Prefer overall_ocr_res from PP-StructureV3 when available
raw_text_regions = ocr_results.get('text_regions', [])
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
@@ -461,13 +497,22 @@ class OCRToUnifiedConverter:
if element:
elements.append(element)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
# Apply gap filling if enabled
# Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR
if self.gap_filling_service:
# Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
page_raw_regions = page_result.get('overall_ocr_res', [])
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)")
elif raw_text_regions:
# Fallback to separate raw OCR regions
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)")
if page_raw_regions:
supplemented, stats = self.gap_filling_service.fill_gaps(
@@ -711,8 +756,33 @@ class OCRToUnifiedConverter:
# Prepare content based on element type
if element_type == ElementType.TABLE:
# For tables, use TableData as content
# Pass cell_boxes for accurate cell positioning
table_data = self._extract_table_data(elem_data)
# Priority: rebuilt_table > HTML parsing
# rebuilt_table contains clean cells without empty padding
if 'rebuilt_table' in elem_data:
rebuilt = elem_data['rebuilt_table']
# Use rebuilt cells directly - they don't include empty cells
rebuilt_cells = rebuilt.get('cells', [])
from app.models.unified_document import TableCell
table_cells = [
TableCell(
row=c.get('row', 0),
col=c.get('col', 0),
row_span=c.get('row_span', 1),
col_span=c.get('col_span', 1),
content=c.get('content', '')
)
for c in rebuilt_cells
]
table_data = TableData(
rows=rebuilt.get('rows', 0),
cols=rebuilt.get('cols', 0),
cells=table_cells,
caption=elem_data.get('extracted_text')
)
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)")
else:
# Fallback to HTML parsing for non-rebuilt tables
table_data = self._extract_table_data(elem_data)
content = table_data if table_data else elem_data.get('content', '')
# Preserve cell_boxes and embedded_images in metadata for PDF generation
@@ -756,6 +826,18 @@ class OCRToUnifiedConverter:
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
# Pass through rebuild information for tables that were rebuilt
# This tells the PDF renderer to use HTML content instead of cell_boxes
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}")
if 'rebuild_stats' in elem_data:
elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats']
elem_data['metadata']['was_rebuilt'] = True
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True")
if 'rebuilt_table' in elem_data:
elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table']
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP

View File

@@ -26,6 +26,23 @@ from html.parser import HTMLParser
from app.core.config import settings
# Import table column corrector for column alignment fix
try:
from app.services.table_column_corrector import TableColumnCorrector
TABLE_COLUMN_CORRECTOR_AVAILABLE = True
except ImportError:
TABLE_COLUMN_CORRECTOR_AVAILABLE = False
TableColumnCorrector = None
# Import text region renderer for simple text positioning
try:
from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions
TEXT_REGION_RENDERER_AVAILABLE = True
except ImportError:
TEXT_REGION_RENDERER_AVAILABLE = False
TextRegionRenderer = None
load_raw_ocr_regions = None
# Import UnifiedDocument for dual-track support
try:
from app.models.unified_document import (
@@ -596,7 +613,8 @@ class PDFGeneratorService:
'content': html_content,
'bbox': [element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1],
'page': page_num - 1 # layout uses 0-based
'page': page_num - 1, # layout uses 0-based
'element_id': element.element_id # For _use_border_only matching
}
# Preserve cell_boxes and embedded_images from metadata
@@ -607,18 +625,29 @@ class PDFGeneratorService:
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
if 'embedded_images' in element.metadata:
table_element['embedded_images'] = element.metadata['embedded_images']
# Pass through rebuild flag - rebuilt tables should use HTML content
if element.metadata.get('was_rebuilt'):
table_element['was_rebuilt'] = True
logger.debug(f"Table {element.element_id}: marked as rebuilt")
layout_elements.append(table_element)
# Add bbox to images_metadata for text overlap filtering
# (no actual image file, just bbox for filtering)
images_metadata.append({
img_metadata = {
'image_path': None, # No fake table image
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based for images_metadata
'type': 'table',
'element_id': element.element_id
})
}
# Also copy cell_boxes for quality checking
if element.metadata and 'cell_boxes' in element.metadata:
img_metadata['cell_boxes'] = element.metadata['cell_boxes']
# Mark if table was rebuilt
if element.metadata and element.metadata.get('was_rebuilt'):
img_metadata['was_rebuilt'] = True
images_metadata.append(img_metadata)
# Handle image/visual elements (including stamps/seals)
elif element.is_visual or element.type in [
@@ -1022,15 +1051,25 @@ class PDFGeneratorService:
# Set current track
self.current_processing_track = 'ocr'
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Check if simple text positioning mode is enabled
if (settings.simple_text_positioning_enabled and
TEXT_REGION_RENDERER_AVAILABLE):
logger.info("Using simple text positioning mode")
result = self._generate_simple_text_pdf(
unified_doc=unified_doc,
output_path=output_path,
source_file_path=source_file_path
)
else:
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Reset track
self.current_processing_track = None
@@ -1043,6 +1082,235 @@ class PDFGeneratorService:
self.current_processing_track = None
return False
def _generate_simple_text_pdf(
self,
unified_doc: 'UnifiedDocument',
output_path: Path,
source_file_path: Optional[Path] = None
) -> bool:
"""
Generate PDF using simple text positioning from raw OCR regions.
This approach bypasses complex table structure reconstruction and renders
raw OCR text directly at detected positions with rotation correction.
Images, charts, figures, seals, and formulas are still rendered normally.
Args:
unified_doc: UnifiedDocument from OCR processing
output_path: Path to save generated PDF
source_file_path: Optional path to original source file
Returns:
True if successful, False otherwise
"""
try:
logger.info("=== Simple Text Positioning PDF Generation ===")
# Initialize text region renderer
text_renderer = TextRegionRenderer(
font_name=self.font_name,
debug=settings.simple_text_positioning_debug
)
# Get result directory from output_path
result_dir = output_path.parent
# Try to determine task_id from result directory or output filename
# Output path is typically: result_dir/task_id_edited.pdf
task_id = None
if output_path.stem.endswith('_edited'):
task_id = output_path.stem.replace('_edited', '')
elif result_dir.name:
# result_dir is typically the task_id directory
task_id = result_dir.name
if not task_id:
logger.warning("Could not determine task_id, falling back to legacy method")
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
return self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
logger.info(f"Task ID: {task_id}, Result dir: {result_dir}")
# Get total pages from UnifiedDocument
total_pages = len(unified_doc.pages) if unified_doc.pages else 1
# Get page dimensions from first page (for canvas initialization)
if not unified_doc.pages:
logger.error("No pages in document")
return False
first_page = unified_doc.pages[0]
if hasattr(first_page, 'dimensions') and first_page.dimensions:
page_width = float(first_page.dimensions.width)
page_height = float(first_page.dimensions.height)
else:
# Fallback to default size
page_width = 612.0 # Letter width
page_height = 792.0 # Letter height
logger.warning(f"No page dimensions found, using default {page_width}x{page_height}")
logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}")
# Create PDF canvas
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
# Collect image-type elements from UnifiedDocument for rendering
# Types that should be rendered as images: figure, image, chart, seal, formula
image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'}
# Process each page
for page_num in range(1, total_pages + 1):
logger.info(f">>> Processing page {page_num}/{total_pages}")
# Get page dimensions for current page
if page_num <= len(unified_doc.pages):
current_page = unified_doc.pages[page_num - 1]
if hasattr(current_page, 'dimensions') and current_page.dimensions:
current_width = float(current_page.dimensions.width)
current_height = float(current_page.dimensions.height)
else:
current_width = page_width
current_height = page_height
else:
current_width = page_width
current_height = page_height
if page_num > 1:
pdf_canvas.showPage()
# Set page size
pdf_canvas.setPageSize((current_width, current_height))
# === Layer 1: Render images, charts, figures, seals, formulas ===
# Also collect exclusion zones for text avoidance
exclusion_zones = [] # List of (x0, y0, x1, y1) tuples
if page_num <= len(unified_doc.pages):
current_page = unified_doc.pages[page_num - 1]
page_elements = current_page.elements if hasattr(current_page, 'elements') else []
image_elements_rendered = 0
for elem in page_elements:
elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '')
# Handle enum type
if hasattr(elem_type, 'value'):
elem_type = elem_type.value
if elem_type in image_element_types:
# Get image path from element content
content = elem.content if hasattr(elem, 'content') else elem.get('content', {})
if isinstance(content, dict):
saved_path = content.get('saved_path') or content.get('path')
else:
saved_path = None
# Get bbox for exclusion zone (even if image file not found)
bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {})
if hasattr(bbox, 'x0'):
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
elif isinstance(bbox, dict):
x0 = bbox.get('x0', 0)
y0 = bbox.get('y0', 0)
x1 = bbox.get('x1', x0 + bbox.get('width', 0))
y1 = bbox.get('y1', y0 + bbox.get('height', 0))
else:
continue
# Add to exclusion zones for text avoidance
# Use original image coordinates (not PDF flipped)
exclusion_zones.append((x0, y0, x1, y1))
if saved_path:
# Try to find the image file
image_path = result_dir / saved_path
if not image_path.exists():
# Try in imgs subdirectory
image_path = result_dir / 'imgs' / saved_path
if not image_path.exists():
# Try just the filename
image_path = result_dir / Path(saved_path).name
if image_path.exists():
try:
# Convert coordinates (flip Y for PDF)
pdf_x = x0
pdf_y = current_height - y1 # Bottom of image in PDF coords
img_width = x1 - x0
img_height = y1 - y0
# Draw image
pdf_canvas.drawImage(
str(image_path),
pdf_x, pdf_y,
width=img_width,
height=img_height,
preserveAspectRatio=True,
mask='auto'
)
image_elements_rendered += 1
logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
except Exception as e:
logger.warning(f"Failed to render {elem_type} {saved_path}: {e}")
else:
logger.warning(f"Image file not found: {saved_path}")
if image_elements_rendered > 0:
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
if exclusion_zones:
logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
# === Layer 2: Render text from raw OCR regions ===
raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)
if not raw_regions:
logger.warning(f"No raw OCR regions found for page {page_num}")
else:
logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}")
# Collect texts inside exclusion zones for position-aware deduplication
# This prevents duplicate axis labels from being rendered near charts
zone_texts = None
if exclusion_zones:
zone_texts = text_renderer.collect_zone_texts(
raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True
)
if zone_texts:
logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...")
# Render all text regions, avoiding exclusion zones (images/charts)
# Scale factors are 1.0 since OCR dimensions match page dimensions
rendered = text_renderer.render_all_regions(
pdf_canvas=pdf_canvas,
regions=raw_regions,
page_height=current_height,
scale_x=1.0,
scale_y=1.0,
exclusion_zones=exclusion_zones,
zone_texts=zone_texts
)
logger.info(f"Rendered {rendered} text regions")
logger.info(f"<<< Page {page_num} complete")
# Save PDF
pdf_canvas.save()
file_size = output_path.stat().st_size
logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
return True
except Exception as e:
logger.error(f"Failed to generate simple text PDF: {e}")
import traceback
traceback.print_exc()
return False
def _generate_pdf_from_data(
self,
ocr_data: Dict,
@@ -1093,8 +1361,15 @@ class PDFGeneratorService:
logger.info("No page_dimensions found, using first page size for all pages")
# Step 3: Get original file dimensions for all pages
# For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues
original_page_sizes = {}
if source_file_path:
use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr')
if use_ocr_dimensions_for_pdf:
# OCR Track: Use OCR coordinate system dimensions directly
# This ensures no scaling is needed (scale = 1.0)
logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)")
elif source_file_path:
original_page_sizes = self.get_all_page_sizes(source_file_path)
if original_page_sizes:
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
@@ -1104,8 +1379,12 @@ class PDFGeneratorService:
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
# Determine initial canvas size (will be updated per page)
# Priority: original file first page > OCR/UnifiedDocument first page
if 0 in original_page_sizes:
# Priority for OCR track: OCR dimensions (no scaling)
# Priority for Direct track: original file first page > OCR/UnifiedDocument first page
if use_ocr_dimensions_for_pdf:
target_width, target_height = ocr_width, ocr_height
logger.info(f"初始 PDF 尺寸OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}")
elif 0 in original_page_sizes:
target_width, target_height = original_page_sizes[0]
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
else:
@@ -1159,14 +1438,49 @@ class PDFGeneratorService:
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# LAYERED RENDERING: Exclude tables from regions_to_avoid
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
# while table borders are drawn separately using cell_boxes
# Only avoid overlap with actual images/figures/charts
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
# Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid
# Tables with bad cell_boxes will use raw OCR text positioning instead
# Exception: Rebuilt tables always use HTML content and filter text
regions_to_avoid = []
good_quality_tables = []
bad_quality_tables = []
rebuilt_tables = []
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
for img in images_metadata:
if img.get('type') == 'table':
elem_id = img.get('element_id', 'unknown')
# Check if this table was rebuilt - rebuilt tables have good content
was_rebuilt = img.get('was_rebuilt', False)
if was_rebuilt:
# Rebuilt tables have accurate content - filter text, use HTML
regions_to_avoid.append(img)
rebuilt_tables.append(elem_id)
else:
# Check cell_boxes quality for non-rebuilt tables
cell_boxes = img.get('cell_boxes', [])
quality = self._check_cell_boxes_quality(cell_boxes, elem_id)
if quality == 'good':
# Good quality: filter text, render with cell_boxes
regions_to_avoid.append(img)
good_quality_tables.append(elem_id)
else:
# Bad quality: don't filter text, just draw border
bad_quality_tables.append(elem_id)
img['_use_border_only'] = True # Mark for border-only rendering
else:
# Non-table elements (images, figures, charts) always avoid
regions_to_avoid.append(img)
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
if rebuilt_tables:
logger.info(f" 重建表格用 HTML: {rebuilt_tables}")
if good_quality_tables:
logger.info(f" 表格用 cell_boxes: {good_quality_tables}")
if bad_quality_tables:
logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}")
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
@@ -1178,10 +1492,24 @@ class PDFGeneratorService:
pages_data[page_num] = []
pages_data[page_num].append(region)
# Get table elements from layout_data
# Get table elements from layout_data and copy _use_border_only flags
table_elements = []
if layout_data and layout_data.get('elements'):
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
# Create a lookup for _use_border_only flags from images_metadata
border_only_tables = {img.get('element_id') for img in images_metadata
if img.get('type') == 'table' and img.get('_use_border_only')}
logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}")
for e in layout_data['elements']:
if e.get('type') == 'table':
elem_id = e.get('element_id')
logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}")
# Copy the flag if this table should use border only
if elem_id in border_only_tables:
e['_use_border_only'] = True
logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}")
table_elements.append(e)
# Process each page
total_pages = ocr_data.get('total_pages', 1)
@@ -1195,14 +1523,23 @@ class PDFGeneratorService:
logger.info(f">>> 處理第 {page_num}/{total_pages}")
# Get current page dimensions with priority order:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
# For OCR Track: always use OCR dimensions (scale = 1.0)
# For Direct Track:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
page_idx = page_num - 1
dimension_source = "unknown"
# Priority 1: Original file dimensions
if page_idx in original_page_sizes:
# For OCR Track: always use OCR dimensions
if use_ocr_dimensions_for_pdf and page_idx in page_dimensions:
current_page_dims = page_dimensions[page_idx]
current_target_w = float(current_page_dims['width'])
current_target_h = float(current_page_dims['height'])
dimension_source = "ocr_track_direct"
# Priority 1: Original file dimensions (Direct Track only)
elif page_idx in original_page_sizes:
current_target_w, current_target_h = original_page_sizes[page_idx]
dimension_source = "original_file"
@@ -1774,12 +2111,26 @@ class PDFGeneratorService:
non_empty_lines = [l for l in lines if l.strip()]
num_lines = max(len(non_empty_lines), 1)
# Font size = bbox_height / num_lines * factor
# Font size calculation with stabilization
# Use 0.8 factor to leave room for line spacing
font_size = (bbox_height / num_lines) * 0.8
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
raw_font_size = (bbox_height / num_lines) * 0.8
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
# Stabilize font size for body text (most common case)
# Normal body text should be 9-11pt, only deviate for clear outliers
element_type = region.get('element_type', 'text')
if element_type in ('text', 'paragraph'):
# For body text, bias toward 10pt baseline
if 7 <= raw_font_size <= 14:
# Near-normal range: use weighted average toward 10pt
font_size = raw_font_size * 0.7 + 10 * 0.3
else:
# Clear outlier: use raw but clamp more aggressively
font_size = max(min(raw_font_size, 14), 7)
else:
# For titles/headers/etc, use raw calculation with wider range
font_size = max(min(raw_font_size, 72), 4)
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}")
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
# CRITICAL: Y-axis flip!
@@ -2008,24 +2359,45 @@ class PDFGeneratorService:
result_dir: Directory containing result files (for embedded images)
"""
try:
elem_id = table_element.get('element_id', 'unknown')
use_border_only = table_element.get('_use_border_only', False)
logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}")
html_content = table_element.get('content', '')
if not html_content:
# Even without HTML, draw border if requested
if use_border_only:
self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h)
return
# Try to use cell_boxes for direct rendering first (more accurate)
# Apply column correction if enabled
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
return # Successfully rendered with cell_boxes
if (settings.table_column_correction_enabled and
TABLE_COLUMN_CORRECTOR_AVAILABLE and
cell_boxes):
try:
corrector = TableColumnCorrector(
correction_threshold=settings.table_column_correction_threshold,
vertical_merge_enabled=settings.vertical_fragment_merge_enabled,
vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio
)
# Get table bbox for vertical fragment detection
table_bbox = table_element.get('bbox', [])
if isinstance(table_bbox, dict):
table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']]
logger.info("[TABLE] Falling back to ReportLab Table")
corrected_html, stats = corrector.correct(
html=html_content,
cell_boxes=cell_boxes,
table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None
)
if stats.get('column_corrections', 0) > 0:
logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}")
html_content = corrected_html
except Exception as e:
logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML")
# Fallback: Parse HTML to extract table structure and use ReportLab Table
# Parse HTML first to get table structure for grid validation
parser = HTMLTableParser()
parser.feed(html_content)
@@ -2040,6 +2412,83 @@ class PDFGeneratorService:
if not rows:
return
# Calculate number of rows and columns from HTML for grid validation
num_rows = len(rows)
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# Check if table was rebuilt - if so, use HTML content directly
was_rebuilt = table_element.get('was_rebuilt', False)
cell_boxes_rendered = False # Track if we rendered borders with cell_boxes
if was_rebuilt:
logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly")
elif use_border_only:
# Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders
logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders")
else:
# Check if cell_boxes can produce a valid grid before rendering borders
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
# Get table bbox for grid calculation
temp_bbox = table_element.get('bbox', [])
if isinstance(temp_bbox, dict):
raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']]
elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4:
if isinstance(temp_bbox[0], (int, float)):
raw_bbox = temp_bbox[:4]
else:
raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]]
else:
raw_bbox = None
# Pre-check: can we compute a valid grid from cell_boxes?
if raw_bbox:
test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, raw_bbox, num_rows, max_cols
)
grid_valid = test_col_widths is not None and test_row_heights is not None
if grid_valid:
logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
cell_boxes_rendered = True
logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table")
else:
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
else:
# Grid mismatch: try cellboxes-first rendering if enabled
if settings.table_rendering_prefer_cellboxes:
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
renderer = TableRenderer(TableRenderConfig())
success = renderer.render_from_cellboxes_grid(
pdf_canvas,
cell_boxes,
html_content,
tuple(raw_bbox),
page_height,
scale_w,
scale_h,
row_threshold=settings.table_cellboxes_row_threshold,
col_threshold=settings.table_cellboxes_col_threshold
)
if success:
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
return # Table fully rendered, exit early
else:
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
else:
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
else:
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
# Get bbox directly from table element
table_bbox = table_element.get('bbox')
@@ -2106,15 +2555,7 @@ class PDFGeneratorService:
pdf_y = page_height - ocr_y_bottom
# Build table data for ReportLab with proper colspan/rowspan handling
# First pass: determine the actual grid size by accounting for spans
num_rows = len(rows)
# Calculate actual number of columns by checking first row's total span
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# num_rows and max_cols already calculated above for grid validation
logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
# Create a grid to track occupied cells (for rowspan handling)
@@ -2223,16 +2664,25 @@ class PDFGeneratorService:
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
# Apply table style
style = TableStyle([
# If cell_boxes rendered borders, skip GRID style (text-only rendering)
style_commands = [
('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
('LEFTPADDING', (0, 0), (-1, -1), 2),
('RIGHTPADDING', (0, 0), (-1, -1), 2),
('TOPPADDING', (0, 0), (-1, -1), 2),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
])
]
# Only add GRID if cell_boxes didn't render borders
if not cell_boxes_rendered:
style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black))
logger.info("[TABLE] Adding GRID style (cell_boxes not used)")
else:
logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)")
style = TableStyle(style_commands)
# Add header style if first row has headers
if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
@@ -2435,6 +2885,106 @@ class PDFGeneratorService:
logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
return normalized_boxes
def _draw_table_border_only(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> None:
    """
    Draw only the outer border of a table (for tables with bad cell_boxes quality).

    Text inside the table will be rendered separately using raw OCR positions.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict (reads 'bbox' and 'element_id')
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
    """
    table_bbox = table_element.get('bbox', [])
    if not table_bbox or len(table_bbox) < 4:
        return
    element_id = table_element.get('element_id', 'unknown')
    # Handle different bbox formats: dict {'x0','y0','x1','y1'} or flat list [x0, y0, x1, y1].
    if isinstance(table_bbox, dict):
        x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']
    elif isinstance(table_bbox[0], (int, float)):
        x0, y0, x1, y1 = table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]
    else:
        # Point-list / polygon bbox formats are not supported here.
        return
    # Apply scaling from source image space to PDF space.
    pdf_x0 = x0 * scale_w
    pdf_y0 = y0 * scale_h
    pdf_x1 = x1 * scale_w
    pdf_y1 = y1 * scale_h
    # Convert to PDF coordinates (flip Y: PDF origin is bottom-left).
    pdf_bottom = page_height - pdf_y1
    width = pdf_x1 - pdf_x0
    height = pdf_y1 - pdf_y0
    # Draw outer border only (no cell grid, no fill).
    pdf_canvas.setStrokeColor(colors.black)
    pdf_canvas.setLineWidth(0.5)
    pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0)
    logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)")
def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str:
    """
    Classify cell_boxes quality to choose a table rendering strategy.

    Args:
        cell_boxes: List of cell bounding boxes
        element_id: Optional element ID for logging

    Returns:
        'good' if cell_boxes form a proper grid, 'bad' otherwise
    """
    # Quality check disabled: trust PP-Structure output unconditionally.
    if not settings.table_quality_check_enabled:
        logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
        return 'good'
    # No cell_boxes or too few to form a grid.
    if not cell_boxes or len(cell_boxes) < 2:
        logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
        return 'bad'
    # Count unordered pairs of cells that overlap on both axes.
    n = len(cell_boxes)
    overlap_count = 0
    for i in range(n):
        first = cell_boxes[i]
        if not isinstance(first, (list, tuple)) or len(first) < 4:
            continue
        for j in range(i + 1, n):
            second = cell_boxes[j]
            if not isinstance(second, (list, tuple)) or len(second) < 4:
                continue
            x_overlap = first[0] < second[2] and first[2] > second[0]
            y_overlap = first[1] < second[3] and first[3] > second[1]
            if x_overlap and y_overlap:
                overlap_count += 1
    total_pairs = n * (n - 1) // 2
    overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
    # Relaxed threshold: 20% overlap instead of 10% to allow more tables through.
    # PP-StructureV3's cell detection sometimes produces slight overlaps.
    if overlap_ratio > 0.20:
        logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
        return 'bad'
    logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
    return 'good'
def _draw_table_with_cell_boxes(
self,
pdf_canvas: canvas.Canvas,
@@ -2465,39 +3015,64 @@ class PDFGeneratorService:
"""
try:
cell_boxes = table_element.get('cell_boxes', [])
# Always draw outer table border first (fallback for incomplete cell_boxes)
table_bbox = table_element.get('bbox', [])
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
# Check cell_boxes quality - skip if they don't form a proper grid
if cell_boxes and len(cell_boxes) > 2:
# Count overlapping cell pairs
overlap_count = 0
for i, box1 in enumerate(cell_boxes):
for j, box2 in enumerate(cell_boxes):
if i >= j:
continue
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
if x_overlap and y_overlap:
overlap_count += 1
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# If more than 25% of cell pairs overlap, cell_boxes are unreliable
# Increased from 10% to 25% to allow more tables to use cell_boxes rendering
# which provides better visual fidelity than ReportLab Table fallback
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
if overlap_ratio > 0.25:
logger.warning(
f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} "
f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback"
)
return False # Return False to trigger ReportLab Table fallback
if not cell_boxes:
# Fallback: draw outer border only when no cell_boxes
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
# Still draw embedded images even without cell borders
embedded_images = table_element.get('embedded_images', [])
@@ -2511,31 +3086,47 @@ class PDFGeneratorService:
# Normalize cell boxes to create aligned grid
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)")
# Collect unique grid lines to avoid drawing duplicate/overlapping lines
h_lines = set() # Horizontal lines: (y, x_start, x_end)
v_lines = set() # Vertical lines: (x, y_start, y_end)
# Draw each cell border
for box in cell_boxes:
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
# Apply scaling
x1_scaled = x1 * scale_w
y1_scaled = y1 * scale_h
x2_scaled = x2 * scale_w
y2_scaled = y2 * scale_h
x1_s = x1 * scale_w
y1_s = y1 * scale_h
x2_s = x2 * scale_w
y2_s = y2 * scale_h
cell_width = x2_scaled - x1_scaled
cell_height = y2_scaled - y1_scaled
# Round to 1 decimal place to help with deduplication
x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1)
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = x1_scaled
pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords
# Add horizontal lines (top and bottom of cell)
h_lines.add((y1_s, x1_s, x2_s)) # Top line
h_lines.add((y2_s, x1_s, x2_s)) # Bottom line
# Draw cell border only (no fill, no text)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
# Add vertical lines (left and right of cell)
v_lines.add((x1_s, y1_s, y2_s)) # Left line
v_lines.add((x2_s, y1_s, y2_s)) # Right line
logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
# Draw unique horizontal lines
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
for y, x_start, x_end in h_lines:
pdf_y = page_height - y # Transform Y coordinate
pdf_canvas.line(x_start, pdf_y, x_end, pdf_y)
# Draw unique vertical lines
for x, y_start, y_end in v_lines:
pdf_y_start = page_height - y_start
pdf_y_end = page_height - y_end
pdf_canvas.line(x, pdf_y_start, x, pdf_y_end)
logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines")
# Draw embedded images
embedded_images = table_element.get('embedded_images', [])

View File

@@ -24,6 +24,256 @@ from reportlab.platypus import Paragraph, Table, TableStyle
logger = logging.getLogger(__name__)
# ============================================================================
# Cell Box Grid Inferrer
# ============================================================================
class CellBoxGridInferrer:
    """
    Infer table grid structure from cell_boxes coordinates.

    Rows and columns are recovered by clustering the cells' Y-coordinates
    (rows) and X-coordinates (columns), independently of any HTML
    colspan/rowspan information.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Initialize grid inferrer.

        Args:
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Infer grid structure from cell_boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights', 'col_widths' — or None if
            inference fails.
        """
        if not cell_boxes:
            return None
        try:
            # Keep only boxes that actually carry four coordinates.
            usable = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not usable:
                return None
            # Collect every cell edge coordinate on each axis.
            y_edges = sorted({edge for b in usable for edge in (b[1], b[3])})
            x_edges = sorted({edge for b in usable for edge in (b[0], b[2])})
            # Cluster edges into row / column boundaries.
            y_boundaries = self._cluster_to_boundaries(y_edges, self.row_threshold)
            x_boundaries = self._cluster_to_boundaries(x_edges, self.col_threshold)
            if len(y_boundaries) < 2 or len(x_boundaries) < 2:
                return None
            num_rows = len(y_boundaries) - 1
            num_cols = len(x_boundaries) - 1
            # Assign each box to the grid slot that contains its center.
            grid: Dict = {}
            for idx, box in enumerate(usable):
                x0, y0, x1, y1 = box[:4]
                row = self._find_position((y0 + y1) / 2, y_boundaries)
                col = self._find_position((x0 + x1) / 2, x_boundaries)
                if row is not None and col is not None:
                    grid[(row, col)] = {
                        'bbox': box,
                        'index': idx,
                        'content': ''
                    }
            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': y_boundaries,
                'col_boundaries': x_boundaries,
                'row_heights': [
                    y_boundaries[r + 1] - y_boundaries[r] for r in range(num_rows)
                ],
                'col_widths': [
                    x_boundaries[c + 1] - x_boundaries[c] for c in range(num_cols)
                ],
            }
        except Exception as e:
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Cluster nearby values; each cluster's mean becomes one boundary.

        Args:
            values: Sorted list of coordinate values
            threshold: Maximum gap between consecutive values in one cluster

        Returns:
            List of boundary values (one per cluster, cluster means)
        """
        if not values:
            return []
        clusters: List[List[float]] = []
        for v in values:
            # Chain onto the current cluster while the gap to the cluster's
            # last member stays within the threshold.
            if clusters and v - clusters[-1][-1] <= threshold:
                clusters[-1].append(v)
            else:
                clusters.append([v])
        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Find which boundary interval a value falls into.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of interval, or None if no interval matches
        """
        intervals = list(zip(boundaries, boundaries[1:]))
        # Exact containment first.
        for idx, (lo, hi) in enumerate(intervals):
            if lo <= value <= hi:
                return idx
        # Fallback: accept a value whose distance to an interval midpoint is
        # smaller than that interval's width.
        for idx, (lo, hi) in enumerate(intervals):
            if abs(value - (lo + hi) / 2) < (hi - lo):
                return idx
        return None
def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Extract cell text contents from HTML in reading order.

    Args:
        html: HTML table string

    Returns:
        List of text strings, one per cell (empty list on parse failure)
    """
    try:
        parser = HTMLTableParser()
        parser.feed(html)
        tables = parser.tables
        if not tables:
            return []
        # Walk the first table row by row, cell by cell, in reading order.
        return [
            cell.get('text', '').strip()
            for row in tables[0].get('rows', [])
            for cell in row.get('cells', [])
        ]
    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []
def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign extracted HTML contents to grid cells row by row.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        Updated grid with content assigned (mutated in place and returned)
    """
    cursor = 0
    for r in range(num_rows):
        for c in range(num_cols):
            if (r, c) not in grid:
                continue
            if cursor < len(contents):
                grid[(r, c)]['content'] = contents[cursor]
                cursor += 1
            else:
                # More grid cells than HTML cells: leave the remainder empty.
                grid[(r, c)]['content'] = ''
    # Log when HTML produced more cells than the inferred grid could hold.
    if cursor < len(contents):
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {cursor} mapped to {len(grid)} grid cells"
        )
    return grid
# ============================================================================
# Configuration
# ============================================================================
@@ -405,6 +655,147 @@ class TableRenderer:
traceback.print_exc()
return False
def render_from_cellboxes_grid(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    row_threshold: float = 15.0,
    col_threshold: float = 15.0
) -> bool:
    """
    Render table using cell_boxes as the primary structure source.

    This method infers grid structure from cell_boxes coordinates and
    maps HTML content to cells, regardless of HTML colspan/rowspan.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        html_content: HTML table string (for text content)
        table_bbox: Table bounding box
        page_height: PDF page height
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor
        row_threshold: Y-coordinate threshold for row clustering
        col_threshold: X-coordinate threshold for column clustering

    Returns:
        True if successful, False otherwise
    """
    try:
        if not cell_boxes:
            logger.debug("No cell_boxes provided for grid rendering")
            return False
        # Infer grid structure from cell_boxes
        inferrer = CellBoxGridInferrer(
            row_threshold=row_threshold,
            col_threshold=col_threshold
        )
        grid_info = inferrer.infer_grid(cell_boxes)
        if not grid_info:
            logger.debug("Failed to infer grid from cell_boxes")
            return False
        grid = grid_info['grid']
        num_rows = grid_info['num_rows']
        num_cols = grid_info['num_cols']
        row_boundaries = grid_info['row_boundaries']
        col_boundaries = grid_info['col_boundaries']
        logger.info(
            f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
            f"from {len(cell_boxes)} cell_boxes"
        )
        # Map HTML text content onto the inferred grid (reading order)
        if html_content:
            contents = extract_cell_contents_from_html(html_content)
            grid = map_content_to_grid(grid, contents, num_rows, num_cols)
            logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")
        # Scale boundaries from source image space to PDF space
        scaled_row_boundaries = [y * scale_h for y in row_boundaries]
        scaled_col_boundaries = [x * scale_w for x in col_boundaries]
        pdf_canvas.saveState()
        pdf_canvas.setStrokeColor(self.config.border_color)
        pdf_canvas.setLineWidth(self.config.border_width)
        for row in range(num_rows):
            for col in range(num_cols):
                # Cell boundaries; the fallback sizes are defensive only —
                # a valid grid always has num_cols+1 / num_rows+1 boundaries
                x0 = scaled_col_boundaries[col]
                x1 = scaled_col_boundaries[col + 1] if col + 1 < len(scaled_col_boundaries) else x0 + 50
                y0 = scaled_row_boundaries[row]
                y1 = scaled_row_boundaries[row + 1] if row + 1 < len(scaled_row_boundaries) else y0 + 20
                # Convert to PDF coordinates (flip Y: PDF origin is bottom-left)
                pdf_x0 = x0
                pdf_y0 = page_height - y1
                cell_width = x1 - x0
                cell_height = y1 - y0
                # Draw cell border for every grid slot (even empty ones)
                pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)
                # Draw text only when the grid has content for this slot
                cell = grid.get((row, col))
                if not cell:
                    continue
                cell_content = cell.get('content', '')
                if not cell_content:
                    continue
                # Shrink the font until the text fits the available width
                available_width = cell_width - self.config.left_padding - self.config.right_padding
                font_size = self._fit_text_to_cell(
                    pdf_canvas, cell_content, available_width, cell_height
                )
                pdf_canvas.setFont(self.config.font_name, font_size)
                text_width = pdf_canvas.stringWidth(
                    cell_content, self.config.font_name, font_size
                )
                # Center the text horizontally and vertically within the cell
                text_x = pdf_x0 + (cell_width - text_width) / 2
                text_y = pdf_y0 + (cell_height - font_size) / 2
                pdf_canvas.drawString(text_x, text_y, cell_content)
        pdf_canvas.restoreState()
        logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
        return True
    except Exception as e:
        logger.error(f"CellBoxes grid rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
# =========================================================================
# Grid and Cell Box Helpers
# =========================================================================

View File

@@ -28,9 +28,11 @@ from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
from app.services.table_content_rebuilder import TableContentRebuilder
logger = logging.getLogger(__name__)
@@ -91,7 +93,8 @@ class PPStructureEnhanced:
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None,
save_visualization: bool = False,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -110,6 +113,8 @@ class PPStructureEnhanced:
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
use_cv_table_detection: If True, use CV-based line detection for wired tables
instead of ML-based cell detection (RT-DETR-L)
raw_ocr_regions: Optional list of raw OCR text regions for table content
rebuilding. Used when PP-StructureV3's table HTML is incorrect.
Returns:
Dictionary with complete structure information including:
@@ -222,6 +227,7 @@ class PPStructureEnhanced:
# Extract table_res_list which contains cell_box_list
layout_det_res = None
overall_ocr_res = None
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
@@ -235,13 +241,20 @@ class PPStructureEnhanced:
layout_det_res = result_dict['layout_det_res']
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
# Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference)
if 'overall_ocr_res' in result_dict:
overall_ocr_res = result_dict['overall_ocr_res']
ocr_count = len(overall_ocr_res.get('rec_texts', []))
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables
raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding
)
all_elements.extend(elements)
@@ -289,6 +302,15 @@ class PPStructureEnhanced:
if visualization_dir:
result['visualization_dir'] = str(visualization_dir)
# Add overall_ocr_res for gap filling (converted to standard format)
# This allows gap_filling_service to use PP-StructureV3's internal OCR
# instead of running a separate Raw OCR inference
if overall_ocr_res:
result['overall_ocr_res'] = self._convert_overall_ocr_to_regions(
overall_ocr_res, scaling_info
)
logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res")
return result
except Exception as e:
@@ -327,7 +349,8 @@ class PPStructureEnhanced:
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None,
layout_det_res: Optional[Dict] = None,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -341,6 +364,7 @@ class PPStructureEnhanced:
table_res_list: Optional list of table results containing cell_box_list
layout_det_res: Optional layout detection result for Image-in-Table processing
use_cv_table_detection: If True, use CV line detection for wired tables
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
List of processed elements with normalized structure
@@ -415,6 +439,11 @@ class PPStructureEnhanced:
mapped_type = ElementType.TABLE
html_table_content = content # Store for later use
# Strip LaTeX math formatting from text content (PP-Structure formula detection)
if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]:
if '$' in content and '\\' in content:
content = self._strip_latex_math(content)
# Create element
element = {
'element_id': f"pp3_{current_page}_{idx}",
@@ -468,18 +497,84 @@ class PPStructureEnhanced:
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
break
# If no HTML match, use first available table_res with cell_box_list
# If no HTML match, find best matching table_res by bbox overlap
if not cell_boxes_extracted:
best_match = None
best_overlap = 0.0
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
# Remove used table_res to avoid reuse
table_res_list.remove(tbl_res)
break
if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']:
continue
# Get table_res bbox from its cell_box_list
cell_boxes_temp = tbl_res['cell_box_list']
if not cell_boxes_temp:
continue
# Calculate bounding box of all cells
tbl_x1 = min(cb[0] for cb in cell_boxes_temp)
tbl_y1 = min(cb[1] for cb in cell_boxes_temp)
tbl_x2 = max(cb[2] for cb in cell_boxes_temp)
tbl_y2 = max(cb[3] for cb in cell_boxes_temp)
# Calculate IoU (Intersection over Union) with element bbox
# bbox is [x1, y1, x2, y2]
elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Intersection
inter_x1 = max(tbl_x1, elem_x1)
inter_y1 = max(tbl_y1, elem_y1)
inter_x2 = min(tbl_x2, elem_x2)
inter_y2 = min(tbl_y2, elem_y2)
if inter_x1 < inter_x2 and inter_y1 < inter_y2:
inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1)
tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1)
# Use overlap ratio with element bbox (how much of element is covered)
overlap_ratio = inter_area / elem_area if elem_area > 0 else 0
if overlap_ratio > best_overlap:
best_overlap = overlap_ratio
best_match = tbl_res
# Use best match if overlap is significant (>10%)
if best_match and best_overlap > 0.1:
cell_boxes = best_match['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})")
# Extract pred_html if not already set
if not html_content and 'pred_html' in best_match:
html_content = best_match['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)")
# Remove used table_res to avoid reuse
table_res_list.remove(best_match)
elif table_res_list:
# Fallback to first available if no bbox match found
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)")
# Extract pred_html if not already set
if not html_content and 'pred_html' in tbl_res:
html_content = tbl_res['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)")
table_res_list.remove(tbl_res)
break
if not cell_boxes_extracted and 'boxes' in res_data:
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
@@ -558,6 +653,42 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# 4. Table content rebuilding from raw OCR regions
# When cell_boxes have boundary issues, rebuild table content from raw OCR
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
rebuilder = TableContentRebuilder()
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
element['cell_boxes'],
bbox,
element.get('html', '')
)
if should_rebuild:
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
cell_boxes=element['cell_boxes'],
table_bbox=bbox,
raw_ocr_regions=raw_ocr_regions,
original_html=element.get('html', '')
)
if rebuilt_table:
# Update element with rebuilt content
element['html'] = rebuilt_table['html']
element['rebuilt_table'] = rebuilt_table
element['rebuild_stats'] = rebuild_stats
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
logger.info(
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
f"with {len(rebuilt_table['cells'])} cells"
)
else:
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
element['rebuild_stats'] = rebuild_stats
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
@@ -587,6 +718,21 @@ class PPStructureEnhanced:
elements.append(element)
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
# Apply cell validation to filter over-detected tables
if settings.cell_validation_enabled:
cell_validator = CellValidationEngine(CellValidationConfig(
max_cell_density=settings.cell_validation_max_density,
min_avg_cell_area=settings.cell_validation_min_cell_area,
min_cell_height=settings.cell_validation_min_cell_height,
enabled=True
))
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
if validation_stats['reclassified_tables'] > 0:
logger.info(
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
f"tables reclassified as TEXT due to over-detection"
)
return elements
def _embed_images_in_table(
@@ -911,18 +1057,145 @@ class PPStructureEnhanced:
type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
return type_counts
def _convert_overall_ocr_to_regions(
self,
overall_ocr_res: Dict[str, Any],
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Convert PP-StructureV3's overall_ocr_res to standard OCR region format.
This allows gap_filling_service to use PP-StructureV3's internal OCR results
instead of running a separate Raw OCR inference, saving approximately 50%
of total inference time.
The overall_ocr_res structure:
- dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
- rec_texts: List of recognized text strings
- rec_scores: List of confidence scores
Args:
overall_ocr_res: Dictionary containing OCR results from PP-StructureV3
scaling_info: Optional scaling info for coordinate restoration
Returns:
List of OCR region dictionaries in standard format:
[{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...]
"""
regions = []
dt_polys = overall_ocr_res.get('dt_polys', [])
rec_texts = overall_ocr_res.get('rec_texts', [])
rec_scores = overall_ocr_res.get('rec_scores', [])
# Ensure all lists have the same length
num_regions = min(len(dt_polys), len(rec_texts))
if len(rec_scores) < num_regions:
# Pad with default confidence if scores are missing
rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores))
for i in range(num_regions):
text = rec_texts[i]
if not text or not text.strip():
continue
poly = dt_polys[i]
confidence = rec_scores[i] if i < len(rec_scores) else 0.9
# Apply scaling restoration if needed
if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0:
scale = scaling_info.scale_factor
poly = [[pt[0] / scale, pt[1] / scale] for pt in poly]
regions.append({
'text': text,
'bbox': poly, # Keep polygon format for compatibility
'confidence': confidence
})
return regions
def _extract_text_from_html(self, html: str) -> str:
    """Extract plain text from HTML content.

    Uses BeautifulSoup when available; otherwise (ImportError or parse
    failure) falls back to a regex-based tag strip.  Either way the result
    is passed through ``_strip_latex_math`` so LaTeX fragments emitted by
    PP-StructureV3 become readable plain text.

    Note: the previous version returned early from both branches, which
    made the LaTeX-stripping step unreachable; the early returns are gone.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    except Exception:
        # Fallback: just remove HTML tags (also taken when bs4 is absent)
        import re
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Strip LaTeX math formatting if present
    return self._strip_latex_math(text)
def _strip_latex_math(self, text: str) -> str:
"""
Convert LaTeX math notation to plain text.
PP-StructureV3 outputs formulas in LaTeX format like:
$N\\cdot m\\times8.851=|b\\cdot|$
This converts them to readable plain text.
"""
import re
if not text or '$' not in text:
return text
# Remove $...$ delimiters but keep content
text = re.sub(r'\$([^$]+)\$', r'\1', text)
# Convert common LaTeX math commands to plain text
replacements = [
(r'\\cdot', '·'), # Multiplication dot
(r'\\times', '×'), # Multiplication sign
(r'\\div', '÷'), # Division sign
(r'\\pm', '±'), # Plus-minus
(r'\\leq', ''), # Less than or equal
(r'\\geq', ''), # Greater than or equal
(r'\\neq', ''), # Not equal
(r'\\approx', ''), # Approximately equal
(r'\\circ', '°'), # Degree symbol
(r'\\degree', '°'), # Degree symbol
(r'\\alpha', 'α'),
(r'\\beta', 'β'),
(r'\\gamma', 'γ'),
(r'\\delta', 'δ'),
(r'\\mu', 'μ'),
(r'\\Omega', 'Ω'),
(r'\\infty', ''),
(r'\^\\{2\\}', '²'), # Superscript 2
(r'\^\\{3\\}', '³'), # Superscript 3
(r'\^2', '²'),
(r'\^3', '³'),
(r'_\\{([^}]+)\\}', r'_\1'), # Subscript
(r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text
(r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text
(r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text
(r'\\text\{([^}]+)\}', r'\1'), # Text mode
(r'\\left', ''),
(r'\\right', ''),
(r'\\[|]', '|'), # Pipe symbols
(r'\\ ', ' '), # Escaped space
(r'\\,', ' '), # Thin space
(r'\\;', ' '), # Medium space
(r'\\quad', ' '), # Quad space
(r'\\qquad', ' '), # Double quad space
]
for pattern, replacement in replacements:
text = re.sub(pattern, replacement, text)
# Clean up any remaining backslashes followed by letters (unknown commands)
text = re.sub(r'\\[a-zA-Z]+', '', text)
# Clean up multiple spaces
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _extract_bbox_from_filename(self, filename: str) -> List[int]:
"""Extract bbox from filename if it contains coordinate information."""

View File

@@ -335,6 +335,14 @@ class OCRPipeline(ProcessingPipeline):
processing_time = time.time() - start_time
# Debug: Check if ocr_result has rebuild_stats
if 'enhanced_results' in ocr_result:
for page_result in ocr_result['enhanced_results']:
for elem in page_result.get('elements', []):
if elem.get('type') == 'table' or (hasattr(elem.get('type'), 'value') and elem.get('type').value == 'table'):
has_rebuild = 'rebuild_stats' in elem
logger.info(f"[ORCHESTRATOR] Before converter - Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}")
# Convert to UnifiedDocument
unified_doc = self.converter.convert(
ocr_result,

View File

@@ -0,0 +1,790 @@
"""
Table Column Alignment Corrector
This module provides post-processing correction for PP-Structure's table
structure recognition, which frequently outputs cells with incorrect column
indices (column shift).
The correction uses a "Header-Anchor Alignment" strategy:
1. Extract header row (row_idx=0) column X-coordinate ranges as anchors
2. Validate each cell's column assignment against header X-ranges
3. Correct column index if cell X-overlap with assigned column is insufficient
Additionally supports "Vertical Fragment Merging" for Chinese vertical text
that gets split into multiple narrow text blocks.
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
from html.parser import HTMLParser
logger = logging.getLogger(__name__)
@dataclass
class BBox:
    """Axis-aligned bounding box defined by its two corners (x0, y0, x1, y1)."""
    x0: float
    y0: float
    x1: float
    y1: float

    @classmethod
    def from_list(cls, coords: List[float]) -> 'BBox':
        """Create BBox from [x0, y0, x1, y1] list."""
        if len(coords) < 4:
            raise ValueError(f"Invalid bbox coords: {coords}")
        left, top, right, bottom = coords[:4]
        return cls(left, top, right, bottom)

    @property
    def width(self) -> float:
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        return self.y1 - self.y0

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2
@dataclass
class ColumnAnchor:
    """X-coordinate range of one table column, derived from the header row."""
    col_idx: int
    x_min: float
    x_max: float
    colspan: int = 1

    @property
    def center_x(self) -> float:
        """Midpoint of the column's X range."""
        return (self.x_min + self.x_max) / 2
@dataclass
class TableCell:
    """Represents a cell extracted from HTML with position info."""
    row_idx: int  # 0-based row index within the parsed table
    col_idx: int  # 0-based starting column index (left edge for spanned cells)
    content: str  # text accumulated between the <td>/<th> tags
    colspan: int = 1  # number of grid columns this cell covers
    rowspan: int = 1  # number of grid rows this cell covers
    bbox: Optional[BBox] = None  # matched cell box; assigned later during correction
    is_header: bool = False  # True when the cell came from a <th> tag
@dataclass
class TextBlock:
    """A piece of OCR text with its bounding box; candidate for vertical merging."""
    text: str
    bbox: BBox

    @property
    def aspect_ratio(self) -> float:
        """Width / Height ratio. Vertical text has low aspect ratio."""
        h = self.bbox.height
        return self.bbox.width / h if h else float('inf')
class TableHTMLParser(HTMLParser):
    """
    Parse table HTML to extract cells with row/col indices and spans.

    PP-Structure outputs HTML like:
        <table><tr><td>content</td><td colspan="2">merged</td></tr></table>

    Grid positions consumed by colspan/rowspan are recorded in ``occupied``
    so later cells in the same or following rows are shifted to the next
    free column, mirroring browser table layout.
    """

    def __init__(self):
        super().__init__()
        self.cells: List[TableCell] = []
        self.current_row_idx = -1
        self.current_col_idx = 0
        self.current_cell: Optional[TableCell] = None
        self.in_table = False
        # Track occupied cells for colspan/rowspan handling
        self.occupied: Dict[Tuple[int, int], bool] = {}

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute value.

        Returns 1 for missing, valueless (None) or non-numeric attributes —
        the previous ``int(attrs_dict.get(..., 1))`` raised on those — and
        clamps to a minimum of 1 so the grid always advances.
        """
        try:
            return max(1, int(value))
        except (TypeError, ValueError):
            return 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        attrs_dict = dict(attrs)
        if tag == 'table':
            self.in_table = True
            self.current_row_idx = -1
            self.occupied = {}
        elif tag == 'tr' and self.in_table:
            self.current_row_idx += 1
            self.current_col_idx = 0
            # Skip occupied columns from previous rowspans
            while (self.current_row_idx, self.current_col_idx) in self.occupied:
                self.current_col_idx += 1
        elif tag in ('td', 'th') and self.in_table:
            # Skip occupied columns
            while (self.current_row_idx, self.current_col_idx) in self.occupied:
                self.current_col_idx += 1
            colspan = self._parse_span(attrs_dict.get('colspan'))
            rowspan = self._parse_span(attrs_dict.get('rowspan'))
            self.current_cell = TableCell(
                row_idx=self.current_row_idx,
                col_idx=self.current_col_idx,
                content='',
                colspan=colspan,
                rowspan=rowspan,
                is_header=(tag == 'th')
            )
            # Mark occupied cells for spans
            for r in range(rowspan):
                for c in range(colspan):
                    self.occupied[(self.current_row_idx + r, self.current_col_idx + c)] = True

    def handle_endtag(self, tag: str):
        if tag == 'table':
            self.in_table = False
        elif tag in ('td', 'th') and self.current_cell is not None:
            self.current_cell.content = self.current_cell.content.strip()
            self.cells.append(self.current_cell)
            self.current_col_idx += self.current_cell.colspan
            self.current_cell = None

    def handle_data(self, data: str):
        # Accumulate text chunks; a cell's data may arrive in pieces.
        if self.current_cell is not None:
            self.current_cell.content += data
def calculate_x_overlap(cell_bbox: BBox, anchor: ColumnAnchor) -> float:
    """
    Fraction of the cell's width that lies inside the anchor's X range.

    Returns:
        0.0 (no horizontal overlap, or zero-width cell) up to 1.0
        (cell fully contained in the anchor's X range).
    """
    cell_width = cell_bbox.width
    if cell_width == 0:
        return 0.0
    lo = max(cell_bbox.x0, anchor.x_min)
    hi = min(cell_bbox.x1, anchor.x_max)
    if hi <= lo:
        return 0.0
    return (hi - lo) / cell_width
def calculate_iou(bbox1: BBox, bbox2: BBox) -> float:
    """Intersection over Union of two axis-aligned boxes (0.0 when disjoint)."""
    ix0 = max(bbox1.x0, bbox2.x0)
    iy0 = max(bbox1.y0, bbox2.y0)
    ix1 = min(bbox1.x1, bbox2.x1)
    iy1 = min(bbox1.y1, bbox2.y1)
    if ix1 <= ix0 or iy1 <= iy0:
        return 0.0
    intersection = (ix1 - ix0) * (iy1 - iy0)
    union = bbox1.width * bbox1.height + bbox2.width * bbox2.height - intersection
    # Guard the degenerate case of two zero-area boxes.
    return intersection / union if union else 0.0
def parse_table_html(html: str) -> List[TableCell]:
    """
    Extract TableCell records (with row/col indices) from PP-Structure HTML.

    Args:
        html: Table HTML string from PP-Structure

    Returns:
        Parsed cells, or an empty list when the HTML cannot be parsed.
    """
    parser = TableHTMLParser()
    try:
        parser.feed(html)
    except Exception as exc:
        logger.warning(f"Failed to parse table HTML: {exc}")
        return []
    return parser.cells
def find_header_row(cells: List[TableCell], min_columns: int = 3) -> Optional[int]:
    """
    Pick the row best suited to serve as the header anchor.

    Prefers the first row containing at least ``min_columns`` un-merged
    (colspan == 1) cells so merged title rows are not mistaken for headers;
    otherwise falls back to the row with the most un-merged cells, requiring
    at least two of them.

    Args:
        cells: All parsed cells
        min_columns: Minimum number of individual columns required

    Returns:
        Row index of the chosen header row, or None when no row qualifies.
    """
    rows: Dict[int, List[TableCell]] = {}
    for cell in cells:
        rows.setdefault(cell.row_idx, []).append(cell)

    def unmerged_count(row_cells: List[TableCell]) -> int:
        """Number of cells in the row that do not span multiple columns."""
        return sum(1 for c in row_cells if c.colspan == 1)

    # First pass: earliest row with enough individual columns
    for row_idx in sorted(rows):
        count = unmerged_count(rows[row_idx])
        if count >= min_columns:
            logger.debug(f"[COLUMN CORRECTION] Found header row {row_idx} with {count} individual columns")
            return row_idx

    # Fallback: row with the most individual cells (need at least 2)
    best_row: Optional[int] = None
    best_count = 0
    for row_idx, row_cells in rows.items():
        count = unmerged_count(row_cells)
        if count > best_count:
            best_row, best_count = row_idx, count
    if best_row is not None and best_count >= 2:
        logger.debug(f"[COLUMN CORRECTION] Using fallback header row {best_row} with {best_count} columns")
        return best_row
    return None
def build_column_anchors(
    header_cells: List[TableCell],
    cell_boxes: List[List[float]],
    all_cells: Optional[List[TableCell]] = None
) -> List[ColumnAnchor]:
    """
    Build column anchors from header row cells matched with cell_boxes.

    The header row is the authoritative reference for column X-coordinate
    ranges.  When the supplied header row is dominated by merged cells and
    ``all_cells`` is given, a better row (first with >= 3 un-merged cells)
    is looked up via find_header_row().

    Changes vs. the previous version: removed an unused local
    (``header_row_idx`` computed from header_cells) and fixed the running
    Y-mean of a cluster, which previously divided by ``len + 1`` after the
    new box had already been appended, biasing the centroid.

    Args:
        header_cells: Cells from the identified header row
        cell_boxes: List of [x0, y0, x1, y1] coordinates from PP-Structure
        all_cells: All cells for finding actual header row (optional)

    Returns:
        List of ColumnAnchor sorted by x_min (empty when no match found)
    """
    if not header_cells or not cell_boxes:
        return []

    # If header row has too many merged cells, try to find a better header row
    individual_cells = [c for c in header_cells if c.colspan == 1]
    if len(individual_cells) < 3 and all_cells:
        header_row_idx = find_header_row(all_cells, min_columns=3)
        if header_row_idx is not None:
            header_cells = [c for c in all_cells if c.row_idx == header_row_idx]
            individual_cells = [c for c in header_cells if c.colspan == 1]
            logger.info(f"[COLUMN CORRECTION] Switched to row {header_row_idx} as header ({len(individual_cells)} columns)")

    # Only use individual cells (no colspan) for accurate column boundaries
    if individual_cells:
        header_cells = individual_cells

    # Convert cell_boxes to BBox objects, skipping malformed entries
    boxes = []
    for coords in cell_boxes:
        try:
            boxes.append(BBox.from_list(coords))
        except (ValueError, IndexError):
            continue
    if not boxes:
        return []

    # Cluster boxes into rows by Y proximity, tracking the running mean Y
    # of the current cluster.
    boxes_by_y = sorted(boxes, key=lambda b: b.y0)
    row_groups: List[List[BBox]] = []
    current_group: List[BBox] = []
    current_y = None
    y_threshold = 40  # pixels tolerance for same row
    for box in boxes_by_y:
        if current_y is None or abs(box.center_y - current_y) < y_threshold:
            current_group.append(box)
            if current_y is None:
                current_y = box.center_y
            else:
                # Correct incremental mean: the previous mean covered
                # len(current_group) - 1 boxes before this append.
                n = len(current_group)
                current_y = (current_y * (n - 1) + box.center_y) / n
        else:
            row_groups.append(sorted(current_group, key=lambda b: b.x0))
            current_group = [box]
            current_y = box.center_y
    if current_group:
        row_groups.append(sorted(current_group, key=lambda b: b.x0))

    # Pick the row group whose box count is closest to the header cell count
    target_count = len(header_cells)
    best_row_group = None
    best_diff = float('inf')
    for group in row_groups:
        diff = abs(len(group) - target_count)
        if diff < best_diff:
            best_diff = diff
            best_row_group = group
    if not best_row_group:
        logger.warning("[COLUMN CORRECTION] Could not find matching cell_boxes row for header")
        return []
    logger.debug(f"[COLUMN CORRECTION] Matched header row with {len(best_row_group)} cell_boxes")

    # Pair header cells (by column order) with matched boxes (by X order)
    header_sorted = sorted(header_cells, key=lambda c: c.col_idx)
    anchors = [
        ColumnAnchor(
            col_idx=cell.col_idx,
            x_min=box.x0,
            x_max=box.x1,
            colspan=cell.colspan
        )
        for cell, box in zip(header_sorted, best_row_group)
    ]
    return sorted(anchors, key=lambda a: a.x_min)
def match_cell_to_cellbox(
    cell: TableCell,
    cell_boxes: List[BBox],
    row_cells: List[TableCell]
) -> Optional[BBox]:
    """
    Match a table cell to its corresponding cell_box using position heuristics.

    Implementation: cluster all cell_boxes into rows by Y proximity (30px
    tolerance), take the row group at index ``cell.row_idx``, and return the
    box at the cell's ordinal position within its HTML row.

    NOTE(review): despite earlier design notes mentioning IoU, this function
    performs purely position-based matching; calculate_iou is not used here.
    It also assumes HTML rows correspond 1:1 (in order) to the clustered
    box rows — TODO confirm this holds for tables with missing boxes.

    Args:
        cell: The cell to match
        cell_boxes: All cell_boxes for this table
        row_cells: All cells in the same row (for position context)

    Returns:
        Matched BBox or None if no match found
    """
    if not cell_boxes:
        return None
    # Sort cell_boxes by Y first, then X
    sorted_boxes = sorted(cell_boxes, key=lambda b: (b.y0, b.x0))
    # Group boxes by approximate Y position (same row)
    row_groups: List[List[BBox]] = []
    current_group: List[BBox] = []
    current_y = None
    for box in sorted_boxes:
        if current_y is None or abs(box.center_y - current_y) < 30:  # 30px tolerance
            current_group.append(box)
            if current_y is None:
                current_y = box.center_y
            else:
                # NOTE(review): pairwise averaging weights recent boxes more
                # heavily than a true running mean — presumably acceptable
                # for the 30px tolerance; confirm on tall tables.
                current_y = (current_y + box.center_y) / 2
        else:
            if current_group:
                row_groups.append(sorted(current_group, key=lambda b: b.x0))
            current_group = [box]
            current_y = box.center_y
    if current_group:
        row_groups.append(sorted(current_group, key=lambda b: b.x0))
    # Find the row that best matches cell.row_idx
    if cell.row_idx < len(row_groups):
        row_boxes = row_groups[cell.row_idx]
        # Sort cells in this row by col_idx
        row_cells_sorted = sorted(row_cells, key=lambda c: c.col_idx)
        cell_position = row_cells_sorted.index(cell) if cell in row_cells_sorted else -1
        if 0 <= cell_position < len(row_boxes):
            return row_boxes[cell_position]
    return None
def correct_cell_column(
    cell: TableCell,
    anchors: List[ColumnAnchor],
    threshold: float = 0.5
) -> int:
    """
    Resolve the column index a cell should occupy, using header anchors.

    The anchor with the highest X-overlap wins when that overlap reaches
    ``threshold``.  With essentially no overlap (< 10%) the anchor nearest
    by center distance is used instead.  Otherwise the cell keeps its
    original column index.

    Args:
        cell: The cell to check
        anchors: Column anchors from header row
        threshold: Minimum overlap ratio to trigger correction

    Returns:
        Corrected column index (may be same as original)
    """
    if not cell.bbox or not anchors:
        return cell.col_idx

    # Find the anchor with the strongest X-overlap (strictly positive)
    best_anchor: Optional[ColumnAnchor] = None
    best_overlap = 0.0
    for anchor in anchors:
        overlap = calculate_x_overlap(cell.bbox, anchor)
        if overlap > best_overlap:
            best_overlap, best_anchor = overlap, anchor

    if best_anchor is not None and best_overlap >= threshold:
        if best_anchor.col_idx != cell.col_idx:
            logger.info(
                f"[COLUMN CORRECTION] Row {cell.row_idx}: "
                f"'{cell.content[:20]}...' col {cell.col_idx} -> {best_anchor.col_idx} "
                f"(overlap: {best_overlap:.1%})"
            )
        return best_anchor.col_idx

    if best_overlap < 0.1:
        # Nearly no overlap with any anchor: fall back to nearest center
        cell_center = cell.bbox.center_x
        nearest_anchor = min(anchors, key=lambda a: abs(a.center_x - cell_center))
        if nearest_anchor.col_idx != cell.col_idx:
            logger.info(
                f"[COLUMN CORRECTION] Row {cell.row_idx}: "
                f"'{cell.content[:20]}...' col {cell.col_idx} -> {nearest_anchor.col_idx} "
                f"(nearest by center)"
            )
        return nearest_anchor.col_idx

    return cell.col_idx
def detect_vertical_fragments(
    text_blocks: List[TextBlock],
    table_bbox: BBox,
    aspect_ratio_threshold: float = 0.3,
    left_margin_ratio: float = 0.15
) -> List[TextBlock]:
    """
    Pick out text blocks that look like fragments of vertical text.

    A fragment must be much taller than wide (aspect ratio below the
    threshold) and sit within the leftmost ``left_margin_ratio`` of the
    table's width.

    Args:
        text_blocks: All text blocks in/around the table
        table_bbox: Table bounding box
        aspect_ratio_threshold: Max width/height to be considered vertical
        left_margin_ratio: Fraction of table width treated as left margin

    Returns:
        Blocks that are likely vertical text fragments
    """
    left_boundary = table_bbox.x0 + table_bbox.width * left_margin_ratio
    return [
        block for block in text_blocks
        if block.aspect_ratio < aspect_ratio_threshold
        and block.bbox.center_x < left_boundary
    ]
def should_merge_blocks(block1: TextBlock, block2: TextBlock, x_tolerance: float = 10.0, y_gap_max: float = 20.0) -> bool:
    """
    Decide whether two vertically stacked blocks belong to one text run.

    ``block1`` is assumed to sit above ``block2``; they merge when their
    X-centers are nearly aligned and the vertical gap between them is small
    and non-negative.

    Args:
        block1: First block (should be above block2)
        block2: Second block
        x_tolerance: Max X-center deviation in pixels
        y_gap_max: Max vertical gap between blocks

    Returns:
        True if blocks should be merged
    """
    centers_aligned = abs(block1.bbox.center_x - block2.bbox.center_x) < x_tolerance
    gap = block2.bbox.y0 - block1.bbox.y1
    return centers_aligned and 0 <= gap < y_gap_max
def merge_vertical_fragments(
    fragments: List[TextBlock],
    x_tolerance: float = 10.0,
    y_gap_max: float = 20.0
) -> List[TextBlock]:
    """
    Collapse runs of vertically adjacent fragments into single blocks.

    Fragments are scanned top-to-bottom; consecutive blocks satisfying
    should_merge_blocks() are accumulated and fused via _merge_group().

    Args:
        fragments: List of vertical text fragments
        x_tolerance: Max X-center deviation for merging
        y_gap_max: Max Y-gap between mergeable blocks

    Returns:
        List of merged text blocks
    """
    if not fragments:
        return []

    merged: List[TextBlock] = []
    group: List[TextBlock] = []
    for block in sorted(fragments, key=lambda b: b.bbox.y0):
        if group and not should_merge_blocks(group[-1], block, x_tolerance, y_gap_max):
            # Current run ends here; flush it and start a new one
            merged.append(_merge_group(group))
            group = []
        group.append(block)
    if group:
        merged.append(_merge_group(group))
    return merged
def _merge_group(blocks: List[TextBlock]) -> TextBlock:
    """Fuse a run of blocks into one: concatenated text, union bounding box."""
    if len(blocks) == 1:
        return blocks[0]
    union_bbox = BBox(
        min(b.bbox.x0 for b in blocks),
        min(b.bbox.y0 for b in blocks),
        max(b.bbox.x1 for b in blocks),
        max(b.bbox.y1 for b in blocks),
    )
    # Text is joined top-to-bottom (blocks arrive sorted by Y)
    return TextBlock(text=''.join(b.text for b in blocks), bbox=union_bbox)
def correct_table_columns(
    html: str,
    cell_boxes: List[List[float]],
    threshold: float = 0.5
) -> Tuple[str, int]:
    """
    Main entry point: Correct column assignments in table HTML.

    This function:
    1. Parses the HTML to extract cells with row/col
    2. Builds column anchors from header row
    3. Matches cells to cell_boxes
    4. Corrects column indices based on X-overlap
    5. Rebuilds the HTML with corrected indices

    NOTE(review): step 5 is not implemented yet — corrections are counted
    and logged, but the original HTML is returned unchanged (see
    ``corrected_html = html`` below).

    Args:
        html: Original table HTML from PP-Structure
        cell_boxes: List of [x0, y0, x1, y1] from PP-Structure
        threshold: Minimum overlap ratio for correction

    Returns:
        Tuple of (corrected_html, correction_count)
    """
    # Parse HTML
    cells = parse_table_html(html)
    if not cells:
        logger.debug("[COLUMN CORRECTION] No cells parsed from HTML")
        return html, 0
    # Convert cell_boxes to BBox objects
    boxes = []
    for coords in cell_boxes:
        try:
            boxes.append(BBox.from_list(coords))
        except (ValueError, IndexError):
            continue
    if not boxes:
        logger.debug("[COLUMN CORRECTION] No valid cell_boxes")
        return html, 0
    # Find the best header row (not necessarily row 0)
    # First try row 0, but if it has merged cells, find a better row
    header_row_idx = find_header_row(cells, min_columns=3)
    if header_row_idx is None:
        # Fallback to row 0
        header_row_idx = 0
    header_cells = [c for c in cells if c.row_idx == header_row_idx]
    if not header_cells:
        logger.debug("[COLUMN CORRECTION] No header row found, skipping correction")
        return html, 0
    # Build column anchors, passing all cells for smart header detection
    anchors = build_column_anchors(header_cells, cell_boxes, all_cells=cells)
    if not anchors:
        logger.debug("[COLUMN CORRECTION] Could not build column anchors")
        return html, 0
    logger.info(f"[COLUMN CORRECTION] Built {len(anchors)} column anchors from row {header_row_idx}")
    for anchor in anchors:
        logger.debug(f"  Column {anchor.col_idx}: X range [{anchor.x_min:.1f}, {anchor.x_max:.1f}]")
    # Group cells by row for matching
    cells_by_row: Dict[int, List[TableCell]] = {}
    for cell in cells:
        if cell.row_idx not in cells_by_row:
            cells_by_row[cell.row_idx] = []
        cells_by_row[cell.row_idx].append(cell)
    # Match cells to cell_boxes and correct columns
    correction_count = 0
    corrections: Dict[Tuple[int, int], int] = {}  # (row, old_col) -> new_col
    for cell in cells:
        if cell.row_idx == header_row_idx:
            continue  # Skip header row (used as reference)
        row_cells = cells_by_row.get(cell.row_idx, [])
        matched_box = match_cell_to_cellbox(cell, boxes, row_cells)
        if matched_box:
            # Attach the matched box so correct_cell_column can use it
            cell.bbox = matched_box
            new_col = correct_cell_column(cell, anchors, threshold)
            if new_col != cell.col_idx:
                corrections[(cell.row_idx, cell.col_idx)] = new_col
                correction_count += 1
    if correction_count == 0:
        logger.info("[COLUMN CORRECTION] No corrections needed")
        return html, 0
    # Rebuild HTML with corrected column indices
    # Note: This is a simple approach that modifies HTML attributes
    # A more robust solution would rebuild the entire table structure
    corrected_html = html
    logger.info(f"[COLUMN CORRECTION] Made {correction_count} column corrections")
    return corrected_html, correction_count
class TableColumnCorrector:
    """
    Facade for the table column correction pipeline.

    Bundles optional vertical-fragment merging with header-anchor column
    correction behind a single configurable entry point.
    """

    def __init__(
        self,
        correction_threshold: float = 0.5,
        vertical_merge_enabled: bool = True,
        vertical_aspect_ratio: float = 0.3
    ):
        self.correction_threshold = correction_threshold
        self.vertical_merge_enabled = vertical_merge_enabled
        self.vertical_aspect_ratio = vertical_aspect_ratio

    @staticmethod
    def _to_text_blocks(raw_blocks: List[Dict]) -> List[TextBlock]:
        """Convert raw OCR dicts into TextBlock objects, skipping bad entries."""
        blocks: List[TextBlock] = []
        for tb in raw_blocks:
            if 'bbox' not in tb or 'text' not in tb:
                continue
            try:
                blocks.append(TextBlock(text=tb['text'], bbox=BBox.from_list(tb['bbox'])))
            except (ValueError, KeyError):
                continue
        return blocks

    def correct(
        self,
        html: str,
        cell_boxes: List[List[float]],
        table_bbox: Optional[List[float]] = None,
        text_blocks: Optional[List[Dict]] = None
    ) -> Tuple[str, Dict]:
        """
        Apply column correction to a table.

        Args:
            html: Table HTML from PP-Structure
            cell_boxes: Cell bounding boxes
            table_bbox: Table bounding box (for vertical fragment detection)
            text_blocks: Raw OCR text blocks (for vertical fragment merging)

        Returns:
            Tuple of (corrected_html, stats_dict)
        """
        stats = {
            'column_corrections': 0,
            'vertical_merges': 0,
            'anchors_built': 0
        }

        # Step 1: Vertical fragment merging (if enabled and data available)
        if self.vertical_merge_enabled and table_bbox and text_blocks:
            blocks = self._to_text_blocks(text_blocks)
            if blocks:
                fragments = detect_vertical_fragments(
                    blocks, BBox.from_list(table_bbox),
                    aspect_ratio_threshold=self.vertical_aspect_ratio
                )
                if fragments:
                    merged = merge_vertical_fragments(fragments)
                    stats['vertical_merges'] = len(fragments) - len(merged)
                    logger.info(f"[VERTICAL MERGE] Merged {len(fragments)} fragments into {len(merged)} blocks")

        # Step 2: Column correction
        corrected_html, corrections = correct_table_columns(
            html, cell_boxes, self.correction_threshold
        )
        stats['column_corrections'] = corrections
        return corrected_html, stats

View File

@@ -0,0 +1,806 @@
"""
Table Content Rebuilder
Rebuilds table content from raw OCR regions when PP-StructureV3's HTML output
is incorrect due to cell merge errors or boundary detection issues.
This module addresses the key problem: PP-StructureV3's ML-based table recognition
often merges multiple cells incorrectly, especially for borderless tables.
The solution uses:
1. cell_boxes validation (filter out-of-bounds cells)
2. Raw OCR regions to rebuild accurate cell content
3. Grid-based row/col position calculation
"""
import logging
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
logger = logging.getLogger(__name__)
@dataclass
class CellBox:
    """A validated cell bounding box that remembers its index in the source list."""
    x0: float
    y0: float
    x1: float
    y1: float
    original_index: int

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2

    @property
    def area(self) -> float:
        """Box area, clamped to 0 for degenerate (inverted) boxes."""
        return max(0, (self.x1 - self.x0) * (self.y1 - self.y0))
@dataclass
class OCRTextRegion:
    """One raw OCR detection: recognized text plus its axis-aligned box."""
    text: str
    x0: float
    y0: float
    x1: float
    y1: float
    confidence: float = 1.0

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2
@dataclass
class RebuiltCell:
    """Represents a rebuilt table cell."""
    row: int        # target row in the rebuilt grid
    col: int        # target column in the rebuilt grid
    row_span: int
    col_span: int
    content: str    # text assembled from matched OCR regions
    bbox: Optional[List[float]] = None
    # default_factory avoids the shared-mutable-default pitfall; the
    # __post_init__ guard is kept so callers explicitly passing
    # ocr_regions=None still get an empty list (backward compatible).
    ocr_regions: List[OCRTextRegion] = field(default_factory=list)

    def __post_init__(self):
        if self.ocr_regions is None:
            self.ocr_regions = []
class TableContentRebuilder:
"""
Rebuilds table content from raw OCR regions and validated cell_boxes.
This class solves the problem where PP-StructureV3's HTML output incorrectly
merges multiple cells. Instead of relying on the ML-generated HTML, it:
1. Validates cell_boxes against table bbox
2. Groups cell_boxes into rows/columns by coordinate clustering
3. Fills each cell with matching raw OCR text
4. Generates correct table structure
"""
def __init__(
    self,
    boundary_tolerance: float = 20.0,
    row_clustering_threshold: float = 15.0,
    col_clustering_threshold: float = 15.0,
    iou_threshold_for_ocr_match: float = 0.3,
    min_text_coverage: float = 0.5
):
    """
    Initialize the rebuilder.

    Args:
        boundary_tolerance: Tolerance for cell_boxes boundary check (pixels)
        row_clustering_threshold: Max Y-distance for cells in same row (pixels)
        col_clustering_threshold: Max X-distance for cells in same column (pixels)
        iou_threshold_for_ocr_match: Min IoU to consider OCR region inside cell
        min_text_coverage: Min overlap ratio for OCR text to be assigned to cell
    """
    # Thresholds are stored verbatim and consumed by the validation /
    # clustering / OCR-matching methods of this class.
    self.boundary_tolerance = boundary_tolerance
    self.row_clustering_threshold = row_clustering_threshold
    self.col_clustering_threshold = col_clustering_threshold
    self.iou_threshold = iou_threshold_for_ocr_match
    self.min_text_coverage = min_text_coverage
def validate_cell_boxes(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float]
) -> Tuple[List[CellBox], Dict[str, Any]]:
    """
    Validate cell_boxes against table bbox, filtering invalid ones.

    A box is rejected when it is malformed, extends past the table bounds
    (expanded by ``boundary_tolerance``), has inverted coordinates, or is
    shorter than 8px (too small for readable text).

    Args:
        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
        table_bbox: Table bounding box [x0, y0, x1, y1]

    Returns:
        Tuple of (valid_cells, validation_stats)
    """
    if not cell_boxes or len(table_bbox) < 4:
        return [], {"total": 0, "valid": 0, "invalid": 0, "reason": "empty_input"}

    tx0, ty0, tx1, ty1 = table_bbox[:4]
    tol = self.boundary_tolerance
    min_x, min_y = tx0 - tol, ty0 - tol
    max_x, max_y = tx1 + tol, ty1 + tol

    def rejection_reason(box) -> Optional[str]:
        """Return the rejection label for a box, or None when it is valid."""
        if not box or len(box) < 4:
            return "invalid_format"
        x0, y0, x1, y1 = box[:4]
        if y1 > max_y:
            return "y1_exceeds_table"
        if y0 < min_y:
            return "y0_above_table"
        if x1 > max_x:
            return "x1_exceeds_table"
        if x0 < min_x:
            return "x0_left_of_table"
        if x0 >= x1 or y0 >= y1:
            return "inverted_coords"
        if y1 - y0 < 8:  # at least 8px tall for readable text
            return "too_small"
        return None

    valid_cells: List[CellBox] = []
    invalid_reasons: Dict[str, int] = defaultdict(int)
    for idx, box in enumerate(cell_boxes):
        reason = rejection_reason(box)
        if reason is not None:
            invalid_reasons[reason] += 1
            continue
        x0, y0, x1, y1 = box[:4]
        valid_cells.append(CellBox(x0=x0, y0=y0, x1=x1, y1=y1, original_index=idx))

    stats = {
        "total": len(cell_boxes),
        "valid": len(valid_cells),
        "invalid": len(cell_boxes) - len(valid_cells),
        "invalid_reasons": dict(invalid_reasons),
        "validity_ratio": len(valid_cells) / len(cell_boxes) if cell_boxes else 0
    }
    logger.info(
        f"Cell box validation: {stats['valid']}/{stats['total']} valid "
        f"(ratio={stats['validity_ratio']:.2%})"
    )
    if invalid_reasons:
        logger.debug(f"Invalid reasons: {dict(invalid_reasons)}")
    return valid_cells, stats
def parse_raw_ocr_regions(
    self,
    raw_regions: List[Dict[str, Any]],
    table_bbox: List[float]
) -> List[OCRTextRegion]:
    """
    Extract the OCR regions that fall inside (or close to) the table area.

    Args:
        raw_regions: Raw OCR region dicts carrying 'text', 'bbox', 'confidence'
        table_bbox: Table bounding box [x0, y0, x1, y1]

    Returns:
        OCRTextRegion objects whose boxes overlap the (slightly expanded) table
    """
    if not raw_regions or len(table_bbox) < 4:
        return []
    tx0, ty0, tx1, ty1 = table_bbox[:4]
    # Small tolerance so text sitting on the table edge is not dropped.
    margin = 10
    regions: List[OCRTextRegion] = []
    for raw in raw_regions:
        content = raw.get('text', '').strip()
        if not content:
            continue
        box = raw.get('bbox', [])
        if not box:
            continue
        conf = raw.get('confidence', 1.0)
        # Bboxes arrive either as a quadrilateral or as a flat rectangle.
        if isinstance(box[0], (list, tuple)):
            # Quadrilateral form: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            xs = [pt[0] for pt in box if len(pt) >= 2]
            ys = [pt[1] for pt in box if len(pt) >= 2]
            if not xs or not ys:
                continue
            rx0, ry0, rx1, ry1 = min(xs), min(ys), max(xs), max(ys)
        elif len(box) == 4:
            rx0, ry0, rx1, ry1 = box
        else:
            continue
        # Drop regions that lie completely outside the expanded table area.
        fully_outside = (
            rx1 < tx0 - margin or rx0 > tx1 + margin or
            ry1 < ty0 - margin or ry0 > ty1 + margin
        )
        if fully_outside:
            continue
        regions.append(OCRTextRegion(
            text=content,
            x0=float(rx0), y0=float(ry0),
            x1=float(rx1), y1=float(ry1),
            confidence=conf
        ))
    logger.debug(f"Parsed {len(regions)} OCR regions within table area")
    return regions
def cluster_cells_into_grid(
    self,
    cells: List[CellBox]
) -> Tuple[List[float], List[float], Dict[Tuple[int, int], CellBox]]:
    """
    Cluster cells into rows and columns based on coordinates.

    Args:
        cells: List of validated CellBox objects

    Returns:
        Tuple of (row_boundaries, col_boundaries, cell_grid)
        - row_boundaries: Y coordinates for row divisions
        - col_boundaries: X coordinates for column divisions
        - cell_grid: Dict mapping (row, col) to CellBox (first cell wins
          when two cells map to the same grid position)
    """
    if not cells:
        return [], [], {}
    # Collect all unique edge coordinates (rounded to damp OCR jitter).
    y_coords = set()
    x_coords = set()
    for cell in cells:
        y_coords.add(round(cell.y0, 1))
        y_coords.add(round(cell.y1, 1))
        x_coords.add(round(cell.x0, 1))
        x_coords.add(round(cell.x1, 1))
    # Merge nearby coordinates into distinct row/column boundaries.
    row_boundaries = self._cluster_coordinates(sorted(y_coords), self.row_clustering_threshold)
    col_boundaries = self._cluster_coordinates(sorted(x_coords), self.col_clustering_threshold)
    logger.debug(f"Found {len(row_boundaries)} row boundaries, {len(col_boundaries)} col boundaries")
    # Map each cell to a grid position keyed by its top-left corner.
    # NOTE: span detection (row_end/col_end from y1/x1) was computed here
    # previously but never used, so it has been removed as dead code.
    cell_grid: Dict[Tuple[int, int], CellBox] = {}
    for cell in cells:
        row = self._find_position(cell.y0, row_boundaries)
        col = self._find_position(cell.x0, col_boundaries)
        if row is not None and col is not None and (row, col) not in cell_grid:
            cell_grid[(row, col)] = cell
    return row_boundaries, col_boundaries, cell_grid
def _cluster_coordinates(
self,
coords: List[float],
threshold: float
) -> List[float]:
"""Cluster nearby coordinates into distinct values."""
if not coords:
return []
clustered = [coords[0]]
for coord in coords[1:]:
if coord - clustered[-1] > threshold:
clustered.append(coord)
return clustered
def _find_position(
self,
value: float,
boundaries: List[float]
) -> Optional[int]:
"""Find which position (index) a value falls into."""
for i, boundary in enumerate(boundaries):
if value <= boundary + self.row_clustering_threshold:
return i
return len(boundaries) - 1 if boundaries else None
def assign_ocr_to_cells(
    self,
    cells: List[CellBox],
    ocr_regions: List[OCRTextRegion],
    row_boundaries: List[float],
    col_boundaries: List[float]
) -> Dict[Tuple[int, int], List[OCRTextRegion]]:
    """
    Assign OCR text regions to cells based on spatial overlap.

    Each region goes to the cell covering the largest fraction of it,
    provided that fraction reaches min_text_coverage.

    Args:
        cells: List of validated CellBox objects
        ocr_regions: List of OCRTextRegion objects
        row_boundaries: Y coordinates for row divisions
        col_boundaries: X coordinates for column divisions

    Returns:
        Dict mapping (row, col) to list of OCR regions in that cell
    """
    assignments: Dict[Tuple[int, int], List[OCRTextRegion]] = defaultdict(list)
    for region in ocr_regions:
        region_rect = (region.x0, region.y0, region.x1, region.y1)
        chosen = None
        chosen_overlap = 0
        for candidate in cells:
            ratio = self._calculate_overlap_ratio(
                region_rect,
                (candidate.x0, candidate.y0, candidate.x1, candidate.y1)
            )
            # Strictly-greater comparison keeps the earliest cell on ties.
            if ratio >= self.min_text_coverage and ratio > chosen_overlap:
                chosen_overlap = ratio
                chosen = candidate
        if chosen is None:
            continue
        row = self._find_position(chosen.y0, row_boundaries)
        col = self._find_position(chosen.x0, col_boundaries)
        if row is not None and col is not None:
            assignments[(row, col)].append(region)
    return assignments
def _calculate_overlap_ratio(
self,
box1: Tuple[float, float, float, float],
box2: Tuple[float, float, float, float]
) -> float:
"""Calculate overlap ratio of box1 with box2."""
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
# Calculate intersection
inter_x0 = max(x0_1, x0_2)
inter_y0 = max(y0_1, y0_2)
inter_x1 = min(x1_1, x1_2)
inter_y1 = min(y1_1, y1_2)
if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
return 0.0
inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
return inter_area / box1_area if box1_area > 0 else 0.0
def rebuild_table(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    raw_ocr_regions: List[Dict[str, Any]],
    original_html: str = ""
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
    """
    Rebuild table content from cell_boxes and raw OCR regions.

    This is the main entry point. It:
    1. Validates cell_boxes
    2. If validity ratio is low, uses pure OCR-based rebuild
    3. Otherwise, uses cell_boxes + OCR hybrid rebuild

    Args:
        cell_boxes: List of cell bounding boxes from PP-StructureV3
        table_bbox: Table bounding box [x0, y0, x1, y1]
        raw_ocr_regions: List of raw OCR region dicts
        original_html: Original HTML from PP-StructureV3 (for fallback).
            NOTE(review): not referenced in this method body — presumably
            kept for interface compatibility; confirm before removing.

    Returns:
        Tuple of (rebuilt_table_dict, rebuild_stats). The table dict is
        None when no OCR regions fall inside the table area (stats carry
        the skip reason).
    """
    # Stats dict is threaded through the helper rebuild methods, which
    # update it in place before returning.
    stats = {
        "action": "none",
        "reason": "",
        "original_cell_count": len(cell_boxes) if cell_boxes else 0,
        "valid_cell_count": 0,
        "ocr_regions_in_table": 0,
        "rebuilt_rows": 0,
        "rebuilt_cols": 0
    }
    # Step 1: Validate cell_boxes
    valid_cells, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox)
    stats["valid_cell_count"] = validation_stats["valid"]
    stats["validation"] = validation_stats
    # Step 2: Parse raw OCR regions in table area
    ocr_regions = self.parse_raw_ocr_regions(raw_ocr_regions, table_bbox)
    stats["ocr_regions_in_table"] = len(ocr_regions)
    if not ocr_regions:
        # Nothing to rebuild from: report skip rather than emit an empty table.
        stats["action"] = "skip"
        stats["reason"] = "no_ocr_regions_in_table"
        return None, stats
    # Step 3: Choose rebuild strategy based on cell_boxes validity
    # If validity ratio is too low (< 50%), use pure OCR-based rebuild
    if validation_stats["validity_ratio"] < 0.5 or len(valid_cells) < 2:
        logger.info(
            f"Using pure OCR-based rebuild (validity={validation_stats['validity_ratio']:.2%})"
        )
        return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    # Otherwise, use hybrid cell_boxes + OCR rebuild
    return self._rebuild_with_cell_boxes(valid_cells, ocr_regions, stats, table_bbox)
def _rebuild_from_ocr_only(
    self,
    ocr_regions: List[OCRTextRegion],
    table_bbox: List[float],
    stats: Dict[str, Any]
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
    """
    Rebuild table using only OCR regions (when cell_boxes are unreliable).

    Strategy:
    1. Detect column boundary from OCR x-coordinates
    2. Cluster OCR regions by Y coordinate into rows
    3. Split each row into left/right columns

    Args:
        ocr_regions: OCR text regions already filtered to the table area
        table_bbox: Table bounding box [x0, y0, x1, y1]
        stats: Mutable stats dict, updated in place

    Returns:
        Tuple of (rebuilt_table_dict or None, stats)
    """
    if not ocr_regions:
        stats["action"] = "skip"
        stats["reason"] = "no_ocr_regions"
        return None, stats
    # Step 1: Detect column split point by analyzing x-coordinates.
    # (The previous unpacking of table_bbox into local coords was dead
    # code and has been removed; the helper takes table_bbox directly.)
    col_split_x = self._detect_column_split(ocr_regions, table_bbox)
    logger.debug(f"Detected column split at x={col_split_x}")
    # Step 2: Cluster OCR regions by Y coordinate into rows.
    # Use smaller threshold (12px) to properly separate rows.
    row_threshold = 12.0
    sorted_ocr = sorted(ocr_regions, key=lambda r: r.center_y)
    rows = []
    current_row = [sorted_ocr[0]]
    for ocr in sorted_ocr[1:]:
        # Chain rows: compare against the previous region, not the row start.
        if ocr.center_y - current_row[-1].center_y <= row_threshold:
            current_row.append(ocr)
        else:
            rows.append(current_row)
            current_row = [ocr]
    rows.append(current_row)
    logger.debug(f"Detected {len(rows)} rows")
    # Step 3: Analyze column structure — require at least two regions on
    # each side of the split before committing to a two-column layout.
    left_regions = [r for r in ocr_regions if r.x0 < col_split_x]
    right_regions = [r for r in ocr_regions if r.x0 >= col_split_x]
    num_cols = 2 if len(left_regions) >= 2 and len(right_regions) >= 2 else 1
    # Step 4: Build cells for each row (shared helper logic replaces the
    # previously duplicated left/right cell-construction code).
    rebuilt_cells = []
    for row_idx, row_ocrs in enumerate(rows):
        row_ocrs_sorted = sorted(row_ocrs, key=lambda r: r.center_x)
        if num_cols == 2:
            column_groups = [
                (0, [r for r in row_ocrs_sorted if r.x0 < col_split_x]),
                (1, [r for r in row_ocrs_sorted if r.x0 >= col_split_x]),
            ]
        else:
            column_groups = [(0, row_ocrs_sorted)]
        for col_idx, col_ocrs in column_groups:
            if not col_ocrs:
                continue
            rebuilt_cells.append({
                "row": row_idx,
                "col": col_idx,
                "row_span": 1,
                "col_span": 1,
                "content": " ".join(r.text for r in col_ocrs),
                "bbox": [
                    min(r.x0 for r in col_ocrs),
                    min(r.y0 for r in col_ocrs),
                    max(r.x1 for r in col_ocrs),
                    max(r.y1 for r in col_ocrs)
                ]
            })
    num_rows = len(rows)
    stats["rebuilt_rows"] = num_rows
    stats["rebuilt_cols"] = num_cols
    # Build result
    rebuilt_table = {
        "rows": num_rows,
        "cols": num_cols,
        "cells": rebuilt_cells,
        "html": self._generate_html(rebuilt_cells, num_rows, num_cols),
        "rebuild_source": "pure_ocr"
    }
    stats["action"] = "rebuilt"
    stats["reason"] = "pure_ocr_success"
    stats["rebuilt_cell_count"] = len(rebuilt_cells)
    logger.info(
        f"Table rebuilt (pure OCR): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells"
    )
    return rebuilt_table, stats
def _detect_column_split(
    self,
    ocr_regions: List[OCRTextRegion],
    table_bbox: List[float]
) -> float:
    """
    Detect the column split point by analyzing x-coordinates.

    For tables with left/right structure (e.g., property-value tables),
    there's usually a gap between left column text and right column text;
    the midpoint of the widest gap (if over 50px) becomes the split.
    Falls back to the horizontal center of the table otherwise.
    """
    fallback = (table_bbox[0] + table_bbox[2]) / 2
    if not ocr_regions:
        return fallback
    # Distinct left edges of all text regions, sorted left to right.
    left_edges = sorted(set(round(r.x0) for r in ocr_regions))
    if len(left_edges) < 2:
        return fallback
    best_gap = 0
    split = fallback
    for prev_edge, next_edge in zip(left_edges, left_edges[1:]):
        gap = next_edge - prev_edge
        # Require a minimum 50px gap to treat it as a column boundary.
        if gap > best_gap and gap > 50:
            best_gap = gap
            split = (prev_edge + next_edge) / 2
    # best_gap stays 0 unless a >50px gap was found, so this mirrors the
    # "no clear gap" fallback.
    return split if best_gap > 50 else fallback
def _rebuild_with_cell_boxes(
    self,
    valid_cells: List[CellBox],
    ocr_regions: List[OCRTextRegion],
    stats: Dict[str, Any],
    table_bbox: Optional[List[float]] = None
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Rebuild table using cell_boxes structure + OCR content.

    Falls back to the pure-OCR rebuild when the clustered grid looks
    implausible (too many columns / too sparse) or when too few cells
    receive any OCR content — but only if table_bbox was provided.
    """
    # Step 3: Cluster cells into grid
    row_boundaries, col_boundaries, cell_grid = self.cluster_cells_into_grid(valid_cells)
    # N boundaries delimit N-1 rows/cols; degenerate boundary lists count as 1.
    num_rows = len(row_boundaries) - 1 if len(row_boundaries) > 1 else 1
    num_cols = len(col_boundaries) - 1 if len(col_boundaries) > 1 else 1
    # Quality check: if hybrid produces too many columns or sparse grid, fall back to pure OCR
    # A well-formed table typically has 2-5 columns. Too many columns indicates poor clustering.
    total_expected_cells = num_rows * num_cols
    if num_cols > 5 or total_expected_cells > 100:
        logger.info(
            f"Hybrid mode produced {num_rows}x{num_cols} grid (too sparse), "
            f"falling back to pure OCR mode"
        )
        # NOTE(review): when table_bbox is None this logs a fallback but
        # then continues with the sparse grid — confirm whether that is
        # intentional or whether it should skip instead.
        if table_bbox:
            return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    stats["rebuilt_rows"] = num_rows
    stats["rebuilt_cols"] = num_cols
    # Step 4: Assign OCR text to cells
    cell_ocr_map = self.assign_ocr_to_cells(
        valid_cells, ocr_regions, row_boundaries, col_boundaries
    )
    # Step 5: Build rebuilt cells
    rebuilt_cells = []
    for (row, col), ocr_list in cell_ocr_map.items():
        # Sort OCR regions by position (top to bottom, left to right)
        sorted_ocr = sorted(ocr_list, key=lambda r: (r.center_y, r.center_x))
        content = " ".join(r.text for r in sorted_ocr)
        # Find the cell bbox for this position (first cell mapping to the
        # same grid slot wins, matching cluster_cells_into_grid).
        cell_bbox = None
        for cell in valid_cells:
            cell_row = self._find_position(cell.y0, row_boundaries)
            cell_col = self._find_position(cell.x0, col_boundaries)
            if cell_row == row and cell_col == col:
                cell_bbox = [cell.x0, cell.y0, cell.x1, cell.y1]
                break
        rebuilt_cells.append({
            "row": row,
            "col": col,
            "row_span": 1,
            "col_span": 1,
            "content": content,
            "bbox": cell_bbox
        })
    # Quality check: if too few cells have content compared to grid size, fall back to pure OCR
    content_ratio = len(rebuilt_cells) / total_expected_cells if total_expected_cells > 0 else 0
    if content_ratio < 0.3 and table_bbox:
        logger.info(
            f"Hybrid mode has low content ratio ({content_ratio:.2%}), "
            f"falling back to pure OCR mode"
        )
        return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    # Build result
    rebuilt_table = {
        "rows": num_rows,
        "cols": num_cols,
        "cells": rebuilt_cells,
        "html": self._generate_html(rebuilt_cells, num_rows, num_cols),
        "rebuild_source": "cell_boxes_hybrid"
    }
    stats["action"] = "rebuilt"
    stats["reason"] = "hybrid_success"
    stats["rebuilt_cell_count"] = len(rebuilt_cells)
    logger.info(
        f"Table rebuilt (hybrid): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells "
        f"(from {len(ocr_regions)} OCR regions)"
    )
    return rebuilt_table, stats
def _generate_html(
self,
cells: List[Dict[str, Any]],
num_rows: int,
num_cols: int
) -> str:
"""Generate HTML table from rebuilt cells."""
# Create grid
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
for cell in cells:
row, col = cell["row"], cell["col"]
if 0 <= row < num_rows and 0 <= col < num_cols:
grid[row][col] = cell["content"]
# Build HTML
html_parts = ["<html><body><table>"]
for row_idx in range(num_rows):
html_parts.append("<tr>")
for col_idx in range(num_cols):
content = grid[row_idx][col_idx] or ""
tag = "th" if row_idx == 0 else "td"
html_parts.append(f"<{tag}>{content}</{tag}>")
html_parts.append("</tr>")
html_parts.append("</table></body></html>")
return "".join(html_parts)
def should_rebuild(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    original_html: str = ""
) -> Tuple[bool, str]:
    """
    Determine if table should be rebuilt based on cell_boxes validity.

    Args:
        cell_boxes: List of cell bounding boxes
        table_bbox: Table bounding box
        original_html: Original HTML from PP-StructureV3 (unused here;
            kept for interface compatibility)

    Returns:
        Tuple of (should_rebuild, reason). Rebuild is requested for every
        table that has any cell_boxes at all.
    """
    if not cell_boxes:
        return False, "no_cell_boxes"
    # Run validation for its stats (and its logging side effect).
    _, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox)
    # Always rebuild if ANY cells are invalid - PP-Structure HTML often merges cells incorrectly
    # even when most cell_boxes are valid
    if validation_stats["invalid"] > 0:
        return True, f"invalid_cells_{validation_stats['invalid']}/{validation_stats['total']}"
    # NOTE: a separate boundary-violation check previously lived here, but
    # it was unreachable: every boundary violation is also counted in
    # validation_stats["invalid"], so the branch above always fires first.
    # Removed as dead code.
    # Also rebuild to ensure OCR-based content is used instead of PP-Structure HTML
    # PP-Structure's HTML often has incorrect cell merging
    return True, "ocr_content_preferred"

View File

@@ -0,0 +1,664 @@
"""
Simple Text Region Renderer
Renders raw OCR text regions directly to PDF at their detected positions,
with rotation correction based on bbox quadrilateral geometry.
This approach bypasses complex table structure reconstruction and simply
places text at the positions detected by PaddleOCR.
"""
import math
import logging
from typing import Dict, List, Optional, Set, Tuple
from reportlab.pdfgen import canvas
from reportlab.lib.colors import black
logger = logging.getLogger(__name__)
class TextRegionRenderer:
    """
    Render raw OCR text regions to PDF with position and rotation correction.

    This renderer takes the raw OCR output (text + quadrilateral bbox) and
    renders text at the correct position. Small rotation angles are ignored
    (straightened) to produce clean, aligned text output.
    """

    # Minimum font size to prevent illegible text
    MIN_FONT_SIZE = 6.0
    # Maximum font size to prevent oversized text
    MAX_FONT_SIZE = 72.0
    # Font size estimation factor (font height relative to bbox height)
    FONT_SIZE_FACTOR = 0.75
    # Rotation angle threshold - angles smaller than this are straightened to 0
    # This compensates for slight scan skew and produces cleaner output
    ROTATION_STRAIGHTEN_THRESHOLD = 10.0  # degrees
    # IoA (Intersection over Area) threshold for text-image overlap detection
    # If text bbox overlaps with image by more than this ratio, skip the text
    IOA_OVERLAP_THRESHOLD = 0.3  # 30% overlap

    def __init__(
        self,
        font_name: str = 'NotoSansSC',
        debug: bool = False,
        straighten_threshold: Optional[float] = None,
        ioa_threshold: Optional[float] = None
    ):
        """
        Initialize the text region renderer.

        Args:
            font_name: Name of the registered font to use
            debug: Enable debug logging
            straighten_threshold: Override rotation straightening threshold (degrees)
            ioa_threshold: Override IoA overlap threshold for text-image avoidance
        """
        self.font_name = font_name
        self.debug = debug
        # Fix: compare against None instead of using `or`, so an explicit
        # 0.0 override is honored rather than silently replaced by the default.
        self.straighten_threshold = (
            self.ROTATION_STRAIGHTEN_THRESHOLD if straighten_threshold is None
            else straighten_threshold
        )
        self.ioa_threshold = (
            self.IOA_OVERLAP_THRESHOLD if ioa_threshold is None
            else ioa_threshold
        )

    def calculate_rotation(self, bbox: List[List[float]]) -> float:
        """
        Calculate text rotation angle from bbox quadrilateral.

        The bbox is a quadrilateral with 4 corner points in order:
        [top-left, top-right, bottom-right, bottom-left]

        Returns angle in degrees (counter-clockwise from horizontal).
        Positive angle means text is tilted upward to the right.

        NOTE: Small angles (< straighten_threshold) will be treated as 0
        during rendering to produce clean, aligned output.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Rotation angle in degrees
        """
        if len(bbox) < 2:
            return 0.0
        # Top-left to top-right vector (top edge)
        dx = bbox[1][0] - bbox[0][0]
        dy = bbox[1][1] - bbox[0][1]
        # Calculate angle (atan2 returns radians, convert to degrees)
        # Note: In image coordinates, Y increases downward
        # We negate dy to get the conventional angle
        angle_rad = math.atan2(-dy, dx)
        angle_deg = math.degrees(angle_rad)
        if self.debug:
            logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°")
        return angle_deg

    def estimate_font_size(
        self,
        bbox: List[List[float]],
        text: str,
        scale_factor: float = 1.0
    ) -> float:
        """
        Estimate appropriate font size from bbox dimensions.

        Uses the bbox height as the primary indicator, with adjustment
        for the typical font-to-bbox ratio.

        Args:
            bbox: List of 4 [x, y] coordinate pairs
            text: The text content (for width-based adjustments)
            scale_factor: Coordinate scaling factor

        Returns:
            Estimated font size in points
        """
        if len(bbox) < 4:
            return 12.0  # Default font size
        # Calculate bbox height (average of left and right edges)
        left_height = math.dist(bbox[0], bbox[3])
        right_height = math.dist(bbox[1], bbox[2])
        avg_height = (left_height + right_height) / 2
        # Apply scale factor and font size ratio
        font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR
        # Clamp to reasonable range
        font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))
        if self.debug:
            logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, "
                         f"scale={scale_factor:.3f}, font={font_size:.1f}pt")
        return font_size

    def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]:
        """
        Calculate the center point of a bbox quadrilateral.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (center_x, center_y)
        """
        if len(bbox) < 4:
            return (0.0, 0.0)
        center_x = sum(p[0] for p in bbox) / 4
        center_y = sum(p[1] for p in bbox) / 4
        return (center_x, center_y)

    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates
        """
        if len(bbox) < 4:
            return (0.0, 0.0, 0.0, 0.0)
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

    def get_bbox_left_baseline(
        self,
        bbox: List[List[float]]
    ) -> Tuple[float, float]:
        """
        Get the left baseline point for text rendering.

        For left-aligned text, we use the bottom-left corner as the
        baseline starting point (text baseline is at the bottom).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x, y) for the left baseline point
        """
        if len(bbox) < 4:
            return (0.0, 0.0)
        # Use bottom-left corner for baseline
        # bbox[3] is bottom-left in the standard ordering
        x = bbox[3][0]
        y = bbox[3][1]
        return (x, y)

    def calculate_ioa(
        self,
        text_rect: Tuple[float, float, float, float],
        image_rect: Tuple[float, float, float, float]
    ) -> float:
        """
        Calculate Intersection over Area (IoA) of text bbox with image bbox.

        IoA = intersection_area / text_area
        This measures how much of the text region overlaps with the image.

        Args:
            text_rect: Text bbox as (x0, y0, x1, y1)
            image_rect: Image bbox as (x0, y0, x1, y1)

        Returns:
            IoA ratio (0.0 to 1.0)
        """
        tx0, ty0, tx1, ty1 = text_rect
        ix0, iy0, ix1, iy1 = image_rect
        # Calculate text area
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0
        # Calculate intersection
        inter_x0 = max(tx0, ix0)
        inter_y0 = max(ty0, iy0)
        inter_x1 = min(tx1, ix1)
        inter_y1 = min(ty1, iy1)
        if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
            return 0.0  # No intersection
        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area

    def is_overlapping_exclusion_zones(
        self,
        bbox: List[List[float]],
        exclusion_zones: List[Tuple[float, float, float, float]]
    ) -> bool:
        """
        Check if text bbox overlaps significantly with any exclusion zone.

        Args:
            bbox: Text bbox as quadrilateral
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid

        Returns:
            True if text should be skipped due to overlap
        """
        if not exclusion_zones:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        for zone in exclusion_zones:
            ioa = self.calculate_ioa(text_rect, zone)
            if ioa >= self.ioa_threshold:
                if self.debug:
                    logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}")
                return True
        return False

    def is_inside_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        threshold: float = 0.5
    ) -> bool:
        """
        Check if text bbox is inside a zone (for collecting chart texts).

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            threshold: Minimum IoA to consider "inside"

        Returns:
            True if text is inside the zone
        """
        text_rect = self.get_bbox_as_rect(bbox)
        ioa = self.calculate_ioa(text_rect, zone)
        return ioa >= threshold

    def is_axis_label(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 50.0
    ) -> bool:
        """
        Check if text bbox is an axis label for a chart/image zone.

        Axis labels are typically:
        - Vertical text to the LEFT of the chart (Y-axis label)
        - Horizontal text BELOW the chart (X-axis label)

        Args:
            bbox: Text bbox as quadrilateral
            zone: Chart/image zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone edge to be considered axis label

        Returns:
            True if text appears to be an axis label for this zone
        """
        if len(bbox) < 4:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone
        # Calculate text dimensions
        text_width = tx1 - tx0
        text_height = ty1 - ty0
        # Check for Y-axis label: vertical text to the LEFT of zone
        # - Text is to the left of zone (tx1 <= zx0 + small overlap)
        # - Text's Y range overlaps with zone's Y range
        # - Text is taller than wide (aspect ratio > 2) OR very narrow
        is_left_of_zone = tx1 <= zx0 + margin and tx1 >= zx0 - margin
        y_overlaps = not (ty1 < zy0 or ty0 > zy1)
        is_vertical_text = text_height > text_width * 2
        if is_left_of_zone and y_overlaps and is_vertical_text:
            if self.debug:
                logger.debug(f"Detected Y-axis label: text is left of zone, vertical")
            return True
        # Check for X-axis label: horizontal text BELOW the zone
        # - Text is below zone (ty0 >= zy1 - small overlap)
        # - Text's X range overlaps with zone's X range
        # - Text is wider than tall (normal horizontal text)
        is_below_zone = ty0 >= zy1 - margin and ty0 <= zy1 + margin
        x_overlaps = not (tx1 < zx0 or tx0 > zx1)
        is_horizontal_text = text_width > text_height
        if is_below_zone and x_overlaps and is_horizontal_text:
            if self.debug:
                logger.debug(f"Detected X-axis label: text is below zone, horizontal")
            return True
        return False

    def is_near_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 100.0
    ) -> bool:
        """
        Check if text bbox is near (within margin) of a zone.

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone to be considered "near"

        Returns:
            True if text is near the zone
        """
        if len(bbox) < 4:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone
        # Expand zone by margin
        expanded_zone = (zx0 - margin, zy0 - margin, zx1 + margin, zy1 + margin)
        # Check if text overlaps with expanded zone
        ex0, ey0, ex1, ey1 = expanded_zone
        return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1)

    def collect_zone_texts(
        self,
        regions: List[Dict],
        zones: List[Tuple[float, float, float, float]],
        threshold: float = 0.5,
        include_axis_labels: bool = True
    ) -> Set[str]:
        """
        Collect text content from regions inside zones or identified as axis labels.

        This set is used during rendering for position-aware deduplication:
        - Text that matches this set AND is near a zone will be skipped
        - Text that matches but is far from zones will still be rendered

        Args:
            regions: List of raw OCR region dicts
            zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes)
            threshold: Minimum IoA to consider text as "inside" zone
            include_axis_labels: Also collect axis labels adjacent to zones

        Returns:
            Set of text strings found inside zones or as axis labels
        """
        zone_texts = set()
        for region in regions:
            text = region.get('text', '').strip()
            bbox = region.get('bbox', [])
            if not text or len(bbox) < 4:
                continue
            for zone in zones:
                # Check if inside zone
                if self.is_inside_zone(bbox, zone, threshold):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (inside): '{text}'")
                    break
                # Check if it's an axis label
                if include_axis_labels and self.is_axis_label(bbox, zone):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (axis label): '{text}'")
                    break
        return zone_texts

    def render_text_region(
        self,
        pdf_canvas: "canvas.Canvas",
        region: Dict,
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> Tuple[bool, str]:
        """
        Render a single OCR text region to the PDF canvas.

        Handles coordinate transformation from image coordinates (origin top-left)
        to PDF coordinates (origin bottom-left).

        Small rotation angles are straightened to produce clean output.
        Text overlapping with exclusion zones (images) is skipped.

        Deduplication logic (position-aware):
        - If text matches zone_texts AND is NEAR the zone (or is axis label),
          skip it to avoid duplicate chart labels
        - Text far from zones is rendered even if it matches zone content

        Args:
            pdf_canvas: ReportLab canvas to draw on
            region: Raw OCR region dict with 'text' and 'bbox'
            page_height: Height of the PDF page (for Y-flip)
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (dedupe only if near zone)

        Returns:
            Tuple of (success: bool, skip_reason: str)
            - success=True, skip_reason='' if rendered successfully
            - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped
        """
        text = region.get('text', '').strip()
        bbox = region.get('bbox', [])
        if not text or len(bbox) < 4:
            return (False, '')
        # Check if text overlaps with exclusion zones (images/charts)
        if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones):
            if self.debug:
                logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap")
            return (False, 'overlap')
        # Check if text should be deduplicated based on position
        # Only skip if text matches zone content AND is near a zone (or is axis label)
        if zone_texts and text in zone_texts and exclusion_zones:
            for zone in exclusion_zones:
                # Check if it's an axis label for this zone
                if self.is_axis_label(bbox, zone):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone")
                    return (False, 'dedupe')
                # Check if it's near this zone (for zone-internal text deduplication)
                if self.is_near_zone(bbox, zone, margin=100.0):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone")
                    return (False, 'dedupe')
        try:
            # Calculate text properties
            rotation = self.calculate_rotation(bbox)
            font_size = self.estimate_font_size(bbox, text, scale_y)
            # Straighten small rotations for cleaner output
            # Only apply rotation for significant angles (e.g., 90° rotated text)
            if abs(rotation) < self.straighten_threshold:
                rotation = 0.0
            # Get left baseline point in image coordinates
            img_x, img_y = self.get_bbox_left_baseline(bbox)
            # Apply scaling
            scaled_x = img_x * scale_x
            scaled_y = img_y * scale_y
            # Convert to PDF coordinates (flip Y axis)
            pdf_x = scaled_x
            pdf_y = page_height - scaled_y
            # Save canvas state
            pdf_canvas.saveState()
            # Try to set font with fallback
            try:
                pdf_canvas.setFont(self.font_name, font_size)
            except KeyError:
                # Font not registered, try fallback fonts
                fallback_fonts = ['Helvetica', 'Times-Roman', 'Courier']
                font_set = False
                for fallback in fallback_fonts:
                    try:
                        pdf_canvas.setFont(fallback, font_size)
                        font_set = True
                        if self.debug:
                            logger.debug(f"Using fallback font: {fallback}")
                        break
                    except KeyError:
                        continue
                if not font_set:
                    logger.warning(f"No available font found, skipping region")
                    pdf_canvas.restoreState()
                    return (False, 'error')
            pdf_canvas.setFillColor(black)
            # Apply rotation if needed (only for significant angles like 90°)
            if abs(rotation) > 0.5:
                pdf_canvas.translate(pdf_x, pdf_y)
                pdf_canvas.rotate(rotation)
                pdf_canvas.drawString(0, 0, text)
            else:
                pdf_canvas.drawString(pdf_x, pdf_y, text)
            # Restore canvas state
            pdf_canvas.restoreState()
            if self.debug:
                logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), "
                             f"rot={rotation:.1f}°, size={font_size:.1f}pt")
            return (True, '')
        except Exception as e:
            logger.warning(f"Failed to render text region: {e}")
            return (False, 'error')

    def render_all_regions(
        self,
        pdf_canvas: "canvas.Canvas",
        regions: List[Dict],
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        page_filter: Optional[int] = None,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> int:
        """
        Render all OCR text regions to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas to draw on
            regions: List of raw OCR region dicts
            page_height: Height of the PDF page
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            page_filter: If set, only render regions for this page index
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (for position-aware deduplication)

        Returns:
            Number of regions successfully rendered
        """
        rendered_count = 0
        skipped_overlap = 0
        skipped_dedupe = 0
        for region in regions:
            # Filter by page if specified
            if page_filter is not None:
                region_page = region.get('page', 0)
                if region_page != page_filter:
                    continue
            success, skip_reason = self.render_text_region(
                pdf_canvas, region, page_height, scale_x, scale_y,
                exclusion_zones, zone_texts
            )
            if success:
                rendered_count += 1
            elif skip_reason == 'overlap':
                skipped_overlap += 1
            elif skip_reason == 'dedupe':
                skipped_dedupe += 1
        # Log results with skip counts
        total_processed = rendered_count + skipped_overlap + skipped_dedupe
        skip_parts = []
        if skipped_overlap > 0:
            skip_parts.append(f"{skipped_overlap} overlap")
        if skipped_dedupe > 0:
            skip_parts.append(f"{skipped_dedupe} dedupe")
        if skip_parts:
            logger.info(f"Rendered {rendered_count}/{total_processed} text regions "
                        f"(skipped: {', '.join(skip_parts)})")
        else:
            logger.info(f"Rendered {rendered_count}/{len(regions)} text regions")
        return rendered_count
def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]:
    """
    Load raw OCR regions from the result directory.

    The regions are read from a JSON file named
    ``{task_id}_edit_page_{page_num}_raw_ocr_regions.json`` inside
    ``result_dir``.

    Args:
        result_dir: Path to the result directory
        task_id: Task ID
        page_num: Page number (1-indexed)

    Returns:
        List of raw OCR region dictionaries; an empty list if the file
        is missing or cannot be read/parsed.
    """
    from pathlib import Path
    import json

    # Construct filename pattern
    filename = f"{task_id}_edit_page_{page_num}_raw_ocr_regions.json"
    file_path = Path(result_dir) / filename
    if not file_path.exists():
        logger.warning(f"Raw OCR regions file not found: {file_path}")
        return []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            regions = json.load(f)
        # Bug fix: this message previously logged the literal text
        # "(unknown)" instead of the actual file path.
        logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path}")
        return regions
    except Exception as e:
        logger.error(f"Failed to load raw OCR regions: {e}")
        return []