fix: improve OCR track table rendering with Paragraph wrapping

Changes:
- Remove PDF caching to ensure code changes take effect
- Add PDF page rotation handling (swap width and height when /Rotate is 90 or 270 degrees)
- Add dict bbox format support for UnifiedDocument
- Use Paragraph objects for table cells to enable text auto-wrapping
- Align OCR track table rendering logic with Direct track (no fixed rowHeights)

Known issue: PP-StructureV3 does not provide cell bboxes in its output
(block_content contains only an HTML string; there is no res['boxes'] as in the old PPStructure)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 09:22:07 +08:00
parent 2861f54838
commit 86bbea6fbf

View File

@@ -885,10 +885,8 @@ class PDFGeneratorService:
True if successful, False otherwise True if successful, False otherwise
""" """
try: try:
# Check if PDF already exists (caching) # Note: Removed PDF caching - always regenerate to ensure latest code changes take effect
if output_path.exists(): # If caching is needed, implement at a higher level with proper cache invalidation
logger.info(f"PDF already exists: {output_path.name}")
return True
# Get text regions # Get text regions
text_regions = ocr_data.get('text_regions', []) text_regions = ocr_data.get('text_regions', [])
@@ -1223,6 +1221,21 @@ class PDFGeneratorService:
mediabox = page.mediabox mediabox = page.mediabox
width_pt = float(mediabox.width) width_pt = float(mediabox.width)
height_pt = float(mediabox.height) height_pt = float(mediabox.height)
# IMPORTANT: Consider page rotation!
# PDF pages can have /Rotate attribute (0, 90, 180, 270)
# When rotation is 90 or 270 degrees, width and height should be swapped
# because pdf2image and PDF viewers apply this rotation when rendering
rotation = page.get('/Rotate', 0)
if rotation is None:
rotation = 0
rotation = int(rotation) % 360
if rotation in (90, 270):
# Swap width and height for 90/270 degree rotation
width_pt, height_pt = height_pt, width_pt
logger.info(f"Page {page_idx}: Rotation={rotation}°, swapped dimensions to {width_pt:.1f} x {height_pt:.1f}")
page_sizes[page_idx] = (width_pt, height_pt) page_sizes[page_idx] = (width_pt, height_pt)
logger.info(f"Extracted dimensions from PDF: {total_pages} pages") logger.info(f"Extracted dimensions from PDF: {total_pages} pages")
@@ -1256,9 +1269,23 @@ class PDFGeneratorService:
return page_sizes[0] return page_sizes[0]
return None return None
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
"""將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]""" """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
try: try:
if bbox is None:
return None
# Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
if isinstance(bbox, dict):
if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1'])
else:
logger.warning(f"Dict bbox 缺少必要欄位: {bbox}")
return None
if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
return None
if isinstance(bbox[0], (list, tuple)): if isinstance(bbox[0], (list, tuple)):
# 處理多邊形 [[x, y], ...] # 處理多邊形 [[x, y], ...]
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2] x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
@@ -1268,7 +1295,7 @@ class PDFGeneratorService:
return min(x_coords), min(y_coords), max(x_coords), max(y_coords) return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4: elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
# 處理 [x1, y1, x2, y2] # 處理 [x1, y1, x2, y2]
return bbox[0], bbox[1], bbox[2], bbox[3] return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
else: else:
logger.warning(f"未知的 bbox 格式: {bbox}") logger.warning(f"未知的 bbox 格式: {bbox}")
return None return None
@@ -1337,14 +1364,56 @@ class PDFGeneratorService:
return not no_overlap return not no_overlap
def _calculate_overlap_ratio(self, text_bbox_data: Dict, avoid_bbox_data: Dict) -> float:
    """
    Compute the fraction of a text region that is covered by an avoid region.

    Args:
        text_bbox_data: Region dict for the text; its 'bbox' entry is read.
        avoid_bbox_data: Region dict for the area to avoid (e.g. a table or
            image); its 'bbox' entry is read.

    Returns:
        Intersection area divided by the text region's area. Returns 0.0
        when either bbox cannot be resolved by ``_get_bbox_coords``, when
        the text box has no positive area, or when the boxes do not
        intersect.
    """
    text_box = self._get_bbox_coords(text_bbox_data.get('bbox'))
    avoid_box = self._get_bbox_coords(avoid_bbox_data.get('bbox'))
    if not (text_box and avoid_box):
        return 0.0

    t_left, t_top, t_right, t_bottom = text_box
    a_left, a_top, a_right, a_bottom = avoid_box

    # The ratio is relative to the text box, so a degenerate text box
    # cannot yield a meaningful value.
    text_area = (t_right - t_left) * (t_bottom - t_top)
    if text_area <= 0:
        return 0.0

    # Extent of the intersection rectangle along each axis; a non-positive
    # extent on either axis means the boxes only touch or are disjoint.
    overlap_w = min(t_right, a_right) - max(t_left, a_left)
    overlap_h = min(t_bottom, a_bottom) - max(t_top, a_top)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0

    return (overlap_w * overlap_h) / text_area
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], overlap_threshold: float = 0.5) -> List[Dict]:
"""
過濾掉與 'regions_to_avoid'(例如表格、圖片)顯著重疊的文字區域。
使用重疊比例閾值來判斷是否過濾,避免過濾掉僅相鄰但不重疊的文字。
Args: Args:
text_regions: 文字區域列表 text_regions: 文字區域列表
regions_to_avoid: 需要避免的區域列表(表格、圖片) regions_to_avoid: 需要避免的區域列表(表格、圖片)
tolerance: 容錯值(像素),增加到 10.0 以更好地處理邊界情況 overlap_threshold: 重疊比例閾值 (0.0-1.0),只有當文字區域
與避免區域的重疊比例超過此閾值時才會被過濾
預設 0.5 表示超過 50% 重疊才過濾
Returns: Returns:
過濾後的文字區域列表 過濾後的文字區域列表
@@ -1354,17 +1423,24 @@ class PDFGeneratorService:
for text_region in text_regions: for text_region in text_regions:
should_filter = False should_filter = False
max_overlap = 0.0
for avoid_region in regions_to_avoid: for avoid_region in regions_to_avoid:
# 使用重疊檢測:只要有任何重疊就過濾掉 # 計算重疊比例
if self._bbox_overlaps(text_region, avoid_region, tolerance=tolerance): overlap_ratio = self._calculate_overlap_ratio(text_region, avoid_region)
max_overlap = max(max_overlap, overlap_ratio)
# 只有當重疊比例超過閾值時才過濾
if overlap_ratio > overlap_threshold:
should_filter = True should_filter = True
filtered_count += 1 filtered_count += 1
logger.debug(f"過濾掉重疊文字: {text_region.get('text', '')[:20]}...") logger.debug(f"過濾掉重疊文字 (重疊比例: {overlap_ratio:.1%}): {text_region.get('text', '')[:30]}...")
break # 找到一個重疊區域就足夠了 break
if not should_filter: if not should_filter:
filtered_text.append(text_region) filtered_text.append(text_region)
if max_overlap > 0:
logger.debug(f"保留文字 (最大重疊比例: {max_overlap:.1%}): {text_region.get('text', '')[:30]}...")
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}") logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}")
return filtered_text return filtered_text
@@ -1391,17 +1467,42 @@ class PDFGeneratorService:
bbox = region.get('bbox', []) bbox = region.get('bbox', [])
confidence = region.get('confidence', 1.0) confidence = region.get('confidence', 1.0)
if not text or not bbox or len(bbox) < 4: if not text or not bbox:
return return
try: try:
# bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # Handle different bbox formats
# Points: top-left, top-right, bottom-right, bottom-left if isinstance(bbox, dict):
# OCR coordinates: origin (0,0) at top-left, Y increases downward # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
ocr_x_left = float(bbox['x0'])
ocr_y_top = float(bbox['y0'])
ocr_x_right = float(bbox['x1'])
ocr_y_bottom = float(bbox['y1'])
else:
logger.warning(f"Dict bbox missing required keys: {bbox}")
return
elif isinstance(bbox, list):
if len(bbox) < 4:
return
# Polygon format [[x,y], [x,y], [x,y], [x,y]] (4 points)
if isinstance(bbox[0], list):
ocr_x_left = bbox[0][0] # Left X ocr_x_left = bbox[0][0] # Left X
ocr_y_top = bbox[0][1] # Top Y in OCR coordinates ocr_y_top = bbox[0][1] # Top Y in OCR coordinates
ocr_x_right = bbox[2][0] # Right X ocr_x_right = bbox[2][0] # Right X
ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates
# Simple list format [x0, y0, x1, y1]
elif isinstance(bbox[0], (int, float)):
ocr_x_left = bbox[0]
ocr_y_top = bbox[1]
ocr_x_right = bbox[2]
ocr_y_bottom = bbox[3]
else:
logger.warning(f"Unexpected bbox list format: {bbox}")
return
else:
logger.warning(f"Invalid bbox format: {bbox}")
return
logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}") logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}")
@@ -1489,13 +1590,17 @@ class PDFGeneratorService:
if settings.pdf_enable_bbox_debug: if settings.pdf_enable_bbox_debug:
pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent
pdf_canvas.setLineWidth(0.5) pdf_canvas.setLineWidth(0.5)
# Transform all bbox points to PDF coordinates (apply scaling first) # Use already-extracted coordinates (works for all bbox formats)
pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox] # Draw rectangle using the scaled coordinates
pdf_x1 = ocr_x_left * scale_w
pdf_y1 = page_height - ocr_y_top * scale_h
pdf_x2 = ocr_x_right * scale_w
pdf_y2 = page_height - ocr_y_bottom * scale_h
# Draw bbox rectangle # Draw bbox rectangle
for i in range(4): pdf_canvas.line(pdf_x1, pdf_y1, pdf_x2, pdf_y1) # top
x1, y1 = pdf_points[i] pdf_canvas.line(pdf_x2, pdf_y1, pdf_x2, pdf_y2) # right
x2, y2 = pdf_points[(i + 1) % 4] pdf_canvas.line(pdf_x2, pdf_y2, pdf_x1, pdf_y2) # bottom
pdf_canvas.line(x1, y1, x2, y2) pdf_canvas.line(pdf_x1, pdf_y2, pdf_x1, pdf_y1) # left
except Exception as e: except Exception as e:
logger.warning(f"Failed to draw text region '{text[:20]}...': {e}") logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
@@ -1560,7 +1665,17 @@ class PDFGeneratorService:
return return
# Handle different bbox formats # Handle different bbox formats
if isinstance(table_bbox, list) and len(table_bbox) == 4: if isinstance(table_bbox, dict):
# Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
if 'x0' in table_bbox and 'y0' in table_bbox and 'x1' in table_bbox and 'y1' in table_bbox:
ocr_x_left_raw = float(table_bbox['x0'])
ocr_y_top_raw = float(table_bbox['y0'])
ocr_x_right_raw = float(table_bbox['x1'])
ocr_y_bottom_raw = float(table_bbox['y1'])
else:
logger.error(f"Dict bbox missing required keys (x0, y0, x1, y1): {table_bbox}")
return
elif isinstance(table_bbox, list) and len(table_bbox) == 4:
# Simple bbox format [x0, y0, x1, y1] # Simple bbox format [x0, y0, x1, y1]
if isinstance(table_bbox[0], (int, float)): if isinstance(table_bbox[0], (int, float)):
ocr_x_left_raw = table_bbox[0] ocr_x_left_raw = table_bbox[0]
@@ -1595,32 +1710,87 @@ class PDFGeneratorService:
pdf_x = ocr_x_left pdf_x = ocr_x_left
pdf_y = page_height - ocr_y_bottom pdf_y = page_height - ocr_y_bottom
# Build table data for ReportLab # Build table data for ReportLab with proper colspan/rowspan handling
# Convert parsed structure to simple 2D array # First pass: determine the actual grid size by accounting for spans
max_cols = max(len(row['cells']) for row in rows) num_rows = len(rows)
logger.info(f"[表格] {len(rows)}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
reportlab_data = []
# Calculate the actual number of columns as the largest total colspan across all rows
max_cols = 0
for row in rows: for row in rows:
row_data = [] row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
# Create a grid to track occupied cells (for rowspan handling)
# occupied[row][col] = True if cell is occupied by a span from above
occupied = [[False] * max_cols for _ in range(num_rows)]
# Build the 2D data array and collect span commands
reportlab_data = []
span_commands = []
for row_idx, row in enumerate(rows):
row_data = [''] * max_cols
col_idx = 0
for cell in row['cells']: for cell in row['cells']:
# Skip occupied cells (from rowspan above)
while col_idx < max_cols and occupied[row_idx][col_idx]:
col_idx += 1
if col_idx >= max_cols:
break
text = cell['text'].strip() text = cell['text'].strip()
row_data.append(text) colspan = cell.get('colspan', 1)
# Pad row if needed rowspan = cell.get('rowspan', 1)
while len(row_data) < max_cols:
row_data.append('') # Place text in the top-left cell of the span
row_data[col_idx] = text
# Mark cells as occupied for rowspan
for r in range(row_idx, min(row_idx + rowspan, num_rows)):
for c in range(col_idx, min(col_idx + colspan, max_cols)):
if r > row_idx or c > col_idx:
occupied[r][c] = True
# Add SPAN command if cell spans multiple rows/cols
if colspan > 1 or rowspan > 1:
span_end_col = min(col_idx + colspan - 1, max_cols - 1)
span_end_row = min(row_idx + rowspan - 1, num_rows - 1)
span_commands.append(('SPAN', (col_idx, row_idx), (span_end_col, span_end_row)))
col_idx += colspan
reportlab_data.append(row_data) reportlab_data.append(row_data)
# Calculate column widths (equal distribution) # Calculate column widths (equal distribution)
col_widths = [table_width / max_cols] * max_cols col_widths = [table_width / max_cols] * max_cols
# Create ReportLab Table # Create ReportLab Table
# Use smaller font size to fit in bbox # Use smaller font to fit content with auto-wrap
font_size = min(table_height / len(rows) * 0.5, 10) font_size = 8 # Fixed reasonable font size for table content
font_size = max(font_size, 6)
# Create table with font # Create paragraph style for text wrapping in cells
cell_style = ParagraphStyle(
'CellStyle',
fontName=self.font_name if self.font_registered else 'Helvetica',
fontSize=font_size,
leading=font_size * 1.2,
alignment=TA_CENTER,
wordWrap='CJK', # Better wrapping for Chinese text
)
# Convert text to Paragraph objects for auto-wrapping
for row_idx, row_data in enumerate(reportlab_data):
for col_idx, cell_text in enumerate(row_data):
if cell_text:
# Escape HTML special characters and create Paragraph
escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
# Create table WITHOUT fixed row heights - let it auto-size based on content
table = Table(reportlab_data, colWidths=col_widths) table = Table(reportlab_data, colWidths=col_widths)
# Apply table style # Apply table style
@@ -1640,12 +1810,35 @@ class PDFGeneratorService:
style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey) style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size) style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)
# Add span commands for merged cells
for span_cmd in span_commands:
style.add(*span_cmd)
table.setStyle(style) table.setStyle(style)
# Calculate table size logger.info(f"[表格] 套用 {len(span_commands)} 個合併儲存格 (SPAN)")
table.wrapOn(pdf_canvas, table_width, table_height)
# Draw table at position # Calculate actual table size after wrapping
actual_width, actual_height = table.wrapOn(pdf_canvas, table_width, table_height)
logger.info(f"[表格] 目標尺寸: {table_width:.0f}x{table_height:.0f}, 實際尺寸: {actual_width:.0f}x{actual_height:.0f}")
# Scale table to fit bbox if it exceeds the target size
scale_x = table_width / actual_width if actual_width > table_width else 1.0
scale_y = table_height / actual_height if actual_height > table_height else 1.0
scale_factor = min(scale_x, scale_y) # Use smaller scale to fit both dimensions
if scale_factor < 1.0:
logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
# Apply scaling transformation
pdf_canvas.saveState()
pdf_canvas.translate(pdf_x, pdf_y)
pdf_canvas.scale(scale_factor, scale_factor)
# Draw at origin since we've already translated
table.drawOn(pdf_canvas, 0, 0)
pdf_canvas.restoreState()
else:
# Draw table at position without scaling
table.drawOn(pdf_canvas, pdf_x, pdf_y) table.drawOn(pdf_canvas, pdf_x, pdf_y)
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows") logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
@@ -1696,17 +1889,43 @@ class PDFGeneratorService:
# Get bbox for positioning # Get bbox for positioning
bbox = region.get('bbox', []) bbox = region.get('bbox', [])
if not bbox or len(bbox) < 4: if not bbox:
# If no bbox, skip for now
logger.warning(f"No bbox for image {image_path_str}") logger.warning(f"No bbox for image {image_path_str}")
return return
# bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # Handle different bbox formats
# OCR coordinates: origin (0,0) at top-left, Y increases downward if isinstance(bbox, dict):
# Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
ocr_x_left_raw = float(bbox['x0'])
ocr_y_top_raw = float(bbox['y0'])
ocr_x_right_raw = float(bbox['x1'])
ocr_y_bottom_raw = float(bbox['y1'])
else:
logger.warning(f"Dict bbox missing required keys for image: {bbox}")
return
elif isinstance(bbox, list):
if len(bbox) < 4:
logger.warning(f"List bbox too short for image: {bbox}")
return
# Polygon format [[x,y], [x,y], [x,y], [x,y]]
if isinstance(bbox[0], list):
ocr_x_left_raw = bbox[0][0] ocr_x_left_raw = bbox[0][0]
ocr_y_top_raw = bbox[0][1] ocr_y_top_raw = bbox[0][1]
ocr_x_right_raw = bbox[2][0] ocr_x_right_raw = bbox[2][0]
ocr_y_bottom_raw = bbox[2][1] ocr_y_bottom_raw = bbox[2][1]
# Simple list format [x0, y0, x1, y1]
elif isinstance(bbox[0], (int, float)):
ocr_x_left_raw = bbox[0]
ocr_y_top_raw = bbox[1]
ocr_x_right_raw = bbox[2]
ocr_y_bottom_raw = bbox[3]
else:
logger.warning(f"Unexpected bbox list format for image: {bbox}")
return
else:
logger.warning(f"Invalid bbox format for image: {bbox}")
return
logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}") logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")