fix: improve OCR track table rendering with Paragraph wrapping

Changes:
- Remove PDF caching to ensure code changes take effect
- Add PDF rotation handling (swap width and height for 90/270 degree pages)
- Add dict bbox format support for UnifiedDocument
- Use Paragraph objects for table cells to enable text auto-wrapping
- Align OCR track table rendering logic with Direct track (no fixed rowHeights)

Known issue: PP-StructureV3 does not provide cell bbox in its output (block_content only contains an HTML string; there is no res['boxes'] as in the old PPStructure).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 09:22:07 +08:00
parent 2861f54838
commit 86bbea6fbf
1 changed files with 275 additions and 56 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -885,10 +885,8 @@ class PDFGeneratorService:
            True if successful, False otherwise
        """
        try:
-            # Check if PDF already exists (caching)
+            # Note: Removed PDF caching - always regenerate to ensure latest code changes take effect
-            if output_path.exists():
+            # If caching is needed, implement at a higher level with proper cache invalidation
                logger.info(f"PDF already exists: {output_path.name}")
                return True
            # Get text regions
            text_regions = ocr_data.get('text_regions', [])
@@ -1223,6 +1221,21 @@ class PDFGeneratorService:
                        mediabox = page.mediabox
                        width_pt = float(mediabox.width)
                        height_pt = float(mediabox.height)
                        # IMPORTANT: Consider page rotation!
                        # PDF pages can have /Rotate attribute (0, 90, 180, 270)
                        # When rotation is 90 or 270 degrees, width and height should be swapped
                        # because pdf2image and PDF viewers apply this rotation when rendering
                        rotation = page.get('/Rotate', 0)
                        if rotation is None:
                            rotation = 0
                        rotation = int(rotation) % 360
                        if rotation in (90, 270):
                            # Swap width and height for 90/270 degree rotation
                            width_pt, height_pt = height_pt, width_pt
                            logger.info(f"Page {page_idx}: Rotation={rotation}°, swapped dimensions to {width_pt:.1f} x {height_pt:.1f}")
                        page_sizes[page_idx] = (width_pt, height_pt)
                    logger.info(f"Extracted dimensions from PDF: {total_pages} pages")
@@ -1256,9 +1269,23 @@ class PDFGeneratorService:
            return page_sizes[0]
        return None
-    def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
+    def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
-        """將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
+        """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
        try:
            if bbox is None:
                return None
            # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
            if isinstance(bbox, dict):
                if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
                    return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1'])
                else:
                    logger.warning(f"Dict bbox 缺少必要欄位: {bbox}")
                    return None
            if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
                return None
            if isinstance(bbox[0], (list, tuple)):
                # 處理多邊形 [[x, y], ...]
                x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
@@ -1268,7 +1295,7 @@ class PDFGeneratorService:
                return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
            elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
                # 處理 [x1, y1, x2, y2]
-                return bbox[0], bbox[1], bbox[2], bbox[3]
+                return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
            else:
                logger.warning(f"未知的 bbox 格式: {bbox}")
                return None
@@ -1337,14 +1364,56 @@ class PDFGeneratorService:
        return not no_overlap
-    def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]:
+    def _calculate_overlap_ratio(self, text_bbox_data: Dict, avoid_bbox_data: Dict) -> float:
        """
-        過濾掉與 'regions_to_avoid'（例如表格、圖片）重疊的文字區域。
+        計算文字區域與避免區域的重疊比例。
        Args:
            text_bbox_data: 文字區域 bbox 數據
            avoid_bbox_data: 避免區域 bbox 數據
        Returns:
            重疊面積佔文字區域面積的比例 (0.0 - 1.0)
        """
        text_coords = self._get_bbox_coords(text_bbox_data.get('bbox'))
        avoid_coords = self._get_bbox_coords(avoid_bbox_data.get('bbox'))
        if not text_coords or not avoid_coords:
            return 0.0
        tx0, ty0, tx1, ty1 = text_coords
        ax0, ay0, ax1, ay1 = avoid_coords
        # Calculate text area
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0
        # Calculate intersection
        inter_x0 = max(tx0, ax0)
        inter_y0 = max(ty0, ay0)
        inter_x1 = min(tx1, ax1)
        inter_y1 = min(ty1, ay1)
        # Check if there's actual intersection
        if inter_x1 <= inter_x0 or inter_y1 <= inter_y0:
            return 0.0
        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area
    def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], overlap_threshold: float = 0.5) -> List[Dict]:
        """
        過濾掉與 'regions_to_avoid'（例如表格、圖片）顯著重疊的文字區域。
        使用重疊比例閾值來判斷是否過濾，避免過濾掉僅相鄰但不重疊的文字。
        Args:
            text_regions: 文字區域列表
            regions_to_avoid: 需要避免的區域列表（表格、圖片）
-            tolerance: 容錯值（像素），增加到 10.0 以更好地處理邊界情況
+            overlap_threshold: 重疊比例閾值 (0.0-1.0)，只有當文字區域
                              與避免區域的重疊比例超過此閾值時才會被過濾
                              預設 0.5 表示超過 50% 重疊才過濾
        Returns:
            過濾後的文字區域列表
@@ -1354,17 +1423,24 @@ class PDFGeneratorService:
        for text_region in text_regions:
            should_filter = False
            max_overlap = 0.0
            for avoid_region in regions_to_avoid:
-                # 使用重疊檢測：只要有任何重疊就過濾掉
+                # 計算重疊比例
-                if self._bbox_overlaps(text_region, avoid_region, tolerance=tolerance):
+                overlap_ratio = self._calculate_overlap_ratio(text_region, avoid_region)
                max_overlap = max(max_overlap, overlap_ratio)
                # 只有當重疊比例超過閾值時才過濾
                if overlap_ratio > overlap_threshold:
                    should_filter = True
                    filtered_count += 1
-                    logger.debug(f"過濾掉重疊文字: {text_region.get('text', '')[:20]}...")
+                    logger.debug(f"過濾掉重疊文字 (重疊比例: {overlap_ratio:.1%}): {text_region.get('text', '')[:30]}...")
-                    break  # 找到一個重疊區域就足夠了
+                    break
            if not should_filter:
                filtered_text.append(text_region)
                if max_overlap > 0:
                    logger.debug(f"保留文字 (最大重疊比例: {max_overlap:.1%}): {text_region.get('text', '')[:30]}...")
        logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}")
        return filtered_text
@@ -1391,17 +1467,42 @@ class PDFGeneratorService:
        bbox = region.get('bbox', [])
        confidence = region.get('confidence', 1.0)
-        if not text or not bbox or len(bbox) < 4:
+        if not text or not bbox:
            return
        try:
-            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # Handle different bbox formats
-            # Points: top-left, top-right, bottom-right, bottom-left
+            if isinstance(bbox, dict):
-            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+                # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
                if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
                    ocr_x_left = float(bbox['x0'])
                    ocr_y_top = float(bbox['y0'])
                    ocr_x_right = float(bbox['x1'])
                    ocr_y_bottom = float(bbox['y1'])
                else:
                    logger.warning(f"Dict bbox missing required keys: {bbox}")
                    return
            elif isinstance(bbox, list):
                if len(bbox) < 4:
                    return
                # Polygon format [[x,y], [x,y], [x,y], [x,y]] (4 points)
                if isinstance(bbox[0], list):
                    ocr_x_left = bbox[0][0]    # Left X
                    ocr_y_top = bbox[0][1]     # Top Y in OCR coordinates
                    ocr_x_right = bbox[2][0]   # Right X
                    ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates
                # Simple list format [x0, y0, x1, y1]
                elif isinstance(bbox[0], (int, float)):
                    ocr_x_left = bbox[0]
                    ocr_y_top = bbox[1]
                    ocr_x_right = bbox[2]
                    ocr_y_bottom = bbox[3]
                else:
                    logger.warning(f"Unexpected bbox list format: {bbox}")
                    return
            else:
                logger.warning(f"Invalid bbox format: {bbox}")
                return
            logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}")
@@ -1489,13 +1590,17 @@ class PDFGeneratorService:
            if settings.pdf_enable_bbox_debug:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
-                # Transform all bbox points to PDF coordinates (apply scaling first)
+                # Use already-extracted coordinates (works for all bbox formats)
-                pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
+                # Draw rectangle using the scaled coordinates
                pdf_x1 = ocr_x_left * scale_w
                pdf_y1 = page_height - ocr_y_top * scale_h
                pdf_x2 = ocr_x_right * scale_w
                pdf_y2 = page_height - ocr_y_bottom * scale_h
                # Draw bbox rectangle
-                for i in range(4):
+                pdf_canvas.line(pdf_x1, pdf_y1, pdf_x2, pdf_y1)  # top
-                    x1, y1 = pdf_points[i]
+                pdf_canvas.line(pdf_x2, pdf_y1, pdf_x2, pdf_y2)  # right
-                    x2, y2 = pdf_points[(i + 1) % 4]
+                pdf_canvas.line(pdf_x2, pdf_y2, pdf_x1, pdf_y2)  # bottom
-                    pdf_canvas.line(x1, y1, x2, y2)
+                pdf_canvas.line(pdf_x1, pdf_y2, pdf_x1, pdf_y1)  # left
        except Exception as e:
            logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
@@ -1560,7 +1665,17 @@ class PDFGeneratorService:
                return
            # Handle different bbox formats
-            if isinstance(table_bbox, list) and len(table_bbox) == 4:
+            if isinstance(table_bbox, dict):
                # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
                if 'x0' in table_bbox and 'y0' in table_bbox and 'x1' in table_bbox and 'y1' in table_bbox:
                    ocr_x_left_raw = float(table_bbox['x0'])
                    ocr_y_top_raw = float(table_bbox['y0'])
                    ocr_x_right_raw = float(table_bbox['x1'])
                    ocr_y_bottom_raw = float(table_bbox['y1'])
                else:
                    logger.error(f"Dict bbox missing required keys (x0, y0, x1, y1): {table_bbox}")
                    return
            elif isinstance(table_bbox, list) and len(table_bbox) == 4:
                # Simple bbox format [x0, y0, x1, y1]
                if isinstance(table_bbox[0], (int, float)):
                    ocr_x_left_raw = table_bbox[0]
@@ -1595,32 +1710,87 @@ class PDFGeneratorService:
            pdf_x = ocr_x_left
            pdf_y = page_height - ocr_y_bottom
-            # Build table data for ReportLab
+            # Build table data for ReportLab with proper colspan/rowspan handling
-            # Convert parsed structure to simple 2D array
+            # First pass: determine the actual grid size by accounting for spans
-            max_cols = max(len(row['cells']) for row in rows)
+            num_rows = len(rows)
            logger.info(f"[表格] {len(rows)}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
            reportlab_data = []
            # Calculate actual number of columns by checking first row's total span
            max_cols = 0
            for row in rows:
-                row_data = []
+                row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
                max_cols = max(max_cols, row_cols)
            logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
            # Create a grid to track occupied cells (for rowspan handling)
            # occupied[row][col] = True if cell is occupied by a span from above
            occupied = [[False] * max_cols for _ in range(num_rows)]
            # Build the 2D data array and collect span commands
            reportlab_data = []
            span_commands = []
            for row_idx, row in enumerate(rows):
                row_data = [''] * max_cols
                col_idx = 0
                for cell in row['cells']:
                    # Skip occupied cells (from rowspan above)
                    while col_idx < max_cols and occupied[row_idx][col_idx]:
                        col_idx += 1
                    if col_idx >= max_cols:
                        break
                    text = cell['text'].strip()
-                    row_data.append(text)
+                    colspan = cell.get('colspan', 1)
-                # Pad row if needed
+                    rowspan = cell.get('rowspan', 1)
-                while len(row_data) < max_cols:
+
-                    row_data.append('')
+                    # Place text in the top-left cell of the span
                    row_data[col_idx] = text
                    # Mark cells as occupied for rowspan
                    for r in range(row_idx, min(row_idx + rowspan, num_rows)):
                        for c in range(col_idx, min(col_idx + colspan, max_cols)):
                            if r > row_idx or c > col_idx:
                                occupied[r][c] = True
                    # Add SPAN command if cell spans multiple rows/cols
                    if colspan > 1 or rowspan > 1:
                        span_end_col = min(col_idx + colspan - 1, max_cols - 1)
                        span_end_row = min(row_idx + rowspan - 1, num_rows - 1)
                        span_commands.append(('SPAN', (col_idx, row_idx), (span_end_col, span_end_row)))
                    col_idx += colspan
                reportlab_data.append(row_data)
            # Calculate column widths (equal distribution)
            col_widths = [table_width / max_cols] * max_cols
            # Create ReportLab Table
-            # Use smaller font size to fit in bbox
+            # Use smaller font to fit content with auto-wrap
-            font_size = min(table_height / len(rows) * 0.5, 10)
+            font_size = 8  # Fixed reasonable font size for table content
            font_size = max(font_size, 6)
-            # Create table with font
+            # Create paragraph style for text wrapping in cells
            cell_style = ParagraphStyle(
                'CellStyle',
                fontName=self.font_name if self.font_registered else 'Helvetica',
                fontSize=font_size,
                leading=font_size * 1.2,
                alignment=TA_CENTER,
                wordWrap='CJK',  # Better wrapping for Chinese text
            )
            # Convert text to Paragraph objects for auto-wrapping
            for row_idx, row_data in enumerate(reportlab_data):
                for col_idx, cell_text in enumerate(row_data):
                    if cell_text:
                        # Escape HTML special characters and create Paragraph
                        escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                        reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
            # Create table WITHOUT fixed row heights - let it auto-size based on content
            table = Table(reportlab_data, colWidths=col_widths)
            # Apply table style
@@ -1640,12 +1810,35 @@ class PDFGeneratorService:
                style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
                style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)
            # Add span commands for merged cells
            for span_cmd in span_commands:
                style.add(*span_cmd)
            table.setStyle(style)
-            # Calculate table size
+            logger.info(f"[表格] 套用 {len(span_commands)} 個合併儲存格 (SPAN)")
            table.wrapOn(pdf_canvas, table_width, table_height)
-            # Draw table at position
+            # Calculate actual table size after wrapping
            actual_width, actual_height = table.wrapOn(pdf_canvas, table_width, table_height)
            logger.info(f"[表格] 目標尺寸: {table_width:.0f}x{table_height:.0f}, 實際尺寸: {actual_width:.0f}x{actual_height:.0f}")
            # Scale table to fit bbox if it exceeds the target size
            scale_x = table_width / actual_width if actual_width > table_width else 1.0
            scale_y = table_height / actual_height if actual_height > table_height else 1.0
            scale_factor = min(scale_x, scale_y)  # Use smaller scale to fit both dimensions
            if scale_factor < 1.0:
                logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
                # Apply scaling transformation
                pdf_canvas.saveState()
                pdf_canvas.translate(pdf_x, pdf_y)
                pdf_canvas.scale(scale_factor, scale_factor)
                # Draw at origin since we've already translated
                table.drawOn(pdf_canvas, 0, 0)
                pdf_canvas.restoreState()
            else:
                # Draw table at position without scaling
                table.drawOn(pdf_canvas, pdf_x, pdf_y)
            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
@@ -1696,17 +1889,43 @@ class PDFGeneratorService:
            # Get bbox for positioning
            bbox = region.get('bbox', [])
-            if not bbox or len(bbox) < 4:
+            if not bbox:
                # If no bbox, skip for now
                logger.warning(f"No bbox for image {image_path_str}")
                return
-            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # Handle different bbox formats
-            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+            if isinstance(bbox, dict):
                # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
                if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
                    ocr_x_left_raw = float(bbox['x0'])
                    ocr_y_top_raw = float(bbox['y0'])
                    ocr_x_right_raw = float(bbox['x1'])
                    ocr_y_bottom_raw = float(bbox['y1'])
                else:
                    logger.warning(f"Dict bbox missing required keys for image: {bbox}")
                    return
            elif isinstance(bbox, list):
                if len(bbox) < 4:
                    logger.warning(f"List bbox too short for image: {bbox}")
                    return
                # Polygon format [[x,y], [x,y], [x,y], [x,y]]
                if isinstance(bbox[0], list):
                    ocr_x_left_raw = bbox[0][0]
                    ocr_y_top_raw = bbox[0][1]
                    ocr_x_right_raw = bbox[2][0]
                    ocr_y_bottom_raw = bbox[2][1]
                # Simple list format [x0, y0, x1, y1]
                elif isinstance(bbox[0], (int, float)):
                    ocr_x_left_raw = bbox[0]
                    ocr_y_top_raw = bbox[1]
                    ocr_x_right_raw = bbox[2]
                    ocr_y_bottom_raw = bbox[3]
                else:
                    logger.warning(f"Unexpected bbox list format for image: {bbox}")
                    return
            else:
                logger.warning(f"Invalid bbox format for image: {bbox}")
                return
            logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")