diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 8b5291b..61b2230 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -885,10 +885,8 @@ class PDFGeneratorService: True if successful, False otherwise """ try: - # Check if PDF already exists (caching) - if output_path.exists(): - logger.info(f"PDF already exists: {output_path.name}") - return True + # Note: Removed PDF caching - always regenerate to ensure latest code changes take effect + # If caching is needed, implement at a higher level with proper cache invalidation # Get text regions text_regions = ocr_data.get('text_regions', []) @@ -1223,6 +1221,21 @@ class PDFGeneratorService: mediabox = page.mediabox width_pt = float(mediabox.width) height_pt = float(mediabox.height) + + # IMPORTANT: Consider page rotation! + # PDF pages can have /Rotate attribute (0, 90, 180, 270) + # When rotation is 90 or 270 degrees, width and height should be swapped + # because pdf2image and PDF viewers apply this rotation when rendering + rotation = page.get('/Rotate', 0) + if rotation is None: + rotation = 0 + rotation = int(rotation) % 360 + + if rotation in (90, 270): + # Swap width and height for 90/270 degree rotation + width_pt, height_pt = height_pt, width_pt + logger.info(f"Page {page_idx}: Rotation={rotation}°, swapped dimensions to {width_pt:.1f} x {height_pt:.1f}") + page_sizes[page_idx] = (width_pt, height_pt) logger.info(f"Extracted dimensions from PDF: {total_pages} pages") @@ -1256,9 +1269,23 @@ class PDFGeneratorService: return page_sizes[0] return None - def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: - """將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]""" + def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: + """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]""" try: + if bbox is None: + return None + + # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} + if isinstance(bbox, dict): + if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox: + return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1']) + else: + logger.warning(f"Dict bbox 缺少必要欄位: {bbox}") + return None + + if not isinstance(bbox, (list, tuple)) or len(bbox) < 4: + return None + if isinstance(bbox[0], (list, tuple)): # 處理多邊形 [[x, y], ...] x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2] @@ -1268,7 +1295,7 @@ class PDFGeneratorService: return min(x_coords), min(y_coords), max(x_coords), max(y_coords) elif isinstance(bbox[0], (int, float)) and len(bbox) == 4: # 處理 [x1, y1, x2, y2] - return bbox[0], bbox[1], bbox[2], bbox[3] + return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]) else: logger.warning(f"未知的 bbox 格式: {bbox}") return None @@ -1337,14 +1364,56 @@ class PDFGeneratorService: return not no_overlap - def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]: + def _calculate_overlap_ratio(self, text_bbox_data: Dict, avoid_bbox_data: Dict) -> float: """ - 過濾掉與 'regions_to_avoid'(例如表格、圖片)重疊的文字區域。 + 計算文字區域與避免區域的重疊比例。 + + Args: + text_bbox_data: 文字區域 bbox 數據 + avoid_bbox_data: 避免區域 bbox 數據 + + Returns: + 重疊面積佔文字區域面積的比例 (0.0 - 1.0) + """ + text_coords = self._get_bbox_coords(text_bbox_data.get('bbox')) + avoid_coords = self._get_bbox_coords(avoid_bbox_data.get('bbox')) + + if not text_coords or not avoid_coords: + return 0.0 + + tx0, ty0, tx1, ty1 = text_coords + ax0, ay0, ax1, ay1 = avoid_coords + + # Calculate text area + text_area = (tx1 - tx0) * (ty1 - ty0) + if text_area <= 0: + return 0.0 + + # Calculate intersection + inter_x0 = max(tx0, ax0) + inter_y0 = max(ty0, ay0) + inter_x1 = min(tx1, ax1) + inter_y1 = min(ty1, ay1) + + # Check if there's actual intersection + if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: + return 0.0 + + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + return inter_area / text_area + + def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], overlap_threshold: float = 0.5) -> List[Dict]: + """ + 過濾掉與 'regions_to_avoid'(例如表格、圖片)顯著重疊的文字區域。 + + 使用重疊比例閾值來判斷是否過濾,避免過濾掉僅相鄰但不重疊的文字。 Args: text_regions: 文字區域列表 regions_to_avoid: 需要避免的區域列表(表格、圖片) - tolerance: 容錯值(像素),增加到 10.0 以更好地處理邊界情況 + overlap_threshold: 重疊比例閾值 (0.0-1.0),只有當文字區域 + 與避免區域的重疊比例超過此閾值時才會被過濾 + 預設 0.5 表示超過 50% 重疊才過濾 Returns: 過濾後的文字區域列表 @@ -1354,17 +1423,24 @@ class PDFGeneratorService: for text_region in text_regions: should_filter = False + max_overlap = 0.0 for avoid_region in regions_to_avoid: - # 使用重疊檢測:只要有任何重疊就過濾掉 - if self._bbox_overlaps(text_region, avoid_region, tolerance=tolerance): + # 計算重疊比例 + overlap_ratio = self._calculate_overlap_ratio(text_region, avoid_region) + max_overlap = max(max_overlap, overlap_ratio) + + # 只有當重疊比例超過閾值時才過濾 + if overlap_ratio > overlap_threshold: should_filter = True filtered_count += 1 - logger.debug(f"過濾掉重疊文字: {text_region.get('text', '')[:20]}...") - break # 找到一個重疊區域就足夠了 + logger.debug(f"過濾掉重疊文字 (重疊比例: {overlap_ratio:.1%}): {text_region.get('text', '')[:30]}...") + break if not should_filter: filtered_text.append(text_region) + if max_overlap > 0: + logger.debug(f"保留文字 (最大重疊比例: {max_overlap:.1%}): {text_region.get('text', '')[:30]}...") logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}") return filtered_text @@ -1391,17 +1467,42 @@ class PDFGeneratorService: bbox = region.get('bbox', []) confidence = region.get('confidence', 1.0) - if not text or not bbox or len(bbox) < 4: + if not text or not bbox: return try: - # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] - # Points: top-left, top-right, bottom-right, bottom-left - # OCR coordinates: origin (0,0) at top-left, Y increases downward - ocr_x_left = bbox[0][0] # Left X - ocr_y_top = bbox[0][1] # Top Y in OCR coordinates - ocr_x_right = bbox[2][0] # Right X - ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates + # Handle different bbox formats + if isinstance(bbox, dict): + # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} + if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox: + ocr_x_left = float(bbox['x0']) + ocr_y_top = float(bbox['y0']) + ocr_x_right = float(bbox['x1']) + ocr_y_bottom = float(bbox['y1']) + else: + logger.warning(f"Dict bbox missing required keys: {bbox}") + return + elif isinstance(bbox, list): + if len(bbox) < 4: + return + # Polygon format [[x,y], [x,y], [x,y], [x,y]] (4 points) + if isinstance(bbox[0], list): + ocr_x_left = bbox[0][0] # Left X + ocr_y_top = bbox[0][1] # Top Y in OCR coordinates + ocr_x_right = bbox[2][0] # Right X + ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates + # Simple list format [x0, y0, x1, y1] + elif isinstance(bbox[0], (int, float)): + ocr_x_left = bbox[0] + ocr_y_top = bbox[1] + ocr_x_right = bbox[2] + ocr_y_bottom = bbox[3] + else: + logger.warning(f"Unexpected bbox list format: {bbox}") + return + else: + logger.warning(f"Invalid bbox format: {bbox}") + return logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}") @@ -1489,13 +1590,17 @@ class PDFGeneratorService: if settings.pdf_enable_bbox_debug: pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent pdf_canvas.setLineWidth(0.5) - # Transform all bbox points to PDF coordinates (apply scaling first) - pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox] + # Use already-extracted coordinates (works for all bbox formats) + # Draw rectangle using the scaled coordinates + pdf_x1 = ocr_x_left * scale_w + pdf_y1 = page_height - ocr_y_top * scale_h + pdf_x2 = ocr_x_right * scale_w + pdf_y2 = page_height - ocr_y_bottom * scale_h # Draw bbox rectangle - for i in range(4): - x1, y1 = pdf_points[i] - x2, y2 = pdf_points[(i + 1) % 4] - pdf_canvas.line(x1, y1, x2, y2) + pdf_canvas.line(pdf_x1, pdf_y1, pdf_x2, pdf_y1) # top + pdf_canvas.line(pdf_x2, pdf_y1, pdf_x2, pdf_y2) # right + pdf_canvas.line(pdf_x2, pdf_y2, pdf_x1, pdf_y2) # bottom + pdf_canvas.line(pdf_x1, pdf_y2, pdf_x1, pdf_y1) # left except Exception as e: logger.warning(f"Failed to draw text region '{text[:20]}...': {e}") @@ -1560,7 +1665,17 @@ class PDFGeneratorService: return # Handle different bbox formats - if isinstance(table_bbox, list) and len(table_bbox) == 4: + if isinstance(table_bbox, dict): + # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} + if 'x0' in table_bbox and 'y0' in table_bbox and 'x1' in table_bbox and 'y1' in table_bbox: + ocr_x_left_raw = float(table_bbox['x0']) + ocr_y_top_raw = float(table_bbox['y0']) + ocr_x_right_raw = float(table_bbox['x1']) + ocr_y_bottom_raw = float(table_bbox['y1']) + else: + logger.error(f"Dict bbox missing required keys (x0, y0, x1, y1): {table_bbox}") + return + elif isinstance(table_bbox, list) and len(table_bbox) == 4: # Simple bbox format [x0, y0, x1, y1] if isinstance(table_bbox[0], (int, float)): ocr_x_left_raw = table_bbox[0] @@ -1595,32 +1710,87 @@ class PDFGeneratorService: pdf_x = ocr_x_left pdf_y = page_height - ocr_y_bottom - # Build table data for ReportLab - # Convert parsed structure to simple 2D array - max_cols = max(len(row['cells']) for row in rows) - - logger.info(f"[表格] {len(rows)}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}") - reportlab_data = [] + # Build table data for ReportLab with proper colspan/rowspan handling + # First pass: determine the actual grid size by accounting for spans + num_rows = len(rows) + # Calculate actual number of columns by checking first row's total span + max_cols = 0 for row in rows: - row_data = [] + row_cols = sum(cell.get('colspan', 1) for cell in row['cells']) + max_cols = max(max_cols, row_cols) + + logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}") + + # Create a grid to track occupied cells (for rowspan handling) + # occupied[row][col] = True if cell is occupied by a span from above + occupied = [[False] * max_cols for _ in range(num_rows)] + + # Build the 2D data array and collect span commands + reportlab_data = [] + span_commands = [] + + for row_idx, row in enumerate(rows): + row_data = [''] * max_cols + col_idx = 0 + for cell in row['cells']: + # Skip occupied cells (from rowspan above) + while col_idx < max_cols and occupied[row_idx][col_idx]: + col_idx += 1 + + if col_idx >= max_cols: + break + text = cell['text'].strip() - row_data.append(text) - # Pad row if needed - while len(row_data) < max_cols: - row_data.append('') + colspan = cell.get('colspan', 1) + rowspan = cell.get('rowspan', 1) + + # Place text in the top-left cell of the span + row_data[col_idx] = text + + # Mark cells as occupied for rowspan + for r in range(row_idx, min(row_idx + rowspan, num_rows)): + for c in range(col_idx, min(col_idx + colspan, max_cols)): + if r > row_idx or c > col_idx: + occupied[r][c] = True + + # Add SPAN command if cell spans multiple rows/cols + if colspan > 1 or rowspan > 1: + span_end_col = min(col_idx + colspan - 1, max_cols - 1) + span_end_row = min(row_idx + rowspan - 1, num_rows - 1) + span_commands.append(('SPAN', (col_idx, row_idx), (span_end_col, span_end_row))) + + col_idx += colspan + reportlab_data.append(row_data) # Calculate column widths (equal distribution) col_widths = [table_width / max_cols] * max_cols # Create ReportLab Table - # Use smaller font size to fit in bbox - font_size = min(table_height / len(rows) * 0.5, 10) - font_size = max(font_size, 6) + # Use smaller font to fit content with auto-wrap + font_size = 8 # Fixed reasonable font size for table content - # Create table with font + # Create paragraph style for text wrapping in cells + cell_style = ParagraphStyle( + 'CellStyle', + fontName=self.font_name if self.font_registered else 'Helvetica', + fontSize=font_size, + leading=font_size * 1.2, + alignment=TA_CENTER, + wordWrap='CJK', # Better wrapping for Chinese text + ) + + # Convert text to Paragraph objects for auto-wrapping + for row_idx, row_data in enumerate(reportlab_data): + for col_idx, cell_text in enumerate(row_data): + if cell_text: + # Escape HTML special characters and create Paragraph + escaped_text = cell_text.replace('&', '&').replace('<', '<').replace('>', '>') + reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style) + + # Create table WITHOUT fixed row heights - let it auto-size based on content table = Table(reportlab_data, colWidths=col_widths) # Apply table style @@ -1640,13 +1810,36 @@ class PDFGeneratorService: style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey) style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size) + # Add span commands for merged cells + for span_cmd in span_commands: + style.add(*span_cmd) + table.setStyle(style) - # Calculate table size - table.wrapOn(pdf_canvas, table_width, table_height) + logger.info(f"[表格] 套用 {len(span_commands)} 個合併儲存格 (SPAN)") - # Draw table at position - table.drawOn(pdf_canvas, pdf_x, pdf_y) + # Calculate actual table size after wrapping + actual_width, actual_height = table.wrapOn(pdf_canvas, table_width, table_height) + + logger.info(f"[表格] 目標尺寸: {table_width:.0f}x{table_height:.0f}, 實際尺寸: {actual_width:.0f}x{actual_height:.0f}") + + # Scale table to fit bbox if it exceeds the target size + scale_x = table_width / actual_width if actual_width > table_width else 1.0 + scale_y = table_height / actual_height if actual_height > table_height else 1.0 + scale_factor = min(scale_x, scale_y) # Use smaller scale to fit both dimensions + + if scale_factor < 1.0: + logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)") + # Apply scaling transformation + pdf_canvas.saveState() + pdf_canvas.translate(pdf_x, pdf_y) + pdf_canvas.scale(scale_factor, scale_factor) + # Draw at origin since we've already translated + table.drawOn(pdf_canvas, 0, 0) + pdf_canvas.restoreState() + else: + # Draw table at position without scaling + table.drawOn(pdf_canvas, pdf_x, pdf_y) logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows") @@ -1696,17 +1889,43 @@ class PDFGeneratorService: # Get bbox for positioning bbox = region.get('bbox', []) - if not bbox or len(bbox) < 4: - # If no bbox, skip for now + if not bbox: logger.warning(f"No bbox for image {image_path_str}") return - # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] - # OCR coordinates: origin (0,0) at top-left, Y increases downward - ocr_x_left_raw = bbox[0][0] - ocr_y_top_raw = bbox[0][1] - ocr_x_right_raw = bbox[2][0] - ocr_y_bottom_raw = bbox[2][1] + # Handle different bbox formats + if isinstance(bbox, dict): + # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} + if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox: + ocr_x_left_raw = float(bbox['x0']) + ocr_y_top_raw = float(bbox['y0']) + ocr_x_right_raw = float(bbox['x1']) + ocr_y_bottom_raw = float(bbox['y1']) + else: + logger.warning(f"Dict bbox missing required keys for image: {bbox}") + return + elif isinstance(bbox, list): + if len(bbox) < 4: + logger.warning(f"List bbox too short for image: {bbox}") + return + # Polygon format [[x,y], [x,y], [x,y], [x,y]] + if isinstance(bbox[0], list): + ocr_x_left_raw = bbox[0][0] + ocr_y_top_raw = bbox[0][1] + ocr_x_right_raw = bbox[2][0] + ocr_y_bottom_raw = bbox[2][1] + # Simple list format [x0, y0, x1, y1] + elif isinstance(bbox[0], (int, float)): + ocr_x_left_raw = bbox[0] + ocr_y_top_raw = bbox[1] + ocr_x_right_raw = bbox[2] + ocr_y_bottom_raw = bbox[3] + else: + logger.warning(f"Unexpected bbox list format for image: {bbox}") + return + else: + logger.warning(f"Invalid bbox format for image: {bbox}") + return logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")