From 0999898358a6da19bcbf2c07e8db84cbacc4b8b2 Mon Sep 17 00:00:00 2001
From: egg
Date: Tue, 25 Nov 2025 15:09:39 +0800
Subject: [PATCH] fix: improve multi-page PDF dimension handling and coordinate transformation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolve issues where multi-page PDFs with varying page sizes had incorrect
element positioning and scaling. Each page now maintains its own dimensions
and scale factors throughout the generation process.

Key improvements:

Direct Track Processing:
- Store per-page dimensions in page_dimensions mapping (0-based index)
- Set correct page size for each page using setPageSize()
- Pass current page height to all drawing methods for accurate Y-axis conversion
- Each page uses its own dimensions instead of first page dimensions

OCR Track Processing:
- Resolve per-page target dimensions, from which the per-page scale factors
  are calculated, with 3-tier priority:
  1. Original file dimensions (highest priority)
  2. OCR/UnifiedDocument dimensions
  3. Fallback to first page dimensions
- Apply correct scaling factors for each page's coordinate transformation
- Handle mixed-size pages correctly (e.g., A4 + A3 in same document)

Dimension Extraction:
- Add get_all_page_sizes() method to extract dimensions for all PDF pages
- Return Dict[int, Tuple[float, float]] mapping page index to (width, height)
- Maintain backward compatibility with get_original_page_size() for first page
- Support both images (single page) and multi-page PDFs

Coordinate System:
- Add ocr_dimensions priority check in calculate_page_dimensions()
- Priority order: ocr_dimensions > dimensions > bbox inference
- Ensure consistent coordinate space across processing tracks

Benefits:
- Correct rendering for documents with mixed page sizes
- Accurate element positioning on all pages
- Proper scaling for scanned documents with varying DPI per page
- Better handling of landscape/portrait mixed documents

Related to archived proposal: fix-pdf-coordinate-system

🤖 Generated with
[Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/app/services/pdf_generator_service.py | 257 +++++++++++++----- 1 file changed, 191 insertions(+), 66 deletions(-) diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index bcffbb1..4a4bf17 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -503,6 +503,14 @@ class PDFGeneratorService: else: logger.warning(f"No image path found for visual element {element.element_id}") + # Build page dimensions mapping for multi-page support + page_dimensions = {} + for page in unified_doc.pages: + page_dimensions[page.page_number - 1] = { # 0-based index + 'width': page.dimensions.width, + 'height': page.dimensions.height + } + # Build OCR data structure ocr_data = { 'text_regions': text_regions, @@ -516,6 +524,7 @@ class PDFGeneratorService: 'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0, 'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0 }, + 'page_dimensions': page_dimensions, # Per-page dimensions for multi-page support # Metadata for tracking '_from_unified_document': True, '_processing_track': unified_doc.metadata.processing_track.value @@ -669,7 +678,7 @@ class PDFGeneratorService: # Set current track for helper methods self.current_processing_track = 'direct' - # Get page dimensions from first page + # Get page dimensions from first page (for canvas initialization) if not unified_doc.pages: logger.error("No pages in document") return False @@ -678,9 +687,9 @@ class PDFGeneratorService: page_width = first_page.dimensions.width page_height = first_page.dimensions.height - logger.info(f"Page dimensions: {page_width} x {page_height}") + logger.info(f"First page dimensions: {page_width} x {page_height}") - # Create PDF canvas with source dimensions + # Create PDF canvas with first page dimensions (will be updated per page) from 
reportlab.pdfgen import canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) @@ -688,9 +697,17 @@ class PDFGeneratorService: for page_idx, page in enumerate(unified_doc.pages): logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}") + # Get current page dimensions + current_page_width = page.dimensions.width + current_page_height = page.dimensions.height + logger.info(f"Page {page_idx + 1} dimensions: {current_page_width} x {current_page_height}") + if page_idx > 0: pdf_canvas.showPage() + # Set page size for current page + pdf_canvas.setPageSize((current_page_width, current_page_height)) + # Separate elements by type text_elements = [] table_elements = [] @@ -757,19 +774,19 @@ class PDFGeneratorService: # Draw elements in document order for elem_type, elem in all_elements: if elem_type == 'image': - self._draw_image_element_direct(pdf_canvas, elem, page_height, output_path.parent) + self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent) elif elem_type == 'table': - self._draw_table_element_direct(pdf_canvas, elem, page_height) + self._draw_table_element_direct(pdf_canvas, elem, current_page_height) elif elem_type == 'list': # FIX: Check if list item overlaps with table/image if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): - self._draw_text_element_direct(pdf_canvas, elem, page_height) + self._draw_text_element_direct(pdf_canvas, elem, current_page_height) else: logger.debug(f"Skipping list element {elem.element_id} inside table/image region") elif elem_type == 'text': # FIX: Check if text overlaps with table/image before drawing if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): - self._draw_text_element_direct(pdf_canvas, elem, page_height) + self._draw_text_element_direct(pdf_canvas, elem, current_page_height) else: logger.debug(f"Skipping text element {elem.element_id} inside table/image region") @@ -875,29 +892,38 @@ class 
PDFGeneratorService: # Get layout data layout_data = ocr_data.get('layout_data', {}) - # Step 1: Get OCR processing dimensions + # Step 1: Get OCR processing dimensions (for first page / default) ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None) - logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}") + logger.info(f"OCR 處理時使用的座標系尺寸 (第一頁): {ocr_width:.1f} x {ocr_height:.1f}") - # Step 2: Get target PDF dimensions + # Step 2: Get page dimensions mapping for multi-page support + page_dimensions = ocr_data.get('page_dimensions', {}) + if not page_dimensions: + # Fallback: use first page dimensions for all pages + page_dimensions = {0: {'width': ocr_width, 'height': ocr_height}} + logger.info("No page_dimensions found, using first page size for all pages") + + # Step 3: Get original file dimensions for all pages + original_page_sizes = {} if source_file_path: - target_dims = self.get_original_page_size(source_file_path) - if target_dims: - target_width, target_height = target_dims - logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}") + original_page_sizes = self.get_all_page_sizes(source_file_path) + if original_page_sizes: + logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸") else: - target_width, target_height = ocr_width, ocr_height - logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標") + logger.warning(f"無法獲取原始文件尺寸,將使用 OCR/UnifiedDocument 尺寸") + else: + logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸") + + # Determine initial canvas size (will be updated per page) + # Priority: original file first page > OCR/UnifiedDocument first page + if 0 in original_page_sizes: + target_width, target_height = original_page_sizes[0] + logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}") else: target_width, target_height = ocr_width, ocr_height - logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}") + logger.info(f"初始 PDF 尺寸(來自 OCR/UnifiedDocument): 
{target_width:.1f} x {target_height:.1f}") - # Step 3: Calculate scale factors - scale_w = target_width / ocr_width if ocr_width > 0 else 1.0 - scale_h = target_height / ocr_height if ocr_height > 0 else 1.0 - logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}") - - # Create PDF canvas + # Create PDF canvas with initial page size (will be updated per page) pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height)) # Filter text regions to avoid overlap with tables/images @@ -931,9 +957,60 @@ class PDFGeneratorService: for page_num in range(1, total_pages + 1): logger.info(f">>> 處理第 {page_num}/{total_pages} 頁") + + # Get current page dimensions with priority order: + # 1. Original file dimensions (highest priority) + # 2. OCR/UnifiedDocument dimensions + # 3. Fallback to first page dimensions + page_idx = page_num - 1 + dimension_source = "unknown" + + # Priority 1: Original file dimensions + if page_idx in original_page_sizes: + current_target_w, current_target_h = original_page_sizes[page_idx] + dimension_source = "original_file" + + # Priority 2: OCR/UnifiedDocument dimensions + elif page_idx in page_dimensions: + current_page_dims = page_dimensions[page_idx] + current_target_w = float(current_page_dims['width']) + current_target_h = float(current_page_dims['height']) + dimension_source = "ocr_unified_doc" + + # Priority 3: Fallback to first page + else: + current_target_w = ocr_width + current_target_h = ocr_height + dimension_source = "fallback_first_page" + logger.warning(f"No dimensions for page {page_num}, using first page size") + + # Calculate scale factors for coordinate transformation + # OCR coordinates need to be scaled if original file dimensions differ + if dimension_source == "original_file": + # Get OCR dimensions for this page to calculate scale + if page_idx in page_dimensions: + ocr_page_w = float(page_dimensions[page_idx]['width']) + ocr_page_h = float(page_dimensions[page_idx]['height']) + else: + ocr_page_w = 
ocr_width + ocr_page_h = ocr_height + + current_scale_w = current_target_w / ocr_page_w if ocr_page_w > 0 else 1.0 + current_scale_h = current_target_h / ocr_page_h if ocr_page_h > 0 else 1.0 + else: + # Using OCR/UnifiedDocument dimensions directly, no scaling needed + current_scale_w = 1.0 + current_scale_h = 1.0 + + logger.info(f"第 {page_num} 頁尺寸: {current_target_w:.1f} x {current_target_h:.1f} " + f"(來源: {dimension_source}, 縮放: {current_scale_w:.3f}x{current_scale_h:.3f})") + if page_num > 1: pdf_canvas.showPage() + # Set page size for current page + pdf_canvas.setPageSize((current_target_w, current_target_h)) + # Get regions for this page page_text_regions = pages_data.get(page_num, []) page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1] @@ -949,22 +1026,22 @@ class PDFGeneratorService: # 1. Draw images (bottom layer) for img_meta in page_image_regions: self.draw_image_region( - pdf_canvas, img_meta, target_height, - json_parent_dir, scale_w, scale_h + pdf_canvas, img_meta, current_target_h, + json_parent_dir, current_scale_w, current_scale_h ) # 2. Draw tables (middle layer) for table_elem in page_table_regions: self.draw_table_region( pdf_canvas, table_elem, images_metadata, - target_height, scale_w, scale_h + current_target_h, current_scale_w, current_scale_h ) # 3. 
Draw text (top layer) for region in page_text_regions: self.draw_text_region( - pdf_canvas, region, target_height, - scale_w, scale_h + pdf_canvas, region, current_target_h, + current_scale_w, current_scale_h ) logger.info(f"<<< 第 {page_num} 頁完成") @@ -984,8 +1061,8 @@ class PDFGeneratorService: def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]: """ - 從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。 - 這非常重要,因為 OCR 可能在高解析度影像上運行。 + 從 OCR JSON 數據中取得頁面尺寸。 + 優先使用明確的 dimensions 欄位,失敗時才回退到 bbox 推斷。 Args: ocr_data: Complete OCR data dictionary with text_regions and layout @@ -994,6 +1071,26 @@ class PDFGeneratorService: Returns: Tuple of (width, height) in points """ + # *** 優先級 1: 檢查 ocr_dimensions (UnifiedDocument 轉換來的) *** + if 'ocr_dimensions' in ocr_data: + dims = ocr_data['ocr_dimensions'] + w = float(dims.get('width', 0)) + h = float(dims.get('height', 0)) + if w > 0 and h > 0: + logger.info(f"使用 ocr_dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}") + return (w, h) + + # *** 優先級 2: 檢查原始 JSON 的 dimensions *** + if 'dimensions' in ocr_data: + dims = ocr_data['dimensions'] + w = float(dims.get('width', 0)) + h = float(dims.get('height', 0)) + if w > 0 and h > 0: + logger.info(f"使用 dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}") + return (w, h) + + # *** 優先級 3: Fallback - 從 bbox 推斷 (僅當上述皆缺失時使用) *** + logger.info("dimensions 欄位不可用,回退到 bbox 推斷") max_x = 0 max_y = 0 @@ -1069,9 +1166,69 @@ class PDFGeneratorService: return dims return A4 + def get_all_page_sizes(self, file_path: Path) -> Dict[int, Tuple[float, float]]: + """ + Extract dimensions for all pages from original source file + + Args: + file_path: Path to original file (image or PDF) + + Returns: + Dict mapping page index (0-based) to (width, height) in points + Empty dict if extraction fails + """ + page_sizes = {} + + try: + if not file_path.exists(): + logger.warning(f"File not found: {file_path}") + return page_sizes + + # For images, single page with dimensions from PIL + if 
file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']: + img = Image.open(file_path) + # Use pixel dimensions directly as points (1:1 mapping) + # This matches how PaddleOCR reports coordinates + width_pt = float(img.width) + height_pt = float(img.height) + page_sizes[0] = (width_pt, height_pt) + logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)") + return page_sizes + + # For PDFs, extract dimensions for all pages using PyPDF2 + if file_path.suffix.lower() == '.pdf': + try: + from PyPDF2 import PdfReader + reader = PdfReader(file_path) + total_pages = len(reader.pages) + + for page_idx in range(total_pages): + page = reader.pages[page_idx] + # MediaBox gives [x1, y1, x2, y2] in points + mediabox = page.mediabox + width_pt = float(mediabox.width) + height_pt = float(mediabox.height) + page_sizes[page_idx] = (width_pt, height_pt) + + logger.info(f"Extracted dimensions from PDF: {total_pages} pages") + for idx, (w, h) in page_sizes.items(): + logger.debug(f" Page {idx}: {w:.1f} x {h:.1f} points") + + return page_sizes + + except ImportError: + logger.warning("PyPDF2 not available, cannot extract PDF dimensions") + except Exception as e: + logger.warning(f"Failed to extract PDF dimensions: {e}") + + except Exception as e: + logger.warning(f"Failed to get page sizes from {file_path}: {e}") + + return page_sizes + def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]: """ - Extract page dimensions from original source file + Extract first page dimensions from original source file (backward compatibility) Args: file_path: Path to original file (image or PDF) @@ -1079,41 +1236,9 @@ class PDFGeneratorService: Returns: Tuple of (width, height) in points or None """ - try: - if not file_path.exists(): - return None - - # For images, get dimensions from PIL - if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']: - img = Image.open(file_path) - # Use 
pixel dimensions directly as points (1:1 mapping) - # This matches how PaddleOCR reports coordinates - width_pt = float(img.width) - height_pt = float(img.height) - logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)") - return (width_pt, height_pt) - - # For PDFs, extract dimensions using PyPDF2 - if file_path.suffix.lower() == '.pdf': - try: - from PyPDF2 import PdfReader - reader = PdfReader(file_path) - if len(reader.pages) > 0: - page = reader.pages[0] - # MediaBox gives [x1, y1, x2, y2] in points - mediabox = page.mediabox - width_pt = float(mediabox.width) - height_pt = float(mediabox.height) - logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points") - return (width_pt, height_pt) - except ImportError: - logger.warning("PyPDF2 not available, cannot extract PDF dimensions") - except Exception as e: - logger.warning(f"Failed to extract PDF dimensions: {e}") - - except Exception as e: - logger.warning(f"Failed to get page size from {file_path}: {e}") - + page_sizes = self.get_all_page_sizes(file_path) + if 0 in page_sizes: + return page_sizes[0] return None def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: