fix: improve multi-page PDF dimension handling and coordinate transformation
Resolve issues where multi-page PDFs with varying page sizes had incorrect element positioning and scaling. Each page now maintains its own dimensions and scale factors throughout the generation process.

Key improvements:

Direct Track Processing:
- Store per-page dimensions in page_dimensions mapping (0-based index)
- Set correct page size for each page using setPageSize()
- Pass current page height to all drawing methods for accurate Y-axis conversion
- Each page uses its own dimensions instead of first page dimensions

OCR Track Processing:
- Calculate per-page scale factors with 3-tier priority:
  1. Original file dimensions (highest priority)
  2. OCR/UnifiedDocument dimensions
  3. Fallback to first page dimensions
- Apply correct scaling factors for each page's coordinate transformation
- Handle mixed-size pages correctly (e.g., A4 + A3 in same document)

Dimension Extraction:
- Add get_all_page_sizes() method to extract dimensions for all PDF pages
- Return Dict[int, Tuple[float, float]] mapping page index to (width, height)
- Maintain backward compatibility with get_original_page_size() for first page
- Support both images (single page) and multi-page PDFs

Coordinate System:
- Add ocr_dimensions priority check in calculate_page_dimensions()
- Priority order: ocr_dimensions > dimensions > bbox inference
- Ensure consistent coordinate space across processing tracks

Benefits:
- Correct rendering for documents with mixed page sizes
- Accurate element positioning on all pages
- Proper scaling for scanned documents with varying DPI per page
- Better handling of landscape/portrait mixed documents

Related to archived proposal: fix-pdf-coordinate-system

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -503,6 +503,14 @@ class PDFGeneratorService:
|
||||
else:
|
||||
logger.warning(f"No image path found for visual element {element.element_id}")
|
||||
|
||||
# Build page dimensions mapping for multi-page support
|
||||
page_dimensions = {}
|
||||
for page in unified_doc.pages:
|
||||
page_dimensions[page.page_number - 1] = { # 0-based index
|
||||
'width': page.dimensions.width,
|
||||
'height': page.dimensions.height
|
||||
}
|
||||
|
||||
# Build OCR data structure
|
||||
ocr_data = {
|
||||
'text_regions': text_regions,
|
||||
@@ -516,6 +524,7 @@ class PDFGeneratorService:
|
||||
'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0,
|
||||
'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0
|
||||
},
|
||||
'page_dimensions': page_dimensions, # Per-page dimensions for multi-page support
|
||||
# Metadata for tracking
|
||||
'_from_unified_document': True,
|
||||
'_processing_track': unified_doc.metadata.processing_track.value
|
||||
@@ -669,7 +678,7 @@ class PDFGeneratorService:
|
||||
# Set current track for helper methods
|
||||
self.current_processing_track = 'direct'
|
||||
|
||||
# Get page dimensions from first page
|
||||
# Get page dimensions from first page (for canvas initialization)
|
||||
if not unified_doc.pages:
|
||||
logger.error("No pages in document")
|
||||
return False
|
||||
@@ -678,9 +687,9 @@ class PDFGeneratorService:
|
||||
page_width = first_page.dimensions.width
|
||||
page_height = first_page.dimensions.height
|
||||
|
||||
logger.info(f"Page dimensions: {page_width} x {page_height}")
|
||||
logger.info(f"First page dimensions: {page_width} x {page_height}")
|
||||
|
||||
# Create PDF canvas with source dimensions
|
||||
# Create PDF canvas with first page dimensions (will be updated per page)
|
||||
from reportlab.pdfgen import canvas
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
|
||||
|
||||
@@ -688,9 +697,17 @@ class PDFGeneratorService:
|
||||
for page_idx, page in enumerate(unified_doc.pages):
|
||||
logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}")
|
||||
|
||||
# Get current page dimensions
|
||||
current_page_width = page.dimensions.width
|
||||
current_page_height = page.dimensions.height
|
||||
logger.info(f"Page {page_idx + 1} dimensions: {current_page_width} x {current_page_height}")
|
||||
|
||||
if page_idx > 0:
|
||||
pdf_canvas.showPage()
|
||||
|
||||
# Set page size for current page
|
||||
pdf_canvas.setPageSize((current_page_width, current_page_height))
|
||||
|
||||
# Separate elements by type
|
||||
text_elements = []
|
||||
table_elements = []
|
||||
@@ -757,19 +774,19 @@ class PDFGeneratorService:
|
||||
# Draw elements in document order
|
||||
for elem_type, elem in all_elements:
|
||||
if elem_type == 'image':
|
||||
self._draw_image_element_direct(pdf_canvas, elem, page_height, output_path.parent)
|
||||
self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent)
|
||||
elif elem_type == 'table':
|
||||
self._draw_table_element_direct(pdf_canvas, elem, page_height)
|
||||
self._draw_table_element_direct(pdf_canvas, elem, current_page_height)
|
||||
elif elem_type == 'list':
|
||||
# FIX: Check if list item overlaps with table/image
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, page_height)
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
|
||||
elif elem_type == 'text':
|
||||
# FIX: Check if text overlaps with table/image before drawing
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, page_height)
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping text element {elem.element_id} inside table/image region")
|
||||
|
||||
@@ -875,29 +892,38 @@ class PDFGeneratorService:
|
||||
# Get layout data
|
||||
layout_data = ocr_data.get('layout_data', {})
|
||||
|
||||
# Step 1: Get OCR processing dimensions
|
||||
# Step 1: Get OCR processing dimensions (for first page / default)
|
||||
ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
|
||||
logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
|
||||
logger.info(f"OCR 處理時使用的座標系尺寸 (第一頁): {ocr_width:.1f} x {ocr_height:.1f}")
|
||||
|
||||
# Step 2: Get target PDF dimensions
|
||||
# Step 2: Get page dimensions mapping for multi-page support
|
||||
page_dimensions = ocr_data.get('page_dimensions', {})
|
||||
if not page_dimensions:
|
||||
# Fallback: use first page dimensions for all pages
|
||||
page_dimensions = {0: {'width': ocr_width, 'height': ocr_height}}
|
||||
logger.info("No page_dimensions found, using first page size for all pages")
|
||||
|
||||
# Step 3: Get original file dimensions for all pages
|
||||
original_page_sizes = {}
|
||||
if source_file_path:
|
||||
target_dims = self.get_original_page_size(source_file_path)
|
||||
if target_dims:
|
||||
target_width, target_height = target_dims
|
||||
logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
|
||||
original_page_sizes = self.get_all_page_sizes(source_file_path)
|
||||
if original_page_sizes:
|
||||
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
|
||||
else:
|
||||
logger.warning(f"無法獲取原始文件尺寸,將使用 OCR/UnifiedDocument 尺寸")
|
||||
else:
|
||||
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
|
||||
|
||||
# Determine initial canvas size (will be updated per page)
|
||||
# Priority: original file first page > OCR/UnifiedDocument first page
|
||||
if 0 in original_page_sizes:
|
||||
target_width, target_height = original_page_sizes[0]
|
||||
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
|
||||
else:
|
||||
target_width, target_height = ocr_width, ocr_height
|
||||
logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標")
|
||||
else:
|
||||
target_width, target_height = ocr_width, ocr_height
|
||||
logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
|
||||
logger.info(f"初始 PDF 尺寸(來自 OCR/UnifiedDocument): {target_width:.1f} x {target_height:.1f}")
|
||||
|
||||
# Step 3: Calculate scale factors
|
||||
scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
|
||||
scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
|
||||
logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")
|
||||
|
||||
# Create PDF canvas
|
||||
# Create PDF canvas with initial page size (will be updated per page)
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# Filter text regions to avoid overlap with tables/images
|
||||
@@ -931,9 +957,60 @@ class PDFGeneratorService:
|
||||
|
||||
for page_num in range(1, total_pages + 1):
|
||||
logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")
|
||||
|
||||
# Get current page dimensions with priority order:
|
||||
# 1. Original file dimensions (highest priority)
|
||||
# 2. OCR/UnifiedDocument dimensions
|
||||
# 3. Fallback to first page dimensions
|
||||
page_idx = page_num - 1
|
||||
dimension_source = "unknown"
|
||||
|
||||
# Priority 1: Original file dimensions
|
||||
if page_idx in original_page_sizes:
|
||||
current_target_w, current_target_h = original_page_sizes[page_idx]
|
||||
dimension_source = "original_file"
|
||||
|
||||
# Priority 2: OCR/UnifiedDocument dimensions
|
||||
elif page_idx in page_dimensions:
|
||||
current_page_dims = page_dimensions[page_idx]
|
||||
current_target_w = float(current_page_dims['width'])
|
||||
current_target_h = float(current_page_dims['height'])
|
||||
dimension_source = "ocr_unified_doc"
|
||||
|
||||
# Priority 3: Fallback to first page
|
||||
else:
|
||||
current_target_w = ocr_width
|
||||
current_target_h = ocr_height
|
||||
dimension_source = "fallback_first_page"
|
||||
logger.warning(f"No dimensions for page {page_num}, using first page size")
|
||||
|
||||
# Calculate scale factors for coordinate transformation
|
||||
# OCR coordinates need to be scaled if original file dimensions differ
|
||||
if dimension_source == "original_file":
|
||||
# Get OCR dimensions for this page to calculate scale
|
||||
if page_idx in page_dimensions:
|
||||
ocr_page_w = float(page_dimensions[page_idx]['width'])
|
||||
ocr_page_h = float(page_dimensions[page_idx]['height'])
|
||||
else:
|
||||
ocr_page_w = ocr_width
|
||||
ocr_page_h = ocr_height
|
||||
|
||||
current_scale_w = current_target_w / ocr_page_w if ocr_page_w > 0 else 1.0
|
||||
current_scale_h = current_target_h / ocr_page_h if ocr_page_h > 0 else 1.0
|
||||
else:
|
||||
# Using OCR/UnifiedDocument dimensions directly, no scaling needed
|
||||
current_scale_w = 1.0
|
||||
current_scale_h = 1.0
|
||||
|
||||
logger.info(f"第 {page_num} 頁尺寸: {current_target_w:.1f} x {current_target_h:.1f} "
|
||||
f"(來源: {dimension_source}, 縮放: {current_scale_w:.3f}x{current_scale_h:.3f})")
|
||||
|
||||
if page_num > 1:
|
||||
pdf_canvas.showPage()
|
||||
|
||||
# Set page size for current page
|
||||
pdf_canvas.setPageSize((current_target_w, current_target_h))
|
||||
|
||||
# Get regions for this page
|
||||
page_text_regions = pages_data.get(page_num, [])
|
||||
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
|
||||
@@ -949,22 +1026,22 @@ class PDFGeneratorService:
|
||||
# 1. Draw images (bottom layer)
|
||||
for img_meta in page_image_regions:
|
||||
self.draw_image_region(
|
||||
pdf_canvas, img_meta, target_height,
|
||||
json_parent_dir, scale_w, scale_h
|
||||
pdf_canvas, img_meta, current_target_h,
|
||||
json_parent_dir, current_scale_w, current_scale_h
|
||||
)
|
||||
|
||||
# 2. Draw tables (middle layer)
|
||||
for table_elem in page_table_regions:
|
||||
self.draw_table_region(
|
||||
pdf_canvas, table_elem, images_metadata,
|
||||
target_height, scale_w, scale_h
|
||||
current_target_h, current_scale_w, current_scale_h
|
||||
)
|
||||
|
||||
# 3. Draw text (top layer)
|
||||
for region in page_text_regions:
|
||||
self.draw_text_region(
|
||||
pdf_canvas, region, target_height,
|
||||
scale_w, scale_h
|
||||
pdf_canvas, region, current_target_h,
|
||||
current_scale_w, current_scale_h
|
||||
)
|
||||
|
||||
logger.info(f"<<< 第 {page_num} 頁完成")
|
||||
@@ -984,8 +1061,8 @@ class PDFGeneratorService:
|
||||
|
||||
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
|
||||
"""
|
||||
從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。
|
||||
這非常重要,因為 OCR 可能在高解析度影像上運行。
|
||||
從 OCR JSON 數據中取得頁面尺寸。
|
||||
優先使用明確的 dimensions 欄位,失敗時才回退到 bbox 推斷。
|
||||
|
||||
Args:
|
||||
ocr_data: Complete OCR data dictionary with text_regions and layout
|
||||
@@ -994,6 +1071,26 @@ class PDFGeneratorService:
|
||||
Returns:
|
||||
Tuple of (width, height) in points
|
||||
"""
|
||||
# *** 優先級 1: 檢查 ocr_dimensions (UnifiedDocument 轉換來的) ***
|
||||
if 'ocr_dimensions' in ocr_data:
|
||||
dims = ocr_data['ocr_dimensions']
|
||||
w = float(dims.get('width', 0))
|
||||
h = float(dims.get('height', 0))
|
||||
if w > 0 and h > 0:
|
||||
logger.info(f"使用 ocr_dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
|
||||
return (w, h)
|
||||
|
||||
# *** 優先級 2: 檢查原始 JSON 的 dimensions ***
|
||||
if 'dimensions' in ocr_data:
|
||||
dims = ocr_data['dimensions']
|
||||
w = float(dims.get('width', 0))
|
||||
h = float(dims.get('height', 0))
|
||||
if w > 0 and h > 0:
|
||||
logger.info(f"使用 dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
|
||||
return (w, h)
|
||||
|
||||
# *** 優先級 3: Fallback - 從 bbox 推斷 (僅當上述皆缺失時使用) ***
|
||||
logger.info("dimensions 欄位不可用,回退到 bbox 推斷")
|
||||
max_x = 0
|
||||
max_y = 0
|
||||
|
||||
@@ -1069,9 +1166,69 @@ class PDFGeneratorService:
|
||||
return dims
|
||||
return A4
|
||||
|
||||
def get_all_page_sizes(self, file_path: Path) -> Dict[int, Tuple[float, float]]:
    """
    Extract dimensions for all pages from the original source file.

    Images are treated as a single page (index 0) whose pixel dimensions
    are used directly as points. PDFs yield one entry per page, taken from
    each page's MediaBox.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Dict mapping page index (0-based) to (width, height) in points.
        Empty dict if the file is missing, has an unsupported extension,
        or extraction fails (never a partially filled mapping).
    """
    page_sizes: Dict[int, Tuple[float, float]] = {}

    try:
        if not file_path.exists():
            logger.warning(f"File not found: {file_path}")
            return page_sizes

        suffix = file_path.suffix.lower()

        # For images, single page with dimensions from PIL
        if suffix in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff'):
            # Context manager closes the underlying file handle; only the
            # header is needed to read the size.
            with Image.open(file_path) as img:
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
            page_sizes[0] = (width_pt, height_pt)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return page_sizes

        # For PDFs, extract dimensions for all pages using PyPDF2
        if suffix == '.pdf':
            try:
                from PyPDF2 import PdfReader
                reader = PdfReader(file_path)

                # Build into a local mapping so that a failure on a later
                # page does not leak a partial result to the caller.
                # NOTE(review): MediaBox does not reflect a page's /Rotate
                # entry - confirm whether rotated pages need swapped sizes.
                pdf_sizes: Dict[int, Tuple[float, float]] = {
                    page_idx: (float(page.mediabox.width), float(page.mediabox.height))
                    for page_idx, page in enumerate(reader.pages)
                }

                logger.info(f"Extracted dimensions from PDF: {len(pdf_sizes)} pages")
                for idx, (w, h) in pdf_sizes.items():
                    logger.debug(f" Page {idx}: {w:.1f} x {h:.1f} points")

                return pdf_sizes

            except ImportError:
                logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
            except Exception as e:
                logger.warning(f"Failed to extract PDF dimensions: {e}")

    except Exception as e:
        logger.warning(f"Failed to get page sizes from {file_path}: {e}")

    return page_sizes
|
||||
|
||||
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
|
||||
"""
|
||||
Extract page dimensions from original source file
|
||||
Extract first page dimensions from original source file (backward compatibility)
|
||||
|
||||
Args:
|
||||
file_path: Path to original file (image or PDF)
|
||||
@@ -1079,41 +1236,9 @@ class PDFGeneratorService:
|
||||
Returns:
|
||||
Tuple of (width, height) in points or None
|
||||
"""
|
||||
try:
|
||||
if not file_path.exists():
|
||||
return None
|
||||
|
||||
# For images, get dimensions from PIL
|
||||
if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
|
||||
img = Image.open(file_path)
|
||||
# Use pixel dimensions directly as points (1:1 mapping)
|
||||
# This matches how PaddleOCR reports coordinates
|
||||
width_pt = float(img.width)
|
||||
height_pt = float(img.height)
|
||||
logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
|
||||
return (width_pt, height_pt)
|
||||
|
||||
# For PDFs, extract dimensions using PyPDF2
|
||||
if file_path.suffix.lower() == '.pdf':
|
||||
try:
|
||||
from PyPDF2 import PdfReader
|
||||
reader = PdfReader(file_path)
|
||||
if len(reader.pages) > 0:
|
||||
page = reader.pages[0]
|
||||
# MediaBox gives [x1, y1, x2, y2] in points
|
||||
mediabox = page.mediabox
|
||||
width_pt = float(mediabox.width)
|
||||
height_pt = float(mediabox.height)
|
||||
logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
|
||||
return (width_pt, height_pt)
|
||||
except ImportError:
|
||||
logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract PDF dimensions: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get page size from {file_path}: {e}")
|
||||
|
||||
page_sizes = self.get_all_page_sizes(file_path)
|
||||
if 0 in page_sizes:
|
||||
return page_sizes[0]
|
||||
return None
|
||||
|
||||
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
|
||||
|
||||
Reference in New Issue
Block a user