fix: improve multi-page PDF dimension handling and coordinate transformation

Resolve issues where multi-page PDFs with varying page sizes had incorrect
element positioning and scaling. Each page now maintains its own dimensions
and scale factors throughout the generation process.

Key improvements:

Direct Track Processing:
- Store per-page dimensions in page_dimensions mapping (0-based index)
- Set correct page size for each page using setPageSize()
- Pass current page height to all drawing methods for accurate Y-axis conversion
- Each page uses its own dimensions instead of first page dimensions

OCR Track Processing:
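- Select each page's target size with a 3-tier priority:
  1. Original file dimensions (highest priority)
  2. OCR/UnifiedDocument dimensions
  3. Fallback to first page dimensions
- Calculate per-page scale factors (target size / OCR size) when the original
  file dimensions are used, and apply them to that page's coordinate
  transformation; otherwise coordinates are drawn 1:1 with no scaling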
- Handle mixed-size pages correctly (e.g., A4 + A3 in same document)

Dimension Extraction:
- Add get_all_page_sizes() method to extract dimensions for all PDF pages
- Return Dict[int, Tuple[float, float]] mapping page index to (width, height)
- Keep get_original_page_size() for backward compatibility; it now delegates to
  get_all_page_sizes() and returns the first page's size (or None)
- Support both images (single page) and multi-page PDFs

Coordinate System:
- Add ocr_dimensions priority check in calculate_page_dimensions()
- Priority order: ocr_dimensions > dimensions > bbox inference
- Ensure consistent coordinate space across processing tracks

Benefits:
- Correct rendering for documents with mixed page sizes
- Accurate element positioning on all pages
- Proper scaling for scanned documents with varying DPI per page
- Better handling of landscape/portrait mixed documents

Related to archived proposal: fix-pdf-coordinate-system

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-25 15:09:39 +08:00
parent 2312b4cd66
commit 0999898358

View File

@@ -503,6 +503,14 @@ class PDFGeneratorService:
else:
logger.warning(f"No image path found for visual element {element.element_id}")
# Build page dimensions mapping for multi-page support
page_dimensions = {}
for page in unified_doc.pages:
page_dimensions[page.page_number - 1] = { # 0-based index
'width': page.dimensions.width,
'height': page.dimensions.height
}
# Build OCR data structure
ocr_data = {
'text_regions': text_regions,
@@ -516,6 +524,7 @@ class PDFGeneratorService:
'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0,
'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0
},
'page_dimensions': page_dimensions, # Per-page dimensions for multi-page support
# Metadata for tracking
'_from_unified_document': True,
'_processing_track': unified_doc.metadata.processing_track.value
@@ -669,7 +678,7 @@ class PDFGeneratorService:
# Set current track for helper methods
self.current_processing_track = 'direct'
# Get page dimensions from first page
# Get page dimensions from first page (for canvas initialization)
if not unified_doc.pages:
logger.error("No pages in document")
return False
@@ -678,9 +687,9 @@ class PDFGeneratorService:
page_width = first_page.dimensions.width
page_height = first_page.dimensions.height
logger.info(f"Page dimensions: {page_width} x {page_height}")
logger.info(f"First page dimensions: {page_width} x {page_height}")
# Create PDF canvas with source dimensions
# Create PDF canvas with first page dimensions (will be updated per page)
from reportlab.pdfgen import canvas
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
@@ -688,9 +697,17 @@ class PDFGeneratorService:
for page_idx, page in enumerate(unified_doc.pages):
logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}")
# Get current page dimensions
current_page_width = page.dimensions.width
current_page_height = page.dimensions.height
logger.info(f"Page {page_idx + 1} dimensions: {current_page_width} x {current_page_height}")
if page_idx > 0:
pdf_canvas.showPage()
# Set page size for current page
pdf_canvas.setPageSize((current_page_width, current_page_height))
# Separate elements by type
text_elements = []
table_elements = []
@@ -757,19 +774,19 @@ class PDFGeneratorService:
# Draw elements in document order
for elem_type, elem in all_elements:
if elem_type == 'image':
self._draw_image_element_direct(pdf_canvas, elem, page_height, output_path.parent)
self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent)
elif elem_type == 'table':
self._draw_table_element_direct(pdf_canvas, elem, page_height)
self._draw_table_element_direct(pdf_canvas, elem, current_page_height)
elif elem_type == 'list':
# FIX: Check if list item overlaps with table/image
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
self._draw_text_element_direct(pdf_canvas, elem, page_height)
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
else:
logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
elif elem_type == 'text':
# FIX: Check if text overlaps with table/image before drawing
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
self._draw_text_element_direct(pdf_canvas, elem, page_height)
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
else:
logger.debug(f"Skipping text element {elem.element_id} inside table/image region")
@@ -875,29 +892,38 @@ class PDFGeneratorService:
# Get layout data
layout_data = ocr_data.get('layout_data', {})
# Step 1: Get OCR processing dimensions
# Step 1: Get OCR processing dimensions (for first page / default)
ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
logger.info(f"OCR 處理時使用的座標系尺寸 (第一頁): {ocr_width:.1f} x {ocr_height:.1f}")
# Step 2: Get target PDF dimensions
# Step 2: Get page dimensions mapping for multi-page support
page_dimensions = ocr_data.get('page_dimensions', {})
if not page_dimensions:
# Fallback: use first page dimensions for all pages
page_dimensions = {0: {'width': ocr_width, 'height': ocr_height}}
logger.info("No page_dimensions found, using first page size for all pages")
# Step 3: Get original file dimensions for all pages
original_page_sizes = {}
if source_file_path:
target_dims = self.get_original_page_size(source_file_path)
if target_dims:
target_width, target_height = target_dims
logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
original_page_sizes = self.get_all_page_sizes(source_file_path)
if original_page_sizes:
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
else:
target_width, target_height = ocr_width, ocr_height
logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標")
logger.warning(f"無法獲取原始文件尺寸,將使用 OCR/UnifiedDocument 尺寸")
else:
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
# Determine initial canvas size (will be updated per page)
# Priority: original file first page > OCR/UnifiedDocument first page
if 0 in original_page_sizes:
target_width, target_height = original_page_sizes[0]
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
else:
target_width, target_height = ocr_width, ocr_height
logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
logger.info(f"初始 PDF 尺寸(來自 OCR/UnifiedDocument: {target_width:.1f} x {target_height:.1f}")
# Step 3: Calculate scale factors
scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")
# Create PDF canvas
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# Filter text regions to avoid overlap with tables/images
@@ -931,9 +957,60 @@ class PDFGeneratorService:
for page_num in range(1, total_pages + 1):
logger.info(f">>> 處理第 {page_num}/{total_pages}")
# Get current page dimensions with priority order:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
page_idx = page_num - 1
dimension_source = "unknown"
# Priority 1: Original file dimensions
if page_idx in original_page_sizes:
current_target_w, current_target_h = original_page_sizes[page_idx]
dimension_source = "original_file"
# Priority 2: OCR/UnifiedDocument dimensions
elif page_idx in page_dimensions:
current_page_dims = page_dimensions[page_idx]
current_target_w = float(current_page_dims['width'])
current_target_h = float(current_page_dims['height'])
dimension_source = "ocr_unified_doc"
# Priority 3: Fallback to first page
else:
current_target_w = ocr_width
current_target_h = ocr_height
dimension_source = "fallback_first_page"
logger.warning(f"No dimensions for page {page_num}, using first page size")
# Calculate scale factors for coordinate transformation
# OCR coordinates need to be scaled if original file dimensions differ
if dimension_source == "original_file":
# Get OCR dimensions for this page to calculate scale
if page_idx in page_dimensions:
ocr_page_w = float(page_dimensions[page_idx]['width'])
ocr_page_h = float(page_dimensions[page_idx]['height'])
else:
ocr_page_w = ocr_width
ocr_page_h = ocr_height
current_scale_w = current_target_w / ocr_page_w if ocr_page_w > 0 else 1.0
current_scale_h = current_target_h / ocr_page_h if ocr_page_h > 0 else 1.0
else:
# Using OCR/UnifiedDocument dimensions directly, no scaling needed
current_scale_w = 1.0
current_scale_h = 1.0
logger.info(f"{page_num} 頁尺寸: {current_target_w:.1f} x {current_target_h:.1f} "
f"(來源: {dimension_source}, 縮放: {current_scale_w:.3f}x{current_scale_h:.3f})")
if page_num > 1:
pdf_canvas.showPage()
# Set page size for current page
pdf_canvas.setPageSize((current_target_w, current_target_h))
# Get regions for this page
page_text_regions = pages_data.get(page_num, [])
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
@@ -949,22 +1026,22 @@ class PDFGeneratorService:
# 1. Draw images (bottom layer)
for img_meta in page_image_regions:
self.draw_image_region(
pdf_canvas, img_meta, target_height,
json_parent_dir, scale_w, scale_h
pdf_canvas, img_meta, current_target_h,
json_parent_dir, current_scale_w, current_scale_h
)
# 2. Draw tables (middle layer)
for table_elem in page_table_regions:
self.draw_table_region(
pdf_canvas, table_elem, images_metadata,
target_height, scale_w, scale_h
current_target_h, current_scale_w, current_scale_h
)
# 3. Draw text (top layer)
for region in page_text_regions:
self.draw_text_region(
pdf_canvas, region, target_height,
scale_w, scale_h
pdf_canvas, region, current_target_h,
current_scale_w, current_scale_h
)
logger.info(f"<<< 第 {page_num} 頁完成")
@@ -984,8 +1061,8 @@ class PDFGeneratorService:
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
"""
從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。
這非常重要,因為 OCR 可能在高解析度影像上運行
從 OCR JSON 數據中取得頁面尺寸。
優先使用明確的 dimensions 欄位,失敗時才回退到 bbox 推斷
Args:
ocr_data: Complete OCR data dictionary with text_regions and layout
@@ -994,6 +1071,26 @@ class PDFGeneratorService:
Returns:
Tuple of (width, height) in points
"""
# *** 優先級 1: 檢查 ocr_dimensions (UnifiedDocument 轉換來的) ***
if 'ocr_dimensions' in ocr_data:
dims = ocr_data['ocr_dimensions']
w = float(dims.get('width', 0))
h = float(dims.get('height', 0))
if w > 0 and h > 0:
logger.info(f"使用 ocr_dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
return (w, h)
# *** 優先級 2: 檢查原始 JSON 的 dimensions ***
if 'dimensions' in ocr_data:
dims = ocr_data['dimensions']
w = float(dims.get('width', 0))
h = float(dims.get('height', 0))
if w > 0 and h > 0:
logger.info(f"使用 dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}")
return (w, h)
# *** 優先級 3: Fallback - 從 bbox 推斷 (僅當上述皆缺失時使用) ***
logger.info("dimensions 欄位不可用,回退到 bbox 推斷")
max_x = 0
max_y = 0
@@ -1069,9 +1166,69 @@ class PDFGeneratorService:
return dims
return A4
def get_all_page_sizes(self, file_path: Path) -> Dict[int, Tuple[float, float]]:
    """
    Extract the dimensions of every page in the original source file.

    Images are treated as a single page (index 0); PDFs yield one entry
    per page, read from each page's MediaBox.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Dict mapping page index (0-based) to (width, height) in points.
        Empty dict if the file is missing, has an unsupported extension,
        or nothing could be extracted. If reading a PDF fails part-way,
        the pages collected before the failure are still returned.
    """
    page_sizes: Dict[int, Tuple[float, float]] = {}

    try:
        if not file_path.exists():
            logger.warning(f"File not found: {file_path}")
            return page_sizes

        suffix = file_path.suffix.lower()

        # For images, single page with dimensions from PIL
        if suffix in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff'):
            # Open via context manager so the underlying file handle is
            # released as soon as the size has been read.
            with Image.open(file_path) as img:
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
            page_sizes[0] = (width_pt, height_pt)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return page_sizes

        # For PDFs, extract dimensions for all pages using PyPDF2
        if suffix == '.pdf':
            try:
                from PyPDF2 import PdfReader
                reader = PdfReader(file_path)
                total_pages = len(reader.pages)

                for page_idx, page in enumerate(reader.pages):
                    # MediaBox gives [x1, y1, x2, y2] in points
                    mediabox = page.mediabox
                    page_sizes[page_idx] = (float(mediabox.width), float(mediabox.height))

                logger.info(f"Extracted dimensions from PDF: {total_pages} pages")
                for idx, (w, h) in page_sizes.items():
                    logger.debug(f"  Page {idx}: {w:.1f} x {h:.1f} points")

                return page_sizes

            except ImportError:
                logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
            except Exception as e:
                # Best-effort: keep whatever pages were read before the error
                logger.warning(f"Failed to extract PDF dimensions: {e}")

    except Exception as e:
        logger.warning(f"Failed to get page sizes from {file_path}: {e}")

    return page_sizes
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
"""
Extract page dimensions from original source file
Extract first page dimensions from original source file (backward compatibility)
Args:
file_path: Path to original file (image or PDF)
@@ -1079,41 +1236,9 @@ class PDFGeneratorService:
Returns:
Tuple of (width, height) in points or None
"""
try:
if not file_path.exists():
return None
# For images, get dimensions from PIL
if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
img = Image.open(file_path)
# Use pixel dimensions directly as points (1:1 mapping)
# This matches how PaddleOCR reports coordinates
width_pt = float(img.width)
height_pt = float(img.height)
logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
return (width_pt, height_pt)
# For PDFs, extract dimensions using PyPDF2
if file_path.suffix.lower() == '.pdf':
try:
from PyPDF2 import PdfReader
reader = PdfReader(file_path)
if len(reader.pages) > 0:
page = reader.pages[0]
# MediaBox gives [x1, y1, x2, y2] in points
mediabox = page.mediabox
width_pt = float(mediabox.width)
height_pt = float(mediabox.height)
logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
return (width_pt, height_pt)
except ImportError:
logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
except Exception as e:
logger.warning(f"Failed to extract PDF dimensions: {e}")
except Exception as e:
logger.warning(f"Failed to get page size from {file_path}: {e}")
page_sizes = self.get_all_page_sizes(file_path)
if 0 in page_sizes:
return page_sizes[0]
return None
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: