fix: correct OCR coordinate scaling by inferring dimensions from bbox
Critical Fix:
The previous implementation incorrectly calculated scale factors because
calculate_page_dimensions() was prioritizing source file dimensions over OCR
coordinate analysis, resulting in scale=1.0 when it should have been ~0.27.

Root Cause:
- PaddleOCR processes PDFs at high resolution (e.g., 2185x3500 pixels)
- OCR bbox coordinates are in this high-res space
- calculate_page_dimensions() was returning source PDF size (595x842) instead
- This caused scale_w=1.0, scale_h=1.0, placing all text out of bounds

Solution:
1. Rewrite calculate_page_dimensions() to:
   - Accept full ocr_data instead of just text_regions
   - Process both text_regions AND layout elements
   - Handle polygon bbox format [[x,y], ...] correctly
   - Infer OCR dimensions from max bbox coordinates FIRST
   - Only fall back to source file dimensions if inference fails
2. Separate OCR dimensions from target PDF dimensions:
   - ocr_width/height: Inferred from bbox (e.g., 2185x3280)
   - target_width/height: From source file (e.g., 595x842)
   - scale_w = target_width / ocr_width (e.g., 0.272)
   - scale_h = target_height / ocr_height (e.g., 0.257)
3. Add PyPDF2 support:
   - Extract dimensions from source PDF files
   - Required for getting target PDF size

Changes:
- backend/app/services/pdf_generator_service.py:
  - Fix calculate_page_dimensions() to infer from bbox first
  - Add PyPDF2 support in get_original_page_size()
  - Simplify scaling logic (removed ocr_dimensions dependency)
  - Update all drawing calls to use target_height instead of page_height
- requirements.txt:
  - Add PyPDF2>=3.0.0 for PDF dimension extraction
- backend/test_bbox_scaling.py:
  - Add comprehensive test for high-res OCR → A4 PDF scenario
  - Validates proper scale factor calculation (0.272 x 0.257)

Test Results:
✓ OCR dimensions correctly inferred: 2185.0 x 3280.0
✓ Target PDF dimensions extracted: 595.3 x 841.9
✓ Scale factors correct: X=0.272, Y=0.257

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -138,48 +138,70 @@ class PDFGeneratorService:
|
||||
logger.error(f"Failed to load JSON {json_path}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
|
||||
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
|
||||
"""
|
||||
Calculate page dimensions from source file or text region bounding boxes
|
||||
從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。
|
||||
這非常重要,因為 OCR 可能在高解析度影像上運行。
|
||||
|
||||
Args:
|
||||
text_regions: List of text regions with bbox coordinates
|
||||
source_file_path: Optional path to source file for accurate dimensions
|
||||
ocr_data: Complete OCR data dictionary with text_regions and layout
|
||||
source_file_path: Optional path to source file (fallback only)
|
||||
|
||||
Returns:
|
||||
Tuple of (width, height) in points
|
||||
"""
|
||||
# First try to get dimensions from source file
|
||||
if source_file_path:
|
||||
dims = self.get_original_page_size(source_file_path)
|
||||
if dims:
|
||||
return dims
|
||||
|
||||
if not text_regions:
|
||||
return A4 # Default to A4 size
|
||||
|
||||
max_x = 0
|
||||
max_y = 0
|
||||
|
||||
for region in text_regions:
|
||||
bbox = region.get('bbox', [])
|
||||
if not bbox or len(bbox) < 4:
|
||||
continue
|
||||
# 我們需要檢查所有可能的區域,以找到最大的座標
|
||||
text_regions = ocr_data.get('text_regions', [])
|
||||
layout_elements = ocr_data.get('layout_data', {}).get('elements', []) if ocr_data.get('layout_data') else []
|
||||
all_regions = text_regions + layout_elements
|
||||
|
||||
# bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
for point in bbox:
|
||||
if isinstance(point, (list, tuple)) and len(point) >= 2:
|
||||
x, y = point[0], point[1]
|
||||
max_x = max(max_x, x)
|
||||
max_y = max(max_y, y)
|
||||
if not all_regions:
|
||||
# 如果 JSON 為空,回退到原始檔案尺寸
|
||||
logger.warning("JSON 中沒有找到 text_regions 或 layout elements,回退到原始檔案尺寸。")
|
||||
if source_file_path:
|
||||
dims = self.get_original_page_size(source_file_path)
|
||||
if dims:
|
||||
return dims
|
||||
return A4
|
||||
|
||||
# OCR coordinates are in pixels, use them directly as points (1:1 mapping)
|
||||
# Do NOT add padding - this causes layout issues
|
||||
width = max_x if max_x > 0 else A4[0]
|
||||
height = max_y if max_y > 0 else A4[1]
|
||||
region_count = 0
|
||||
for region in all_regions:
|
||||
try:
|
||||
bbox = region.get('bbox')
|
||||
if not bbox:
|
||||
continue
|
||||
|
||||
logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
|
||||
return (width, height)
|
||||
region_count += 1
|
||||
|
||||
if isinstance(bbox[0], (int, float)):
|
||||
# 處理簡單的 [x1, y1, x2, y2] 格式
|
||||
max_x = max(max_x, bbox[2])
|
||||
max_y = max(max_y, bbox[3])
|
||||
else:
|
||||
# 處理多邊形 [[x, y], ...] 格式
|
||||
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||
if x_coords and y_coords:
|
||||
max_x = max(max_x, max(x_coords))
|
||||
max_y = max(max_y, max(y_coords))
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing bbox {bbox}: {e}")
|
||||
|
||||
if max_x > 0 and max_y > 0:
|
||||
logger.info(f"從 {region_count} 個區域中推斷出的 OCR 座標系尺寸: {max_x:.1f} x {max_y:.1f}")
|
||||
return (max_x, max_y)
|
||||
else:
|
||||
# 如果所有 bbox 都解析失敗,才回退
|
||||
logger.warning("無法從 bbox 推斷尺寸,回退到原始檔案尺寸。")
|
||||
if source_file_path:
|
||||
dims = self.get_original_page_size(source_file_path)
|
||||
if dims:
|
||||
return dims
|
||||
return A4
|
||||
|
||||
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
|
||||
"""
|
||||
@@ -205,8 +227,23 @@ class PDFGeneratorService:
|
||||
logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
|
||||
return (width_pt, height_pt)
|
||||
|
||||
# For PDFs, would need PyPDF2 or similar
|
||||
# For now, return None to use calculated dimensions
|
||||
# For PDFs, extract dimensions using PyPDF2
|
||||
if file_path.suffix.lower() == '.pdf':
|
||||
try:
|
||||
from PyPDF2 import PdfReader
|
||||
reader = PdfReader(file_path)
|
||||
if len(reader.pages) > 0:
|
||||
page = reader.pages[0]
|
||||
# MediaBox gives [x1, y1, x2, y2] in points
|
||||
mediabox = page.mediabox
|
||||
width_pt = float(mediabox.width)
|
||||
height_pt = float(mediabox.height)
|
||||
logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
|
||||
return (width_pt, height_pt)
|
||||
except ImportError:
|
||||
logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract PDF dimensions: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to get page size from {file_path}: {e}")
|
||||
@@ -541,38 +578,34 @@ class PDFGeneratorService:
|
||||
# Get layout data
|
||||
layout_data = ocr_data.get('layout_data', {})
|
||||
|
||||
# Get OCR dimensions (the dimensions of images as processed by OCR)
|
||||
ocr_dimensions = ocr_data.get('ocr_dimensions')
|
||||
# Step 1: Get OCR processing dimensions (the large image OCR actually used)
|
||||
# This comes from analyzing all bbox coordinates in the OCR data
|
||||
ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
|
||||
logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
|
||||
|
||||
# Determine page dimensions
|
||||
page_size = self.calculate_page_dimensions(text_regions, source_file_path)
|
||||
# Step 2: Get target PDF dimensions (usually the original file size)
|
||||
# This is what we want the final PDF size to be
|
||||
if source_file_path:
|
||||
target_dims = self.get_original_page_size(source_file_path)
|
||||
if target_dims:
|
||||
target_width, target_height = target_dims
|
||||
logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
|
||||
else:
|
||||
# If we can't get original size, use OCR dimensions as target
|
||||
target_width, target_height = ocr_width, ocr_height
|
||||
logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
|
||||
else:
|
||||
# No source file, use OCR dimensions as target (1:1 mapping)
|
||||
target_width, target_height = ocr_width, ocr_height
|
||||
logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
|
||||
|
||||
page_width, page_height = page_size
|
||||
# Step 3: Calculate scale factors to convert OCR coordinates to PDF coordinates
|
||||
scale_w = target_width / ocr_width
|
||||
scale_h = target_height / ocr_height
|
||||
logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f} (OCR座標 → PDF座標)")
|
||||
|
||||
# Calculate scale factors if OCR dimensions are available
|
||||
# Default to 1.0 if no OCR dimensions (backward compatibility)
|
||||
scale_w = 1.0
|
||||
scale_h = 1.0
|
||||
|
||||
if ocr_dimensions:
|
||||
# For single image
|
||||
if isinstance(ocr_dimensions, dict):
|
||||
ocr_width = ocr_dimensions.get('width', page_width)
|
||||
ocr_height = ocr_dimensions.get('height', page_height)
|
||||
scale_w = page_width / ocr_width
|
||||
scale_h = page_height / ocr_height
|
||||
logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})")
|
||||
# For multi-page PDF - we'll handle per-page scaling below
|
||||
elif isinstance(ocr_dimensions, list) and ocr_dimensions:
|
||||
# Use first page dimensions as default
|
||||
ocr_width = ocr_dimensions[0].get('width', page_width)
|
||||
ocr_height = ocr_dimensions[0].get('height', page_height)
|
||||
scale_w = page_width / ocr_width
|
||||
scale_h = page_height / ocr_height
|
||||
logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}")
|
||||
|
||||
# Create PDF canvas
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
|
||||
# Create PDF canvas with target dimensions
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# Extract table bboxes to exclude text in those regions
|
||||
table_bboxes = []
|
||||
@@ -628,29 +661,15 @@ class PDFGeneratorService:
|
||||
if page_num > 1:
|
||||
pdf_canvas.showPage() # Start new page
|
||||
|
||||
# Get scale factors for this page (for multi-page PDFs)
|
||||
page_scale_w = scale_w
|
||||
page_scale_h = scale_h
|
||||
if isinstance(ocr_dimensions, list) and ocr_dimensions:
|
||||
# Find dimensions for this specific page
|
||||
for dim_info in ocr_dimensions:
|
||||
if dim_info.get('page') == page_num:
|
||||
ocr_width = dim_info.get('width', page_width)
|
||||
ocr_height = dim_info.get('height', page_height)
|
||||
page_scale_w = page_width / ocr_width
|
||||
page_scale_h = page_height / ocr_height
|
||||
logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}")
|
||||
break
|
||||
|
||||
# Draw text regions for this page (excluding table text)
|
||||
page_regions = pages_data.get(page_num, [])
|
||||
for region in page_regions:
|
||||
self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h)
|
||||
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
|
||||
|
||||
# Draw tables for this page
|
||||
for table_elem in table_elements:
|
||||
if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h)
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
|
||||
|
||||
# Draw non-table images for this page (figure, chart, seal, etc.)
|
||||
for img_meta in images_metadata:
|
||||
@@ -661,10 +680,10 @@ class PDFGeneratorService:
|
||||
self.draw_image_region(
|
||||
pdf_canvas,
|
||||
img_meta,
|
||||
page_height,
|
||||
target_height,
|
||||
json_path.parent,
|
||||
page_scale_w,
|
||||
page_scale_h
|
||||
scale_w,
|
||||
scale_h
|
||||
)
|
||||
|
||||
# Save PDF
|
||||
|
||||
Reference in New Issue
Block a user