fix: add proper coordinate scaling from OCR space to PDF space

Problem:
- OCR processes images at a lower resolution than the target PDF page, but the OCR coordinates were being used directly on the larger PDF canvas
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner

Solution:
- Track OCR image dimensions in JSON output (ocr_dimensions)
- Calculate proper scale factors: scale_w = pdf_width/ocr_width, scale_h = pdf_height/ocr_height
- Apply scaling to all coordinates before drawing on PDF canvas
- Support per-page scaling for multi-page PDFs

Changes:
1. ocr_service.py:
   - Add OCR image dimensions capture using PIL
   - Include ocr_dimensions in JSON output for both single images and PDFs

2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before the OCR-to-PDF Y-axis flip (page_height - y)

3. test_pdf_scaling.py:
   - Add test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-17 20:45:36 +08:00
parent fa1abcd8e6
commit d33f605bdb
3 changed files with 198 additions and 19 deletions

View File

@@ -332,6 +332,7 @@ class OCRService:
total_valid_regions = 0
all_layout_data = []
all_images_metadata = []
all_ocr_dimensions = []
for page_num, page_image_path in enumerate(image_paths, 1):
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
@@ -363,6 +364,14 @@ class OCRService:
if page_result.get('images_metadata'):
all_images_metadata.extend(page_result['images_metadata'])
# Store OCR dimensions for each page
if page_result.get('ocr_dimensions'):
all_ocr_dimensions.append({
'page': page_num,
'width': page_result['ocr_dimensions']['width'],
'height': page_result['ocr_dimensions']['height']
})
# Calculate overall average confidence
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
@@ -407,11 +416,18 @@ class OCRService:
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
'total_pages': len(image_paths),
'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
}
# Get OCR engine (for non-PDF images)
ocr_engine = self.get_ocr_engine(lang)
# Get the actual image dimensions that OCR will use
from PIL import Image
with Image.open(image_path) as img:
ocr_width, ocr_height = img.size
logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")
# Perform OCR
logger.info(f"Processing image: {image_path.name}")
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
@@ -480,6 +496,10 @@ class OCRService:
'markdown_content': markdown_content,
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
'ocr_dimensions': {
'width': ocr_width,
'height': ocr_height
}
}
logger.info(

View File

@@ -217,7 +217,9 @@ class PDFGeneratorService:
self,
pdf_canvas: canvas.Canvas,
region: Dict,
page_height: float
page_height: float,
scale_w: float = 1.0,
scale_h: float = 1.0
):
"""
Draw a text region at precise coordinates
@@ -226,6 +228,8 @@ class PDFGeneratorService:
pdf_canvas: ReportLab canvas object
region: Text region dict with text, bbox, confidence
page_height: Height of page (for coordinate transformation)
scale_w: Scale factor for X coordinates (PDF width / OCR width)
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
"""
text = region.get('text', '')
bbox = region.get('bbox', [])
@@ -243,7 +247,13 @@ class PDFGeneratorService:
ocr_x_right = bbox[2][0] # Right X
ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates
# Calculate bbox dimensions
# Apply scale factors to convert from OCR space to PDF space
ocr_x_left = ocr_x_left * scale_w
ocr_y_top = ocr_y_top * scale_h
ocr_x_right = ocr_x_right * scale_w
ocr_y_bottom = ocr_y_bottom * scale_h
# Calculate bbox dimensions (after scaling)
bbox_width = abs(ocr_x_right - ocr_x_left)
bbox_height = abs(ocr_y_bottom - ocr_y_top)
@@ -279,8 +289,8 @@ class PDFGeneratorService:
if settings.pdf_enable_bbox_debug:
pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent
pdf_canvas.setLineWidth(0.5)
# Transform all bbox points to PDF coordinates
pdf_points = [(p[0], page_height - p[1]) for p in bbox]
# Transform all bbox points to PDF coordinates (apply scaling first)
pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
# Draw bbox rectangle
for i in range(4):
x1, y1 = pdf_points[i]
@@ -295,7 +305,9 @@ class PDFGeneratorService:
pdf_canvas: canvas.Canvas,
table_element: Dict,
images_metadata: List[Dict],
page_height: float
page_height: float,
scale_w: float = 1.0,
scale_h: float = 1.0
):
"""
Draw a table region by parsing HTML and rebuilding with ReportLab Table
@@ -305,6 +317,8 @@ class PDFGeneratorService:
table_element: Table element dict with HTML content
images_metadata: List of image metadata to find table bbox
page_height: Height of page
scale_w: Scale factor for X coordinates (PDF width / OCR width)
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
"""
try:
html_content = table_element.get('content', '')
@@ -340,11 +354,11 @@ class PDFGeneratorService:
logger.warning("No bbox found for table")
return
# Extract bbox coordinates
ocr_x_left = table_bbox[0][0]
ocr_y_top = table_bbox[0][1]
ocr_x_right = table_bbox[2][0]
ocr_y_bottom = table_bbox[2][1]
# Extract bbox coordinates and apply scaling
ocr_x_left = table_bbox[0][0] * scale_w
ocr_y_top = table_bbox[0][1] * scale_h
ocr_x_right = table_bbox[2][0] * scale_w
ocr_y_bottom = table_bbox[2][1] * scale_h
table_width = abs(ocr_x_right - ocr_x_left)
table_height = abs(ocr_y_bottom - ocr_y_top)
@@ -416,7 +430,9 @@ class PDFGeneratorService:
pdf_canvas: canvas.Canvas,
region: Dict,
page_height: float,
result_dir: Path
result_dir: Path,
scale_w: float = 1.0,
scale_h: float = 1.0
):
"""
Draw an image region by embedding the extracted image
@@ -428,6 +444,8 @@ class PDFGeneratorService:
region: Image metadata dict with image_path and bbox
page_height: Height of page (for coordinate transformation)
result_dir: Directory containing result files
scale_w: Scale factor for X coordinates (PDF width / OCR width)
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
"""
try:
image_path_str = region.get('image_path', '')
@@ -450,12 +468,12 @@ class PDFGeneratorService:
# bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
# OCR coordinates: origin (0,0) at top-left, Y increases downward
ocr_x_left = bbox[0][0]
ocr_y_top = bbox[0][1]
ocr_x_right = bbox[2][0]
ocr_y_bottom = bbox[2][1]
ocr_x_left = bbox[0][0] * scale_w
ocr_y_top = bbox[0][1] * scale_h
ocr_x_right = bbox[2][0] * scale_w
ocr_y_bottom = bbox[2][1] * scale_h
# Calculate bbox dimensions
# Calculate bbox dimensions (after scaling)
bbox_width = abs(ocr_x_right - ocr_x_left)
bbox_height = abs(ocr_y_bottom - ocr_y_top)
@@ -523,11 +541,36 @@ class PDFGeneratorService:
# Get layout data
layout_data = ocr_data.get('layout_data', {})
# Get OCR dimensions (the dimensions of images as processed by OCR)
ocr_dimensions = ocr_data.get('ocr_dimensions')
# Determine page dimensions
page_size = self.calculate_page_dimensions(text_regions, source_file_path)
page_width, page_height = page_size
# Calculate scale factors if OCR dimensions are available
# Default to 1.0 if no OCR dimensions (backward compatibility)
scale_w = 1.0
scale_h = 1.0
if ocr_dimensions:
# For single image
if isinstance(ocr_dimensions, dict):
ocr_width = ocr_dimensions.get('width', page_width)
ocr_height = ocr_dimensions.get('height', page_height)
scale_w = page_width / ocr_width
scale_h = page_height / ocr_height
logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})")
# For multi-page PDF - we'll handle per-page scaling below
elif isinstance(ocr_dimensions, list) and ocr_dimensions:
# Use first page dimensions as default
ocr_width = ocr_dimensions[0].get('width', page_width)
ocr_height = ocr_dimensions[0].get('height', page_height)
scale_w = page_width / ocr_width
scale_h = page_height / ocr_height
logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}")
# Create PDF canvas
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
@@ -585,15 +628,29 @@ class PDFGeneratorService:
if page_num > 1:
pdf_canvas.showPage() # Start new page
# Get scale factors for this page (for multi-page PDFs)
page_scale_w = scale_w
page_scale_h = scale_h
if isinstance(ocr_dimensions, list) and ocr_dimensions:
# Find dimensions for this specific page
for dim_info in ocr_dimensions:
if dim_info.get('page') == page_num:
ocr_width = dim_info.get('width', page_width)
ocr_height = dim_info.get('height', page_height)
page_scale_w = page_width / ocr_width
page_scale_h = page_height / ocr_height
logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}")
break
# Draw text regions for this page (excluding table text)
page_regions = pages_data.get(page_num, [])
for region in page_regions:
self.draw_text_region(pdf_canvas, region, page_height)
self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h)
# Draw tables for this page
for table_elem in table_elements:
if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed
self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)
self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h)
# Draw non-table images for this page (figure, chart, seal, etc.)
for img_meta in images_metadata:
@@ -605,7 +662,9 @@ class PDFGeneratorService:
pdf_canvas,
img_meta,
page_height,
json_path.parent
json_path.parent,
page_scale_w,
page_scale_h
)
# Save PDF