fix: add proper coordinate scaling from OCR space to PDF space
Problem:
- OCR processes images at smaller resolutions, but its coordinates were being used directly on larger PDF canvases.
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner.

Solution:
- Track OCR image dimensions in the JSON output (ocr_dimensions).
- Calculate proper scale factors: scale_w = pdf_width / ocr_width, scale_h = pdf_height / ocr_height.
- Apply scaling to all coordinates before drawing on the PDF canvas.
- Support per-page scaling for multi-page PDFs.

Changes:
1. ocr_service.py:
   - Add OCR image dimensions capture using PIL
   - Include ocr_dimensions in JSON output for both single images and PDFs
2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before coordinate transformation
3. test_pdf_scaling.py:
   - Add test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -332,6 +332,7 @@ class OCRService:
|
|||||||
total_valid_regions = 0
|
total_valid_regions = 0
|
||||||
all_layout_data = []
|
all_layout_data = []
|
||||||
all_images_metadata = []
|
all_images_metadata = []
|
||||||
|
all_ocr_dimensions = []
|
||||||
|
|
||||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||||
@@ -363,6 +364,14 @@ class OCRService:
|
|||||||
if page_result.get('images_metadata'):
|
if page_result.get('images_metadata'):
|
||||||
all_images_metadata.extend(page_result['images_metadata'])
|
all_images_metadata.extend(page_result['images_metadata'])
|
||||||
|
|
||||||
|
# Store OCR dimensions for each page
|
||||||
|
if page_result.get('ocr_dimensions'):
|
||||||
|
all_ocr_dimensions.append({
|
||||||
|
'page': page_num,
|
||||||
|
'width': page_result['ocr_dimensions']['width'],
|
||||||
|
'height': page_result['ocr_dimensions']['height']
|
||||||
|
})
|
||||||
|
|
||||||
# Calculate overall average confidence
|
# Calculate overall average confidence
|
||||||
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
||||||
|
|
||||||
@@ -407,11 +416,18 @@ class OCRService:
|
|||||||
'processing_time': processing_time,
|
'processing_time': processing_time,
|
||||||
'timestamp': datetime.utcnow().isoformat(),
|
'timestamp': datetime.utcnow().isoformat(),
|
||||||
'total_pages': len(image_paths),
|
'total_pages': len(image_paths),
|
||||||
|
'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get OCR engine (for non-PDF images)
|
# Get OCR engine (for non-PDF images)
|
||||||
ocr_engine = self.get_ocr_engine(lang)
|
ocr_engine = self.get_ocr_engine(lang)
|
||||||
|
|
||||||
|
# Get the actual image dimensions that OCR will use
|
||||||
|
from PIL import Image
|
||||||
|
with Image.open(image_path) as img:
|
||||||
|
ocr_width, ocr_height = img.size
|
||||||
|
logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")
|
||||||
|
|
||||||
# Perform OCR
|
# Perform OCR
|
||||||
logger.info(f"Processing image: {image_path.name}")
|
logger.info(f"Processing image: {image_path.name}")
|
||||||
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
||||||
@@ -480,6 +496,10 @@ class OCRService:
|
|||||||
'markdown_content': markdown_content,
|
'markdown_content': markdown_content,
|
||||||
'processing_time': processing_time,
|
'processing_time': processing_time,
|
||||||
'timestamp': datetime.utcnow().isoformat(),
|
'timestamp': datetime.utcnow().isoformat(),
|
||||||
|
'ocr_dimensions': {
|
||||||
|
'width': ocr_width,
|
||||||
|
'height': ocr_height
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -217,7 +217,9 @@ class PDFGeneratorService:
|
|||||||
self,
|
self,
|
||||||
pdf_canvas: canvas.Canvas,
|
pdf_canvas: canvas.Canvas,
|
||||||
region: Dict,
|
region: Dict,
|
||||||
page_height: float
|
page_height: float,
|
||||||
|
scale_w: float = 1.0,
|
||||||
|
scale_h: float = 1.0
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Draw a text region at precise coordinates
|
Draw a text region at precise coordinates
|
||||||
@@ -226,6 +228,8 @@ class PDFGeneratorService:
|
|||||||
pdf_canvas: ReportLab canvas object
|
pdf_canvas: ReportLab canvas object
|
||||||
region: Text region dict with text, bbox, confidence
|
region: Text region dict with text, bbox, confidence
|
||||||
page_height: Height of page (for coordinate transformation)
|
page_height: Height of page (for coordinate transformation)
|
||||||
|
scale_w: Scale factor for X coordinates (PDF width / OCR width)
|
||||||
|
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
|
||||||
"""
|
"""
|
||||||
text = region.get('text', '')
|
text = region.get('text', '')
|
||||||
bbox = region.get('bbox', [])
|
bbox = region.get('bbox', [])
|
||||||
@@ -243,7 +247,13 @@ class PDFGeneratorService:
|
|||||||
ocr_x_right = bbox[2][0] # Right X
|
ocr_x_right = bbox[2][0] # Right X
|
||||||
ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates
|
ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates
|
||||||
|
|
||||||
# Calculate bbox dimensions
|
# Apply scale factors to convert from OCR space to PDF space
|
||||||
|
ocr_x_left = ocr_x_left * scale_w
|
||||||
|
ocr_y_top = ocr_y_top * scale_h
|
||||||
|
ocr_x_right = ocr_x_right * scale_w
|
||||||
|
ocr_y_bottom = ocr_y_bottom * scale_h
|
||||||
|
|
||||||
|
# Calculate bbox dimensions (after scaling)
|
||||||
bbox_width = abs(ocr_x_right - ocr_x_left)
|
bbox_width = abs(ocr_x_right - ocr_x_left)
|
||||||
bbox_height = abs(ocr_y_bottom - ocr_y_top)
|
bbox_height = abs(ocr_y_bottom - ocr_y_top)
|
||||||
|
|
||||||
@@ -279,8 +289,8 @@ class PDFGeneratorService:
|
|||||||
if settings.pdf_enable_bbox_debug:
|
if settings.pdf_enable_bbox_debug:
|
||||||
pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent
|
pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent
|
||||||
pdf_canvas.setLineWidth(0.5)
|
pdf_canvas.setLineWidth(0.5)
|
||||||
# Transform all bbox points to PDF coordinates
|
# Transform all bbox points to PDF coordinates (apply scaling first)
|
||||||
pdf_points = [(p[0], page_height - p[1]) for p in bbox]
|
pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
|
||||||
# Draw bbox rectangle
|
# Draw bbox rectangle
|
||||||
for i in range(4):
|
for i in range(4):
|
||||||
x1, y1 = pdf_points[i]
|
x1, y1 = pdf_points[i]
|
||||||
@@ -295,7 +305,9 @@ class PDFGeneratorService:
|
|||||||
pdf_canvas: canvas.Canvas,
|
pdf_canvas: canvas.Canvas,
|
||||||
table_element: Dict,
|
table_element: Dict,
|
||||||
images_metadata: List[Dict],
|
images_metadata: List[Dict],
|
||||||
page_height: float
|
page_height: float,
|
||||||
|
scale_w: float = 1.0,
|
||||||
|
scale_h: float = 1.0
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Draw a table region by parsing HTML and rebuilding with ReportLab Table
|
Draw a table region by parsing HTML and rebuilding with ReportLab Table
|
||||||
@@ -305,6 +317,8 @@ class PDFGeneratorService:
|
|||||||
table_element: Table element dict with HTML content
|
table_element: Table element dict with HTML content
|
||||||
images_metadata: List of image metadata to find table bbox
|
images_metadata: List of image metadata to find table bbox
|
||||||
page_height: Height of page
|
page_height: Height of page
|
||||||
|
scale_w: Scale factor for X coordinates (PDF width / OCR width)
|
||||||
|
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
html_content = table_element.get('content', '')
|
html_content = table_element.get('content', '')
|
||||||
@@ -340,11 +354,11 @@ class PDFGeneratorService:
|
|||||||
logger.warning("No bbox found for table")
|
logger.warning("No bbox found for table")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Extract bbox coordinates
|
# Extract bbox coordinates and apply scaling
|
||||||
ocr_x_left = table_bbox[0][0]
|
ocr_x_left = table_bbox[0][0] * scale_w
|
||||||
ocr_y_top = table_bbox[0][1]
|
ocr_y_top = table_bbox[0][1] * scale_h
|
||||||
ocr_x_right = table_bbox[2][0]
|
ocr_x_right = table_bbox[2][0] * scale_w
|
||||||
ocr_y_bottom = table_bbox[2][1]
|
ocr_y_bottom = table_bbox[2][1] * scale_h
|
||||||
|
|
||||||
table_width = abs(ocr_x_right - ocr_x_left)
|
table_width = abs(ocr_x_right - ocr_x_left)
|
||||||
table_height = abs(ocr_y_bottom - ocr_y_top)
|
table_height = abs(ocr_y_bottom - ocr_y_top)
|
||||||
@@ -416,7 +430,9 @@ class PDFGeneratorService:
|
|||||||
pdf_canvas: canvas.Canvas,
|
pdf_canvas: canvas.Canvas,
|
||||||
region: Dict,
|
region: Dict,
|
||||||
page_height: float,
|
page_height: float,
|
||||||
result_dir: Path
|
result_dir: Path,
|
||||||
|
scale_w: float = 1.0,
|
||||||
|
scale_h: float = 1.0
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Draw an image region by embedding the extracted image
|
Draw an image region by embedding the extracted image
|
||||||
@@ -428,6 +444,8 @@ class PDFGeneratorService:
|
|||||||
region: Image metadata dict with image_path and bbox
|
region: Image metadata dict with image_path and bbox
|
||||||
page_height: Height of page (for coordinate transformation)
|
page_height: Height of page (for coordinate transformation)
|
||||||
result_dir: Directory containing result files
|
result_dir: Directory containing result files
|
||||||
|
scale_w: Scale factor for X coordinates (PDF width / OCR width)
|
||||||
|
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
image_path_str = region.get('image_path', '')
|
image_path_str = region.get('image_path', '')
|
||||||
@@ -450,12 +468,12 @@ class PDFGeneratorService:
|
|||||||
|
|
||||||
# bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
# bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||||
# OCR coordinates: origin (0,0) at top-left, Y increases downward
|
# OCR coordinates: origin (0,0) at top-left, Y increases downward
|
||||||
ocr_x_left = bbox[0][0]
|
ocr_x_left = bbox[0][0] * scale_w
|
||||||
ocr_y_top = bbox[0][1]
|
ocr_y_top = bbox[0][1] * scale_h
|
||||||
ocr_x_right = bbox[2][0]
|
ocr_x_right = bbox[2][0] * scale_w
|
||||||
ocr_y_bottom = bbox[2][1]
|
ocr_y_bottom = bbox[2][1] * scale_h
|
||||||
|
|
||||||
# Calculate bbox dimensions
|
# Calculate bbox dimensions (after scaling)
|
||||||
bbox_width = abs(ocr_x_right - ocr_x_left)
|
bbox_width = abs(ocr_x_right - ocr_x_left)
|
||||||
bbox_height = abs(ocr_y_bottom - ocr_y_top)
|
bbox_height = abs(ocr_y_bottom - ocr_y_top)
|
||||||
|
|
||||||
@@ -523,11 +541,36 @@ class PDFGeneratorService:
|
|||||||
# Get layout data
|
# Get layout data
|
||||||
layout_data = ocr_data.get('layout_data', {})
|
layout_data = ocr_data.get('layout_data', {})
|
||||||
|
|
||||||
|
# Get OCR dimensions (the dimensions of images as processed by OCR)
|
||||||
|
ocr_dimensions = ocr_data.get('ocr_dimensions')
|
||||||
|
|
||||||
# Determine page dimensions
|
# Determine page dimensions
|
||||||
page_size = self.calculate_page_dimensions(text_regions, source_file_path)
|
page_size = self.calculate_page_dimensions(text_regions, source_file_path)
|
||||||
|
|
||||||
page_width, page_height = page_size
|
page_width, page_height = page_size
|
||||||
|
|
||||||
|
# Calculate scale factors if OCR dimensions are available
|
||||||
|
# Default to 1.0 if no OCR dimensions (backward compatibility)
|
||||||
|
scale_w = 1.0
|
||||||
|
scale_h = 1.0
|
||||||
|
|
||||||
|
if ocr_dimensions:
|
||||||
|
# For single image
|
||||||
|
if isinstance(ocr_dimensions, dict):
|
||||||
|
ocr_width = ocr_dimensions.get('width', page_width)
|
||||||
|
ocr_height = ocr_dimensions.get('height', page_height)
|
||||||
|
scale_w = page_width / ocr_width
|
||||||
|
scale_h = page_height / ocr_height
|
||||||
|
logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})")
|
||||||
|
# For multi-page PDF - we'll handle per-page scaling below
|
||||||
|
elif isinstance(ocr_dimensions, list) and ocr_dimensions:
|
||||||
|
# Use first page dimensions as default
|
||||||
|
ocr_width = ocr_dimensions[0].get('width', page_width)
|
||||||
|
ocr_height = ocr_dimensions[0].get('height', page_height)
|
||||||
|
scale_w = page_width / ocr_width
|
||||||
|
scale_h = page_height / ocr_height
|
||||||
|
logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}")
|
||||||
|
|
||||||
# Create PDF canvas
|
# Create PDF canvas
|
||||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
|
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
|
||||||
|
|
||||||
@@ -585,15 +628,29 @@ class PDFGeneratorService:
|
|||||||
if page_num > 1:
|
if page_num > 1:
|
||||||
pdf_canvas.showPage() # Start new page
|
pdf_canvas.showPage() # Start new page
|
||||||
|
|
||||||
|
# Get scale factors for this page (for multi-page PDFs)
|
||||||
|
page_scale_w = scale_w
|
||||||
|
page_scale_h = scale_h
|
||||||
|
if isinstance(ocr_dimensions, list) and ocr_dimensions:
|
||||||
|
# Find dimensions for this specific page
|
||||||
|
for dim_info in ocr_dimensions:
|
||||||
|
if dim_info.get('page') == page_num:
|
||||||
|
ocr_width = dim_info.get('width', page_width)
|
||||||
|
ocr_height = dim_info.get('height', page_height)
|
||||||
|
page_scale_w = page_width / ocr_width
|
||||||
|
page_scale_h = page_height / ocr_height
|
||||||
|
logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}")
|
||||||
|
break
|
||||||
|
|
||||||
# Draw text regions for this page (excluding table text)
|
# Draw text regions for this page (excluding table text)
|
||||||
page_regions = pages_data.get(page_num, [])
|
page_regions = pages_data.get(page_num, [])
|
||||||
for region in page_regions:
|
for region in page_regions:
|
||||||
self.draw_text_region(pdf_canvas, region, page_height)
|
self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h)
|
||||||
|
|
||||||
# Draw tables for this page
|
# Draw tables for this page
|
||||||
for table_elem in table_elements:
|
for table_elem in table_elements:
|
||||||
if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed
|
if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed
|
||||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)
|
self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h)
|
||||||
|
|
||||||
# Draw non-table images for this page (figure, chart, seal, etc.)
|
# Draw non-table images for this page (figure, chart, seal, etc.)
|
||||||
for img_meta in images_metadata:
|
for img_meta in images_metadata:
|
||||||
@@ -605,7 +662,9 @@ class PDFGeneratorService:
|
|||||||
pdf_canvas,
|
pdf_canvas,
|
||||||
img_meta,
|
img_meta,
|
||||||
page_height,
|
page_height,
|
||||||
json_path.parent
|
json_path.parent,
|
||||||
|
page_scale_w,
|
||||||
|
page_scale_h
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save PDF
|
# Save PDF
|
||||||
|
|||||||
100
backend/test_pdf_scaling.py
Normal file
100
backend/test_pdf_scaling.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Test script for PDF generation with proper scaling
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from app.services.pdf_generator_service import pdf_generator_service
|
||||||
|
|
||||||
|
def test_pdf_generation():
    """Build mock OCR output, render it to a PDF, and report the outcome.

    The mock OCR result declares a 500x700 OCR image, while the source image
    written to disk is 1000x1400, so the expected scale factor printed at the
    end is 2.00 on both axes.

    Returns:
        Whatever ``pdf_generator_service.generate_layout_pdf`` returned.
    """
    import logging

    # Every artefact (JSON, source image, PDF) is written under one directory.
    out_dir = Path("test_output")
    out_dir.mkdir(exist_ok=True)

    # Two text regions expressed in OCR-space pixel coordinates.
    regions = [
        {
            "text": "測試文字 Test Text",
            "bbox": [[50, 100], [250, 100], [250, 150], [50, 150]],
            "confidence": 0.95,
        },
        {
            "text": "第二行文字 Second line",
            "bbox": [[50, 200], [300, 200], [300, 250], [50, 250]],
            "confidence": 0.92,
        },
    ]

    # Mock OCR result; ocr_dimensions is the size the OCR step "saw".
    ocr_payload = {
        "status": "success",
        "file_name": "test_image.jpg",
        "language": "ch",
        "ocr_dimensions": {
            "width": 500,   # OCR processed at 500px wide
            "height": 700,  # OCR processed at 700px tall
        },
        "text_regions": regions,
        "total_text_regions": 2,
        "average_confidence": 0.935,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "# Test Document\n\n測試文字 Test Text\n\n第二行文字 Second line",
        "processing_time": 1.5,
        "timestamp": "2025-11-17T00:00:00",
    }

    # Persist the mock result where the generator expects to read it.
    json_path = out_dir / "test_ocr_result.json"
    json_path.write_text(
        json.dumps(ocr_payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"Created test JSON at: {json_path}")

    pdf_path = out_dir / "test_output.pdf"

    # A blank 1000x1400 image stands in as the source file handed to the generator.
    from PIL import Image
    source_image = out_dir / "test_source.jpg"
    Image.new('RGB', (1000, 1400), color='white').save(source_image)
    print(f"Created test source image: {source_image} (1000x1400)")

    print("\nGenerating PDF with scaling...")

    # Surface the generator's INFO logs (scale factors) on the console.
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    ok = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_image,
    )

    if not ok:
        print("✗ PDF generation failed")
        return ok

    print(f"✓ PDF generated successfully: {pdf_path}")
    print(f" Expected scale factors: X={1000/500:.2f}, Y={1400/700:.2f}")
    print(" Text should now be properly scaled and positioned!")
    return ok
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the scaling check, print a summary, and exit
    # with a status code that reflects the result.
    import sys

    # NOTE(review): the module-level ``from app.services...`` import has
    # already executed by the time this line runs, so this path tweak can only
    # affect imports performed later - confirm it is still needed.
    sys.path.insert(0, str(Path(__file__).parent))

    banner = "=" * 60
    print("Testing PDF generation with proper scaling...")
    print(banner)

    success = test_pdf_generation()

    print("\n" + banner)
    if success:
        print("✓ Test completed successfully!")
        print("Check test_output/test_output.pdf to verify scaling")
    else:
        print("✗ Test failed")

    # Propagate the outcome through the exit status so shells and CI can
    # detect a failed run instead of having to parse stdout.
    sys.exit(0 if success else 1)
|
||||||
Reference in New Issue
Block a user