fix: add proper coordinate scaling from OCR space to PDF space
Problem:
- OCR processes images at smaller resolutions, but the coordinates were being used directly on larger PDF canvases
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner

Solution:
- Track OCR image dimensions in JSON output (ocr_dimensions)
- Calculate proper scale factors: scale_w = pdf_width / ocr_width, scale_h = pdf_height / ocr_height
- Apply scaling to all coordinates before drawing on PDF canvas
- Support per-page scaling for multi-page PDFs

Changes:
1. ocr_service.py:
   - Add OCR image dimensions capture using PIL
   - Include ocr_dimensions in JSON output for both single images and PDFs
2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before coordinate transformation
3. test_pdf_scaling.py:
   - Add test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -332,6 +332,7 @@ class OCRService:
|
||||
total_valid_regions = 0
|
||||
all_layout_data = []
|
||||
all_images_metadata = []
|
||||
all_ocr_dimensions = []
|
||||
|
||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||
@@ -363,6 +364,14 @@ class OCRService:
|
||||
if page_result.get('images_metadata'):
|
||||
all_images_metadata.extend(page_result['images_metadata'])
|
||||
|
||||
# Store OCR dimensions for each page
|
||||
if page_result.get('ocr_dimensions'):
|
||||
all_ocr_dimensions.append({
|
||||
'page': page_num,
|
||||
'width': page_result['ocr_dimensions']['width'],
|
||||
'height': page_result['ocr_dimensions']['height']
|
||||
})
|
||||
|
||||
# Calculate overall average confidence
|
||||
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
||||
|
||||
@@ -407,11 +416,18 @@ class OCRService:
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'total_pages': len(image_paths),
|
||||
'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
|
||||
}
|
||||
|
||||
# Get OCR engine (for non-PDF images)
|
||||
ocr_engine = self.get_ocr_engine(lang)
|
||||
|
||||
# Get the actual image dimensions that OCR will use
|
||||
from PIL import Image
|
||||
with Image.open(image_path) as img:
|
||||
ocr_width, ocr_height = img.size
|
||||
logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")
|
||||
|
||||
# Perform OCR
|
||||
logger.info(f"Processing image: {image_path.name}")
|
||||
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
||||
@@ -480,6 +496,10 @@ class OCRService:
|
||||
'markdown_content': markdown_content,
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'ocr_dimensions': {
|
||||
'width': ocr_width,
|
||||
'height': ocr_height
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user