fix: add proper coordinate scaling from OCR space to PDF space

Problem:
- OCR processes images at smaller resolutions, but the coordinates were being used directly on larger PDF canvases
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner

Solution:
- Track OCR image dimensions in JSON output (ocr_dimensions)
- Calculate proper scale factors: scale_w = pdf_width/ocr_width, scale_h = pdf_height/ocr_height
- Apply scaling to all coordinates before drawing on PDF canvas
- Support per-page scaling for multi-page PDFs

Changes:
1. ocr_service.py:
   - Add OCR image dimensions capture using PIL
   - Include ocr_dimensions in JSON output for both single images and PDFs
2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before coordinate transformation
3. test_pdf_scaling.py:
   - Add test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 20:45:36 +08:00
parent fa1abcd8e6
commit d33f605bdb
3 changed files with 198 additions and 19 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -332,6 +332,7 @@ class OCRService:
                total_valid_regions = 0
                all_layout_data = []
                all_images_metadata = []
+                all_ocr_dimensions = []

                for page_num, page_image_path in enumerate(image_paths, 1):
                    logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
@@ -363,6 +364,14 @@ class OCRService:
                        if page_result.get('images_metadata'):
                            all_images_metadata.extend(page_result['images_metadata'])

+                        # Store OCR dimensions for each page
+                        if page_result.get('ocr_dimensions'):
+                            all_ocr_dimensions.append({
+                                'page': page_num,
+                                'width': page_result['ocr_dimensions']['width'],
+                                'height': page_result['ocr_dimensions']['height']
+                            })
+
                # Calculate overall average confidence
                avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0

@@ -407,11 +416,18 @@ class OCRService:
                    'processing_time': processing_time,
                    'timestamp': datetime.utcnow().isoformat(),
                    'total_pages': len(image_paths),
+                    'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
                }

            # Get OCR engine (for non-PDF images)
            ocr_engine = self.get_ocr_engine(lang)

+            # Get the actual image dimensions that OCR will use
+            from PIL import Image
+            with Image.open(image_path) as img:
+                ocr_width, ocr_height = img.size
+                logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")
+
            # Perform OCR
            logger.info(f"Processing image: {image_path.name}")
            # Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
@@ -480,6 +496,10 @@ class OCRService:
                'markdown_content': markdown_content,
                'processing_time': processing_time,
                'timestamp': datetime.utcnow().isoformat(),
+                'ocr_dimensions': {
+                    'width': ocr_width,
+                    'height': ocr_height
+                }
            }

            logger.info(