diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index cd86251..7cefec5 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -332,6 +332,7 @@ class OCRService: total_valid_regions = 0 all_layout_data = [] all_images_metadata = [] + all_ocr_dimensions = [] for page_num, page_image_path in enumerate(image_paths, 1): logger.info(f"Processing PDF page {page_num}/{len(image_paths)}") @@ -363,6 +364,14 @@ class OCRService: if page_result.get('images_metadata'): all_images_metadata.extend(page_result['images_metadata']) + # Store OCR dimensions for each page + if page_result.get('ocr_dimensions'): + all_ocr_dimensions.append({ + 'page': page_num, + 'width': page_result['ocr_dimensions']['width'], + 'height': page_result['ocr_dimensions']['height'] + }) + # Calculate overall average confidence avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0 @@ -407,11 +416,18 @@ class OCRService: 'processing_time': processing_time, 'timestamp': datetime.utcnow().isoformat(), 'total_pages': len(image_paths), + 'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None, } # Get OCR engine (for non-PDF images) ocr_engine = self.get_ocr_engine(lang) + # Get the actual image dimensions that OCR will use + from PIL import Image + with Image.open(image_path) as img: + ocr_width, ocr_height = img.size + logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}") + # Perform OCR logger.info(f"Processing image: {image_path.name}") # Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call @@ -480,6 +496,10 @@ class OCRService: 'markdown_content': markdown_content, 'processing_time': processing_time, 'timestamp': datetime.utcnow().isoformat(), + 'ocr_dimensions': { + 'width': ocr_width, + 'height': ocr_height + } } logger.info( diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 35f1f9f..7eef951 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -217,7 +217,9 @@ class PDFGeneratorService: self, pdf_canvas: canvas.Canvas, region: Dict, - page_height: float + page_height: float, + scale_w: float = 1.0, + scale_h: float = 1.0 ): """ Draw a text region at precise coordinates @@ -226,6 +228,8 @@ class PDFGeneratorService: pdf_canvas: ReportLab canvas object region: Text region dict with text, bbox, confidence page_height: Height of page (for coordinate transformation) + scale_w: Scale factor for X coordinates (PDF width / OCR width) + scale_h: Scale factor for Y coordinates (PDF height / OCR height) """ text = region.get('text', '') bbox = region.get('bbox', []) @@ -243,7 +247,13 @@ class PDFGeneratorService: ocr_x_right = bbox[2][0] # Right X ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates - # Calculate bbox dimensions + # Apply scale factors to convert from OCR space to PDF space + ocr_x_left = ocr_x_left * scale_w + ocr_y_top = ocr_y_top * scale_h + ocr_x_right = ocr_x_right * scale_w + ocr_y_bottom = ocr_y_bottom * scale_h + + # Calculate bbox dimensions (after scaling) bbox_width = abs(ocr_x_right - ocr_x_left) bbox_height = abs(ocr_y_bottom - ocr_y_top) @@ -279,8 +289,8 @@ class PDFGeneratorService: if settings.pdf_enable_bbox_debug: pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent pdf_canvas.setLineWidth(0.5) - # Transform all bbox points to PDF coordinates - pdf_points = [(p[0], page_height - p[1]) for p in bbox] + # Transform all bbox points to PDF coordinates (apply scaling first) + pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox] # Draw bbox rectangle for i in range(4): x1, y1 = pdf_points[i] @@ -295,7 +305,9 @@ class PDFGeneratorService: pdf_canvas: canvas.Canvas, table_element: Dict, images_metadata: List[Dict], - page_height: float + page_height: float, + scale_w: float = 1.0, + scale_h: float = 1.0 ): """ Draw a table region by parsing HTML and rebuilding with ReportLab Table @@ -305,6 +317,8 @@ class PDFGeneratorService: table_element: Table element dict with HTML content images_metadata: List of image metadata to find table bbox page_height: Height of page + scale_w: Scale factor for X coordinates (PDF width / OCR width) + scale_h: Scale factor for Y coordinates (PDF height / OCR height) """ try: html_content = table_element.get('content', '') @@ -340,11 +354,11 @@ class PDFGeneratorService: logger.warning("No bbox found for table") return - # Extract bbox coordinates - ocr_x_left = table_bbox[0][0] - ocr_y_top = table_bbox[0][1] - ocr_x_right = table_bbox[2][0] - ocr_y_bottom = table_bbox[2][1] + # Extract bbox coordinates and apply scaling + ocr_x_left = table_bbox[0][0] * scale_w + ocr_y_top = table_bbox[0][1] * scale_h + ocr_x_right = table_bbox[2][0] * scale_w + ocr_y_bottom = table_bbox[2][1] * scale_h table_width = abs(ocr_x_right - ocr_x_left) table_height = abs(ocr_y_bottom - ocr_y_top) @@ -416,7 +430,9 @@ class PDFGeneratorService: pdf_canvas: canvas.Canvas, region: Dict, page_height: float, - result_dir: Path + result_dir: Path, + scale_w: float = 1.0, + scale_h: float = 1.0 ): """ Draw an image region by embedding the extracted image @@ -428,6 +444,8 @@ class PDFGeneratorService: region: Image metadata dict with image_path and bbox page_height: Height of page (for coordinate transformation) result_dir: Directory containing result files + scale_w: Scale factor for X coordinates (PDF width / OCR width) + scale_h: Scale factor for Y coordinates (PDF height / OCR height) """ try: image_path_str = region.get('image_path', '') @@ -450,12 +468,12 @@ class PDFGeneratorService: # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # OCR coordinates: origin (0,0) at top-left, Y increases downward - ocr_x_left = bbox[0][0] - ocr_y_top = bbox[0][1] - ocr_x_right = bbox[2][0] - ocr_y_bottom = bbox[2][1] + ocr_x_left = bbox[0][0] * scale_w + ocr_y_top = bbox[0][1] * scale_h + ocr_x_right = bbox[2][0] * scale_w + ocr_y_bottom = bbox[2][1] * scale_h - # Calculate bbox dimensions + # Calculate bbox dimensions (after scaling) bbox_width = abs(ocr_x_right - ocr_x_left) bbox_height = abs(ocr_y_bottom - ocr_y_top) @@ -523,11 +541,36 @@ class PDFGeneratorService: # Get layout data layout_data = ocr_data.get('layout_data', {}) + # Get OCR dimensions (the dimensions of images as processed by OCR) + ocr_dimensions = ocr_data.get('ocr_dimensions') + # Determine page dimensions page_size = self.calculate_page_dimensions(text_regions, source_file_path) page_width, page_height = page_size + # Calculate scale factors if OCR dimensions are available + # Default to 1.0 if no OCR dimensions (backward compatibility) + scale_w = 1.0 + scale_h = 1.0 + + if ocr_dimensions: + # For single image + if isinstance(ocr_dimensions, dict): + ocr_width = ocr_dimensions.get('width', page_width) + ocr_height = ocr_dimensions.get('height', page_height) + scale_w = page_width / ocr_width + scale_h = page_height / ocr_height + logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})") + # For multi-page PDF - we'll handle per-page scaling below + elif isinstance(ocr_dimensions, list) and ocr_dimensions: + # Use first page dimensions as default + ocr_width = ocr_dimensions[0].get('width', page_width) + ocr_height = ocr_dimensions[0].get('height', page_height) + scale_w = page_width / ocr_width + scale_h = page_height / ocr_height + logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}") + # Create PDF canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) @@ -585,15 +628,29 @@ class PDFGeneratorService: if page_num > 1: pdf_canvas.showPage() # Start new page + # Get scale factors for this page (for multi-page PDFs) + page_scale_w = scale_w + page_scale_h = scale_h + if isinstance(ocr_dimensions, list) and ocr_dimensions: + # Find dimensions for this specific page + for dim_info in ocr_dimensions: + if dim_info.get('page') == page_num: + ocr_width = dim_info.get('width', page_width) + ocr_height = dim_info.get('height', page_height) + page_scale_w = page_width / ocr_width + page_scale_h = page_height / ocr_height + logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}") + break + # Draw text regions for this page (excluding table text) page_regions = pages_data.get(page_num, []) for region in page_regions: - self.draw_text_region(pdf_canvas, region, page_height) + self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h) # Draw tables for this page for table_elem in table_elements: if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed - self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height) + self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h) # Draw non-table images for this page (figure, chart, seal, etc.) for img_meta in images_metadata: @@ -605,7 +662,9 @@ class PDFGeneratorService: pdf_canvas, img_meta, page_height, - json_path.parent + json_path.parent, + page_scale_w, + page_scale_h ) # Save PDF diff --git a/backend/test_pdf_scaling.py b/backend/test_pdf_scaling.py new file mode 100644 index 0000000..7424097 --- /dev/null +++ b/backend/test_pdf_scaling.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +""" +Test script for PDF generation with proper scaling +""" + +import json +from pathlib import Path +from app.services.pdf_generator_service import pdf_generator_service + +def test_pdf_generation(): + """Test PDF generation with mock data that includes OCR dimensions""" + + # Create a test directory + test_dir = Path("test_output") + test_dir.mkdir(exist_ok=True) + + # Create mock OCR JSON data with OCR dimensions + mock_ocr_data = { + "status": "success", + "file_name": "test_image.jpg", + "language": "ch", + "ocr_dimensions": { + "width": 500, # OCR processed at 500px wide + "height": 700 # OCR processed at 700px tall + }, + "text_regions": [ + { + "text": "測試文字 Test Text", + "bbox": [[50, 100], [250, 100], [250, 150], [50, 150]], + "confidence": 0.95 + }, + { + "text": "第二行文字 Second line", + "bbox": [[50, 200], [300, 200], [300, 250], [50, 250]], + "confidence": 0.92 + } + ], + "total_text_regions": 2, + "average_confidence": 0.935, + "layout_data": None, + "images_metadata": [], + "markdown_content": "# Test Document\n\n測試文字 Test Text\n\n第二行文字 Second line", + "processing_time": 1.5, + "timestamp": "2025-11-17T00:00:00" + } + + # Save mock JSON + json_path = test_dir / "test_ocr_result.json" + with open(json_path, "w", encoding="utf-8") as f: + json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2) + + print(f"Created test JSON at: {json_path}") + + # Test PDF generation + pdf_path = test_dir / "test_output.pdf" + + # Create a dummy source file for dimensions (1000x1400 target PDF size) + from PIL import Image + source_image = test_dir / "test_source.jpg" + img = Image.new('RGB', (1000, 1400), color='white') + img.save(source_image) + print(f"Created test source image: {source_image} (1000x1400)") + + # Generate PDF + print("\nGenerating PDF with scaling...") + + # Set up logging to see scale factors + import logging + logging.basicConfig(level=logging.INFO, format='%(message)s') + + success = pdf_generator_service.generate_layout_pdf( + json_path=json_path, + output_path=pdf_path, + source_file_path=source_image + ) + + if success: + print(f"✓ PDF generated successfully: {pdf_path}") + print(f" Expected scale factors: X={1000/500:.2f}, Y={1400/700:.2f}") + print(" Text should now be properly scaled and positioned!") + else: + print("✗ PDF generation failed") + + return success + +if __name__ == "__main__": + import sys + sys.path.insert(0, str(Path(__file__).parent)) + + print("Testing PDF generation with proper scaling...") + print("=" * 60) + + success = test_pdf_generation() + + print("\n" + "=" * 60) + if success: + print("✓ Test completed successfully!") + print("Check test_output/test_output.pdf to verify scaling") + else: + print("✗ Test failed") \ No newline at end of file