Problem:
- OCR processes images at smaller resolutions, but the coordinates were being used directly on larger PDF canvases
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner

Solution:
- Track OCR image dimensions in the JSON output (ocr_dimensions)
- Calculate proper scale factors: scale_w = pdf_width/ocr_width, scale_h = pdf_height/ocr_height
- Apply scaling to all coordinates before drawing on the PDF canvas
- Support per-page scaling for multi-page PDFs

Changes:
1. ocr_service.py:
   - Capture OCR image dimensions using PIL
   - Include ocr_dimensions in the JSON output for both single images and PDFs
2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs. target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before the coordinate transformation
3. test_pdf_scaling.py:
   - Add a test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to a PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
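A minimal sketch of the coordinate scaling described above, using the values from the test scenario (500x700 OCR image onto a 1000x1400 PDF canvas). The variable names and the sample bbox are illustrative only, not the service's API:

```python
# Illustrative sketch of the scaling fix, not the service code itself.
# OCR ran at 500x700 px; the target PDF canvas is 1000x1400 pt (the 2x test case).
ocr_w, ocr_h = 500, 700
pdf_w, pdf_h = 1000, 1400

scale_w = pdf_w / ocr_w   # 2.0
scale_h = pdf_h / ocr_h   # 2.0

# A PaddleOCR-style bbox: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], top-left origin.
bbox = [[100, 50], [300, 50], [300, 90], [100, 90]]

# Scale into PDF space first, then flip the Y axis (PDF origin is bottom-left).
x_left = bbox[0][0] * scale_w                  # 200.0
y_bottom_pdf = pdf_h - bbox[2][1] * scale_h    # 1400 - 180 = 1220.0
print(x_left, y_bottom_pdf)
```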
686 lines
26 KiB
Python
"""
|
|
Layout-Preserving PDF Generation Service
|
|
Generates PDF files that preserve the original document layout using OCR JSON data
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
from datetime import datetime
|
|
|
|
from reportlab.lib.pagesizes import A4, letter
|
|
from reportlab.lib.units import mm
|
|
from reportlab.pdfgen import canvas
|
|
from reportlab.pdfbase import pdfmetrics
|
|
from reportlab.pdfbase.ttfonts import TTFont
|
|
from reportlab.platypus import Table, TableStyle
|
|
from reportlab.lib import colors
|
|
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
|
from reportlab.platypus import Paragraph
|
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
|
from PIL import Image
|
|
from html.parser import HTMLParser
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HTMLTableParser(HTMLParser):
    """Parse HTML table to extract structure and data"""

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_table = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)

        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}

        elif tag == 'tr' and self.in_table:
            self.current_row = {'cells': []}

        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            colspan = int(attrs_dict.get('colspan', 1))
            rowspan = int(attrs_dict.get('rowspan', 1))
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': colspan,
                'rowspan': rowspan
            }

    def handle_endtag(self, tag):
        if tag == 'table' and self.in_table:
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False

        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def handle_data(self, data):
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '


class PDFGeneratorService:
    """Service for generating layout-preserving PDFs from OCR JSON data"""

    def __init__(self):
        """Initialize PDF generator with font configuration"""
        self.font_name = 'NotoSansSC'
        self.font_path = None
        self.font_registered = False

        self._register_chinese_font()

    def _register_chinese_font(self):
        """Register Chinese font for PDF generation"""
        try:
            # Get font path from settings
            font_path = Path(settings.chinese_font_path)

            # Try relative path from project root
            if not font_path.is_absolute():
                # Adjust path - settings.chinese_font_path starts with ./backend/
                project_root = Path(__file__).resolve().parent.parent.parent.parent
                font_path = project_root / font_path

            if not font_path.exists():
                logger.error(f"Chinese font not found at {font_path}")
                return

            # Register font
            pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
            self.font_path = font_path
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {font_path}")

        except Exception as e:
            logger.error(f"Failed to register Chinese font: {e}")
            self.font_registered = False

    def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
        """
        Load and parse OCR JSON result file

        Args:
            json_path: Path to JSON file

        Returns:
            Parsed JSON data or None if failed
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            logger.info(f"Loaded OCR JSON: {json_path.name}")
            return data

        except Exception as e:
            logger.error(f"Failed to load JSON {json_path}: {e}")
            return None

    def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
        """
        Calculate page dimensions from source file or text region bounding boxes

        Args:
            text_regions: List of text regions with bbox coordinates
            source_file_path: Optional path to source file for accurate dimensions

        Returns:
            Tuple of (width, height) in points
        """
        # First try to get dimensions from source file
        if source_file_path:
            dims = self.get_original_page_size(source_file_path)
            if dims:
                return dims

        if not text_regions:
            return A4  # Default to A4 size

        max_x = 0
        max_y = 0

        for region in text_regions:
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                continue

            # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            for point in bbox:
                if isinstance(point, (list, tuple)) and len(point) >= 2:
                    x, y = point[0], point[1]
                    max_x = max(max_x, x)
                    max_y = max(max_y, y)

        # OCR coordinates are in pixels, use them directly as points (1:1 mapping)
        # Do NOT add padding - this causes layout issues
        width = max_x if max_x > 0 else A4[0]
        height = max_y if max_y > 0 else A4[1]

        logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
        return (width, height)

    def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
        """
        Extract page dimensions from original source file

        Args:
            file_path: Path to original file (image or PDF)

        Returns:
            Tuple of (width, height) in points or None
        """
        try:
            if not file_path.exists():
                return None

            # For images, get dimensions from PIL
            if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
                img = Image.open(file_path)
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
                return (width_pt, height_pt)

            # For PDFs, would need PyPDF2 or similar
            # For now, return None to use calculated dimensions

        except Exception as e:
            logger.warning(f"Failed to get page size from {file_path}: {e}")

        return None

    def draw_text_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw a text region at precise coordinates

        Args:
            pdf_canvas: ReportLab canvas object
            region: Text region dict with text, bbox, confidence
            page_height: Height of page (for coordinate transformation)
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        text = region.get('text', '')
        bbox = region.get('bbox', [])
        confidence = region.get('confidence', 1.0)

        if not text or not bbox or len(bbox) < 4:
            return

        try:
            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            # Points: top-left, top-right, bottom-right, bottom-left
            # OCR coordinates: origin (0,0) at top-left, Y increases downward
            ocr_x_left = bbox[0][0]  # Left X
            ocr_y_top = bbox[0][1]  # Top Y in OCR coordinates
            ocr_x_right = bbox[2][0]  # Right X
            ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates

            # Apply scale factors to convert from OCR space to PDF space
            ocr_x_left = ocr_x_left * scale_w
            ocr_y_top = ocr_y_top * scale_h
            ocr_x_right = ocr_x_right * scale_w
            ocr_y_bottom = ocr_y_bottom * scale_h

            # Calculate bbox dimensions (after scaling)
            bbox_width = abs(ocr_x_right - ocr_x_left)
            bbox_height = abs(ocr_y_bottom - ocr_y_top)

            # Calculate font size using heuristics
            # Font size is typically 70-90% of bbox height
            # Testing shows 0.75 works well for most cases
            font_size = bbox_height * 0.75
            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt

            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
            pdf_x = ocr_x_left
            pdf_y = page_height - ocr_y_bottom  # Flip Y-axis using bottom coordinate

            # Set font
            font_name = self.font_name if self.font_registered else 'Helvetica'
            pdf_canvas.setFont(font_name, font_size)

            # Calculate text width to prevent overflow
            text_width = pdf_canvas.stringWidth(text, font_name, font_size)

            # If text is too wide for bbox, scale down font
            if text_width > bbox_width:
                scale_factor = bbox_width / text_width
                font_size = font_size * scale_factor * 0.95  # 95% to add small margin
                font_size = max(font_size, 3)  # Minimum 3pt
                pdf_canvas.setFont(font_name, font_size)

            # Draw text at calculated position
            pdf_canvas.drawString(pdf_x, pdf_y, text)

            # Debug: Draw bounding box (optional)
            if settings.pdf_enable_bbox_debug:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
                # Transform all bbox points to PDF coordinates (apply scaling first)
                pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
                # Draw bbox rectangle
                for i in range(4):
                    x1, y1 = pdf_points[i]
                    x2, y2 = pdf_points[(i + 1) % 4]
                    pdf_canvas.line(x1, y1, x2, y2)

        except Exception as e:
            logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")

    def draw_table_region(
        self,
        pdf_canvas: canvas.Canvas,
        table_element: Dict,
        images_metadata: List[Dict],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw a table region by parsing HTML and rebuilding with ReportLab Table

        Args:
            pdf_canvas: ReportLab canvas object
            table_element: Table element dict with HTML content
            images_metadata: List of image metadata to find table bbox
            page_height: Height of page
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        try:
            html_content = table_element.get('content', '')
            if not html_content:
                return

            # Parse HTML to extract table structure
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return

            # Get the first table (PP-StructureV3 usually provides one table per element)
            table_data = parser.tables[0]
            rows = table_data['rows']

            if not rows:
                return

            # Find corresponding table image to get bbox
            table_bbox = None
            for img_meta in images_metadata:
                img_path = img_meta.get('image_path', '')
                if 'table' in img_path.lower():
                    bbox = img_meta.get('bbox', [])
                    if bbox and len(bbox) >= 4:
                        table_bbox = bbox
                        break

            if not table_bbox:
                logger.warning("No bbox found for table")
                return

            # Extract bbox coordinates and apply scaling
            ocr_x_left = table_bbox[0][0] * scale_w
            ocr_y_top = table_bbox[0][1] * scale_h
            ocr_x_right = table_bbox[2][0] * scale_w
            ocr_y_bottom = table_bbox[2][1] * scale_h

            table_width = abs(ocr_x_right - ocr_x_left)
            table_height = abs(ocr_y_bottom - ocr_y_top)

            # Transform coordinates
            pdf_x = ocr_x_left
            pdf_y = page_height - ocr_y_bottom

            # Build table data for ReportLab
            # Convert parsed structure to simple 2D array
            max_cols = max(len(row['cells']) for row in rows)
            reportlab_data = []

            for row in rows:
                row_data = []
                for cell in row['cells']:
                    text = cell['text'].strip()
                    row_data.append(text)
                # Pad row if needed
                while len(row_data) < max_cols:
                    row_data.append('')
                reportlab_data.append(row_data)

            # Calculate column widths (equal distribution)
            col_widths = [table_width / max_cols] * max_cols

            # Create ReportLab Table
            # Use smaller font size to fit in bbox
            font_size = min(table_height / len(rows) * 0.5, 10)
            font_size = max(font_size, 6)

            # Create table with font
            table = Table(reportlab_data, colWidths=col_widths)

            # Apply table style
            style = TableStyle([
                ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('LEFTPADDING', (0, 0), (-1, -1), 2),
                ('RIGHTPADDING', (0, 0), (-1, -1), 2),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ])

            # Add header style if first row has headers
            if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
                style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
                style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)

            table.setStyle(style)

            # Calculate table size
            table.wrapOn(pdf_canvas, table_width, table_height)

            # Draw table at position
            table.drawOn(pdf_canvas, pdf_x, pdf_y)

            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")

        except Exception as e:
            logger.warning(f"Failed to draw table region: {e}")
            import traceback
            traceback.print_exc()

    def draw_image_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        result_dir: Path,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw an image region by embedding the extracted image

        Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.)

        Args:
            pdf_canvas: ReportLab canvas object
            region: Image metadata dict with image_path and bbox
            page_height: Height of page (for coordinate transformation)
            result_dir: Directory containing result files
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        try:
            image_path_str = region.get('image_path', '')
            if not image_path_str:
                return

            # Construct full path to image
            image_path = result_dir / image_path_str

            if not image_path.exists():
                logger.warning(f"Image not found: {image_path}")
                return

            # Get bbox for positioning
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                # If no bbox, skip for now
                logger.warning(f"No bbox for image {image_path_str}")
                return

            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            # OCR coordinates: origin (0,0) at top-left, Y increases downward
            ocr_x_left = bbox[0][0] * scale_w
            ocr_y_top = bbox[0][1] * scale_h
            ocr_x_right = bbox[2][0] * scale_w
            ocr_y_bottom = bbox[2][1] * scale_h

            # Calculate bbox dimensions (after scaling)
            bbox_width = abs(ocr_x_right - ocr_x_left)
            bbox_height = abs(ocr_y_bottom - ocr_y_top)

            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
            # For images, we position at bottom-left corner
            pdf_x_left = ocr_x_left
            pdf_y_bottom = page_height - ocr_y_bottom  # Flip Y-axis

            # Draw image using ReportLab
            # drawImage expects: (path, x, y, width, height)
            # where (x, y) is the bottom-left corner of the image
            pdf_canvas.drawImage(
                str(image_path),
                pdf_x_left,
                pdf_y_bottom,
                width=bbox_width,
                height=bbox_height,
                preserveAspectRatio=True,
                mask='auto'  # Handle transparency
            )

            logger.info(f"Drew image: {image_path_str} at ({pdf_x_left:.0f}, {pdf_y_bottom:.0f}) size {bbox_width:.0f}x{bbox_height:.0f}")

        except Exception as e:
            logger.warning(f"Failed to draw image region: {e}")

    def generate_layout_pdf(
        self,
        json_path: Path,
        output_path: Path,
        source_file_path: Optional[Path] = None
    ) -> bool:
        """
        Generate layout-preserving PDF from OCR JSON data

        Args:
            json_path: Path to OCR JSON file
            output_path: Path to save generated PDF
            source_file_path: Optional path to original source file for dimension extraction

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if PDF already exists (caching)
            if output_path.exists():
                logger.info(f"PDF already exists: {output_path.name}")
                return True

            # Load JSON data
            ocr_data = self.load_ocr_json(json_path)
            if not ocr_data:
                return False

            # Get text regions
            text_regions = ocr_data.get('text_regions', [])
            if not text_regions:
                logger.warning("No text regions found in JSON")
                return False

            # Get images metadata
            images_metadata = ocr_data.get('images_metadata', [])

            # Get layout data
            layout_data = ocr_data.get('layout_data', {})

            # Get OCR dimensions (the dimensions of images as processed by OCR)
            ocr_dimensions = ocr_data.get('ocr_dimensions')

            # Determine page dimensions
            page_size = self.calculate_page_dimensions(text_regions, source_file_path)

            page_width, page_height = page_size

            # Calculate scale factors if OCR dimensions are available
            # Default to 1.0 if no OCR dimensions (backward compatibility)
            scale_w = 1.0
            scale_h = 1.0

            if ocr_dimensions:
                # For single image
                if isinstance(ocr_dimensions, dict):
                    ocr_width = ocr_dimensions.get('width', page_width)
                    ocr_height = ocr_dimensions.get('height', page_height)
                    scale_w = page_width / ocr_width
                    scale_h = page_height / ocr_height
                    logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})")
                # For multi-page PDF - we'll handle per-page scaling below
                elif isinstance(ocr_dimensions, list) and ocr_dimensions:
                    # Use first page dimensions as default
                    ocr_width = ocr_dimensions[0].get('width', page_width)
                    ocr_height = ocr_dimensions[0].get('height', page_height)
                    scale_w = page_width / ocr_width
                    scale_h = page_height / ocr_height
                    logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}")

            # Create PDF canvas
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

            # Extract table bboxes to exclude text in those regions
            table_bboxes = []
            for img_meta in images_metadata:
                img_path = img_meta.get('image_path', '')
                if 'table' in img_path.lower():
                    bbox = img_meta.get('bbox', [])
                    if bbox and len(bbox) >= 4:
                        table_bboxes.append(bbox)

            # Helper function to check if a point is inside a bbox
            def point_in_bbox(x, y, bbox):
                x1, y1 = bbox[0]
                x2, y2 = bbox[2]
                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)

            # Filter text regions to exclude those inside tables
            filtered_text_regions = []
            for region in text_regions:
                bbox = region.get('bbox', [])
                if not bbox or len(bbox) < 4:
                    continue

                # Check if text region center is inside any table bbox
                center_x = (bbox[0][0] + bbox[2][0]) / 2
                center_y = (bbox[0][1] + bbox[2][1]) / 2

                is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)

                if not is_in_table:
                    filtered_text_regions.append(region)
                else:
                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")

            logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")

            # Group regions by page
            pages_data = {}
            for region in filtered_text_regions:
                page_num = region.get('page', 1)
                if page_num not in pages_data:
                    pages_data[page_num] = []
                pages_data[page_num].append(region)

            # Get table elements from layout_data
            table_elements = []
            if layout_data and layout_data.get('elements'):
                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']

            # Process each page
            total_pages = ocr_data.get('total_pages', 1)
            for page_num in range(1, total_pages + 1):
                if page_num > 1:
                    pdf_canvas.showPage()  # Start new page

                # Get scale factors for this page (for multi-page PDFs)
                page_scale_w = scale_w
                page_scale_h = scale_h
                if isinstance(ocr_dimensions, list) and ocr_dimensions:
                    # Find dimensions for this specific page
                    for dim_info in ocr_dimensions:
                        if dim_info.get('page') == page_num:
                            ocr_width = dim_info.get('width', page_width)
                            ocr_height = dim_info.get('height', page_height)
                            page_scale_w = page_width / ocr_width
                            page_scale_h = page_height / ocr_height
                            logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}")
                            break

                # Draw text regions for this page (excluding table text)
                page_regions = pages_data.get(page_num, [])
                for region in page_regions:
                    self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h)

                # Draw tables for this page
                for table_elem in table_elements:
                    if table_elem.get('page', 0) == page_num - 1:  # page is 0-indexed
                        self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h)

                # Draw non-table images for this page (figure, chart, seal, etc.)
                for img_meta in images_metadata:
                    if img_meta.get('page') == page_num - 1:  # page is 0-indexed
                        img_path = img_meta.get('image_path', '')
                        # Skip table images (they're now rendered as tables)
                        if 'table' not in img_path.lower():
                            self.draw_image_region(
                                pdf_canvas,
                                img_meta,
                                page_height,
                                json_path.parent,
                                page_scale_w,
                                page_scale_h
                            )

            # Save PDF
            pdf_canvas.save()

            file_size = output_path.stat().st_size
            logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to generate PDF: {e}")
            import traceback
            traceback.print_exc()
            return False


# Singleton instance
pdf_generator_service = PDFGeneratorService()
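For reference, a hypothetical invocation of the singleton (the import path and file paths below are placeholders, not taken from the repo):

```python
from pathlib import Path

# Assumed module location; adjust to wherever pdf_generator_service.py lives in the app package.
from app.services.pdf_generator_service import pdf_generator_service

ok = pdf_generator_service.generate_layout_pdf(
    json_path=Path("results/sample/ocr_result.json"),   # OCR JSON produced by ocr_service.py (placeholder path)
    output_path=Path("results/sample/layout.pdf"),       # PDF to generate (placeholder path)
    source_file_path=Path("uploads/sample.png"),         # optional; used for accurate page dimensions
)
print("generated" if ok else "failed")
```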
|