feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames

2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)

3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON

4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads

2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download

3. main.tsx
   - Configure PDF.js worker via CDN

4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions

View File

@@ -96,6 +96,11 @@ class Settings(BaseSettings):
pdf_margin_left: int = Field(default=20)
pdf_margin_right: int = Field(default=20)
# ===== Layout-Preserving PDF Configuration =====
chinese_font_path: str = Field(default="./backend/fonts/NotoSansSC-Regular.ttf")
pdf_font_size_base: int = Field(default=12)
pdf_enable_bbox_debug: bool = Field(default=False) # Draw bounding boxes for debugging
# ===== Translation Configuration (Reserved) =====
enable_translation: bool = Field(default=False)
translation_engine: str = Field(default="offline")

View File

@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
# Initialize OCR service
ocr_service = OCRService()
# Create result directory before OCR processing (needed for saving extracted images)
result_dir = Path(settings.result_dir) / task_id
result_dir.mkdir(parents=True, exist_ok=True)
# Process the file with OCR
ocr_result = ocr_service.process_image(
image_path=Path(file_path),
lang='ch',
detect_layout=True
detect_layout=True,
output_dir=result_dir
)
# Calculate processing time
processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
# Create result directory
result_dir = Path(settings.result_dir) / task_id
result_dir.mkdir(parents=True, exist_ok=True)
# Save JSON result
json_path = result_dir / f"{Path(filename).stem}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(ocr_result, f, ensure_ascii=False, indent=2)
# Save Markdown result
markdown_path = result_dir / f"{Path(filename).stem}_result.md"
markdown_content = ocr_result.get('markdown_content', '')
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
# Save results using OCR service (includes JSON, Markdown, and PDF generation)
json_path, markdown_path, pdf_path = ocr_service.save_results(
result=ocr_result,
output_dir=result_dir,
file_id=Path(filename).stem,
source_file_path=Path(file_path)
)
# Update task with results (direct database update)
task.result_json_path = str(json_path)
task.result_markdown_path = str(markdown_path)
task.result_json_path = str(json_path) if json_path else None
task.result_markdown_path = str(markdown_path) if markdown_path else None
task.result_pdf_path = str(pdf_path) if pdf_path else None
task.processing_time_ms = processing_time_ms
task.status = TaskStatus.COMPLETED
task.completed_at = datetime.utcnow()
@@ -468,10 +467,16 @@ async def download_pdf(
current_user: User = Depends(get_current_user)
):
"""
Download task result as searchable PDF file
Download task result as layout-preserving PDF file
- **task_id**: Task UUID
Returns a PDF that preserves the original document layout using OCR results.
The PDF is generated from OCR JSON data and cached for subsequent requests.
"""
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
# Get task
task = task_service.get_task_by_id(
db=db,
@@ -485,12 +490,69 @@ async def download_pdf(
detail="Task not found"
)
# Check if task is completed
if task.status.value != "completed":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Task is not completed yet. Please wait for OCR processing to finish."
)
# Check if PDF path is stored in database
if task.result_pdf_path and Path(task.result_pdf_path).exists():
pdf_path = Path(task.result_pdf_path)
logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
else:
# Fallback: Try to generate PDF on-demand
result_dir = Path(settings.result_dir) / task_id
# Use stored JSON path or construct it
if task.result_json_path and Path(task.result_json_path).exists():
json_path = Path(task.result_json_path)
else:
# Try to find JSON file in result directory
json_files = list(result_dir.glob("*_result.json"))
if not json_files:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="OCR result JSON not found"
)
json_path = json_files[0]
# Construct PDF path based on JSON filename
pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
pdf_path = result_dir / pdf_filename
# Generate PDF if it doesn't exist
if not pdf_path.exists():
logger.info(f"Generating layout-preserving PDF for task {task_id}")
# Get source file path if available
source_file = None
task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
source_file = Path(task_file.stored_path)
# Generate PDF
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file
)
if not success:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to generate PDF. Please check server logs."
)
logger.info(f"PDF generated successfully: {pdf_path.name}")
# Validate file access
is_valid, error_msg = file_access_service.validate_file_access(
db=db,
user_id=current_user.id,
task_id=task_id,
file_path=task.result_pdf_path
file_path=str(pdf_path)
)
if not is_valid:
@@ -502,7 +564,7 @@ async def download_pdf(
# Return file
filename = f"{task.filename or task_id}_result.pdf"
return FileResponse(
path=task.result_pdf_path,
path=str(pdf_path),
filename=filename,
media_type="application/pdf"
)

View File

@@ -284,7 +284,8 @@ class OCRService:
image_path: Path,
lang: str = 'ch',
detect_layout: bool = True,
confidence_threshold: Optional[float] = None
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -340,7 +341,8 @@ class OCRService:
page_image_path,
lang=lang,
detect_layout=detect_layout,
confidence_threshold=confidence_threshold
confidence_threshold=confidence_threshold,
output_dir=output_dir
)
# Accumulate results
@@ -458,7 +460,7 @@ class OCRService:
images_metadata = []
if detect_layout:
layout_data, images_metadata = self.analyze_layout(image_path)
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)
# Generate Markdown
markdown_content = self.generate_markdown(text_regions, layout_data)
@@ -500,12 +502,71 @@ class OCRService:
'processing_time': (datetime.now() - start_time).total_seconds(),
}
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
def _extract_table_text(self, html_content: str) -> str:
    """
    Extract text from HTML table content for translation purposes

    Cells of one row are joined with ' | ' and rows are separated by a
    newline. Empty cells are dropped, so a row never starts or ends with a
    dangling separator. Text fragments inside one cell (e.g. around inline
    tags) are joined with a single space. Text outside any <table> element
    is ignored.

    Args:
        html_content: HTML content containing table

    Returns:
        Extracted text from table cells ('' when there is nothing to extract)
    """
    import re
    from html.parser import HTMLParser

    if not html_content:
        return ''

    class TableTextExtractor(HTMLParser):
        """Collect cell text row by row while inside a <table>."""

        def __init__(self):
            super().__init__()
            self.rows = []   # finished rows, already joined with ' | '
            self.row = []    # non-empty cells of the row being read
            self.cell = []   # text fragments of the cell being read
            self.depth = 0   # <table> nesting depth (0 = outside any table)

        def flush_cell(self):
            cell_text = ' '.join(self.cell)
            self.cell = []
            if cell_text:
                self.row.append(cell_text)

        def flush_row(self):
            self.flush_cell()
            if self.row:
                self.rows.append(' | '.join(self.row))
            self.row = []

        def handle_starttag(self, tag, attrs):
            if tag == 'table':
                self.depth += 1
            elif self.depth and tag in ('td', 'th'):
                # Also closes a previous cell whose end tag was omitted
                self.flush_cell()
            elif self.depth and tag == 'tr':
                self.flush_row()

        def handle_endtag(self, tag):
            if tag == 'table':
                if self.depth:
                    self.depth -= 1
                    if self.depth == 0:
                        self.flush_row()
            elif self.depth and tag in ('td', 'th'):
                self.flush_cell()
            elif self.depth and tag == 'tr':
                self.flush_row()

        def handle_data(self, data):
            if self.depth:
                stripped = data.strip()
                if stripped:
                    self.cell.append(stripped)

    try:
        parser = TableTextExtractor()
        parser.feed(html_content)
        parser.close()
        # Pick up a trailing row when the markup was truncated
        parser.flush_row()
        return '\n'.join(parser.rows)
    except Exception as e:
        logger.warning(f"Failed to extract table text: {e}")
        # Fallback: just remove HTML tags
        text = re.sub(r'<[^>]+>', ' ', html_content)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3
Args:
image_path: Path to image file
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
Returns:
Tuple of (layout_data, images_metadata)
@@ -548,16 +609,59 @@ class OCRService:
'page': page_idx,
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
}
# Extract text from table for translation purposes
if has_table:
table_text = self._extract_table_text(markdown_texts)
element['extracted_text'] = table_text
logger.info(f"Extracted {len(table_text)} characters from table")
layout_elements.append(element)
# Add image metadata
# Add image metadata and SAVE images to disk
for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
# Save image to disk
try:
# Determine base directory for saving images
base_dir = output_dir if output_dir else image_path.parent
# Create full path for image file
full_img_path = base_dir / img_path
# Create imgs/ subdirectory if it doesn't exist
full_img_path.parent.mkdir(parents=True, exist_ok=True)
# Save image object to disk
if hasattr(img_obj, 'save'):
# img_obj is PIL Image
img_obj.save(str(full_img_path))
logger.info(f"Saved extracted image to {full_img_path}")
else:
logger.warning(f"Image object for {img_path} does not have save() method, skipping")
except Exception as e:
logger.warning(f"Failed to save image {img_path}: {str(e)}")
# Continue processing even if image save fails
# Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
bbox = []
try:
import re
match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
if match:
x1, y1, x2, y2 = map(int, match.groups())
# Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
logger.info(f"Extracted bbox from filename: {bbox}")
except Exception as e:
logger.warning(f"Failed to extract bbox from {img_path}: {e}")
images_metadata.append({
'element_id': len(layout_elements) + img_idx,
'image_path': img_path,
'type': 'image',
'page': page_idx,
'bbox': [],
'bbox': bbox,
})
if layout_elements:
@@ -638,18 +742,20 @@ class OCRService:
self,
result: Dict,
output_dir: Path,
file_id: str
) -> Tuple[Optional[Path], Optional[Path]]:
file_id: str,
source_file_path: Optional[Path] = None
) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
"""
Save OCR results to JSON and Markdown files
Save OCR results to JSON, Markdown, and layout-preserving PDF files
Args:
result: OCR result dictionary
output_dir: Output directory
file_id: Unique file identifier
source_file_path: Optional path to original source file for PDF generation
Returns:
Tuple of (json_path, markdown_path)
Tuple of (json_path, markdown_path, pdf_path)
"""
try:
output_dir.mkdir(parents=True, exist_ok=True)
@@ -666,8 +772,37 @@ class OCRService:
f.write(markdown_content)
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
return json_path, markdown_path
# Generate layout-preserving PDF
pdf_path = None
try:
from app.services.pdf_generator_service import pdf_generator_service
pdf_filename = f"{file_id}_layout.pdf"
pdf_path = output_dir / pdf_filename
logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file_path
)
if success:
logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
else:
logger.warning(f"✗ PDF generation failed for {file_id}")
pdf_path = None
except Exception as e:
logger.error(f"Error generating PDF for {file_id}: {str(e)}")
import traceback
traceback.print_exc()
pdf_path = None
return json_path, markdown_path, pdf_path
except Exception as e:
logger.error(f"Error saving results: {str(e)}")
return None, None
return None, None, None

View File

@@ -0,0 +1,626 @@
"""
Layout-Preserving PDF Generation Service
Generates PDF files that preserve the original document layout using OCR JSON data
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from reportlab.lib.pagesizes import A4, letter
from reportlab.lib.units import mm
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.platypus import Paragraph
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from PIL import Image
from html.parser import HTMLParser
from app.core.config import settings
logger = logging.getLogger(__name__)
class HTMLTableParser(HTMLParser):
    """Parse HTML table to extract structure and data

    After ``feed()``, ``tables`` holds one dict per non-empty <table>:
    ``{'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}``.
    Cell text is accumulated with a trailing space after every data chunk,
    so consumers should ``strip()`` it.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_table = False

    @staticmethod
    def _parse_span(raw_value) -> int:
        """Return a colspan/rowspan attribute as int, or 1 when missing or malformed."""
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            # Valueless attribute (None) or non-numeric text such as "abc":
            # one bad attribute must not abort parsing of the whole table.
            return 1

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}
        elif tag == 'tr' and self.in_table:
            self.current_row = {'cells': []}
        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan')),
                'rowspan': self._parse_span(attrs_dict.get('rowspan')),
            }

    def handle_endtag(self, tag):
        if tag == 'table' and self.in_table:
            # Only keep tables that actually produced rows
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False
        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None
        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def handle_data(self, data):
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '
class PDFGeneratorService:
    """Builds layout-preserving PDF files out of OCR JSON results"""

    def __init__(self):
        """Set the font defaults, then try to register the Chinese font"""
        # Nothing is registered until _register_chinese_font() succeeds
        self.font_path, self.font_registered = None, False
        self.font_name = 'NotoSansSC'
        self._register_chinese_font()
def _register_chinese_font(self):
    """
    Register the configured Chinese TrueType font with ReportLab

    On success ``font_path`` and ``font_registered`` are updated; on any
    failure the problem is logged and the font stays unregistered.
    """
    try:
        candidate = Path(settings.chinese_font_path)

        if not candidate.is_absolute():
            # Relative paths are resolved against the directory four levels
            # above this file (settings.chinese_font_path starts with ./backend/)
            base_dir = Path(__file__).resolve()
            for _ in range(4):
                base_dir = base_dir.parent
            candidate = base_dir / candidate

        if candidate.exists():
            pdfmetrics.registerFont(TTFont(self.font_name, str(candidate)))
            self.font_path = candidate
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {candidate}")
        else:
            logger.error(f"Chinese font not found at {candidate}")
    except Exception as exc:
        logger.error(f"Failed to register Chinese font: {exc}")
        self.font_registered = False
def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
    """
    Read an OCR result file and decode it as JSON

    Args:
        json_path: Location of the UTF-8 encoded JSON file

    Returns:
        The decoded JSON content, or None when reading or decoding fails
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        logger.info(f"Loaded OCR JSON: {json_path.name}")
    except Exception as exc:
        # Any failure (missing file, bad encoding, invalid JSON) is logged
        logger.error(f"Failed to load JSON {json_path}: {exc}")
        return None
    return parsed
def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
    """
    Work out the page size from the source file or, failing that, from OCR boxes

    Args:
        text_regions: Text regions, each optionally carrying a 'bbox' of points
        source_file_path: Optional original file; its size wins when available

    Returns:
        Tuple of (width, height) in points; A4 when nothing can be derived
    """
    # Preferred source: the dimensions of the original file
    if source_file_path:
        source_dims = self.get_original_page_size(source_file_path)
        if source_dims:
            return source_dims

    if not text_regions:
        return A4  # Default to A4 size

    # Seed with 0 so the extents never go below zero
    xs = [0]
    ys = [0]
    for region in text_regions:
        quad = region.get('bbox', [])
        if quad and len(quad) >= 4:
            # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            for corner in quad:
                if isinstance(corner, (list, tuple)) and len(corner) >= 2:
                    xs.append(corner[0])
                    ys.append(corner[1])

    far_x = max(xs)
    far_y = max(ys)

    # Pixels are used as points 1:1, deliberately without padding
    width = far_x if far_x > 0 else A4[0]
    height = far_y if far_y > 0 else A4[1]
    logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
    return (width, height)
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
    """
    Extract page dimensions from original source file

    Only raster images (.png, .jpg, .jpeg, .bmp, .tiff) are supported; their
    pixel size is used directly as points (1:1 mapping). Any other file type,
    a missing file, or a read error yields None so the caller can fall back
    to dimensions calculated from OCR boxes.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Tuple of (width, height) in points or None
    """
    try:
        if not file_path.exists():
            return None

        # For images, get dimensions from PIL
        if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            # Context manager closes the underlying file handle; Image.open()
            # is lazy and otherwise keeps the file open until garbage collection
            with Image.open(file_path) as img:
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return (width_pt, height_pt)

        # For PDFs, would need PyPDF2 or similar
        # For now, return None to use calculated dimensions
    except Exception as e:
        logger.warning(f"Failed to get page size from {file_path}: {e}")
    return None
def draw_text_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float
):
    """
    Draw a text region at precise coordinates

    The text baseline is placed on the bottom edge of the OCR bounding box.
    The font size starts at 75% of the box height (clamped to 4-72pt) and is
    shrunk further when the rendered string would be wider than the box.
    Failures are logged and swallowed so one bad region does not abort the page.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Text region dict with text and bbox
        page_height: Height of page (for coordinate transformation)
    """
    text = region.get('text', '')
    bbox = region.get('bbox', [])
    if not text or not bbox or len(bbox) < 4:
        return

    try:
        # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        # Points: top-left, top-right, bottom-right, bottom-left
        # OCR coordinates: origin (0,0) at top-left, Y increases downward
        ocr_x_left = bbox[0][0]
        ocr_y_top = bbox[0][1]
        ocr_x_right = bbox[2][0]
        ocr_y_bottom = bbox[2][1]

        bbox_width = abs(ocr_x_right - ocr_x_left)
        bbox_height = abs(ocr_y_bottom - ocr_y_top)

        # Font size heuristic: 75% of bbox height, clamped between 4pt and 72pt
        font_size = bbox_height * 0.75
        font_size = max(min(font_size, 72), 4)

        # Transform coordinates: OCR (top-left origin) -> PDF (bottom-left origin)
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom  # Flip Y-axis using bottom coordinate

        font_name = self.font_name if self.font_registered else 'Helvetica'
        pdf_canvas.setFont(font_name, font_size)

        # If text is too wide for bbox, scale down font
        text_width = pdf_canvas.stringWidth(text, font_name, font_size)
        if text_width > bbox_width:
            scale_factor = bbox_width / text_width
            font_size = font_size * scale_factor * 0.95  # 95% to add small margin
            font_size = max(font_size, 3)  # Minimum 3pt
            pdf_canvas.setFont(font_name, font_size)

        pdf_canvas.drawString(pdf_x, pdf_y, text)

        # Debug: Draw bounding box (optional)
        if settings.pdf_enable_bbox_debug:
            # Isolate the debug stroke colour / line width so they do not
            # leak into whatever is drawn on the canvas afterwards
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
                # Transform all bbox points to PDF coordinates
                corners = [(p[0], page_height - p[1]) for p in bbox][:4]
                for idx, (x1, y1) in enumerate(corners):
                    x2, y2 = corners[(idx + 1) % 4]
                    pdf_canvas.line(x1, y1, x2, y2)
            finally:
                pdf_canvas.restoreState()
    except Exception as e:
        logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
def draw_table_region(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    images_metadata: List[Dict],
    page_height: float
):
    """
    Draw a table region by parsing HTML and rebuilding with ReportLab Table

    The table position comes from the bbox of a table image in
    ``images_metadata``. An image on the same page as ``table_element`` is
    preferred; when none matches, the first table image with a bbox is used.
    colspan/rowspan information is parsed but not rendered: every cell
    occupies exactly one grid slot. Failures are logged and swallowed.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict with HTML content
        images_metadata: List of image metadata to find table bbox
        page_height: Height of page
    """
    try:
        html_content = table_element.get('content', '')
        if not html_content:
            return

        # Parse HTML to extract table structure
        parser = HTMLTableParser()
        parser.feed(html_content)
        if not parser.tables:
            logger.warning("No tables found in HTML content")
            return

        # Get the first table (PP-StructureV3 usually provides one table per element)
        rows = parser.tables[0]['rows']
        if not rows:
            return

        # Find corresponding table image to get bbox
        element_page = table_element.get('page')
        table_bbox = None
        for img_meta in images_metadata:
            if 'table' not in img_meta.get('image_path', '').lower():
                continue
            bbox = img_meta.get('bbox', [])
            if not bbox or len(bbox) < 4:
                continue
            if table_bbox is None:
                table_bbox = bbox  # fallback: first table image with a bbox
            if element_page is None or img_meta.get('page') == element_page:
                table_bbox = bbox
                break
        if not table_bbox:
            logger.warning("No bbox found for table")
            return

        # Extract bbox coordinates
        ocr_x_left = table_bbox[0][0]
        ocr_y_top = table_bbox[0][1]
        ocr_x_right = table_bbox[2][0]
        ocr_y_bottom = table_bbox[2][1]
        table_width = abs(ocr_x_right - ocr_x_left)
        table_height = abs(ocr_y_bottom - ocr_y_top)

        # Transform coordinates: OCR (top-left origin) -> PDF (bottom-left origin)
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom

        # Convert parsed structure to a rectangular 2D array for ReportLab
        max_cols = max(len(row['cells']) for row in rows)
        if max_cols == 0:
            # Rows without any cells: nothing to draw, and the column
            # width computation below would divide by zero
            logger.warning("No tables found in HTML content")
            return
        reportlab_data = []
        for row in rows:
            row_data = [cell['text'].strip() for cell in row['cells']]
            # Pad row if needed
            row_data.extend([''] * (max_cols - len(row_data)))
            reportlab_data.append(row_data)

        # Calculate column widths (equal distribution)
        col_widths = [table_width / max_cols] * max_cols

        # Use smaller font size to fit in bbox (kept between 6pt and 10pt)
        font_size = min(table_height / len(rows) * 0.5, 10)
        font_size = max(font_size, 6)

        table = Table(reportlab_data, colWidths=col_widths)
        style = TableStyle([
            ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('LEFTPADDING', (0, 0), (-1, -1), 2),
            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ])

        # Add header style if first row has headers
        if rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
            style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
            style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)
        table.setStyle(style)

        # Let ReportLab compute the table size, then draw with the bottom-left
        # corner of the table on the bottom-left corner of the bbox
        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)
        logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
    except Exception as e:
        # exc_info routes the stack trace through logging instead of stderr
        logger.warning(f"Failed to draw table region: {e}", exc_info=True)
def draw_image_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    result_dir: Path
):
    """
    Embed an extracted image at the position given by its bounding box

    The image file is looked up relative to ``result_dir``. Regions without
    a path, without an existing file or without a usable bbox are skipped;
    any drawing error is logged and swallowed.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Image metadata dict with image_path and bbox
        page_height: Height of page (for coordinate transformation)
        result_dir: Directory containing result files
    """
    try:
        relative_name = region.get('image_path', '')
        if not relative_name:
            return

        image_file = result_dir / relative_name
        if not image_file.exists():
            logger.warning(f"Image not found: {image_file}")
            return

        quad = region.get('bbox', [])
        if not quad or len(quad) < 4:
            # Without a bbox there is no position to draw at
            logger.warning(f"No bbox for image {relative_name}")
            return

        # OCR bbox: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], origin top-left,
        # Y grows downward; corners 0 and 2 are used as the diagonal
        left = quad[0][0]
        top = quad[0][1]
        right = quad[2][0]
        bottom = quad[2][1]

        box_w = abs(right - left)
        box_h = abs(bottom - top)

        # PDF origin is bottom-left, so flip the Y axis; drawImage expects
        # the bottom-left corner of the image
        x_pdf = left
        y_pdf = page_height - bottom

        pdf_canvas.drawImage(
            str(image_file),
            x_pdf,
            y_pdf,
            width=box_w,
            height=box_h,
            preserveAspectRatio=True,
            mask='auto'  # Handle transparency
        )
        logger.info(f"Drew image: {relative_name} at ({x_pdf:.0f}, {y_pdf:.0f}) size {box_w:.0f}x{box_h:.0f}")
    except Exception as exc:
        logger.warning(f"Failed to draw image region: {exc}")
def generate_layout_pdf(
    self,
    json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate layout-preserving PDF from OCR JSON data

    An existing ``output_path`` is treated as a cache hit and returned
    untouched. Text regions whose centre lies inside a table bbox are
    skipped because the table is redrawn from its HTML. Table images are
    not embedded; all other images are. One page size is used for every page.

    Args:
        json_path: Path to OCR JSON file
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file for dimension extraction

    Returns:
        True if successful, False otherwise
    """
    try:
        # Check if PDF already exists (caching)
        if output_path.exists():
            logger.info(f"PDF already exists: {output_path.name}")
            return True

        ocr_data = self.load_ocr_json(json_path)
        if not ocr_data:
            return False

        text_regions = ocr_data.get('text_regions', [])
        if not text_regions:
            logger.warning("No text regions found in JSON")
            return False

        # "or" guards against keys that are present but null in the JSON
        images_metadata = ocr_data.get('images_metadata') or []
        layout_data = ocr_data.get('layout_data') or {}

        page_width, page_height = self.calculate_page_dimensions(text_regions, source_file_path)
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

        # Extract table bboxes to exclude text in those regions
        # NOTE(review): the filter ignores page numbers, so a table bbox on
        # one page also hides text at the same position on other pages.
        # Making it page-aware needs confirmation of how 'page' is indexed in
        # accumulated multi-page results - TODO confirm against process_image.
        table_bboxes = []
        for img_meta in images_metadata:
            if 'table' in img_meta.get('image_path', '').lower():
                bbox = img_meta.get('bbox', [])
                if bbox and len(bbox) >= 4:
                    table_bboxes.append(bbox)

        def point_in_bbox(x, y, bbox):
            """Return True when (x, y) lies inside the rectangle spanned by bbox[0] and bbox[2]."""
            x1, y1 = bbox[0]
            x2, y2 = bbox[2]
            return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)

        # Filter text regions to exclude those inside tables
        filtered_text_regions = []
        for region in text_regions:
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                continue
            center_x = (bbox[0][0] + bbox[2][0]) / 2
            center_y = (bbox[0][1] + bbox[2][1]) / 2
            if any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes):
                logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
            else:
                filtered_text_regions.append(region)
        logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")

        # Group regions by page (text region pages are 1-indexed)
        pages_data = {}
        for region in filtered_text_regions:
            pages_data.setdefault(region.get('page', 1), []).append(region)

        # Get table elements from layout_data
        table_elements = []
        if layout_data and layout_data.get('elements'):
            table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']

        # Render at least as many pages as the text regions refer to, so
        # regions are not silently dropped when 'total_pages' is missing or
        # smaller than the highest page number actually present
        declared_pages = ocr_data.get('total_pages') or 1
        highest_region_page = max((p for p in pages_data if isinstance(p, int)), default=1)
        total_pages = max(declared_pages, highest_region_page)

        for page_num in range(1, total_pages + 1):
            if page_num > 1:
                pdf_canvas.showPage()  # Start new page

            # Draw text regions for this page (excluding table text)
            for region in pages_data.get(page_num, []):
                self.draw_text_region(pdf_canvas, region, page_height)

            # Draw tables for this page (table 'page' is 0-indexed)
            for table_elem in table_elements:
                if table_elem.get('page', 0) == page_num - 1:
                    self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)

            # Draw non-table images for this page (figure, chart, seal, etc.)
            for img_meta in images_metadata:
                if img_meta.get('page') != page_num - 1:  # page is 0-indexed
                    continue
                # Skip table images (they're now rendered as tables)
                if 'table' not in img_meta.get('image_path', '').lower():
                    self.draw_image_region(
                        pdf_canvas,
                        img_meta,
                        page_height,
                        json_path.parent
                    )

        pdf_canvas.save()
        file_size = output_path.stat().st_size
        logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
        return True
    except Exception as e:
        # exc_info routes the stack trace through logging instead of stderr
        logger.error(f"Failed to generate PDF: {e}", exc_info=True)
        return False
# Singleton instance: created once at import time and imported by name
# elsewhere (`from app.services.pdf_generator_service import pdf_generator_service`).
# Constructing it attempts to register the Chinese font.
pdf_generator_service = PDFGeneratorService()