feat: add translated PDF format selection (layout/reflow)

- Add generate_translated_layout_pdf() method for layout-preserving translated PDFs
- Add generate_translated_pdf() method for reflow translated PDFs
- Update translate router to accept format parameter (layout/reflow)
- Update frontend with dropdown to select translated PDF format
- Fix reflow PDF table cell extraction from content dict
- Add embedded images handling in reflow PDF tables
- Archive improve-translated-text-fitting openspec proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-03 10:10:28 +08:00
parent 0dcea4a7e7
commit 08adf3d01d
15 changed files with 1384 additions and 1222 deletions

View File

@@ -645,16 +645,22 @@ async def download_markdown(
@router.get("/{task_id}/download/pdf", summary="Download PDF result")
async def download_pdf(
task_id: str,
format: Optional[str] = Query(
None,
description="PDF format: 'layout' (default) preserves original coordinates, 'reflow' provides flowing text with consistent font sizes"
),
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
):
"""
Download task result as layout-preserving PDF file
Download task result as PDF file
- **task_id**: Task UUID
- **format**: Optional format parameter
- `layout` (default): Preserves original document layout and coordinates
- `reflow`: Flowing text with consistent font sizes for better readability
Returns a PDF that preserves the original document layout using OCR results.
The PDF is generated from OCR JSON data and cached for subsequent requests.
Returns a PDF generated from OCR JSON data. The PDF is cached for subsequent requests.
"""
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
@@ -679,12 +685,15 @@ async def download_pdf(
detail="Task is not completed yet. Please wait for OCR processing to finish."
)
# Check if PDF path is stored in database
if task.result_pdf_path and Path(task.result_pdf_path).exists():
# Determine format (default to layout)
use_reflow = format and format.lower() == "reflow"
# Use the PDF path stored in the database for the layout format only; reflow PDFs are looked up on disk or generated on demand below
if not use_reflow and task.result_pdf_path and Path(task.result_pdf_path).exists():
pdf_path = Path(task.result_pdf_path)
logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
else:
# Fallback: Try to generate PDF on-demand
# Generate PDF on-demand
result_dir = Path(settings.result_dir) / task_id
# Use stored JSON path or construct it
@@ -700,13 +709,14 @@ async def download_pdf(
)
json_path = json_files[0]
# Construct PDF path based on JSON filename
pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
# Construct PDF path based on JSON filename and format
format_suffix = "_reflow" if use_reflow else "_layout"
pdf_filename = json_path.stem.replace("_result", format_suffix) + ".pdf"
pdf_path = result_dir / pdf_filename
# Generate PDF if it doesn't exist
if not pdf_path.exists():
logger.info(f"Generating layout-preserving PDF for task {task_id}")
logger.info(f"Generating {'reflow' if use_reflow else 'layout-preserving'} PDF for task {task_id}")
# Get source file path if available
source_file = None
@@ -714,12 +724,20 @@ async def download_pdf(
if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
source_file = Path(task_file.stored_path)
# Generate PDF
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file
)
# Generate PDF based on format
if use_reflow:
# For reflow, pass result_dir as source_file_path (contains extracted images)
success = pdf_generator_service.generate_reflow_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=result_dir
)
else:
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file
)
if not success:
raise HTTPException(
@@ -743,8 +761,10 @@ async def download_pdf(
detail=error_msg
)
# Return file
filename = f"{task.filename or task_id}_result.pdf"
# Return file with format indication in filename
base_name = task.filename or task_id
format_suffix = "_reflow" if use_reflow else "_layout"
filename = f"{base_name}{format_suffix}.pdf"
return FileResponse(
path=str(pdf_path),
filename=filename,

View File

@@ -507,16 +507,18 @@ async def delete_translation(
async def download_translated_pdf(
task_id: str,
lang: str = Query(..., description="Target language code"),
format: str = Query("reflow", description="PDF format: 'layout' or 'reflow'"),
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user)
):
"""
Download a translated PDF with layout preservation.
Download a translated PDF.
- **task_id**: Task UUID
- **lang**: Target language code (e.g., 'en', 'ja')
- **format**: PDF format - 'layout' (preserves positions with text wrapping) or 'reflow' (flowing layout)
Returns PDF file with translated content preserving original layout.
Returns PDF file with translated content.
"""
from app.services.pdf_generator_service import pdf_generator_service
from app.services.translation_service import list_available_translations
@@ -587,26 +589,37 @@ async def download_translated_pdf(
detail="Invalid translation file format"
)
# Validate format parameter
use_layout = format.lower() == 'layout'
# Generate translated PDF to temp file
output_filename = f"{task_id}_translated_{lang}.pdf"
format_suffix = '_layout' if use_layout else '_reflow'
output_filename = f"{task_id}_translated_{lang}{format_suffix}.pdf"
with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
output_path = Path(tmp_file.name)
try:
# Get source file path for images if available
source_file_path = None
if task.files and len(task.files) > 0:
stored_path = task.files[0].stored_path
if stored_path and Path(stored_path).exists():
source_file_path = Path(stored_path)
# Use result_dir as image source (contains extracted images)
image_dir = result_json_path.parent
success = pdf_generator_service.generate_translated_pdf(
result_json_path=result_json_path,
translation_json_path=translation_file,
output_path=output_path,
source_file_path=source_file_path
)
# Choose PDF generation method based on format
if use_layout:
# Layout mode: preserve original positions with text wrapping
success = pdf_generator_service.generate_translated_layout_pdf(
result_json_path=result_json_path,
translation_json_path=translation_file,
output_path=output_path,
source_file_path=image_dir
)
else:
# Reflow mode: flowing layout
success = pdf_generator_service.generate_translated_pdf(
result_json_path=result_json_path,
translation_json_path=translation_file,
output_path=output_path,
source_file_path=image_dir
)
if not success:
raise HTTPException(

View File

@@ -15,7 +15,8 @@ from reportlab.lib.units import mm
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import Table, TableStyle
from reportlab.platypus import Table, TableStyle, SimpleDocTemplate, Spacer
from reportlab.platypus import Image as PlatypusImage
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.platypus import Paragraph
@@ -3601,6 +3602,387 @@ class PDFGeneratorService:
except Exception as e:
logger.error(f"Failed to draw image element {element.element_id}: {e}")
# ============================================================================
# Reflow Layout PDF Generation
# ============================================================================
def _get_elements_in_reading_order(self, page_data: Dict) -> List[Dict]:
    """
    Get elements sorted by reading order.

    For OCR track: Uses explicit 'reading_order' array from JSON
    For Direct track: Uses implicit element list order (PyMuPDF sort=True)

    Entries of 'reading_order' that are not in-range integer indices into
    'elements' are ignored. An index listed more than once contributes its
    element only the first time. Elements never placed through
    'reading_order' are appended afterwards in their original order, so no
    element is dropped or duplicated.

    Args:
        page_data: Page dictionary containing 'elements' and optionally 'reading_order'

    Returns:
        List of elements in proper reading order
    """
    elements = page_data.get('elements', [])
    reading_order = page_data.get('reading_order')

    if not (reading_order and isinstance(reading_order, list)):
        # Direct track: elements already in reading order from PyMuPDF
        return elements

    # OCR track: use explicit reading order
    ordered = []
    # Track only the indices actually placed. Building a set from the raw
    # list would fail on unhashable entries and would treat values such as
    # 1.0 as "already placed" even though they were never used as an index.
    placed = set()
    for idx in reading_order:
        if not isinstance(idx, int):
            continue
        if 0 <= idx < len(elements) and idx not in placed:
            placed.add(idx)
            ordered.append(elements[idx])

    # Add any elements not in reading_order at the end
    ordered.extend(elem for i, elem in enumerate(elements) if i not in placed)
    return ordered
def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]:
    """
    Build the paragraph styles used by reflow PDF generation.

    Every style derives from the sample stylesheet's 'Normal' style and uses
    the registered font (self.font_name) when one is registered, otherwise
    'Helvetica'.

    Returns:
        Dict keyed by 'Title', 'Heading1', 'Heading2', 'Body', 'TableCell'
        and 'Caption', each mapping to a ParagraphStyle
    """
    normal = getSampleStyleSheet()['Normal']
    font = self.font_name if self.font_registered else 'Helvetica'

    # (dictionary key, ReportLab style name, style attributes)
    specs = (
        ('Title', 'ReflowTitle', {
            'fontSize': 18, 'leading': 22,
            'spaceAfter': 12,
            'textColor': colors.black,
        }),
        ('Heading1', 'ReflowH1', {
            'fontSize': 16, 'leading': 20,
            'spaceAfter': 10, 'spaceBefore': 12,
            'textColor': colors.black,
        }),
        ('Heading2', 'ReflowH2', {
            'fontSize': 14, 'leading': 18,
            'spaceAfter': 8, 'spaceBefore': 10,
            'textColor': colors.black,
        }),
        ('Body', 'ReflowBody', {
            'fontSize': 12, 'leading': 16,
            'spaceAfter': 6,
            'textColor': colors.black,
        }),
        ('TableCell', 'ReflowTableCell', {
            'fontSize': 10, 'leading': 13,
            'textColor': colors.black,
        }),
        ('Caption', 'ReflowCaption', {
            'fontSize': 10, 'leading': 13,
            'spaceAfter': 8,
            'textColor': colors.gray,
        }),
    )

    return {
        key: ParagraphStyle(style_name, parent=normal, fontName=font, **attrs)
        for key, style_name, attrs in specs
    }
def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]:
    """
    Create a Platypus Table for reflow mode.

    Table data is looked up inside the element's 'content' dict first and,
    if nothing usable is found there, directly on the element. In either
    place a list under 'rows' is used as-is; otherwise a list under 'cells'
    is grouped into rows by 'row'/'row_index' and ordered within each row by
    'col'/'col_index'.

    A row may be a dict with a 'cells' list or a plain list of cells. A cell
    may be a dict (its 'text', else 'content', is rendered) or a bare value.

    Args:
        table_data: Table element dictionary with 'rows' or 'cells'
        styles: Style dictionary; must provide 'TableCell'

    Returns:
        Platypus Table object, or None when there is no cell data or the
        table could not be built
    """
    try:
        content = table_data.get('content')
        # Cells might be inside the 'content' dict or directly on the element
        sources = [content, table_data] if isinstance(content, dict) else [table_data]

        rows_data = []
        cells = []
        for source in sources:
            found_rows = source.get('rows')
            found_cells = source.get('cells')
            # Only list values are cell data; other value types are ignored
            rows_data = found_rows if isinstance(found_rows, list) else []
            cells = found_cells if isinstance(found_cells, list) else []
            if rows_data or cells:
                break

        if not rows_data and cells:
            # Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
            row_map = {}
            for cell in cells:
                if not isinstance(cell, dict):
                    continue
                row_idx = cell.get('row', cell.get('row_index', 0))
                row_map.setdefault(row_idx, []).append(cell)

            # Sort and create rows
            rows_data = [
                {'cells': sorted(
                    row_map[row_idx],
                    key=lambda c: c.get('col', c.get('col_index', 0))
                )}
                for row_idx in sorted(row_map)
            ]

        if not rows_data:
            return None

        # Build table data
        data = []
        for row in rows_data:
            if isinstance(row, dict):
                row_cells = row.get('cells') or []
            elif isinstance(row, (list, tuple)):
                row_cells = row
            else:
                continue

            row_data = []
            for cell in row_cells:
                if isinstance(cell, dict):
                    # Support both 'text' and 'content' keys
                    text = cell.get('text', cell.get('content', ''))
                else:
                    text = cell
                if not isinstance(text, str):
                    text = str(text) if text else ''
                # Escape HTML special characters (Paragraph parses inline markup)
                text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                row_data.append(Paragraph(text, styles['TableCell']))
            if row_data:
                data.append(row_data)

        if not data:
            return None

        # Create table
        table = Table(data)
        table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('LEFTPADDING', (0, 0), (-1, -1), 6),
            ('RIGHTPADDING', (0, 0), (-1, -1), 6),
            ('TOPPADDING', (0, 0), (-1, -1), 4),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row
        ]))

        return table

    except Exception as e:
        # Best effort: a malformed table must not abort the whole document
        logger.error(f"Failed to create reflow table: {e}")
        return None
def _embed_image_reflow(
    self,
    element: Dict,
    result_dir: Path,
    max_width: float = 450
) -> Optional[PlatypusImage]:
    """
    Build a Platypus Image flowable for an image element.

    The image location is read from the element's 'image_path' (else 'path')
    and, when that is empty, from 'saved_path' (else 'path') inside a dict
    'content'. The file is resolved against result_dir, first using the
    stored path and then using only its file name. Images wider than
    max_width are scaled down proportionally.

    Args:
        element: Image element dictionary
        result_dir: Directory containing images
        max_width: Maximum width in points

    Returns:
        Platypus Image object, or None if no path is recorded, the file is
        missing, or the image cannot be loaded
    """
    try:
        # Top-level location first
        rel_path = element.get('image_path', element.get('path', ''))

        # Then content.saved_path (Direct track format)
        if not rel_path:
            inner = element.get('content', {})
            if isinstance(inner, dict):
                rel_path = inner.get('saved_path', inner.get('path', ''))

        if not rel_path:
            return None

        candidate = result_dir / rel_path
        if not candidate.exists():
            # Retry with just the file name
            candidate = result_dir / Path(rel_path).name
            if not candidate.exists():
                logger.warning(f"Image not found for reflow: {rel_path}")
                return None

        image = PlatypusImage(str(candidate))

        # Shrink proportionally when wider than the allowed width
        original_width = image.drawWidth
        if original_width > max_width:
            image.drawHeight *= max_width / original_width
            image.drawWidth = max_width

        return image

    except Exception as e:
        logger.error(f"Failed to embed image for reflow: {e}")
        return None
def generate_reflow_pdf(
    self,
    json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate reflow layout PDF from OCR/Direct JSON data.

    This creates a flowing document with consistent font sizes,
    proper reading order, and inline tables/images. The output is an A4
    document with 50pt margins; content from consecutive source pages is
    separated by vertical space rather than a page break.

    Args:
        json_path: Path to result JSON file (UnifiedDocument format)
        output_path: Path to save generated PDF
        source_file_path: Optional directory (or a file whose parent
            directory) that contains the extracted images. Defaults to the
            directory of json_path.

    Returns:
        True if successful, False otherwise (including when the document
        has no renderable content)
    """
    from xml.sax.saxutils import escape as xml_escape

    table_types = ('table', 'Table')
    image_types = ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart')
    # Text element types mapped to reflow style keys; any other type is body text
    text_style_keys = {
        'title': 'Title', 'Title': 'Title',
        'section_header': 'Heading1', 'SectionHeader': 'Heading1',
        'h1': 'Heading1', 'H1': 'Heading1',
        'h2': 'Heading2', 'H2': 'Heading2', 'Heading2': 'Heading2',
    }

    try:
        # Load JSON data
        logger.info(f"Generating reflow PDF from: {json_path}")
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Get styles
        styles = self._get_reflow_styles()

        # Build document content
        story = []

        # Use source_file_path if provided (for translated PDFs where JSON is in temp dir)
        # Otherwise use json_path.parent (for regular reflow PDFs)
        if source_file_path and source_file_path.is_dir():
            result_dir = source_file_path
        elif source_file_path and source_file_path.is_file():
            result_dir = source_file_path.parent
        else:
            result_dir = json_path.parent

        # Process each page
        pages = json_data.get('pages', [])
        for page_idx, page_data in enumerate(pages):
            if page_idx > 0:
                # Visual gap between source pages (a spacer, not a page break)
                story.append(Spacer(1, 30))

            # Get elements in reading order
            for elem in self._get_elements_in_reading_order(page_data):
                if not isinstance(elem, dict):
                    continue

                elem_type = elem.get('type', elem.get('element_type', 'text'))
                content = elem.get('content', elem.get('text', ''))

                if isinstance(content, dict):
                    # Tables, images, charts have dict content - handled by their
                    # respective methods; other elements with dict content are skipped
                    if elem_type not in table_types + image_types:
                        continue
                elif not isinstance(content, str):
                    # Ensure content is a string for text elements
                    content = str(content) if content else ''

                if elem_type in table_types:
                    # Handle table
                    table = self._create_reflow_table(elem, styles)
                    if table:
                        story.append(table)
                        story.append(Spacer(1, 12))

                    # Handle embedded images in table (from metadata);
                    # tolerate a missing or null 'metadata' / 'embedded_images'
                    metadata = elem.get('metadata') or {}
                    embedded_images = []
                    if isinstance(metadata, dict):
                        embedded_images = metadata.get('embedded_images') or []
                    for emb_img in embedded_images:
                        if not isinstance(emb_img, dict):
                            continue
                        img_path_str = emb_img.get('saved_path', '')
                        if not img_path_str:
                            continue
                        # Same path resolution and width scaling as stand-alone images
                        img = self._embed_image_reflow(
                            {'image_path': img_path_str}, result_dir
                        )
                        if img:
                            story.append(img)
                            story.append(Spacer(1, 8))
                            logger.info(
                                f"Embedded table image in reflow: {Path(img_path_str).name}"
                            )

                elif elem_type in image_types:
                    # Handle image/chart
                    img = self._embed_image_reflow(elem, result_dir)
                    if img:
                        story.append(img)
                        story.append(Spacer(1, 8))

                elif content:
                    # Title / heading / body text. Paragraph parses inline
                    # markup, so &, < and > must be escaped first.
                    style_key = 'Body'
                    if isinstance(elem_type, str):
                        style_key = text_style_keys.get(elem_type, 'Body')
                    story.append(Paragraph(xml_escape(content), styles[style_key]))

        if not story:
            logger.warning("No content to generate reflow PDF")
            return False

        # Create PDF document
        doc = SimpleDocTemplate(
            str(output_path),
            pagesize=A4,
            leftMargin=50,
            rightMargin=50,
            topMargin=50,
            bottomMargin=50
        )

        # Build PDF
        doc.build(story)

        logger.info(f"Generated reflow PDF: {output_path} ({output_path.stat().st_size} bytes)")
        return True

    except Exception as e:
        # Top-level boundary: report failure to the caller via the return
        # value and keep the traceback in the application log
        logger.error(f"Failed to generate reflow PDF: {e}", exc_info=True)
        return False
def generate_translated_pdf(
self,
result_json_path: Path,
@@ -3609,7 +3991,7 @@ class PDFGeneratorService:
source_file_path: Optional[Path] = None
) -> bool:
"""
Generate layout-preserving PDF with translated content.
Generate reflow layout PDF with translated content.
This method loads the original result JSON and translation JSON,
merges them to replace original content with translations, and
@@ -3660,7 +4042,7 @@ class PDFGeneratorService:
f"target_lang={target_lang}"
)
# Write translated JSON to a temporary file and use existing generate_layout_pdf
# Write translated JSON to a temporary file and use reflow PDF generation
with tempfile.NamedTemporaryFile(
mode='w',
suffix='_translated.json',
@@ -3671,11 +4053,12 @@ class PDFGeneratorService:
tmp_path = Path(tmp_file.name)
try:
# Use existing PDF generation with translated content
success = self.generate_layout_pdf(
# Use reflow PDF generation for better translated content display
# Pass result_json_path.parent as image directory (not the temp file's parent)
success = self.generate_reflow_pdf(
json_path=tmp_path,
output_path=output_path,
source_file_path=source_file_path
source_file_path=result_json_path.parent # Contains extracted images
)
return success
finally:
@@ -3695,6 +4078,319 @@ class PDFGeneratorService:
traceback.print_exc()
return False
def generate_translated_layout_pdf(
    self,
    result_json_path: Path,
    translation_json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate layout-preserving PDF with translated content.

    This method creates a PDF that maintains the original document layout
    while displaying translated text. Key features:
    - Text wraps within the original bounding-box width (no font shrinking)
    - Tables are drawn via _draw_translated_table
    - Images are scaled to fit their original bounding boxes
    - Font size is kept readable (minimum 10pt, maximum 24pt)

    If the translation file contains no translations, the original layout
    PDF is generated instead via generate_layout_pdf.

    Args:
        result_json_path: Path to original result JSON file
        translation_json_path: Path to translation JSON file
        output_path: Path to save generated translated PDF
        source_file_path: Optional path for image directory (a directory, or
            a file whose parent directory is used). Defaults to the
            directory of result_json_path.

    Returns:
        True if successful, False otherwise
    """
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.pdfgen import canvas
    from reportlab.platypus import Paragraph

    image_types = ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart')
    table_types = ('table', 'Table')

    try:
        # Import apply_translations from translation service
        from app.services.translation_service import apply_translations

        # Load original result JSON
        logger.info(f"Loading result JSON for layout PDF: {result_json_path}")
        with open(result_json_path, 'r', encoding='utf-8') as f:
            result_json = json.load(f)

        # Load translation JSON
        logger.info(f"Loading translation JSON: {translation_json_path}")
        with open(translation_json_path, 'r', encoding='utf-8') as f:
            translation_json = json.load(f)

        # Extract translations dict
        translations = translation_json.get('translations', {})
        if not translations:
            logger.warning("No translations found, falling back to original layout PDF")
            return self.generate_layout_pdf(
                json_path=result_json_path,
                output_path=output_path,
                source_file_path=source_file_path
            )

        # Apply translations to result JSON
        translated_doc = apply_translations(result_json, translations)

        target_lang = translation_json.get('target_lang', 'unknown')
        logger.info(
            f"Generating translated layout PDF: {len(translations)} translations, "
            f"target_lang={target_lang}"
        )

        # Determine image directory
        if source_file_path and source_file_path.is_dir():
            image_dir = source_file_path
        elif source_file_path and source_file_path.is_file():
            image_dir = source_file_path.parent
        else:
            image_dir = result_json_path.parent

        # Get page dimensions from first page
        pages = translated_doc.get('pages', [])
        if not pages:
            logger.error("No pages in document")
            return False

        # Defaults are the A4 size in points; a null 'dimensions' is treated as absent
        first_dims = pages[0].get('dimensions') or {}
        page_width = first_dims.get('width', 595.32)
        page_height = first_dims.get('height', 841.92)

        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

        # Base paragraph style for text wrapping
        base_style = ParagraphStyle(
            'TranslatedBase',
            fontName=self.font_name if self.font_registered else 'Helvetica',
            fontSize=10,
            leading=12,
            wordWrap='CJK',  # Support CJK word wrapping
        )

        # Process each page
        for page_idx, page_data in enumerate(pages):
            logger.info(f"Processing translated layout page {page_idx + 1}/{len(pages)}")

            # Get current page dimensions (fall back to the first page's size)
            dims = page_data.get('dimensions') or {}
            current_page_width = dims.get('width', page_width)
            current_page_height = dims.get('height', page_height)

            if page_idx > 0:
                pdf_canvas.showPage()
                pdf_canvas.setPageSize((current_page_width, current_page_height))

            # Process elements
            for elem in page_data.get('elements', []):
                elem_type = elem.get('type', 'text')
                content = elem.get('content', '')
                bbox = elem.get('bbox') or {}

                if not bbox:
                    continue

                x0 = bbox.get('x0', 0)
                y0 = bbox.get('y0', 0)
                x1 = bbox.get('x1', 0)
                y1 = bbox.get('y1', 0)
                box_width = x1 - x0
                box_height = y1 - y0

                if box_width <= 0 or box_height <= 0:
                    continue

                # Handle different element types
                if elem_type in image_types:
                    # Draw image
                    img = self._embed_image_reflow(elem, image_dir)
                    # A zero-sized image cannot be scaled; skip it instead of
                    # dividing by zero and aborting the whole document
                    if img and img.drawWidth > 0 and img.drawHeight > 0:
                        # Convert to PDF coordinates (origin at bottom-left)
                        pdf_y = current_page_height - y1
                        # Scale image to fit bbox, keeping the aspect ratio
                        scale = min(box_width / img.drawWidth, box_height / img.drawHeight)
                        img.drawWidth *= scale
                        img.drawHeight *= scale
                        img.drawOn(pdf_canvas, x0, pdf_y)

                elif elem_type in table_types:
                    # Draw table with wrapping
                    self._draw_translated_table(
                        pdf_canvas, elem, current_page_height, image_dir
                    )

                elif isinstance(content, str) and content.strip():
                    # Text element - use Paragraph for word wrapping.
                    # Escape special characters (Paragraph parses inline markup)
                    safe_content = content.replace('&', '&amp;')
                    safe_content = safe_content.replace('<', '&lt;')
                    safe_content = safe_content.replace('>', '&gt;')
                    # Replace newlines with <br/>
                    safe_content = safe_content.replace('\n', '<br/>')

                    # Font size derived from bbox height, clamped to 10..24pt.
                    # NOTE(review): box_height is the height of the whole
                    # element box, so blocks taller than ~34pt always get
                    # 24pt - confirm this is intended for multi-line blocks.
                    font_size = min(max(box_height * 0.7, 10), 24)

                    # Create style for this element
                    elem_style = ParagraphStyle(
                        f'elem_{id(elem)}',
                        parent=base_style,
                        fontSize=font_size,
                        leading=font_size * 1.2,
                    )

                    # Create paragraph
                    para = Paragraph(safe_content, elem_style)

                    # Wrap to the box width; the height passed to wrap() is
                    # doubled so the text may overflow below the original box
                    _, para_height = para.wrap(box_width, box_height * 2)

                    # Convert to PDF coordinates (y from bottom), anchoring
                    # the paragraph's top edge at the top of the bbox
                    pdf_y = current_page_height - y0 - para_height

                    # Draw the paragraph
                    para.drawOn(pdf_canvas, x0, pdf_y)

        # Save PDF
        pdf_canvas.save()
        logger.info(f"Translated layout PDF saved to {output_path}")
        return True

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        return False
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON: {e}")
        return False
    except Exception as e:
        # Top-level boundary: report failure via the return value and keep
        # the traceback in the application log
        logger.error(f"Failed to generate translated layout PDF: {e}", exc_info=True)
        return False
def _draw_translated_table(
    self,
    pdf_canvas,
    elem: Dict,
    page_height: float,
    image_dir: Path
):
    """
    Draw a table with translated content using Platypus Table.

    The table is split into equal-width columns across the element's bbox
    width and text wraps within cells. Table data is read from the element's
    'content' dict: a list under 'rows' is used directly (each row being a
    list of cells or a dict with 'cells'); otherwise a list under 'cells' is
    grouped into rows by 'row'/'row_index' and ordered by 'col'/'col_index'.
    Rows shorter than the widest row are padded with empty cells.

    Failures are logged and swallowed so one bad table does not abort the
    page being drawn.

    Args:
        pdf_canvas: ReportLab canvas
        elem: Table element dict
        page_height: Page height for coordinate transformation
        image_dir: Directory containing images (currently unused)
    """
    from reportlab.platypus import Table, TableStyle, Paragraph
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.lib import colors

    try:
        content = elem.get('content', {})
        bbox = elem.get('bbox', {})

        if not bbox:
            return

        x0 = bbox.get('x0', 0)
        y0 = bbox.get('y0', 0)
        x1 = bbox.get('x1', 0)
        y1 = bbox.get('y1', 0)
        table_width = x1 - x0
        table_height = y1 - y0

        # Parse table content
        if not isinstance(content, dict):
            return
        rows = content.get('rows', [])
        cells = content.get('cells', [])
        # Only list values are cell data; other value types are ignored
        if not isinstance(rows, list):
            rows = []
        if not isinstance(cells, list):
            cells = []

        if not rows and cells:
            # Flat cell list: rebuild rows from the cells' row/column indices
            grouped = {}
            for cell in cells:
                if isinstance(cell, dict):
                    row_idx = cell.get('row', cell.get('row_index', 0))
                    grouped.setdefault(row_idx, []).append(cell)
            rows = [
                sorted(grouped[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
                for row_idx in sorted(grouped)
            ]

        if not rows:
            return

        # One shared style for all cells (it does not vary per cell)
        cell_style = ParagraphStyle(
            'TranslatedTableCell',
            fontName=self.font_name if self.font_registered else 'Helvetica',
            fontSize=9,
            leading=11,
            wordWrap='CJK',
        )

        # Build table data
        table_data = []
        for row in rows:
            if isinstance(row, list):
                row_cells = row
            elif isinstance(row, dict):
                row_cells = row.get('cells') or []
            else:
                continue

            row_data = []
            for cell in row_cells:
                if isinstance(cell, str):
                    cell_text = cell
                elif isinstance(cell, dict):
                    cell_text = cell.get('content', cell.get('text', ''))
                else:
                    cell_text = str(cell) if cell else ''

                # Create paragraph for text wrapping; escape markup characters
                # because Paragraph parses inline markup
                safe_text = str(cell_text).replace('&', '&amp;')
                safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
                row_data.append(Paragraph(safe_text, cell_style))

            if row_data:
                table_data.append(row_data)

        if not table_data:
            return

        # Calculate column widths (equal split of the bbox width)
        num_cols = max(len(row) for row in table_data)
        col_width = table_width / num_cols

        # Pad ragged rows so every row matches the colWidths length
        table_data = [row + [''] * (num_cols - len(row)) for row in table_data]

        # Create table
        table = Table(table_data, colWidths=[col_width] * num_cols)

        # Apply table style
        table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('LEFTPADDING', (0, 0), (-1, -1), 4),
            ('RIGHTPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ]))

        # Wrap the table; the height passed to wrap() is doubled so the
        # table may extend below the original box
        _, t_height = table.wrap(table_width, table_height * 2)

        # Convert to PDF coordinates, anchoring the table's top edge at the
        # top of the bbox
        pdf_y = page_height - y0 - t_height
        table.drawOn(pdf_canvas, x0, pdf_y)

    except Exception as e:
        logger.error(f"Failed to draw translated table: {e}")
# Singleton instance shared across the application; the routers import it via
# `from app.services.pdf_generator_service import pdf_generator_service`
pdf_generator_service = PDFGeneratorService()