feat: add translated PDF export with layout preservation
Adds the ability to download translated documents as PDF files while
preserving the original document layout. Key changes:
- Add apply_translations() function to merge translation JSON with UnifiedDocument
- Add generate_translated_pdf() method to PDFGeneratorService
- Add POST /api/v2/translate/{task_id}/pdf endpoint
- Add downloadTranslatedPdf() method and PDF button in frontend
- Add comprehensive unit tests (52 tests: merge, PDF generation, API endpoints)
- Archive add-translated-pdf-export proposal
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -3601,6 +3601,100 @@ class PDFGeneratorService:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to draw image element {element.element_id}: {e}")
|
||||
|
||||
def generate_translated_pdf(
    self,
    result_json_path: Path,
    translation_json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate a layout-preserving PDF with translated content.

    Loads the original result JSON and the translation JSON, merges them
    with ``apply_translations``, writes the merged document to a temporary
    JSON file and delegates rendering to ``generate_layout_pdf``. When the
    translation JSON has no (or an empty) ``translations`` entry, the PDF is
    generated from the original result JSON instead.

    Args:
        result_json_path: Path to original result JSON file (UnifiedDocument format)
        translation_json_path: Path to translation JSON file
        output_path: Path to save generated translated PDF
        source_file_path: Optional path to original source file, forwarded
            unchanged to ``generate_layout_pdf``

    Returns:
        The value returned by ``generate_layout_pdf`` on the normal and
        fallback paths; False when an input file is missing, contains
        invalid JSON, or any other exception is raised.
    """
    import tempfile

    # Tracked at function scope so the ``finally`` clause can remove the
    # temporary file regardless of where a failure happens (including a
    # failure while the merged JSON is still being written).
    tmp_path = None

    try:
        # Imported at call time rather than at module import time.
        from app.services.translation_service import apply_translations

        # Load original result JSON
        logger.info(f"Loading result JSON: {result_json_path}")
        with open(result_json_path, 'r', encoding='utf-8') as f:
            result_json = json.load(f)

        # Load translation JSON
        logger.info(f"Loading translation JSON: {translation_json_path}")
        with open(translation_json_path, 'r', encoding='utf-8') as f:
            translation_json = json.load(f)

        # Extract translations dict from translation JSON
        translations = translation_json.get('translations', {})
        if not translations:
            logger.warning("No translations found in translation JSON")
            # Still generate PDF with original content as fallback
            return self.generate_layout_pdf(
                json_path=result_json_path,
                output_path=output_path,
                source_file_path=source_file_path
            )

        # Apply translations to result JSON
        translated_doc = apply_translations(result_json, translations)

        target_lang = translation_json.get('target_lang', 'unknown')
        logger.info(
            f"Generating translated PDF: {len(translations)} translations applied, "
            f"target_lang={target_lang}"
        )

        # generate_layout_pdf takes a JSON path, so the merged document is
        # written to a temporary file first. delete=False keeps the file
        # on disk after the handle is closed so it can be reopened by path.
        with tempfile.NamedTemporaryFile(
            mode='w',
            suffix='_translated.json',
            delete=False,
            encoding='utf-8'
        ) as tmp_file:
            # Record the path before writing so a failed dump is still
            # cleaned up.
            tmp_path = Path(tmp_file.name)
            json.dump(translated_doc, tmp_file, ensure_ascii=False, indent=2)

        # Use existing PDF generation with translated content
        return self.generate_layout_pdf(
            json_path=tmp_path,
            output_path=output_path,
            source_file_path=source_file_path
        )

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        return False
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON: {e}")
        return False
    except Exception as e:
        # logger.exception records the traceback through the logging
        # system instead of printing it to stderr.
        logger.exception(f"Failed to generate translated PDF: {e}")
        return False
    finally:
        # Clean up temporary file. A cleanup problem must not change the
        # outcome of the PDF generation, so it is only logged.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
            except OSError as e:
                logger.warning(f"Failed to remove temporary file {tmp_path}: {e}")
|
||||
|
||||
|
||||
# Singleton instance
|
||||
pdf_generator_service = PDFGeneratorService()
|
||||
|
||||
@@ -35,6 +35,166 @@ TABLE_TYPE = 'table'
|
||||
SKIP_TYPES = {'page_number', 'image', 'chart', 'logo', 'reference'}
|
||||
|
||||
|
||||
def apply_translations(
    result_json: Dict,
    translations: Dict[str, Any]
) -> Dict:
    """
    Apply translations to a result JSON document, creating a translated copy.

    The input document is deep-copied; only the ``content`` of elements whose
    ``element_id`` appears in ``translations`` is replaced. Everything else in
    the copy is left as it was in the original.

    Args:
        result_json: Original UnifiedDocument JSON data
        translations: Translation dict mapping element_id to translated content.
            For text elements: element_id -> translated_string
            For tables: element_id -> {"cells": [{"row": int, "col": int, "content": str}]}

    Returns:
        A deep copy of result_json with translations applied. Translations
        whose shape does not fit the target element are skipped with a
        warning.
    """
    import copy

    translated_doc = copy.deepcopy(result_json)
    applied_count = 0

    # ``or []`` also covers keys that are present but null in the JSON.
    for page in translated_doc.get('pages') or []:
        for elem in page.get('elements') or []:
            elem_id = elem.get('element_id', '')
            elem_type = elem.get('type', '')

            if elem_id not in translations:
                continue

            translation = translations[elem_id]

            # Handle text elements (string translation)
            if isinstance(translation, str):
                if elem_type in TRANSLATABLE_TEXT_TYPES:
                    elem['content'] = translation
                    applied_count += 1
                else:
                    logger.warning(
                        f"Translation for {elem_id} is string but element type is {elem_type}"
                    )

            # Handle table elements (cells translation)
            elif isinstance(translation, dict) and 'cells' in translation:
                if elem_type != TABLE_TYPE:
                    logger.warning(
                        f"Translation for {elem_id} is table but element type is {elem_type}"
                    )
                elif not isinstance(elem.get('content'), dict):
                    # The element is a table, but there is no cell
                    # structure to merge the translation into.
                    logger.warning(
                        f"Translation for {elem_id} is table but element content is not a dict"
                    )
                else:
                    _apply_table_translation(elem, translation)
                    applied_count += 1

            # Anything else is neither a text nor a table translation.
            else:
                logger.warning(
                    f"Unsupported translation format for {elem_id}: "
                    f"{type(translation).__name__}"
                )

    logger.info(f"Applied {applied_count} translations to document")
    return translated_doc
|
||||
|
||||
|
||||
def _apply_table_translation(
|
||||
table_elem: Dict,
|
||||
translation: Dict[str, Any]
|
||||
) -> None:
|
||||
"""
|
||||
Apply translation to a table element's cells.
|
||||
|
||||
Args:
|
||||
table_elem: Table element dict with content.cells
|
||||
translation: Translation dict with 'cells' list
|
||||
"""
|
||||
content = table_elem.get('content', {})
|
||||
original_cells = content.get('cells', [])
|
||||
|
||||
if not original_cells:
|
||||
return
|
||||
|
||||
# Build lookup for translated cells by (row, col)
|
||||
translated_cells = {}
|
||||
for cell in translation.get('cells', []):
|
||||
row = cell.get('row', 0)
|
||||
col = cell.get('col', 0)
|
||||
translated_cells[(row, col)] = cell.get('content', '')
|
||||
|
||||
# Apply translations to matching cells
|
||||
for cell in original_cells:
|
||||
row = cell.get('row', 0)
|
||||
col = cell.get('col', 0)
|
||||
key = (row, col)
|
||||
|
||||
if key in translated_cells:
|
||||
cell['content'] = translated_cells[key]
|
||||
|
||||
|
||||
def load_translation_json(translation_path: Path) -> Optional[Dict]:
    """
    Load translation JSON file.

    Args:
        translation_path: Path to translation JSON file

    Returns:
        The parsed JSON object as a dict, or None if the file doesn't
        exist, cannot be read or parsed, or does not contain a JSON object
        at the top level.
    """
    if not translation_path.exists():
        return None

    try:
        with open(translation_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Best-effort loader: any read/parse problem is reported as None.
        logger.error(f"Failed to load translation JSON: {e}")
        return None

    # Valid JSON may still be a list or scalar; reject it here so callers
    # can rely on getting a dict.
    if not isinstance(data, dict):
        logger.error(
            f"Failed to load translation JSON: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return None

    return data
|
||||
|
||||
|
||||
def find_translation_file(
    result_dir: Path,
    target_lang: str
) -> Optional[Path]:
    """
    Find translation file for a given language in result directory.

    Looks for files named ``*_translated_{target_lang}.json`` directly inside
    ``result_dir``. When several files match, the smallest path in sort
    order is returned so the choice does not depend on directory listing
    order.

    Args:
        result_dir: Directory containing result files
        target_lang: Target language code (e.g., 'en', 'zh-TW')

    Returns:
        Path to translation file or None if not found
    """
    # Look for *_translated_{lang}.json pattern
    pattern = f"*_translated_{target_lang}.json"

    # glob yields entries in arbitrary order; min() makes the pick stable.
    return min(result_dir.glob(pattern), default=None)
|
||||
|
||||
|
||||
def list_available_translations(result_dir: Path) -> List[str]:
    """
    List all available translation languages for a result directory.

    The language is the part of the file stem after the last
    ``_translated_`` in files named ``*_translated_*.json`` directly inside
    ``result_dir``.

    Args:
        result_dir: Directory containing result files

    Returns:
        Sorted list of distinct language codes with available translations
    """
    languages = set()

    for path in result_dir.glob("*_translated_*.json"):
        # Extract language from filename: xxx_translated_{lang}.json
        lang = path.stem.rpartition('_translated_')[2]
        if lang:
            languages.add(lang)

    # Several files may share a language and glob order is arbitrary, so
    # return each language once, in a stable order.
    return sorted(languages)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TranslationBatch:
|
||||
"""A batch of items to translate together"""
|
||||
|
||||
Reference in New Issue
Block a user