feat: add translated PDF export with layout preservation

Adds the ability to download translated documents as PDF files while preserving the original document layout.

Key changes:
- Add apply_translations() function to merge translation JSON with UnifiedDocument
- Add generate_translated_pdf() method to PDFGeneratorService
- Add POST /api/v2/translate/{task_id}/pdf endpoint
- Add downloadTranslatedPdf() method and PDF button in frontend
- Add comprehensive unit tests (52 tests: merge, PDF generation, API endpoints)
- Archive add-translated-pdf-export proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 12:33:31 +08:00
parent 8d9b69ba93
commit a07aad96b3
15 changed files with 2663 additions and 2 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -3601,6 +3601,100 @@ class PDFGeneratorService:
        except Exception as e:
            logger.error(f"Failed to draw image element {element.element_id}: {e}")

+    def generate_translated_pdf(
+        self,
+        result_json_path: Path,
+        translation_json_path: Path,
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate layout-preserving PDF with translated content.
+
+        Loads the original result JSON and the translation JSON, merges them
+        with ``apply_translations`` and renders the merged document by writing
+        it to a temporary JSON file that is handed to ``generate_layout_pdf``.
+        When the translation JSON has no (or an empty) ``translations`` entry,
+        the PDF is rendered from the original result JSON instead.
+
+        Args:
+            result_json_path: Path to original result JSON file (UnifiedDocument format)
+            translation_json_path: Path to translation JSON file
+            output_path: Path to save generated translated PDF
+            source_file_path: Optional path to original source file
+
+        Returns:
+            The result of ``generate_layout_pdf`` on the normal path. False when
+            an input file is missing, contains invalid JSON, or any other error
+            occurs; the error is logged rather than raised.
+        """
+        import tempfile
+
+        try:
+            # Imported lazily, as in the rest of this method's error handling:
+            # an import failure is reported as False instead of propagating.
+            from app.services.translation_service import apply_translations
+
+            # Load original result JSON
+            logger.info(f"Loading result JSON: {result_json_path}")
+            with open(result_json_path, 'r', encoding='utf-8') as f:
+                result_json = json.load(f)
+
+            # Load translation JSON
+            logger.info(f"Loading translation JSON: {translation_json_path}")
+            with open(translation_json_path, 'r', encoding='utf-8') as f:
+                translation_json = json.load(f)
+
+            # Extract translations dict from translation JSON
+            translations = translation_json.get('translations', {})
+            if not translations:
+                logger.warning("No translations found in translation JSON")
+                # Still generate PDF with original content as fallback
+                return self.generate_layout_pdf(
+                    json_path=result_json_path,
+                    output_path=output_path,
+                    source_file_path=source_file_path
+                )
+
+            # Apply translations to result JSON
+            translated_doc = apply_translations(result_json, translations)
+
+            target_lang = translation_json.get('target_lang', 'unknown')
+            logger.info(
+                f"Generating translated PDF: {len(translations)} translations applied, "
+                f"target_lang={target_lang}"
+            )
+
+            # The file is created with delete=False, so it must be removed by
+            # hand. Record its path *before* serializing so the cleanup below
+            # also runs when json.dump fails part-way through.
+            tmp_file = tempfile.NamedTemporaryFile(
+                mode='w',
+                suffix='_translated.json',
+                delete=False,
+                encoding='utf-8'
+            )
+            tmp_path = Path(tmp_file.name)
+
+            try:
+                # Closing the handle flushes the JSON to disk before it is read back
+                with tmp_file:
+                    json.dump(translated_doc, tmp_file, ensure_ascii=False, indent=2)
+
+                # Use existing PDF generation with translated content
+                return self.generate_layout_pdf(
+                    json_path=tmp_path,
+                    output_path=output_path,
+                    source_file_path=source_file_path
+                )
+            finally:
+                # Best-effort cleanup: a failure to delete the temporary file
+                # must not override the outcome of the PDF generation.
+                try:
+                    if tmp_path.exists():
+                        tmp_path.unlink()
+                except OSError as cleanup_error:
+                    logger.warning(
+                        f"Could not remove temporary file {tmp_path}: {cleanup_error}"
+                    )
+
+        except FileNotFoundError as e:
+            logger.error(f"File not found: {e}")
+            return False
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON: {e}")
+            return False
+        except Exception as e:
+            # logger.exception records the traceback through the logger
+            # instead of printing it to stderr.
+            logger.exception(f"Failed to generate translated PDF: {e}")
+            return False
+

 # Singleton instance
 pdf_generator_service = PDFGeneratorService()
--- a/backend/app/services/translation_service.py
+++ b/backend/app/services/translation_service.py
@@ -35,6 +35,166 @@ TABLE_TYPE = 'table'
 SKIP_TYPES = {'page_number', 'image', 'chart', 'logo', 'reference'}


+def apply_translations(
+    result_json: Dict,
+    translations: Dict[str, Any]
+) -> Dict:
+    """
+    Apply translations to a result JSON document, creating a translated copy.
+
+    The input document is deep-copied first, so ``result_json`` is never
+    modified. For every element whose ``element_id`` has an entry in
+    ``translations`` the element's content is replaced; all other properties
+    of the element are left as they are in the copy.
+
+    Args:
+        result_json: Original UnifiedDocument JSON data
+        translations: Translation dict mapping element_id to translated content.
+                     For text elements: element_id -> translated_string
+                     For tables: element_id -> {"cells": [{"row": int, "col": int, "content": str}]}
+
+    Returns:
+        A deep copy of result_json with translations applied. Entries whose
+        shape does not fit the target element are skipped with a warning.
+    """
+    import copy
+    translated_doc = copy.deepcopy(result_json)
+    applied_count = 0
+
+    # Treat a missing/None mapping as "nothing to apply"
+    translations = translations or {}
+
+    # `or []` also covers keys that are present but hold JSON null
+    for page in translated_doc.get('pages') or []:
+        for elem in page.get('elements') or []:
+            elem_id = elem.get('element_id', '')
+            elem_type = elem.get('type', '')
+
+            # An element without an id cannot be addressed by a translation
+            if not elem_id or elem_id not in translations:
+                continue
+
+            translation = translations[elem_id]
+
+            # Handle text elements (string translation)
+            if isinstance(translation, str):
+                if elem_type in TRANSLATABLE_TEXT_TYPES:
+                    elem['content'] = translation
+                    applied_count += 1
+                else:
+                    logger.warning(
+                        f"Translation for {elem_id} is string but element type is {elem_type}"
+                    )
+
+            # Handle table elements (cells translation)
+            elif isinstance(translation, dict) and 'cells' in translation:
+                if elem_type != TABLE_TYPE:
+                    logger.warning(
+                        f"Translation for {elem_id} is table but element type is {elem_type}"
+                    )
+                elif not isinstance(elem.get('content'), dict):
+                    # Right element type, but no cell structure to merge into
+                    logger.warning(
+                        f"Translation for {elem_id} is table but element content is not a dict"
+                    )
+                else:
+                    _apply_table_translation(elem, translation)
+                    applied_count += 1
+
+            # Anything else is neither a string nor a cells dict
+            else:
+                logger.warning(
+                    f"Unsupported translation format for {elem_id}: "
+                    f"{type(translation).__name__}"
+                )
+
+    logger.info(f"Applied {applied_count} translations to document")
+    return translated_doc
+
+
+def _apply_table_translation(
+    table_elem: Dict,
+    translation: Dict[str, Any]
+) -> None:
+    """
+    Apply translation to a table element's cells.
+
+    Cells are matched by their ``(row, col)`` pair and the matching original
+    cell's ``content`` is overwritten in place. A missing ``row`` or ``col``
+    is read as 0 on both sides. Translated cells that carry no ``content``
+    are ignored so that the original text is kept instead of being blanked.
+
+    Args:
+        table_elem: Table element dict with content.cells
+        translation: Translation dict with 'cells' list
+    """
+    content = table_elem.get('content')
+    if not isinstance(content, dict):
+        return
+
+    original_cells = content.get('cells') or []
+    if not original_cells:
+        return
+
+    # Build lookup for translated cells by (row, col)
+    translated_cells = {}
+    for cell in translation.get('cells') or []:
+        if not isinstance(cell, dict):
+            continue
+        text = cell.get('content')
+        if text is None:
+            # No translated text supplied: leave the original cell untouched
+            continue
+        translated_cells[(cell.get('row', 0), cell.get('col', 0))] = text
+
+    # Apply translations to matching cells
+    for cell in original_cells:
+        if not isinstance(cell, dict):
+            continue
+        key = (cell.get('row', 0), cell.get('col', 0))
+        if key in translated_cells:
+            cell['content'] = translated_cells[key]
+
+
+def load_translation_json(translation_path: Path) -> Optional[Dict]:
+    """
+    Read a translation JSON file from disk.
+
+    Args:
+        translation_path: Path to translation JSON file
+
+    Returns:
+        The parsed JSON, or None when the file does not exist or cannot be
+        read/parsed (the failure is logged in the latter case)
+    """
+    if translation_path.exists():
+        try:
+            with open(translation_path, 'r', encoding='utf-8') as handle:
+                return json.load(handle)
+        except Exception as exc:
+            # Unreadable or malformed file: report it and fall through to None
+            logger.error(f"Failed to load translation JSON: {exc}")
+
+    return None
+
+
+def find_translation_file(
+    result_dir: Path,
+    target_lang: str
+) -> Optional[Path]:
+    """
+    Find translation file for a given language in result directory.
+
+    Looks (non-recursively) for files named ``*_translated_{target_lang}.json``.
+    ``target_lang`` is matched literally: glob metacharacters in it are
+    escaped, and a value containing a path separator never matches.
+
+    Args:
+        result_dir: Directory containing result files
+        target_lang: Target language code (e.g., 'en', 'zh-TW')
+
+    Returns:
+        Path to translation file or None if not found. When several files
+        match, the one that sorts first is returned so the choice does not
+        depend on directory listing order.
+    """
+    import glob
+
+    # A separator would make the pattern span directories
+    if '/' in target_lang or '\\' in target_lang:
+        return None
+
+    # Look for *_translated_{lang}.json pattern; escape the language so that
+    # characters such as '*', '?' or '[' cannot widen the match.
+    pattern = f"*_translated_{glob.escape(target_lang)}.json"
+    return min(result_dir.glob(pattern), default=None)
+
+
+def list_available_translations(result_dir: Path) -> List[str]:
+    """
+    List all available translation languages for a result directory.
+
+    Scans ``result_dir`` (non-recursively) for files named
+    ``*_translated_{lang}.json`` and takes ``{lang}`` to be the text after the
+    last ``_translated_`` marker in the file stem.
+
+    Args:
+        result_dir: Directory containing result files
+
+    Returns:
+        Sorted list of distinct language codes with available translations;
+        empty when no file matches
+    """
+    marker = '_translated_'
+    # A set, because several result files may share the same language
+    languages = set()
+
+    for path in result_dir.glob(f"*{marker}*.json"):
+        # Extract language from filename: xxx_translated_{lang}.json
+        _, found, lang = path.stem.rpartition(marker)
+        if found and lang:
+            languages.add(lang)
+
+    return sorted(languages)
+
+
@dataclass
 class TranslationBatch:
    """A batch of items to translate together"""