Files
OCR/backend/tests/services/test_translation_merge.py
egg a07aad96b3 feat: add translated PDF export with layout preservation
Adds the ability to download translated documents as PDF files while
preserving the original document layout. Key changes:

- Add apply_translations() function to merge translation JSON with UnifiedDocument
- Add generate_translated_pdf() method to PDFGeneratorService
- Add POST /api/v2/translate/{task_id}/pdf endpoint
- Add downloadTranslatedPdf() method and PDF button in frontend
- Add comprehensive unit tests (52 tests: merge, PDF generation, API endpoints)
- Archive add-translated-pdf-export proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 12:33:31 +08:00

524 lines
18 KiB
Python

"""
Unit tests for translation merging functionality.
Tests the apply_translations() function and related utilities
for merging translation data with UnifiedDocument structure.
"""
import pytest
import json
import tempfile
from pathlib import Path
from app.services.translation_service import (
apply_translations,
_apply_table_translation,
load_translation_json,
find_translation_file,
list_available_translations,
TRANSLATABLE_TEXT_TYPES,
TABLE_TYPE,
)
class TestApplyTranslations:
    """Exercise apply_translations() on text-like elements of a document dict."""

    @staticmethod
    def _one_page(elements):
        """Wrap *elements* in a document made of a single page numbered 1."""
        return {"pages": [{"page_number": 1, "elements": elements}]}

    def test_apply_text_translation(self):
        """A lone text element takes its translation; the input stays as it was."""
        source_doc = self._one_page([
            {
                "element_id": "text_1",
                "type": "text",
                "content": "Hello World",
                "bounding_box": {"x": 0, "y": 0, "width": 100, "height": 20},
            },
        ])

        translated_doc = apply_translations(source_doc, {"text_1": "你好世界"})

        assert translated_doc["pages"][0]["elements"][0]["content"] == "你好世界"
        # The document passed in must not have been touched.
        assert source_doc["pages"][0]["elements"][0]["content"] == "Hello World"

    def test_apply_multiple_translations(self):
        """Title, text and header elements on one page are each translated."""
        source_doc = self._one_page([
            {"element_id": "title_1", "type": "title", "content": "Title"},
            {"element_id": "text_1", "type": "text", "content": "Body text"},
            {"element_id": "header_1", "type": "header", "content": "Header"},
        ])
        mapping = {
            "title_1": "標題",
            "text_1": "正文",
            "header_1": "頁首",
        }

        page_elements = apply_translations(source_doc, mapping)["pages"][0]["elements"]

        assert page_elements[0]["content"] == "標題"
        assert page_elements[1]["content"] == "正文"
        assert page_elements[2]["content"] == "頁首"

    def test_preserve_non_translated_elements(self):
        """An element whose id is absent from the mapping keeps its content."""
        source_doc = self._one_page([
            {"element_id": "text_1", "type": "text", "content": "Translate me"},
            {"element_id": "text_2", "type": "text", "content": "Keep me"},
        ])

        page_elements = apply_translations(
            source_doc, {"text_1": "翻譯我"}
        )["pages"][0]["elements"]

        assert page_elements[0]["content"] == "翻譯我"
        assert page_elements[1]["content"] == "Keep me"

    def test_preserve_element_properties(self):
        """bounding_box and style_info survive translation of the content."""
        source_doc = self._one_page([
            {
                "element_id": "text_1",
                "type": "text",
                "content": "Original",
                "bounding_box": {"x": 10, "y": 20, "width": 100, "height": 30},
                "style_info": {"font_size": 12, "font_name": "Arial"},
            },
        ])

        translated_doc = apply_translations(source_doc, {"text_1": "Translated"})
        element = translated_doc["pages"][0]["elements"][0]

        assert element["content"] == "Translated"
        assert element["bounding_box"] == {"x": 10, "y": 20, "width": 100, "height": 30}
        assert element["style_info"] == {"font_size": 12, "font_name": "Arial"}

    def test_multi_page_document(self):
        """Elements on different pages are both translated."""
        source_doc = {
            "pages": [
                {
                    "page_number": 1,
                    "elements": [{"element_id": "p1_text", "type": "text", "content": "Page 1"}],
                },
                {
                    "page_number": 2,
                    "elements": [{"element_id": "p2_text", "type": "text", "content": "Page 2"}],
                },
            ]
        }
        mapping = {
            "p1_text": "第一頁",
            "p2_text": "第二頁",
        }

        translated_pages = apply_translations(source_doc, mapping)["pages"]

        assert translated_pages[0]["elements"][0]["content"] == "第一頁"
        assert translated_pages[1]["elements"][0]["content"] == "第二頁"

    def test_all_translatable_types(self):
        """Every type listed in TRANSLATABLE_TEXT_TYPES gets its translation applied."""
        # One (element_id, type) pair per translatable type, in iteration order.
        id_and_type = [
            (f"{kind}_{position}", kind)
            for position, kind in enumerate(TRANSLATABLE_TEXT_TYPES)
        ]
        elements = [
            {"element_id": ident, "type": kind, "content": f"Original {kind}"}
            for ident, kind in id_and_type
        ]
        translations = {ident: f"Translated {kind}" for ident, kind in id_and_type}

        source_doc = {"pages": [{"page_number": 1, "elements": elements}]}
        translated_doc = apply_translations(source_doc, translations)

        for position, kind in enumerate(TRANSLATABLE_TEXT_TYPES):
            translated = translated_doc["pages"][0]["elements"][position]["content"]
            assert translated == f"Translated {kind}"

    def test_skip_non_translatable_types(self):
        """image and chart elements keep their content even if the mapping names them."""
        source_doc = self._one_page([
            {"element_id": "img_1", "type": "image", "content": "image.png"},
            {"element_id": "chart_1", "type": "chart", "content": "chart data"},
        ])
        # Entries exist for both ids, yet neither element may be altered.
        mapping = {
            "img_1": "Should not apply",
            "chart_1": "Should not apply",
        }

        page_elements = apply_translations(source_doc, mapping)["pages"][0]["elements"]

        assert page_elements[0]["content"] == "image.png"
        assert page_elements[1]["content"] == "chart data"

    def test_empty_translations(self):
        """An empty mapping leaves the element content as it was."""
        source_doc = self._one_page(
            [{"element_id": "text_1", "type": "text", "content": "Original"}]
        )

        translated_doc = apply_translations(source_doc, {})

        assert translated_doc["pages"][0]["elements"][0]["content"] == "Original"

    def test_empty_document(self):
        """A document without pages comes back with an empty page list."""
        pageless_doc = {"pages": []}

        translated_doc = apply_translations(pageless_doc, {"text_1": "Translation"})

        assert translated_doc["pages"] == []
class TestApplyTableTranslation:
    """Tests for _apply_table_translation(), directly and via apply_translations()."""

    @staticmethod
    def _cell(row, col, content):
        """Build one table cell dict with the keys used throughout these tests."""
        return {"row": row, "col": col, "content": content}

    @classmethod
    def _table(cls, cells):
        """Build a table element with id ``table_1`` holding *cells*."""
        return {
            "element_id": "table_1",
            "type": "table",
            "content": {"cells": cells},
        }

    def test_apply_table_cell_translation(self):
        """Every cell of a 2x2 table is replaced when all cells are translated."""
        table_elem = self._table([
            self._cell(0, 0, "Header 1"),
            self._cell(0, 1, "Header 2"),
            self._cell(1, 0, "Data 1"),
            self._cell(1, 1, "Data 2"),
        ])
        translation = {
            "cells": [
                self._cell(0, 0, "表頭 1"),
                self._cell(0, 1, "表頭 2"),
                self._cell(1, 0, "資料 1"),
                self._cell(1, 1, "資料 2"),
            ]
        }

        # The helper is checked through its effect on table_elem (in-place update).
        _apply_table_translation(table_elem, translation)

        cells = table_elem["content"]["cells"]
        assert cells[0]["content"] == "表頭 1"
        assert cells[1]["content"] == "表頭 2"
        assert cells[2]["content"] == "資料 1"
        assert cells[3]["content"] == "資料 2"

    def test_partial_table_translation(self):
        """Only the cells named in the translation change; the rest are kept."""
        table_elem = self._table([
            self._cell(0, 0, "A"),
            self._cell(0, 1, "B"),
            self._cell(1, 0, "C"),
            self._cell(1, 1, "D"),
        ])
        # Translate just two of the four cells. The values are deliberately
        # non-empty and distinct, so the test does not hinge on how an empty
        # translation string would be treated.
        translation = {
            "cells": [
                self._cell(0, 0, "甲"),
                self._cell(1, 1, "丁"),
            ]
        }

        _apply_table_translation(table_elem, translation)

        cells = table_elem["content"]["cells"]
        assert cells[0]["content"] == "甲"  # Translated
        assert cells[1]["content"] == "B"  # Original
        assert cells[2]["content"] == "C"  # Original
        assert cells[3]["content"] == "丁"  # Translated

    def test_table_with_empty_cells(self):
        """A table with no cells stays empty and the call does not raise."""
        table_elem = self._table([])
        translation = {"cells": [self._cell(0, 0, "New")]}

        # Should not raise error
        _apply_table_translation(table_elem, translation)

        assert table_elem["content"]["cells"] == []

    def test_table_translation_via_apply_translations(self):
        """A table entry in the mapping is applied by apply_translations()."""
        result_json = {
            "pages": [
                {
                    "page_number": 1,
                    "elements": [
                        self._table([
                            self._cell(0, 0, "Name"),
                            self._cell(0, 1, "Value"),
                        ])
                    ],
                }
            ]
        }
        translations = {
            "table_1": {
                "cells": [
                    self._cell(0, 0, "名稱"),
                    self._cell(0, 1, "數值"),
                ]
            }
        }

        result = apply_translations(result_json, translations)

        cells = result["pages"][0]["elements"][0]["content"]["cells"]
        assert cells[0]["content"] == "名稱"
        assert cells[1]["content"] == "數值"
class TestTranslationFileUtilities:
    """Exercise the helpers that load, locate and enumerate translation files."""

    @staticmethod
    def _write_empty_json(directory, *filenames):
        """Create each named file in *directory*, containing an empty JSON object."""
        for filename in filenames:
            (directory / filename).write_text("{}", encoding='utf-8')

    def test_load_translation_json(self, tmp_path):
        """A written translation file is read back with its keys intact."""
        payload = {
            "translations": {"text_1": "Translation"},
            "target_lang": "zh-TW",
        }
        json_path = tmp_path / "test_translated_zh-TW.json"
        json_path.write_text(json.dumps(payload), encoding='utf-8')

        loaded = load_translation_json(json_path)

        assert loaded is not None
        assert loaded["translations"]["text_1"] == "Translation"
        assert loaded["target_lang"] == "zh-TW"

    def test_load_translation_json_not_found(self, tmp_path):
        """Loading a path that does not exist yields None."""
        missing_path = tmp_path / "does_not_exist.json"

        assert load_translation_json(missing_path) is None

    def test_find_translation_file(self, tmp_path):
        """The file whose name carries the requested language is returned."""
        self._write_empty_json(
            tmp_path,
            "doc_translated_en.json",
            "doc_translated_zh-TW.json",
        )

        found = find_translation_file(tmp_path, "zh-TW")

        assert found is not None
        assert found.name == "doc_translated_zh-TW.json"

    def test_find_translation_file_not_found(self, tmp_path):
        """Asking for a language with no matching file yields None."""
        self._write_empty_json(tmp_path, "doc_translated_en.json")

        assert find_translation_file(tmp_path, "ja") is None

    def test_list_available_translations(self, tmp_path):
        """Languages of the translation files are listed; an unrelated JSON file is not."""
        self._write_empty_json(
            tmp_path,
            "doc_translated_en.json",
            "doc_translated_zh-TW.json",
            "doc_translated_ja.json",
            "other_file.json",
        )

        languages = list_available_translations(tmp_path)

        # Order is not asserted, only membership.
        assert set(languages) == {"en", "zh-TW", "ja"}

    def test_list_available_translations_empty(self, tmp_path):
        """A directory holding no translation files gives an empty list."""
        self._write_empty_json(tmp_path, "result.json")

        assert list_available_translations(tmp_path) == []
class TestDeepCopyBehavior:
    """Tests that apply_translations() returns a translated copy and leaves its input alone."""

    def test_original_not_modified(self):
        """A translated text element appears in the result but not in the input."""
        original = {
            "pages": [
                {
                    "page_number": 1,
                    "elements": [
                        {"element_id": "text_1", "type": "text", "content": "Original"}
                    ],
                }
            ]
        }
        original_content = original["pages"][0]["elements"][0]["content"]
        translations = {"text_1": "Modified"}

        result = apply_translations(original, translations)

        # Original should be unchanged
        assert original["pages"][0]["elements"][0]["content"] == original_content
        # Result should have translation
        assert result["pages"][0]["elements"][0]["content"] == "Modified"

    def test_nested_objects_are_copied(self):
        """A translated table cell appears in the result but not in the input."""
        original = {
            "pages": [
                {
                    "page_number": 1,
                    "elements": [
                        {
                            "element_id": "table_1",
                            "type": "table",
                            "content": {
                                "cells": [
                                    {"row": 0, "col": 0, "content": "Original"}
                                ]
                            },
                        }
                    ],
                }
            ]
        }
        original_cell = original["pages"][0]["elements"][0]["content"]["cells"][0]
        original_cell_content = original_cell["content"]
        translations = {
            "table_1": {"cells": [{"row": 0, "col": 0, "content": "Modified"}]}
        }

        result = apply_translations(original, translations)
        result_cell = result["pages"][0]["elements"][0]["content"]["cells"][0]

        # Original nested content should be unchanged
        assert original_cell["content"] == original_cell_content
        # Without checking the result, an implementation that returns its input
        # untouched would pass this test; the translation must be present.
        assert result_cell["content"] == "Modified"
        # The translated cell must be a distinct object from the input cell.
        assert result_cell is not original_cell
class TestEdgeCases:
    """Exercise apply_translations() on malformed elements and non-Latin text."""

    @staticmethod
    def _one_page(elements):
        """Wrap *elements* in a document made of a single page numbered 1."""
        return {"pages": [{"page_number": 1, "elements": elements}]}

    def test_missing_element_id(self):
        """An element lacking element_id is left unchanged and nothing raises."""
        source_doc = self._one_page([
            {"type": "text", "content": "No ID element"},
        ])

        # Must complete without raising.
        translated_doc = apply_translations(source_doc, {"text_1": "Translation"})

        assert translated_doc["pages"][0]["elements"][0]["content"] == "No ID element"

    def test_missing_type(self):
        """An element lacking type is not translated even when its id is mapped."""
        source_doc = self._one_page([
            {"element_id": "elem_1", "content": "No type"},
        ])

        # Must complete without raising; with no type the translation is not applied.
        translated_doc = apply_translations(source_doc, {"elem_1": "Translation"})

        # Content is expected to stay as-is because there is no type to match.
        assert translated_doc["pages"][0]["elements"][0]["content"] == "No type"

    def test_unicode_translations(self):
        """Japanese, Korean and Arabic translations are stored verbatim."""
        source_doc = self._one_page([
            {"element_id": "text_1", "type": "text", "content": "English"},
            {"element_id": "text_2", "type": "text", "content": "More text"},
            {"element_id": "text_3", "type": "text", "content": "Another"},
        ])
        mapping = {
            "text_1": "日本語テキスト",  # Japanese
            "text_2": "한국어 텍스트",  # Korean
            "text_3": "العربية",  # Arabic
        }

        page_elements = apply_translations(source_doc, mapping)["pages"][0]["elements"]

        assert page_elements[0]["content"] == "日本語テキスト"
        assert page_elements[1]["content"] == "한국어 텍스트"
        assert page_elements[2]["content"] == "العربية"