Files
OCR/backend/tests/api/test_translate_pdf_api.py
egg a07aad96b3 feat: add translated PDF export with layout preservation
Adds the ability to download translated documents as PDF files while
preserving the original document layout. Key changes:

- Add apply_translations() function to merge translation JSON with UnifiedDocument
- Add generate_translated_pdf() method to PDFGeneratorService
- Add POST /api/v2/translate/{task_id}/pdf endpoint
- Add downloadTranslatedPdf() method and PDF button in frontend
- Add comprehensive unit tests (52 tests: merge, PDF generation, API endpoints)
- Archive add-translated-pdf-export proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 12:33:31 +08:00

728 lines
24 KiB
Python

"""
API integration tests for Translated PDF Download endpoint.
Tests the POST /api/v2/translate/{task_id}/pdf endpoint for downloading
translated PDFs with layout preservation.
Note: These tests use extensive mocking to avoid importing heavy dependencies
like PaddleOCR and PyTorch which aren't available in the test environment.
"""
import pytest
import json
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock
from datetime import datetime
# Mock heavy dependencies before importing app modules.
# Registering a MagicMock under each name in sys.modules makes any later
# `import paddleocr` / `paddlex` / `torch` / `modelscope` resolve to the stub
# instead of loading the real package.
sys.modules['paddleocr'] = MagicMock()
sys.modules['paddlex'] = MagicMock()
sys.modules['torch'] = MagicMock()
sys.modules['modelscope'] = MagicMock()
from fastapi.testclient import TestClient
from fastapi import FastAPI, Depends, HTTPException, status, Query
from fastapi.responses import FileResponse
from sqlalchemy import create_engine, Column, Integer, String, Boolean, Enum as SQLEnum
from sqlalchemy.orm import sessionmaker, declarative_base
import enum
# Create test models without importing from app.
# Base is the declarative base that the stand-in ORM models in this module
# inherit from; its metadata is what the db_session fixture creates and drops.
Base = declarative_base()
class TaskStatusEnum(enum.Enum):
    """Status values used for the MockTask.status column."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class MockUser(Base):
    """Stand-in user model mapped to the ``users`` table."""

    __tablename__ = "users"
    # Primary key; the endpoint compares MockTask.user_id against this value.
    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True)
    hashed_password = Column(String)
    is_active = Column(Boolean, default=True)
class MockTask(Base):
    """Stand-in task model mapped to the ``tasks`` table."""

    __tablename__ = "tasks"
    id = Column(Integer, primary_key=True, index=True)
    # Owner id; the endpoint filters on this together with task_id.
    user_id = Column(Integer)
    # Public identifier used in the request URL.
    task_id = Column(String, unique=True, index=True)
    filename = Column(String)
    status = Column(SQLEnum(TaskStatusEnum), default=TaskStatusEnum.PENDING)
    # Path to the OCR result JSON; translation files are looked up in the same
    # directory.
    result_json_path = Column(String, nullable=True)
    # Path to the uploaded source document; the endpoint forwards it to the PDF
    # generator only when the file exists on disk.
    file_path = Column(String, nullable=True)
# Create test database.
# The URL points at a file-backed SQLite database named test_translate_pdf.db,
# resolved relative to the current working directory.
# check_same_thread=False is handed to the SQLite driver via connect_args;
# presumably needed because the test client may run the app on another thread
# — confirm.
# NOTE(review): nothing in this module removes the database file itself; the
# db_session fixture only drops the tables. Confirm cleanup happens elsewhere.
SQLALCHEMY_DATABASE_URL = "sqlite:///./test_translate_pdf.db"
engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False})
TestingSessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def create_test_app():
    """Create a minimal FastAPI app for testing the translate PDF endpoint.

    The app registers a single route, ``POST /api/v2/translate/{task_id}/pdf``.
    The route reads its database session and acting user from
    ``app.state.db_session`` and ``app.state.current_user``; both must be set
    on the returned app before any request is sent.

    Returns:
        FastAPI: the configured application.
    """
    test_app = FastAPI()

    def _discard(path):
        """Remove ``path`` from disk; a file that is already gone is not an error."""
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    @test_app.post("/api/v2/translate/{task_id}/pdf")
    async def download_translated_pdf(
        task_id: str,
        lang: str = Query(..., description="Target language code"),
    ):
        """Mock implementation of the translated PDF endpoint.

        Raises:
            HTTPException: 404 when the task, its result file or the requested
                translation is missing; 400 when the translation file is
                unparsable or has no translations; 500 when PDF generation
                reports failure or raises.
        """
        import tempfile

        from fastapi import BackgroundTasks
        # Imported at call time so that a test's @patch of the module attribute
        # is the object the endpoint actually uses.
        from app.services.pdf_generator_service import pdf_generator_service

        # Get db_session and current_user from app state (set in test)
        db = test_app.state.db_session
        current_user = test_app.state.current_user

        # Find task; filtering on user_id makes foreign tasks look nonexistent.
        task = db.query(MockTask).filter(
            MockTask.task_id == task_id,
            MockTask.user_id == current_user.id
        ).first()
        if not task:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Task not found"
            )
        if not task.result_json_path:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="OCR result not found"
            )
        result_json_path = Path(task.result_json_path)
        if not result_json_path.exists():
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Result file not found"
            )

        # Find translation file: first "<base>_translated_<lang>.json", then the
        # fixed "edit_translated_<lang>.json" fallback.
        result_dir = result_json_path.parent
        base_name = result_json_path.stem.replace('_result', '').replace('edit_', '')
        translation_file = result_dir / f"{base_name}_translated_{lang}.json"
        if not translation_file.exists():
            translation_file = result_dir / f"edit_translated_{lang}.json"
        if not translation_file.exists():
            # List available translations
            available = [f.stem.split("_translated_")[-1]
                         for f in result_dir.glob("*_translated_*.json")]
            if available:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Translation for language '{lang}' not found. Available translations: {', '.join(available)}"
                )
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="No translations found for this task."
            )

        # Check translation content; only the parse itself is guarded.
        try:
            with open(translation_file, 'r', encoding='utf-8') as f:
                translation_data = json.load(f)
        except json.JSONDecodeError as err:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Invalid translation file format"
            ) from err
        if not translation_data.get('translations'):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Translation file is empty or incomplete"
            )

        # Generate PDF into a temp file that outlives the `with` (delete=False)
        # so the response can stream it.
        output_filename = f"{task_id}_translated_{lang}.pdf"
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
            output_path = Path(tmp_file.name)
        try:
            source_file_path = None
            if task.file_path and Path(task.file_path).exists():
                source_file_path = Path(task.file_path)
            success = pdf_generator_service.generate_translated_pdf(
                result_json_path=result_json_path,
                translation_json_path=translation_file,
                output_path=output_path,
                source_file_path=source_file_path
            )
            if not success:
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail="Failed to generate translated PDF"
                )
            # Delete the temp file once the response body has been sent;
            # without this every successful download leaks a file.
            cleanup = BackgroundTasks()
            cleanup.add_task(_discard, output_path)
            return FileResponse(
                path=str(output_path),
                filename=output_filename,
                media_type="application/pdf",
                headers={
                    "Content-Disposition": f'attachment; filename="{output_filename}"'
                },
                background=cleanup,
            )
        except HTTPException:
            _discard(output_path)
            raise
        except Exception as e:
            _discard(output_path)
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to generate translated PDF: {str(e)}"
            ) from e

    return test_app
@pytest.fixture(scope="function")
def db_session():
    """Yield a session on freshly created tables, then close it and drop them."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
    active_session = TestingSessionLocal()
    try:
        yield active_session
    finally:
        # Teardown runs even when the test body fails.
        active_session.close()
        metadata.drop_all(bind=engine)
@pytest.fixture
def test_user(db_session):
    """Persist and return the primary test user."""
    primary = MockUser(
        email="translate_test@example.com",
        hashed_password="test_hash",
        is_active=True,
    )
    db_session.add(primary)
    db_session.commit()
    # Reload so the generated primary key is populated on the instance.
    db_session.refresh(primary)
    return primary
@pytest.fixture
def test_app(db_session, test_user):
    """Build the test app and attach the session and acting user to its state."""
    application = create_test_app()
    app_state = application.state
    app_state.db_session = db_session
    app_state.current_user = test_user
    return application
@pytest.fixture
def client(test_app):
    """Return a TestClient bound to the fixture-provided app."""
    http_client = TestClient(test_app)
    return http_client
@pytest.fixture
def test_task_with_result(db_session, test_user, tmp_path):
    """Create test task with result JSON and translation file.

    Writes ``edit_result.json`` and ``edit_translated_zh-TW.json`` into
    ``tmp_path/results/<task_id>`` and stores a completed task owned by
    ``test_user`` that points at the result file.

    Returns:
        tuple: ``(task, result_dir)``.
    """
    from datetime import timezone

    task_id = "test-translate-pdf-123"
    result_dir = tmp_path / "results" / task_id
    result_dir.mkdir(parents=True)

    # Create result JSON
    result_json = {
        "document_info": {
            "total_pages": 1,
            "processing_track": "Direct"
        },
        "pages": [
            {
                "page_number": 1,
                "width": 612,
                "height": 792,
                "elements": [
                    {
                        "element_id": "text_1",
                        "type": "text",
                        "content": "Hello World",
                        "bounding_box": {"x": 72, "y": 72, "width": 200, "height": 20}
                    }
                ]
            }
        ]
    }
    result_json_path = result_dir / "edit_result.json"
    result_json_path.write_text(json.dumps(result_json), encoding='utf-8')

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so the rendered string keeps the same
    # "<iso>Z" shape as before.
    translated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    # Create translation file
    translation_json = {
        "task_id": task_id,
        "target_lang": "zh-TW",
        "translated_at": translated_at,
        "provider": "dify",
        "translations": {
            "text_1": "你好世界"
        },
        "statistics": {
            "total_elements": 1,
            "translated_elements": 1,
            "skipped_elements": 0,
            "total_characters": 11,
            "processing_time_seconds": 1.5,
            "total_tokens": 50
        }
    }
    translation_path = result_dir / "edit_translated_zh-TW.json"
    translation_path.write_text(json.dumps(translation_json), encoding='utf-8')

    # Create task. file_path names a file that this fixture does not create.
    task = MockTask(
        user_id=test_user.id,
        task_id=task_id,
        filename="test.pdf",
        status=TaskStatusEnum.COMPLETED,
        result_json_path=str(result_json_path),
        file_path=str(tmp_path / "test.pdf")
    )
    db_session.add(task)
    db_session.commit()
    db_session.refresh(task)
    return task, result_dir
@pytest.fixture
def test_task_no_result(db_session, test_user):
    """Persist a completed task whose result_json_path is unset."""
    task_fields = {
        "user_id": test_user.id,
        "task_id": "test-no-result-456",
        "filename": "test.pdf",
        "status": TaskStatusEnum.COMPLETED,
        "result_json_path": None,
    }
    record = MockTask(**task_fields)
    db_session.add(record)
    db_session.commit()
    db_session.refresh(record)
    return record
@pytest.fixture
def test_task_no_translation(db_session, test_user, tmp_path):
    """Persist a task whose result directory holds a result JSON and nothing else."""
    identifier = "test-no-translation-789"
    workdir = tmp_path / "results" / identifier
    workdir.mkdir(parents=True)

    # Only the OCR result is written; no translation file is created.
    single_empty_page = {"page_number": 1, "width": 612, "height": 792, "elements": []}
    payload = {
        "document_info": {"total_pages": 1, "processing_track": "Direct"},
        "pages": [single_empty_page],
    }
    result_file = workdir / "edit_result.json"
    result_file.write_text(json.dumps(payload), encoding='utf-8')

    record = MockTask(
        user_id=test_user.id,
        task_id=identifier,
        filename="test.pdf",
        status=TaskStatusEnum.COMPLETED,
        result_json_path=str(result_file),
    )
    db_session.add(record)
    db_session.commit()
    db_session.refresh(record)
    return record
@pytest.fixture
def test_task_empty_translation(db_session, test_user, tmp_path):
    """Persist a task whose "ja" translation file has an empty translations map."""
    identifier = "test-empty-translation-101"
    workdir = tmp_path / "results" / identifier
    workdir.mkdir(parents=True)

    def dump(name, payload):
        """Serialize ``payload`` as JSON into ``workdir/name`` and return the path."""
        target = workdir / name
        target.write_text(json.dumps(payload), encoding='utf-8')
        return target

    # OCR result with a single page and no elements.
    result_file = dump("edit_result.json", {
        "document_info": {"total_pages": 1, "processing_track": "Direct"},
        "pages": [{"page_number": 1, "width": 612, "height": 792, "elements": []}],
    })
    # Translation file exists but carries no translated entries.
    dump("edit_translated_ja.json", {
        "task_id": identifier,
        "target_lang": "ja",
        "translations": {},
    })

    record = MockTask(
        user_id=test_user.id,
        task_id=identifier,
        filename="test.pdf",
        status=TaskStatusEnum.COMPLETED,
        result_json_path=str(result_file),
    )
    db_session.add(record)
    db_session.commit()
    db_session.refresh(record)
    return record
@pytest.fixture
def other_user(db_session):
    """Persist and return a second user, used to exercise ownership checks."""
    account = {
        "email": "other_user@example.com",
        "hashed_password": "other_hash",
        "is_active": True,
    }
    stranger = MockUser(**account)
    db_session.add(stranger)
    db_session.commit()
    db_session.refresh(stranger)
    return stranger
class TestTranslatedPDFDownload:
    """Tests for POST /api/v2/translate/{task_id}/pdf endpoint.

    Error-path assertions compare against the exact ``detail`` strings raised
    by the endpoint defined in this module, so each test fails if the request
    is rejected for a different reason than the one under test.
    """

    @patch('app.services.pdf_generator_service.pdf_generator_service')
    def test_download_translated_pdf_success(
        self, mock_pdf_service, client, db_session, test_user, test_task_with_result, tmp_path
    ):
        """Test successful translated PDF download"""
        task, result_dir = test_task_with_result
        pdf_bytes = b"%PDF-1.4 mock pdf content"

        def mock_generate(result_json_path, translation_json_path, output_path, source_file_path=None):
            # Stand in for the real generator: write a PDF to the output path.
            output_path.write_bytes(pdf_bytes)
            return True

        mock_pdf_service.generate_translated_pdf.side_effect = mock_generate

        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf?lang=zh-TW"
        )

        assert response.status_code == 200
        assert response.headers["content-type"] == "application/pdf"
        disposition = response.headers.get("content-disposition", "")
        assert "attachment" in disposition
        assert f"{task.task_id}_translated_zh-TW.pdf" in disposition
        # The body must be what the generator wrote.
        assert response.content == pdf_bytes

        # Verify PDF service was called once, with the expected input files.
        mock_pdf_service.generate_translated_pdf.assert_called_once()
        _, call_kwargs = mock_pdf_service.generate_translated_pdf.call_args
        assert call_kwargs["result_json_path"] == Path(task.result_json_path)
        assert call_kwargs["translation_json_path"] == result_dir / "edit_translated_zh-TW.json"

    def test_download_pdf_task_not_found(self, client, db_session, test_user):
        """Test 404 when task doesn't exist"""
        response = client.post(
            "/api/v2/translate/nonexistent-task-id/pdf?lang=zh-TW"
        )
        assert response.status_code == 404
        assert response.json()["detail"] == "Task not found"

    def test_download_pdf_no_result_json(self, client, db_session, test_user, test_task_no_result):
        """Test 404 when task has no result JSON"""
        response = client.post(
            f"/api/v2/translate/{test_task_no_result.task_id}/pdf?lang=zh-TW"
        )
        assert response.status_code == 404
        assert response.json()["detail"] == "OCR result not found"

    def test_download_pdf_translation_not_found(
        self, client, db_session, test_user, test_task_no_translation
    ):
        """Test 404 when translation for requested language doesn't exist"""
        response = client.post(
            f"/api/v2/translate/{test_task_no_translation.task_id}/pdf?lang=ko"
        )
        assert response.status_code == 404
        # The fixture directory holds no translation files at all, so the
        # endpoint must report that none exist (not "Task not found" etc.).
        assert response.json()["detail"] == "No translations found for this task."

    def test_download_pdf_empty_translation(
        self, client, db_session, test_user, test_task_empty_translation
    ):
        """Test 400 when translation file is empty"""
        response = client.post(
            f"/api/v2/translate/{test_task_empty_translation.task_id}/pdf?lang=ja"
        )
        assert response.status_code == 400
        assert response.json()["detail"] == "Translation file is empty or incomplete"

    def test_download_pdf_missing_lang_param(
        self, client, db_session, test_user, test_task_with_result
    ):
        """Test 422 when lang query parameter is missing"""
        task, _ = test_task_with_result
        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf"
        )
        # FastAPI returns 422 for missing required query params
        assert response.status_code == 422

    def test_download_pdf_wrong_user(
        self, db_session, other_user, test_task_with_result, tmp_path
    ):
        """Test 404 when task belongs to different user"""
        task, _ = test_task_with_result

        # Create new app acting as other_user against the same database.
        app = create_test_app()
        app.state.db_session = db_session
        app.state.current_user = other_user
        client = TestClient(app)

        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf?lang=zh-TW"
        )
        # The task lookup filters on the acting user's id, so a task owned by
        # someone else is reported as missing.
        assert response.status_code == 404
        assert response.json()["detail"] == "Task not found"

    @patch('app.services.pdf_generator_service.pdf_generator_service')
    def test_download_pdf_generation_failure(
        self, mock_pdf_service, client, db_session, test_user, test_task_with_result
    ):
        """Test 500 when PDF generation fails"""
        task, _ = test_task_with_result
        # Mock PDF generation failure
        mock_pdf_service.generate_translated_pdf.return_value = False

        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf?lang=zh-TW"
        )
        assert response.status_code == 500
        assert response.json()["detail"] == "Failed to generate translated PDF"

    @patch('app.services.pdf_generator_service.pdf_generator_service')
    def test_download_pdf_exception_handling(
        self, mock_pdf_service, client, db_session, test_user, test_task_with_result
    ):
        """Test 500 when PDF generation raises exception"""
        task, _ = test_task_with_result
        # Mock PDF generation exception
        mock_pdf_service.generate_translated_pdf.side_effect = Exception("Unexpected error")

        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf?lang=zh-TW"
        )
        assert response.status_code == 500
        # The original exception text is appended to the generic message.
        assert response.json()["detail"] == "Failed to generate translated PDF: Unexpected error"
class TestTranslatedPDFWithMultipleLanguages:
    """Tests for multiple translation languages"""

    @pytest.fixture
    def task_with_multiple_translations(self, db_session, test_user, tmp_path):
        """Create task with translations in multiple languages.

        Writes one result JSON plus ``edit_translated_<lang>.json`` for zh-TW,
        ja and ko, and returns ``(task, result_dir)``.
        """
        from datetime import timezone

        task_id = "test-multi-lang-222"
        result_dir = tmp_path / "results" / task_id
        result_dir.mkdir(parents=True)

        # Create result JSON
        result_json = {
            "document_info": {"total_pages": 1, "processing_track": "Direct"},
            "pages": [{
                "page_number": 1,
                "width": 612, "height": 792,
                "elements": [
                    {"element_id": "text_1", "type": "text", "content": "Hello",
                     "bounding_box": {"x": 72, "y": 72, "width": 100, "height": 20}}
                ]
            }]
        }
        result_json_path = result_dir / "edit_result.json"
        result_json_path.write_text(json.dumps(result_json), encoding='utf-8')

        # Create translations for multiple languages
        for lang, translation in [("zh-TW", "你好"), ("ja", "こんにちは"), ("ko", "안녕하세요")]:
            # datetime.utcnow() is deprecated since Python 3.12; tzinfo is
            # dropped so the string keeps the same "<iso>Z" shape as before.
            translated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
            translation_json = {
                "task_id": task_id,
                "target_lang": lang,
                "translated_at": translated_at,
                "translations": {"text_1": translation},
                "statistics": {"translated_elements": 1}
            }
            (result_dir / f"edit_translated_{lang}.json").write_text(
                json.dumps(translation_json), encoding='utf-8'
            )

        task = MockTask(
            user_id=test_user.id,
            task_id=task_id,
            filename="test.pdf",
            status=TaskStatusEnum.COMPLETED,
            result_json_path=str(result_json_path)
        )
        db_session.add(task)
        db_session.commit()
        db_session.refresh(task)
        return task, result_dir

    @patch('app.services.pdf_generator_service.pdf_generator_service')
    def test_download_different_languages(
        self, mock_pdf_service, client, db_session, test_user,
        task_with_multiple_translations, tmp_path
    ):
        """Test downloading PDFs for different languages"""
        task, result_dir = task_with_multiple_translations
        mock_pdf_path = tmp_path / "output.pdf"
        mock_pdf_path.write_bytes(b"%PDF-1.4 mock")

        def mock_generate(result_json_path, translation_json_path, output_path, source_file_path=None):
            output_path.write_bytes(mock_pdf_path.read_bytes())
            return True

        mock_pdf_service.generate_translated_pdf.side_effect = mock_generate

        for lang in ["zh-TW", "ja", "ko"]:
            response = client.post(
                f"/api/v2/translate/{task.task_id}/pdf?lang={lang}"
            )
            assert response.status_code == 200, f"Failed for language {lang}"
            assert response.headers["content-type"] == "application/pdf"

        # Verify PDF service was called 3 times
        assert mock_pdf_service.generate_translated_pdf.call_count == 3

    def test_download_nonexistent_language(
        self, client, db_session, test_user, task_with_multiple_translations
    ):
        """Test 404 for language that doesn't exist"""
        task, _ = task_with_multiple_translations
        response = client.post(
            f"/api/v2/translate/{task.task_id}/pdf?lang=de"
        )
        assert response.status_code == 404
        detail = response.json()["detail"]

        # The message must name the missing language and list every language
        # that does exist. Listing order follows the directory glob, so the
        # codes are compared as a set.
        marker = "Available translations: "
        assert "'de'" in detail
        assert marker in detail
        listed = set(detail.split(marker, 1)[1].split(", "))
        assert listed == {"zh-TW", "ja", "ko"}
class TestInvalidTranslationFile:
    """Tests covering a translation file that cannot be parsed as JSON."""

    @pytest.fixture
    def task_with_invalid_json(self, db_session, test_user, tmp_path):
        """Persist a task whose "en" translation file contains malformed JSON."""
        identifier = "test-invalid-json-333"
        workdir = tmp_path / "results" / identifier
        workdir.mkdir(parents=True)

        # A well-formed OCR result sits next to the broken translation file.
        ocr_payload = {
            "document_info": {"total_pages": 1, "processing_track": "Direct"},
            "pages": [{"page_number": 1, "width": 612, "height": 792, "elements": []}],
        }
        result_file = workdir / "edit_result.json"
        result_file.write_text(json.dumps(ocr_payload), encoding='utf-8')

        broken_file = workdir / "edit_translated_en.json"
        broken_file.write_text("{ invalid json }", encoding='utf-8')

        record = MockTask(
            user_id=test_user.id,
            task_id=identifier,
            filename="test.pdf",
            status=TaskStatusEnum.COMPLETED,
            result_json_path=str(result_file),
        )
        db_session.add(record)
        db_session.commit()
        db_session.refresh(record)
        return record

    def test_download_pdf_invalid_json(
        self, client, db_session, test_user, task_with_invalid_json
    ):
        """Expect a 400 when the translation file is not valid JSON."""
        url = f"/api/v2/translate/{task_with_invalid_json.task_id}/pdf?lang=en"
        response = client.post(url)
        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "Invalid" in detail or "format" in detail.lower()
class TestResultFileNotFound:
    """Tests for missing result file scenario"""

    @pytest.fixture
    def task_with_missing_file(self, db_session, test_user, tmp_path):
        """Create task pointing to non-existent result file"""
        task_id = "test-missing-file-444"
        result_dir = tmp_path / "results" / task_id
        result_dir.mkdir(parents=True)

        # Point to non-existent file: the directory exists, the file does not.
        result_json_path = result_dir / "nonexistent_result.json"

        task = MockTask(
            user_id=test_user.id,
            task_id=task_id,
            filename="test.pdf",
            status=TaskStatusEnum.COMPLETED,
            result_json_path=str(result_json_path)
        )
        db_session.add(task)
        db_session.commit()
        db_session.refresh(task)
        return task

    def test_download_pdf_result_file_missing(
        self, client, db_session, test_user, task_with_missing_file
    ):
        """Test 404 when result file doesn't exist on disk"""
        response = client.post(
            f"/api/v2/translate/{task_with_missing_file.task_id}/pdf?lang=zh-TW"
        )
        assert response.status_code == 404
        # Match the exact message: a bare "not found" check would also accept
        # "Task not found" or "OCR result not found".
        assert response.json()["detail"] == "Result file not found"
if __name__ == '__main__':
    # Propagate pytest's exit code so running this file directly reports
    # failures to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, '-v']))