feat: add multilingual font support for translated PDFs

- Add NotoSansKR and NotoSansThai fonts for Korean and Thai language support
- Update download_fonts.sh to download all required fonts
- Add LANGUAGE_FONT_MAP for language-to-font mapping in pdf_generator_service.py
- Add get_font_for_language() method to select appropriate font based on target language
- Update _get_reflow_styles() to accept target_lang parameter
- Pass target_lang through generate_translated_pdf() to PDF generation methods
- Fix garbled characters (亂碼) issue for Korean and Thai translations

Supported languages:
- Chinese (zh-CN/zh-TW), Japanese (ja): NotoSansSC
- Korean (ko): NotoSansKR
- Thai (th): NotoSansThai
- Russian, Vietnamese, Latin languages: NotoSansSC

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-12 19:18:58 +08:00
parent efa7e4175c
commit 3876477bda
4 changed files with 133 additions and 49 deletions

View File

@@ -145,40 +145,83 @@ class PDFGeneratorService:
STYLE_FLAG_UNDERLINE = 4
STYLE_FLAG_STRIKETHROUGH = 8
# Language to font mapping for proper character support.
# Keys are target-language codes; the special 'default' key is the fallback
# entry that get_font_for_language() uses for any language without a
# dedicated font. Values are font names and must match the names registered
# in _register_multilingual_fonts(); an unregistered name makes
# get_font_for_language() fall back to the default font.
LANGUAGE_FONT_MAP = {
# Korean requires separate font
'ko': 'NotoSansKR',
# Thai requires separate font
'th': 'NotoSansThai',
# All other languages use NotoSansSC (covers CJK, Latin, Cyrillic, etc.)
'default': 'NotoSansSC',
}
def __init__(self):
"""Initialize PDF generator with font configuration"""
self.font_name = 'NotoSansSC'
self.font_name = 'NotoSansSC' # Default font
self.font_path = None
self.font_registered = False
self.current_processing_track = None # Track type for current document
self.registered_fonts = {} # Track which fonts are registered
self._register_chinese_font()
self._register_multilingual_fonts()
def _register_chinese_font(self):
"""Register Chinese font for PDF generation"""
try:
# Get font path from settings
font_path = Path(settings.chinese_font_path)
def _register_multilingual_fonts(self):
"""Register fonts for all supported languages"""
project_root = Path(__file__).resolve().parent.parent.parent.parent
fonts_dir = project_root / 'backend' / 'fonts'
# Try relative path from project root
if not font_path.is_absolute():
# Adjust path - settings.chinese_font_path starts with ./backend/
project_root = Path(__file__).resolve().parent.parent.parent.parent
font_path = project_root / font_path
# Font files to register
font_files = {
'NotoSansSC': 'NotoSansSC-Regular.ttf',
'NotoSansKR': 'NotoSansKR-Regular.ttf',
'NotoSansThai': 'NotoSansThai-Regular.ttf',
}
if not font_path.exists():
logger.error(f"Chinese font not found at {font_path}")
return
for font_name, font_file in font_files.items():
font_path = fonts_dir / font_file
if font_path.exists():
try:
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
self.registered_fonts[font_name] = font_path
logger.info(f"Font registered: {font_name} from {font_path}")
except Exception as e:
logger.warning(f"Failed to register font {font_name}: {e}")
else:
logger.warning(f"Font file not found: {font_path}")
# Register font
pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
self.font_path = font_path
# Set default font
if 'NotoSansSC' in self.registered_fonts:
self.font_name = 'NotoSansSC'
self.font_path = self.registered_fonts['NotoSansSC']
self.font_registered = True
logger.info(f"Chinese font registered: {self.font_name} from {font_path}")
except Exception as e:
logger.error(f"Failed to register Chinese font: {e}")
else:
self.font_registered = False
logger.error("Default font NotoSansSC not available")
def get_font_for_language(self, target_lang: str) -> str:
    """
    Get the appropriate font name for a target language.

    The lookup tolerates common variations of a language code: surrounding
    whitespace, letter case, ``_`` used as a separator, and region/script
    subtags. ``'ko-KR'``, ``'KO'`` and ``'ko_KR'`` therefore all resolve to
    the entry stored under ``'ko'`` instead of silently falling through to
    the default font (which would render the text as garbled characters).

    Args:
        target_lang: Language code (e.g., 'ko', 'th', 'ja', 'zh-TW').
            Empty, None or non-string values select the default font.

    Returns:
        Font name to use for this language: the mapped font if it is
        registered, otherwise the default font if that is registered,
        otherwise 'Helvetica'.
    """
    default_font = self.LANGUAGE_FONT_MAP['default']

    # Candidate keys, most specific first: the code exactly as given, its
    # normalized form (e.g. 'zh_TW' -> 'zh-tw'), then the primary subtag
    # alone (e.g. 'ko-KR' -> 'ko').
    candidates = []
    if isinstance(target_lang, str) and target_lang.strip():
        normalized = target_lang.strip().lower().replace('_', '-')
        candidates = [target_lang, normalized, normalized.split('-', 1)[0]]

    font_name = default_font
    for code in candidates:
        if code in self.LANGUAGE_FONT_MAP:
            font_name = self.LANGUAGE_FONT_MAP[code]
            break

    # Verify font is registered, fall back to default if not
    if font_name not in self.registered_fonts:
        logger.warning(f"Font {font_name} for language {target_lang} not registered, using default")
        font_name = default_font

    # Final fallback to Helvetica if no fonts available
    if font_name not in self.registered_fonts:
        logger.warning(f"No suitable font found for language {target_lang}, using Helvetica")
        return 'Helvetica'

    return font_name
def _detect_content_orientation(
self,
@@ -4462,10 +4505,20 @@ class PDFGeneratorService:
# Direct track: elements already in reading order from PyMuPDF
return elements
def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]:
"""Create consistent styles for reflow PDF generation."""
def _get_reflow_styles(self, target_lang: Optional[str] = None) -> Dict[str, ParagraphStyle]:
"""
Create consistent styles for reflow PDF generation.
Args:
target_lang: Optional target language code for selecting appropriate font.
If not provided, uses default font.
"""
base_styles = getSampleStyleSheet()
font_name = self.font_name if self.font_registered else 'Helvetica'
# Use language-specific font if target_lang is provided
if target_lang:
font_name = self.get_font_for_language(target_lang)
else:
font_name = self.font_name if self.font_registered else 'Helvetica'
styles = {
'Title': ParagraphStyle(
@@ -4936,7 +4989,8 @@ class PDFGeneratorService:
json_path: Path,
output_path: Path,
source_file_path: Optional[Path] = None,
use_elements_only: bool = False
use_elements_only: bool = False,
target_lang: Optional[str] = None
) -> bool:
"""
Generate reflow layout PDF from OCR/Direct JSON data.
@@ -4953,6 +5007,7 @@ class PDFGeneratorService:
source_file_path: Optional path to original source file (for images)
use_elements_only: If True, always use elements from JSON (for translated PDFs
where translations are applied to elements, not raw_ocr_regions)
target_lang: Optional target language code for selecting appropriate font
Returns:
True if successful, False otherwise
@@ -4969,8 +5024,8 @@ class PDFGeneratorService:
is_ocr_track = processing_track == 'ocr'
logger.info(f"Reflow PDF generation - Processing track: {processing_track}")
# Get styles
styles = self._get_reflow_styles()
# Get styles (with language-specific font if target_lang provided)
styles = self._get_reflow_styles(target_lang=target_lang)
# Build document content
story = []
@@ -5149,7 +5204,8 @@ class PDFGeneratorService:
result_json=result_json,
raw_ocr_translations=raw_ocr_translations,
output_path=output_path,
result_dir=result_json_path.parent
result_dir=result_json_path.parent,
target_lang=target_lang
)
# Direct Track: Use element-based translations
@@ -5189,7 +5245,8 @@ class PDFGeneratorService:
json_path=tmp_path,
output_path=output_path,
source_file_path=result_json_path.parent, # Contains extracted images
use_elements_only=True # Use elements with translations applied
use_elements_only=True, # Use elements with translations applied
target_lang=target_lang # Use language-specific font
)
return success
finally:
@@ -5214,7 +5271,8 @@ class PDFGeneratorService:
result_json: Dict,
raw_ocr_translations: List[Dict],
output_path: Path,
result_dir: Path
result_dir: Path,
target_lang: Optional[str] = None
) -> bool:
"""
Generate translated reflow PDF for OCR Track documents.
@@ -5226,13 +5284,14 @@ class PDFGeneratorService:
raw_ocr_translations: List of {page, index, original, translated}
output_path: Path to save generated PDF
result_dir: Path to result directory for images
target_lang: Target language code for selecting appropriate font
Returns:
True if successful, False otherwise
"""
try:
# Get styles
styles = self._get_reflow_styles()
# Get styles with language-specific font
styles = self._get_reflow_styles(target_lang=target_lang)
# Build document content
story = []