feat: add multilingual font support for translated PDFs
- Add NotoSansKR and NotoSansThai fonts for Korean and Thai language support - Update download_fonts.sh to download all required fonts - Add LANGUAGE_FONT_MAP for language-to-font mapping in pdf_generator_service.py - Add get_font_for_language() method to select appropriate font based on target language - Update _get_reflow_styles() to accept target_lang parameter - Pass target_lang through generate_translated_pdf() to PDF generation methods - Fix garbled characters (亂碼) issue for Korean and Thai translations Supported languages: - Chinese (zh-CN/zh-TW), Japanese (ja): NotoSansSC - Korean (ko): NotoSansKR - Thai (th): NotoSansThai - Russian, Vietnamese, Latin languages: NotoSansSC 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -145,40 +145,83 @@ class PDFGeneratorService:
|
||||
STYLE_FLAG_UNDERLINE = 4
|
||||
STYLE_FLAG_STRIKETHROUGH = 8
|
||||
|
||||
# Language to font mapping for proper character support
|
||||
LANGUAGE_FONT_MAP = {
|
||||
# Korean requires separate font
|
||||
'ko': 'NotoSansKR',
|
||||
# Thai requires separate font
|
||||
'th': 'NotoSansThai',
|
||||
# All other languages use NotoSansSC (covers CJK, Latin, Cyrillic, etc.)
|
||||
'default': 'NotoSansSC',
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize PDF generator with font configuration"""
|
||||
self.font_name = 'NotoSansSC'
|
||||
self.font_name = 'NotoSansSC' # Default font
|
||||
self.font_path = None
|
||||
self.font_registered = False
|
||||
self.current_processing_track = None # Track type for current document
|
||||
self.registered_fonts = {} # Track which fonts are registered
|
||||
|
||||
self._register_chinese_font()
|
||||
self._register_multilingual_fonts()
|
||||
|
||||
def _register_chinese_font(self):
|
||||
"""Register Chinese font for PDF generation"""
|
||||
try:
|
||||
# Get font path from settings
|
||||
font_path = Path(settings.chinese_font_path)
|
||||
def _register_multilingual_fonts(self):
|
||||
"""Register fonts for all supported languages"""
|
||||
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
fonts_dir = project_root / 'backend' / 'fonts'
|
||||
|
||||
# Try relative path from project root
|
||||
if not font_path.is_absolute():
|
||||
# Adjust path - settings.chinese_font_path starts with ./backend/
|
||||
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
font_path = project_root / font_path
|
||||
# Font files to register
|
||||
font_files = {
|
||||
'NotoSansSC': 'NotoSansSC-Regular.ttf',
|
||||
'NotoSansKR': 'NotoSansKR-Regular.ttf',
|
||||
'NotoSansThai': 'NotoSansThai-Regular.ttf',
|
||||
}
|
||||
|
||||
if not font_path.exists():
|
||||
logger.error(f"Chinese font not found at {font_path}")
|
||||
return
|
||||
for font_name, font_file in font_files.items():
|
||||
font_path = fonts_dir / font_file
|
||||
if font_path.exists():
|
||||
try:
|
||||
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
|
||||
self.registered_fonts[font_name] = font_path
|
||||
logger.info(f"Font registered: {font_name} from {font_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to register font {font_name}: {e}")
|
||||
else:
|
||||
logger.warning(f"Font file not found: {font_path}")
|
||||
|
||||
# Register font
|
||||
pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
|
||||
self.font_path = font_path
|
||||
# Set default font
|
||||
if 'NotoSansSC' in self.registered_fonts:
|
||||
self.font_name = 'NotoSansSC'
|
||||
self.font_path = self.registered_fonts['NotoSansSC']
|
||||
self.font_registered = True
|
||||
logger.info(f"Chinese font registered: {self.font_name} from {font_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register Chinese font: {e}")
|
||||
else:
|
||||
self.font_registered = False
|
||||
logger.error("Default font NotoSansSC not available")
|
||||
|
||||
def get_font_for_language(self, target_lang: str) -> str:
    """
    Get the appropriate font name for a target language.

    The language code is matched against LANGUAGE_FONT_MAP in three steps:
    the code exactly as given, then a normalized form (trimmed, lower-cased,
    '_' replaced by '-'), then the primary subtag only (the part before the
    first '-'). This lets region-tagged or differently-cased codes such as
    'ko-KR', 'ko_KR' or 'TH' resolve to the same font as 'ko' / 'th'
    instead of silently falling through to the default font.

    Args:
        target_lang: Language code (e.g., 'ko', 'th', 'ja', 'zh-TW', 'ko-KR').
            A falsy or non-string value selects the default font.

    Returns:
        Font name to use for this language. Falls back to the 'default'
        entry of LANGUAGE_FONT_MAP when the language-specific font is not
        in self.registered_fonts, and to 'Helvetica' when the default font
        is not registered either.
    """
    font_map = self.LANGUAGE_FONT_MAP
    default_font = font_map['default']
    font_name = default_font

    # Check if language has a specific font (exact code first, then
    # progressively more general forms of the same code)
    if isinstance(target_lang, str) and target_lang.strip():
        normalized = target_lang.strip().replace('_', '-').lower()
        primary_subtag = normalized.split('-', 1)[0]
        for code in (target_lang, normalized, primary_subtag):
            if code in font_map:
                font_name = font_map[code]
                break

    # Verify font is registered, fall back to default if not.
    # Skipped when font_name already is the default, so the log does not
    # claim a fallback to a font that is about to be reported missing.
    if font_name != default_font and font_name not in self.registered_fonts:
        logger.warning(f"Font {font_name} for language {target_lang} not registered, using default")
        font_name = default_font

    # Final fallback to Helvetica if no fonts available
    if font_name not in self.registered_fonts:
        logger.warning(f"No suitable font found for language {target_lang}, using Helvetica")
        return 'Helvetica'

    return font_name
|
||||
|
||||
def _detect_content_orientation(
|
||||
self,
|
||||
@@ -4462,10 +4505,20 @@ class PDFGeneratorService:
|
||||
# Direct track: elements already in reading order from PyMuPDF
|
||||
return elements
|
||||
|
||||
def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]:
|
||||
"""Create consistent styles for reflow PDF generation."""
|
||||
def _get_reflow_styles(self, target_lang: Optional[str] = None) -> Dict[str, ParagraphStyle]:
|
||||
"""
|
||||
Create consistent styles for reflow PDF generation.
|
||||
|
||||
Args:
|
||||
target_lang: Optional target language code for selecting appropriate font.
|
||||
If not provided, uses default font.
|
||||
"""
|
||||
base_styles = getSampleStyleSheet()
|
||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||
# Use language-specific font if target_lang is provided
|
||||
if target_lang:
|
||||
font_name = self.get_font_for_language(target_lang)
|
||||
else:
|
||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||
|
||||
styles = {
|
||||
'Title': ParagraphStyle(
|
||||
@@ -4936,7 +4989,8 @@ class PDFGeneratorService:
|
||||
json_path: Path,
|
||||
output_path: Path,
|
||||
source_file_path: Optional[Path] = None,
|
||||
use_elements_only: bool = False
|
||||
use_elements_only: bool = False,
|
||||
target_lang: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Generate reflow layout PDF from OCR/Direct JSON data.
|
||||
@@ -4953,6 +5007,7 @@ class PDFGeneratorService:
|
||||
source_file_path: Optional path to original source file (for images)
|
||||
use_elements_only: If True, always use elements from JSON (for translated PDFs
|
||||
where translations are applied to elements, not raw_ocr_regions)
|
||||
target_lang: Optional target language code for selecting appropriate font
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
@@ -4969,8 +5024,8 @@ class PDFGeneratorService:
|
||||
is_ocr_track = processing_track == 'ocr'
|
||||
logger.info(f"Reflow PDF generation - Processing track: {processing_track}")
|
||||
|
||||
# Get styles
|
||||
styles = self._get_reflow_styles()
|
||||
# Get styles (with language-specific font if target_lang provided)
|
||||
styles = self._get_reflow_styles(target_lang=target_lang)
|
||||
|
||||
# Build document content
|
||||
story = []
|
||||
@@ -5149,7 +5204,8 @@ class PDFGeneratorService:
|
||||
result_json=result_json,
|
||||
raw_ocr_translations=raw_ocr_translations,
|
||||
output_path=output_path,
|
||||
result_dir=result_json_path.parent
|
||||
result_dir=result_json_path.parent,
|
||||
target_lang=target_lang
|
||||
)
|
||||
|
||||
# Direct Track: Use element-based translations
|
||||
@@ -5189,7 +5245,8 @@ class PDFGeneratorService:
|
||||
json_path=tmp_path,
|
||||
output_path=output_path,
|
||||
source_file_path=result_json_path.parent, # Contains extracted images
|
||||
use_elements_only=True # Use elements with translations applied
|
||||
use_elements_only=True, # Use elements with translations applied
|
||||
target_lang=target_lang # Use language-specific font
|
||||
)
|
||||
return success
|
||||
finally:
|
||||
@@ -5214,7 +5271,8 @@ class PDFGeneratorService:
|
||||
result_json: Dict,
|
||||
raw_ocr_translations: List[Dict],
|
||||
output_path: Path,
|
||||
result_dir: Path
|
||||
result_dir: Path,
|
||||
target_lang: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Generate translated reflow PDF for OCR Track documents.
|
||||
@@ -5226,13 +5284,14 @@ class PDFGeneratorService:
|
||||
raw_ocr_translations: List of {page, index, original, translated}
|
||||
output_path: Path to save generated PDF
|
||||
result_dir: Path to result directory for images
|
||||
target_lang: Target language code for selecting appropriate font
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
# Get styles
|
||||
styles = self._get_reflow_styles()
|
||||
# Get styles with language-specific font
|
||||
styles = self._get_reflow_styles(target_lang=target_lang)
|
||||
|
||||
# Build document content
|
||||
story = []
|
||||
|
||||
Reference in New Issue
Block a user