feat: add multilingual font support for translated PDFs

- Add NotoSansKR and NotoSansThai fonts for Korean and Thai language support
- Update download_fonts.sh to download all required fonts
- Add LANGUAGE_FONT_MAP for language-to-font mapping in pdf_generator_service.py
- Add get_font_for_language() method to select appropriate font based on target language
- Update _get_reflow_styles() to accept target_lang parameter
- Pass target_lang through generate_translated_pdf() to PDF generation methods
- Fix the garbled-character (亂碼) issue in Korean and Thai translated PDFs, caused by the default NotoSansSC font lacking glyphs for those scripts

Supported languages:
- Chinese (zh-CN/zh-TW), Japanese (ja): NotoSansSC
- Korean (ko): NotoSansKR
- Thai (th): NotoSansThai
- Russian, Vietnamese, Latin languages: NotoSansSC

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-12 19:18:58 +08:00
parent efa7e4175c
commit 3876477bda
4 changed files with 133 additions and 49 deletions

View File

@@ -145,40 +145,83 @@ class PDFGeneratorService:
STYLE_FLAG_UNDERLINE = 4 STYLE_FLAG_UNDERLINE = 4
STYLE_FLAG_STRIKETHROUGH = 8 STYLE_FLAG_STRIKETHROUGH = 8
# Language to font mapping for proper character support
LANGUAGE_FONT_MAP = {
# Korean requires separate font
'ko': 'NotoSansKR',
# Thai requires separate font
'th': 'NotoSansThai',
# All other languages use NotoSansSC (covers CJK, Latin, Cyrillic, etc.)
'default': 'NotoSansSC',
}
def __init__(self): def __init__(self):
"""Initialize PDF generator with font configuration""" """Initialize PDF generator with font configuration"""
self.font_name = 'NotoSansSC' self.font_name = 'NotoSansSC' # Default font
self.font_path = None self.font_path = None
self.font_registered = False self.font_registered = False
self.current_processing_track = None # Track type for current document self.current_processing_track = None # Track type for current document
self.registered_fonts = {} # Track which fonts are registered
self._register_chinese_font() self._register_multilingual_fonts()
def _register_chinese_font(self): def _register_multilingual_fonts(self):
"""Register Chinese font for PDF generation""" """Register fonts for all supported languages"""
try: project_root = Path(__file__).resolve().parent.parent.parent.parent
# Get font path from settings fonts_dir = project_root / 'backend' / 'fonts'
font_path = Path(settings.chinese_font_path)
# Try relative path from project root # Font files to register
if not font_path.is_absolute(): font_files = {
# Adjust path - settings.chinese_font_path starts with ./backend/ 'NotoSansSC': 'NotoSansSC-Regular.ttf',
project_root = Path(__file__).resolve().parent.parent.parent.parent 'NotoSansKR': 'NotoSansKR-Regular.ttf',
font_path = project_root / font_path 'NotoSansThai': 'NotoSansThai-Regular.ttf',
}
if not font_path.exists(): for font_name, font_file in font_files.items():
logger.error(f"Chinese font not found at {font_path}") font_path = fonts_dir / font_file
return if font_path.exists():
try:
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
self.registered_fonts[font_name] = font_path
logger.info(f"Font registered: {font_name} from {font_path}")
except Exception as e:
logger.warning(f"Failed to register font {font_name}: {e}")
else:
logger.warning(f"Font file not found: {font_path}")
# Register font # Set default font
pdfmetrics.registerFont(TTFont(self.font_name, str(font_path))) if 'NotoSansSC' in self.registered_fonts:
self.font_path = font_path self.font_name = 'NotoSansSC'
self.font_path = self.registered_fonts['NotoSansSC']
self.font_registered = True self.font_registered = True
logger.info(f"Chinese font registered: {self.font_name} from {font_path}") else:
except Exception as e:
logger.error(f"Failed to register Chinese font: {e}")
self.font_registered = False self.font_registered = False
logger.error("Default font NotoSansSC not available")
def get_font_for_language(self, target_lang: str) -> str:
    """
    Get the appropriate font name for a target language.

    The lookup tolerates case differences, surrounding whitespace,
    underscore separators and region/script subtags, so 'ko', 'KO',
    'ko-KR' and 'th_TH' all resolve to their language-specific font.
    Codes without an entry in LANGUAGE_FONT_MAP (e.g. 'ja', 'zh-TW')
    use the map's 'default' font.

    Args:
        target_lang: Language code (e.g., 'ko', 'th', 'ja', 'zh-TW').
            An empty or None value selects the default font.

    Returns:
        Font name to use for this language: the mapped font if it is
        registered, otherwise the default font if that is registered,
        otherwise 'Helvetica'.
    """
    default_font = self.LANGUAGE_FONT_MAP['default']

    # Normalize the code so 'KO', ' ko ' and 'ko_KR' become 'ko' / 'ko-kr'.
    lang_key = (target_lang or '').strip().lower().replace('_', '-')
    # Primary language subtag: lets 'ko-kr' still select the Korean font.
    primary_subtag = lang_key.split('-', 1)[0]

    # Prefer an exact (normalized) match, then the primary subtag, then default.
    font_name = (
        self.LANGUAGE_FONT_MAP.get(lang_key)
        or self.LANGUAGE_FONT_MAP.get(primary_subtag)
        or default_font
    )

    # Verify font is registered, fall back to default if not.
    if font_name not in self.registered_fonts:
        # Only announce the fallback when it actually changes the font;
        # otherwise the message would claim a fallback that did not happen.
        if font_name != default_font:
            logger.warning(f"Font {font_name} for language {target_lang} not registered, using default")
        font_name = default_font

    # Final fallback to Helvetica if no fonts available.
    if font_name not in self.registered_fonts:
        logger.warning(f"No suitable font found for language {target_lang}, using Helvetica")
        return 'Helvetica'

    return font_name
def _detect_content_orientation( def _detect_content_orientation(
self, self,
@@ -4462,10 +4505,20 @@ class PDFGeneratorService:
# Direct track: elements already in reading order from PyMuPDF # Direct track: elements already in reading order from PyMuPDF
return elements return elements
def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]: def _get_reflow_styles(self, target_lang: Optional[str] = None) -> Dict[str, ParagraphStyle]:
"""Create consistent styles for reflow PDF generation.""" """
Create consistent styles for reflow PDF generation.
Args:
target_lang: Optional target language code for selecting appropriate font.
If not provided, uses default font.
"""
base_styles = getSampleStyleSheet() base_styles = getSampleStyleSheet()
font_name = self.font_name if self.font_registered else 'Helvetica' # Use language-specific font if target_lang is provided
if target_lang:
font_name = self.get_font_for_language(target_lang)
else:
font_name = self.font_name if self.font_registered else 'Helvetica'
styles = { styles = {
'Title': ParagraphStyle( 'Title': ParagraphStyle(
@@ -4936,7 +4989,8 @@ class PDFGeneratorService:
json_path: Path, json_path: Path,
output_path: Path, output_path: Path,
source_file_path: Optional[Path] = None, source_file_path: Optional[Path] = None,
use_elements_only: bool = False use_elements_only: bool = False,
target_lang: Optional[str] = None
) -> bool: ) -> bool:
""" """
Generate reflow layout PDF from OCR/Direct JSON data. Generate reflow layout PDF from OCR/Direct JSON data.
@@ -4953,6 +5007,7 @@ class PDFGeneratorService:
source_file_path: Optional path to original source file (for images) source_file_path: Optional path to original source file (for images)
use_elements_only: If True, always use elements from JSON (for translated PDFs use_elements_only: If True, always use elements from JSON (for translated PDFs
where translations are applied to elements, not raw_ocr_regions) where translations are applied to elements, not raw_ocr_regions)
target_lang: Optional target language code for selecting appropriate font
Returns: Returns:
True if successful, False otherwise True if successful, False otherwise
@@ -4969,8 +5024,8 @@ class PDFGeneratorService:
is_ocr_track = processing_track == 'ocr' is_ocr_track = processing_track == 'ocr'
logger.info(f"Reflow PDF generation - Processing track: {processing_track}") logger.info(f"Reflow PDF generation - Processing track: {processing_track}")
# Get styles # Get styles (with language-specific font if target_lang provided)
styles = self._get_reflow_styles() styles = self._get_reflow_styles(target_lang=target_lang)
# Build document content # Build document content
story = [] story = []
@@ -5149,7 +5204,8 @@ class PDFGeneratorService:
result_json=result_json, result_json=result_json,
raw_ocr_translations=raw_ocr_translations, raw_ocr_translations=raw_ocr_translations,
output_path=output_path, output_path=output_path,
result_dir=result_json_path.parent result_dir=result_json_path.parent,
target_lang=target_lang
) )
# Direct Track: Use element-based translations # Direct Track: Use element-based translations
@@ -5189,7 +5245,8 @@ class PDFGeneratorService:
json_path=tmp_path, json_path=tmp_path,
output_path=output_path, output_path=output_path,
source_file_path=result_json_path.parent, # Contains extracted images source_file_path=result_json_path.parent, # Contains extracted images
use_elements_only=True # Use elements with translations applied use_elements_only=True, # Use elements with translations applied
target_lang=target_lang # Use language-specific font
) )
return success return success
finally: finally:
@@ -5214,7 +5271,8 @@ class PDFGeneratorService:
result_json: Dict, result_json: Dict,
raw_ocr_translations: List[Dict], raw_ocr_translations: List[Dict],
output_path: Path, output_path: Path,
result_dir: Path result_dir: Path,
target_lang: Optional[str] = None
) -> bool: ) -> bool:
""" """
Generate translated reflow PDF for OCR Track documents. Generate translated reflow PDF for OCR Track documents.
@@ -5226,13 +5284,14 @@ class PDFGeneratorService:
raw_ocr_translations: List of {page, index, original, translated} raw_ocr_translations: List of {page, index, original, translated}
output_path: Path to save generated PDF output_path: Path to save generated PDF
result_dir: Path to result directory for images result_dir: Path to result directory for images
target_lang: Target language code for selecting appropriate font
Returns: Returns:
True if successful, False otherwise True if successful, False otherwise
""" """
try: try:
# Get styles # Get styles with language-specific font
styles = self._get_reflow_styles() styles = self._get_reflow_styles(target_lang=target_lang)
# Build document content # Build document content
story = [] story = []

View File

@@ -1,31 +1,56 @@
#!/bin/bash #!/bin/bash
# Download Noto Sans SC TrueType font for layout-preserving PDF generation # Download Noto Sans fonts for multilingual PDF generation
# Supports: CJK (Chinese, Japanese, Korean), Thai, Vietnamese, Russian, etc.
set -e set -e
FONT_DIR="backend/fonts" FONT_DIR="backend/fonts"
FONT_URL="https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf"
FONT_FILE="NotoSansSC-Regular.ttf"
echo "🔤 Downloading Chinese font for PDF generation..." echo "🔤 Downloading fonts for multilingual PDF generation..."
# Create font directory # Create font directory
mkdir -p "$FONT_DIR" mkdir -p "$FONT_DIR"
# Download font if not exists # Function to download font
if [ -f "$FONT_DIR/$FONT_FILE" ]; then download_font() {
echo "✓ Font already exists: $FONT_DIR/$FONT_FILE" local url="$1"
else local file="$2"
echo "Downloading from GitHub..." local desc="$3"
wget "$FONT_URL" -O "$FONT_DIR/$FONT_FILE"
if [ -f "$FONT_DIR/$FONT_FILE" ]; then if [ -f "$FONT_DIR/$file" ]; then
SIZE=$(du -h "$FONT_DIR/$FONT_FILE" | cut -f1) echo "$desc already exists"
echo "✓ Font downloaded successfully: $SIZE"
else else
echo "✗ Font download failed" echo "Downloading $desc..."
exit 1 if wget -q "$url" -O "$FONT_DIR/$file"; then
SIZE=$(du -h "$FONT_DIR/$file" | cut -f1)
echo "$desc downloaded: $SIZE"
else
echo "✗ Failed to download $desc"
return 1
fi
fi fi
fi }
# NotoSansSC - Chinese (Simplified), also covers Japanese and basic CJK
download_font \
"https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf" \
"NotoSansSC-Regular.ttf" \
"Noto Sans SC (Chinese/Japanese)"
# NotoSansKR - Korean
download_font \
"https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansKR-VF.ttf" \
"NotoSansKR-Regular.ttf" \
"Noto Sans KR (Korean)"
# NotoSansThai - Thai
download_font \
"https://github.com/notofonts/noto-fonts/raw/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf" \
"NotoSansThai-Regular.ttf" \
"Noto Sans Thai"
echo ""
echo "✅ Font setup complete!" echo "✅ Font setup complete!"
echo "Supported languages: Chinese (zh-CN/zh-TW), Japanese (ja), Korean (ko),"
echo " Thai (th), Russian (ru), Vietnamese (vi),"
echo " and all Latin-based languages (en, de, fr, es, etc.)"

Binary file not shown.

Binary file not shown.