feat: add multilingual font support for translated PDFs
- Add NotoSansKR and NotoSansThai fonts for Korean and Thai language support
- Update download_fonts.sh to download all required fonts
- Add LANGUAGE_FONT_MAP for language-to-font mapping in pdf_generator_service.py
- Add get_font_for_language() method to select appropriate font based on target language
- Update _get_reflow_styles() to accept target_lang parameter
- Pass target_lang through generate_translated_pdf() to PDF generation methods
- Fix garbled characters (亂碼) issue for Korean and Thai translations

Supported languages:
- Chinese (zh-CN/zh-TW), Japanese (ja): NotoSansSC
- Korean (ko): NotoSansKR
- Thai (th): NotoSansThai
- Russian, Vietnamese, Latin languages: NotoSansSC

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -145,40 +145,83 @@ class PDFGeneratorService:
|
|||||||
STYLE_FLAG_UNDERLINE = 4
|
STYLE_FLAG_UNDERLINE = 4
|
||||||
STYLE_FLAG_STRIKETHROUGH = 8
|
STYLE_FLAG_STRIKETHROUGH = 8
|
||||||
|
|
||||||
|
# Language to font mapping for proper character support
|
||||||
|
LANGUAGE_FONT_MAP = {
|
||||||
|
# Korean requires separate font
|
||||||
|
'ko': 'NotoSansKR',
|
||||||
|
# Thai requires separate font
|
||||||
|
'th': 'NotoSansThai',
|
||||||
|
# All other languages use NotoSansSC (covers Chinese, Japanese, Latin, Cyrillic, etc.)
|
||||||
|
'default': 'NotoSansSC',
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initialize PDF generator with font configuration"""
|
"""Initialize PDF generator with font configuration"""
|
||||||
self.font_name = 'NotoSansSC'
|
self.font_name = 'NotoSansSC' # Default font
|
||||||
self.font_path = None
|
self.font_path = None
|
||||||
self.font_registered = False
|
self.font_registered = False
|
||||||
self.current_processing_track = None # Track type for current document
|
self.current_processing_track = None # Track type for current document
|
||||||
|
self.registered_fonts = {} # Track which fonts are registered
|
||||||
|
|
||||||
self._register_chinese_font()
|
self._register_multilingual_fonts()
|
||||||
|
|
||||||
def _register_chinese_font(self):
|
def _register_multilingual_fonts(self):
|
||||||
"""Register Chinese font for PDF generation"""
|
"""Register fonts for all supported languages"""
|
||||||
try:
|
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
# Get font path from settings
|
fonts_dir = project_root / 'backend' / 'fonts'
|
||||||
font_path = Path(settings.chinese_font_path)
|
|
||||||
|
|
||||||
# Try relative path from project root
|
# Font files to register
|
||||||
if not font_path.is_absolute():
|
font_files = {
|
||||||
# Adjust path - settings.chinese_font_path starts with ./backend/
|
'NotoSansSC': 'NotoSansSC-Regular.ttf',
|
||||||
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
'NotoSansKR': 'NotoSansKR-Regular.ttf',
|
||||||
font_path = project_root / font_path
|
'NotoSansThai': 'NotoSansThai-Regular.ttf',
|
||||||
|
}
|
||||||
|
|
||||||
if not font_path.exists():
|
for font_name, font_file in font_files.items():
|
||||||
logger.error(f"Chinese font not found at {font_path}")
|
font_path = fonts_dir / font_file
|
||||||
return
|
if font_path.exists():
|
||||||
|
try:
|
||||||
|
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
|
||||||
|
self.registered_fonts[font_name] = font_path
|
||||||
|
logger.info(f"Font registered: {font_name} from {font_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to register font {font_name}: {e}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Font file not found: {font_path}")
|
||||||
|
|
||||||
# Register font
|
# Set default font
|
||||||
pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
|
if 'NotoSansSC' in self.registered_fonts:
|
||||||
self.font_path = font_path
|
self.font_name = 'NotoSansSC'
|
||||||
|
self.font_path = self.registered_fonts['NotoSansSC']
|
||||||
self.font_registered = True
|
self.font_registered = True
|
||||||
logger.info(f"Chinese font registered: {self.font_name} from {font_path}")
|
else:
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to register Chinese font: {e}")
|
|
||||||
self.font_registered = False
|
self.font_registered = False
|
||||||
|
logger.error("Default font NotoSansSC not available")
|
||||||
|
|
||||||
|
def get_font_for_language(self, target_lang: str) -> str:
    """
    Get the appropriate font name for a target language.

    The language code is matched case-insensitively, first as a whole and
    then by its primary subtag, so region-qualified codes such as 'ko-KR'
    or 'th_TH' select the same font as 'ko' / 'th' instead of silently
    falling through to the default font.

    Args:
        target_lang: Language code (e.g., 'ko', 'th', 'ja', 'zh-TW').
            An empty or None value selects the default font.

    Returns:
        Font name to use for this language: the mapped font when it is
        registered, otherwise the default font when that is registered,
        otherwise 'Helvetica'.
    """
    font_map = self.LANGUAGE_FONT_MAP
    default_font = font_map['default']

    # Normalize so 'KO', 'ko-KR' and 'ko_KR' all resolve like 'ko'.
    lang_code = str(target_lang or '').strip().lower().replace('_', '-')
    primary_subtag = lang_code.split('-', 1)[0]

    # Prefer an exact entry for the full code, then the primary subtag.
    font_name = font_map.get(lang_code) or font_map.get(primary_subtag, default_font)

    # Verify font is registered, fall back to default if not
    if font_name not in self.registered_fonts:
        logger.warning(f"Font {font_name} for language {target_lang} not registered, using default")
        font_name = default_font

    # Final fallback to Helvetica if no fonts available
    if font_name not in self.registered_fonts:
        logger.warning(f"No suitable font found for language {target_lang}, using Helvetica")
        return 'Helvetica'

    return font_name
|
||||||
|
|
||||||
def _detect_content_orientation(
|
def _detect_content_orientation(
|
||||||
self,
|
self,
|
||||||
@@ -4462,10 +4505,20 @@ class PDFGeneratorService:
|
|||||||
# Direct track: elements already in reading order from PyMuPDF
|
# Direct track: elements already in reading order from PyMuPDF
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
def _get_reflow_styles(self) -> Dict[str, ParagraphStyle]:
|
def _get_reflow_styles(self, target_lang: Optional[str] = None) -> Dict[str, ParagraphStyle]:
|
||||||
"""Create consistent styles for reflow PDF generation."""
|
"""
|
||||||
|
Create consistent styles for reflow PDF generation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
target_lang: Optional target language code for selecting appropriate font.
|
||||||
|
If not provided, uses default font.
|
||||||
|
"""
|
||||||
base_styles = getSampleStyleSheet()
|
base_styles = getSampleStyleSheet()
|
||||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
# Use language-specific font if target_lang is provided
|
||||||
|
if target_lang:
|
||||||
|
font_name = self.get_font_for_language(target_lang)
|
||||||
|
else:
|
||||||
|
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||||
|
|
||||||
styles = {
|
styles = {
|
||||||
'Title': ParagraphStyle(
|
'Title': ParagraphStyle(
|
||||||
@@ -4936,7 +4989,8 @@ class PDFGeneratorService:
|
|||||||
json_path: Path,
|
json_path: Path,
|
||||||
output_path: Path,
|
output_path: Path,
|
||||||
source_file_path: Optional[Path] = None,
|
source_file_path: Optional[Path] = None,
|
||||||
use_elements_only: bool = False
|
use_elements_only: bool = False,
|
||||||
|
target_lang: Optional[str] = None
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Generate reflow layout PDF from OCR/Direct JSON data.
|
Generate reflow layout PDF from OCR/Direct JSON data.
|
||||||
@@ -4953,6 +5007,7 @@ class PDFGeneratorService:
|
|||||||
source_file_path: Optional path to original source file (for images)
|
source_file_path: Optional path to original source file (for images)
|
||||||
use_elements_only: If True, always use elements from JSON (for translated PDFs
|
use_elements_only: If True, always use elements from JSON (for translated PDFs
|
||||||
where translations are applied to elements, not raw_ocr_regions)
|
where translations are applied to elements, not raw_ocr_regions)
|
||||||
|
target_lang: Optional target language code for selecting appropriate font
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
True if successful, False otherwise
|
True if successful, False otherwise
|
||||||
@@ -4969,8 +5024,8 @@ class PDFGeneratorService:
|
|||||||
is_ocr_track = processing_track == 'ocr'
|
is_ocr_track = processing_track == 'ocr'
|
||||||
logger.info(f"Reflow PDF generation - Processing track: {processing_track}")
|
logger.info(f"Reflow PDF generation - Processing track: {processing_track}")
|
||||||
|
|
||||||
# Get styles
|
# Get styles (with language-specific font if target_lang provided)
|
||||||
styles = self._get_reflow_styles()
|
styles = self._get_reflow_styles(target_lang=target_lang)
|
||||||
|
|
||||||
# Build document content
|
# Build document content
|
||||||
story = []
|
story = []
|
||||||
@@ -5149,7 +5204,8 @@ class PDFGeneratorService:
|
|||||||
result_json=result_json,
|
result_json=result_json,
|
||||||
raw_ocr_translations=raw_ocr_translations,
|
raw_ocr_translations=raw_ocr_translations,
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
result_dir=result_json_path.parent
|
result_dir=result_json_path.parent,
|
||||||
|
target_lang=target_lang
|
||||||
)
|
)
|
||||||
|
|
||||||
# Direct Track: Use element-based translations
|
# Direct Track: Use element-based translations
|
||||||
@@ -5189,7 +5245,8 @@ class PDFGeneratorService:
|
|||||||
json_path=tmp_path,
|
json_path=tmp_path,
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
source_file_path=result_json_path.parent, # Contains extracted images
|
source_file_path=result_json_path.parent, # Contains extracted images
|
||||||
use_elements_only=True # Use elements with translations applied
|
use_elements_only=True, # Use elements with translations applied
|
||||||
|
target_lang=target_lang # Use language-specific font
|
||||||
)
|
)
|
||||||
return success
|
return success
|
||||||
finally:
|
finally:
|
||||||
@@ -5214,7 +5271,8 @@ class PDFGeneratorService:
|
|||||||
result_json: Dict,
|
result_json: Dict,
|
||||||
raw_ocr_translations: List[Dict],
|
raw_ocr_translations: List[Dict],
|
||||||
output_path: Path,
|
output_path: Path,
|
||||||
result_dir: Path
|
result_dir: Path,
|
||||||
|
target_lang: Optional[str] = None
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Generate translated reflow PDF for OCR Track documents.
|
Generate translated reflow PDF for OCR Track documents.
|
||||||
@@ -5226,13 +5284,14 @@ class PDFGeneratorService:
|
|||||||
raw_ocr_translations: List of {page, index, original, translated}
|
raw_ocr_translations: List of {page, index, original, translated}
|
||||||
output_path: Path to save generated PDF
|
output_path: Path to save generated PDF
|
||||||
result_dir: Path to result directory for images
|
result_dir: Path to result directory for images
|
||||||
|
target_lang: Target language code for selecting appropriate font
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
True if successful, False otherwise
|
True if successful, False otherwise
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Get styles
|
# Get styles with language-specific font
|
||||||
styles = self._get_reflow_styles()
|
styles = self._get_reflow_styles(target_lang=target_lang)
|
||||||
|
|
||||||
# Build document content
|
# Build document content
|
||||||
story = []
|
story = []
|
||||||
|
|||||||
@@ -1,31 +1,56 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Download Noto Sans SC TrueType font for layout-preserving PDF generation
|
# Download Noto Sans fonts for multilingual PDF generation
|
||||||
|
# Supports: CJK (Chinese, Japanese, Korean), Thai, Vietnamese, Russian, etc.
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
FONT_DIR="backend/fonts"
|
FONT_DIR="backend/fonts"
|
||||||
FONT_URL="https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf"
|
|
||||||
FONT_FILE="NotoSansSC-Regular.ttf"
|
|
||||||
|
|
||||||
echo "🔤 Downloading Chinese font for PDF generation..."
|
echo "🔤 Downloading fonts for multilingual PDF generation..."
|
||||||
|
|
||||||
# Create font directory
|
# Create font directory
|
||||||
mkdir -p "$FONT_DIR"
|
mkdir -p "$FONT_DIR"
|
||||||
|
|
||||||
# Download font if not exists
|
# Function to download font
|
||||||
if [ -f "$FONT_DIR/$FONT_FILE" ]; then
|
download_font() {
|
||||||
echo "✓ Font already exists: $FONT_DIR/$FONT_FILE"
|
local url="$1"
|
||||||
else
|
local file="$2"
|
||||||
echo "Downloading from GitHub..."
|
local desc="$3"
|
||||||
wget "$FONT_URL" -O "$FONT_DIR/$FONT_FILE"
|
|
||||||
|
|
||||||
if [ -f "$FONT_DIR/$FONT_FILE" ]; then
|
if [ -f "$FONT_DIR/$file" ]; then
|
||||||
SIZE=$(du -h "$FONT_DIR/$FONT_FILE" | cut -f1)
|
echo "✓ $desc already exists"
|
||||||
echo "✓ Font downloaded successfully: $SIZE"
|
|
||||||
else
|
else
|
||||||
echo "✗ Font download failed"
|
echo "Downloading $desc..."
|
||||||
exit 1
|
if wget -q "$url" -O "$FONT_DIR/$file"; then
|
||||||
|
SIZE=$(du -h "$FONT_DIR/$file" | cut -f1)
|
||||||
|
echo "✓ $desc downloaded: $SIZE"
|
||||||
|
else
|
||||||
|
echo "✗ Failed to download $desc"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
}
|
||||||
|
|
||||||
|
# NotoSansSC - Chinese (Simplified), also covers Japanese and basic CJK
|
||||||
|
download_font \
|
||||||
|
"https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf" \
|
||||||
|
"NotoSansSC-Regular.ttf" \
|
||||||
|
"Noto Sans SC (Chinese/Japanese)"
|
||||||
|
|
||||||
|
# NotoSansKR - Korean
|
||||||
|
download_font \
|
||||||
|
"https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansKR-VF.ttf" \
|
||||||
|
"NotoSansKR-Regular.ttf" \
|
||||||
|
"Noto Sans KR (Korean)"
|
||||||
|
|
||||||
|
# NotoSansThai - Thai
|
||||||
|
download_font \
|
||||||
|
"https://github.com/notofonts/noto-fonts/raw/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf" \
|
||||||
|
"NotoSansThai-Regular.ttf" \
|
||||||
|
"Noto Sans Thai"
|
||||||
|
|
||||||
|
echo ""
|
||||||
echo "✅ Font setup complete!"
|
echo "✅ Font setup complete!"
|
||||||
|
echo "Supported languages: Chinese (zh-CN/zh-TW), Japanese (ja), Korean (ko),"
|
||||||
|
echo " Thai (th), Russian (ru), Vietnamese (vi),"
|
||||||
|
echo " and all Latin-based languages (en, de, fr, es, etc.)"
|
||||||
|
|||||||
BIN
backend/fonts/NotoSansKR-Regular.ttf
Normal file
BIN
backend/fonts/NotoSansKR-Regular.ttf
Normal file
Binary file not shown.
BIN
backend/fonts/NotoSansThai-Regular.ttf
Normal file
BIN
backend/fonts/NotoSansThai-Regular.ttf
Normal file
Binary file not shown.
Reference in New Issue
Block a user