class PDFGeneratorService:
    """Font-handling portion of the PDF generator service.

    This excerpt covers font registration and per-language font selection:
    every bundled Noto font is registered with ReportLab once at construction
    time, and ``get_font_for_language`` picks the font that can render a given
    target language.
    """

    STYLE_FLAG_UNDERLINE = 4
    STYLE_FLAG_STRIKETHROUGH = 8

    # Language to font mapping for proper character support.
    # Keys are lowercase primary language subtags; 'default' is the catch-all.
    LANGUAGE_FONT_MAP = {
        # Korean requires separate font
        'ko': 'NotoSansKR',
        # Thai requires separate font
        'th': 'NotoSansThai',
        # All other languages use NotoSansSC (covers CJK, Latin, Cyrillic, etc.)
        'default': 'NotoSansSC',
    }

    def __init__(self):
        """Initialize PDF generator with font configuration."""
        self.font_name = 'NotoSansSC'  # Default font
        self.font_path = None
        self.font_registered = False
        self.current_processing_track = None  # Track type for current document
        # Maps registered font name -> path of the TTF file it was loaded from.
        # Must exist before registration runs, because registration fills it.
        self.registered_fonts = {}

        self._register_multilingual_fonts()

    def _register_multilingual_fonts(self) -> None:
        """Register fonts for all supported languages.

        Looks for each known font file in ``<project_root>/backend/fonts`` and
        registers the ones that exist with ReportLab. Missing or unloadable
        fonts are logged and skipped so that one bad file cannot prevent the
        others from being registered.

        Side effects:
            Fills ``self.registered_fonts`` and sets ``self.font_name``,
            ``self.font_path`` and ``self.font_registered`` according to
            whether the default font (NotoSansSC) could be registered.
        """
        # This file lives at backend/app/services/, so four levels up is the
        # project root.
        project_root = Path(__file__).resolve().parent.parent.parent.parent
        fonts_dir = project_root / 'backend' / 'fonts'

        # Font files to register
        font_files = {
            'NotoSansSC': 'NotoSansSC-Regular.ttf',
            'NotoSansKR': 'NotoSansKR-Regular.ttf',
            'NotoSansThai': 'NotoSansThai-Regular.ttf',
        }

        for font_name, font_file in font_files.items():
            font_path = fonts_dir / font_file
            if not font_path.exists():
                logger.warning(f"Font file not found: {font_path}")
                continue
            try:
                pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
            except Exception as e:
                # Deliberately best-effort: a corrupt font must not abort
                # service start-up or block the remaining fonts.
                logger.warning(f"Failed to register font {font_name}: {e}")
            else:
                self.registered_fonts[font_name] = font_path
                logger.info(f"Font registered: {font_name} from {font_path}")

        # Set default font
        if 'NotoSansSC' in self.registered_fonts:
            self.font_name = 'NotoSansSC'
            self.font_path = self.registered_fonts['NotoSansSC']
            self.font_registered = True
        else:
            self.font_registered = False
            logger.error("Default font NotoSansSC not available")

    def get_font_for_language(self, target_lang: str) -> str:
        """
        Get the appropriate font name for a target language.

        The lookup tolerates the usual spellings of a language tag: it is
        case-insensitive, accepts ``-`` or ``_`` as separator, and falls back
        from a regional tag to its primary subtag, so ``'ko'``, ``'KO'``,
        ``'ko-KR'`` and ``'ko_KR'`` all resolve to the Korean font.

        Args:
            target_lang: Language code (e.g., 'ko', 'th', 'ja', 'zh-TW').
                ``None`` or an empty string selects the default font.

        Returns:
            Font name to use for this language: the language-specific font if
            it is registered, otherwise the default font, otherwise
            ``'Helvetica'`` when no bundled font is registered at all.
        """
        font_map = self.LANGUAGE_FONT_MAP
        default_font = font_map['default']

        normalized = str(target_lang or '').strip().lower().replace('_', '-')
        primary_subtag = normalized.split('-', 1)[0]

        # Most specific key first: the tag exactly as given, then its
        # normalized form, then only the primary language subtag.
        font_name = default_font
        for key in (target_lang, normalized, primary_subtag):
            if key and key in font_map:
                font_name = font_map[key]
                break

        # Verify font is registered, fall back to default if not
        if font_name != default_font and font_name not in self.registered_fonts:
            logger.warning(f"Font {font_name} for language {target_lang} not registered, using default")
            font_name = default_font

        # Final fallback to Helvetica if no fonts available
        if font_name not in self.registered_fonts:
            logger.warning(f"No suitable font found for language {target_lang}, using Helvetica")
            return 'Helvetica'

        return font_name
@@ -4953,6 +5007,7 @@ class PDFGeneratorService: source_file_path: Optional path to original source file (for images) use_elements_only: If True, always use elements from JSON (for translated PDFs where translations are applied to elements, not raw_ocr_regions) + target_lang: Optional target language code for selecting appropriate font Returns: True if successful, False otherwise @@ -4969,8 +5024,8 @@ class PDFGeneratorService: is_ocr_track = processing_track == 'ocr' logger.info(f"Reflow PDF generation - Processing track: {processing_track}") - # Get styles - styles = self._get_reflow_styles() + # Get styles (with language-specific font if target_lang provided) + styles = self._get_reflow_styles(target_lang=target_lang) # Build document content story = [] @@ -5149,7 +5204,8 @@ class PDFGeneratorService: result_json=result_json, raw_ocr_translations=raw_ocr_translations, output_path=output_path, - result_dir=result_json_path.parent + result_dir=result_json_path.parent, + target_lang=target_lang ) # Direct Track: Use element-based translations @@ -5189,7 +5245,8 @@ class PDFGeneratorService: json_path=tmp_path, output_path=output_path, source_file_path=result_json_path.parent, # Contains extracted images - use_elements_only=True # Use elements with translations applied + use_elements_only=True, # Use elements with translations applied + target_lang=target_lang # Use language-specific font ) return success finally: @@ -5214,7 +5271,8 @@ class PDFGeneratorService: result_json: Dict, raw_ocr_translations: List[Dict], output_path: Path, - result_dir: Path + result_dir: Path, + target_lang: Optional[str] = None ) -> bool: """ Generate translated reflow PDF for OCR Track documents. 
#!/bin/bash
# Download Noto Sans fonts for multilingual PDF generation
# Supports: CJK (Chinese, Japanese, Korean), Thai, Vietnamese, Russian, etc.
#
# Fonts are stored in the "fonts" directory next to this script
# (backend/fonts), so the script can be started from any working directory.

set -e

# Resolve the target directory from the script's own location instead of the
# caller's current directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FONT_DIR="$SCRIPT_DIR/fonts"

echo "🔤 Downloading fonts for multilingual PDF generation..."

# Create font directory
mkdir -p "$FONT_DIR"

# Download one font file unless a usable copy is already present.
#   $1 - source URL
#   $2 - file name to store inside $FONT_DIR
#   $3 - human-readable description used in progress messages
# Returns 0 when the font is available afterwards, 1 when the download failed.
download_font() {
    local url="$1"
    local file="$2"
    local desc="$3"
    local target="$FONT_DIR/$file"
    local partial="$target.part"
    local size

    # -s instead of -f: a zero-byte leftover is not a usable font.
    if [ -s "$target" ]; then
        echo "✓ $desc already exists"
        return 0
    fi

    echo "Downloading $desc..."
    # wget -O creates the output file even when the transfer fails, so write
    # to a temporary name and only move it into place after success. This
    # keeps a failed run from being mistaken for "already exists" next time.
    if wget -q "$url" -O "$partial" && [ -s "$partial" ]; then
        mv -f "$partial" "$target"
        size=$(du -h "$target" | cut -f1)
        echo "✓ $desc downloaded: $size"
    else
        rm -f "$partial"
        echo "✗ Failed to download $desc" >&2
        return 1
    fi
}

# NotoSansSC - Chinese (Simplified), also covers Japanese and basic CJK
download_font \
    "https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf" \
    "NotoSansSC-Regular.ttf" \
    "Noto Sans SC (Chinese/Japanese)"

# NotoSansKR - Korean
download_font \
    "https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansKR-VF.ttf" \
    "NotoSansKR-Regular.ttf" \
    "Noto Sans KR (Korean)"

# NotoSansThai - Thai
download_font \
    "https://github.com/notofonts/noto-fonts/raw/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf" \
    "NotoSansThai-Regular.ttf" \
    "Noto Sans Thai"

echo ""
echo "✅ Font setup complete!"
echo "Supported languages: Chinese (zh-CN/zh-TW), Japanese (ja), Korean (ko),"
echo "                     Thai (th), Russian (ru), Vietnamese (vi),"
echo "                     and all Latin-based languages (en, de, fr, es, etc.)"