Files
OCR/backend/app/services/pdf_font_manager.py
egg eff9b0bcd5 feat: refactor dual-track architecture (Phase 1-5)
## Backend Changes
- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)

- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes
- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking

- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage

- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing
- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-07 07:18:27 +08:00

313 lines
9.9 KiB
Python

"""
PDF Font Manager - Handles font loading, registration, and fallback.
This module provides unified font management for PDF generation,
including CJK font support and font fallback mechanisms.
"""
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
logger = logging.getLogger(__name__)
# ============================================================================
# Configuration
# ============================================================================
@dataclass
class FontConfig:
    """Configuration for font management.

    All fields have defaults, so ``FontConfig()`` is a usable configuration.
    ``chinese_font_path`` may be given as a ``Path``, a string, or any
    path-like object; it is normalized to ``Path`` after construction so that
    consumers can rely on ``Path`` methods.
    """

    # Primary fonts
    # Name the primary (TrueType) CJK font is registered under.
    chinese_font_name: str = "NotoSansSC"
    # Location of the primary font file; None means "not configured here".
    chinese_font_path: Optional[Path] = None

    # Fallback fonts (built-in)
    fallback_font_name: str = "Helvetica"
    fallback_cjk_font_name: str = "HeiseiMin-W3"  # Built-in ReportLab CJK

    # Font sizes
    default_font_size: int = 10
    min_font_size: int = 6
    max_font_size: int = 14

    # Font registration options
    auto_register: bool = True
    enable_cjk_fallback: bool = True

    def __post_init__(self) -> None:
        """Coerce ``chinese_font_path`` to ``Path`` when given as str/PathLike."""
        raw_path = self.chinese_font_path
        if raw_path is not None and not isinstance(raw_path, Path):
            self.chinese_font_path = Path(raw_path)
# ============================================================================
# Font Manager
# ============================================================================
class FontManager:
    """
    Manages font registration and selection for PDF generation.

    Features:
    - Font registration on first construction (when ``auto_register`` is set)
    - CJK (Chinese/Japanese/Korean) font support
    - Automatic fallback to built-in fonts
    - Font caching to avoid duplicate registration

    The class is a process-wide singleton: every ``FontManager(...)`` call
    returns the same object, and only the first call's ``config`` is used.
    """

    _instance: Optional["FontManager"] = None
    # Custom fonts registered through this class, keyed by registered name.
    # Stored on the class so the bookkeeping is shared with the singleton.
    _registered_fonts: Dict[str, Path] = {}

    # Inclusive code-point ranges treated as "needs a CJK-capable font".
    _CJK_RANGES: Tuple[Tuple[int, int], ...] = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        (0x3000, 0x303F),    # CJK Symbols and Punctuation
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0xAC00, 0xD7AF),    # Korean Hangul syllables
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0xFF00, 0xFFEF),    # Halfwidth and Fullwidth Forms
    )

    def __new__(cls, *args, **kwargs):
        """Singleton pattern to avoid duplicate font registration."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # __init__ runs on every FontManager(...) call; this flag lets it
            # skip re-initialization of the shared instance.
            cls._instance._initialized = False
        return cls._instance

    def __init__(self, config: Optional["FontConfig"] = None):
        """
        Initialize FontManager.

        Args:
            config: FontConfig instance (uses defaults if None). Ignored when
                the singleton has already been initialized.
        """
        if self._initialized:
            if config is not None:
                # Make the "first config wins" rule visible instead of silent.
                logger.debug("FontManager already initialized; ignoring new config")
            return

        self.config = config or FontConfig()
        self._primary_font_registered = False
        self._cjk_fallback_available = False

        # Auto-register fonts if enabled
        if self.config.auto_register:
            self._register_fonts()

        self._initialized = True

    @property
    def primary_font_name(self) -> str:
        """Get the primary font name to use."""
        if self._primary_font_registered:
            return self.config.chinese_font_name
        return self.config.fallback_font_name

    @property
    def is_cjk_enabled(self) -> bool:
        """Check if CJK fonts are available."""
        return self._primary_font_registered or self._cjk_fallback_available

    @classmethod
    def reset(cls):
        """
        Reset singleton instance (for testing).

        Only this class's own state is cleared; no call is made to
        ReportLab's font registry here.
        """
        cls._instance = None
        cls._registered_fonts = {}

    def get_font_for_text(self, text: str) -> str:
        """
        Get appropriate font name for given text.

        Args:
            text: Text to render

        Returns:
            Font name suitable for the text content. For CJK text this is the
            primary font if registered, else the CJK fallback if available;
            in every other case it is ``primary_font_name``.
        """
        if self._contains_cjk(text):
            if self._primary_font_registered:
                return self.config.chinese_font_name
            if self._cjk_fallback_available:
                return self.config.fallback_cjk_font_name
        return self.primary_font_name

    def get_font_size(
        self,
        text: str,
        available_width: float,
        available_height: float,
        pdf_canvas=None
    ) -> int:
        """
        Calculate optimal font size for text to fit within bounds.

        Sizes are tried from ``max_font_size`` down to ``min_font_size``.

        Args:
            text: Text to render (falsy values are treated as empty text)
            available_width: Maximum width available
            available_height: Maximum height available
            pdf_canvas: Optional canvas for precise measurement; must provide
                ``stringWidth(text, font_name, size)``

        Returns:
            Largest font size that fits within bounds, or ``min_font_size``
            when none fits.
        """
        text = text or ""
        font_name = self.get_font_for_text(text)

        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            if pdf_canvas is not None:
                # Precise measurement with canvas
                text_width = pdf_canvas.stringWidth(text, font_name, size)
            else:
                # Approximate measurement: 0.6 em per character.
                # NOTE(review): full-width CJK glyphs are likely wider than
                # this estimate; pass a canvas when precision matters.
                text_width = len(text) * size * 0.6

            text_height = size * 1.2  # Line height

            if text_width <= available_width and text_height <= available_height:
                return size

        return self.config.min_font_size

    def register_font(
        self,
        font_name: str,
        font_path: Path,
        force: bool = False
    ) -> bool:
        """
        Register a custom font.

        Args:
            font_name: Name to register font under
            font_path: Path to TTF font file (str is accepted and converted)
            force: Force re-registration if already registered

        Returns:
            True if registration successful (or the name was already
            registered and ``force`` is False); False if the file is missing
            or registration raised.
        """
        if font_name in self._registered_fonts and not force:
            logger.debug(f"Font {font_name} already registered")
            return True

        try:
            font_path = Path(font_path)
            if not font_path.exists():
                logger.error(f"Font file not found: {font_path}")
                return False

            pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
            self._registered_fonts[font_name] = font_path
            logger.info(f"Font registered: {font_name} from {font_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to register font {font_name}: {e}")
            return False

    def get_registered_fonts(self) -> List[str]:
        """Get list of registered custom font names."""
        return list(self._registered_fonts)

    # =========================================================================
    # Private Methods
    # =========================================================================

    def _register_fonts(self):
        """Register configured fonts."""
        # Register primary Chinese font. Called even when no path is
        # configured: _register_chinese_font falls back to the application
        # settings in that case, which a truthiness guard here would make
        # unreachable.
        self._register_chinese_font()

        # Setup CJK fallback
        if self.config.enable_cjk_fallback:
            self._setup_cjk_fallback()

    def _register_chinese_font(self):
        """
        Register the primary Chinese font.

        Uses ``config.chinese_font_path`` when set, otherwise tries
        ``settings.chinese_font_path`` from ``app.core.config``. Relative
        paths are resolved against the directory four levels above this
        file. On any failure the method logs and returns without raising,
        leaving the primary font unregistered.
        """
        font_path = self.config.chinese_font_path

        if font_path is None:
            # Try to load from settings
            try:
                from app.core.config import settings
                font_path = Path(settings.chinese_font_path)
            except Exception as e:
                logger.debug(f"Could not load font path from settings: {e}")
                return

        # Accept str/PathLike configs as well as Path.
        font_path = Path(font_path)

        # Resolve relative path
        if not font_path.is_absolute():
            # Try project root
            project_root = Path(__file__).resolve().parent.parent.parent.parent
            font_path = project_root / font_path

        if not font_path.exists():
            logger.warning(f"Chinese font not found at {font_path}")
            return

        try:
            pdfmetrics.registerFont(TTFont(self.config.chinese_font_name, str(font_path)))
            self._registered_fonts[self.config.chinese_font_name] = font_path
            self._primary_font_registered = True
            logger.info(f"Chinese font registered: {self.config.chinese_font_name}")
        except Exception as e:
            logger.error(f"Failed to register Chinese font: {e}")

    def _setup_cjk_fallback(self):
        """
        Setup CJK fallback using built-in fonts.

        Registers ``config.fallback_cjk_font_name`` as a ReportLab CID font,
        so the registered font is the same one ``get_font_for_text`` returns.
        Failures are logged and leave the fallback marked unavailable.
        """
        try:
            # ReportLab includes CID fonts for CJK
            from reportlab.pdfbase.cidfonts import UnicodeCIDFont
        except ImportError:
            logger.debug("CID fonts not available for CJK fallback")
            return

        cjk_font_name = self.config.fallback_cjk_font_name
        try:
            pdfmetrics.registerFont(UnicodeCIDFont(cjk_font_name))
        except Exception as e:
            # Best-effort: keep going without a CJK fallback, but say why.
            logger.warning(f"Could not register CJK fallback font {cjk_font_name}: {e}")
            return

        self._cjk_fallback_available = True
        logger.debug(f"CJK fallback font available: {cjk_font_name}")

    def _contains_cjk(self, text: str) -> bool:
        """
        Check if text contains CJK characters.

        Args:
            text: Text to check

        Returns:
            True if any character falls in one of the ``_CJK_RANGES``
            code-point ranges; False for empty/None text.
        """
        if not text:
            return False

        cjk_ranges = self._CJK_RANGES
        for char in text:
            code = ord(char)
            if any(low <= code <= high for low, high in cjk_ranges):
                return True

        return False
# ============================================================================
# Convenience Functions
# ============================================================================
_default_manager: Optional[FontManager] = None


def get_font_manager() -> FontManager:
    """
    Get the default FontManager instance.

    The instance is re-resolved through ``FontManager()`` on every call
    rather than served from the module-level cache alone: ``FontManager`` is
    itself a singleton, and ``FontManager.reset()`` would otherwise leave
    this function handing out the discarded instance.

    Returns:
        The current FontManager singleton.
    """
    global _default_manager
    _default_manager = FontManager()
    return _default_manager
def register_font(font_name: str, font_path: Path, force: bool = False) -> bool:
    """
    Register a font using the default manager.

    Args:
        font_name: Name to register font under
        font_path: Path to TTF font file
        force: Force re-registration if already registered

    Returns:
        The result of ``FontManager.register_font`` on the default manager.
    """
    return get_font_manager().register_font(font_name, font_path, force=force)
def get_font_for_text(text: str) -> str:
    """
    Pick a font name for ``text`` via the default manager.

    Args:
        text: Text to render

    Returns:
        Whatever ``get_font_for_text`` on the default FontManager returns.
    """
    manager = get_font_manager()
    return manager.get_font_for_text(text)