""" Tool_OCR - PDF Generator Service Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint """ import logging import subprocess from pathlib import Path from typing import Optional, Dict from datetime import datetime from weasyprint import HTML, CSS from markdown import markdown from app.core.config import settings logger = logging.getLogger(__name__) class PDFGenerationError(Exception): """Exception raised when PDF generation fails""" pass class PDFGenerator: """ PDF generation service with layout preservation Supports two generation methods: 1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command 2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion """ # Default CSS template for layout preservation DEFAULT_CSS = """ @page { size: A4; margin: 2cm; } body { font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif; font-size: 11pt; line-height: 1.6; color: #333; } h1 { font-size: 24pt; font-weight: bold; margin-top: 0; margin-bottom: 12pt; color: #000; page-break-after: avoid; } h2 { font-size: 18pt; font-weight: bold; margin-top: 18pt; margin-bottom: 10pt; color: #000; page-break-after: avoid; } h3 { font-size: 14pt; font-weight: bold; margin-top: 14pt; margin-bottom: 8pt; color: #000; page-break-after: avoid; } p { margin: 0 0 10pt 0; text-align: justify; } table { width: 100%; border-collapse: collapse; margin: 12pt 0; page-break-inside: avoid; } table th { background-color: #f0f0f0; border: 1px solid #ccc; padding: 8pt; text-align: left; font-weight: bold; } table td { border: 1px solid #ccc; padding: 8pt; text-align: left; } code { font-family: "Courier New", monospace; font-size: 10pt; background-color: #f5f5f5; padding: 2pt 4pt; border-radius: 3px; } pre { background-color: #f5f5f5; border: 1px solid #ddd; border-radius: 5px; padding: 10pt; overflow-x: auto; page-break-inside: avoid; } pre code { background-color: transparent; padding: 0; } img { max-width: 100%; height: auto; display: block; margin: 12pt auto; page-break-inside: avoid; } blockquote { border-left: 4px solid #ddd; padding-left: 12pt; margin: 12pt 0; color: #666; font-style: italic; } ul, ol { margin: 10pt 0; padding-left: 20pt; } li { margin: 5pt 0; } hr { border: none; border-top: 1px solid #ccc; margin: 20pt 0; } .page-break { page-break-after: always; } """ # Academic paper template ACADEMIC_CSS = """ @page { size: A4; margin: 2.5cm; } body { font-family: "Times New Roman", "Noto Serif CJK SC", serif; font-size: 12pt; line-height: 1.8; color: #000; } h1 { font-size: 20pt; text-align: center; margin-bottom: 24pt; page-break-after: avoid; } h2 { font-size: 16pt; margin-top: 20pt; margin-bottom: 12pt; page-break-after: avoid; } h3 { font-size: 14pt; margin-top: 16pt; margin-bottom: 10pt; page-break-after: avoid; } p { text-indent: 2em; text-align: justify; margin: 0 0 12pt 0; } table { width: 100%; border-collapse: collapse; margin: 16pt auto; page-break-inside: avoid; } table caption { font-weight: bold; margin-bottom: 8pt; } """ # Business report template BUSINESS_CSS = """ @page { size: A4; margin: 2cm 2.5cm; } body { font-family: "Arial", "Noto Sans CJK SC", sans-serif; font-size: 11pt; line-height: 1.5; color: #333; } h1 { font-size: 22pt; color: #0066cc; border-bottom: 3px solid #0066cc; padding-bottom: 8pt; margin-bottom: 20pt; page-break-after: avoid; } h2 { font-size: 16pt; color: #0066cc; margin-top: 20pt; margin-bottom: 12pt; page-break-after: avoid; } table { width: 100%; border-collapse: collapse; margin: 16pt 0; } table th { background-color: #0066cc; color: white; padding: 10pt; font-weight: bold; } table td { border: 1px solid #ddd; padding: 10pt; } table tr:nth-child(even) { background-color: #f9f9f9; } """ def __init__(self): """Initialize PDF generator""" self.css_templates = { "default": self.DEFAULT_CSS, "academic": self.ACADEMIC_CSS, "business": self.BUSINESS_CSS, } def check_pandoc_available(self) -> bool: """ Check if Pandoc is installed and available Returns: bool: True if pandoc is available, False otherwise """ try: result = subprocess.run( ["pandoc", "--version"], capture_output=True, text=True, timeout=5 ) return result.returncode == 0 except (subprocess.TimeoutExpired, FileNotFoundError): logger.warning("Pandoc not found or timed out") return False def generate_pdf_pandoc( self, markdown_path: Path, output_path: Path, css_template: str = "default", metadata: Optional[Dict] = None ) -> Path: """ Generate PDF using Pandoc (preferred method) Args: markdown_path: Path to input Markdown file output_path: Path to output PDF file css_template: CSS template name or custom CSS string metadata: Optional metadata dict (title, author, date) Returns: Path: Path to generated PDF file Raises: PDFGenerationError: If PDF generation fails """ try: # Create temporary CSS file css_content = self.css_templates.get(css_template, css_template) css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css" css_file.write_text(css_content, encoding="utf-8") # Build pandoc command pandoc_cmd = [ "pandoc", str(markdown_path), "-o", str(output_path), "--pdf-engine=weasyprint", "--css", str(css_file), "--standalone", "--from=markdown+tables+fenced_code_blocks+footnotes", ] # Add metadata if provided if metadata: if metadata.get("title"): pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"]) if metadata.get("author"): pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"]) if metadata.get("date"): pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"]) # Execute pandoc logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}") result = subprocess.run( pandoc_cmd, capture_output=True, text=True, timeout=60 # 60 second timeout for large documents ) # Clean up temporary CSS file css_file.unlink(missing_ok=True) if result.returncode != 0: error_msg = f"Pandoc failed: {result.stderr}" logger.error(error_msg) raise PDFGenerationError(error_msg) if not output_path.exists(): raise PDFGenerationError(f"PDF file not created: {output_path}") logger.info(f"PDF generated successfully via Pandoc: {output_path}") return output_path except subprocess.TimeoutExpired: css_file.unlink(missing_ok=True) raise PDFGenerationError("Pandoc execution timed out") except Exception as e: css_file.unlink(missing_ok=True) raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}") def generate_pdf_weasyprint( self, markdown_path: Path, output_path: Path, css_template: str = "default", metadata: Optional[Dict] = None ) -> Path: """ Generate PDF using WeasyPrint directly (fallback method) Args: markdown_path: Path to input Markdown file output_path: Path to output PDF file css_template: CSS template name or custom CSS string metadata: Optional metadata dict (title, author, date) Returns: Path: Path to generated PDF file Raises: PDFGenerationError: If PDF generation fails """ try: # Read Markdown content markdown_content = markdown_path.read_text(encoding="utf-8") # Convert Markdown to HTML html_content = markdown( markdown_content, extensions=[ 'tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', ] ) # Wrap HTML with proper structure title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem full_html = f""" {title} {html_content} """ # Get CSS content css_content = self.css_templates.get(css_template, css_template) # Generate PDF logger.info(f"Generating PDF via WeasyPrint: {output_path}") html = HTML(string=full_html, base_url=str(markdown_path.parent)) css = CSS(string=css_content) html.write_pdf(str(output_path), stylesheets=[css]) if not output_path.exists(): raise PDFGenerationError(f"PDF file not created: {output_path}") logger.info(f"PDF generated successfully via WeasyPrint: {output_path}") return output_path except Exception as e: raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(e)}") def generate_pdf( self, markdown_path: Path, output_path: Path, css_template: str = "default", metadata: Optional[Dict] = None, prefer_pandoc: bool = True ) -> Path: """ Generate PDF from Markdown with automatic fallback Args: markdown_path: Path to input Markdown file output_path: Path to output PDF file css_template: CSS template name ("default", "academic", "business") or custom CSS metadata: Optional metadata dict (title, author, date) prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint Returns: Path: Path to generated PDF file Raises: PDFGenerationError: If both methods fail """ if not markdown_path.exists(): raise PDFGenerationError(f"Markdown file not found: {markdown_path}") # Ensure output directory exists output_path.parent.mkdir(parents=True, exist_ok=True) # Try Pandoc first if preferred and available if prefer_pandoc and self.check_pandoc_available(): try: return self.generate_pdf_pandoc(markdown_path, output_path, css_template, metadata) except PDFGenerationError as e: logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}") # Fall through to WeasyPrint # Use WeasyPrint (fallback or direct) return self.generate_pdf_weasyprint(markdown_path, output_path, css_template, metadata) def get_available_templates(self) -> Dict[str, str]: """ Get list of available CSS templates Returns: Dict mapping template names to descriptions """ return { "default": "通用排版模板,適合大多數文檔", "academic": "學術論文模板,適合研究報告", "business": "商業報告模板,適合企業文檔", } def save_custom_template(self, template_name: str, css_content: str) -> None: """ Save a custom CSS template Args: template_name: Template name css_content: CSS content """ self.css_templates[template_name] = css_content logger.info(f"Custom CSS template saved: {template_name}")