""" Tool_OCR - PDF Generator Service Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint """ import logging import subprocess from pathlib import Path from typing import Optional, Dict from datetime import datetime from weasyprint import HTML, CSS from markdown import markdown from app.core.config import settings logger = logging.getLogger(__name__) class PDFGenerationError(Exception): """Exception raised when PDF generation fails""" pass class PDFGenerator: """ PDF generation service with layout preservation Supports two generation methods: 1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command 2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion """ # Default CSS template for layout preservation DEFAULT_CSS = """ @page { size: A4; margin: 2cm; } body { font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif; font-size: 11pt; line-height: 1.6; color: #333; } h1 { font-size: 24pt; font-weight: bold; margin-top: 0; margin-bottom: 12pt; color: #000; page-break-after: avoid; } h2 { font-size: 18pt; font-weight: bold; margin-top: 18pt; margin-bottom: 10pt; color: #000; page-break-after: avoid; } h3 { font-size: 14pt; font-weight: bold; margin-top: 14pt; margin-bottom: 8pt; color: #000; page-break-after: avoid; } p { margin: 0 0 10pt 0; text-align: justify; } table { width: 100%; border-collapse: collapse; margin: 12pt 0; page-break-inside: avoid; } table th { background-color: #f0f0f0; border: 1px solid #ccc; padding: 8pt; text-align: left; font-weight: bold; } table td { border: 1px solid #ccc; padding: 8pt; text-align: left; } code { font-family: "Courier New", monospace; font-size: 10pt; background-color: #f5f5f5; padding: 2pt 4pt; border-radius: 3px; } pre { background-color: #f5f5f5; border: 1px solid #ddd; border-radius: 5px; padding: 10pt; overflow-x: auto; page-break-inside: avoid; } pre code { background-color: transparent; padding: 0; } img { max-width: 100%; height: auto; display: block; margin: 12pt auto; page-break-inside: avoid; } blockquote { border-left: 4px solid #ddd; padding-left: 12pt; margin: 12pt 0; color: #666; font-style: italic; } ul, ol { margin: 10pt 0; padding-left: 20pt; } li { margin: 5pt 0; } hr { border: none; border-top: 1px solid #ccc; margin: 20pt 0; } .page-break { page-break-after: always; } """ # Academic paper template ACADEMIC_CSS = """ @page { size: A4; margin: 2.5cm; } body { font-family: "Times New Roman", "Noto Serif CJK SC", serif; font-size: 12pt; line-height: 1.8; color: #000; } h1 { font-size: 20pt; text-align: center; margin-bottom: 24pt; page-break-after: avoid; } h2 { font-size: 16pt; margin-top: 20pt; margin-bottom: 12pt; page-break-after: avoid; } h3 { font-size: 14pt; margin-top: 16pt; margin-bottom: 10pt; page-break-after: avoid; } p { text-indent: 2em; text-align: justify; margin: 0 0 12pt 0; } table { width: 100%; border-collapse: collapse; margin: 16pt auto; page-break-inside: avoid; } table caption { font-weight: bold; margin-bottom: 8pt; } """ # Business report template BUSINESS_CSS = """ @page { size: A4; margin: 2cm 2.5cm; } body { font-family: "Arial", "Noto Sans CJK SC", sans-serif; font-size: 11pt; line-height: 1.5; color: #333; } h1 { font-size: 22pt; color: #0066cc; border-bottom: 3px solid #0066cc; padding-bottom: 8pt; margin-bottom: 20pt; page-break-after: avoid; } h2 { font-size: 16pt; color: #0066cc; margin-top: 20pt; margin-bottom: 12pt; page-break-after: avoid; } table { width: 100%; border-collapse: collapse; margin: 16pt 0; } table th { background-color: #0066cc; color: white; padding: 10pt; font-weight: bold; } table td { border: 1px solid #ddd; padding: 10pt; } table tr:nth-child(even) { background-color: #f9f9f9; } """ def __init__(self): """Initialize PDF generator""" self.css_templates = { "default": self.DEFAULT_CSS, "academic": self.ACADEMIC_CSS, "business": self.BUSINESS_CSS, } def check_pandoc_available(self) -> bool: """ Check if Pandoc is installed and available Returns: bool: True if pandoc is available, False otherwise """ try: result = subprocess.run( ["pandoc", "--version"], capture_output=True, text=True, timeout=5 ) return result.returncode == 0 except (subprocess.TimeoutExpired, FileNotFoundError): logger.warning("Pandoc not found or timed out") return False def generate_pdf_pandoc( self, markdown_path: Path, output_path: Path, css_template: str = "default", metadata: Optional[Dict] = None ) -> Path: """ Generate PDF using Pandoc (preferred method) Args: markdown_path: Path to input Markdown file output_path: Path to output PDF file css_template: CSS template name or custom CSS string metadata: Optional metadata dict (title, author, date) Returns: Path: Path to generated PDF file Raises: PDFGenerationError: If PDF generation fails """ try: # Create temporary CSS file css_content = self.css_templates.get(css_template, css_template) css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css" css_file.write_text(css_content, encoding="utf-8") # Build pandoc command pandoc_cmd = [ "pandoc", str(markdown_path), "-o", str(output_path), "--pdf-engine=weasyprint", "--css", str(css_file), "--standalone", "--from=markdown+tables+fenced_code_blocks+footnotes", ] # Add metadata if provided if metadata: if metadata.get("title"): pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"]) if metadata.get("author"): pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"]) if metadata.get("date"): pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"]) # Execute pandoc logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}") result = subprocess.run( pandoc_cmd, capture_output=True, text=True, timeout=60 # 60 second timeout for large documents ) # Clean up temporary CSS file css_file.unlink(missing_ok=True) if result.returncode != 0: error_msg = f"Pandoc failed: {result.stderr}" logger.error(error_msg) raise PDFGenerationError(error_msg) if not output_path.exists(): raise PDFGenerationError(f"PDF file not created: {output_path}") logger.info(f"PDF generated successfully via Pandoc: {output_path}") return output_path except subprocess.TimeoutExpired: css_file.unlink(missing_ok=True) raise PDFGenerationError("Pandoc execution timed out") except Exception as e: css_file.unlink(missing_ok=True) raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}") def generate_pdf_weasyprint( self, markdown_path: Path, output_path: Path, css_template: str = "default", metadata: Optional[Dict] = None ) -> Path: """ Generate PDF using WeasyPrint directly (fallback method) Args: markdown_path: Path to input Markdown file output_path: Path to output PDF file css_template: CSS template name or custom CSS string metadata: Optional metadata dict (title, author, date) Returns: Path: Path to generated PDF file Raises: PDFGenerationError: If PDF generation fails """ try: # Read Markdown content markdown_content = markdown_path.read_text(encoding="utf-8") # Convert Markdown to HTML html_content = markdown( markdown_content, extensions=[ 'tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', ] ) # Wrap HTML with proper structure title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem full_html = f"""