Files
OCR/backend/app/services/pdf_generator.py
beabigegg da700721fa first
2025-11-12 22:53:17 +08:00

508 lines
13 KiB
Python

"""
Tool_OCR - PDF Generator Service
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
"""
import html
import logging
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict

from markdown import markdown
from weasyprint import HTML, CSS

from app.core.config import settings
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class PDFGenerationError(Exception):
    """Signals that a Markdown-to-PDF conversion could not be completed."""
class PDFGenerator:
    """
    PDF generation service with layout preservation.

    Supports two generation methods:
    1. Pandoc (preferred): Markdown → HTML → PDF via the ``pandoc`` command,
       with WeasyPrint selected as pandoc's PDF engine.
    2. WeasyPrint (fallback): Markdown is converted to HTML in-process with
       the ``markdown`` package and rendered by WeasyPrint directly.

    CSS templates are stored per instance in ``css_templates``. Wherever a
    ``css_template`` argument is accepted, a value that is not a registered
    template name is used verbatim as CSS source.
    """

    # Default CSS template for layout preservation
    DEFAULT_CSS = """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
font-size: 11pt;
line-height: 1.6;
color: #333;
}
h1 {
font-size: 24pt;
font-weight: bold;
margin-top: 0;
margin-bottom: 12pt;
color: #000;
page-break-after: avoid;
}
h2 {
font-size: 18pt;
font-weight: bold;
margin-top: 18pt;
margin-bottom: 10pt;
color: #000;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
font-weight: bold;
margin-top: 14pt;
margin-bottom: 8pt;
color: #000;
page-break-after: avoid;
}
p {
margin: 0 0 10pt 0;
text-align: justify;
}
table {
width: 100%;
border-collapse: collapse;
margin: 12pt 0;
page-break-inside: avoid;
}
table th {
background-color: #f0f0f0;
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
font-weight: bold;
}
table td {
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
}
code {
font-family: "Courier New", monospace;
font-size: 10pt;
background-color: #f5f5f5;
padding: 2pt 4pt;
border-radius: 3px;
}
pre {
background-color: #f5f5f5;
border: 1px solid #ddd;
border-radius: 5px;
padding: 10pt;
overflow-x: auto;
page-break-inside: avoid;
}
pre code {
background-color: transparent;
padding: 0;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 12pt auto;
page-break-inside: avoid;
}
blockquote {
border-left: 4px solid #ddd;
padding-left: 12pt;
margin: 12pt 0;
color: #666;
font-style: italic;
}
ul, ol {
margin: 10pt 0;
padding-left: 20pt;
}
li {
margin: 5pt 0;
}
hr {
border: none;
border-top: 1px solid #ccc;
margin: 20pt 0;
}
.page-break {
page-break-after: always;
}
"""

    # Academic paper template
    ACADEMIC_CSS = """
@page {
size: A4;
margin: 2.5cm;
}
body {
font-family: "Times New Roman", "Noto Serif CJK SC", serif;
font-size: 12pt;
line-height: 1.8;
color: #000;
}
h1 {
font-size: 20pt;
text-align: center;
margin-bottom: 24pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
margin-top: 16pt;
margin-bottom: 10pt;
page-break-after: avoid;
}
p {
text-indent: 2em;
text-align: justify;
margin: 0 0 12pt 0;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt auto;
page-break-inside: avoid;
}
table caption {
font-weight: bold;
margin-bottom: 8pt;
}
"""

    # Business report template
    BUSINESS_CSS = """
@page {
size: A4;
margin: 2cm 2.5cm;
}
body {
font-family: "Arial", "Noto Sans CJK SC", sans-serif;
font-size: 11pt;
line-height: 1.5;
color: #333;
}
h1 {
font-size: 22pt;
color: #0066cc;
border-bottom: 3px solid #0066cc;
padding-bottom: 8pt;
margin-bottom: 20pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
color: #0066cc;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt 0;
}
table th {
background-color: #0066cc;
color: white;
padding: 10pt;
font-weight: bold;
}
table td {
border: 1px solid #ddd;
padding: 10pt;
}
table tr:nth-child(even) {
background-color: #f9f9f9;
}
"""

    def __init__(self):
        """Initialize the generator with the three built-in CSS templates."""
        # A fresh dict per instance, so save_custom_template() on one
        # generator does not leak into another.
        self.css_templates: Dict[str, str] = {
            "default": self.DEFAULT_CSS,
            "academic": self.ACADEMIC_CSS,
            "business": self.BUSINESS_CSS,
        }

    def _resolve_css(self, css_template: str) -> str:
        """Return the CSS for a registered template name, or the argument itself."""
        # Unknown names are deliberately treated as literal CSS source.
        return self.css_templates.get(css_template, css_template)

    @staticmethod
    def _build_pandoc_command(
        markdown_path: Path,
        output_path: Path,
        css_file: Path,
        metadata: Optional[Dict] = None
    ) -> list:
        """Assemble the pandoc argument list (no shell involved)."""
        command = [
            "pandoc",
            str(markdown_path),
            "-o", str(output_path),
            "--pdf-engine=weasyprint",
            "--css", str(css_file),
            "--standalone",
            "--from=markdown+tables+fenced_code_blocks+footnotes",
        ]
        # Only forward metadata keys that carry a truthy value.
        for key in ("title", "author", "date"):
            value = (metadata or {}).get(key)
            if value:
                command.extend(["--metadata", f"{key}={value}"])
        return command

    @staticmethod
    def _build_html_document(
        body_html: str,
        title: str,
        author: Optional[str] = None
    ) -> str:
        """Wrap an HTML fragment in a full document with escaped head metadata."""
        # Title and author come from caller-supplied metadata or a file name,
        # so they must be escaped before being placed into markup.
        safe_title = html.escape(str(title))
        author_tag = ""
        if author:
            author_tag = f'<meta name="author" content="{html.escape(str(author))}">\n'
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{safe_title}</title>
{author_tag}</head>
<body>
{body_html}
</body>
</html>
"""

    def check_pandoc_available(self) -> bool:
        """
        Check if Pandoc is installed and runnable.

        Runs ``pandoc --version`` with a 5 second timeout.

        Returns:
            bool: True if the command exits with status 0, False if it exits
            non-zero, cannot be started, or times out.
        """
        try:
            result = subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                text=True,
                timeout=5
            )
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers FileNotFoundError as well as e.g. PermissionError
            # for a binary that exists but cannot be executed.
            logger.warning("Pandoc not found or timed out")
            return False
        return result.returncode == 0

    def generate_pdf_pandoc(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using Pandoc (preferred method).

        The resolved CSS is written to a uniquely named temporary file next to
        the output file, passed to pandoc via ``--css``, and removed again
        whether or not pandoc succeeds.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If pandoc exits non-zero, times out (60 s),
                produces no output file, or any other error occurs.
        """
        css_file: Optional[Path] = None
        try:
            css_content = self._resolve_css(css_template)
            # delete=False: pandoc must be able to open the file after we
            # close it; removal happens in the finally clause below.
            with tempfile.NamedTemporaryFile(
                mode="w",
                encoding="utf-8",
                prefix="temp_",
                suffix=".css",
                dir=str(output_path.parent),
                delete=False,
            ) as handle:
                css_file = Path(handle.name)
                handle.write(css_content)

            pandoc_cmd = self._build_pandoc_command(
                markdown_path, output_path, css_file, metadata
            )

            logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}")
            result = subprocess.run(
                pandoc_cmd,
                capture_output=True,
                text=True,
                timeout=60  # 60 second timeout for large documents
            )

            if result.returncode != 0:
                error_msg = f"Pandoc failed: {result.stderr}"
                logger.error(error_msg)
                raise PDFGenerationError(error_msg)
            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")
        except subprocess.TimeoutExpired as exc:
            raise PDFGenerationError("Pandoc execution timed out") from exc
        except PDFGenerationError:
            # Already the right type and message; do not wrap it a second time.
            raise
        except Exception as exc:
            raise PDFGenerationError(f"Pandoc PDF generation failed: {str(exc)}") from exc
        finally:
            # css_file stays None if the failure happened before it was created.
            if css_file is not None:
                css_file.unlink(missing_ok=True)

        logger.info(f"PDF generated successfully via Pandoc: {output_path}")
        return output_path

    def generate_pdf_weasyprint(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using WeasyPrint directly (fallback method).

        The Markdown file is read as UTF-8, converted to HTML with the
        ``markdown`` package, wrapped in a minimal HTML document and rendered
        with the resolved CSS. Relative resources are resolved against the
        Markdown file's directory.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict. ``title`` (falling back to the
                Markdown file stem when missing or empty) and ``author`` are
                written into the HTML head; ``date`` is not used here.

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If reading, converting or rendering fails, or
                no output file exists afterwards.
        """
        try:
            markdown_content = markdown_path.read_text(encoding="utf-8")

            html_content = markdown(
                markdown_content,
                extensions=[
                    'tables',
                    'fenced_code',
                    'codehilite',
                    'nl2br',
                    'sane_lists',
                ]
            )

            meta = metadata or {}
            # Falsy titles (None, "") fall back to the file stem, matching how
            # the Pandoc path ignores falsy metadata values.
            title = meta.get("title") or markdown_path.stem
            full_html = self._build_html_document(
                html_content, title, meta.get("author")
            )

            css_content = self._resolve_css(css_template)

            logger.info(f"Generating PDF via WeasyPrint: {output_path}")
            document = HTML(string=full_html, base_url=str(markdown_path.parent))
            stylesheet = CSS(string=css_content)
            document.write_pdf(str(output_path), stylesheets=[stylesheet])

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")
        except PDFGenerationError:
            # Already the right type and message; do not wrap it a second time.
            raise
        except Exception as exc:
            raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(exc)}") from exc

        logger.info(f"PDF generated successfully via WeasyPrint: {output_path}")
        return output_path

    def generate_pdf(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None,
        prefer_pandoc: bool = True
    ) -> Path:
        """
        Generate PDF from Markdown with automatic fallback.

        Creates the output directory if needed. When ``prefer_pandoc`` is true
        and pandoc is available, pandoc is tried first; if it raises
        PDFGenerationError the failure is logged and WeasyPrint is used.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name ("default", "academic", "business") or custom CSS
            metadata: Optional metadata dict (title, author, date)
            prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If the Markdown file does not exist, or the
                WeasyPrint conversion fails (after Pandoc, when attempted,
                has also failed).
        """
        if not markdown_path.exists():
            raise PDFGenerationError(f"Markdown file not found: {markdown_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Try Pandoc first if preferred and available
        if prefer_pandoc and self.check_pandoc_available():
            try:
                return self.generate_pdf_pandoc(
                    markdown_path, output_path, css_template, metadata
                )
            except PDFGenerationError as e:
                # Fall through to WeasyPrint
                logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}")

        # Use WeasyPrint (fallback or direct)
        return self.generate_pdf_weasyprint(
            markdown_path, output_path, css_template, metadata
        )

    def get_available_templates(self) -> Dict[str, str]:
        """
        Get the built-in CSS templates and their descriptions.

        The mapping is fixed: templates added through save_custom_template()
        are not included.

        Returns:
            Dict mapping template names to descriptions
        """
        return {
            "default": "通用排版模板,適合大多數文檔",
            "academic": "學術論文模板,適合研究報告",
            "business": "商業報告模板,適合企業文檔",
        }

    def save_custom_template(self, template_name: str, css_content: str) -> None:
        """
        Register a custom CSS template on this generator instance.

        An existing template with the same name (including a built-in one) is
        replaced. The template is kept in memory only.

        Args:
            template_name: Template name
            css_content: CSS content
        """
        self.css_templates[template_name] = css_content
        logger.info(f"Custom CSS template saved: {template_name}")