first
This commit is contained in:
507
backend/app/services/pdf_generator.py
Normal file
507
backend/app/services/pdf_generator.py
Normal file
@@ -0,0 +1,507 @@
|
||||
"""
|
||||
Tool_OCR - PDF Generator Service
|
||||
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
|
||||
"""
|
||||
|
||||
import html
import logging
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict

from markdown import markdown
from weasyprint import HTML, CSS

from app.core.config import settings
|
||||
|
||||
|
||||
# Module-level logger; its name follows this module's import path.
logger = logging.getLogger(__name__)


class PDFGenerationError(Exception):
    """Signals that a Markdown-to-PDF conversion could not be completed."""
|
||||
|
||||
|
||||
class PDFGenerator:
    """
    PDF generation service with layout preservation.

    Supports two generation methods:
    1. Pandoc (preferred): Markdown → HTML → PDF via the ``pandoc`` command,
       using WeasyPrint as the PDF engine.
    2. WeasyPrint (fallback): direct Python-based HTML → PDF conversion.

    Every generation method accepts ``css_template`` either as the name of a
    registered template or as a raw CSS string; a value that is not a
    registered name is used verbatim as CSS.
    """

    # Default CSS template for layout preservation
    DEFAULT_CSS = """
    @page {
        size: A4;
        margin: 2cm;
    }

    body {
        font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
        font-size: 11pt;
        line-height: 1.6;
        color: #333;
    }

    h1 {
        font-size: 24pt;
        font-weight: bold;
        margin-top: 0;
        margin-bottom: 12pt;
        color: #000;
        page-break-after: avoid;
    }

    h2 {
        font-size: 18pt;
        font-weight: bold;
        margin-top: 18pt;
        margin-bottom: 10pt;
        color: #000;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        font-weight: bold;
        margin-top: 14pt;
        margin-bottom: 8pt;
        color: #000;
        page-break-after: avoid;
    }

    p {
        margin: 0 0 10pt 0;
        text-align: justify;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 12pt 0;
        page-break-inside: avoid;
    }

    table th {
        background-color: #f0f0f0;
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
    }

    code {
        font-family: "Courier New", monospace;
        font-size: 10pt;
        background-color: #f5f5f5;
        padding: 2pt 4pt;
        border-radius: 3px;
    }

    pre {
        background-color: #f5f5f5;
        border: 1px solid #ddd;
        border-radius: 5px;
        padding: 10pt;
        overflow-x: auto;
        page-break-inside: avoid;
    }

    pre code {
        background-color: transparent;
        padding: 0;
    }

    img {
        max-width: 100%;
        height: auto;
        display: block;
        margin: 12pt auto;
        page-break-inside: avoid;
    }

    blockquote {
        border-left: 4px solid #ddd;
        padding-left: 12pt;
        margin: 12pt 0;
        color: #666;
        font-style: italic;
    }

    ul, ol {
        margin: 10pt 0;
        padding-left: 20pt;
    }

    li {
        margin: 5pt 0;
    }

    hr {
        border: none;
        border-top: 1px solid #ccc;
        margin: 20pt 0;
    }

    .page-break {
        page-break-after: always;
    }
    """

    # Academic paper template
    ACADEMIC_CSS = """
    @page {
        size: A4;
        margin: 2.5cm;
    }

    body {
        font-family: "Times New Roman", "Noto Serif CJK SC", serif;
        font-size: 12pt;
        line-height: 1.8;
        color: #000;
    }

    h1 {
        font-size: 20pt;
        text-align: center;
        margin-bottom: 24pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        margin-top: 16pt;
        margin-bottom: 10pt;
        page-break-after: avoid;
    }

    p {
        text-indent: 2em;
        text-align: justify;
        margin: 0 0 12pt 0;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt auto;
        page-break-inside: avoid;
    }

    table caption {
        font-weight: bold;
        margin-bottom: 8pt;
    }
    """

    # Business report template
    BUSINESS_CSS = """
    @page {
        size: A4;
        margin: 2cm 2.5cm;
    }

    body {
        font-family: "Arial", "Noto Sans CJK SC", sans-serif;
        font-size: 11pt;
        line-height: 1.5;
        color: #333;
    }

    h1 {
        font-size: 22pt;
        color: #0066cc;
        border-bottom: 3px solid #0066cc;
        padding-bottom: 8pt;
        margin-bottom: 20pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        color: #0066cc;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt 0;
    }

    table th {
        background-color: #0066cc;
        color: white;
        padding: 10pt;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ddd;
        padding: 10pt;
    }

    table tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    """

    def __init__(self):
        """Initialize the generator with the three built-in CSS templates."""
        # A fresh dict per instance, so save_custom_template() on one
        # generator does not leak into another.
        self.css_templates: Dict[str, str] = {
            "default": self.DEFAULT_CSS,
            "academic": self.ACADEMIC_CSS,
            "business": self.BUSINESS_CSS,
        }

    def check_pandoc_available(self) -> bool:
        """
        Check if Pandoc is installed and available.

        Runs ``pandoc --version`` with a 5 second timeout.

        Returns:
            bool: True if the command exits with status 0, False if it exits
            non-zero, cannot be launched, or times out.
        """
        try:
            result = subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers a missing binary (FileNotFoundError) as well as
            # one that exists but cannot be executed (PermissionError), so the
            # caller can still fall back to WeasyPrint.
            logger.warning("Pandoc not found or timed out")
            return False
        return result.returncode == 0

    @staticmethod
    def _build_pandoc_command(
        markdown_path: Path,
        output_path: Path,
        css_file: Path,
        metadata: Optional[Dict] = None,
    ) -> list:
        """Assemble the pandoc argv list for a Markdown → PDF conversion."""
        command = [
            "pandoc",
            str(markdown_path),
            "-o", str(output_path),
            "--pdf-engine=weasyprint",
            "--css", str(css_file),
            "--standalone",
            "--from=markdown+tables+fenced_code_blocks+footnotes",
        ]
        # Only non-empty metadata values are forwarded to pandoc.
        for key in ("title", "author", "date"):
            value = (metadata or {}).get(key)
            if value:
                command.extend(["--metadata", f"{key}={value}"])
        return command

    @staticmethod
    def _build_html_document(title: str, body_html: str) -> str:
        """Wrap rendered body HTML in a minimal standalone HTML document."""
        # The title comes from caller metadata or a file name, so it must be
        # escaped; body_html is already markup and is inserted as-is.
        safe_title = html.escape(title)
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{safe_title}</title>
</head>
<body>
{body_html}
</body>
</html>
"""

    def generate_pdf_pandoc(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using Pandoc (preferred method).

        The stylesheet is written to a temporary ``temp_<timestamp>.css`` file
        next to the output file and removed again whether or not pandoc
        succeeds.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If pandoc cannot be run, times out (60 s),
                exits non-zero, or does not produce the output file
        """
        # Stays None until the temporary file path is known, so cleanup is
        # safe even when an earlier step fails.
        css_file: Optional[Path] = None
        try:
            css_content = self.css_templates.get(css_template, css_template)
            css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css"
            css_file.write_text(css_content, encoding="utf-8")

            pandoc_cmd = self._build_pandoc_command(
                markdown_path, output_path, css_file, metadata
            )

            logger.info("Executing pandoc: %s", " ".join(pandoc_cmd))
            result = subprocess.run(
                pandoc_cmd,
                capture_output=True,
                text=True,
                timeout=60,  # 60 second timeout for large documents
            )
        except subprocess.TimeoutExpired as exc:
            raise PDFGenerationError("Pandoc execution timed out") from exc
        except Exception as exc:
            raise PDFGenerationError(f"Pandoc PDF generation failed: {exc}") from exc
        finally:
            if css_file is not None:
                css_file.unlink(missing_ok=True)

        # Result checks live outside the try block so their errors are not
        # caught and wrapped a second time.
        if result.returncode != 0:
            error_msg = f"Pandoc failed: {result.stderr}"
            logger.error(error_msg)
            raise PDFGenerationError(error_msg)

        if not output_path.exists():
            raise PDFGenerationError(f"PDF file not created: {output_path}")

        logger.info("PDF generated successfully via Pandoc: %s", output_path)
        return output_path

    def generate_pdf_weasyprint(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using WeasyPrint directly (fallback method).

        The Markdown file is rendered to HTML with the ``markdown`` package,
        wrapped in a standalone document and passed to WeasyPrint. Relative
        resources are resolved against the Markdown file's directory.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict; only ``title`` is used here,
                falling back to the Markdown file's stem when missing or empty

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If reading, rendering or writing fails, or the
                output file does not exist afterwards
        """
        try:
            markdown_content = markdown_path.read_text(encoding="utf-8")

            html_content = markdown(
                markdown_content,
                extensions=[
                    'tables',
                    'fenced_code',
                    'codehilite',
                    'nl2br',
                    'sane_lists',
                ]
            )

            # A missing, None or empty title falls back to the file stem.
            title = (metadata or {}).get("title") or markdown_path.stem
            full_html = self._build_html_document(str(title), html_content)

            css_content = self.css_templates.get(css_template, css_template)

            logger.info("Generating PDF via WeasyPrint: %s", output_path)
            document = HTML(string=full_html, base_url=str(markdown_path.parent))
            stylesheet = CSS(string=css_content)
            document.write_pdf(str(output_path), stylesheets=[stylesheet])
        except Exception as exc:
            raise PDFGenerationError(
                f"WeasyPrint PDF generation failed: {exc}"
            ) from exc

        if not output_path.exists():
            raise PDFGenerationError(f"PDF file not created: {output_path}")

        logger.info("PDF generated successfully via WeasyPrint: %s", output_path)
        return output_path

    def generate_pdf(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None,
        prefer_pandoc: bool = True
    ) -> Path:
        """
        Generate PDF from Markdown with automatic fallback.

        Creates the output directory if needed, tries Pandoc when it is
        preferred and available, and otherwise (or when Pandoc fails) renders
        with WeasyPrint directly.

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name ("default", "academic", "business") or custom CSS
            metadata: Optional metadata dict (title, author, date)
            prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If the Markdown file is missing or the
                WeasyPrint attempt fails
        """
        if not markdown_path.exists():
            raise PDFGenerationError(f"Markdown file not found: {markdown_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Try Pandoc first if preferred and available
        if prefer_pandoc and self.check_pandoc_available():
            try:
                return self.generate_pdf_pandoc(
                    markdown_path, output_path, css_template, metadata
                )
            except PDFGenerationError as e:
                # Fall through to WeasyPrint
                logger.warning("Pandoc failed, falling back to WeasyPrint: %s", e)

        # Use WeasyPrint (fallback or direct)
        return self.generate_pdf_weasyprint(
            markdown_path, output_path, css_template, metadata
        )

    def get_available_templates(self) -> Dict[str, str]:
        """
        Get the built-in CSS templates and their descriptions.

        Templates added through save_custom_template() are not listed here.

        Returns:
            Dict mapping built-in template names to descriptions
        """
        return {
            "default": "通用排版模板,適合大多數文檔",
            "academic": "學術論文模板,適合研究報告",
            "business": "商業報告模板,適合企業文檔",
        }

    def save_custom_template(self, template_name: str, css_content: str) -> None:
        """
        Register a custom CSS template on this generator instance.

        The template is kept in memory only; an existing template with the
        same name is replaced.

        Args:
            template_name: Template name
            css_content: CSS content
        """
        self.css_templates[template_name] = css_content
        logger.info("Custom CSS template saved: %s", template_name)
|
||||
Reference in New Issue
Block a user