508 lines
13 KiB
Python
508 lines
13 KiB
Python
"""
|
|
Tool_OCR - PDF Generator Service
|
|
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
|
|
"""
|
|
|
|
import logging
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import Optional, Dict
|
|
from datetime import datetime
|
|
|
|
from weasyprint import HTML, CSS
|
|
from markdown import markdown
|
|
|
|
from app.core.config import settings
|
|
|
|
|
|
# Module-level logger, named after this module (``__name__``); the classes
# below use it for all of their log output.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDFGenerationError(Exception):
    """Signals that a PDF could not be generated."""
|
|
|
|
|
|
class PDFGenerator:
    """
    PDF generation service with layout preservation

    Supports two generation methods:
    1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command
    2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion

    CSS templates are looked up by name in ``self.css_templates``; any value
    that is not a registered name is used verbatim as raw CSS.
    """

    # Default CSS template for layout preservation
    DEFAULT_CSS = """
    @page {
        size: A4;
        margin: 2cm;
    }

    body {
        font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
        font-size: 11pt;
        line-height: 1.6;
        color: #333;
    }

    h1 {
        font-size: 24pt;
        font-weight: bold;
        margin-top: 0;
        margin-bottom: 12pt;
        color: #000;
        page-break-after: avoid;
    }

    h2 {
        font-size: 18pt;
        font-weight: bold;
        margin-top: 18pt;
        margin-bottom: 10pt;
        color: #000;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        font-weight: bold;
        margin-top: 14pt;
        margin-bottom: 8pt;
        color: #000;
        page-break-after: avoid;
    }

    p {
        margin: 0 0 10pt 0;
        text-align: justify;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 12pt 0;
        page-break-inside: avoid;
    }

    table th {
        background-color: #f0f0f0;
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
    }

    code {
        font-family: "Courier New", monospace;
        font-size: 10pt;
        background-color: #f5f5f5;
        padding: 2pt 4pt;
        border-radius: 3px;
    }

    pre {
        background-color: #f5f5f5;
        border: 1px solid #ddd;
        border-radius: 5px;
        padding: 10pt;
        overflow-x: auto;
        page-break-inside: avoid;
    }

    pre code {
        background-color: transparent;
        padding: 0;
    }

    img {
        max-width: 100%;
        height: auto;
        display: block;
        margin: 12pt auto;
        page-break-inside: avoid;
    }

    blockquote {
        border-left: 4px solid #ddd;
        padding-left: 12pt;
        margin: 12pt 0;
        color: #666;
        font-style: italic;
    }

    ul, ol {
        margin: 10pt 0;
        padding-left: 20pt;
    }

    li {
        margin: 5pt 0;
    }

    hr {
        border: none;
        border-top: 1px solid #ccc;
        margin: 20pt 0;
    }

    .page-break {
        page-break-after: always;
    }
    """

    # Academic paper template
    ACADEMIC_CSS = """
    @page {
        size: A4;
        margin: 2.5cm;
    }

    body {
        font-family: "Times New Roman", "Noto Serif CJK SC", serif;
        font-size: 12pt;
        line-height: 1.8;
        color: #000;
    }

    h1 {
        font-size: 20pt;
        text-align: center;
        margin-bottom: 24pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        margin-top: 16pt;
        margin-bottom: 10pt;
        page-break-after: avoid;
    }

    p {
        text-indent: 2em;
        text-align: justify;
        margin: 0 0 12pt 0;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt auto;
        page-break-inside: avoid;
    }

    table caption {
        font-weight: bold;
        margin-bottom: 8pt;
    }
    """

    # Business report template
    BUSINESS_CSS = """
    @page {
        size: A4;
        margin: 2cm 2.5cm;
    }

    body {
        font-family: "Arial", "Noto Sans CJK SC", sans-serif;
        font-size: 11pt;
        line-height: 1.5;
        color: #333;
    }

    h1 {
        font-size: 22pt;
        color: #0066cc;
        border-bottom: 3px solid #0066cc;
        padding-bottom: 8pt;
        margin-bottom: 20pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        color: #0066cc;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt 0;
    }

    table th {
        background-color: #0066cc;
        color: white;
        padding: 10pt;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ddd;
        padding: 10pt;
    }

    table tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    """

    def __init__(self):
        """Initialize PDF generator with the three built-in CSS templates."""
        # A fresh dict per instance: save_custom_template() mutates it, and
        # custom entries must not leak into other PDFGenerator instances.
        self.css_templates = {
            "default": self.DEFAULT_CSS,
            "academic": self.ACADEMIC_CSS,
            "business": self.BUSINESS_CSS,
        }

    def check_pandoc_available(self) -> bool:
        """
        Check if Pandoc is installed and available

        Runs ``pandoc --version`` with a 5 second timeout.

        Returns:
            bool: True if the command exits with status 0, False if it exits
                non-zero, cannot be started, or times out
        """
        try:
            result = subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers FileNotFoundError (not installed) as well as
            # PermissionError (present on PATH but not executable); either
            # way Pandoc cannot be used and the caller should fall back.
            logger.warning("Pandoc not found or timed out")
            return False
        return result.returncode == 0

    @staticmethod
    def _build_pandoc_command(markdown_path, output_path, css_file, metadata):
        """Assemble the pandoc argument list (no shell involved)."""
        command = [
            "pandoc",
            str(markdown_path),
            "-o", str(output_path),
            "--pdf-engine=weasyprint",
            "--css", str(css_file),
            "--standalone",
            "--from=markdown+tables+fenced_code_blocks+footnotes",
        ]
        # Only forward metadata entries that are present and non-empty.
        for key in ("title", "author", "date"):
            value = metadata.get(key) if metadata else None
            if value:
                command.extend(["--metadata", f"{key}={value}"])
        return command

    @staticmethod
    def _build_html_document(title, body_html):
        """Wrap an HTML fragment in a minimal standalone HTML document."""
        from html import escape  # function-scope import: only needed here

        return (
            "<!DOCTYPE html>\n"
            '<html lang="zh-CN">\n'
            "<head>\n"
            '    <meta charset="UTF-8">\n'
            # The title comes from caller-supplied metadata or a file name,
            # so it must be escaped before being placed into markup.
            f"    <title>{escape(str(title))}</title>\n"
            "</head>\n"
            "<body>\n"
            f"{body_html}\n"
            "</body>\n"
            "</html>\n"
        )

    def generate_pdf_pandoc(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using Pandoc (preferred method)

        The stylesheet is written to a uniquely named temporary ``.css`` file
        next to the output file and is always removed afterwards, whether
        Pandoc succeeds, fails, or times out (60 seconds).

        Args:
            markdown_path: Path to input Markdown file (str is also accepted)
            output_path: Path to output PDF file (str is also accepted)
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        import tempfile  # function-scope import: only this method needs it

        css_file = None
        try:
            markdown_path = Path(markdown_path)
            output_path = Path(output_path)
            css_content = self.css_templates.get(css_template, css_template)

            # delete=False: pandoc must be able to open the file by name
            # after we close it; removal happens in the finally clause.
            with tempfile.NamedTemporaryFile(
                mode="w",
                encoding="utf-8",
                prefix="temp_",
                suffix=".css",
                dir=output_path.parent,
                delete=False,
            ) as handle:
                css_file = Path(handle.name)
                handle.write(css_content)

            pandoc_cmd = self._build_pandoc_command(
                markdown_path, output_path, css_file, metadata
            )

            logger.info("Executing pandoc: %s", " ".join(pandoc_cmd))
            result = subprocess.run(
                pandoc_cmd,
                capture_output=True,
                text=True,
                timeout=60,  # 60 second timeout for large documents
            )

            if result.returncode != 0:
                error_msg = f"Pandoc failed: {result.stderr}"
                logger.error(error_msg)
                raise PDFGenerationError(error_msg)

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info("PDF generated successfully via Pandoc: %s", output_path)
            return output_path

        except PDFGenerationError:
            # Already carries a precise message; do not wrap it a second time.
            raise
        except subprocess.TimeoutExpired as exc:
            raise PDFGenerationError("Pandoc execution timed out") from exc
        except Exception as exc:
            raise PDFGenerationError(
                f"Pandoc PDF generation failed: {str(exc)}"
            ) from exc
        finally:
            # css_file stays None when the failure happened before the
            # temporary file was created, so there is nothing to remove.
            if css_file is not None:
                css_file.unlink(missing_ok=True)

    def generate_pdf_weasyprint(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using WeasyPrint directly (fallback method)

        The Markdown file is read as UTF-8, converted to HTML with the
        ``markdown`` package, wrapped in a minimal HTML document and rendered
        by WeasyPrint with the selected stylesheet. Only ``metadata["title"]``
        is used here; when it is missing or empty the file stem is the title.

        Args:
            markdown_path: Path to input Markdown file (str is also accepted)
            output_path: Path to output PDF file (str is also accepted)
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        try:
            markdown_path = Path(markdown_path)
            output_path = Path(output_path)

            # Read Markdown content and convert it to an HTML fragment
            markdown_content = markdown_path.read_text(encoding="utf-8")
            html_content = markdown(
                markdown_content,
                extensions=[
                    'tables',
                    'fenced_code',
                    'codehilite',
                    'nl2br',
                    'sane_lists',
                ]
            )

            # Fall back to the file stem when no usable title was supplied
            # (missing key, None, or empty string).
            title = (metadata or {}).get("title") or markdown_path.stem
            full_html = self._build_html_document(title, html_content)

            css_content = self.css_templates.get(css_template, css_template)

            logger.info("Generating PDF via WeasyPrint: %s", output_path)
            # base_url lets relative references in the Markdown (e.g. images)
            # resolve against the directory of the source file.
            document = HTML(string=full_html, base_url=str(markdown_path.parent))
            stylesheet = CSS(string=css_content)
            document.write_pdf(str(output_path), stylesheets=[stylesheet])

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info("PDF generated successfully via WeasyPrint: %s", output_path)
            return output_path

        except PDFGenerationError:
            # Already carries a precise message; do not wrap it a second time.
            raise
        except Exception as exc:
            raise PDFGenerationError(
                f"WeasyPrint PDF generation failed: {str(exc)}"
            ) from exc

    def generate_pdf(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None,
        prefer_pandoc: bool = True
    ) -> Path:
        """
        Generate PDF from Markdown with automatic fallback

        Creates the output directory if needed, tries Pandoc when it is
        preferred and available, and otherwise (or when Pandoc fails) renders
        with WeasyPrint.

        Args:
            markdown_path: Path to input Markdown file (str is also accepted)
            output_path: Path to output PDF file (str is also accepted)
            css_template: CSS template name ("default", "academic", "business") or custom CSS
            metadata: Optional metadata dict (title, author, date)
            prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If the Markdown file does not exist or both
                methods fail
        """
        markdown_path = Path(markdown_path)
        output_path = Path(output_path)

        if not markdown_path.exists():
            raise PDFGenerationError(f"Markdown file not found: {markdown_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Try Pandoc first if preferred and available
        if prefer_pandoc and self.check_pandoc_available():
            try:
                return self.generate_pdf_pandoc(
                    markdown_path, output_path, css_template, metadata
                )
            except PDFGenerationError as exc:
                # Deliberate best-effort: a Pandoc failure is not fatal here.
                logger.warning("Pandoc failed, falling back to WeasyPrint: %s", exc)

        # Use WeasyPrint (fallback or direct)
        return self.generate_pdf_weasyprint(
            markdown_path, output_path, css_template, metadata
        )

    def get_available_templates(self) -> Dict[str, str]:
        """
        Get list of available CSS templates

        Only the three built-in templates are listed; entries added through
        save_custom_template() are not included.

        Returns:
            Dict mapping template names to descriptions
        """
        return {
            "default": "通用排版模板,適合大多數文檔",
            "academic": "學術論文模板,適合研究報告",
            "business": "商業報告模板,適合企業文檔",
        }

    def save_custom_template(self, template_name: str, css_content: str) -> None:
        """
        Save a custom CSS template

        The template is stored in this instance's in-memory registry only.
        An existing entry with the same name, including a built-in one, is
        overwritten.

        Args:
            template_name: Template name
            css_content: CSS content
        """
        self.css_templates[template_name] = css_content
        logger.info("Custom CSS template saved: %s", template_name)
|