first
This commit is contained in:
210
backend/app/services/office_converter.py
Normal file
210
backend/app/services/office_converter.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""
|
||||
Tool_OCR - Office Document Converter Service
|
||||
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OfficeConverterError(Exception):
    """Signals that an Office document conversion could not be completed."""
|
||||
|
||||
|
||||
class OfficeConverter:
    """Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing.

    Conversion is delegated to the LibreOffice executable, which is run as a
    subprocess in headless mode.
    """

    # Supported Office formats: lower-case file suffix -> MIME type
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    }

    # Executable names searched on PATH when the configured path does not exist
    _FALLBACK_EXECUTABLES = ("soffice", "libreoffice")

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to LibreOffice executable

        Raises:
            OfficeConverterError: If no LibreOffice executable can be located
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """
        Verify LibreOffice is installed and accessible

        If the configured path does not exist, PATH is searched for a known
        executable name and ``self.libreoffice_path`` is updated to the first
        match.

        Raises:
            OfficeConverterError: If neither the configured path nor a PATH
                lookup yields an executable
        """
        if Path(self.libreoffice_path).exists():
            return

        # Fall back to whatever is on PATH (e.g. a Homebrew or package-manager install)
        for executable in self._FALLBACK_EXECUTABLES:
            alt_path = shutil.which(executable)
            if alt_path:
                self.libreoffice_path = alt_path
                logger.info(f"Using LibreOffice at: {alt_path}")
                return

        raise OfficeConverterError(
            "LibreOffice not found. Please install LibreOffice: brew install libreoffice"
        )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Only the file suffix is inspected (case-insensitively); the file
        content is not read.

        Args:
            file_path: Path to file (a plain string path is also accepted)

        Returns:
            True if the suffix is one of the keys of OFFICE_FORMATS
        """
        return Path(file_path).suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(
        self,
        office_path: Path,
        output_dir: Optional[Path] = None,
        timeout: float = 60,
    ) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory, created if missing. When not
                specified, the PDF is written next to the source document.
            timeout: Maximum number of seconds to wait for LibreOffice

        Returns:
            Path to converted PDF file (``<output_dir>/<source stem>.pdf``)

        Raises:
            OfficeConverterError: If the source file is missing, its format is
                unsupported, LibreOffice fails or times out, or no PDF is
                produced at the expected location
        """
        office_path = Path(office_path)

        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")

        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path
        output_pdf_path = output_dir / (office_path.stem + '.pdf')

        # Remove any existing PDF of the same name so that the existence check
        # after the conversion cannot be satisfied by a stale file
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info(f"Converting {office_path.name} to PDF using LibreOffice")

        # Use LibreOffice headless mode for conversion
        # --headless: Run without GUI
        # --convert-to pdf: Convert to PDF format
        # --outdir: Output directory
        cmd = [
            self.libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', str(output_dir),
            str(office_path),
        ]
        logger.debug(f"Running command: {' '.join(cmd)}")

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired as exc:
            raise OfficeConverterError(
                f"Conversion timeout ({timeout:g}s) for file: {office_path.name}"
            ) from exc
        except (OSError, ValueError, subprocess.SubprocessError) as exc:
            # e.g. the executable disappeared or is not runnable
            raise OfficeConverterError(f"Conversion error: {str(exc)}") from exc

        if result.returncode != 0:
            error_msg = result.stderr or result.stdout
            raise OfficeConverterError(
                f"LibreOffice conversion failed: {error_msg}"
            )

        # A zero exit status alone is not trusted: verify PDF was created
        if not output_pdf_path.exists():
            process_output = result.stderr or result.stdout
            if process_output:
                logger.warning(f"LibreOffice output: {process_output}")
            raise OfficeConverterError(
                f"PDF file not created at expected location: {output_pdf_path}"
            )

        logger.info(f"Successfully converted to PDF: {output_pdf_path.name}")
        return output_pdf_path

    def _convert_expecting_suffix(
        self,
        source_path: Path,
        expected_suffix: str,
        output_dir: Optional[Path],
    ) -> Path:
        """Check that source_path has expected_suffix, then convert it to PDF."""
        source_path = Path(source_path)
        if source_path.suffix.lower() != expected_suffix:
            raise OfficeConverterError(
                f"Expected {expected_suffix} file, got: {source_path.suffix}"
            )
        return self.convert_to_pdf(source_path, output_dir)

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .docx or conversion fails
        """
        return self._convert_expecting_suffix(docx_path, '.docx', output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .doc or conversion fails
        """
        return self._convert_expecting_suffix(doc_path, '.doc', output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .pptx or conversion fails
        """
        return self._convert_expecting_suffix(pptx_path, '.pptx', output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .ppt or conversion fails
        """
        return self._convert_expecting_suffix(ppt_path, '.ppt', output_dir)
|
||||
Reference in New Issue
Block a user