"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""

import logging
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)


class OfficeConverterError(Exception):
    """Exception raised for Office conversion errors"""


class OfficeConverter:
    """
    Convert Office documents to PDF for OCR processing

    Conversion is delegated to a LibreOffice executable run in headless mode
    as a subprocess; every failure is reported as OfficeConverterError.
    """

    # Supported Office formats (lower-case file suffix -> MIME type)
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    # Command names looked up on PATH when the configured path does not exist
    FALLBACK_COMMANDS = ("soffice", "libreoffice")

    # Default number of seconds a single conversion is allowed to run
    DEFAULT_TIMEOUT = 60

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to LibreOffice executable

        Raises:
            OfficeConverterError: If no LibreOffice executable can be located
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """
        Verify LibreOffice is installed and accessible

        If the configured path does not exist, the configured value and then
        each name in FALLBACK_COMMANDS is resolved on PATH; the first match
        replaces ``self.libreoffice_path``.

        Raises:
            OfficeConverterError: If no executable is found
        """
        if Path(self.libreoffice_path).exists():
            return

        # The configured value is tried first so that a bare command name
        # (e.g. "soffice") passed by the caller is resolved via PATH as well.
        for command in (self.libreoffice_path, *self.FALLBACK_COMMANDS):
            alt_path = shutil.which(command)
            if alt_path:
                self.libreoffice_path = alt_path
                logger.info("Using LibreOffice at: %s", alt_path)
                return

        raise OfficeConverterError(
            "LibreOffice not found. Please install LibreOffice:\n"
            "  Ubuntu/Debian: sudo apt install -y libreoffice-writer libreoffice-impress libreoffice-core-nogui\n"
            "  macOS: brew install libreoffice"
        )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        The check is based on the file suffix only (case-insensitive); the
        file content is not inspected.

        Args:
            file_path: Path to file

        Returns:
            True if file is an Office document
        """
        return file_path.suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(
        self,
        office_path: Path,
        output_dir: Optional[Path] = None,
        timeout: float = DEFAULT_TIMEOUT,
    ) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory, created if missing
                (defaults to the directory containing ``office_path``)
            timeout: Maximum number of seconds the LibreOffice process may run

        Returns:
            Path to converted PDF file

        Raises:
            OfficeConverterError: If the input is missing or unsupported, or
                if the conversion fails or times out
        """
        # is_file() rather than exists(): a directory whose name ends in a
        # supported suffix must not be handed to LibreOffice.
        if not office_path.is_file():
            raise OfficeConverterError(f"Office file not found: {office_path}")

        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir.mkdir(parents=True, exist_ok=True)

        # LibreOffice names the output after the input file's stem
        output_pdf_path = output_dir / (office_path.stem + '.pdf')

        # Remove any stale PDF so the existence check below proves that this
        # run actually produced the file
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info("Converting %s to PDF using LibreOffice", office_path.name)

        # Use LibreOffice headless mode for conversion
        # --headless: Run without GUI
        # --convert-to pdf: Convert to PDF format
        # --outdir: Output directory
        cmd = [
            self.libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', str(output_dir),
            str(office_path)
        ]
        logger.debug("Running command: %s", ' '.join(cmd))

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors="replace",  # undecodable tool output must not abort the conversion
                timeout=timeout
            )
        except subprocess.TimeoutExpired as exc:
            raise OfficeConverterError(
                f"Conversion timeout ({timeout}s) for file: {office_path.name}"
            ) from exc
        except (OSError, ValueError, subprocess.SubprocessError) as exc:
            # e.g. executable missing or not runnable, invalid argument
            raise OfficeConverterError(f"Conversion error: {str(exc)}") from exc

        if result.returncode != 0:
            error_msg = result.stderr or result.stdout
            raise OfficeConverterError(
                f"LibreOffice conversion failed: {error_msg}"
            )

        # Verify PDF was created; a zero exit code alone is not treated as
        # proof of success
        if not output_pdf_path.exists():
            raise OfficeConverterError(
                f"PDF file not created at expected location: {output_pdf_path}"
            )

        logger.info("Successfully converted to PDF: %s", output_pdf_path.name)
        return output_pdf_path

    def _convert_with_suffix(
        self,
        path: Path,
        expected_suffix: str,
        output_dir: Optional[Path],
    ) -> Path:
        """Convert ``path`` to PDF after checking it has ``expected_suffix`` (case-insensitive)."""
        if path.suffix.lower() != expected_suffix:
            raise OfficeConverterError(
                f"Expected {expected_suffix} file, got: {path.suffix}"
            )
        return self.convert_to_pdf(path, output_dir)

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .docx or conversion fails
        """
        return self._convert_with_suffix(docx_path, '.docx', output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .doc or conversion fails
        """
        return self._convert_with_suffix(doc_path, '.doc', output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .pptx or conversion fails
        """
        return self._convert_with_suffix(pptx_path, '.pptx', output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .ppt or conversion fails
        """
        return self._convert_with_suffix(ppt_path, '.ppt', output_dir)