This commit is contained in:
beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions

View File

@@ -0,0 +1,210 @@
"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil
# Module-level logger, named after this module via __name__
logger = logging.getLogger(__name__)
class OfficeConverterError(Exception):
    """Signals a failure while converting an Office document."""
class OfficeConverter:
    """
    Convert Office documents to PDF for OCR processing

    Conversion is delegated to a LibreOffice executable started in headless
    mode through ``subprocess``. The executable is located once, when the
    converter is constructed.
    """

    # Supported Office formats: lower-case file suffix -> MIME type
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    # Command names searched on PATH when the configured executable is missing
    _FALLBACK_COMMANDS = ("soffice", "libreoffice")

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to the LibreOffice executable, or a bare
                command name to be looked up on PATH

        Raises:
            OfficeConverterError: If no LibreOffice executable can be located
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """
        Verify LibreOffice is installed and accessible

        The configured path is kept as-is when it exists on disk. Otherwise
        the configured value and then the well-known command names are looked
        up on PATH, and ``self.libreoffice_path`` is replaced by the first hit.

        Raises:
            OfficeConverterError: If nothing is found
        """
        if Path(self.libreoffice_path).exists():
            return

        # dict.fromkeys removes duplicates while keeping the search order
        candidates = dict.fromkeys(
            (str(self.libreoffice_path), *self._FALLBACK_COMMANDS)
        )
        for command in candidates:
            resolved = shutil.which(command)
            if resolved:
                self.libreoffice_path = resolved
                logger.info("Using LibreOffice at: %s", resolved)
                return

        raise OfficeConverterError(
            "LibreOffice not found. Please install LibreOffice: brew install libreoffice"
        )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Only the file suffix is inspected (case-insensitively); the file is
        never opened.

        Args:
            file_path: Path to file (a plain string is accepted as well)

        Returns:
            True if the suffix is one of OFFICE_FORMATS
        """
        return Path(file_path).suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(
        self,
        office_path: Path,
        output_dir: Optional[Path] = None,
        timeout: float = 60,
    ) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory, created if missing. When
                omitted, the PDF is written next to the source document.
            timeout: Maximum number of seconds the LibreOffice process may run

        Returns:
            Path to converted PDF file (``<output_dir>/<source stem>.pdf``)

        Raises:
            OfficeConverterError: If the source is missing, has an unsupported
                suffix, or the conversion fails or times out
        """
        office_path = Path(office_path)
        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")
        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path
        output_pdf_path = output_dir / (office_path.stem + '.pdf')

        # Remove a stale PDF so the existence check below proves that this
        # run produced the file
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info("Converting %s to PDF using LibreOffice", office_path.name)

        # NOTE(review): concurrent LibreOffice runs sharing one user profile
        # are reported to interfere with each other - confirm whether callers
        # serialize conversions.
        try:
            # Use LibreOffice headless mode for conversion
            # --headless: Run without GUI
            # --convert-to pdf: Convert to PDF format
            # --outdir: Output directory
            cmd = [
                str(self.libreoffice_path),
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', str(output_dir),
                str(office_path)
            ]
            logger.debug("Running command: %s", ' '.join(cmd))

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            logger.debug("LibreOffice output: %s", result.stderr or result.stdout)

            if result.returncode != 0:
                error_msg = result.stderr or result.stdout
                raise OfficeConverterError(
                    f"LibreOffice conversion failed: {error_msg}"
                )

            # A zero exit status alone is not trusted: the PDF must exist
            if not output_pdf_path.exists():
                raise OfficeConverterError(
                    f"PDF file not created at expected location: {output_pdf_path}"
                )

            logger.info("Successfully converted to PDF: %s", output_pdf_path.name)
            return output_pdf_path
        except subprocess.TimeoutExpired as exc:
            raise OfficeConverterError(
                f"Conversion timeout ({timeout}s) for file: {office_path.name}"
            ) from exc
        except OfficeConverterError:
            # Already in the form callers expect; do not wrap it again
            raise
        except Exception as exc:
            # Callers only need to handle OfficeConverterError; the original
            # exception is kept as the cause
            raise OfficeConverterError(f"Conversion error: {str(exc)}") from exc

    def _convert_expecting(self, source_path: Path, expected_suffix: str,
                           output_dir: Optional[Path]) -> Path:
        """Convert source_path after checking it carries expected_suffix."""
        source_path = Path(source_path)
        if source_path.suffix.lower() != expected_suffix:
            raise OfficeConverterError(
                f"Expected {expected_suffix} file, got: {source_path.suffix}"
            )
        return self.convert_to_pdf(source_path, output_dir)

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .docx or conversion fails
        """
        return self._convert_expecting(docx_path, '.docx', output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .doc or conversion fails
        """
        return self._convert_expecting(doc_path, '.doc', output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .pptx or conversion fails
        """
        return self._convert_expecting(pptx_path, '.pptx', output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not .ppt or conversion fails
        """
        return self._convert_expecting(ppt_path, '.ppt', output_dir)