Files
OCR/backend/app/services/office_converter.py
egg d7e64737b7 feat: migrate to WSL Ubuntu native development environment
從 Docker/macOS+Conda 部署遷移到 WSL2 Ubuntu 原生開發環境

主要變更:
- 移除所有 Docker 相關配置檔案 (Dockerfile, docker-compose.yml, .dockerignore 等)
- 移除 macOS/Conda 設置腳本 (SETUP.md, setup_conda.sh)
- 新增 WSL Ubuntu 自動化環境設置腳本 (setup_dev_env.sh)
- 新增後端/前端快速啟動腳本 (start_backend.sh, start_frontend.sh)
- 統一開發端口配置 (backend: 8000, frontend: 5173)
- 改進資料庫連接穩定性(連接池、超時設置、重試機制)
- 更新專案文檔以反映當前 WSL 開發環境

Technical improvements:
- Database connection pooling with health checks and auto-reconnection
- Retry logic for long-running OCR tasks to prevent DB timeouts
- Extended JWT token expiration to 24 hours
- Support for Office documents (pptx, docx) via LibreOffice headless
- Comprehensive system dependency installation in single script

Environment:
- OS: WSL2 Ubuntu 24.04
- Python: 3.12 (venv)
- Node.js: 24.x LTS (nvm)
- Backend Port: 8000
- Frontend Port: 5173

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 21:00:42 +08:00

213 lines
6.9 KiB
Python

"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class OfficeConverterError(Exception):
    """Signal that an Office document could not be converted."""
class OfficeConverter:
    """Convert Office documents to PDF for OCR processing.

    The conversion itself is delegated to a LibreOffice executable that is
    run as a headless subprocess. The executable is resolved once, when the
    converter is constructed.
    """

    # Supported Office formats: lowercase file suffix -> MIME type
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    # Command names searched on PATH when the configured path does not exist
    _FALLBACK_COMMANDS = ("soffice", "libreoffice")

    # Maximum number of seconds a single LibreOffice run may take
    CONVERSION_TIMEOUT = 60

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to (or command name of) the LibreOffice
                executable. If it does not exist, the executable is looked
                up on PATH instead.

        Raises:
            OfficeConverterError: If no LibreOffice executable can be found
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self) -> None:
        """
        Verify LibreOffice is installed and accessible

        Keeps ``self.libreoffice_path`` when it points at an existing path.
        Otherwise the configured value and the well-known launcher names are
        searched on PATH, and ``self.libreoffice_path`` is replaced by the
        first match.

        Raises:
            OfficeConverterError: If no LibreOffice executable can be found
        """
        if Path(self.libreoffice_path).exists():
            return

        # The configured value may be a bare command name rather than a file
        # path, so it is tried on PATH before the generic launcher names.
        candidates = (str(self.libreoffice_path), *self._FALLBACK_COMMANDS)
        for candidate in candidates:
            resolved = shutil.which(candidate)
            if resolved:
                self.libreoffice_path = resolved
                logger.info("Using LibreOffice at: %s", resolved)
                return

        raise OfficeConverterError(
            "LibreOffice not found. Please install LibreOffice:\n"
            "  Ubuntu/Debian: sudo apt install -y libreoffice-writer libreoffice-impress libreoffice-core-nogui\n"
            "  macOS: brew install libreoffice"
        )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Only the file suffix is inspected (case-insensitively); the file is
        not opened and does not have to exist.

        Args:
            file_path: Path to file (a plain string is accepted as well)

        Returns:
            True if the suffix is one of OFFICE_FORMATS
        """
        return Path(file_path).suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(self, office_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory; it is created if missing.
                When not specified, the PDF is written next to the source
                document.

        Returns:
            Path to converted PDF file

        Raises:
            OfficeConverterError: If the source file is missing, its format
                is unsupported, or the conversion fails or times out
        """
        office_path = Path(office_path)
        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")
        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path: same base name, ".pdf" suffix
        output_pdf_path = output_dir / (office_path.stem + '.pdf')

        # Remove a stale PDF first so that the existence check after the run
        # proves this run produced the file.
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info("Converting %s to PDF using LibreOffice", office_path.name)

        # --headless: Run without GUI
        # --convert-to pdf: Convert to PDF format
        # --outdir: Output directory
        cmd = [
            str(self.libreoffice_path),
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', str(output_dir),
            str(office_path)
        ]
        timeout = self.CONVERSION_TIMEOUT

        try:
            logger.debug("Running command: %s", ' '.join(cmd))
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Undecodable bytes in the tool's output must not turn an
                # otherwise successful conversion into a failure.
                errors="replace",
                timeout=timeout
            )
            if result.returncode != 0:
                error_msg = result.stderr or result.stdout
                raise OfficeConverterError(
                    f"LibreOffice conversion failed: {error_msg}"
                )

            # The exit status alone is not trusted: verify PDF was created
            if not output_pdf_path.exists():
                logger.debug(
                    "LibreOffice output without PDF: stdout=%r stderr=%r",
                    result.stdout, result.stderr
                )
                raise OfficeConverterError(
                    f"PDF file not created at expected location: {output_pdf_path}"
                )

            logger.info("Successfully converted to PDF: %s", output_pdf_path.name)
            return output_pdf_path
        except OfficeConverterError:
            # Already in the caller-facing form; pass through untouched.
            raise
        except subprocess.TimeoutExpired as exc:
            raise OfficeConverterError(
                f"Conversion timeout ({timeout}s) for file: {office_path.name}"
            ) from exc
        except Exception as exc:
            # Boundary handler: callers only need to catch OfficeConverterError;
            # the original exception stays attached as __cause__.
            raise OfficeConverterError(f"Conversion error: {str(exc)}") from exc

    def _convert_with_suffix(self, path: Path, expected_suffix: str,
                             output_dir: Optional[Path] = None) -> Path:
        """Convert *path* to PDF after checking it has *expected_suffix*."""
        path = Path(path)
        if path.suffix.lower() != expected_suffix:
            raise OfficeConverterError(
                f"Expected {expected_suffix} file, got: {path.suffix}"
            )
        return self.convert_to_pdf(path, output_dir)

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .docx file or the
                conversion fails
        """
        return self._convert_with_suffix(docx_path, '.docx', output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .doc file or the
                conversion fails
        """
        return self._convert_with_suffix(doc_path, '.doc', output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .pptx file or the
                conversion fails
        """
        return self._convert_with_suffix(pptx_path, '.pptx', output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the file is not a .ppt file or the
                conversion fails
        """
        return self._convert_with_suffix(ppt_path, '.ppt', output_dir)