從 Docker/macOS+Conda 部署遷移到 WSL2 Ubuntu 原生開發環境 主要變更: - 移除所有 Docker 相關配置檔案 (Dockerfile, docker-compose.yml, .dockerignore 等) - 移除 macOS/Conda 設置腳本 (SETUP.md, setup_conda.sh) - 新增 WSL Ubuntu 自動化環境設置腳本 (setup_dev_env.sh) - 新增後端/前端快速啟動腳本 (start_backend.sh, start_frontend.sh) - 統一開發端口配置 (backend: 8000, frontend: 5173) - 改進資料庫連接穩定性(連接池、超時設置、重試機制) - 更新專案文檔以反映當前 WSL 開發環境 Technical improvements: - Database connection pooling with health checks and auto-reconnection - Retry logic for long-running OCR tasks to prevent DB timeouts - Extended JWT token expiration to 24 hours - Support for Office documents (pptx, docx) via LibreOffice headless - Comprehensive system dependency installation in single script Environment: - OS: WSL2 Ubuntu 24.04 - Python: 3.12 (venv) - Node.js: 24.x LTS (nvm) - Backend Port: 8000 - Frontend Port: 5173 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
213 lines
6.9 KiB
Python
213 lines
6.9 KiB
Python
"""
|
|
Tool_OCR - Office Document Converter Service
|
|
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
|
|
"""
|
|
|
|
import logging
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import tempfile
|
|
import shutil
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OfficeConverterError(Exception):
    """Error type raised when an Office document conversion fails."""
|
|
|
|
|
|
class OfficeConverter:
    """Convert Office documents to PDF for OCR processing.

    The conversion itself is delegated to a LibreOffice executable that is
    launched in headless mode through ``subprocess``.
    """

    # Supported Office formats: lower-case file suffix -> MIME type.
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to LibreOffice executable. When this path
                does not exist, the executable is looked up on ``PATH``.

        Raises:
            OfficeConverterError: If no LibreOffice executable can be located
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """
        Verify LibreOffice is installed and accessible

        If ``self.libreoffice_path`` does not exist, fall back to the first
        of ``soffice`` / ``libreoffice`` found on ``PATH`` and store it in
        ``self.libreoffice_path``.

        Raises:
            OfficeConverterError: If no executable is found
        """
        if Path(self.libreoffice_path).exists():
            return

        # Try alternative path (system-wide installation); the launcher may
        # be exposed under either command name.
        for command_name in ("soffice", "libreoffice"):
            alt_path = shutil.which(command_name)
            if alt_path:
                self.libreoffice_path = alt_path
                logger.info("Using LibreOffice at: %s", alt_path)
                return

        raise OfficeConverterError(
            "LibreOffice not found. Please install LibreOffice:\n"
            "  Ubuntu/Debian: sudo apt install -y libreoffice-writer libreoffice-impress libreoffice-core-nogui\n"
            "  macOS: brew install libreoffice"
        )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Only the file suffix is inspected (case-insensitively); the file
        content is not read.

        Args:
            file_path: Path to file (a plain string path is also accepted)

        Returns:
            True if the suffix is one of ``OFFICE_FORMATS``
        """
        return Path(file_path).suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(
        self,
        office_path: Path,
        output_dir: Optional[Path] = None,
        timeout: int = 60,
    ) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory. It is created if missing.
                When not specified, the PDF is written next to the source
                file (``office_path.parent``).
            timeout: Maximum number of seconds to wait for LibreOffice
                (default 60)

        Returns:
            Path to converted PDF file (``<output_dir>/<source stem>.pdf``)

        Raises:
            OfficeConverterError: If the source is missing or unsupported,
                LibreOffice fails or times out, or no PDF is produced
        """
        # Tolerate plain string paths from callers.
        office_path = Path(office_path)

        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")

        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS)}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path
        output_pdf_path = output_dir / (office_path.stem + '.pdf')

        # Remove existing PDF if present, so the existence check after the
        # run cannot be satisfied by a stale file.
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info("Converting %s to PDF using LibreOffice", office_path.name)

        # Use LibreOffice headless mode for conversion
        # --headless: Run without GUI
        # --convert-to pdf: Convert to PDF format
        # --outdir: Output directory
        cmd = [
            self.libreoffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', str(output_dir),
            str(office_path),
        ]
        logger.debug("Running command: %s", ' '.join(cmd))

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )

            if result.returncode != 0:
                error_msg = result.stderr or result.stdout
                raise OfficeConverterError(
                    f"LibreOffice conversion failed: {error_msg}"
                )

            # Verify PDF was created (exit code alone is not relied upon)
            if not output_pdf_path.exists():
                raise OfficeConverterError(
                    f"PDF file not created at expected location: {output_pdf_path}"
                )
        except OfficeConverterError:
            # Already the public error type; propagate unchanged.
            raise
        except subprocess.TimeoutExpired as exc:
            raise OfficeConverterError(
                f"Conversion timeout ({timeout}s) for file: {office_path.name}"
            ) from exc
        except Exception as exc:
            # Callers only need to handle OfficeConverterError; keep the
            # original exception attached as the cause.
            raise OfficeConverterError(f"Conversion error: {str(exc)}") from exc

        logger.info("Successfully converted to PDF: %s", output_pdf_path.name)
        return output_pdf_path

    def _convert_expecting(self, source_path: Path, expected_suffix: str,
                           output_dir: Optional[Path] = None) -> Path:
        """Check ``source_path`` has ``expected_suffix``, then convert it to PDF."""
        actual_suffix = Path(source_path).suffix
        if actual_suffix.lower() != expected_suffix:
            raise OfficeConverterError(
                f"Expected {expected_suffix} file, got: {actual_suffix}"
            )
        return self.convert_to_pdf(source_path, output_dir)

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not ``.docx`` or the
                conversion fails
        """
        return self._convert_expecting(docx_path, '.docx', output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not ``.doc`` or the
                conversion fails
        """
        return self._convert_expecting(doc_path, '.doc', output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not ``.pptx`` or the
                conversion fails
        """
        return self._convert_expecting(pptx_path, '.pptx', output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF

        Raises:
            OfficeConverterError: If the suffix is not ``.ppt`` or the
                conversion fails
        """
        return self._convert_expecting(ppt_path, '.ppt', output_dir)
|