beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions


@@ -0,0 +1,3 @@
"""
Tool_OCR - Services Package
"""


@@ -0,0 +1,394 @@
"""
Tool_OCR - Background Tasks Service
Handles async processing, cleanup, and scheduled tasks
"""
import logging
import asyncio
import time
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable, Any
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
from app.services.ocr_service import OCRService
from app.services.file_manager import FileManager
from app.services.pdf_generator import PDFGenerator
logger = logging.getLogger(__name__)
class BackgroundTaskManager:
"""
Manages background tasks including retry logic, cleanup, and scheduled jobs
"""
def __init__(
self,
max_retries: int = 3,
retry_delay: int = 5,
cleanup_interval: int = 3600, # 1 hour
file_retention_hours: int = 24
):
self.max_retries = max_retries
self.retry_delay = retry_delay
self.cleanup_interval = cleanup_interval
self.file_retention_hours = file_retention_hours
self.ocr_service = OCRService()
self.file_manager = FileManager()
self.pdf_generator = PDFGenerator()
async def execute_with_retry(
self,
func: Callable,
*args,
max_retries: Optional[int] = None,
retry_delay: Optional[int] = None,
**kwargs
) -> Any:
"""
Execute a function with retry logic
Args:
func: Function to execute
args: Positional arguments for func
max_retries: Maximum retry attempts (overrides default)
retry_delay: Delay between retries in seconds (overrides default)
kwargs: Keyword arguments for func
Returns:
Function result
Raises:
Exception: If all retries are exhausted
"""
max_retries = self.max_retries if max_retries is None else max_retries
retry_delay = self.retry_delay if retry_delay is None else retry_delay
last_exception = None
for attempt in range(max_retries + 1):
try:
if asyncio.iscoroutinefunction(func):
return await func(*args, **kwargs)
else:
return func(*args, **kwargs)
except Exception as e:
last_exception = e
if attempt < max_retries:
logger.warning(
f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. "
f"Retrying in {retry_delay}s..."
)
await asyncio.sleep(retry_delay)
else:
logger.error(
f"All {max_retries + 1} attempts failed for {func.__name__}: {e}"
)
raise last_exception
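# Usage sketch (illustrative, not part of this module): `fetch_result` is a
# hypothetical coroutine; sync callables are dispatched directly. From an
# async context:
#
#     data = await task_manager.execute_with_retry(
#         fetch_result, batch_id, max_retries=2, retry_delay=1
#     )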
def process_single_file_with_retry(
self,
ocr_file: OCRFile,
batch_id: int,
lang: str,
detect_layout: bool,
db: Session
) -> bool:
"""
Process a single file with retry logic
Args:
ocr_file: OCRFile instance
batch_id: Batch ID
lang: Language code
detect_layout: Whether to detect layout
db: Database session
Returns:
bool: True if successful, False otherwise
"""
for attempt in range(self.max_retries + 1):
try:
# Update file status
ocr_file.status = FileStatus.PROCESSING
ocr_file.started_at = datetime.utcnow()
ocr_file.retry_count = attempt
db.commit()
# Get file paths
file_path = Path(ocr_file.file_path)
paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)
# Process OCR
result = self.ocr_service.process_image(
file_path,
lang=lang,
detect_layout=detect_layout
)
# Check if processing was successful
if result['status'] != 'success':
raise Exception(result.get('error_message', 'Unknown error during OCR processing'))
# Save results
json_path, markdown_path = self.ocr_service.save_results(
result=result,
output_dir=paths["output_dir"],
file_id=str(ocr_file.id)
)
# Extract data from result
text_regions = result.get('text_regions', [])
layout_data = result.get('layout_data')
images_metadata = result.get('images_metadata', [])
# Calculate average confidence (or use from result)
avg_confidence = result.get('average_confidence')
# Create OCR result record
ocr_result = OCRResult(
file_id=ocr_file.id,
markdown_path=str(markdown_path) if markdown_path else None,
json_path=str(json_path) if json_path else None,
images_dir=None, # Images dir not used in current implementation
detected_language=lang,
total_text_regions=len(text_regions),
average_confidence=avg_confidence,
layout_data=layout_data,
images_metadata=images_metadata
)
db.add(ocr_result)
# Update file status
ocr_file.status = FileStatus.COMPLETED
ocr_file.completed_at = datetime.utcnow()
ocr_file.processing_time = (ocr_file.completed_at - ocr_file.started_at).total_seconds()
db.commit()
logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
return True
except Exception as e:
logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")
if attempt < self.max_retries:
# Wait before retry
time.sleep(self.retry_delay)
else:
# Final failure
ocr_file.status = FileStatus.FAILED
ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
ocr_file.completed_at = datetime.utcnow()
ocr_file.retry_count = attempt
db.commit()
return False
return False
async def cleanup_expired_files(self, db: Session):
"""
Clean up files and batches older than retention period
Args:
db: Database session
"""
try:
cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)
# Find expired batches
expired_batches = db.query(OCRBatch).filter(
OCRBatch.created_at < cutoff_time,
OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
).all()
logger.info(f"Found {len(expired_batches)} expired batches to clean up")
for batch in expired_batches:
try:
# Get batch directory
batch_dir = self.file_manager.get_batch_directory(batch.id)
# Delete physical files
if batch_dir.exists():
shutil.rmtree(batch_dir)
logger.info(f"Deleted batch directory: {batch_dir}")
# Delete database records
# Delete results first (foreign key constraint)
db.query(OCRResult).filter(
OCRResult.file_id.in_(
db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
)
).delete(synchronize_session=False)
# Delete files
db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()
# Delete batch
db.delete(batch)
db.commit()
logger.info(f"Cleaned up expired batch {batch.id}")
except Exception as e:
logger.error(f"Error cleaning up batch {batch.id}: {e}")
db.rollback()
except Exception as e:
logger.error(f"Error in cleanup_expired_files: {e}")
async def generate_pdf_background(
self,
result_id: int,
output_path: str,
css_template: str = "default",
db: Optional[Session] = None
):
"""
Generate PDF in background with retry logic
Args:
result_id: OCR result ID
output_path: Output PDF path
css_template: CSS template name
db: Database session
"""
should_close_db = False
if db is None:
db = SessionLocal()
should_close_db = True
try:
# Get result
result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
if not result:
logger.error(f"Result {result_id} not found")
return
# Generate PDF with retry
await self.execute_with_retry(
self.pdf_generator.generate_pdf,
markdown_path=result.markdown_path,
output_path=output_path,
css_template=css_template,
max_retries=2,
retry_delay=3
)
logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")
except Exception as e:
logger.error(f"Failed to generate PDF for result {result_id}: {e}")
finally:
if should_close_db:
db.close()
async def start_cleanup_scheduler(self):
"""
Start periodic cleanup scheduler
Runs cleanup task at specified intervals
"""
logger.info(f"Starting cleanup scheduler (interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)")
while True:
try:
db = SessionLocal()
try:
await self.cleanup_expired_files(db)
finally:
db.close()
except Exception as e:
logger.error(f"Error in cleanup scheduler: {e}")
# Wait for next interval
await asyncio.sleep(self.cleanup_interval)
# Global task manager instance
task_manager = BackgroundTaskManager()
def process_batch_files_with_retry(
batch_id: int,
lang: str,
detect_layout: bool,
db: Session
):
"""
Process all files in a batch with retry logic
Args:
batch_id: Batch ID
lang: Language code
detect_layout: Whether to detect layout
db: Database session
"""
try:
# Get batch
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
logger.error(f"Batch {batch_id} not found")
return
# Update batch status
batch.status = BatchStatus.PROCESSING
batch.started_at = datetime.utcnow()
db.commit()
# Get pending files
files = db.query(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.PENDING
).all()
logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")
# Process each file with retry
for ocr_file in files:
success = task_manager.process_single_file_with_retry(
ocr_file=ocr_file,
batch_id=batch_id,
lang=lang,
detect_layout=detect_layout,
db=db
)
# Update batch progress
if success:
batch.completed_files += 1
else:
batch.failed_files += 1
db.commit()
# Update batch final status
if batch.failed_files == 0:
batch.status = BatchStatus.COMPLETED
elif batch.completed_files > 0:
batch.status = BatchStatus.PARTIAL
else:
batch.status = BatchStatus.FAILED
batch.completed_at = datetime.utcnow()
db.commit()
logger.info(
f"Batch {batch_id} processing complete: "
f"{batch.completed_files} succeeded, {batch.failed_files} failed"
)
except Exception as e:
logger.error(f"Fatal error processing batch {batch_id}: {e}")
try:
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if batch:
batch.status = BatchStatus.FAILED
batch.completed_at = datetime.utcnow()
db.commit()
except Exception as commit_error:
logger.error(f"Error updating batch status: {commit_error}")


@@ -0,0 +1,512 @@
"""
Tool_OCR - Export Service
Handles OCR result export in multiple formats with filtering and formatting rules
"""
import json
import logging
import shutil
import zipfile
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime
import pandas as pd
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
logger = logging.getLogger(__name__)
class ExportError(Exception):
"""Exception raised for export errors"""
pass
class ExportService:
"""
Export service for OCR results
Supported formats:
- TXT: Plain text export
- JSON: Full metadata export
- Excel: Tabular data export
- Markdown: Direct Markdown export
- PDF: Layout-preserved PDF export
- ZIP: Batch export archive
"""
def __init__(self):
"""Initialize export service"""
self.pdf_generator = PDFGenerator()
def apply_filters(
self,
results: List[OCRResult],
filters: Dict[str, Any]
) -> List[OCRResult]:
"""
Apply filters to OCR results
Args:
results: List of OCR results
filters: Filter configuration
- confidence_threshold: Minimum confidence (0.0-1.0)
- filename_pattern: Case-insensitive substring to match in the filename
- language: Filter by detected language
Returns:
List[OCRResult]: Filtered results
"""
filtered = results
# Confidence threshold filter
if "confidence_threshold" in filters:
threshold = filters["confidence_threshold"]
filtered = [r for r in filtered if r.average_confidence is not None and r.average_confidence >= threshold]
# Filename pattern filter (using simple substring match)
if "filename_pattern" in filters:
pattern = filters["filename_pattern"].lower()
filtered = [
r for r in filtered
if pattern in r.file.original_filename.lower()
]
# Language filter
if "language" in filters:
lang = filters["language"]
filtered = [r for r in filtered if r.detected_language == lang]
return filtered
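# Example filter configuration (all values illustrative):
#
#     filters = {
#         "confidence_threshold": 0.8,    # keep results with >= 80% average confidence
#         "filename_pattern": "invoice",  # case-insensitive substring match
#         "language": "ch",
#     }
#     filtered = export_service.apply_filters(results, filters)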
def export_to_txt(
self,
results: List[OCRResult],
output_path: Path,
formatting: Optional[Dict] = None
) -> Path:
"""
Export results to plain text file
Args:
results: List of OCR results
output_path: Output file path
formatting: Formatting options
- add_line_numbers: Add line numbers
- group_by_filename: Group text by source file
- include_metadata: Add file metadata headers
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
formatting = formatting or {}
output_lines = []
for idx, result in enumerate(results, 1):
# Read Markdown file
if not result.markdown_path or not Path(result.markdown_path).exists():
logger.warning(f"Markdown file not found for result {result.id}")
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add metadata header if requested
if formatting.get("include_metadata", False):
output_lines.append(f"=" * 80)
output_lines.append(f"文件: {result.file.original_filename}")
output_lines.append(f"語言: {result.detected_language or '未知'}")
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
output_lines.append(f"=" * 80)
output_lines.append("")
# Add content with optional line numbers
if formatting.get("add_line_numbers", False):
for line_num, line in enumerate(markdown_content.split('\n'), 1):
output_lines.append(f"{line_num:4d} | {line}")
else:
output_lines.append(markdown_content)
# Add separator between files if grouping
if formatting.get("group_by_filename", False) and idx < len(results):
output_lines.append("\n" + "-" * 80 + "\n")
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(output_lines), encoding="utf-8")
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"TXT export failed: {str(e)}")
def export_to_json(
self,
results: List[OCRResult],
output_path: Path,
include_layout: bool = True,
include_images: bool = True
) -> Path:
"""
Export results to JSON file with full metadata
Args:
results: List of OCR results
output_path: Output file path
include_layout: Include layout data
include_images: Include images metadata
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
export_data = {
"export_time": datetime.utcnow().isoformat(),
"total_files": len(results),
"results": []
}
for result in results:
result_data = {
"file_id": result.file.id,
"filename": result.file.original_filename,
"file_format": result.file.file_format,
"file_size": result.file.file_size,
"processing_time": result.file.processing_time,
"detected_language": result.detected_language,
"total_text_regions": result.total_text_regions,
"average_confidence": result.average_confidence,
"markdown_path": result.markdown_path,
}
# Include layout data if requested
if include_layout and result.layout_data:
result_data["layout_data"] = result.layout_data
# Include images metadata if requested
if include_images and result.images_metadata:
result_data["images_metadata"] = result.images_metadata
export_data["results"].append(result_data)
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(export_data, ensure_ascii=False, indent=2),
encoding="utf-8"
)
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"JSON export failed: {str(e)}")
def export_to_excel(
self,
results: List[OCRResult],
output_path: Path,
include_confidence: bool = True,
include_processing_time: bool = True
) -> Path:
"""
Export results to Excel file
Args:
results: List of OCR results
output_path: Output file path
include_confidence: Include confidence scores
include_processing_time: Include processing time
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
rows = []
for result in results:
# Read Markdown content
text_content = ""
if result.markdown_path and Path(result.markdown_path).exists():
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
row = {
"文件名": result.file.original_filename,
"格式": result.file.file_format,
"大小(字節)": result.file.file_size,
"語言": result.detected_language or "未知",
"文本區域數": result.total_text_regions,
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
}
if include_confidence:
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
if include_processing_time:
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
rows.append(row)
# Create DataFrame and export
df = pd.DataFrame(rows)
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Excel export failed: {str(e)}")
def export_to_markdown(
self,
results: List[OCRResult],
output_path: Path,
combine: bool = True
) -> Path:
"""
Export results to Markdown file(s)
Args:
results: List of OCR results
output_path: Output file path (or directory if not combining)
combine: Combine all results into one file
Returns:
Path: Output file/directory path
Raises:
ExportError: If export fails
"""
try:
if combine:
# Combine all Markdown files into one
combined_content = []
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add header
combined_content.append(f"# {result.file.original_filename}\n")
combined_content.append(markdown_content)
combined_content.append("\n---\n") # Separator
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(combined_content), encoding="utf-8")
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
return output_path
else:
# Export each result to separate file
output_path.mkdir(parents=True, exist_ok=True)
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
# Copy Markdown file to output directory
src_path = Path(result.markdown_path)
dst_path = output_path / f"{result.file.original_filename}.md"
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Markdown export failed: {str(e)}")
def export_to_pdf(
self,
result: OCRResult,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Export single result to PDF with layout preservation
Args:
result: OCR result
output_path: Output PDF path
css_template: CSS template name or custom CSS
metadata: Optional PDF metadata
Returns:
Path: Output PDF path
Raises:
ExportError: If export fails
"""
try:
if not result.markdown_path or not Path(result.markdown_path).exists():
raise ExportError(f"Markdown file not found for result {result.id}")
markdown_path = Path(result.markdown_path)
# Prepare metadata
pdf_metadata = metadata or {}
if "title" not in pdf_metadata:
pdf_metadata["title"] = result.file.original_filename
# Generate PDF
self.pdf_generator.generate_pdf(
markdown_path=markdown_path,
output_path=output_path,
css_template=css_template,
metadata=pdf_metadata
)
logger.info(f"Exported result {result.id} to PDF: {output_path}")
return output_path
except PDFGenerationError as e:
raise ExportError(f"PDF generation failed: {str(e)}")
except Exception as e:
raise ExportError(f"PDF export failed: {str(e)}")
def export_batch_to_zip(
self,
db: Session,
batch_id: int,
output_path: Path,
include_formats: Optional[List[str]] = None
) -> Path:
"""
Export entire batch to ZIP archive
Args:
db: Database session
batch_id: Batch ID
output_path: Output ZIP path
include_formats: List of formats to include (markdown, json, txt, excel, pdf)
Returns:
Path: Output ZIP path
Raises:
ExportError: If export fails
"""
try:
include_formats = include_formats or ["markdown", "json"]
# Get batch and results
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
raise ExportError(f"Batch {batch_id} not found")
results = db.query(OCRResult).join(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.COMPLETED
).all()
if not results:
raise ExportError(f"No completed results found for batch {batch_id}")
# Create temporary export directory
temp_dir = output_path.parent / f"temp_export_{batch_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
try:
# Export in requested formats
if "markdown" in include_formats:
md_dir = temp_dir / "markdown"
self.export_to_markdown(results, md_dir, combine=False)
if "json" in include_formats:
json_path = temp_dir / "batch_results.json"
self.export_to_json(results, json_path)
if "txt" in include_formats:
txt_path = temp_dir / "batch_results.txt"
self.export_to_txt(results, txt_path)
if "excel" in include_formats:
excel_path = temp_dir / "batch_results.xlsx"
self.export_to_excel(results, excel_path)
# Create ZIP archive
output_path.parent.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in temp_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(temp_dir)
zipf.write(file_path, arcname)
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
return output_path
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
except Exception as e:
raise ExportError(f"Batch ZIP export failed: {str(e)}")
def apply_export_rule(
self,
db: Session,
results: List[OCRResult],
rule_id: int
) -> List[OCRResult]:
"""
Apply export rule to filter and format results
Args:
db: Database session
results: List of OCR results
rule_id: Export rule ID
Returns:
List[OCRResult]: Filtered results
Raises:
ExportError: If rule not found
"""
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
if not rule:
raise ExportError(f"Export rule {rule_id} not found")
config = rule.config_json
# Apply filters
if "filters" in config:
results = self.apply_filters(results, config["filters"])
# Note: Formatting options are applied in individual export methods
return results
def get_export_formats(self) -> Dict[str, str]:
"""
Get available export formats
Returns:
Dict mapping format codes to descriptions
"""
return {
"txt": "純文本格式 (.txt)",
"json": "JSON 格式 - 包含完整元數據 (.json)",
"excel": "Excel 表格格式 (.xlsx)",
"markdown": "Markdown 格式 (.md)",
"pdf": "版面保留 PDF 格式 (.pdf)",
"zip": "批次打包格式 (.zip)",
}
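# Usage sketch, assuming completed OCR results exist for the batch
# (batch_id and the output path are placeholders):
if __name__ == "__main__":
    from app.core.database import SessionLocal

    session = SessionLocal()
    try:
        service = ExportService()
        service.export_batch_to_zip(
            db=session,
            batch_id=1,
            output_path=Path("exports/batch_1.zip"),
            include_formats=["markdown", "json", "txt"],
        )
    finally:
        session.close()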


@@ -0,0 +1,420 @@
"""
Tool_OCR - File Management Service
Handles file uploads, storage, validation, and cleanup
"""
import logging
import shutil
import uuid
from pathlib import Path
from typing import List, Tuple, Optional
from datetime import datetime, timedelta
from fastapi import UploadFile
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, FileStatus
from app.services.preprocessor import DocumentPreprocessor
logger = logging.getLogger(__name__)
class FileManagementError(Exception):
"""Exception raised for file management errors"""
pass
class FileManager:
"""
File management service for upload, storage, and cleanup
Directory structure:
uploads/
├── batches/
│ └── {batch_id}/
│ ├── inputs/ # Original uploaded files
│ ├── outputs/ # OCR results
│ │ ├── markdown/ # Markdown files
│ │ ├── json/ # JSON files
│ │ └── images/ # Extracted images
│ └── exports/ # Export files (PDF, Excel, etc.)
"""
def __init__(self):
"""Initialize file manager"""
self.preprocessor = DocumentPreprocessor()
self.base_upload_dir = Path(settings.upload_dir)
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
def create_batch_directory(self, batch_id: int) -> Path:
"""
Create directory structure for a batch
Args:
batch_id: Batch ID
Returns:
Path: Batch directory path
"""
batch_dir = self.base_upload_dir / "batches" / str(batch_id)
# Create subdirectories
(batch_dir / "inputs").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "markdown").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "json").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "images").mkdir(parents=True, exist_ok=True)
(batch_dir / "exports").mkdir(parents=True, exist_ok=True)
logger.info(f"Created batch directory: {batch_dir}")
return batch_dir
def get_batch_directory(self, batch_id: int) -> Path:
"""
Get batch directory path
Args:
batch_id: Batch ID
Returns:
Path: Batch directory path
"""
return self.base_upload_dir / "batches" / str(batch_id)
def validate_upload(self, file: UploadFile) -> Tuple[bool, Optional[str]]:
"""
Validate uploaded file before saving
Args:
file: Uploaded file
Returns:
Tuple of (is_valid, error_message)
"""
# Check filename
if not file.filename:
return False, "文件名不能為空"
# Check file size (read content size)
file.file.seek(0, 2) # Seek to end
file_size = file.file.tell()
file.file.seek(0) # Reset to beginning
if file_size == 0:
return False, "文件為空"
if file_size > settings.max_upload_size:
max_mb = settings.max_upload_size / (1024 * 1024)
return False, f"文件大小超過限制 ({max_mb:.0f}MB)"
# Check file extension
file_ext = Path(file.filename).suffix.lower()
allowed_extensions = {'.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.ppt', '.pptx'}
if file_ext not in allowed_extensions:
return False, f"不支持的文件格式 ({file_ext}),僅支持: {', '.join(allowed_extensions)}"
return True, None
def save_upload(
self,
file: UploadFile,
batch_id: int,
validate: bool = True
) -> Tuple[Path, str]:
"""
Save uploaded file to batch directory
Args:
file: Uploaded file
batch_id: Batch ID
validate: Whether to validate file
Returns:
Tuple of (file_path, original_filename)
Raises:
FileManagementError: If file validation or saving fails
"""
# Validate if requested
if validate:
is_valid, error_msg = self.validate_upload(file)
if not is_valid:
raise FileManagementError(error_msg)
# Generate unique filename to avoid conflicts
original_filename = file.filename
file_ext = Path(original_filename).suffix
unique_filename = f"{uuid.uuid4()}{file_ext}"
# Get batch input directory
batch_dir = self.get_batch_directory(batch_id)
input_dir = batch_dir / "inputs"
input_dir.mkdir(parents=True, exist_ok=True)
# Save file
file_path = input_dir / unique_filename
try:
with file_path.open("wb") as buffer:
shutil.copyfileobj(file.file, buffer)
logger.info(f"Saved upload: {file_path} (original: {original_filename})")
return file_path, original_filename
except Exception as e:
# Clean up partial file if exists
file_path.unlink(missing_ok=True)
raise FileManagementError(f"保存文件失敗: {str(e)}")
def validate_saved_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
"""
Validate saved file using preprocessor
Args:
file_path: Path to saved file
Returns:
Tuple of (is_valid, error_message, detected_format)
"""
return self.preprocessor.validate_file(file_path)
def create_batch(
self,
db: Session,
user_id: int,
batch_name: Optional[str] = None
) -> OCRBatch:
"""
Create new OCR batch
Args:
db: Database session
user_id: User ID
batch_name: Optional batch name
Returns:
OCRBatch: Created batch object
"""
# Create batch record
batch = OCRBatch(
user_id=user_id,
batch_name=batch_name or f"Batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)
db.add(batch)
db.commit()
db.refresh(batch)
# Create directory structure
self.create_batch_directory(batch.id)
logger.info(f"Created batch: {batch.id} for user {user_id}")
return batch
def add_file_to_batch(
self,
db: Session,
batch_id: int,
file: UploadFile
) -> OCRFile:
"""
Add file to batch and save to disk
Args:
db: Database session
batch_id: Batch ID
file: Uploaded file
Returns:
OCRFile: Created file record
Raises:
FileManagementError: If file operations fail
"""
# Save file to disk
file_path, original_filename = self.save_upload(file, batch_id)
# Validate saved file
is_valid, error_msg, detected_format = self.validate_saved_file(file_path)
# Create file record
ocr_file = OCRFile(
batch_id=batch_id,
filename=file_path.name,
original_filename=original_filename,
file_path=str(file_path),
file_size=file_path.stat().st_size,
file_format=detected_format or Path(original_filename).suffix.lower().lstrip('.'),
status=FileStatus.PENDING if is_valid else FileStatus.FAILED,
error_message=error_msg if not is_valid else None
)
db.add(ocr_file)
# Update batch total_files count
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if batch:
batch.total_files += 1
if not is_valid:
batch.failed_files += 1
db.commit()
db.refresh(ocr_file)
logger.info(f"Added file to batch {batch_id}: {ocr_file.id} (status: {ocr_file.status})")
return ocr_file
def add_files_to_batch(
self,
db: Session,
batch_id: int,
files: List[UploadFile]
) -> List[OCRFile]:
"""
Add multiple files to batch
Args:
db: Database session
batch_id: Batch ID
files: List of uploaded files
Returns:
List[OCRFile]: List of created file records
"""
ocr_files = []
for file in files:
try:
ocr_file = self.add_file_to_batch(db, batch_id, file)
ocr_files.append(ocr_file)
except FileManagementError as e:
logger.error(f"Failed to add file {file.filename} to batch {batch_id}: {e}")
# Continue with other files
continue
return ocr_files
def get_file_paths(self, batch_id: int, file_id: int) -> dict:
"""
Get all paths for a file in a batch
Args:
batch_id: Batch ID
file_id: File ID
Returns:
Dict containing all relevant paths
"""
batch_dir = self.get_batch_directory(batch_id)
return {
"input_dir": batch_dir / "inputs",
"output_dir": batch_dir / "outputs",
"markdown_dir": batch_dir / "outputs" / "markdown",
"json_dir": batch_dir / "outputs" / "json",
"images_dir": batch_dir / "outputs" / "images" / str(file_id),
"export_dir": batch_dir / "exports",
}
def cleanup_expired_batches(self, db: Session, retention_hours: int = 24) -> int:
"""
Clean up expired batch files
Args:
db: Database session
retention_hours: Number of hours to retain files
Returns:
int: Number of batches cleaned up
"""
cutoff_time = datetime.utcnow() - timedelta(hours=retention_hours)
# Find expired batches
expired_batches = db.query(OCRBatch).filter(
OCRBatch.created_at < cutoff_time
).all()
cleaned_count = 0
for batch in expired_batches:
try:
# Delete batch directory
batch_dir = self.get_batch_directory(batch.id)
if batch_dir.exists():
shutil.rmtree(batch_dir)
logger.info(f"Deleted batch directory: {batch_dir}")
# Delete database records (cascade will handle related records)
db.delete(batch)
cleaned_count += 1
except Exception as e:
logger.error(f"Failed to cleanup batch {batch.id}: {e}")
continue
if cleaned_count > 0:
db.commit()
logger.info(f"Cleaned up {cleaned_count} expired batches")
return cleaned_count
def verify_file_ownership(
self,
db: Session,
user_id: int,
batch_id: int
) -> bool:
"""
Verify user owns the batch
Args:
db: Database session
user_id: User ID
batch_id: Batch ID
Returns:
bool: True if user owns batch, False otherwise
"""
batch = db.query(OCRBatch).filter(
OCRBatch.id == batch_id,
OCRBatch.user_id == user_id
).first()
return batch is not None
def get_batch_statistics(self, db: Session, batch_id: int) -> dict:
"""
Get statistics for a batch
Args:
db: Database session
batch_id: Batch ID
Returns:
Dict containing batch statistics
"""
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
return {}
# Calculate total file size
total_size = sum(f.file_size for f in batch.files)
# Calculate processing time
processing_time = None
if batch.completed_at and batch.started_at:
processing_time = (batch.completed_at - batch.started_at).total_seconds()
return {
"batch_id": batch.id,
"batch_name": batch.batch_name,
"status": batch.status,
"total_files": batch.total_files,
"completed_files": batch.completed_files,
"failed_files": batch.failed_files,
"pending_files": batch.total_files - batch.completed_files - batch.failed_files,
"progress_percentage": batch.progress_percentage,
"total_file_size": total_size,
"total_file_size_mb": round(total_size / (1024 * 1024), 2),
"created_at": batch.created_at.isoformat(),
"started_at": batch.started_at.isoformat() if batch.started_at else None,
"completed_at": batch.completed_at.isoformat() if batch.completed_at else None,
"processing_time": processing_time,
}
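# Usage sketch for the upload flow (illustrative only: `db` is an open
# SQLAlchemy session and `upload_files` a list of FastAPI UploadFile objects
# taken from a request):
#
#     manager = FileManager()
#     batch = manager.create_batch(db, user_id=1, batch_name="demo")
#     ocr_files = manager.add_files_to_batch(db, batch.id, upload_files)
#     stats = manager.get_batch_statistics(db, batch.id)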


@@ -0,0 +1,516 @@
"""
Tool_OCR - Core OCR Service
PaddleOCR-VL integration for text and structure extraction
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import uuid
from paddleocr import PaddleOCR, PPStructureV3
from PIL import Image
from pdf2image import convert_from_path
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
logger = logging.getLogger(__name__)
class OCRService:
"""
Core OCR service using PaddleOCR-VL
Handles text recognition and document structure analysis
"""
def __init__(self):
"""Initialize PaddleOCR and PPStructure engines"""
self.ocr_languages = settings.ocr_languages_list
self.confidence_threshold = settings.ocr_confidence_threshold
# Initialize PaddleOCR engine (will be lazy-loaded per language)
self.ocr_engines = {}
# Initialize PP-Structure for layout analysis
self.structure_engine = None
# Initialize Office document converter
self.office_converter = OfficeConverter()
logger.info("OCR Service initialized")
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
"""
Get or create OCR engine for specified language
Args:
lang: Language code (ch, en, japan, korean, etc.)
Returns:
PaddleOCR engine instance
"""
if lang not in self.ocr_engines:
logger.info(f"Initializing PaddleOCR engine for language: {lang}")
self.ocr_engines[lang] = PaddleOCR(
use_angle_cls=True,
lang=lang,
# Note: show_log and use_gpu parameters removed in PaddleOCR 3.x
)
logger.info(f"PaddleOCR engine ready for {lang}")
return self.ocr_engines[lang]
def get_structure_engine(self) -> PPStructureV3:
"""
Get or create PP-Structure engine for layout analysis
Returns:
PPStructure engine instance
"""
if self.structure_engine is None:
logger.info("Initializing PP-StructureV3 engine")
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=True,
use_formula_recognition=True,
layout_threshold=0.5,
)
logger.info("PP-StructureV3 engine ready")
return self.structure_engine
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
"""
Convert PDF to images (one per page)
Args:
pdf_path: Path to PDF file
output_dir: Directory to save converted images
Returns:
List of paths to converted images
"""
try:
output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Converting PDF {pdf_path.name} to images")
# Convert PDF to images (300 DPI for good quality)
images = convert_from_path(
str(pdf_path),
dpi=300,
fmt='png'
)
image_paths = []
for i, image in enumerate(images):
# Save each page as PNG
image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
image.save(str(image_path), 'PNG')
image_paths.append(image_path)
logger.info(f"Saved page {i+1} to {image_path.name}")
logger.info(f"Converted {len(image_paths)} pages from PDF")
return image_paths
except Exception as e:
logger.error(f"PDF conversion error: {str(e)}")
raise
def process_image(
self,
image_path: Path,
lang: str = 'ch',
detect_layout: bool = True,
confidence_threshold: Optional[float] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
Args:
image_path: Path to image file
lang: Language for OCR
detect_layout: Whether to perform layout analysis
confidence_threshold: Minimum confidence threshold (uses default if None)
Returns:
Dictionary with OCR results and metadata
"""
start_time = datetime.now()
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
try:
# Check if file is Office document
if self.office_converter.is_office_document(image_path):
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
try:
# Convert Office document to PDF
pdf_path = self.office_converter.convert_to_pdf(image_path)
logger.info(f"Office document converted to PDF: {pdf_path.name}")
# Process the PDF (will be handled by PDF processing logic below)
image_path = pdf_path
except OfficeConverterError as e:
logger.error(f"Office conversion failed: {str(e)}")
raise
# Check if file is PDF
is_pdf = image_path.suffix.lower() == '.pdf'
if is_pdf:
# Convert PDF to images
logger.info(f"Detected PDF file: {image_path.name}, converting to images")
pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)
# Process all pages
all_text_regions = []
total_confidence_sum = 0.0
total_valid_regions = 0
all_layout_data = []
all_images_metadata = []
for page_num, page_image_path in enumerate(image_paths, 1):
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
# Process each page
page_result = self.process_image(
page_image_path,
lang=lang,
detect_layout=detect_layout,
confidence_threshold=confidence_threshold
)
# Accumulate results
if page_result['status'] == 'success':
# Add page number to each text region
for region in page_result['text_regions']:
region['page'] = page_num
all_text_regions.append(region)
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
total_valid_regions += page_result['total_text_regions']
# Accumulate layout data
if page_result.get('layout_data'):
all_layout_data.append(page_result['layout_data'])
# Accumulate images metadata
if page_result.get('images_metadata'):
all_images_metadata.extend(page_result['images_metadata'])
# Calculate overall average confidence
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
# Combine layout data from all pages
combined_layout = None
if all_layout_data:
combined_elements = []
for layout in all_layout_data:
if layout.get('elements'):
combined_elements.extend(layout['elements'])
if combined_elements:
combined_layout = {
'elements': combined_elements,
'total_elements': len(combined_elements),
'reading_order': list(range(len(combined_elements))),
}
# Generate combined markdown
markdown_content = self.generate_markdown(all_text_regions, combined_layout)
# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds()
logger.info(
f"PDF processing completed: {image_path.name} - "
f"{len(image_paths)} pages, "
f"{len(all_text_regions)} regions, "
f"{avg_confidence:.2f} avg confidence, "
f"{processing_time:.2f}s"
)
return {
'status': 'success',
'file_name': image_path.name,
'language': lang,
'text_regions': all_text_regions,
'total_text_regions': len(all_text_regions),
'average_confidence': avg_confidence,
'layout_data': combined_layout,
'images_metadata': all_images_metadata,
'markdown_content': markdown_content,
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
'total_pages': len(image_paths),
}
# Get OCR engine (for non-PDF images)
ocr_engine = self.get_ocr_engine(lang)
# Perform OCR
logger.info(f"Processing image: {image_path.name}")
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
ocr_results = ocr_engine.ocr(str(image_path))
# Parse OCR results (PaddleOCR 3.x format)
text_regions = []
total_confidence = 0.0
valid_regions = 0
if ocr_results and isinstance(ocr_results, (list, tuple)):
# PaddleOCR 3.x returns a list of dictionaries (one per page)
for page_result in ocr_results:
if isinstance(page_result, dict):
# New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
texts = page_result.get('rec_texts', [])
scores = page_result.get('rec_scores', [])
polys = page_result.get('rec_polys', [])
# Process each recognized text
for idx, text in enumerate(texts):
# Get corresponding score and bbox
confidence = scores[idx] if idx < len(scores) else 1.0
bbox = polys[idx] if idx < len(polys) else []
# Convert numpy array bbox to list for JSON serialization
if hasattr(bbox, 'tolist'):
bbox = bbox.tolist()
# Filter by confidence threshold
if confidence >= threshold:
text_regions.append({
'text': text,
'bbox': bbox,
'confidence': float(confidence),
})
total_confidence += confidence
valid_regions += 1
avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")
# Layout analysis (if requested)
layout_data = None
images_metadata = []
if detect_layout:
layout_data, images_metadata = self.analyze_layout(image_path)
# Generate Markdown
markdown_content = self.generate_markdown(text_regions, layout_data)
# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds()
result = {
'status': 'success',
'file_name': image_path.name,
'language': lang,
'text_regions': text_regions,
'total_text_regions': len(text_regions),
'average_confidence': avg_confidence,
'layout_data': layout_data,
'images_metadata': images_metadata,
'markdown_content': markdown_content,
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
}
logger.info(
f"OCR completed: {image_path.name} - "
f"{len(text_regions)} regions, "
f"{avg_confidence:.2f} avg confidence, "
f"{processing_time:.2f}s"
)
return result
except Exception as e:
import traceback
error_trace = traceback.format_exc()
logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
return {
'status': 'error',
'file_name': image_path.name,
'error_message': str(e),
'processing_time': (datetime.now() - start_time).total_seconds(),
}
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3
Args:
image_path: Path to image file
Returns:
Tuple of (layout_data, images_metadata)
"""
try:
structure_engine = self.get_structure_engine()
# Perform structure analysis using predict() method (PaddleOCR 3.x API)
logger.info(f"Running layout analysis on {image_path.name}")
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
# Process each page result (for images, usually just one page)
for page_idx, page_result in enumerate(results):
# Get markdown dictionary from result object
if hasattr(page_result, 'markdown'):
markdown_dict = page_result.markdown
logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")
# Extract layout information from markdown structure
if isinstance(markdown_dict, dict):
# Get markdown texts (HTML format with tables and structure)
markdown_texts = markdown_dict.get('markdown_texts', '')
markdown_images = markdown_dict.get('markdown_images', {})
# Create a layout element for the structured content
if markdown_texts:
# Identify tables vs. plain text with a simple substring check (no HTML parsing needed)
has_table = '<table' in markdown_texts.lower()
element = {
'element_id': len(layout_elements),
'type': 'table' if has_table else 'text',
'content': markdown_texts,
'page': page_idx,
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
}
layout_elements.append(element)
# Add image metadata
for img_idx, img_path in enumerate(markdown_images.keys()):
images_metadata.append({
'element_id': len(layout_elements) + img_idx,
'image_path': img_path,
'type': 'image',
'page': page_idx,
'bbox': [],
})
if layout_elements:
layout_data = {
'elements': layout_elements,
'total_elements': len(layout_elements),
'reading_order': list(range(len(layout_elements))),
}
logger.info(f"Detected {len(layout_elements)} layout elements")
return layout_data, images_metadata
else:
logger.warning("No layout elements detected")
return None, []
except Exception as e:
import traceback
error_trace = traceback.format_exc()
logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
return None, []
def generate_markdown(
self,
text_regions: List[Dict],
layout_data: Optional[Dict] = None
) -> str:
"""
Generate Markdown from OCR results
Args:
text_regions: List of text regions with bbox and text
layout_data: Optional layout structure information
Returns:
Markdown formatted string
"""
markdown_lines = []
if layout_data and layout_data.get('elements'):
# Generate structured Markdown based on layout
for element in layout_data['elements']:
element_type = element.get('type', 'text')
content = element.get('content', '')
if element_type == 'title':
markdown_lines.append(f"# {content}\n")
elif element_type == 'table':
# Table in HTML format
markdown_lines.append(content)
markdown_lines.append("")
elif element_type == 'figure':
element_id = element.get('element_id')
markdown_lines.append(f"![Figure {element_id}](./images/img_{element_id}.jpg)\n")
else:
markdown_lines.append(f"{content}\n")
else:
# Simple Markdown from text regions only
# Sort by vertical position (top to bottom)
def get_y_coord(region):
"""Safely extract Y coordinate from bbox"""
bbox = region.get('bbox', [])
if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
return bbox[0][1] # [[x1,y1], [x2,y2], ...] format
elif len(bbox) > 1:
return bbox[1] # [x1, y1, x2, y2, ...] format
return 0 # Default to 0 if can't extract
sorted_regions = sorted(text_regions, key=get_y_coord)
for region in sorted_regions:
text = region['text']
markdown_lines.append(text)
return "\n".join(markdown_lines)
def save_results(
self,
result: Dict,
output_dir: Path,
file_id: str
) -> Tuple[Optional[Path], Optional[Path]]:
"""
Save OCR results to JSON and Markdown files
Args:
result: OCR result dictionary
output_dir: Output directory
file_id: Unique file identifier
Returns:
Tuple of (json_path, markdown_path)
"""
try:
output_dir.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_dir / f"{file_id}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
# Save Markdown
markdown_path = output_dir / f"{file_id}_output.md"
markdown_content = result.get('markdown_content', '')
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
return json_path, markdown_path
except Exception as e:
logger.error(f"Error saving results: {str(e)}")
return None, None
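# Minimal usage sketch ("sample.png" and the output directory are
# placeholders; assumes PaddleOCR models can be loaded locally):
if __name__ == "__main__":
    service = OCRService()
    result = service.process_image(Path("sample.png"), lang="ch", detect_layout=True)
    if result["status"] == "success":
        json_path, md_path = service.save_results(result, Path("output"), file_id="demo")
        print(f"Saved results to {json_path} and {md_path}")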


@@ -0,0 +1,210 @@
"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil
logger = logging.getLogger(__name__)
class OfficeConverterError(Exception):
"""Exception raised for Office conversion errors"""
pass
class OfficeConverter:
"""Convert Office documents to PDF for OCR processing"""
# Supported Office formats
OFFICE_FORMATS = {
'.doc': 'application/msword',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.ppt': 'application/vnd.ms-powerpoint',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
}
def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
"""
Initialize Office converter
Args:
libreoffice_path: Path to LibreOffice executable
"""
self.libreoffice_path = libreoffice_path
self._verify_libreoffice()
def _verify_libreoffice(self):
"""Verify LibreOffice is installed and accessible"""
if not Path(self.libreoffice_path).exists():
# Fall back to any soffice executable found on PATH (e.g. Homebrew installs)
alt_path = shutil.which("soffice")
if alt_path:
self.libreoffice_path = alt_path
logger.info(f"Using LibreOffice at: {alt_path}")
else:
raise OfficeConverterError(
"LibreOffice not found. Please install LibreOffice: brew install libreoffice"
)
def is_office_document(self, file_path: Path) -> bool:
"""
Check if file is an Office document
Args:
file_path: Path to file
Returns:
True if file is an Office document
"""
return file_path.suffix.lower() in self.OFFICE_FORMATS
def convert_to_pdf(self, office_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert Office document to PDF
Args:
office_path: Path to Office document
output_dir: Optional output directory (uses temp dir if not specified)
Returns:
Path to converted PDF file
Raises:
OfficeConverterError: If conversion fails
"""
if not office_path.exists():
raise OfficeConverterError(f"Office file not found: {office_path}")
if not self.is_office_document(office_path):
raise OfficeConverterError(
f"Unsupported format: {office_path.suffix}. "
f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
)
# Determine output directory
if output_dir is None:
output_dir = office_path.parent
else:
output_dir.mkdir(parents=True, exist_ok=True)
# Expected output PDF path
pdf_filename = office_path.stem + '.pdf'
output_pdf_path = output_dir / pdf_filename
# Remove existing PDF if present
if output_pdf_path.exists():
output_pdf_path.unlink()
logger.info(f"Converting {office_path.name} to PDF using LibreOffice")
try:
# Use LibreOffice headless mode for conversion
# --headless: Run without GUI
# --convert-to pdf: Convert to PDF format
# --outdir: Output directory
cmd = [
self.libreoffice_path,
'--headless',
'--convert-to', 'pdf',
'--outdir', str(output_dir),
str(office_path)
]
logger.debug(f"Running command: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=60 # 60 second timeout
)
if result.returncode != 0:
error_msg = result.stderr or result.stdout
raise OfficeConverterError(
f"LibreOffice conversion failed: {error_msg}"
)
# Verify PDF was created
if not output_pdf_path.exists():
raise OfficeConverterError(
f"PDF file not created at expected location: {output_pdf_path}"
)
logger.info(f"Successfully converted to PDF: {output_pdf_path.name}")
return output_pdf_path
except subprocess.TimeoutExpired:
raise OfficeConverterError(
f"Conversion timeout (60s) for file: {office_path.name}"
)
except Exception as e:
if isinstance(e, OfficeConverterError):
raise
raise OfficeConverterError(f"Conversion error: {str(e)}")
def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert DOCX to PDF
Args:
docx_path: Path to DOCX file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if docx_path.suffix.lower() != '.docx':
raise OfficeConverterError(f"Expected .docx file, got: {docx_path.suffix}")
return self.convert_to_pdf(docx_path, output_dir)
def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert legacy DOC to PDF
Args:
doc_path: Path to DOC file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if doc_path.suffix.lower() != '.doc':
raise OfficeConverterError(f"Expected .doc file, got: {doc_path.suffix}")
return self.convert_to_pdf(doc_path, output_dir)
def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert PPTX to PDF
Args:
pptx_path: Path to PPTX file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if pptx_path.suffix.lower() != '.pptx':
raise OfficeConverterError(f"Expected .pptx file, got: {pptx_path.suffix}")
return self.convert_to_pdf(pptx_path, output_dir)
def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert legacy PPT to PDF
Args:
ppt_path: Path to PPT file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if ppt_path.suffix.lower() != '.ppt':
raise OfficeConverterError(f"Expected .ppt file, got: {ppt_path.suffix}")
return self.convert_to_pdf(ppt_path, output_dir)
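# Usage sketch ("sample.docx" is a placeholder; requires LibreOffice at the
# default path or discoverable on PATH):
if __name__ == "__main__":
    converter = OfficeConverter()
    source = Path("sample.docx")
    if converter.is_office_document(source):
        pdf_path = converter.convert_to_pdf(source)
        print(f"Converted to {pdf_path}")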


@@ -0,0 +1,507 @@
"""
Tool_OCR - PDF Generator Service
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
"""
import html
import logging
import subprocess
from pathlib import Path
from typing import Optional, Dict
from datetime import datetime
from weasyprint import HTML, CSS
from markdown import markdown
from app.core.config import settings
logger = logging.getLogger(__name__)
class PDFGenerationError(Exception):
"""Exception raised when PDF generation fails"""
pass
class PDFGenerator:
"""
PDF generation service with layout preservation
Supports two generation methods:
1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command
2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion
"""
# Default CSS template for layout preservation
DEFAULT_CSS = """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
font-size: 11pt;
line-height: 1.6;
color: #333;
}
h1 {
font-size: 24pt;
font-weight: bold;
margin-top: 0;
margin-bottom: 12pt;
color: #000;
page-break-after: avoid;
}
h2 {
font-size: 18pt;
font-weight: bold;
margin-top: 18pt;
margin-bottom: 10pt;
color: #000;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
font-weight: bold;
margin-top: 14pt;
margin-bottom: 8pt;
color: #000;
page-break-after: avoid;
}
p {
margin: 0 0 10pt 0;
text-align: justify;
}
table {
width: 100%;
border-collapse: collapse;
margin: 12pt 0;
page-break-inside: avoid;
}
table th {
background-color: #f0f0f0;
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
font-weight: bold;
}
table td {
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
}
code {
font-family: "Courier New", monospace;
font-size: 10pt;
background-color: #f5f5f5;
padding: 2pt 4pt;
border-radius: 3px;
}
pre {
background-color: #f5f5f5;
border: 1px solid #ddd;
border-radius: 5px;
padding: 10pt;
overflow-x: auto;
page-break-inside: avoid;
}
pre code {
background-color: transparent;
padding: 0;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 12pt auto;
page-break-inside: avoid;
}
blockquote {
border-left: 4px solid #ddd;
padding-left: 12pt;
margin: 12pt 0;
color: #666;
font-style: italic;
}
ul, ol {
margin: 10pt 0;
padding-left: 20pt;
}
li {
margin: 5pt 0;
}
hr {
border: none;
border-top: 1px solid #ccc;
margin: 20pt 0;
}
.page-break {
page-break-after: always;
}
"""
# Academic paper template
ACADEMIC_CSS = """
@page {
size: A4;
margin: 2.5cm;
}
body {
font-family: "Times New Roman", "Noto Serif CJK SC", serif;
font-size: 12pt;
line-height: 1.8;
color: #000;
}
h1 {
font-size: 20pt;
text-align: center;
margin-bottom: 24pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
margin-top: 16pt;
margin-bottom: 10pt;
page-break-after: avoid;
}
p {
text-indent: 2em;
text-align: justify;
margin: 0 0 12pt 0;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt auto;
page-break-inside: avoid;
}
table caption {
font-weight: bold;
margin-bottom: 8pt;
}
"""
# Business report template
BUSINESS_CSS = """
@page {
size: A4;
margin: 2cm 2.5cm;
}
body {
font-family: "Arial", "Noto Sans CJK SC", sans-serif;
font-size: 11pt;
line-height: 1.5;
color: #333;
}
h1 {
font-size: 22pt;
color: #0066cc;
border-bottom: 3px solid #0066cc;
padding-bottom: 8pt;
margin-bottom: 20pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
color: #0066cc;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt 0;
}
table th {
background-color: #0066cc;
color: white;
padding: 10pt;
font-weight: bold;
}
table td {
border: 1px solid #ddd;
padding: 10pt;
}
table tr:nth-child(even) {
background-color: #f9f9f9;
}
"""
def __init__(self):
"""Initialize PDF generator"""
self.css_templates = {
"default": self.DEFAULT_CSS,
"academic": self.ACADEMIC_CSS,
"business": self.BUSINESS_CSS,
}
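# Usage sketch (paths are placeholders). css_template accepts a built-in
# template name or a raw CSS string; unknown names fall through to being
# treated as literal CSS via self.css_templates.get(css_template, css_template):
#
#     generator = PDFGenerator()
#     generator.generate_pdf(Path("in.md"), Path("out.pdf"), css_template="academic")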
def check_pandoc_available(self) -> bool:
"""
Check if Pandoc is installed and available
Returns:
bool: True if pandoc is available, False otherwise
"""
try:
result = subprocess.run(
["pandoc", "--version"],
capture_output=True,
text=True,
timeout=5
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError):
logger.warning("Pandoc not found or timed out")
return False
def generate_pdf_pandoc(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Generate PDF using Pandoc (preferred method)
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name or custom CSS string
metadata: Optional metadata dict (title, author, date)
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If PDF generation fails
"""
try:
# Create temporary CSS file
css_content = self.css_templates.get(css_template, css_template)
css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css"
css_file.write_text(css_content, encoding="utf-8")
# Build pandoc command
pandoc_cmd = [
"pandoc",
str(markdown_path),
"-o", str(output_path),
"--pdf-engine=weasyprint",
"--css", str(css_file),
"--standalone",
"--from=markdown+tables+fenced_code_blocks+footnotes",
]
# Add metadata if provided
if metadata:
if metadata.get("title"):
pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"])
if metadata.get("author"):
pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"])
if metadata.get("date"):
pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"])
# Execute pandoc
logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}")
result = subprocess.run(
pandoc_cmd,
capture_output=True,
text=True,
timeout=60 # 60 second timeout for large documents
)
# Clean up temporary CSS file
css_file.unlink(missing_ok=True)
if result.returncode != 0:
error_msg = f"Pandoc failed: {result.stderr}"
logger.error(error_msg)
raise PDFGenerationError(error_msg)
if not output_path.exists():
raise PDFGenerationError(f"PDF file not created: {output_path}")
logger.info(f"PDF generated successfully via Pandoc: {output_path}")
return output_path
        except subprocess.TimeoutExpired:
            css_file.unlink(missing_ok=True)
            raise PDFGenerationError("Pandoc execution timed out")
        except PDFGenerationError:
            # Already cleaned up and wrapped above; re-raise without double-wrapping
            raise
        except Exception as e:
            css_file.unlink(missing_ok=True)
            raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}")
def generate_pdf_weasyprint(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Generate PDF using WeasyPrint directly (fallback method)
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name or custom CSS string
metadata: Optional metadata dict (title, author, date)
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If PDF generation fails
"""
try:
# Read Markdown content
markdown_content = markdown_path.read_text(encoding="utf-8")
# Convert Markdown to HTML
html_content = markdown(
markdown_content,
extensions=[
'tables',
'fenced_code',
'codehilite',
'nl2br',
'sane_lists',
]
)
# Wrap HTML with proper structure
title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem
full_html = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{title}</title>
</head>
<body>
{html_content}
</body>
</html>
"""
# Get CSS content
css_content = self.css_templates.get(css_template, css_template)
# Generate PDF
logger.info(f"Generating PDF via WeasyPrint: {output_path}")
html = HTML(string=full_html, base_url=str(markdown_path.parent))
css = CSS(string=css_content)
html.write_pdf(str(output_path), stylesheets=[css])
if not output_path.exists():
raise PDFGenerationError(f"PDF file not created: {output_path}")
logger.info(f"PDF generated successfully via WeasyPrint: {output_path}")
return output_path
        except PDFGenerationError:
            raise
        except Exception as e:
            raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(e)}")
def generate_pdf(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None,
prefer_pandoc: bool = True
) -> Path:
"""
Generate PDF from Markdown with automatic fallback
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name ("default", "academic", "business") or custom CSS
metadata: Optional metadata dict (title, author, date)
prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If both methods fail
"""
if not markdown_path.exists():
raise PDFGenerationError(f"Markdown file not found: {markdown_path}")
# Ensure output directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
# Try Pandoc first if preferred and available
if prefer_pandoc and self.check_pandoc_available():
try:
return self.generate_pdf_pandoc(markdown_path, output_path, css_template, metadata)
except PDFGenerationError as e:
logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}")
# Fall through to WeasyPrint
# Use WeasyPrint (fallback or direct)
return self.generate_pdf_weasyprint(markdown_path, output_path, css_template, metadata)
def get_available_templates(self) -> Dict[str, str]:
"""
Get list of available CSS templates
Returns:
Dict mapping template names to descriptions
"""
return {
"default": "通用排版模板,適合大多數文檔",
"academic": "學術論文模板,適合研究報告",
"business": "商業報告模板,適合企業文檔",
}
def save_custom_template(self, template_name: str, css_content: str) -> None:
"""
Save a custom CSS template
Args:
template_name: Template name
css_content: CSS content
"""
self.css_templates[template_name] = css_content
logger.info(f"Custom CSS template saved: {template_name}")

View File

@@ -0,0 +1,230 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""
import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np
from app.core.config import settings
logger = logging.getLogger(__name__)
class DocumentPreprocessor:
"""
Document preprocessing service for format standardization
Validates and prepares documents for OCR processing
"""
SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
SUPPORTED_PDF_FORMAT = ['pdf']
ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT
def __init__(self):
self.allowed_extensions = settings.allowed_extensions_list
self.max_file_size = settings.max_upload_size
logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")
def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
"""
Validate file format, size, and integrity
Args:
file_path: Path to the file to validate
Returns:
Tuple of (is_valid, file_format, error_message)
"""
try:
# Check file exists
if not file_path.exists():
return False, None, f"File not found: {file_path}"
# Check file size
file_size = file_path.stat().st_size
if file_size > self.max_file_size:
max_mb = self.max_file_size / (1024 * 1024)
actual_mb = file_size / (1024 * 1024)
return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"
# Detect file format using magic numbers
mime = magic.Magic(mime=True)
mime_type = mime.from_file(str(file_path))
# Map MIME type to format
file_format = self._mime_to_format(mime_type)
if not file_format:
return False, None, f"Unsupported file type: {mime_type}"
# Check if format is in allowed extensions
if file_format not in self.allowed_extensions:
return False, None, f"File format '{file_format}' not allowed"
# Validate file integrity
is_valid, error = self._validate_integrity(file_path, file_format)
if not is_valid:
return False, file_format, f"File corrupted: {error}"
logger.info(f"File validated successfully: {file_path.name} ({file_format})")
return True, file_format, None
except Exception as e:
logger.error(f"File validation error: {str(e)}")
return False, None, f"Validation error: {str(e)}"
def _mime_to_format(self, mime_type: str) -> Optional[str]:
"""Convert MIME type to file format"""
mime_map = {
'image/png': 'png',
'image/jpeg': 'jpg',
'image/jpg': 'jpg',
'image/bmp': 'bmp',
'image/tiff': 'tiff',
'image/x-tiff': 'tiff',
'application/pdf': 'pdf',
'application/msword': 'doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
'application/vnd.ms-powerpoint': 'ppt',
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
}
return mime_map.get(mime_type)
def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
"""
Validate file integrity by attempting to open it
Args:
file_path: Path to file
file_format: Detected file format
Returns:
Tuple of (is_valid, error_message)
"""
try:
if file_format in self.SUPPORTED_IMAGE_FORMATS:
# Try to open image
with Image.open(file_path) as img:
img.verify() # Verify image integrity
                # Reopen for a real decode (verify() leaves the file unusable)
                with Image.open(file_path) as img:
                    img.load()  # Decode pixel data; reading .size alone would not catch corruption
return True, None
elif file_format == 'pdf':
# Basic PDF validation - check file starts with PDF signature
with open(file_path, 'rb') as f:
header = f.read(5)
if header != b'%PDF-':
return False, "Invalid PDF header"
return True, None
elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Office documents - structural validation only
                # Modern Office formats (docx, pptx) are ZIP archives
if file_format in ['docx', 'pptx']:
import zipfile
try:
with zipfile.ZipFile(file_path, 'r') as zf:
# Check if it has the required Office structure
if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
return False, "Invalid DOCX structure"
elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
return False, "Invalid PPTX structure"
except zipfile.BadZipFile:
return False, "Invalid Office file (corrupt ZIP)"
                # Legacy formats (doc, ppt) have no cheap integrity check; accept as-is
return True, None
else:
return False, f"Unknown format: {file_format}"
except Exception as e:
return False, str(e)
def preprocess_image(
self,
image_path: Path,
enhance: bool = True,
output_path: Optional[Path] = None
) -> Tuple[bool, Optional[Path], Optional[str]]:
"""
Preprocess image to improve OCR accuracy
Args:
image_path: Path to input image
enhance: Whether to apply enhancement
output_path: Optional output path (defaults to temp directory)
Returns:
Tuple of (success, processed_image_path, error_message)
"""
try:
# Read image
img = cv2.imread(str(image_path))
if img is None:
return False, None, "Failed to read image"
if not enhance:
# No preprocessing, return original
return True, image_path, None
# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply adaptive thresholding to handle varying lighting
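            # (Gaussian-weighted 11x11 neighbourhood, constant C=2 subtracted from the mean)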
processed = cv2.adaptiveThreshold(
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
11,
2
)
# Denoise
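            # (h=10 filter strength, 7x7 template window, 21x21 search window)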
processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
# Determine output path
if output_path is None:
output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"
# Save processed image
cv2.imwrite(str(output_path), processed)
logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
return True, output_path, None
except Exception as e:
logger.error(f"Image preprocessing error: {str(e)}")
return False, None, f"Preprocessing error: {str(e)}"
def get_file_info(self, file_path: Path) -> dict:
"""
Get comprehensive file information
Args:
file_path: Path to file
Returns:
Dictionary with file information
"""
stat = file_path.stat()
mime = magic.Magic(mime=True)
mime_type = mime.from_file(str(file_path))
return {
'name': file_path.name,
'path': str(file_path),
'size': stat.st_size,
'size_mb': stat.st_size / (1024 * 1024),
'mime_type': mime_type,
'format': self._mime_to_format(mime_type),
'created_at': stat.st_ctime,
'modified_at': stat.st_mtime,
}
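# Example usage (an illustrative sketch; the upload path is hypothetical):
#   preprocessor = DocumentPreprocessor()
#   is_valid, file_format, error = preprocessor.validate_file(Path("uploads/scan_001.png"))
#   if is_valid and file_format in DocumentPreprocessor.SUPPORTED_IMAGE_FORMATS:
#       ok, processed_path, err = preprocessor.preprocess_image(
#           Path("uploads/scan_001.png"), enhance=True
#       )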

View File

@@ -0,0 +1,282 @@
"""
Tool_OCR - Translation Service (RESERVED)
Abstract interface and stub implementation for future translation feature
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from enum import Enum
import logging
logger = logging.getLogger(__name__)
class TranslationEngine(str, Enum):
"""Supported translation engines"""
OFFLINE = "offline" # Argos Translate (offline)
ERNIE = "ernie" # Baidu ERNIE API
GOOGLE = "google" # Google Translate API
DEEPL = "deepl" # DeepL API
class LanguageCode(str, Enum):
"""Supported language codes"""
CHINESE = "zh"
ENGLISH = "en"
JAPANESE = "ja"
KOREAN = "ko"
FRENCH = "fr"
GERMAN = "de"
SPANISH = "es"
class TranslationServiceInterface(ABC):
"""
Abstract interface for translation services
This interface defines the contract for all translation engine implementations.
Future implementations should inherit from this class.
"""
@abstractmethod
def translate_text(
self,
text: str,
source_lang: str,
target_lang: str,
**kwargs
) -> str:
"""
Translate a single text string
Args:
text: Text to translate
source_lang: Source language code
target_lang: Target language code
**kwargs: Engine-specific parameters
Returns:
str: Translated text
"""
pass
@abstractmethod
def translate_document(
self,
markdown_content: str,
source_lang: str,
target_lang: str,
preserve_structure: bool = True,
**kwargs
    ) -> Dict[str, Any]:
"""
Translate a Markdown document while preserving structure
Args:
markdown_content: Markdown content to translate
source_lang: Source language code
target_lang: Target language code
preserve_structure: Whether to preserve markdown structure
**kwargs: Engine-specific parameters
Returns:
Dict containing:
- translated_content: Translated markdown
- metadata: Translation metadata (engine, time, etc.)
"""
pass
@abstractmethod
def batch_translate(
self,
texts: List[str],
source_lang: str,
target_lang: str,
**kwargs
) -> List[str]:
"""
Translate multiple texts in batch
Args:
texts: List of texts to translate
source_lang: Source language code
target_lang: Target language code
**kwargs: Engine-specific parameters
Returns:
List[str]: List of translated texts
"""
pass
@abstractmethod
def get_supported_languages(self) -> List[str]:
"""
Get list of supported language codes for this engine
Returns:
List[str]: List of supported language codes
"""
pass
@abstractmethod
def validate_config(self) -> bool:
"""
Validate engine configuration (API keys, model files, etc.)
Returns:
bool: True if configuration is valid
"""
pass
class TranslationEngineFactory:
"""
Factory for creating translation engine instances
RESERVED: This is a placeholder for future implementation.
When translation feature is implemented, this factory will instantiate
the appropriate translation engine based on configuration.
"""
@staticmethod
def create_engine(
engine_type: TranslationEngine,
config: Optional[Dict] = None
) -> TranslationServiceInterface:
"""
Create a translation engine instance
Args:
engine_type: Type of translation engine
config: Engine-specific configuration
Returns:
TranslationServiceInterface: Translation engine instance
Raises:
NotImplementedError: Always raised (stub implementation)
"""
raise NotImplementedError(
"Translation feature is not yet implemented. "
"This is a reserved placeholder for future development."
)
@staticmethod
def get_available_engines() -> List[str]:
"""
Get list of available translation engines
Returns:
List[str]: List of engine types (currently empty)
"""
return []
@staticmethod
def is_engine_available(engine_type: TranslationEngine) -> bool:
"""
Check if a specific engine is available
Args:
engine_type: Engine type to check
Returns:
bool: Always False (stub implementation)
"""
return False
class StubTranslationService:
"""
Stub translation service for API endpoints
This service provides placeholder responses for translation endpoints
until the feature is fully implemented.
"""
@staticmethod
    def get_feature_status() -> Dict[str, Any]:
"""
Get translation feature status
Returns:
Dict with feature status information
"""
return {
"available": False,
"status": "reserved",
"message": "Translation feature is reserved for future implementation",
"supported_engines": [],
"planned_engines": [
{
"type": "offline",
"name": "Argos Translate",
"description": "Offline neural translation",
"status": "planned"
},
{
"type": "ernie",
"name": "Baidu ERNIE",
"description": "Baidu AI translation API",
"status": "planned"
},
{
"type": "google",
"name": "Google Translate",
"description": "Google Cloud Translation API",
"status": "planned"
},
{
"type": "deepl",
"name": "DeepL",
"description": "DeepL translation API",
"status": "planned"
}
],
"roadmap": {
"phase": "Phase 5",
"priority": "low",
"implementation_after": "Production deployment and user feedback"
}
}
@staticmethod
def get_supported_languages() -> List[Dict[str, str]]:
"""
Get list of languages planned for translation support
Returns:
List of language info dicts
"""
return [
{"code": "zh", "name": "Chinese (Simplified)", "status": "planned"},
{"code": "en", "name": "English", "status": "planned"},
{"code": "ja", "name": "Japanese", "status": "planned"},
{"code": "ko", "name": "Korean", "status": "planned"},
{"code": "fr", "name": "French", "status": "planned"},
{"code": "de", "name": "German", "status": "planned"},
{"code": "es", "name": "Spanish", "status": "planned"},
]
# Example placeholder for future engine implementations:
#
# class ArgosTranslationEngine(TranslationServiceInterface):
# """Offline translation using Argos Translate"""
# def __init__(self, model_path: str):
# self.model_path = model_path
# # Initialize Argos models
#
# def translate_text(self, text, source_lang, target_lang, **kwargs):
# # Implementation here
# pass
#
# class ERNIETranslationEngine(TranslationServiceInterface):
# """Baidu ERNIE API translation"""
# def __init__(self, api_key: str, api_secret: str):
# self.api_key = api_key
# self.api_secret = api_secret
#
# def translate_text(self, text, source_lang, target_lang, **kwargs):
# # Implementation here
# pass
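#
# Example of probing the stub from an API layer (illustrative; create_engine
# raises NotImplementedError until a real engine ships):
#   status = StubTranslationService.get_feature_status()
#   if status["available"]:
#       engine = TranslationEngineFactory.create_engine(TranslationEngine.OFFLINE)
#   else:
#       logger.info(status["message"])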