first
This commit is contained in:
3
backend/app/services/__init__.py
Normal file
3
backend/app/services/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Tool_OCR - Services Package
|
||||
"""
|
||||
394
backend/app/services/background_tasks.py
Normal file
394
backend/app/services/background_tasks.py
Normal file
@@ -0,0 +1,394 @@
|
||||
"""
|
||||
Tool_OCR - Background Tasks Service
|
||||
Handles async processing, cleanup, and scheduled tasks
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable, Any
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
|
||||
from app.services.ocr_service import OCRService
|
||||
from app.services.file_manager import FileManager
|
||||
from app.services.pdf_generator import PDFGenerator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BackgroundTaskManager:
|
||||
"""
|
||||
Manages background tasks including retry logic, cleanup, and scheduled jobs
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_retries: int = 3,
|
||||
retry_delay: int = 5,
|
||||
cleanup_interval: int = 3600, # 1 hour
|
||||
file_retention_hours: int = 24
|
||||
):
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.cleanup_interval = cleanup_interval
|
||||
self.file_retention_hours = file_retention_hours
|
||||
self.ocr_service = OCRService()
|
||||
self.file_manager = FileManager()
|
||||
self.pdf_generator = PDFGenerator()
|
||||
|
||||
async def execute_with_retry(
|
||||
self,
|
||||
func: Callable,
|
||||
*args,
|
||||
max_retries: Optional[int] = None,
|
||||
retry_delay: Optional[int] = None,
|
||||
**kwargs
|
||||
) -> Any:
|
||||
"""
|
||||
Execute a function with retry logic
|
||||
|
||||
Args:
|
||||
func: Function to execute
|
||||
args: Positional arguments for func
|
||||
max_retries: Maximum retry attempts (overrides default)
|
||||
retry_delay: Delay between retries in seconds (overrides default)
|
||||
kwargs: Keyword arguments for func
|
||||
|
||||
Returns:
|
||||
Function result
|
||||
|
||||
Raises:
|
||||
Exception: If all retries are exhausted
|
||||
"""
|
||||
max_retries = max_retries or self.max_retries
|
||||
retry_delay = retry_delay or self.retry_delay
|
||||
|
||||
last_exception = None
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
if asyncio.iscoroutinefunction(func):
|
||||
return await func(*args, **kwargs)
|
||||
else:
|
||||
return func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
if attempt < max_retries:
|
||||
logger.warning(
|
||||
f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. "
|
||||
f"Retrying in {retry_delay}s..."
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
else:
|
||||
logger.error(
|
||||
f"All {max_retries + 1} attempts failed for {func.__name__}: {e}"
|
||||
)
|
||||
|
||||
raise last_exception
|
||||
|
||||
def process_single_file_with_retry(
|
||||
self,
|
||||
ocr_file: OCRFile,
|
||||
batch_id: int,
|
||||
lang: str,
|
||||
detect_layout: bool,
|
||||
db: Session
|
||||
) -> bool:
|
||||
"""
|
||||
Process a single file with retry logic
|
||||
|
||||
Args:
|
||||
ocr_file: OCRFile instance
|
||||
batch_id: Batch ID
|
||||
lang: Language code
|
||||
detect_layout: Whether to detect layout
|
||||
db: Database session
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise
|
||||
"""
|
||||
for attempt in range(self.max_retries + 1):
|
||||
try:
|
||||
# Update file status
|
||||
ocr_file.status = FileStatus.PROCESSING
|
||||
ocr_file.started_at = datetime.utcnow()
|
||||
ocr_file.retry_count = attempt
|
||||
db.commit()
|
||||
|
||||
# Get file paths
|
||||
file_path = Path(ocr_file.file_path)
|
||||
paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)
|
||||
|
||||
# Process OCR
|
||||
result = self.ocr_service.process_image(
|
||||
file_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout
|
||||
)
|
||||
|
||||
# Check if processing was successful
|
||||
if result['status'] != 'success':
|
||||
raise Exception(result.get('error_message', 'Unknown error during OCR processing'))
|
||||
|
||||
# Save results
|
||||
json_path, markdown_path = self.ocr_service.save_results(
|
||||
result=result,
|
||||
output_dir=paths["output_dir"],
|
||||
file_id=str(ocr_file.id)
|
||||
)
|
||||
|
||||
# Extract data from result
|
||||
text_regions = result.get('text_regions', [])
|
||||
layout_data = result.get('layout_data')
|
||||
images_metadata = result.get('images_metadata', [])
|
||||
|
||||
# Calculate average confidence (or use from result)
|
||||
avg_confidence = result.get('average_confidence')
|
||||
|
||||
# Create OCR result record
|
||||
ocr_result = OCRResult(
|
||||
file_id=ocr_file.id,
|
||||
markdown_path=str(markdown_path) if markdown_path else None,
|
||||
json_path=str(json_path) if json_path else None,
|
||||
images_dir=None, # Images dir not used in current implementation
|
||||
detected_language=lang,
|
||||
total_text_regions=len(text_regions),
|
||||
average_confidence=avg_confidence,
|
||||
layout_data=layout_data,
|
||||
images_metadata=images_metadata
|
||||
)
|
||||
db.add(ocr_result)
|
||||
|
||||
# Update file status
|
||||
ocr_file.status = FileStatus.COMPLETED
|
||||
ocr_file.completed_at = datetime.utcnow()
|
||||
ocr_file.processing_time = (ocr_file.completed_at - ocr_file.started_at).total_seconds()
|
||||
|
||||
db.commit()
|
||||
|
||||
logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")
|
||||
|
||||
if attempt < self.max_retries:
|
||||
# Wait before retry
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
# Final failure
|
||||
ocr_file.status = FileStatus.FAILED
|
||||
ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
|
||||
ocr_file.completed_at = datetime.utcnow()
|
||||
ocr_file.retry_count = attempt
|
||||
db.commit()
|
||||
return False
|
||||
|
||||
return False
|
||||
|
||||
async def cleanup_expired_files(self, db: Session):
|
||||
"""
|
||||
Clean up files and batches older than retention period
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
"""
|
||||
try:
|
||||
cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)
|
||||
|
||||
# Find expired batches
|
||||
expired_batches = db.query(OCRBatch).filter(
|
||||
OCRBatch.created_at < cutoff_time,
|
||||
OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
|
||||
).all()
|
||||
|
||||
logger.info(f"Found {len(expired_batches)} expired batches to clean up")
|
||||
|
||||
for batch in expired_batches:
|
||||
try:
|
||||
# Get batch directory
|
||||
batch_dir = self.file_manager.base_upload_dir / "batches" / str(batch.id)
|
||||
|
||||
# Delete physical files
|
||||
if batch_dir.exists():
|
||||
import shutil
|
||||
shutil.rmtree(batch_dir)
|
||||
logger.info(f"Deleted batch directory: {batch_dir}")
|
||||
|
||||
# Delete database records
|
||||
# Delete results first (foreign key constraint)
|
||||
db.query(OCRResult).filter(
|
||||
OCRResult.file_id.in_(
|
||||
db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
|
||||
)
|
||||
).delete(synchronize_session=False)
|
||||
|
||||
# Delete files
|
||||
db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()
|
||||
|
||||
# Delete batch
|
||||
db.delete(batch)
|
||||
db.commit()
|
||||
|
||||
logger.info(f"Cleaned up expired batch {batch.id}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning up batch {batch.id}: {e}")
|
||||
db.rollback()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup_expired_files: {e}")
|
||||
|
||||
async def generate_pdf_background(
|
||||
self,
|
||||
result_id: int,
|
||||
output_path: str,
|
||||
css_template: str = "default",
|
||||
db: Session = None
|
||||
):
|
||||
"""
|
||||
Generate PDF in background with retry logic
|
||||
|
||||
Args:
|
||||
result_id: OCR result ID
|
||||
output_path: Output PDF path
|
||||
css_template: CSS template name
|
||||
db: Database session
|
||||
"""
|
||||
should_close_db = False
|
||||
if db is None:
|
||||
db = SessionLocal()
|
||||
should_close_db = True
|
||||
|
||||
try:
|
||||
# Get result
|
||||
result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
|
||||
if not result:
|
||||
logger.error(f"Result {result_id} not found")
|
||||
return
|
||||
|
||||
# Generate PDF with retry
|
||||
await self.execute_with_retry(
|
||||
self.pdf_generator.generate_pdf,
|
||||
markdown_path=result.markdown_path,
|
||||
output_path=output_path,
|
||||
css_template=css_template,
|
||||
max_retries=2,
|
||||
retry_delay=3
|
||||
)
|
||||
|
||||
logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate PDF for result {result_id}: {e}")
|
||||
finally:
|
||||
if should_close_db:
|
||||
db.close()
|
||||
|
||||
async def start_cleanup_scheduler(self):
|
||||
"""
|
||||
Start periodic cleanup scheduler
|
||||
|
||||
Runs cleanup task at specified intervals
|
||||
"""
|
||||
logger.info(f"Starting cleanup scheduler (interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)")
|
||||
|
||||
while True:
|
||||
try:
|
||||
db = SessionLocal()
|
||||
await self.cleanup_expired_files(db)
|
||||
db.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup scheduler: {e}")
|
||||
|
||||
# Wait for next interval
|
||||
await asyncio.sleep(self.cleanup_interval)
|
||||
|
||||
|
||||
# Global task manager instance
|
||||
task_manager = BackgroundTaskManager()
|
||||
|
||||
|
||||
def process_batch_files_with_retry(
|
||||
batch_id: int,
|
||||
lang: str,
|
||||
detect_layout: bool,
|
||||
db: Session
|
||||
):
|
||||
"""
|
||||
Process all files in a batch with retry logic
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
lang: Language code
|
||||
detect_layout: Whether to detect layout
|
||||
db: Database session
|
||||
"""
|
||||
try:
|
||||
# Get batch
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
logger.error(f"Batch {batch_id} not found")
|
||||
return
|
||||
|
||||
# Update batch status
|
||||
batch.status = BatchStatus.PROCESSING
|
||||
batch.started_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
# Get pending files
|
||||
files = db.query(OCRFile).filter(
|
||||
OCRFile.batch_id == batch_id,
|
||||
OCRFile.status == FileStatus.PENDING
|
||||
).all()
|
||||
|
||||
logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")
|
||||
|
||||
# Process each file with retry
|
||||
for ocr_file in files:
|
||||
success = task_manager.process_single_file_with_retry(
|
||||
ocr_file=ocr_file,
|
||||
batch_id=batch_id,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
db=db
|
||||
)
|
||||
|
||||
# Update batch progress
|
||||
if success:
|
||||
batch.completed_files += 1
|
||||
else:
|
||||
batch.failed_files += 1
|
||||
|
||||
db.commit()
|
||||
|
||||
# Update batch final status
|
||||
if batch.failed_files == 0:
|
||||
batch.status = BatchStatus.COMPLETED
|
||||
elif batch.completed_files > 0:
|
||||
batch.status = BatchStatus.PARTIAL
|
||||
else:
|
||||
batch.status = BatchStatus.FAILED
|
||||
|
||||
batch.completed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
logger.info(
|
||||
f"Batch {batch_id} processing complete: "
|
||||
f"{batch.completed_files} succeeded, {batch.failed_files} failed"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Fatal error processing batch {batch_id}: {e}")
|
||||
try:
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if batch:
|
||||
batch.status = BatchStatus.FAILED
|
||||
batch.completed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
except Exception as commit_error:
|
||||
logger.error(f"Error updating batch status: {commit_error}")
|
||||
512
backend/app/services/export_service.py
Normal file
512
backend/app/services/export_service.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""
|
||||
Tool_OCR - Export Service
|
||||
Handles OCR result export in multiple formats with filtering and formatting rules
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Any
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
|
||||
from app.models.export import ExportRule
|
||||
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExportError(Exception):
|
||||
"""Exception raised for export errors"""
|
||||
pass
|
||||
|
||||
|
||||
class ExportService:
|
||||
"""
|
||||
Export service for OCR results
|
||||
|
||||
Supported formats:
|
||||
- TXT: Plain text export
|
||||
- JSON: Full metadata export
|
||||
- Excel: Tabular data export
|
||||
- Markdown: Direct Markdown export
|
||||
- PDF: Layout-preserved PDF export
|
||||
- ZIP: Batch export archive
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize export service"""
|
||||
self.pdf_generator = PDFGenerator()
|
||||
|
||||
def apply_filters(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
filters: Dict[str, Any]
|
||||
) -> List[OCRResult]:
|
||||
"""
|
||||
Apply filters to OCR results
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
filters: Filter configuration
|
||||
- confidence_threshold: Minimum confidence (0.0-1.0)
|
||||
- filename_pattern: Glob pattern for filename matching
|
||||
- language: Filter by detected language
|
||||
|
||||
Returns:
|
||||
List[OCRResult]: Filtered results
|
||||
"""
|
||||
filtered = results
|
||||
|
||||
# Confidence threshold filter
|
||||
if "confidence_threshold" in filters:
|
||||
threshold = filters["confidence_threshold"]
|
||||
filtered = [r for r in filtered if r.average_confidence and r.average_confidence >= threshold]
|
||||
|
||||
# Filename pattern filter (using simple substring match)
|
||||
if "filename_pattern" in filters:
|
||||
pattern = filters["filename_pattern"].lower()
|
||||
filtered = [
|
||||
r for r in filtered
|
||||
if pattern in r.file.original_filename.lower()
|
||||
]
|
||||
|
||||
# Language filter
|
||||
if "language" in filters:
|
||||
lang = filters["language"]
|
||||
filtered = [r for r in filtered if r.detected_language == lang]
|
||||
|
||||
return filtered
|
||||
|
||||
def export_to_txt(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
formatting: Optional[Dict] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to plain text file
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
formatting: Formatting options
|
||||
- add_line_numbers: Add line numbers
|
||||
- group_by_filename: Group text by source file
|
||||
- include_metadata: Add file metadata headers
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
formatting = formatting or {}
|
||||
output_lines = []
|
||||
|
||||
for idx, result in enumerate(results, 1):
|
||||
# Read Markdown file
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
logger.warning(f"Markdown file not found for result {result.id}")
|
||||
continue
|
||||
|
||||
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
# Add metadata header if requested
|
||||
if formatting.get("include_metadata", False):
|
||||
output_lines.append(f"=" * 80)
|
||||
output_lines.append(f"文件: {result.file.original_filename}")
|
||||
output_lines.append(f"語言: {result.detected_language or '未知'}")
|
||||
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
|
||||
output_lines.append(f"=" * 80)
|
||||
output_lines.append("")
|
||||
|
||||
# Add content with optional line numbers
|
||||
if formatting.get("add_line_numbers", False):
|
||||
for line_num, line in enumerate(markdown_content.split('\n'), 1):
|
||||
output_lines.append(f"{line_num:4d} | {line}")
|
||||
else:
|
||||
output_lines.append(markdown_content)
|
||||
|
||||
# Add separator between files if grouping
|
||||
if formatting.get("group_by_filename", False) and idx < len(results):
|
||||
output_lines.append("\n" + "-" * 80 + "\n")
|
||||
|
||||
# Write to file
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text("\n".join(output_lines), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"TXT export failed: {str(e)}")
|
||||
|
||||
def export_to_json(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
include_layout: bool = True,
|
||||
include_images: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to JSON file with full metadata
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
include_layout: Include layout data
|
||||
include_images: Include images metadata
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
export_data = {
|
||||
"export_time": datetime.utcnow().isoformat(),
|
||||
"total_files": len(results),
|
||||
"results": []
|
||||
}
|
||||
|
||||
for result in results:
|
||||
result_data = {
|
||||
"file_id": result.file.id,
|
||||
"filename": result.file.original_filename,
|
||||
"file_format": result.file.file_format,
|
||||
"file_size": result.file.file_size,
|
||||
"processing_time": result.file.processing_time,
|
||||
"detected_language": result.detected_language,
|
||||
"total_text_regions": result.total_text_regions,
|
||||
"average_confidence": result.average_confidence,
|
||||
"markdown_path": result.markdown_path,
|
||||
}
|
||||
|
||||
# Include layout data if requested
|
||||
if include_layout and result.layout_data:
|
||||
result_data["layout_data"] = result.layout_data
|
||||
|
||||
# Include images metadata if requested
|
||||
if include_images and result.images_metadata:
|
||||
result_data["images_metadata"] = result.images_metadata
|
||||
|
||||
export_data["results"].append(result_data)
|
||||
|
||||
# Write to file
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(
|
||||
json.dumps(export_data, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8"
|
||||
)
|
||||
|
||||
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"JSON export failed: {str(e)}")
|
||||
|
||||
def export_to_excel(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
include_confidence: bool = True,
|
||||
include_processing_time: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to Excel file
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
include_confidence: Include confidence scores
|
||||
include_processing_time: Include processing time
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
rows = []
|
||||
|
||||
for result in results:
|
||||
# Read Markdown content
|
||||
text_content = ""
|
||||
if result.markdown_path and Path(result.markdown_path).exists():
|
||||
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
row = {
|
||||
"文件名": result.file.original_filename,
|
||||
"格式": result.file.file_format,
|
||||
"大小(字節)": result.file.file_size,
|
||||
"語言": result.detected_language or "未知",
|
||||
"文本區域數": result.total_text_regions,
|
||||
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
|
||||
}
|
||||
|
||||
if include_confidence:
|
||||
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
|
||||
|
||||
if include_processing_time:
|
||||
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
|
||||
|
||||
rows.append(row)
|
||||
|
||||
# Create DataFrame and export
|
||||
df = pd.DataFrame(rows)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
df.to_excel(output_path, index=False, engine='openpyxl')
|
||||
|
||||
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Excel export failed: {str(e)}")
|
||||
|
||||
def export_to_markdown(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
combine: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to Markdown file(s)
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path (or directory if not combining)
|
||||
combine: Combine all results into one file
|
||||
|
||||
Returns:
|
||||
Path: Output file/directory path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
if combine:
|
||||
# Combine all Markdown files into one
|
||||
combined_content = []
|
||||
|
||||
for result in results:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
continue
|
||||
|
||||
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
# Add header
|
||||
combined_content.append(f"# {result.file.original_filename}\n")
|
||||
combined_content.append(markdown_content)
|
||||
combined_content.append("\n---\n") # Separator
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text("\n".join(combined_content), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
|
||||
return output_path
|
||||
|
||||
else:
|
||||
# Export each result to separate file
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for result in results:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
continue
|
||||
|
||||
# Copy Markdown file to output directory
|
||||
src_path = Path(result.markdown_path)
|
||||
dst_path = output_path / f"{result.file.original_filename}.md"
|
||||
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Markdown export failed: {str(e)}")
|
||||
|
||||
def export_to_pdf(
|
||||
self,
|
||||
result: OCRResult,
|
||||
output_path: Path,
|
||||
css_template: str = "default",
|
||||
metadata: Optional[Dict] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export single result to PDF with layout preservation
|
||||
|
||||
Args:
|
||||
result: OCR result
|
||||
output_path: Output PDF path
|
||||
css_template: CSS template name or custom CSS
|
||||
metadata: Optional PDF metadata
|
||||
|
||||
Returns:
|
||||
Path: Output PDF path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
raise ExportError(f"Markdown file not found for result {result.id}")
|
||||
|
||||
markdown_path = Path(result.markdown_path)
|
||||
|
||||
# Prepare metadata
|
||||
pdf_metadata = metadata or {}
|
||||
if "title" not in pdf_metadata:
|
||||
pdf_metadata["title"] = result.file.original_filename
|
||||
|
||||
# Generate PDF
|
||||
self.pdf_generator.generate_pdf(
|
||||
markdown_path=markdown_path,
|
||||
output_path=output_path,
|
||||
css_template=css_template,
|
||||
metadata=pdf_metadata
|
||||
)
|
||||
|
||||
logger.info(f"Exported result {result.id} to PDF: {output_path}")
|
||||
return output_path
|
||||
|
||||
except PDFGenerationError as e:
|
||||
raise ExportError(f"PDF generation failed: {str(e)}")
|
||||
except Exception as e:
|
||||
raise ExportError(f"PDF export failed: {str(e)}")
|
||||
|
||||
def export_batch_to_zip(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
output_path: Path,
|
||||
include_formats: Optional[List[str]] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export entire batch to ZIP archive
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
output_path: Output ZIP path
|
||||
include_formats: List of formats to include (markdown, json, txt, excel, pdf)
|
||||
|
||||
Returns:
|
||||
Path: Output ZIP path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
include_formats = include_formats or ["markdown", "json"]
|
||||
|
||||
# Get batch and results
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
raise ExportError(f"Batch {batch_id} not found")
|
||||
|
||||
results = db.query(OCRResult).join(OCRFile).filter(
|
||||
OCRFile.batch_id == batch_id,
|
||||
OCRFile.status == FileStatus.COMPLETED
|
||||
).all()
|
||||
|
||||
if not results:
|
||||
raise ExportError(f"No completed results found for batch {batch_id}")
|
||||
|
||||
# Create temporary export directory
|
||||
temp_dir = output_path.parent / f"temp_export_{batch_id}"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Export in requested formats
|
||||
if "markdown" in include_formats:
|
||||
md_dir = temp_dir / "markdown"
|
||||
self.export_to_markdown(results, md_dir, combine=False)
|
||||
|
||||
if "json" in include_formats:
|
||||
json_path = temp_dir / "batch_results.json"
|
||||
self.export_to_json(results, json_path)
|
||||
|
||||
if "txt" in include_formats:
|
||||
txt_path = temp_dir / "batch_results.txt"
|
||||
self.export_to_txt(results, txt_path)
|
||||
|
||||
if "excel" in include_formats:
|
||||
excel_path = temp_dir / "batch_results.xlsx"
|
||||
self.export_to_excel(results, excel_path)
|
||||
|
||||
# Create ZIP archive
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||
for file_path in temp_dir.rglob('*'):
|
||||
if file_path.is_file():
|
||||
arcname = file_path.relative_to(temp_dir)
|
||||
zipf.write(file_path, arcname)
|
||||
|
||||
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
|
||||
return output_path
|
||||
|
||||
finally:
|
||||
# Clean up temporary directory
|
||||
import shutil
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Batch ZIP export failed: {str(e)}")
|
||||
|
||||
def apply_export_rule(
|
||||
self,
|
||||
db: Session,
|
||||
results: List[OCRResult],
|
||||
rule_id: int
|
||||
) -> List[OCRResult]:
|
||||
"""
|
||||
Apply export rule to filter and format results
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
results: List of OCR results
|
||||
rule_id: Export rule ID
|
||||
|
||||
Returns:
|
||||
List[OCRResult]: Filtered results
|
||||
|
||||
Raises:
|
||||
ExportError: If rule not found
|
||||
"""
|
||||
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
|
||||
if not rule:
|
||||
raise ExportError(f"Export rule {rule_id} not found")
|
||||
|
||||
config = rule.config_json
|
||||
|
||||
# Apply filters
|
||||
if "filters" in config:
|
||||
results = self.apply_filters(results, config["filters"])
|
||||
|
||||
# Note: Formatting options are applied in individual export methods
|
||||
return results
|
||||
|
||||
def get_export_formats(self) -> Dict[str, str]:
|
||||
"""
|
||||
Get available export formats
|
||||
|
||||
Returns:
|
||||
Dict mapping format codes to descriptions
|
||||
"""
|
||||
return {
|
||||
"txt": "純文本格式 (.txt)",
|
||||
"json": "JSON 格式 - 包含完整元數據 (.json)",
|
||||
"excel": "Excel 表格格式 (.xlsx)",
|
||||
"markdown": "Markdown 格式 (.md)",
|
||||
"pdf": "版面保留 PDF 格式 (.pdf)",
|
||||
"zip": "批次打包格式 (.zip)",
|
||||
}
|
||||
420
backend/app/services/file_manager.py
Normal file
420
backend/app/services/file_manager.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
Tool_OCR - File Management Service
|
||||
Handles file uploads, storage, validation, and cleanup
|
||||
"""
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from fastapi import UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
from app.models.ocr import OCRBatch, OCRFile, FileStatus
|
||||
from app.services.preprocessor import DocumentPreprocessor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileManagementError(Exception):
|
||||
"""Exception raised for file management errors"""
|
||||
pass
|
||||
|
||||
|
||||
class FileManager:
|
||||
"""
|
||||
File management service for upload, storage, and cleanup
|
||||
|
||||
Directory structure:
|
||||
uploads/
|
||||
├── batches/
|
||||
│ └── {batch_id}/
|
||||
│ ├── inputs/ # Original uploaded files
|
||||
│ ├── outputs/ # OCR results
|
||||
│ │ ├── markdown/ # Markdown files
|
||||
│ │ ├── json/ # JSON files
|
||||
│ │ └── images/ # Extracted images
|
||||
│ └── exports/ # Export files (PDF, Excel, etc.)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize file manager"""
|
||||
self.preprocessor = DocumentPreprocessor()
|
||||
self.base_upload_dir = Path(settings.upload_dir)
|
||||
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def create_batch_directory(self, batch_id: int) -> Path:
|
||||
"""
|
||||
Create directory structure for a batch
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Path: Batch directory path
|
||||
"""
|
||||
batch_dir = self.base_upload_dir / "batches" / str(batch_id)
|
||||
|
||||
# Create subdirectories
|
||||
(batch_dir / "inputs").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "markdown").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "json").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "images").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "exports").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info(f"Created batch directory: {batch_dir}")
|
||||
return batch_dir
|
||||
|
||||
def get_batch_directory(self, batch_id: int) -> Path:
|
||||
"""
|
||||
Get batch directory path
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Path: Batch directory path
|
||||
"""
|
||||
return self.base_upload_dir / "batches" / str(batch_id)
|
||||
|
||||
def validate_upload(self, file: UploadFile) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Validate uploaded file before saving
|
||||
|
||||
Args:
|
||||
file: Uploaded file
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# Check filename
|
||||
if not file.filename:
|
||||
return False, "文件名不能為空"
|
||||
|
||||
# Check file size (read content size)
|
||||
file.file.seek(0, 2) # Seek to end
|
||||
file_size = file.file.tell()
|
||||
file.file.seek(0) # Reset to beginning
|
||||
|
||||
if file_size == 0:
|
||||
return False, "文件為空"
|
||||
|
||||
if file_size > settings.max_upload_size:
|
||||
max_mb = settings.max_upload_size / (1024 * 1024)
|
||||
return False, f"文件大小超過限制 ({max_mb}MB)"
|
||||
|
||||
# Check file extension
|
||||
file_ext = Path(file.filename).suffix.lower()
|
||||
allowed_extensions = {'.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.ppt', '.pptx'}
|
||||
if file_ext not in allowed_extensions:
|
||||
return False, f"不支持的文件格式 ({file_ext}),僅支持: {', '.join(allowed_extensions)}"
|
||||
|
||||
return True, None
|
||||
|
||||
def save_upload(
|
||||
self,
|
||||
file: UploadFile,
|
||||
batch_id: int,
|
||||
validate: bool = True
|
||||
) -> Tuple[Path, str]:
|
||||
"""
|
||||
Save uploaded file to batch directory
|
||||
|
||||
Args:
|
||||
file: Uploaded file
|
||||
batch_id: Batch ID
|
||||
validate: Whether to validate file
|
||||
|
||||
Returns:
|
||||
Tuple of (file_path, original_filename)
|
||||
|
||||
Raises:
|
||||
FileManagementError: If file validation or saving fails
|
||||
"""
|
||||
# Validate if requested
|
||||
if validate:
|
||||
is_valid, error_msg = self.validate_upload(file)
|
||||
if not is_valid:
|
||||
raise FileManagementError(error_msg)
|
||||
|
||||
# Generate unique filename to avoid conflicts
|
||||
original_filename = file.filename
|
||||
file_ext = Path(original_filename).suffix
|
||||
unique_filename = f"{uuid.uuid4()}{file_ext}"
|
||||
|
||||
# Get batch input directory
|
||||
batch_dir = self.get_batch_directory(batch_id)
|
||||
input_dir = batch_dir / "inputs"
|
||||
input_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save file
|
||||
file_path = input_dir / unique_filename
|
||||
try:
|
||||
with file_path.open("wb") as buffer:
|
||||
shutil.copyfileobj(file.file, buffer)
|
||||
|
||||
logger.info(f"Saved upload: {file_path} (original: {original_filename})")
|
||||
return file_path, original_filename
|
||||
|
||||
except Exception as e:
|
||||
# Clean up partial file if exists
|
||||
file_path.unlink(missing_ok=True)
|
||||
raise FileManagementError(f"保存文件失敗: {str(e)}")
|
||||
|
||||
def validate_saved_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
|
||||
"""
|
||||
Validate saved file using preprocessor
|
||||
|
||||
Args:
|
||||
file_path: Path to saved file
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message, detected_format)
|
||||
"""
|
||||
return self.preprocessor.validate_file(file_path)
|
||||
|
||||
def create_batch(
|
||||
self,
|
||||
db: Session,
|
||||
user_id: int,
|
||||
batch_name: Optional[str] = None
|
||||
) -> OCRBatch:
|
||||
"""
|
||||
Create new OCR batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
user_id: User ID
|
||||
batch_name: Optional batch name
|
||||
|
||||
Returns:
|
||||
OCRBatch: Created batch object
|
||||
"""
|
||||
# Create batch record
|
||||
batch = OCRBatch(
|
||||
user_id=user_id,
|
||||
batch_name=batch_name or f"Batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
||||
)
|
||||
db.add(batch)
|
||||
db.commit()
|
||||
db.refresh(batch)
|
||||
|
||||
# Create directory structure
|
||||
self.create_batch_directory(batch.id)
|
||||
|
||||
logger.info(f"Created batch: {batch.id} for user {user_id}")
|
||||
return batch
|
||||
|
||||
def add_file_to_batch(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
file: UploadFile
|
||||
) -> OCRFile:
|
||||
"""
|
||||
Add file to batch and save to disk
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
file: Uploaded file
|
||||
|
||||
Returns:
|
||||
OCRFile: Created file record
|
||||
|
||||
Raises:
|
||||
FileManagementError: If file operations fail
|
||||
"""
|
||||
# Save file to disk
|
||||
file_path, original_filename = self.save_upload(file, batch_id)
|
||||
|
||||
# Validate saved file
|
||||
is_valid, detected_format, error_msg = self.validate_saved_file(file_path)
|
||||
|
||||
# Create file record
|
||||
ocr_file = OCRFile(
|
||||
batch_id=batch_id,
|
||||
filename=file_path.name,
|
||||
original_filename=original_filename,
|
||||
file_path=str(file_path),
|
||||
file_size=file_path.stat().st_size,
|
||||
file_format=detected_format or Path(original_filename).suffix.lower().lstrip('.'),
|
||||
status=FileStatus.PENDING if is_valid else FileStatus.FAILED,
|
||||
error_message=error_msg if not is_valid else None
|
||||
)
|
||||
|
||||
db.add(ocr_file)
|
||||
|
||||
# Update batch total_files count
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if batch:
|
||||
batch.total_files += 1
|
||||
if not is_valid:
|
||||
batch.failed_files += 1
|
||||
|
||||
db.commit()
|
||||
db.refresh(ocr_file)
|
||||
|
||||
logger.info(f"Added file to batch {batch_id}: {ocr_file.id} (status: {ocr_file.status})")
|
||||
return ocr_file
|
||||
|
||||
def add_files_to_batch(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
files: List[UploadFile]
|
||||
) -> List[OCRFile]:
|
||||
"""
|
||||
Add multiple files to batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
files: List of uploaded files
|
||||
|
||||
Returns:
|
||||
List[OCRFile]: List of created file records
|
||||
"""
|
||||
ocr_files = []
|
||||
for file in files:
|
||||
try:
|
||||
ocr_file = self.add_file_to_batch(db, batch_id, file)
|
||||
ocr_files.append(ocr_file)
|
||||
except FileManagementError as e:
|
||||
logger.error(f"Failed to add file {file.filename} to batch {batch_id}: {e}")
|
||||
# Continue with other files
|
||||
continue
|
||||
|
||||
return ocr_files
|
||||
|
||||
def get_file_paths(self, batch_id: int, file_id: int) -> dict:
|
||||
"""
|
||||
Get all paths for a file in a batch
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
file_id: File ID
|
||||
|
||||
Returns:
|
||||
Dict containing all relevant paths
|
||||
"""
|
||||
batch_dir = self.get_batch_directory(batch_id)
|
||||
|
||||
return {
|
||||
"input_dir": batch_dir / "inputs",
|
||||
"output_dir": batch_dir / "outputs",
|
||||
"markdown_dir": batch_dir / "outputs" / "markdown",
|
||||
"json_dir": batch_dir / "outputs" / "json",
|
||||
"images_dir": batch_dir / "outputs" / "images" / str(file_id),
|
||||
"export_dir": batch_dir / "exports",
|
||||
}
|
||||
|
||||
def cleanup_expired_batches(self, db: Session, retention_hours: int = 24) -> int:
|
||||
"""
|
||||
Clean up expired batch files
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
retention_hours: Number of hours to retain files
|
||||
|
||||
Returns:
|
||||
int: Number of batches cleaned up
|
||||
"""
|
||||
cutoff_time = datetime.utcnow() - timedelta(hours=retention_hours)
|
||||
|
||||
# Find expired batches
|
||||
expired_batches = db.query(OCRBatch).filter(
|
||||
OCRBatch.created_at < cutoff_time
|
||||
).all()
|
||||
|
||||
cleaned_count = 0
|
||||
for batch in expired_batches:
|
||||
try:
|
||||
# Delete batch directory
|
||||
batch_dir = self.get_batch_directory(batch.id)
|
||||
if batch_dir.exists():
|
||||
shutil.rmtree(batch_dir)
|
||||
logger.info(f"Deleted batch directory: {batch_dir}")
|
||||
|
||||
# Delete database records (cascade will handle related records)
|
||||
db.delete(batch)
|
||||
cleaned_count += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cleanup batch {batch.id}: {e}")
|
||||
continue
|
||||
|
||||
if cleaned_count > 0:
|
||||
db.commit()
|
||||
logger.info(f"Cleaned up {cleaned_count} expired batches")
|
||||
|
||||
return cleaned_count
|
||||
|
||||
def verify_file_ownership(
|
||||
self,
|
||||
db: Session,
|
||||
user_id: int,
|
||||
batch_id: int
|
||||
) -> bool:
|
||||
"""
|
||||
Verify user owns the batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
user_id: User ID
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
bool: True if user owns batch, False otherwise
|
||||
"""
|
||||
batch = db.query(OCRBatch).filter(
|
||||
OCRBatch.id == batch_id,
|
||||
OCRBatch.user_id == user_id
|
||||
).first()
|
||||
|
||||
return batch is not None
|
||||
|
||||
def get_batch_statistics(self, db: Session, batch_id: int) -> dict:
|
||||
"""
|
||||
Get statistics for a batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Dict containing batch statistics
|
||||
"""
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
return {}
|
||||
|
||||
# Calculate total file size
|
||||
total_size = sum(f.file_size for f in batch.files)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = None
|
||||
if batch.completed_at and batch.started_at:
|
||||
processing_time = (batch.completed_at - batch.started_at).total_seconds()
|
||||
|
||||
return {
|
||||
"batch_id": batch.id,
|
||||
"batch_name": batch.batch_name,
|
||||
"status": batch.status,
|
||||
"total_files": batch.total_files,
|
||||
"completed_files": batch.completed_files,
|
||||
"failed_files": batch.failed_files,
|
||||
"pending_files": batch.total_files - batch.completed_files - batch.failed_files,
|
||||
"progress_percentage": batch.progress_percentage,
|
||||
"total_file_size": total_size,
|
||||
"total_file_size_mb": round(total_size / (1024 * 1024), 2),
|
||||
"created_at": batch.created_at.isoformat(),
|
||||
"started_at": batch.started_at.isoformat() if batch.started_at else None,
|
||||
"completed_at": batch.completed_at.isoformat() if batch.completed_at else None,
|
||||
"processing_time": processing_time,
|
||||
}
|
||||
516
backend/app/services/ocr_service.py
Normal file
516
backend/app/services/ocr_service.py
Normal file
@@ -0,0 +1,516 @@
|
||||
"""
|
||||
Tool_OCR - Core OCR Service
|
||||
PaddleOCR-VL integration for text and structure extraction
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
from paddleocr import PaddleOCR, PPStructureV3
|
||||
from PIL import Image
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
from app.core.config import settings
|
||||
from app.services.office_converter import OfficeConverter, OfficeConverterError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRService:
|
||||
"""
|
||||
Core OCR service using PaddleOCR-VL
|
||||
Handles text recognition and document structure analysis
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize PaddleOCR and PPStructure engines"""
|
||||
self.ocr_languages = settings.ocr_languages_list
|
||||
self.confidence_threshold = settings.ocr_confidence_threshold
|
||||
|
||||
# Initialize PaddleOCR engine (will be lazy-loaded per language)
|
||||
self.ocr_engines = {}
|
||||
|
||||
# Initialize PP-Structure for layout analysis
|
||||
self.structure_engine = None
|
||||
|
||||
# Initialize Office document converter
|
||||
self.office_converter = OfficeConverter()
|
||||
|
||||
logger.info("OCR Service initialized")
|
||||
|
||||
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
|
||||
"""
|
||||
Get or create OCR engine for specified language
|
||||
|
||||
Args:
|
||||
lang: Language code (ch, en, japan, korean, etc.)
|
||||
|
||||
Returns:
|
||||
PaddleOCR engine instance
|
||||
"""
|
||||
if lang not in self.ocr_engines:
|
||||
logger.info(f"Initializing PaddleOCR engine for language: {lang}")
|
||||
self.ocr_engines[lang] = PaddleOCR(
|
||||
use_angle_cls=True,
|
||||
lang=lang,
|
||||
# Note: show_log and use_gpu parameters removed in PaddleOCR 3.x
|
||||
)
|
||||
logger.info(f"PaddleOCR engine ready for {lang}")
|
||||
|
||||
return self.ocr_engines[lang]
|
||||
|
||||
def get_structure_engine(self) -> PPStructureV3:
|
||||
"""
|
||||
Get or create PP-Structure engine for layout analysis
|
||||
|
||||
Returns:
|
||||
PPStructure engine instance
|
||||
"""
|
||||
if self.structure_engine is None:
|
||||
logger.info("Initializing PP-StructureV3 engine")
|
||||
self.structure_engine = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False,
|
||||
use_table_recognition=True,
|
||||
use_formula_recognition=True,
|
||||
layout_threshold=0.5,
|
||||
)
|
||||
logger.info("PP-StructureV3 engine ready")
|
||||
|
||||
return self.structure_engine
|
||||
|
||||
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
|
||||
"""
|
||||
Convert PDF to images (one per page)
|
||||
|
||||
Args:
|
||||
pdf_path: Path to PDF file
|
||||
output_dir: Directory to save converted images
|
||||
|
||||
Returns:
|
||||
List of paths to converted images
|
||||
"""
|
||||
try:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info(f"Converting PDF {pdf_path.name} to images")
|
||||
|
||||
# Convert PDF to images (300 DPI for good quality)
|
||||
images = convert_from_path(
|
||||
str(pdf_path),
|
||||
dpi=300,
|
||||
fmt='png'
|
||||
)
|
||||
|
||||
image_paths = []
|
||||
for i, image in enumerate(images):
|
||||
# Save each page as PNG
|
||||
image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
|
||||
image.save(str(image_path), 'PNG')
|
||||
image_paths.append(image_path)
|
||||
logger.info(f"Saved page {i+1} to {image_path.name}")
|
||||
|
||||
logger.info(f"Converted {len(image_paths)} pages from PDF")
|
||||
return image_paths
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"PDF conversion error: {str(e)}")
|
||||
raise
|
||||
|
||||
def process_image(
|
||||
self,
|
||||
image_path: Path,
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
lang: Language for OCR
|
||||
detect_layout: Whether to perform layout analysis
|
||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
"""
|
||||
start_time = datetime.now()
|
||||
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
|
||||
|
||||
try:
|
||||
# Check if file is Office document
|
||||
if self.office_converter.is_office_document(image_path):
|
||||
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
|
||||
try:
|
||||
# Convert Office document to PDF
|
||||
pdf_path = self.office_converter.convert_to_pdf(image_path)
|
||||
logger.info(f"Office document converted to PDF: {pdf_path.name}")
|
||||
|
||||
# Process the PDF (will be handled by PDF processing logic below)
|
||||
image_path = pdf_path
|
||||
except OfficeConverterError as e:
|
||||
logger.error(f"Office conversion failed: {str(e)}")
|
||||
raise
|
||||
|
||||
# Check if file is PDF
|
||||
is_pdf = image_path.suffix.lower() == '.pdf'
|
||||
|
||||
if is_pdf:
|
||||
# Convert PDF to images
|
||||
logger.info(f"Detected PDF file: {image_path.name}, converting to images")
|
||||
pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
|
||||
image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)
|
||||
|
||||
# Process all pages
|
||||
all_text_regions = []
|
||||
total_confidence_sum = 0.0
|
||||
total_valid_regions = 0
|
||||
all_layout_data = []
|
||||
all_images_metadata = []
|
||||
|
||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||
|
||||
# Process each page
|
||||
page_result = self.process_image(
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
if page_result['status'] == 'success':
|
||||
# Add page number to each text region
|
||||
for region in page_result['text_regions']:
|
||||
region['page'] = page_num
|
||||
all_text_regions.append(region)
|
||||
|
||||
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
|
||||
total_valid_regions += page_result['total_text_regions']
|
||||
|
||||
# Accumulate layout data
|
||||
if page_result.get('layout_data'):
|
||||
all_layout_data.append(page_result['layout_data'])
|
||||
|
||||
# Accumulate images metadata
|
||||
if page_result.get('images_metadata'):
|
||||
all_images_metadata.extend(page_result['images_metadata'])
|
||||
|
||||
# Calculate overall average confidence
|
||||
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
||||
|
||||
# Combine layout data from all pages
|
||||
combined_layout = None
|
||||
if all_layout_data:
|
||||
combined_elements = []
|
||||
for layout in all_layout_data:
|
||||
if layout.get('elements'):
|
||||
combined_elements.extend(layout['elements'])
|
||||
if combined_elements:
|
||||
combined_layout = {
|
||||
'elements': combined_elements,
|
||||
'total_elements': len(combined_elements),
|
||||
'reading_order': list(range(len(combined_elements))),
|
||||
}
|
||||
|
||||
# Generate combined markdown
|
||||
markdown_content = self.generate_markdown(all_text_regions, combined_layout)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (datetime.now() - start_time).total_seconds()
|
||||
|
||||
logger.info(
|
||||
f"PDF processing completed: {image_path.name} - "
|
||||
f"{len(image_paths)} pages, "
|
||||
f"{len(all_text_regions)} regions, "
|
||||
f"{avg_confidence:.2f} avg confidence, "
|
||||
f"{processing_time:.2f}s"
|
||||
)
|
||||
|
||||
return {
|
||||
'status': 'success',
|
||||
'file_name': image_path.name,
|
||||
'language': lang,
|
||||
'text_regions': all_text_regions,
|
||||
'total_text_regions': len(all_text_regions),
|
||||
'average_confidence': avg_confidence,
|
||||
'layout_data': combined_layout,
|
||||
'images_metadata': all_images_metadata,
|
||||
'markdown_content': markdown_content,
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'total_pages': len(image_paths),
|
||||
}
|
||||
|
||||
# Get OCR engine (for non-PDF images)
|
||||
ocr_engine = self.get_ocr_engine(lang)
|
||||
|
||||
# Perform OCR
|
||||
logger.info(f"Processing image: {image_path.name}")
|
||||
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
||||
ocr_results = ocr_engine.ocr(str(image_path))
|
||||
|
||||
# Parse OCR results (PaddleOCR 3.x format)
|
||||
text_regions = []
|
||||
total_confidence = 0.0
|
||||
valid_regions = 0
|
||||
|
||||
if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
|
||||
# PaddleOCR 3.x returns a list of dictionaries (one per page)
|
||||
for page_result in ocr_results:
|
||||
if isinstance(page_result, dict):
|
||||
# New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
|
||||
texts = page_result.get('rec_texts', [])
|
||||
scores = page_result.get('rec_scores', [])
|
||||
polys = page_result.get('rec_polys', [])
|
||||
|
||||
# Process each recognized text
|
||||
for idx, text in enumerate(texts):
|
||||
# Get corresponding score and bbox
|
||||
confidence = scores[idx] if idx < len(scores) else 1.0
|
||||
bbox = polys[idx] if idx < len(polys) else []
|
||||
|
||||
# Convert numpy array bbox to list for JSON serialization
|
||||
if hasattr(bbox, 'tolist'):
|
||||
bbox = bbox.tolist()
|
||||
|
||||
# Filter by confidence threshold
|
||||
if confidence >= threshold:
|
||||
text_regions.append({
|
||||
'text': text,
|
||||
'bbox': bbox,
|
||||
'confidence': float(confidence),
|
||||
})
|
||||
total_confidence += confidence
|
||||
valid_regions += 1
|
||||
|
||||
avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
|
||||
|
||||
logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")
|
||||
|
||||
# Layout analysis (if requested)
|
||||
layout_data = None
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (datetime.now() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
'status': 'success',
|
||||
'file_name': image_path.name,
|
||||
'language': lang,
|
||||
'text_regions': text_regions,
|
||||
'total_text_regions': len(text_regions),
|
||||
'average_confidence': avg_confidence,
|
||||
'layout_data': layout_data,
|
||||
'images_metadata': images_metadata,
|
||||
'markdown_content': markdown_content,
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"OCR completed: {image_path.name} - "
|
||||
f"{len(text_regions)} regions, "
|
||||
f"{avg_confidence:.2f} avg confidence, "
|
||||
f"{processing_time:.2f}s"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
error_trace = traceback.format_exc()
|
||||
logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
|
||||
return {
|
||||
'status': 'error',
|
||||
'file_name': image_path.name,
|
||||
'error_message': str(e),
|
||||
            'processing_time': (datetime.now() - start_time).total_seconds(),
        }

    def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3

        Args:
            image_path: Path to image file

        Returns:
            Tuple of (layout_data, images_metadata)
        """
        try:
            structure_engine = self.get_structure_engine()

            # Perform structure analysis using predict() method (PaddleOCR 3.x API)
            logger.info(f"Running layout analysis on {image_path.name}")
            results = structure_engine.predict(str(image_path))

            layout_elements = []
            images_metadata = []

            # Process each page result (for images, usually just one page)
            for page_idx, page_result in enumerate(results):
                # Get markdown dictionary from result object
                if hasattr(page_result, 'markdown'):
                    markdown_dict = page_result.markdown
                    logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")

                    # Extract layout information from markdown structure
                    if isinstance(markdown_dict, dict):
                        # Get markdown texts (HTML format with tables and structure)
                        markdown_texts = markdown_dict.get('markdown_texts', '')
                        markdown_images = markdown_dict.get('markdown_images', {})

                        # Create a layout element for the structured content
                        if markdown_texts:
                            # Parse HTML content to identify tables and text
                            import re

                            # Check if content contains tables
                            has_table = '<table' in markdown_texts.lower()

                            element = {
                                'element_id': len(layout_elements),
                                'type': 'table' if has_table else 'text',
                                'content': markdown_texts,
                                'page': page_idx,
                                'bbox': [],  # PP-StructureV3 doesn't provide individual bbox in this format
                            }
                            layout_elements.append(element)

                        # Add image metadata
                        for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
                            images_metadata.append({
                                'element_id': len(layout_elements) + img_idx,
                                'image_path': img_path,
                                'type': 'image',
                                'page': page_idx,
                                'bbox': [],
                            })

            if layout_elements:
                layout_data = {
                    'elements': layout_elements,
                    'total_elements': len(layout_elements),
                    'reading_order': list(range(len(layout_elements))),
                }
                logger.info(f"Detected {len(layout_elements)} layout elements")
                return layout_data, images_metadata
            else:
                logger.warning("No layout elements detected")
                return None, []

        except Exception as e:
            import traceback
            error_trace = traceback.format_exc()
            logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
            return None, []

    def generate_markdown(
        self,
        text_regions: List[Dict],
        layout_data: Optional[Dict] = None
    ) -> str:
        """
        Generate Markdown from OCR results

        Args:
            text_regions: List of text regions with bbox and text
            layout_data: Optional layout structure information

        Returns:
            Markdown formatted string
        """
        markdown_lines = []

        if layout_data and layout_data.get('elements'):
            # Generate structured Markdown based on layout
            for element in layout_data['elements']:
                element_type = element.get('type', 'text')
                content = element.get('content', '')

                if element_type == 'title':
                    markdown_lines.append(f"# {content}\n")
                elif element_type == 'table':
                    # Table in HTML format
                    markdown_lines.append(content)
                    markdown_lines.append("")
                elif element_type == 'figure':
                    element_id = element.get('element_id')
                    markdown_lines.append(f"\n")
                else:
                    markdown_lines.append(f"{content}\n")

        else:
            # Simple Markdown from text regions only
            # Sort by vertical position (top to bottom)
            def get_y_coord(region):
                """Safely extract Y coordinate from bbox"""
                bbox = region.get('bbox', [])
                if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
                    if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
                        return bbox[0][1]  # [[x1,y1], [x2,y2], ...] format
                    elif len(bbox) > 1:
                        return bbox[1]  # [x1, y1, x2, y2, ...] format
                return 0  # Default to 0 if can't extract

            sorted_regions = sorted(text_regions, key=get_y_coord)

            for region in sorted_regions:
                text = region['text']
                markdown_lines.append(text)

        return "\n".join(markdown_lines)

    def save_results(
        self,
        result: Dict,
        output_dir: Path,
        file_id: str
    ) -> Tuple[Optional[Path], Optional[Path]]:
        """
        Save OCR results to JSON and Markdown files

        Args:
            result: OCR result dictionary
            output_dir: Output directory
            file_id: Unique file identifier

        Returns:
            Tuple of (json_path, markdown_path)
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save JSON
            json_path = output_dir / f"{file_id}_result.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            # Save Markdown
            markdown_path = output_dir / f"{file_id}_output.md"
            markdown_content = result.get('markdown_content', '')
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
            return json_path, markdown_path

        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")
            return None, None
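# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the file names, directories, and
# sample text region below are hypothetical, not part of this service):
#
#   from pathlib import Path
#   from app.services.ocr_service import OCRService
#
#   service = OCRService()
#   layout_data, images_metadata = service.analyze_layout(Path("page_001.png"))
#   markdown_text = service.generate_markdown(
#       text_regions=[{"text": "Hello OCR", "bbox": [[10, 20], [200, 20], [200, 40], [10, 40]]}],
#       layout_data=layout_data,
#   )
#   json_path, md_path = service.save_results(
#       {"markdown_content": markdown_text}, Path("./ocr_output"), file_id="demo-001"
#   )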
210
backend/app/services/office_converter.py
Normal file
@@ -0,0 +1,210 @@
"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""

import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil

logger = logging.getLogger(__name__)


class OfficeConverterError(Exception):
    """Exception raised for Office conversion errors"""
    pass


class OfficeConverter:
    """Convert Office documents to PDF for OCR processing"""

    # Supported Office formats
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to LibreOffice executable
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """Verify LibreOffice is installed and accessible"""
        if not Path(self.libreoffice_path).exists():
            # Try alternative path for Homebrew installation
            alt_path = shutil.which("soffice")
            if alt_path:
                self.libreoffice_path = alt_path
                logger.info(f"Using LibreOffice at: {alt_path}")
            else:
                raise OfficeConverterError(
                    "LibreOffice not found. Please install LibreOffice: brew install libreoffice"
                )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Args:
            file_path: Path to file

        Returns:
            True if file is an Office document
        """
        return file_path.suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(self, office_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory (defaults to the source file's directory)

        Returns:
            Path to converted PDF file

        Raises:
            OfficeConverterError: If conversion fails
        """
        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")

        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path
        pdf_filename = office_path.stem + '.pdf'
        output_pdf_path = output_dir / pdf_filename

        # Remove existing PDF if present
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info(f"Converting {office_path.name} to PDF using LibreOffice")

        try:
            # Use LibreOffice headless mode for conversion
            # --headless: Run without GUI
            # --convert-to pdf: Convert to PDF format
            # --outdir: Output directory
            cmd = [
                self.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', str(output_dir),
                str(office_path)
            ]

            logger.debug(f"Running command: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=60  # 60 second timeout
            )

            if result.returncode != 0:
                error_msg = result.stderr or result.stdout
                raise OfficeConverterError(
                    f"LibreOffice conversion failed: {error_msg}"
                )

            # Verify PDF was created
            if not output_pdf_path.exists():
                raise OfficeConverterError(
                    f"PDF file not created at expected location: {output_pdf_path}"
                )

            logger.info(f"Successfully converted to PDF: {output_pdf_path.name}")
            return output_pdf_path

        except subprocess.TimeoutExpired:
            raise OfficeConverterError(
                f"Conversion timeout (60s) for file: {office_path.name}"
            )
        except Exception as e:
            if isinstance(e, OfficeConverterError):
                raise
            raise OfficeConverterError(f"Conversion error: {str(e)}")

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if docx_path.suffix.lower() != '.docx':
            raise OfficeConverterError(f"Expected .docx file, got: {docx_path.suffix}")
        return self.convert_to_pdf(docx_path, output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if doc_path.suffix.lower() != '.doc':
            raise OfficeConverterError(f"Expected .doc file, got: {doc_path.suffix}")
        return self.convert_to_pdf(doc_path, output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if pptx_path.suffix.lower() != '.pptx':
            raise OfficeConverterError(f"Expected .pptx file, got: {pptx_path.suffix}")
        return self.convert_to_pdf(pptx_path, output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if ppt_path.suffix.lower() != '.ppt':
            raise OfficeConverterError(f"Expected .ppt file, got: {ppt_path.suffix}")
        return self.convert_to_pdf(ppt_path, output_dir)
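# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; requires a local LibreOffice
# installation, and the document paths shown are hypothetical):
#
#   from pathlib import Path
#
#   converter = OfficeConverter()  # falls back to `soffice` on PATH if the default path is missing
#   pdf_path = converter.convert_docx_to_pdf(Path("quarterly_report.docx"), Path("converted"))
#   print(pdf_path)  # e.g. converted/quarterly_report.pdf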
507
backend/app/services/pdf_generator.py
Normal file
@@ -0,0 +1,507 @@
"""
Tool_OCR - PDF Generator Service
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
"""

import logging
import subprocess
from pathlib import Path
from typing import Optional, Dict
from datetime import datetime

from weasyprint import HTML, CSS
from markdown import markdown

from app.core.config import settings


logger = logging.getLogger(__name__)


class PDFGenerationError(Exception):
    """Exception raised when PDF generation fails"""
    pass


class PDFGenerator:
    """
    PDF generation service with layout preservation

    Supports two generation methods:
    1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command
    2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion
    """

    # Default CSS template for layout preservation
    DEFAULT_CSS = """
    @page {
        size: A4;
        margin: 2cm;
    }

    body {
        font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
        font-size: 11pt;
        line-height: 1.6;
        color: #333;
    }

    h1 {
        font-size: 24pt;
        font-weight: bold;
        margin-top: 0;
        margin-bottom: 12pt;
        color: #000;
        page-break-after: avoid;
    }

    h2 {
        font-size: 18pt;
        font-weight: bold;
        margin-top: 18pt;
        margin-bottom: 10pt;
        color: #000;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        font-weight: bold;
        margin-top: 14pt;
        margin-bottom: 8pt;
        color: #000;
        page-break-after: avoid;
    }

    p {
        margin: 0 0 10pt 0;
        text-align: justify;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 12pt 0;
        page-break-inside: avoid;
    }

    table th {
        background-color: #f0f0f0;
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
    }

    code {
        font-family: "Courier New", monospace;
        font-size: 10pt;
        background-color: #f5f5f5;
        padding: 2pt 4pt;
        border-radius: 3px;
    }

    pre {
        background-color: #f5f5f5;
        border: 1px solid #ddd;
        border-radius: 5px;
        padding: 10pt;
        overflow-x: auto;
        page-break-inside: avoid;
    }

    pre code {
        background-color: transparent;
        padding: 0;
    }

    img {
        max-width: 100%;
        height: auto;
        display: block;
        margin: 12pt auto;
        page-break-inside: avoid;
    }

    blockquote {
        border-left: 4px solid #ddd;
        padding-left: 12pt;
        margin: 12pt 0;
        color: #666;
        font-style: italic;
    }

    ul, ol {
        margin: 10pt 0;
        padding-left: 20pt;
    }

    li {
        margin: 5pt 0;
    }

    hr {
        border: none;
        border-top: 1px solid #ccc;
        margin: 20pt 0;
    }

    .page-break {
        page-break-after: always;
    }
    """

    # Academic paper template
    ACADEMIC_CSS = """
    @page {
        size: A4;
        margin: 2.5cm;
    }

    body {
        font-family: "Times New Roman", "Noto Serif CJK SC", serif;
        font-size: 12pt;
        line-height: 1.8;
        color: #000;
    }

    h1 {
        font-size: 20pt;
        text-align: center;
        margin-bottom: 24pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        margin-top: 16pt;
        margin-bottom: 10pt;
        page-break-after: avoid;
    }

    p {
        text-indent: 2em;
        text-align: justify;
        margin: 0 0 12pt 0;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt auto;
        page-break-inside: avoid;
    }

    table caption {
        font-weight: bold;
        margin-bottom: 8pt;
    }
    """

    # Business report template
    BUSINESS_CSS = """
    @page {
        size: A4;
        margin: 2cm 2.5cm;
    }

    body {
        font-family: "Arial", "Noto Sans CJK SC", sans-serif;
        font-size: 11pt;
        line-height: 1.5;
        color: #333;
    }

    h1 {
        font-size: 22pt;
        color: #0066cc;
        border-bottom: 3px solid #0066cc;
        padding-bottom: 8pt;
        margin-bottom: 20pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        color: #0066cc;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt 0;
    }

    table th {
        background-color: #0066cc;
        color: white;
        padding: 10pt;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ddd;
        padding: 10pt;
    }

    table tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    """

    def __init__(self):
        """Initialize PDF generator"""
        self.css_templates = {
            "default": self.DEFAULT_CSS,
            "academic": self.ACADEMIC_CSS,
            "business": self.BUSINESS_CSS,
        }

    def check_pandoc_available(self) -> bool:
        """
        Check if Pandoc is installed and available

        Returns:
            bool: True if pandoc is available, False otherwise
        """
        try:
            result = subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.warning("Pandoc not found or timed out")
            return False

    def generate_pdf_pandoc(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using Pandoc (preferred method)

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        # Initialized up front so the cleanup in the except blocks is safe even
        # if the failure happens before the temporary CSS file is written
        css_file: Optional[Path] = None
        try:
            # Create temporary CSS file
            css_content = self.css_templates.get(css_template, css_template)
            css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css"
            css_file.write_text(css_content, encoding="utf-8")

            # Build pandoc command
            pandoc_cmd = [
                "pandoc",
                str(markdown_path),
                "-o", str(output_path),
                "--pdf-engine=weasyprint",
                "--css", str(css_file),
                "--standalone",
                "--from=markdown+tables+fenced_code_blocks+footnotes",
            ]

            # Add metadata if provided
            if metadata:
                if metadata.get("title"):
                    pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"])
                if metadata.get("author"):
                    pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"])
                if metadata.get("date"):
                    pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"])

            # Execute pandoc
            logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}")
            result = subprocess.run(
                pandoc_cmd,
                capture_output=True,
                text=True,
                timeout=60  # 60 second timeout for large documents
            )

            # Clean up temporary CSS file
            css_file.unlink(missing_ok=True)

            if result.returncode != 0:
                error_msg = f"Pandoc failed: {result.stderr}"
                logger.error(error_msg)
                raise PDFGenerationError(error_msg)

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info(f"PDF generated successfully via Pandoc: {output_path}")
            return output_path

        except subprocess.TimeoutExpired:
            if css_file is not None:
                css_file.unlink(missing_ok=True)
            raise PDFGenerationError("Pandoc execution timed out")
        except Exception as e:
            if css_file is not None:
                css_file.unlink(missing_ok=True)
            raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}")

    def generate_pdf_weasyprint(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using WeasyPrint directly (fallback method)

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        try:
            # Read Markdown content
            markdown_content = markdown_path.read_text(encoding="utf-8")

            # Convert Markdown to HTML
            html_content = markdown(
                markdown_content,
                extensions=[
                    'tables',
                    'fenced_code',
                    'codehilite',
                    'nl2br',
                    'sane_lists',
                ]
            )

            # Wrap HTML with proper structure
            title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem
            full_html = f"""
            <!DOCTYPE html>
            <html lang="zh-CN">
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
            </head>
            <body>
                {html_content}
            </body>
            </html>
            """

            # Get CSS content
            css_content = self.css_templates.get(css_template, css_template)

            # Generate PDF
            logger.info(f"Generating PDF via WeasyPrint: {output_path}")
            html = HTML(string=full_html, base_url=str(markdown_path.parent))
            css = CSS(string=css_content)
            html.write_pdf(str(output_path), stylesheets=[css])

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info(f"PDF generated successfully via WeasyPrint: {output_path}")
            return output_path

        except Exception as e:
            raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(e)}")

    def generate_pdf(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None,
        prefer_pandoc: bool = True
    ) -> Path:
        """
        Generate PDF from Markdown with automatic fallback

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name ("default", "academic", "business") or custom CSS
            metadata: Optional metadata dict (title, author, date)
            prefer_pandoc: Use Pandoc if available, fall back to WeasyPrint otherwise

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If both methods fail
        """
        if not markdown_path.exists():
            raise PDFGenerationError(f"Markdown file not found: {markdown_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Try Pandoc first if preferred and available
        if prefer_pandoc and self.check_pandoc_available():
            try:
                return self.generate_pdf_pandoc(markdown_path, output_path, css_template, metadata)
            except PDFGenerationError as e:
                logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}")
                # Fall through to WeasyPrint

        # Use WeasyPrint (fallback or direct)
        return self.generate_pdf_weasyprint(markdown_path, output_path, css_template, metadata)

    def get_available_templates(self) -> Dict[str, str]:
        """
        Get list of available CSS templates

        Returns:
            Dict mapping template names to descriptions
        """
        return {
            "default": "通用排版模板,適合大多數文檔",
            "academic": "學術論文模板,適合研究報告",
            "business": "商業報告模板,適合企業文檔",
        }

    def save_custom_template(self, template_name: str, css_content: str) -> None:
        """
        Save a custom CSS template

        Args:
            template_name: Template name
            css_content: CSS content
        """
        self.css_templates[template_name] = css_content
        logger.info(f"Custom CSS template saved: {template_name}")
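# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the Markdown/PDF paths and metadata
# values are hypothetical). generate_pdf() tries Pandoc first when it is on
# PATH and otherwise falls back to the pure-Python WeasyPrint path:
#
#   from pathlib import Path
#
#   generator = PDFGenerator()
#   pdf_path = generator.generate_pdf(
#       markdown_path=Path("ocr_output/demo-001_output.md"),
#       output_path=Path("ocr_output/demo-001.pdf"),
#       css_template="academic",
#       metadata={"title": "OCR Result", "author": "Tool_OCR", "date": "2024-01-01"},
#   )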
230
backend/app/services/preprocessor.py
Normal file
@@ -0,0 +1,230 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""

import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np

from app.core.config import settings

logger = logging.getLogger(__name__)


class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization
    Validates and prepares documents for OCR processing
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    def __init__(self):
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message)
        """
        try:
            # Check file exists
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            # Check file size
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect file format using magic numbers
            mime = magic.Magic(mime=True)
            mime_type = mime.from_file(str(file_path))

            # Map MIME type to format
            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            # Check if format is in allowed extensions
            if file_format not in self.allowed_extensions:
                return False, None, f"File format '{file_format}' not allowed"

            # Validate file integrity
            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info(f"File validated successfully: {file_path.name} ({file_format})")
            return True, file_format, None

        except Exception as e:
            logger.error(f"File validation error: {str(e)}")
            return False, None, f"Validation error: {str(e)}"

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """Convert MIME type to file format"""
        mime_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/bmp': 'bmp',
            'image/tiff': 'tiff',
            'image/x-tiff': 'tiff',
            'application/pdf': 'pdf',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        }
        return mime_map.get(mime_type)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                # Try to open image
                with Image.open(file_path) as img:
                    img.verify()  # Verify image integrity
                # Reopen for actual check (verify() closes the file)
                with Image.open(file_path) as img:
                    _ = img.size  # Force load to detect corruption
                return True, None

            elif file_format == 'pdf':
                # Basic PDF validation - check file starts with PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                    if header != b'%PDF-':
                        return False, "Invalid PDF header"
                return True, None

            elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Office documents - basic validation (check file size and can be opened)
                # Modern Office formats (docx, pptx) are ZIP-based
                if file_format in ['docx', 'pptx']:
                    import zipfile
                    try:
                        with zipfile.ZipFile(file_path, 'r') as zf:
                            # Check if it has the required Office structure
                            if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
                                return False, "Invalid DOCX structure"
                            elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
                                return False, "Invalid PPTX structure"
                    except zipfile.BadZipFile:
                        return False, "Invalid Office file (corrupt ZIP)"
                # Old formats (doc, ppt) - just check file exists and has content
                return True, None

            else:
                return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement
            output_path: Optional output path (defaults to the processed files directory)

        Returns:
            Tuple of (success, processed_image_path, error_message)
        """
        try:
            # Read image
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding to handle varying lighting
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            # Determine output path
            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"

            # Save processed image
            cv2.imwrite(str(output_path), processed)

            logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
            return True, output_path, None

        except Exception as e:
            logger.error(f"Image preprocessing error: {str(e)}")
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information

        Args:
            file_path: Path to file

        Returns:
            Dictionary with file information
        """
        stat = file_path.stat()
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(str(file_path))

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
        }
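# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the upload paths are hypothetical):
#
#   from pathlib import Path
#
#   preprocessor = DocumentPreprocessor()
#   is_valid, file_format, error = preprocessor.validate_file(Path("uploads/scan_001.png"))
#   if is_valid:
#       ok, processed_path, err = preprocessor.preprocess_image(
#           Path("uploads/scan_001.png"), enhance=True
#       )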
282
backend/app/services/translation_service.py
Normal file
@@ -0,0 +1,282 @@
"""
Tool_OCR - Translation Service (RESERVED)
Abstract interface and stub implementation for future translation feature
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, List
from enum import Enum
import logging


logger = logging.getLogger(__name__)


class TranslationEngine(str, Enum):
    """Supported translation engines"""
    OFFLINE = "offline"  # Argos Translate (offline)
    ERNIE = "ernie"      # Baidu ERNIE API
    GOOGLE = "google"    # Google Translate API
    DEEPL = "deepl"      # DeepL API


class LanguageCode(str, Enum):
    """Supported language codes"""
    CHINESE = "zh"
    ENGLISH = "en"
    JAPANESE = "ja"
    KOREAN = "ko"
    FRENCH = "fr"
    GERMAN = "de"
    SPANISH = "es"


class TranslationServiceInterface(ABC):
    """
    Abstract interface for translation services

    This interface defines the contract for all translation engine implementations.
    Future implementations should inherit from this class.
    """

    @abstractmethod
    def translate_text(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        **kwargs
    ) -> str:
        """
        Translate a single text string

        Args:
            text: Text to translate
            source_lang: Source language code
            target_lang: Target language code
            **kwargs: Engine-specific parameters

        Returns:
            str: Translated text
        """
        pass

    @abstractmethod
    def translate_document(
        self,
        markdown_content: str,
        source_lang: str,
        target_lang: str,
        preserve_structure: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Translate a Markdown document while preserving structure

        Args:
            markdown_content: Markdown content to translate
            source_lang: Source language code
            target_lang: Target language code
            preserve_structure: Whether to preserve markdown structure
            **kwargs: Engine-specific parameters

        Returns:
            Dict containing:
                - translated_content: Translated markdown
                - metadata: Translation metadata (engine, time, etc.)
        """
        pass

    @abstractmethod
    def batch_translate(
        self,
        texts: List[str],
        source_lang: str,
        target_lang: str,
        **kwargs
    ) -> List[str]:
        """
        Translate multiple texts in batch

        Args:
            texts: List of texts to translate
            source_lang: Source language code
            target_lang: Target language code
            **kwargs: Engine-specific parameters

        Returns:
            List[str]: List of translated texts
        """
        pass

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported language codes for this engine

        Returns:
            List[str]: List of supported language codes
        """
        pass

    @abstractmethod
    def validate_config(self) -> bool:
        """
        Validate engine configuration (API keys, model files, etc.)

        Returns:
            bool: True if configuration is valid
        """
        pass


class TranslationEngineFactory:
    """
    Factory for creating translation engine instances

    RESERVED: This is a placeholder for future implementation.
    When the translation feature is implemented, this factory will instantiate
    the appropriate translation engine based on configuration.
    """

    @staticmethod
    def create_engine(
        engine_type: TranslationEngine,
        config: Optional[Dict] = None
    ) -> TranslationServiceInterface:
        """
        Create a translation engine instance

        Args:
            engine_type: Type of translation engine
            config: Engine-specific configuration

        Returns:
            TranslationServiceInterface: Translation engine instance

        Raises:
            NotImplementedError: Always raised (stub implementation)
        """
        raise NotImplementedError(
            "Translation feature is not yet implemented. "
            "This is a reserved placeholder for future development."
        )

    @staticmethod
    def get_available_engines() -> List[str]:
        """
        Get list of available translation engines

        Returns:
            List[str]: List of engine types (currently empty)
        """
        return []

    @staticmethod
    def is_engine_available(engine_type: TranslationEngine) -> bool:
        """
        Check if a specific engine is available

        Args:
            engine_type: Engine type to check

        Returns:
            bool: Always False (stub implementation)
        """
        return False


class StubTranslationService:
    """
    Stub translation service for API endpoints

    This service provides placeholder responses for translation endpoints
    until the feature is fully implemented.
    """

    @staticmethod
    def get_feature_status() -> Dict[str, Any]:
        """
        Get translation feature status

        Returns:
            Dict with feature status information
        """
        return {
            "available": False,
            "status": "reserved",
            "message": "Translation feature is reserved for future implementation",
            "supported_engines": [],
            "planned_engines": [
                {
                    "type": "offline",
                    "name": "Argos Translate",
                    "description": "Offline neural translation",
                    "status": "planned"
                },
                {
                    "type": "ernie",
                    "name": "Baidu ERNIE",
                    "description": "Baidu AI translation API",
                    "status": "planned"
                },
                {
                    "type": "google",
                    "name": "Google Translate",
                    "description": "Google Cloud Translation API",
                    "status": "planned"
                },
                {
                    "type": "deepl",
                    "name": "DeepL",
                    "description": "DeepL translation API",
                    "status": "planned"
                }
            ],
            "roadmap": {
                "phase": "Phase 5",
                "priority": "low",
                "implementation_after": "Production deployment and user feedback"
            }
        }

    @staticmethod
    def get_supported_languages() -> List[Dict[str, str]]:
        """
        Get list of languages planned for translation support

        Returns:
            List of language info dicts
        """
        return [
            {"code": "zh", "name": "Chinese (Simplified)", "status": "planned"},
            {"code": "en", "name": "English", "status": "planned"},
            {"code": "ja", "name": "Japanese", "status": "planned"},
            {"code": "ko", "name": "Korean", "status": "planned"},
            {"code": "fr", "name": "French", "status": "planned"},
            {"code": "de", "name": "German", "status": "planned"},
            {"code": "es", "name": "Spanish", "status": "planned"},
        ]


# Example placeholder for future engine implementations:
#
# class ArgosTranslationEngine(TranslationServiceInterface):
#     """Offline translation using Argos Translate"""
#     def __init__(self, model_path: str):
#         self.model_path = model_path
#         # Initialize Argos models
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
#
# class ERNIETranslationEngine(TranslationServiceInterface):
#     """Baidu ERNIE API translation"""
#     def __init__(self, api_key: str, api_secret: str):
#         self.api_key = api_key
#         self.api_secret = api_secret
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
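# --------------------------------------------------------------------------
# Example usage of the stub API (illustrative only; every call currently
# reports the feature as unavailable):
#
#   status = StubTranslationService.get_feature_status()          # {"available": False, ...}
#   languages = StubTranslationService.get_supported_languages()  # planned language list
#   TranslationEngineFactory.is_engine_available(TranslationEngine.OFFLINE)  # False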