first
This commit is contained in:
3
backend/app/services/__init__.py
Normal file
3
backend/app/services/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Tool_OCR - Services Package
|
||||
"""
|
||||
394
backend/app/services/background_tasks.py
Normal file
394
backend/app/services/background_tasks.py
Normal file
@@ -0,0 +1,394 @@
|
||||
"""
|
||||
Tool_OCR - Background Tasks Service
|
||||
Handles async processing, cleanup, and scheduled tasks
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable, Any
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
|
||||
from app.services.ocr_service import OCRService
|
||||
from app.services.file_manager import FileManager
|
||||
from app.services.pdf_generator import PDFGenerator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BackgroundTaskManager:
|
||||
"""
|
||||
Manages background tasks including retry logic, cleanup, and scheduled jobs
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_retries: int = 3,
|
||||
retry_delay: int = 5,
|
||||
cleanup_interval: int = 3600, # 1 hour
|
||||
file_retention_hours: int = 24
|
||||
):
|
||||
self.max_retries = max_retries
|
||||
self.retry_delay = retry_delay
|
||||
self.cleanup_interval = cleanup_interval
|
||||
self.file_retention_hours = file_retention_hours
|
||||
self.ocr_service = OCRService()
|
||||
self.file_manager = FileManager()
|
||||
self.pdf_generator = PDFGenerator()
|
||||
|
||||
async def execute_with_retry(
|
||||
self,
|
||||
func: Callable,
|
||||
*args,
|
||||
max_retries: Optional[int] = None,
|
||||
retry_delay: Optional[int] = None,
|
||||
**kwargs
|
||||
) -> Any:
|
||||
"""
|
||||
Execute a function with retry logic
|
||||
|
||||
Args:
|
||||
func: Function to execute
|
||||
args: Positional arguments for func
|
||||
max_retries: Maximum retry attempts (overrides default)
|
||||
retry_delay: Delay between retries in seconds (overrides default)
|
||||
kwargs: Keyword arguments for func
|
||||
|
||||
Returns:
|
||||
Function result
|
||||
|
||||
Raises:
|
||||
Exception: If all retries are exhausted
|
||||
"""
|
||||
max_retries = max_retries or self.max_retries
|
||||
retry_delay = retry_delay or self.retry_delay
|
||||
|
||||
last_exception = None
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
if asyncio.iscoroutinefunction(func):
|
||||
return await func(*args, **kwargs)
|
||||
else:
|
||||
return func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
if attempt < max_retries:
|
||||
logger.warning(
|
||||
f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. "
|
||||
f"Retrying in {retry_delay}s..."
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
else:
|
||||
logger.error(
|
||||
f"All {max_retries + 1} attempts failed for {func.__name__}: {e}"
|
||||
)
|
||||
|
||||
raise last_exception
|
||||
|
||||
def process_single_file_with_retry(
|
||||
self,
|
||||
ocr_file: OCRFile,
|
||||
batch_id: int,
|
||||
lang: str,
|
||||
detect_layout: bool,
|
||||
db: Session
|
||||
) -> bool:
|
||||
"""
|
||||
Process a single file with retry logic
|
||||
|
||||
Args:
|
||||
ocr_file: OCRFile instance
|
||||
batch_id: Batch ID
|
||||
lang: Language code
|
||||
detect_layout: Whether to detect layout
|
||||
db: Database session
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise
|
||||
"""
|
||||
for attempt in range(self.max_retries + 1):
|
||||
try:
|
||||
# Update file status
|
||||
ocr_file.status = FileStatus.PROCESSING
|
||||
ocr_file.started_at = datetime.utcnow()
|
||||
ocr_file.retry_count = attempt
|
||||
db.commit()
|
||||
|
||||
# Get file paths
|
||||
file_path = Path(ocr_file.file_path)
|
||||
paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)
|
||||
|
||||
# Process OCR
|
||||
result = self.ocr_service.process_image(
|
||||
file_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout
|
||||
)
|
||||
|
||||
# Check if processing was successful
|
||||
if result['status'] != 'success':
|
||||
raise Exception(result.get('error_message', 'Unknown error during OCR processing'))
|
||||
|
||||
# Save results
|
||||
json_path, markdown_path = self.ocr_service.save_results(
|
||||
result=result,
|
||||
output_dir=paths["output_dir"],
|
||||
file_id=str(ocr_file.id)
|
||||
)
|
||||
|
||||
# Extract data from result
|
||||
text_regions = result.get('text_regions', [])
|
||||
layout_data = result.get('layout_data')
|
||||
images_metadata = result.get('images_metadata', [])
|
||||
|
||||
# Calculate average confidence (or use from result)
|
||||
avg_confidence = result.get('average_confidence')
|
||||
|
||||
# Create OCR result record
|
||||
ocr_result = OCRResult(
|
||||
file_id=ocr_file.id,
|
||||
markdown_path=str(markdown_path) if markdown_path else None,
|
||||
json_path=str(json_path) if json_path else None,
|
||||
images_dir=None, # Images dir not used in current implementation
|
||||
detected_language=lang,
|
||||
total_text_regions=len(text_regions),
|
||||
average_confidence=avg_confidence,
|
||||
layout_data=layout_data,
|
||||
images_metadata=images_metadata
|
||||
)
|
||||
db.add(ocr_result)
|
||||
|
||||
# Update file status
|
||||
ocr_file.status = FileStatus.COMPLETED
|
||||
ocr_file.completed_at = datetime.utcnow()
|
||||
ocr_file.processing_time = (ocr_file.completed_at - ocr_file.started_at).total_seconds()
|
||||
|
||||
db.commit()
|
||||
|
||||
logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")
|
||||
|
||||
if attempt < self.max_retries:
|
||||
# Wait before retry
|
||||
time.sleep(self.retry_delay)
|
||||
else:
|
||||
# Final failure
|
||||
ocr_file.status = FileStatus.FAILED
|
||||
ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
|
||||
ocr_file.completed_at = datetime.utcnow()
|
||||
ocr_file.retry_count = attempt
|
||||
db.commit()
|
||||
return False
|
||||
|
||||
return False
|
||||
|
||||
async def cleanup_expired_files(self, db: Session):
|
||||
"""
|
||||
Clean up files and batches older than retention period
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
"""
|
||||
try:
|
||||
cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)
|
||||
|
||||
# Find expired batches
|
||||
expired_batches = db.query(OCRBatch).filter(
|
||||
OCRBatch.created_at < cutoff_time,
|
||||
OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
|
||||
).all()
|
||||
|
||||
logger.info(f"Found {len(expired_batches)} expired batches to clean up")
|
||||
|
||||
for batch in expired_batches:
|
||||
try:
|
||||
# Get batch directory
|
||||
batch_dir = self.file_manager.base_upload_dir / "batches" / str(batch.id)
|
||||
|
||||
# Delete physical files
|
||||
if batch_dir.exists():
|
||||
import shutil
|
||||
shutil.rmtree(batch_dir)
|
||||
logger.info(f"Deleted batch directory: {batch_dir}")
|
||||
|
||||
# Delete database records
|
||||
# Delete results first (foreign key constraint)
|
||||
db.query(OCRResult).filter(
|
||||
OCRResult.file_id.in_(
|
||||
db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
|
||||
)
|
||||
).delete(synchronize_session=False)
|
||||
|
||||
# Delete files
|
||||
db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()
|
||||
|
||||
# Delete batch
|
||||
db.delete(batch)
|
||||
db.commit()
|
||||
|
||||
logger.info(f"Cleaned up expired batch {batch.id}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning up batch {batch.id}: {e}")
|
||||
db.rollback()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup_expired_files: {e}")
|
||||
|
||||
async def generate_pdf_background(
|
||||
self,
|
||||
result_id: int,
|
||||
output_path: str,
|
||||
css_template: str = "default",
|
||||
db: Session = None
|
||||
):
|
||||
"""
|
||||
Generate PDF in background with retry logic
|
||||
|
||||
Args:
|
||||
result_id: OCR result ID
|
||||
output_path: Output PDF path
|
||||
css_template: CSS template name
|
||||
db: Database session
|
||||
"""
|
||||
should_close_db = False
|
||||
if db is None:
|
||||
db = SessionLocal()
|
||||
should_close_db = True
|
||||
|
||||
try:
|
||||
# Get result
|
||||
result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
|
||||
if not result:
|
||||
logger.error(f"Result {result_id} not found")
|
||||
return
|
||||
|
||||
# Generate PDF with retry
|
||||
await self.execute_with_retry(
|
||||
self.pdf_generator.generate_pdf,
|
||||
markdown_path=result.markdown_path,
|
||||
output_path=output_path,
|
||||
css_template=css_template,
|
||||
max_retries=2,
|
||||
retry_delay=3
|
||||
)
|
||||
|
||||
logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate PDF for result {result_id}: {e}")
|
||||
finally:
|
||||
if should_close_db:
|
||||
db.close()
|
||||
|
||||
async def start_cleanup_scheduler(self):
|
||||
"""
|
||||
Start periodic cleanup scheduler
|
||||
|
||||
Runs cleanup task at specified intervals
|
||||
"""
|
||||
logger.info(f"Starting cleanup scheduler (interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)")
|
||||
|
||||
while True:
|
||||
try:
|
||||
db = SessionLocal()
|
||||
await self.cleanup_expired_files(db)
|
||||
db.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup scheduler: {e}")
|
||||
|
||||
# Wait for next interval
|
||||
await asyncio.sleep(self.cleanup_interval)
|
||||
|
||||
|
||||
# Global task manager instance
|
||||
task_manager = BackgroundTaskManager()
|
||||
|
||||
|
||||
def process_batch_files_with_retry(
|
||||
batch_id: int,
|
||||
lang: str,
|
||||
detect_layout: bool,
|
||||
db: Session
|
||||
):
|
||||
"""
|
||||
Process all files in a batch with retry logic
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
lang: Language code
|
||||
detect_layout: Whether to detect layout
|
||||
db: Database session
|
||||
"""
|
||||
try:
|
||||
# Get batch
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
logger.error(f"Batch {batch_id} not found")
|
||||
return
|
||||
|
||||
# Update batch status
|
||||
batch.status = BatchStatus.PROCESSING
|
||||
batch.started_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
# Get pending files
|
||||
files = db.query(OCRFile).filter(
|
||||
OCRFile.batch_id == batch_id,
|
||||
OCRFile.status == FileStatus.PENDING
|
||||
).all()
|
||||
|
||||
logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")
|
||||
|
||||
# Process each file with retry
|
||||
for ocr_file in files:
|
||||
success = task_manager.process_single_file_with_retry(
|
||||
ocr_file=ocr_file,
|
||||
batch_id=batch_id,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
db=db
|
||||
)
|
||||
|
||||
# Update batch progress
|
||||
if success:
|
||||
batch.completed_files += 1
|
||||
else:
|
||||
batch.failed_files += 1
|
||||
|
||||
db.commit()
|
||||
|
||||
# Update batch final status
|
||||
if batch.failed_files == 0:
|
||||
batch.status = BatchStatus.COMPLETED
|
||||
elif batch.completed_files > 0:
|
||||
batch.status = BatchStatus.PARTIAL
|
||||
else:
|
||||
batch.status = BatchStatus.FAILED
|
||||
|
||||
batch.completed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
|
||||
logger.info(
|
||||
f"Batch {batch_id} processing complete: "
|
||||
f"{batch.completed_files} succeeded, {batch.failed_files} failed"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Fatal error processing batch {batch_id}: {e}")
|
||||
try:
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if batch:
|
||||
batch.status = BatchStatus.FAILED
|
||||
batch.completed_at = datetime.utcnow()
|
||||
db.commit()
|
||||
except Exception as commit_error:
|
||||
logger.error(f"Error updating batch status: {commit_error}")
|
||||
512
backend/app/services/export_service.py
Normal file
512
backend/app/services/export_service.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""
|
||||
Tool_OCR - Export Service
|
||||
Handles OCR result export in multiple formats with filtering and formatting rules
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Any
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
|
||||
from app.models.export import ExportRule
|
||||
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExportError(Exception):
|
||||
"""Exception raised for export errors"""
|
||||
pass
|
||||
|
||||
|
||||
class ExportService:
|
||||
"""
|
||||
Export service for OCR results
|
||||
|
||||
Supported formats:
|
||||
- TXT: Plain text export
|
||||
- JSON: Full metadata export
|
||||
- Excel: Tabular data export
|
||||
- Markdown: Direct Markdown export
|
||||
- PDF: Layout-preserved PDF export
|
||||
- ZIP: Batch export archive
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize export service"""
|
||||
self.pdf_generator = PDFGenerator()
|
||||
|
||||
def apply_filters(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
filters: Dict[str, Any]
|
||||
) -> List[OCRResult]:
|
||||
"""
|
||||
Apply filters to OCR results
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
filters: Filter configuration
|
||||
- confidence_threshold: Minimum confidence (0.0-1.0)
|
||||
- filename_pattern: Glob pattern for filename matching
|
||||
- language: Filter by detected language
|
||||
|
||||
Returns:
|
||||
List[OCRResult]: Filtered results
|
||||
"""
|
||||
filtered = results
|
||||
|
||||
# Confidence threshold filter
|
||||
if "confidence_threshold" in filters:
|
||||
threshold = filters["confidence_threshold"]
|
||||
filtered = [r for r in filtered if r.average_confidence and r.average_confidence >= threshold]
|
||||
|
||||
# Filename pattern filter (using simple substring match)
|
||||
if "filename_pattern" in filters:
|
||||
pattern = filters["filename_pattern"].lower()
|
||||
filtered = [
|
||||
r for r in filtered
|
||||
if pattern in r.file.original_filename.lower()
|
||||
]
|
||||
|
||||
# Language filter
|
||||
if "language" in filters:
|
||||
lang = filters["language"]
|
||||
filtered = [r for r in filtered if r.detected_language == lang]
|
||||
|
||||
return filtered
|
||||
|
||||
def export_to_txt(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
formatting: Optional[Dict] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to plain text file
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
formatting: Formatting options
|
||||
- add_line_numbers: Add line numbers
|
||||
- group_by_filename: Group text by source file
|
||||
- include_metadata: Add file metadata headers
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
formatting = formatting or {}
|
||||
output_lines = []
|
||||
|
||||
for idx, result in enumerate(results, 1):
|
||||
# Read Markdown file
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
logger.warning(f"Markdown file not found for result {result.id}")
|
||||
continue
|
||||
|
||||
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
# Add metadata header if requested
|
||||
if formatting.get("include_metadata", False):
|
||||
output_lines.append(f"=" * 80)
|
||||
output_lines.append(f"文件: {result.file.original_filename}")
|
||||
output_lines.append(f"語言: {result.detected_language or '未知'}")
|
||||
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
|
||||
output_lines.append(f"=" * 80)
|
||||
output_lines.append("")
|
||||
|
||||
# Add content with optional line numbers
|
||||
if formatting.get("add_line_numbers", False):
|
||||
for line_num, line in enumerate(markdown_content.split('\n'), 1):
|
||||
output_lines.append(f"{line_num:4d} | {line}")
|
||||
else:
|
||||
output_lines.append(markdown_content)
|
||||
|
||||
# Add separator between files if grouping
|
||||
if formatting.get("group_by_filename", False) and idx < len(results):
|
||||
output_lines.append("\n" + "-" * 80 + "\n")
|
||||
|
||||
# Write to file
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text("\n".join(output_lines), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"TXT export failed: {str(e)}")
|
||||
|
||||
def export_to_json(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
include_layout: bool = True,
|
||||
include_images: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to JSON file with full metadata
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
include_layout: Include layout data
|
||||
include_images: Include images metadata
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
export_data = {
|
||||
"export_time": datetime.utcnow().isoformat(),
|
||||
"total_files": len(results),
|
||||
"results": []
|
||||
}
|
||||
|
||||
for result in results:
|
||||
result_data = {
|
||||
"file_id": result.file.id,
|
||||
"filename": result.file.original_filename,
|
||||
"file_format": result.file.file_format,
|
||||
"file_size": result.file.file_size,
|
||||
"processing_time": result.file.processing_time,
|
||||
"detected_language": result.detected_language,
|
||||
"total_text_regions": result.total_text_regions,
|
||||
"average_confidence": result.average_confidence,
|
||||
"markdown_path": result.markdown_path,
|
||||
}
|
||||
|
||||
# Include layout data if requested
|
||||
if include_layout and result.layout_data:
|
||||
result_data["layout_data"] = result.layout_data
|
||||
|
||||
# Include images metadata if requested
|
||||
if include_images and result.images_metadata:
|
||||
result_data["images_metadata"] = result.images_metadata
|
||||
|
||||
export_data["results"].append(result_data)
|
||||
|
||||
# Write to file
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(
|
||||
json.dumps(export_data, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8"
|
||||
)
|
||||
|
||||
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"JSON export failed: {str(e)}")
|
||||
|
||||
def export_to_excel(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
include_confidence: bool = True,
|
||||
include_processing_time: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to Excel file
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path
|
||||
include_confidence: Include confidence scores
|
||||
include_processing_time: Include processing time
|
||||
|
||||
Returns:
|
||||
Path: Output file path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
rows = []
|
||||
|
||||
for result in results:
|
||||
# Read Markdown content
|
||||
text_content = ""
|
||||
if result.markdown_path and Path(result.markdown_path).exists():
|
||||
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
row = {
|
||||
"文件名": result.file.original_filename,
|
||||
"格式": result.file.file_format,
|
||||
"大小(字節)": result.file.file_size,
|
||||
"語言": result.detected_language or "未知",
|
||||
"文本區域數": result.total_text_regions,
|
||||
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
|
||||
}
|
||||
|
||||
if include_confidence:
|
||||
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
|
||||
|
||||
if include_processing_time:
|
||||
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
|
||||
|
||||
rows.append(row)
|
||||
|
||||
# Create DataFrame and export
|
||||
df = pd.DataFrame(rows)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
df.to_excel(output_path, index=False, engine='openpyxl')
|
||||
|
||||
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Excel export failed: {str(e)}")
|
||||
|
||||
def export_to_markdown(
|
||||
self,
|
||||
results: List[OCRResult],
|
||||
output_path: Path,
|
||||
combine: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Export results to Markdown file(s)
|
||||
|
||||
Args:
|
||||
results: List of OCR results
|
||||
output_path: Output file path (or directory if not combining)
|
||||
combine: Combine all results into one file
|
||||
|
||||
Returns:
|
||||
Path: Output file/directory path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
if combine:
|
||||
# Combine all Markdown files into one
|
||||
combined_content = []
|
||||
|
||||
for result in results:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
continue
|
||||
|
||||
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
|
||||
|
||||
# Add header
|
||||
combined_content.append(f"# {result.file.original_filename}\n")
|
||||
combined_content.append(markdown_content)
|
||||
combined_content.append("\n---\n") # Separator
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text("\n".join(combined_content), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
|
||||
return output_path
|
||||
|
||||
else:
|
||||
# Export each result to separate file
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for result in results:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
continue
|
||||
|
||||
# Copy Markdown file to output directory
|
||||
src_path = Path(result.markdown_path)
|
||||
dst_path = output_path / f"{result.file.original_filename}.md"
|
||||
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
|
||||
|
||||
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
|
||||
return output_path
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Markdown export failed: {str(e)}")
|
||||
|
||||
def export_to_pdf(
|
||||
self,
|
||||
result: OCRResult,
|
||||
output_path: Path,
|
||||
css_template: str = "default",
|
||||
metadata: Optional[Dict] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export single result to PDF with layout preservation
|
||||
|
||||
Args:
|
||||
result: OCR result
|
||||
output_path: Output PDF path
|
||||
css_template: CSS template name or custom CSS
|
||||
metadata: Optional PDF metadata
|
||||
|
||||
Returns:
|
||||
Path: Output PDF path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
if not result.markdown_path or not Path(result.markdown_path).exists():
|
||||
raise ExportError(f"Markdown file not found for result {result.id}")
|
||||
|
||||
markdown_path = Path(result.markdown_path)
|
||||
|
||||
# Prepare metadata
|
||||
pdf_metadata = metadata or {}
|
||||
if "title" not in pdf_metadata:
|
||||
pdf_metadata["title"] = result.file.original_filename
|
||||
|
||||
# Generate PDF
|
||||
self.pdf_generator.generate_pdf(
|
||||
markdown_path=markdown_path,
|
||||
output_path=output_path,
|
||||
css_template=css_template,
|
||||
metadata=pdf_metadata
|
||||
)
|
||||
|
||||
logger.info(f"Exported result {result.id} to PDF: {output_path}")
|
||||
return output_path
|
||||
|
||||
except PDFGenerationError as e:
|
||||
raise ExportError(f"PDF generation failed: {str(e)}")
|
||||
except Exception as e:
|
||||
raise ExportError(f"PDF export failed: {str(e)}")
|
||||
|
||||
def export_batch_to_zip(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
output_path: Path,
|
||||
include_formats: Optional[List[str]] = None
|
||||
) -> Path:
|
||||
"""
|
||||
Export entire batch to ZIP archive
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
output_path: Output ZIP path
|
||||
include_formats: List of formats to include (markdown, json, txt, excel, pdf)
|
||||
|
||||
Returns:
|
||||
Path: Output ZIP path
|
||||
|
||||
Raises:
|
||||
ExportError: If export fails
|
||||
"""
|
||||
try:
|
||||
include_formats = include_formats or ["markdown", "json"]
|
||||
|
||||
# Get batch and results
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
raise ExportError(f"Batch {batch_id} not found")
|
||||
|
||||
results = db.query(OCRResult).join(OCRFile).filter(
|
||||
OCRFile.batch_id == batch_id,
|
||||
OCRFile.status == FileStatus.COMPLETED
|
||||
).all()
|
||||
|
||||
if not results:
|
||||
raise ExportError(f"No completed results found for batch {batch_id}")
|
||||
|
||||
# Create temporary export directory
|
||||
temp_dir = output_path.parent / f"temp_export_{batch_id}"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Export in requested formats
|
||||
if "markdown" in include_formats:
|
||||
md_dir = temp_dir / "markdown"
|
||||
self.export_to_markdown(results, md_dir, combine=False)
|
||||
|
||||
if "json" in include_formats:
|
||||
json_path = temp_dir / "batch_results.json"
|
||||
self.export_to_json(results, json_path)
|
||||
|
||||
if "txt" in include_formats:
|
||||
txt_path = temp_dir / "batch_results.txt"
|
||||
self.export_to_txt(results, txt_path)
|
||||
|
||||
if "excel" in include_formats:
|
||||
excel_path = temp_dir / "batch_results.xlsx"
|
||||
self.export_to_excel(results, excel_path)
|
||||
|
||||
# Create ZIP archive
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||
for file_path in temp_dir.rglob('*'):
|
||||
if file_path.is_file():
|
||||
arcname = file_path.relative_to(temp_dir)
|
||||
zipf.write(file_path, arcname)
|
||||
|
||||
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
|
||||
return output_path
|
||||
|
||||
finally:
|
||||
# Clean up temporary directory
|
||||
import shutil
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
except Exception as e:
|
||||
raise ExportError(f"Batch ZIP export failed: {str(e)}")
|
||||
|
||||
def apply_export_rule(
|
||||
self,
|
||||
db: Session,
|
||||
results: List[OCRResult],
|
||||
rule_id: int
|
||||
) -> List[OCRResult]:
|
||||
"""
|
||||
Apply export rule to filter and format results
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
results: List of OCR results
|
||||
rule_id: Export rule ID
|
||||
|
||||
Returns:
|
||||
List[OCRResult]: Filtered results
|
||||
|
||||
Raises:
|
||||
ExportError: If rule not found
|
||||
"""
|
||||
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
|
||||
if not rule:
|
||||
raise ExportError(f"Export rule {rule_id} not found")
|
||||
|
||||
config = rule.config_json
|
||||
|
||||
# Apply filters
|
||||
if "filters" in config:
|
||||
results = self.apply_filters(results, config["filters"])
|
||||
|
||||
# Note: Formatting options are applied in individual export methods
|
||||
return results
|
||||
|
||||
def get_export_formats(self) -> Dict[str, str]:
|
||||
"""
|
||||
Get available export formats
|
||||
|
||||
Returns:
|
||||
Dict mapping format codes to descriptions
|
||||
"""
|
||||
return {
|
||||
"txt": "純文本格式 (.txt)",
|
||||
"json": "JSON 格式 - 包含完整元數據 (.json)",
|
||||
"excel": "Excel 表格格式 (.xlsx)",
|
||||
"markdown": "Markdown 格式 (.md)",
|
||||
"pdf": "版面保留 PDF 格式 (.pdf)",
|
||||
"zip": "批次打包格式 (.zip)",
|
||||
}
|
||||
420
backend/app/services/file_manager.py
Normal file
420
backend/app/services/file_manager.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
Tool_OCR - File Management Service
|
||||
Handles file uploads, storage, validation, and cleanup
|
||||
"""
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from fastapi import UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import settings
|
||||
from app.models.ocr import OCRBatch, OCRFile, FileStatus
|
||||
from app.services.preprocessor import DocumentPreprocessor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileManagementError(Exception):
|
||||
"""Exception raised for file management errors"""
|
||||
pass
|
||||
|
||||
|
||||
class FileManager:
|
||||
"""
|
||||
File management service for upload, storage, and cleanup
|
||||
|
||||
Directory structure:
|
||||
uploads/
|
||||
├── batches/
|
||||
│ └── {batch_id}/
|
||||
│ ├── inputs/ # Original uploaded files
|
||||
│ ├── outputs/ # OCR results
|
||||
│ │ ├── markdown/ # Markdown files
|
||||
│ │ ├── json/ # JSON files
|
||||
│ │ └── images/ # Extracted images
|
||||
│ └── exports/ # Export files (PDF, Excel, etc.)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize file manager"""
|
||||
self.preprocessor = DocumentPreprocessor()
|
||||
self.base_upload_dir = Path(settings.upload_dir)
|
||||
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def create_batch_directory(self, batch_id: int) -> Path:
|
||||
"""
|
||||
Create directory structure for a batch
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Path: Batch directory path
|
||||
"""
|
||||
batch_dir = self.base_upload_dir / "batches" / str(batch_id)
|
||||
|
||||
# Create subdirectories
|
||||
(batch_dir / "inputs").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "markdown").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "json").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "outputs" / "images").mkdir(parents=True, exist_ok=True)
|
||||
(batch_dir / "exports").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info(f"Created batch directory: {batch_dir}")
|
||||
return batch_dir
|
||||
|
||||
def get_batch_directory(self, batch_id: int) -> Path:
|
||||
"""
|
||||
Get batch directory path
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Path: Batch directory path
|
||||
"""
|
||||
return self.base_upload_dir / "batches" / str(batch_id)
|
||||
|
||||
def validate_upload(self, file: UploadFile) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Validate uploaded file before saving
|
||||
|
||||
Args:
|
||||
file: Uploaded file
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# Check filename
|
||||
if not file.filename:
|
||||
return False, "文件名不能為空"
|
||||
|
||||
# Check file size (read content size)
|
||||
file.file.seek(0, 2) # Seek to end
|
||||
file_size = file.file.tell()
|
||||
file.file.seek(0) # Reset to beginning
|
||||
|
||||
if file_size == 0:
|
||||
return False, "文件為空"
|
||||
|
||||
if file_size > settings.max_upload_size:
|
||||
max_mb = settings.max_upload_size / (1024 * 1024)
|
||||
return False, f"文件大小超過限制 ({max_mb}MB)"
|
||||
|
||||
# Check file extension
|
||||
file_ext = Path(file.filename).suffix.lower()
|
||||
allowed_extensions = {'.png', '.jpg', '.jpeg', '.pdf', '.doc', '.docx', '.ppt', '.pptx'}
|
||||
if file_ext not in allowed_extensions:
|
||||
return False, f"不支持的文件格式 ({file_ext}),僅支持: {', '.join(allowed_extensions)}"
|
||||
|
||||
return True, None
|
||||
|
||||
def save_upload(
|
||||
self,
|
||||
file: UploadFile,
|
||||
batch_id: int,
|
||||
validate: bool = True
|
||||
) -> Tuple[Path, str]:
|
||||
"""
|
||||
Save uploaded file to batch directory
|
||||
|
||||
Args:
|
||||
file: Uploaded file
|
||||
batch_id: Batch ID
|
||||
validate: Whether to validate file
|
||||
|
||||
Returns:
|
||||
Tuple of (file_path, original_filename)
|
||||
|
||||
Raises:
|
||||
FileManagementError: If file validation or saving fails
|
||||
"""
|
||||
# Validate if requested
|
||||
if validate:
|
||||
is_valid, error_msg = self.validate_upload(file)
|
||||
if not is_valid:
|
||||
raise FileManagementError(error_msg)
|
||||
|
||||
# Generate unique filename to avoid conflicts
|
||||
original_filename = file.filename
|
||||
file_ext = Path(original_filename).suffix
|
||||
unique_filename = f"{uuid.uuid4()}{file_ext}"
|
||||
|
||||
# Get batch input directory
|
||||
batch_dir = self.get_batch_directory(batch_id)
|
||||
input_dir = batch_dir / "inputs"
|
||||
input_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save file
|
||||
file_path = input_dir / unique_filename
|
||||
try:
|
||||
with file_path.open("wb") as buffer:
|
||||
shutil.copyfileobj(file.file, buffer)
|
||||
|
||||
logger.info(f"Saved upload: {file_path} (original: {original_filename})")
|
||||
return file_path, original_filename
|
||||
|
||||
except Exception as e:
|
||||
# Clean up partial file if exists
|
||||
file_path.unlink(missing_ok=True)
|
||||
raise FileManagementError(f"保存文件失敗: {str(e)}")
|
||||
|
||||
def validate_saved_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
|
||||
"""
|
||||
Validate saved file using preprocessor
|
||||
|
||||
Args:
|
||||
file_path: Path to saved file
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message, detected_format)
|
||||
"""
|
||||
return self.preprocessor.validate_file(file_path)
|
||||
|
||||
def create_batch(
|
||||
self,
|
||||
db: Session,
|
||||
user_id: int,
|
||||
batch_name: Optional[str] = None
|
||||
) -> OCRBatch:
|
||||
"""
|
||||
Create new OCR batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
user_id: User ID
|
||||
batch_name: Optional batch name
|
||||
|
||||
Returns:
|
||||
OCRBatch: Created batch object
|
||||
"""
|
||||
# Create batch record
|
||||
batch = OCRBatch(
|
||||
user_id=user_id,
|
||||
batch_name=batch_name or f"Batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
||||
)
|
||||
db.add(batch)
|
||||
db.commit()
|
||||
db.refresh(batch)
|
||||
|
||||
# Create directory structure
|
||||
self.create_batch_directory(batch.id)
|
||||
|
||||
logger.info(f"Created batch: {batch.id} for user {user_id}")
|
||||
return batch
|
||||
|
||||
def add_file_to_batch(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
file: UploadFile
|
||||
) -> OCRFile:
|
||||
"""
|
||||
Add file to batch and save to disk
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
file: Uploaded file
|
||||
|
||||
Returns:
|
||||
OCRFile: Created file record
|
||||
|
||||
Raises:
|
||||
FileManagementError: If file operations fail
|
||||
"""
|
||||
# Save file to disk
|
||||
file_path, original_filename = self.save_upload(file, batch_id)
|
||||
|
||||
# Validate saved file
|
||||
is_valid, detected_format, error_msg = self.validate_saved_file(file_path)
|
||||
|
||||
# Create file record
|
||||
ocr_file = OCRFile(
|
||||
batch_id=batch_id,
|
||||
filename=file_path.name,
|
||||
original_filename=original_filename,
|
||||
file_path=str(file_path),
|
||||
file_size=file_path.stat().st_size,
|
||||
file_format=detected_format or Path(original_filename).suffix.lower().lstrip('.'),
|
||||
status=FileStatus.PENDING if is_valid else FileStatus.FAILED,
|
||||
error_message=error_msg if not is_valid else None
|
||||
)
|
||||
|
||||
db.add(ocr_file)
|
||||
|
||||
# Update batch total_files count
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if batch:
|
||||
batch.total_files += 1
|
||||
if not is_valid:
|
||||
batch.failed_files += 1
|
||||
|
||||
db.commit()
|
||||
db.refresh(ocr_file)
|
||||
|
||||
logger.info(f"Added file to batch {batch_id}: {ocr_file.id} (status: {ocr_file.status})")
|
||||
return ocr_file
|
||||
|
||||
def add_files_to_batch(
|
||||
self,
|
||||
db: Session,
|
||||
batch_id: int,
|
||||
files: List[UploadFile]
|
||||
) -> List[OCRFile]:
|
||||
"""
|
||||
Add multiple files to batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
files: List of uploaded files
|
||||
|
||||
Returns:
|
||||
List[OCRFile]: List of created file records
|
||||
"""
|
||||
ocr_files = []
|
||||
for file in files:
|
||||
try:
|
||||
ocr_file = self.add_file_to_batch(db, batch_id, file)
|
||||
ocr_files.append(ocr_file)
|
||||
except FileManagementError as e:
|
||||
logger.error(f"Failed to add file {file.filename} to batch {batch_id}: {e}")
|
||||
# Continue with other files
|
||||
continue
|
||||
|
||||
return ocr_files
|
||||
|
||||
def get_file_paths(self, batch_id: int, file_id: int) -> dict:
|
||||
"""
|
||||
Get all paths for a file in a batch
|
||||
|
||||
Args:
|
||||
batch_id: Batch ID
|
||||
file_id: File ID
|
||||
|
||||
Returns:
|
||||
Dict containing all relevant paths
|
||||
"""
|
||||
batch_dir = self.get_batch_directory(batch_id)
|
||||
|
||||
return {
|
||||
"input_dir": batch_dir / "inputs",
|
||||
"output_dir": batch_dir / "outputs",
|
||||
"markdown_dir": batch_dir / "outputs" / "markdown",
|
||||
"json_dir": batch_dir / "outputs" / "json",
|
||||
"images_dir": batch_dir / "outputs" / "images" / str(file_id),
|
||||
"export_dir": batch_dir / "exports",
|
||||
}
|
||||
|
||||
def cleanup_expired_batches(self, db: Session, retention_hours: int = 24) -> int:
|
||||
"""
|
||||
Clean up expired batch files
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
retention_hours: Number of hours to retain files
|
||||
|
||||
Returns:
|
||||
int: Number of batches cleaned up
|
||||
"""
|
||||
cutoff_time = datetime.utcnow() - timedelta(hours=retention_hours)
|
||||
|
||||
# Find expired batches
|
||||
expired_batches = db.query(OCRBatch).filter(
|
||||
OCRBatch.created_at < cutoff_time
|
||||
).all()
|
||||
|
||||
cleaned_count = 0
|
||||
for batch in expired_batches:
|
||||
try:
|
||||
# Delete batch directory
|
||||
batch_dir = self.get_batch_directory(batch.id)
|
||||
if batch_dir.exists():
|
||||
shutil.rmtree(batch_dir)
|
||||
logger.info(f"Deleted batch directory: {batch_dir}")
|
||||
|
||||
# Delete database records (cascade will handle related records)
|
||||
db.delete(batch)
|
||||
cleaned_count += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cleanup batch {batch.id}: {e}")
|
||||
continue
|
||||
|
||||
if cleaned_count > 0:
|
||||
db.commit()
|
||||
logger.info(f"Cleaned up {cleaned_count} expired batches")
|
||||
|
||||
return cleaned_count
|
||||
|
||||
def verify_file_ownership(
|
||||
self,
|
||||
db: Session,
|
||||
user_id: int,
|
||||
batch_id: int
|
||||
) -> bool:
|
||||
"""
|
||||
Verify user owns the batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
user_id: User ID
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
bool: True if user owns batch, False otherwise
|
||||
"""
|
||||
batch = db.query(OCRBatch).filter(
|
||||
OCRBatch.id == batch_id,
|
||||
OCRBatch.user_id == user_id
|
||||
).first()
|
||||
|
||||
return batch is not None
|
||||
|
||||
def get_batch_statistics(self, db: Session, batch_id: int) -> dict:
|
||||
"""
|
||||
Get statistics for a batch
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
batch_id: Batch ID
|
||||
|
||||
Returns:
|
||||
Dict containing batch statistics
|
||||
"""
|
||||
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
|
||||
if not batch:
|
||||
return {}
|
||||
|
||||
# Calculate total file size
|
||||
total_size = sum(f.file_size for f in batch.files)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = None
|
||||
if batch.completed_at and batch.started_at:
|
||||
processing_time = (batch.completed_at - batch.started_at).total_seconds()
|
||||
|
||||
return {
|
||||
"batch_id": batch.id,
|
||||
"batch_name": batch.batch_name,
|
||||
"status": batch.status,
|
||||
"total_files": batch.total_files,
|
||||
"completed_files": batch.completed_files,
|
||||
"failed_files": batch.failed_files,
|
||||
"pending_files": batch.total_files - batch.completed_files - batch.failed_files,
|
||||
"progress_percentage": batch.progress_percentage,
|
||||
"total_file_size": total_size,
|
||||
"total_file_size_mb": round(total_size / (1024 * 1024), 2),
|
||||
"created_at": batch.created_at.isoformat(),
|
||||
"started_at": batch.started_at.isoformat() if batch.started_at else None,
|
||||
"completed_at": batch.completed_at.isoformat() if batch.completed_at else None,
|
||||
"processing_time": processing_time,
|
||||
}
|
||||
516
backend/app/services/ocr_service.py
Normal file
516
backend/app/services/ocr_service.py
Normal file
@@ -0,0 +1,516 @@
|
||||
"""
|
||||
Tool_OCR - Core OCR Service
|
||||
PaddleOCR-VL integration for text and structure extraction
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
from paddleocr import PaddleOCR, PPStructureV3
|
||||
from PIL import Image
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
from app.core.config import settings
|
||||
from app.services.office_converter import OfficeConverter, OfficeConverterError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRService:
|
||||
"""
|
||||
Core OCR service using PaddleOCR-VL
|
||||
Handles text recognition and document structure analysis
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize PaddleOCR and PPStructure engines"""
|
||||
self.ocr_languages = settings.ocr_languages_list
|
||||
self.confidence_threshold = settings.ocr_confidence_threshold
|
||||
|
||||
# Initialize PaddleOCR engine (will be lazy-loaded per language)
|
||||
self.ocr_engines = {}
|
||||
|
||||
# Initialize PP-Structure for layout analysis
|
||||
self.structure_engine = None
|
||||
|
||||
# Initialize Office document converter
|
||||
self.office_converter = OfficeConverter()
|
||||
|
||||
logger.info("OCR Service initialized")
|
||||
|
||||
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
|
||||
"""
|
||||
Get or create OCR engine for specified language
|
||||
|
||||
Args:
|
||||
lang: Language code (ch, en, japan, korean, etc.)
|
||||
|
||||
Returns:
|
||||
PaddleOCR engine instance
|
||||
"""
|
||||
if lang not in self.ocr_engines:
|
||||
logger.info(f"Initializing PaddleOCR engine for language: {lang}")
|
||||
self.ocr_engines[lang] = PaddleOCR(
|
||||
use_angle_cls=True,
|
||||
lang=lang,
|
||||
# Note: show_log and use_gpu parameters removed in PaddleOCR 3.x
|
||||
)
|
||||
logger.info(f"PaddleOCR engine ready for {lang}")
|
||||
|
||||
return self.ocr_engines[lang]
|
||||
|
||||
def get_structure_engine(self) -> PPStructureV3:
|
||||
"""
|
||||
Get or create PP-Structure engine for layout analysis
|
||||
|
||||
Returns:
|
||||
PPStructure engine instance
|
||||
"""
|
||||
if self.structure_engine is None:
|
||||
logger.info("Initializing PP-StructureV3 engine")
|
||||
self.structure_engine = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False,
|
||||
use_table_recognition=True,
|
||||
use_formula_recognition=True,
|
||||
layout_threshold=0.5,
|
||||
)
|
||||
logger.info("PP-StructureV3 engine ready")
|
||||
|
||||
return self.structure_engine
|
||||
|
||||
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
|
||||
"""
|
||||
Convert PDF to images (one per page)
|
||||
|
||||
Args:
|
||||
pdf_path: Path to PDF file
|
||||
output_dir: Directory to save converted images
|
||||
|
||||
Returns:
|
||||
List of paths to converted images
|
||||
"""
|
||||
try:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info(f"Converting PDF {pdf_path.name} to images")
|
||||
|
||||
# Convert PDF to images (300 DPI for good quality)
|
||||
images = convert_from_path(
|
||||
str(pdf_path),
|
||||
dpi=300,
|
||||
fmt='png'
|
||||
)
|
||||
|
||||
image_paths = []
|
||||
for i, image in enumerate(images):
|
||||
# Save each page as PNG
|
||||
image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
|
||||
image.save(str(image_path), 'PNG')
|
||||
image_paths.append(image_path)
|
||||
logger.info(f"Saved page {i+1} to {image_path.name}")
|
||||
|
||||
logger.info(f"Converted {len(image_paths)} pages from PDF")
|
||||
return image_paths
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"PDF conversion error: {str(e)}")
|
||||
raise
|
||||
|
||||
def process_image(
|
||||
self,
|
||||
image_path: Path,
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
lang: Language for OCR
|
||||
detect_layout: Whether to perform layout analysis
|
||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
"""
|
||||
start_time = datetime.now()
|
||||
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
|
||||
|
||||
try:
|
||||
# Check if file is Office document
|
||||
if self.office_converter.is_office_document(image_path):
|
||||
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
|
||||
try:
|
||||
# Convert Office document to PDF
|
||||
pdf_path = self.office_converter.convert_to_pdf(image_path)
|
||||
logger.info(f"Office document converted to PDF: {pdf_path.name}")
|
||||
|
||||
# Process the PDF (will be handled by PDF processing logic below)
|
||||
image_path = pdf_path
|
||||
except OfficeConverterError as e:
|
||||
logger.error(f"Office conversion failed: {str(e)}")
|
||||
raise
|
||||
|
||||
# Check if file is PDF
|
||||
is_pdf = image_path.suffix.lower() == '.pdf'
|
||||
|
||||
if is_pdf:
|
||||
# Convert PDF to images
|
||||
logger.info(f"Detected PDF file: {image_path.name}, converting to images")
|
||||
pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
|
||||
image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)
|
||||
|
||||
# Process all pages
|
||||
all_text_regions = []
|
||||
total_confidence_sum = 0.0
|
||||
total_valid_regions = 0
|
||||
all_layout_data = []
|
||||
all_images_metadata = []
|
||||
|
||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||
|
||||
# Process each page
|
||||
page_result = self.process_image(
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
if page_result['status'] == 'success':
|
||||
# Add page number to each text region
|
||||
for region in page_result['text_regions']:
|
||||
region['page'] = page_num
|
||||
all_text_regions.append(region)
|
||||
|
||||
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
|
||||
total_valid_regions += page_result['total_text_regions']
|
||||
|
||||
# Accumulate layout data
|
||||
if page_result.get('layout_data'):
|
||||
all_layout_data.append(page_result['layout_data'])
|
||||
|
||||
# Accumulate images metadata
|
||||
if page_result.get('images_metadata'):
|
||||
all_images_metadata.extend(page_result['images_metadata'])
|
||||
|
||||
# Calculate overall average confidence
|
||||
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
||||
|
||||
# Combine layout data from all pages
|
||||
combined_layout = None
|
||||
if all_layout_data:
|
||||
combined_elements = []
|
||||
for layout in all_layout_data:
|
||||
if layout.get('elements'):
|
||||
combined_elements.extend(layout['elements'])
|
||||
if combined_elements:
|
||||
combined_layout = {
|
||||
'elements': combined_elements,
|
||||
'total_elements': len(combined_elements),
|
||||
'reading_order': list(range(len(combined_elements))),
|
||||
}
|
||||
|
||||
# Generate combined markdown
|
||||
markdown_content = self.generate_markdown(all_text_regions, combined_layout)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (datetime.now() - start_time).total_seconds()
|
||||
|
||||
logger.info(
|
||||
f"PDF processing completed: {image_path.name} - "
|
||||
f"{len(image_paths)} pages, "
|
||||
f"{len(all_text_regions)} regions, "
|
||||
f"{avg_confidence:.2f} avg confidence, "
|
||||
f"{processing_time:.2f}s"
|
||||
)
|
||||
|
||||
return {
|
||||
'status': 'success',
|
||||
'file_name': image_path.name,
|
||||
'language': lang,
|
||||
'text_regions': all_text_regions,
|
||||
'total_text_regions': len(all_text_regions),
|
||||
'average_confidence': avg_confidence,
|
||||
'layout_data': combined_layout,
|
||||
'images_metadata': all_images_metadata,
|
||||
'markdown_content': markdown_content,
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
'total_pages': len(image_paths),
|
||||
}
|
||||
|
||||
# Get OCR engine (for non-PDF images)
|
||||
ocr_engine = self.get_ocr_engine(lang)
|
||||
|
||||
# Perform OCR
|
||||
logger.info(f"Processing image: {image_path.name}")
|
||||
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
||||
ocr_results = ocr_engine.ocr(str(image_path))
|
||||
|
||||
# Parse OCR results (PaddleOCR 3.x format)
|
||||
text_regions = []
|
||||
total_confidence = 0.0
|
||||
valid_regions = 0
|
||||
|
||||
if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
|
||||
# PaddleOCR 3.x returns a list of dictionaries (one per page)
|
||||
for page_result in ocr_results:
|
||||
if isinstance(page_result, dict):
|
||||
# New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
|
||||
texts = page_result.get('rec_texts', [])
|
||||
scores = page_result.get('rec_scores', [])
|
||||
polys = page_result.get('rec_polys', [])
|
||||
|
||||
# Process each recognized text
|
||||
for idx, text in enumerate(texts):
|
||||
# Get corresponding score and bbox
|
||||
confidence = scores[idx] if idx < len(scores) else 1.0
|
||||
bbox = polys[idx] if idx < len(polys) else []
|
||||
|
||||
# Convert numpy array bbox to list for JSON serialization
|
||||
if hasattr(bbox, 'tolist'):
|
||||
bbox = bbox.tolist()
|
||||
|
||||
# Filter by confidence threshold
|
||||
if confidence >= threshold:
|
||||
text_regions.append({
|
||||
'text': text,
|
||||
'bbox': bbox,
|
||||
'confidence': float(confidence),
|
||||
})
|
||||
total_confidence += confidence
|
||||
valid_regions += 1
|
||||
|
||||
avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
|
||||
|
||||
logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")
|
||||
|
||||
# Layout analysis (if requested)
|
||||
layout_data = None
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time = (datetime.now() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
'status': 'success',
|
||||
'file_name': image_path.name,
|
||||
'language': lang,
|
||||
'text_regions': text_regions,
|
||||
'total_text_regions': len(text_regions),
|
||||
'average_confidence': avg_confidence,
|
||||
'layout_data': layout_data,
|
||||
'images_metadata': images_metadata,
|
||||
'markdown_content': markdown_content,
|
||||
'processing_time': processing_time,
|
||||
'timestamp': datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"OCR completed: {image_path.name} - "
|
||||
f"{len(text_regions)} regions, "
|
||||
f"{avg_confidence:.2f} avg confidence, "
|
||||
f"{processing_time:.2f}s"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
error_trace = traceback.format_exc()
|
||||
logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
|
||||
return {
|
||||
'status': 'error',
|
||||
'file_name': image_path.name,
|
||||
'error_message': str(e),
|
||||
            'processing_time': (datetime.now() - start_time).total_seconds(),
        }

    def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3

        Args:
            image_path: Path to image file

        Returns:
            Tuple of (layout_data, images_metadata)
        """
        try:
            structure_engine = self.get_structure_engine()

            # Perform structure analysis using predict() method (PaddleOCR 3.x API)
            logger.info(f"Running layout analysis on {image_path.name}")
            results = structure_engine.predict(str(image_path))

            layout_elements = []
            images_metadata = []

            # Process each page result (for images, usually just one page)
            for page_idx, page_result in enumerate(results):
                # Get markdown dictionary from result object
                if hasattr(page_result, 'markdown'):
                    markdown_dict = page_result.markdown
                    logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")

                    # Extract layout information from markdown structure
                    if isinstance(markdown_dict, dict):
                        # Get markdown texts (HTML format with tables and structure)
                        markdown_texts = markdown_dict.get('markdown_texts', '')
                        markdown_images = markdown_dict.get('markdown_images', {})

                        # Create a layout element for the structured content
                        if markdown_texts:
                            # Parse HTML content to identify tables and text
                            import re

                            # Check if content contains tables
                            has_table = '<table' in markdown_texts.lower()

                            element = {
                                'element_id': len(layout_elements),
                                'type': 'table' if has_table else 'text',
                                'content': markdown_texts,
                                'page': page_idx,
                                'bbox': [],  # PP-StructureV3 doesn't provide individual bbox in this format
                            }
                            layout_elements.append(element)

                        # Add image metadata
                        for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
                            images_metadata.append({
                                'element_id': len(layout_elements) + img_idx,
                                'image_path': img_path,
                                'type': 'image',
                                'page': page_idx,
                                'bbox': [],
                            })

            if layout_elements:
                layout_data = {
                    'elements': layout_elements,
                    'total_elements': len(layout_elements),
                    'reading_order': list(range(len(layout_elements))),
                }
                logger.info(f"Detected {len(layout_elements)} layout elements")
                return layout_data, images_metadata
            else:
                logger.warning("No layout elements detected")
                return None, []

        except Exception as e:
            import traceback
            error_trace = traceback.format_exc()
            logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
            return None, []

    def generate_markdown(
        self,
        text_regions: List[Dict],
        layout_data: Optional[Dict] = None
    ) -> str:
        """
        Generate Markdown from OCR results

        Args:
            text_regions: List of text regions with bbox and text
            layout_data: Optional layout structure information

        Returns:
            Markdown formatted string
        """
        markdown_lines = []

        if layout_data and layout_data.get('elements'):
            # Generate structured Markdown based on layout
            for element in layout_data['elements']:
                element_type = element.get('type', 'text')
                content = element.get('content', '')

                if element_type == 'title':
                    markdown_lines.append(f"# {content}\n")
                elif element_type == 'table':
                    # Table in HTML format
                    markdown_lines.append(content)
                    markdown_lines.append("")
                elif element_type == 'figure':
                    element_id = element.get('element_id')
                    markdown_lines.append(f"\n")
                else:
                    markdown_lines.append(f"{content}\n")

        else:
            # Simple Markdown from text regions only
            # Sort by vertical position (top to bottom)
            def get_y_coord(region):
                """Safely extract Y coordinate from bbox"""
                bbox = region.get('bbox', [])
                if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
                    if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
                        return bbox[0][1]  # [[x1,y1], [x2,y2], ...] format
                    elif len(bbox) > 1:
                        return bbox[1]  # [x1, y1, x2, y2, ...] format
                return 0  # Default to 0 if can't extract

            sorted_regions = sorted(text_regions, key=get_y_coord)

            for region in sorted_regions:
                text = region['text']
                markdown_lines.append(text)

        return "\n".join(markdown_lines)

    def save_results(
        self,
        result: Dict,
        output_dir: Path,
        file_id: str
    ) -> Tuple[Optional[Path], Optional[Path]]:
        """
        Save OCR results to JSON and Markdown files

        Args:
            result: OCR result dictionary
            output_dir: Output directory
            file_id: Unique file identifier

        Returns:
            Tuple of (json_path, markdown_path)
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save JSON
            json_path = output_dir / f"{file_id}_result.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            # Save Markdown
            markdown_path = output_dir / f"{file_id}_output.md"
            markdown_content = result.get('markdown_content', '')
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
            return json_path, markdown_path

        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")
            return None, None
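# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the file names, directories, and
# sample text region below are hypothetical, not part of this service):
#
#   from pathlib import Path
#   from app.services.ocr_service import OCRService
#
#   service = OCRService()
#   layout_data, images_metadata = service.analyze_layout(Path("page_001.png"))
#   markdown_text = service.generate_markdown(
#       text_regions=[{"text": "Hello OCR", "bbox": [[10, 20], [200, 20], [200, 40], [10, 40]]}],
#       layout_data=layout_data,
#   )
#   json_path, md_path = service.save_results(
#       {"markdown_content": markdown_text}, Path("./ocr_output"), file_id="demo-001"
#   )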
210
backend/app/services/office_converter.py
Normal file
@@ -0,0 +1,210 @@
"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""

import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil

logger = logging.getLogger(__name__)


class OfficeConverterError(Exception):
    """Exception raised for Office conversion errors"""
    pass


class OfficeConverter:
    """Convert Office documents to PDF for OCR processing"""

    # Supported Office formats
    OFFICE_FORMATS = {
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }

    def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
        """
        Initialize Office converter

        Args:
            libreoffice_path: Path to LibreOffice executable
        """
        self.libreoffice_path = libreoffice_path
        self._verify_libreoffice()

    def _verify_libreoffice(self):
        """Verify LibreOffice is installed and accessible"""
        if not Path(self.libreoffice_path).exists():
            # Try alternative path for Homebrew installation
            alt_path = shutil.which("soffice")
            if alt_path:
                self.libreoffice_path = alt_path
                logger.info(f"Using LibreOffice at: {alt_path}")
            else:
                raise OfficeConverterError(
                    "LibreOffice not found. Please install LibreOffice: brew install libreoffice"
                )

    def is_office_document(self, file_path: Path) -> bool:
        """
        Check if file is an Office document

        Args:
            file_path: Path to file

        Returns:
            True if file is an Office document
        """
        return file_path.suffix.lower() in self.OFFICE_FORMATS

    def convert_to_pdf(self, office_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert Office document to PDF

        Args:
            office_path: Path to Office document
            output_dir: Optional output directory (defaults to the source file's directory)

        Returns:
            Path to converted PDF file

        Raises:
            OfficeConverterError: If conversion fails
        """
        if not office_path.exists():
            raise OfficeConverterError(f"Office file not found: {office_path}")

        if not self.is_office_document(office_path):
            raise OfficeConverterError(
                f"Unsupported format: {office_path.suffix}. "
                f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
            )

        # Determine output directory
        if output_dir is None:
            output_dir = office_path.parent
        else:
            output_dir.mkdir(parents=True, exist_ok=True)

        # Expected output PDF path
        pdf_filename = office_path.stem + '.pdf'
        output_pdf_path = output_dir / pdf_filename

        # Remove existing PDF if present
        if output_pdf_path.exists():
            output_pdf_path.unlink()

        logger.info(f"Converting {office_path.name} to PDF using LibreOffice")

        try:
            # Use LibreOffice headless mode for conversion
            # --headless: Run without GUI
            # --convert-to pdf: Convert to PDF format
            # --outdir: Output directory
            cmd = [
                self.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', str(output_dir),
                str(office_path)
            ]

            logger.debug(f"Running command: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=60  # 60 second timeout
            )

            if result.returncode != 0:
                error_msg = result.stderr or result.stdout
                raise OfficeConverterError(
                    f"LibreOffice conversion failed: {error_msg}"
                )

            # Verify PDF was created
            if not output_pdf_path.exists():
                raise OfficeConverterError(
                    f"PDF file not created at expected location: {output_pdf_path}"
                )

            logger.info(f"Successfully converted to PDF: {output_pdf_path.name}")
            return output_pdf_path

        except subprocess.TimeoutExpired:
            raise OfficeConverterError(
                f"Conversion timeout (60s) for file: {office_path.name}"
            )
        except Exception as e:
            if isinstance(e, OfficeConverterError):
                raise
            raise OfficeConverterError(f"Conversion error: {str(e)}")

    def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert DOCX to PDF

        Args:
            docx_path: Path to DOCX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if docx_path.suffix.lower() != '.docx':
            raise OfficeConverterError(f"Expected .docx file, got: {docx_path.suffix}")
        return self.convert_to_pdf(docx_path, output_dir)

    def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy DOC to PDF

        Args:
            doc_path: Path to DOC file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if doc_path.suffix.lower() != '.doc':
            raise OfficeConverterError(f"Expected .doc file, got: {doc_path.suffix}")
        return self.convert_to_pdf(doc_path, output_dir)

    def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert PPTX to PDF

        Args:
            pptx_path: Path to PPTX file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if pptx_path.suffix.lower() != '.pptx':
            raise OfficeConverterError(f"Expected .pptx file, got: {pptx_path.suffix}")
        return self.convert_to_pdf(pptx_path, output_dir)

    def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
        """
        Convert legacy PPT to PDF

        Args:
            ppt_path: Path to PPT file
            output_dir: Optional output directory

        Returns:
            Path to converted PDF
        """
        if ppt_path.suffix.lower() != '.ppt':
            raise OfficeConverterError(f"Expected .ppt file, got: {ppt_path.suffix}")
        return self.convert_to_pdf(ppt_path, output_dir)
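# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; requires a local LibreOffice
# installation, and the document paths shown are hypothetical):
#
#   from pathlib import Path
#
#   converter = OfficeConverter()  # falls back to `soffice` on PATH if the default path is missing
#   pdf_path = converter.convert_docx_to_pdf(Path("quarterly_report.docx"), Path("converted"))
#   print(pdf_path)  # e.g. converted/quarterly_report.pdf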
507
backend/app/services/pdf_generator.py
Normal file
@@ -0,0 +1,507 @@
"""
Tool_OCR - PDF Generator Service
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
"""

import logging
import subprocess
from pathlib import Path
from typing import Optional, Dict
from datetime import datetime

from weasyprint import HTML, CSS
from markdown import markdown

from app.core.config import settings


logger = logging.getLogger(__name__)


class PDFGenerationError(Exception):
    """Exception raised when PDF generation fails"""
    pass


class PDFGenerator:
    """
    PDF generation service with layout preservation

    Supports two generation methods:
    1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command
    2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion
    """

    # Default CSS template for layout preservation
    DEFAULT_CSS = """
    @page {
        size: A4;
        margin: 2cm;
    }

    body {
        font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
        font-size: 11pt;
        line-height: 1.6;
        color: #333;
    }

    h1 {
        font-size: 24pt;
        font-weight: bold;
        margin-top: 0;
        margin-bottom: 12pt;
        color: #000;
        page-break-after: avoid;
    }

    h2 {
        font-size: 18pt;
        font-weight: bold;
        margin-top: 18pt;
        margin-bottom: 10pt;
        color: #000;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        font-weight: bold;
        margin-top: 14pt;
        margin-bottom: 8pt;
        color: #000;
        page-break-after: avoid;
    }

    p {
        margin: 0 0 10pt 0;
        text-align: justify;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 12pt 0;
        page-break-inside: avoid;
    }

    table th {
        background-color: #f0f0f0;
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ccc;
        padding: 8pt;
        text-align: left;
    }

    code {
        font-family: "Courier New", monospace;
        font-size: 10pt;
        background-color: #f5f5f5;
        padding: 2pt 4pt;
        border-radius: 3px;
    }

    pre {
        background-color: #f5f5f5;
        border: 1px solid #ddd;
        border-radius: 5px;
        padding: 10pt;
        overflow-x: auto;
        page-break-inside: avoid;
    }

    pre code {
        background-color: transparent;
        padding: 0;
    }

    img {
        max-width: 100%;
        height: auto;
        display: block;
        margin: 12pt auto;
        page-break-inside: avoid;
    }

    blockquote {
        border-left: 4px solid #ddd;
        padding-left: 12pt;
        margin: 12pt 0;
        color: #666;
        font-style: italic;
    }

    ul, ol {
        margin: 10pt 0;
        padding-left: 20pt;
    }

    li {
        margin: 5pt 0;
    }

    hr {
        border: none;
        border-top: 1px solid #ccc;
        margin: 20pt 0;
    }

    .page-break {
        page-break-after: always;
    }
    """

    # Academic paper template
    ACADEMIC_CSS = """
    @page {
        size: A4;
        margin: 2.5cm;
    }

    body {
        font-family: "Times New Roman", "Noto Serif CJK SC", serif;
        font-size: 12pt;
        line-height: 1.8;
        color: #000;
    }

    h1 {
        font-size: 20pt;
        text-align: center;
        margin-bottom: 24pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    h3 {
        font-size: 14pt;
        margin-top: 16pt;
        margin-bottom: 10pt;
        page-break-after: avoid;
    }

    p {
        text-indent: 2em;
        text-align: justify;
        margin: 0 0 12pt 0;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt auto;
        page-break-inside: avoid;
    }

    table caption {
        font-weight: bold;
        margin-bottom: 8pt;
    }
    """

    # Business report template
    BUSINESS_CSS = """
    @page {
        size: A4;
        margin: 2cm 2.5cm;
    }

    body {
        font-family: "Arial", "Noto Sans CJK SC", sans-serif;
        font-size: 11pt;
        line-height: 1.5;
        color: #333;
    }

    h1 {
        font-size: 22pt;
        color: #0066cc;
        border-bottom: 3px solid #0066cc;
        padding-bottom: 8pt;
        margin-bottom: 20pt;
        page-break-after: avoid;
    }

    h2 {
        font-size: 16pt;
        color: #0066cc;
        margin-top: 20pt;
        margin-bottom: 12pt;
        page-break-after: avoid;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        margin: 16pt 0;
    }

    table th {
        background-color: #0066cc;
        color: white;
        padding: 10pt;
        font-weight: bold;
    }

    table td {
        border: 1px solid #ddd;
        padding: 10pt;
    }

    table tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    """

    def __init__(self):
        """Initialize PDF generator"""
        self.css_templates = {
            "default": self.DEFAULT_CSS,
            "academic": self.ACADEMIC_CSS,
            "business": self.BUSINESS_CSS,
        }

    def check_pandoc_available(self) -> bool:
        """
        Check if Pandoc is installed and available

        Returns:
            bool: True if pandoc is available, False otherwise
        """
        try:
            result = subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.warning("Pandoc not found or timed out")
            return False

    def generate_pdf_pandoc(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using Pandoc (preferred method)

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        # Initialized up front so the cleanup in the except blocks is safe even
        # if the failure happens before the temporary CSS file is written
        css_file: Optional[Path] = None
        try:
            # Create temporary CSS file
            css_content = self.css_templates.get(css_template, css_template)
            css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css"
            css_file.write_text(css_content, encoding="utf-8")

            # Build pandoc command
            pandoc_cmd = [
                "pandoc",
                str(markdown_path),
                "-o", str(output_path),
                "--pdf-engine=weasyprint",
                "--css", str(css_file),
                "--standalone",
                "--from=markdown+tables+fenced_code_blocks+footnotes",
            ]

            # Add metadata if provided
            if metadata:
                if metadata.get("title"):
                    pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"])
                if metadata.get("author"):
                    pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"])
                if metadata.get("date"):
                    pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"])

            # Execute pandoc
            logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}")
            result = subprocess.run(
                pandoc_cmd,
                capture_output=True,
                text=True,
                timeout=60  # 60 second timeout for large documents
            )

            # Clean up temporary CSS file
            css_file.unlink(missing_ok=True)

            if result.returncode != 0:
                error_msg = f"Pandoc failed: {result.stderr}"
                logger.error(error_msg)
                raise PDFGenerationError(error_msg)

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info(f"PDF generated successfully via Pandoc: {output_path}")
            return output_path

        except subprocess.TimeoutExpired:
            if css_file is not None:
                css_file.unlink(missing_ok=True)
            raise PDFGenerationError("Pandoc execution timed out")
        except Exception as e:
            if css_file is not None:
                css_file.unlink(missing_ok=True)
            raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}")

    def generate_pdf_weasyprint(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None
    ) -> Path:
        """
        Generate PDF using WeasyPrint directly (fallback method)

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name or custom CSS string
            metadata: Optional metadata dict (title, author, date)

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If PDF generation fails
        """
        try:
            # Read Markdown content
            markdown_content = markdown_path.read_text(encoding="utf-8")

            # Convert Markdown to HTML
            html_content = markdown(
                markdown_content,
                extensions=[
                    'tables',
                    'fenced_code',
                    'codehilite',
                    'nl2br',
                    'sane_lists',
                ]
            )

            # Wrap HTML with proper structure
            title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem
            full_html = f"""
            <!DOCTYPE html>
            <html lang="zh-CN">
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
            </head>
            <body>
                {html_content}
            </body>
            </html>
            """

            # Get CSS content
            css_content = self.css_templates.get(css_template, css_template)

            # Generate PDF
            logger.info(f"Generating PDF via WeasyPrint: {output_path}")
            html = HTML(string=full_html, base_url=str(markdown_path.parent))
            css = CSS(string=css_content)
            html.write_pdf(str(output_path), stylesheets=[css])

            if not output_path.exists():
                raise PDFGenerationError(f"PDF file not created: {output_path}")

            logger.info(f"PDF generated successfully via WeasyPrint: {output_path}")
            return output_path

        except Exception as e:
            raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(e)}")

    def generate_pdf(
        self,
        markdown_path: Path,
        output_path: Path,
        css_template: str = "default",
        metadata: Optional[Dict] = None,
        prefer_pandoc: bool = True
    ) -> Path:
        """
        Generate PDF from Markdown with automatic fallback

        Args:
            markdown_path: Path to input Markdown file
            output_path: Path to output PDF file
            css_template: CSS template name ("default", "academic", "business") or custom CSS
            metadata: Optional metadata dict (title, author, date)
            prefer_pandoc: Use Pandoc if available, fall back to WeasyPrint otherwise

        Returns:
            Path: Path to generated PDF file

        Raises:
            PDFGenerationError: If both methods fail
        """
        if not markdown_path.exists():
            raise PDFGenerationError(f"Markdown file not found: {markdown_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Try Pandoc first if preferred and available
        if prefer_pandoc and self.check_pandoc_available():
            try:
                return self.generate_pdf_pandoc(markdown_path, output_path, css_template, metadata)
            except PDFGenerationError as e:
                logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}")
                # Fall through to WeasyPrint

        # Use WeasyPrint (fallback or direct)
        return self.generate_pdf_weasyprint(markdown_path, output_path, css_template, metadata)

    def get_available_templates(self) -> Dict[str, str]:
        """
        Get list of available CSS templates

        Returns:
            Dict mapping template names to descriptions
        """
        return {
            "default": "通用排版模板,適合大多數文檔",
            "academic": "學術論文模板,適合研究報告",
            "business": "商業報告模板,適合企業文檔",
        }

    def save_custom_template(self, template_name: str, css_content: str) -> None:
        """
        Save a custom CSS template

        Args:
            template_name: Template name
            css_content: CSS content
        """
        self.css_templates[template_name] = css_content
        logger.info(f"Custom CSS template saved: {template_name}")
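# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the Markdown/PDF paths and metadata
# values are hypothetical). generate_pdf() tries Pandoc first when it is on
# PATH and otherwise falls back to the pure-Python WeasyPrint path:
#
#   from pathlib import Path
#
#   generator = PDFGenerator()
#   pdf_path = generator.generate_pdf(
#       markdown_path=Path("ocr_output/demo-001_output.md"),
#       output_path=Path("ocr_output/demo-001.pdf"),
#       css_template="academic",
#       metadata={"title": "OCR Result", "author": "Tool_OCR", "date": "2024-01-01"},
#   )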
230
backend/app/services/preprocessor.py
Normal file
@@ -0,0 +1,230 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""

import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np

from app.core.config import settings

logger = logging.getLogger(__name__)


class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization
    Validates and prepares documents for OCR processing
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    def __init__(self):
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message)
        """
        try:
            # Check file exists
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            # Check file size
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect file format using magic numbers
            mime = magic.Magic(mime=True)
            mime_type = mime.from_file(str(file_path))

            # Map MIME type to format
            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            # Check if format is in allowed extensions
            if file_format not in self.allowed_extensions:
                return False, None, f"File format '{file_format}' not allowed"

            # Validate file integrity
            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info(f"File validated successfully: {file_path.name} ({file_format})")
            return True, file_format, None

        except Exception as e:
            logger.error(f"File validation error: {str(e)}")
            return False, None, f"Validation error: {str(e)}"

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """Convert MIME type to file format"""
        mime_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/bmp': 'bmp',
            'image/tiff': 'tiff',
            'image/x-tiff': 'tiff',
            'application/pdf': 'pdf',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        }
        return mime_map.get(mime_type)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                # Try to open image
                with Image.open(file_path) as img:
                    img.verify()  # Verify image integrity
                # Reopen for actual check (verify() closes the file)
                with Image.open(file_path) as img:
                    _ = img.size  # Force load to detect corruption
                return True, None

            elif file_format == 'pdf':
                # Basic PDF validation - check file starts with PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                    if header != b'%PDF-':
                        return False, "Invalid PDF header"
                return True, None

            elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Office documents - basic validation (check file size and can be opened)
                # Modern Office formats (docx, pptx) are ZIP-based
                if file_format in ['docx', 'pptx']:
                    import zipfile
                    try:
                        with zipfile.ZipFile(file_path, 'r') as zf:
                            # Check if it has the required Office structure
                            if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
                                return False, "Invalid DOCX structure"
                            elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
                                return False, "Invalid PPTX structure"
                    except zipfile.BadZipFile:
                        return False, "Invalid Office file (corrupt ZIP)"
                # Old formats (doc, ppt) - just check file exists and has content
                return True, None

            else:
                return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement
            output_path: Optional output path (defaults to the processed files directory)

        Returns:
            Tuple of (success, processed_image_path, error_message)
        """
        try:
            # Read image
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding to handle varying lighting
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            # Determine output path
            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"

            # Save processed image
            cv2.imwrite(str(output_path), processed)

            logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
            return True, output_path, None

        except Exception as e:
            logger.error(f"Image preprocessing error: {str(e)}")
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information

        Args:
            file_path: Path to file

        Returns:
            Dictionary with file information
        """
        stat = file_path.stat()
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(str(file_path))

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
        }
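# --------------------------------------------------------------------------
# Example usage (illustrative sketch only; the upload paths are hypothetical):
#
#   from pathlib import Path
#
#   preprocessor = DocumentPreprocessor()
#   is_valid, file_format, error = preprocessor.validate_file(Path("uploads/scan_001.png"))
#   if is_valid:
#       ok, processed_path, err = preprocessor.preprocess_image(
#           Path("uploads/scan_001.png"), enhance=True
#       )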
282
backend/app/services/translation_service.py
Normal file
@@ -0,0 +1,282 @@
"""
Tool_OCR - Translation Service (RESERVED)
Abstract interface and stub implementation for future translation feature
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, List
from enum import Enum
import logging


logger = logging.getLogger(__name__)


class TranslationEngine(str, Enum):
    """Supported translation engines"""
    OFFLINE = "offline"  # Argos Translate (offline)
    ERNIE = "ernie"      # Baidu ERNIE API
    GOOGLE = "google"    # Google Translate API
    DEEPL = "deepl"      # DeepL API


class LanguageCode(str, Enum):
    """Supported language codes"""
    CHINESE = "zh"
    ENGLISH = "en"
    JAPANESE = "ja"
    KOREAN = "ko"
    FRENCH = "fr"
    GERMAN = "de"
    SPANISH = "es"


class TranslationServiceInterface(ABC):
    """
    Abstract interface for translation services

    This interface defines the contract for all translation engine implementations.
    Future implementations should inherit from this class.
    """

    @abstractmethod
    def translate_text(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
        **kwargs
    ) -> str:
        """
        Translate a single text string

        Args:
            text: Text to translate
            source_lang: Source language code
            target_lang: Target language code
            **kwargs: Engine-specific parameters

        Returns:
            str: Translated text
        """
        pass

    @abstractmethod
    def translate_document(
        self,
        markdown_content: str,
        source_lang: str,
        target_lang: str,
        preserve_structure: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Translate a Markdown document while preserving structure

        Args:
            markdown_content: Markdown content to translate
            source_lang: Source language code
            target_lang: Target language code
            preserve_structure: Whether to preserve markdown structure
            **kwargs: Engine-specific parameters

        Returns:
            Dict containing:
                - translated_content: Translated markdown
                - metadata: Translation metadata (engine, time, etc.)
        """
        pass

    @abstractmethod
    def batch_translate(
        self,
        texts: List[str],
        source_lang: str,
        target_lang: str,
        **kwargs
    ) -> List[str]:
        """
        Translate multiple texts in batch

        Args:
            texts: List of texts to translate
            source_lang: Source language code
            target_lang: Target language code
            **kwargs: Engine-specific parameters

        Returns:
            List[str]: List of translated texts
        """
        pass

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported language codes for this engine

        Returns:
            List[str]: List of supported language codes
        """
        pass

    @abstractmethod
    def validate_config(self) -> bool:
        """
        Validate engine configuration (API keys, model files, etc.)

        Returns:
            bool: True if configuration is valid
        """
        pass


class TranslationEngineFactory:
    """
    Factory for creating translation engine instances

    RESERVED: This is a placeholder for future implementation.
    When the translation feature is implemented, this factory will instantiate
    the appropriate translation engine based on configuration.
    """

    @staticmethod
    def create_engine(
        engine_type: TranslationEngine,
        config: Optional[Dict] = None
    ) -> TranslationServiceInterface:
        """
        Create a translation engine instance

        Args:
            engine_type: Type of translation engine
            config: Engine-specific configuration

        Returns:
            TranslationServiceInterface: Translation engine instance

        Raises:
            NotImplementedError: Always raised (stub implementation)
        """
        raise NotImplementedError(
            "Translation feature is not yet implemented. "
            "This is a reserved placeholder for future development."
        )

    @staticmethod
    def get_available_engines() -> List[str]:
        """
        Get list of available translation engines

        Returns:
            List[str]: List of engine types (currently empty)
        """
        return []

    @staticmethod
    def is_engine_available(engine_type: TranslationEngine) -> bool:
        """
        Check if a specific engine is available

        Args:
            engine_type: Engine type to check

        Returns:
            bool: Always False (stub implementation)
        """
        return False


class StubTranslationService:
    """
    Stub translation service for API endpoints

    This service provides placeholder responses for translation endpoints
    until the feature is fully implemented.
    """

    @staticmethod
    def get_feature_status() -> Dict[str, Any]:
        """
        Get translation feature status

        Returns:
            Dict with feature status information
        """
        return {
            "available": False,
            "status": "reserved",
            "message": "Translation feature is reserved for future implementation",
            "supported_engines": [],
            "planned_engines": [
                {
                    "type": "offline",
                    "name": "Argos Translate",
                    "description": "Offline neural translation",
                    "status": "planned"
                },
                {
                    "type": "ernie",
                    "name": "Baidu ERNIE",
                    "description": "Baidu AI translation API",
                    "status": "planned"
                },
                {
                    "type": "google",
                    "name": "Google Translate",
                    "description": "Google Cloud Translation API",
                    "status": "planned"
                },
                {
                    "type": "deepl",
                    "name": "DeepL",
                    "description": "DeepL translation API",
                    "status": "planned"
                }
            ],
            "roadmap": {
                "phase": "Phase 5",
                "priority": "low",
                "implementation_after": "Production deployment and user feedback"
            }
        }

    @staticmethod
    def get_supported_languages() -> List[Dict[str, str]]:
        """
        Get list of languages planned for translation support

        Returns:
            List of language info dicts
        """
        return [
            {"code": "zh", "name": "Chinese (Simplified)", "status": "planned"},
            {"code": "en", "name": "English", "status": "planned"},
            {"code": "ja", "name": "Japanese", "status": "planned"},
            {"code": "ko", "name": "Korean", "status": "planned"},
            {"code": "fr", "name": "French", "status": "planned"},
            {"code": "de", "name": "German", "status": "planned"},
            {"code": "es", "name": "Spanish", "status": "planned"},
        ]


# Example placeholder for future engine implementations:
#
# class ArgosTranslationEngine(TranslationServiceInterface):
#     """Offline translation using Argos Translate"""
#     def __init__(self, model_path: str):
#         self.model_path = model_path
#         # Initialize Argos models
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
#
# class ERNIETranslationEngine(TranslationServiceInterface):
#     """Baidu ERNIE API translation"""
#     def __init__(self, api_key: str, api_secret: str):
#         self.api_key = api_key
#         self.api_secret = api_secret
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
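# --------------------------------------------------------------------------
# Example usage of the stub API (illustrative only; every call currently
# reports the feature as unavailable):
#
#   status = StubTranslationService.get_feature_status()          # {"available": False, ...}
#   languages = StubTranslationService.get_supported_languages()  # planned language list
#   TranslationEngineFactory.is_engine_available(TranslationEngine.OFFLINE)  # False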