beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions


@@ -0,0 +1,512 @@
"""
Tool_OCR - Export Service
Handles OCR result export in multiple formats with filtering and formatting rules
"""
import json
import logging
import shutil
import zipfile
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime, timezone
import pandas as pd
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
logger = logging.getLogger(__name__)
class ExportError(Exception):
"""Exception raised for export errors"""
pass
class ExportService:
"""
Export service for OCR results
Supported formats:
- TXT: Plain text export
- JSON: Full metadata export
- Excel: Tabular data export
- Markdown: Direct Markdown export
- PDF: Layout-preserved PDF export
- ZIP: Batch export archive
"""
def __init__(self):
"""Initialize export service"""
self.pdf_generator = PDFGenerator()
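# Illustrative usage sketch (not part of the original commit): a caller is
# assumed to have loaded OCRResult rows elsewhere; `results` and `out_dir`
# are hypothetical names.
#
#   service = ExportService()
#   service.export_to_txt(results, out_dir / "batch.txt")
#   service.export_to_json(results, out_dir / "batch.json")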
def apply_filters(
self,
results: List[OCRResult],
filters: Dict[str, Any]
) -> List[OCRResult]:
"""
Apply filters to OCR results
Args:
results: List of OCR results
filters: Filter configuration
- confidence_threshold: Minimum confidence (0.0-1.0)
- filename_pattern: Case-insensitive substring to match against filenames
- language: Filter by detected language
Returns:
List[OCRResult]: Filtered results
"""
filtered = results
# Confidence threshold filter
if "confidence_threshold" in filters:
threshold = filters["confidence_threshold"]
filtered = [r for r in filtered if r.average_confidence is not None and r.average_confidence >= threshold]
# Filename pattern filter (using simple substring match)
if "filename_pattern" in filters:
pattern = filters["filename_pattern"].lower()
filtered = [
r for r in filtered
if pattern in r.file.original_filename.lower()
]
# Language filter
if "language" in filters:
lang = filters["language"]
filtered = [r for r in filtered if r.detected_language == lang]
return filtered
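# Example (illustrative sketch): keep only results with at least 80% average
# confidence whose filenames contain "invoice".  `results` is an assumed,
# pre-loaded list of OCRResult rows.
#
#   service = ExportService()
#   filtered = service.apply_filters(
#       results,
#       {"confidence_threshold": 0.8, "filename_pattern": "invoice"},
#   )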
def export_to_txt(
self,
results: List[OCRResult],
output_path: Path,
formatting: Optional[Dict] = None
) -> Path:
"""
Export results to plain text file
Args:
results: List of OCR results
output_path: Output file path
formatting: Formatting options
- add_line_numbers: Add line numbers
- group_by_filename: Insert a separator line between files
- include_metadata: Add file metadata headers
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
formatting = formatting or {}
output_lines = []
for idx, result in enumerate(results, 1):
# Read Markdown file
if not result.markdown_path or not Path(result.markdown_path).exists():
logger.warning(f"Markdown file not found for result {result.id}")
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add metadata header if requested
if formatting.get("include_metadata", False):
output_lines.append(f"=" * 80)
output_lines.append(f"文件: {result.file.original_filename}")
output_lines.append(f"語言: {result.detected_language or '未知'}")
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
output_lines.append(f"=" * 80)
output_lines.append("")
# Add content with optional line numbers
if formatting.get("add_line_numbers", False):
for line_num, line in enumerate(markdown_content.split('\n'), 1):
output_lines.append(f"{line_num:4d} | {line}")
else:
output_lines.append(markdown_content)
# Add separator between files if grouping
if formatting.get("group_by_filename", False) and idx < len(results):
output_lines.append("\n" + "-" * 80 + "\n")
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(output_lines), encoding="utf-8")
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"TXT export failed: {e}") from e
def export_to_json(
self,
results: List[OCRResult],
output_path: Path,
include_layout: bool = True,
include_images: bool = True
) -> Path:
"""
Export results to JSON file with full metadata
Args:
results: List of OCR results
output_path: Output file path
include_layout: Include layout data
include_images: Include image metadata
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
export_data = {
"export_time": datetime.utcnow().isoformat(),
"total_files": len(results),
"results": []
}
for result in results:
result_data = {
"file_id": result.file.id,
"filename": result.file.original_filename,
"file_format": result.file.file_format,
"file_size": result.file.file_size,
"processing_time": result.file.processing_time,
"detected_language": result.detected_language,
"total_text_regions": result.total_text_regions,
"average_confidence": result.average_confidence,
"markdown_path": result.markdown_path,
}
# Include layout data if requested
if include_layout and result.layout_data:
result_data["layout_data"] = result.layout_data
# Include images metadata if requested
if include_images and result.images_metadata:
result_data["images_metadata"] = result.images_metadata
export_data["results"].append(result_data)
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(export_data, ensure_ascii=False, indent=2),
encoding="utf-8"
)
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"JSON export failed: {e}") from e
def export_to_excel(
self,
results: List[OCRResult],
output_path: Path,
include_confidence: bool = True,
include_processing_time: bool = True
) -> Path:
"""
Export results to Excel file
Args:
results: List of OCR results
output_path: Output file path
include_confidence: Include confidence scores
include_processing_time: Include processing time
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
rows = []
for result in results:
# Read Markdown content
text_content = ""
if result.markdown_path and Path(result.markdown_path).exists():
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
row = {
"文件名": result.file.original_filename,
"格式": result.file.file_format,
"大小(字節)": result.file.file_size,
"語言": result.detected_language or "未知",
"文本區域數": result.total_text_regions,
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
}
if include_confidence:
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
if include_processing_time:
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
rows.append(row)
# Create DataFrame and export
df = pd.DataFrame(rows)
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Excel export failed: {e}") from e
def export_to_markdown(
self,
results: List[OCRResult],
output_path: Path,
combine: bool = True
) -> Path:
"""
Export results to Markdown file(s)
Args:
results: List of OCR results
output_path: Output file path (or directory if not combining)
combine: Combine all results into one file
Returns:
Path: Output file/directory path
Raises:
ExportError: If export fails
"""
try:
if combine:
# Combine all Markdown files into one
combined_content = []
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add header
combined_content.append(f"# {result.file.original_filename}\n")
combined_content.append(markdown_content)
combined_content.append("\n---\n") # Separator
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(combined_content), encoding="utf-8")
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
return output_path
else:
# Export each result to separate file
output_path.mkdir(parents=True, exist_ok=True)
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
# Copy Markdown file to output directory
src_path = Path(result.markdown_path)
dst_path = output_path / f"{result.file.original_filename}.md"
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Markdown export failed: {e}") from e
def export_to_pdf(
self,
result: OCRResult,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Export single result to PDF with layout preservation
Args:
result: OCR result
output_path: Output PDF path
css_template: CSS template name or custom CSS
metadata: Optional PDF metadata
Returns:
Path: Output PDF path
Raises:
ExportError: If export fails
"""
try:
if not result.markdown_path or not Path(result.markdown_path).exists():
raise ExportError(f"Markdown file not found for result {result.id}")
markdown_path = Path(result.markdown_path)
# Prepare metadata
pdf_metadata = metadata or {}
if "title" not in pdf_metadata:
pdf_metadata["title"] = result.file.original_filename
# Generate PDF
self.pdf_generator.generate_pdf(
markdown_path=markdown_path,
output_path=output_path,
css_template=css_template,
metadata=pdf_metadata
)
logger.info(f"Exported result {result.id} to PDF: {output_path}")
return output_path
except ExportError:
raise
except PDFGenerationError as e:
raise ExportError(f"PDF generation failed: {e}") from e
except Exception as e:
raise ExportError(f"PDF export failed: {e}") from e
def export_batch_to_zip(
self,
db: Session,
batch_id: int,
output_path: Path,
include_formats: Optional[List[str]] = None
) -> Path:
"""
Export entire batch to ZIP archive
Args:
db: Database session
batch_id: Batch ID
output_path: Output ZIP path
include_formats: Formats to include: markdown, json, txt, excel (defaults to markdown and json)
Returns:
Path: Output ZIP path
Raises:
ExportError: If export fails
"""
try:
include_formats = include_formats or ["markdown", "json"]
# Get batch and results
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
raise ExportError(f"Batch {batch_id} not found")
results = db.query(OCRResult).join(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.COMPLETED
).all()
if not results:
raise ExportError(f"No completed results found for batch {batch_id}")
# Create temporary export directory
temp_dir = output_path.parent / f"temp_export_{batch_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
try:
# Export in requested formats
if "markdown" in include_formats:
md_dir = temp_dir / "markdown"
self.export_to_markdown(results, md_dir, combine=False)
if "json" in include_formats:
json_path = temp_dir / "batch_results.json"
self.export_to_json(results, json_path)
if "txt" in include_formats:
txt_path = temp_dir / "batch_results.txt"
self.export_to_txt(results, txt_path)
if "excel" in include_formats:
excel_path = temp_dir / "batch_results.xlsx"
self.export_to_excel(results, excel_path)
# Create ZIP archive
output_path.parent.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in temp_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(temp_dir)
zipf.write(file_path, arcname)
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
return output_path
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
except ExportError:
raise
except Exception as e:
raise ExportError(f"Batch ZIP export failed: {e}") from e
def apply_export_rule(
self,
db: Session,
results: List[OCRResult],
rule_id: int
) -> List[OCRResult]:
"""
Apply export rule to filter and format results
Args:
db: Database session
results: List of OCR results
rule_id: Export rule ID
Returns:
List[OCRResult]: Filtered results
Raises:
ExportError: If rule not found
"""
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
if not rule:
raise ExportError(f"Export rule {rule_id} not found")
config = rule.config_json
# Apply filters
if "filters" in config:
results = self.apply_filters(results, config["filters"])
# Note: Formatting options are applied in individual export methods
return results
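# Example (illustrative sketch): apply a stored rule before exporting, so the
# rule's filters decide which results are written.  Rule ID 1 is a
# hypothetical row in the export rules table.
#
#   selected = service.apply_export_rule(db, results, rule_id=1)
#   service.export_to_txt(selected, Path("exports/rule_1.txt"))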
def get_export_formats(self) -> Dict[str, str]:
"""
Get available export formats
Returns:
Dict mapping format codes to descriptions
"""
return {
"txt": "純文本格式 (.txt)",
"json": "JSON 格式 - 包含完整元數據 (.json)",
"excel": "Excel 表格格式 (.xlsx)",
"markdown": "Markdown 格式 (.md)",
"pdf": "版面保留 PDF 格式 (.pdf)",
"zip": "批次打包格式 (.zip)",
}
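# Example (illustrative sketch): the format map can drive a UI dropdown or
# validate a requested format before dispatching to an export method.
# `requested_format` is a hypothetical caller-supplied value.
#
#   formats = service.get_export_formats()
#   if requested_format not in formats:
#       raise ExportError(f"Unsupported format: {requested_format}")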