first
This commit is contained in:
512
backend/app/services/export_service.py
Normal file
512
backend/app/services/export_service.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""
|
||||
Tool_OCR - Export Service
|
||||
Handles OCR result export in multiple formats with filtering and formatting rules
|
||||
"""
|
||||
|
||||
import json
import logging
import shutil
import zipfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
from sqlalchemy.orm import Session

from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
|
||||
|
||||
|
||||
# Module-level logger, namespaced to this module per logging convention.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExportError(Exception):
    """Raised when an OCR result export operation fails.

    Wraps lower-level failures (I/O, PDF generation, missing database
    records) so callers can catch a single exception type for any
    export problem.
    """
    # NOTE: the redundant `pass` after the docstring was removed; the
    # docstring alone is a valid class body.
|
||||
|
||||
|
||||
class ExportService:
    """Export OCR results in multiple formats.

    Supported formats:
      - TXT: plain text export
      - JSON: full metadata export
      - Excel: tabular data export
      - Markdown: direct Markdown export
      - PDF: layout-preserved PDF export
      - ZIP: batch export archive
    """

    def __init__(self):
        """Initialize the export service."""
        # PDFGenerator renders Markdown to PDF; used by export_to_pdf().
        self.pdf_generator = PDFGenerator()
|
||||
|
||||
def apply_filters(
    self,
    results: List[OCRResult],
    filters: Dict[str, Any]
) -> List[OCRResult]:
    """Narrow a list of OCR results according to a filter configuration.

    Args:
        results: OCR results to filter.
        filters: Filter configuration; recognized keys:
            - confidence_threshold: minimum average confidence (0.0-1.0);
              results without a confidence value are dropped.
            - filename_pattern: case-insensitive substring matched against
              the original filename.
            - language: exact detected-language code to keep.

    Returns:
        List[OCRResult]: The results that pass every requested filter.
    """
    selected = results

    # Drop results below the confidence floor (and those with no
    # confidence value at all, which cannot satisfy any threshold).
    if "confidence_threshold" in filters:
        minimum = filters["confidence_threshold"]
        selected = [
            item for item in selected
            if item.average_confidence and item.average_confidence >= minimum
        ]

    # Case-insensitive substring match on the original filename.
    if "filename_pattern" in filters:
        needle = filters["filename_pattern"].lower()
        selected = [
            item for item in selected
            if needle in item.file.original_filename.lower()
        ]

    # Exact match on the detected language code.
    if "language" in filters:
        wanted = filters["language"]
        selected = [item for item in selected if item.detected_language == wanted]

    return selected
|
||||
|
||||
def export_to_txt(
    self,
    results: List[OCRResult],
    output_path: Path,
    formatting: Optional[Dict] = None
) -> Path:
    """Export results to a single plain-text file.

    Args:
        results: OCR results to export.
        output_path: Destination file path (parents created as needed).
        formatting: Optional formatting flags:
            - add_line_numbers: prefix each content line with its number.
            - group_by_filename: insert a separator line between files.
            - include_metadata: add a per-file metadata header.

    Returns:
        Path: The output file path.

    Raises:
        ExportError: If the export fails for any reason.
    """
    try:
        formatting = formatting or {}
        output_lines = []

        for idx, result in enumerate(results, 1):
            # Skip results whose Markdown artifact is missing on disk;
            # they are logged but do not abort the whole export.
            if not result.markdown_path or not Path(result.markdown_path).exists():
                logger.warning(f"Markdown file not found for result {result.id}")
                continue

            markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")

            if formatting.get("include_metadata", False):
                # Plain string literals here: the previous f"=" * 80 was a
                # pointless f-string with no placeholders.
                output_lines.append("=" * 80)
                output_lines.append(f"文件: {result.file.original_filename}")
                output_lines.append(f"語言: {result.detected_language or '未知'}")
                output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
                output_lines.append("=" * 80)
                output_lines.append("")

            # Emit content with optional 4-digit line numbers.
            if formatting.get("add_line_numbers", False):
                for line_num, line in enumerate(markdown_content.split('\n'), 1):
                    output_lines.append(f"{line_num:4d} | {line}")
            else:
                output_lines.append(markdown_content)

            # Separator between files (not after the last one).
            if formatting.get("group_by_filename", False) and idx < len(results):
                output_lines.append("\n" + "-" * 80 + "\n")

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text("\n".join(output_lines), encoding="utf-8")

        # NOTE(review): the count logged includes results that were skipped
        # above because their Markdown file was missing.
        logger.info(f"Exported {len(results)} results to TXT: {output_path}")
        return output_path

    except Exception as e:
        # Chain the cause so the original traceback survives the wrap.
        raise ExportError(f"TXT export failed: {str(e)}") from e
|
||||
|
||||
def export_to_json(
    self,
    results: List[OCRResult],
    output_path: Path,
    include_layout: bool = True,
    include_images: bool = True
) -> Path:
    """Export results to a JSON file with full metadata.

    Args:
        results: OCR results to export.
        output_path: Destination file path (parents created as needed).
        include_layout: Include per-result layout data when present.
        include_images: Include per-result images metadata when present.

    Returns:
        Path: The output file path.

    Raises:
        ExportError: If the export fails for any reason.
    """
    try:
        export_data = {
            # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
            # and yields a naive value whose isoformat() carries no offset.
            "export_time": datetime.now(timezone.utc).isoformat(),
            "total_files": len(results),
            "results": []
        }

        for result in results:
            result_data = {
                "file_id": result.file.id,
                "filename": result.file.original_filename,
                "file_format": result.file.file_format,
                "file_size": result.file.file_size,
                "processing_time": result.file.processing_time,
                "detected_language": result.detected_language,
                "total_text_regions": result.total_text_regions,
                "average_confidence": result.average_confidence,
                "markdown_path": result.markdown_path,
            }

            # Optional heavyweight payloads, only attached when both
            # requested and actually present on the result.
            if include_layout and result.layout_data:
                result_data["layout_data"] = result.layout_data

            if include_images and result.images_metadata:
                result_data["images_metadata"] = result.images_metadata

            export_data["results"].append(result_data)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False keeps CJK text readable in the output file.
        output_path.write_text(
            json.dumps(export_data, ensure_ascii=False, indent=2),
            encoding="utf-8"
        )

        logger.info(f"Exported {len(results)} results to JSON: {output_path}")
        return output_path

    except Exception as e:
        # Chain the cause so the original traceback survives the wrap.
        raise ExportError(f"JSON export failed: {str(e)}") from e
|
||||
|
||||
def export_to_excel(
    self,
    results: List[OCRResult],
    output_path: Path,
    include_confidence: bool = True,
    include_processing_time: bool = True
) -> Path:
    """Export results to an Excel (.xlsx) spreadsheet, one row per file.

    Args:
        results: OCR results to export.
        output_path: Destination file path (parents created as needed).
        include_confidence: Add an average-confidence column.
        include_processing_time: Add a processing-time column.

    Returns:
        Path: The output file path.

    Raises:
        ExportError: If the export fails for any reason.
    """
    try:
        rows = []

        for result in results:
            # Pull the extracted text from the Markdown artifact; results
            # without one still get a row, with empty content.
            text_content = ""
            if result.markdown_path and Path(result.markdown_path).exists():
                text_content = Path(result.markdown_path).read_text(encoding="utf-8")

            row = {
                "文件名": result.file.original_filename,
                "格式": result.file.file_format,
                "大小(字節)": result.file.file_size,
                "語言": result.detected_language or "未知",
                "文本區域數": result.total_text_regions,
                # Cell content is capped at 1000 chars to keep the sheet usable.
                "提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
            }

            if include_confidence:
                row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"

            if include_processing_time:
                row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"

            rows.append(row)

        df = pd.DataFrame(rows)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_excel(output_path, index=False, engine='openpyxl')

        logger.info(f"Exported {len(results)} results to Excel: {output_path}")
        return output_path

    except Exception as e:
        # Chain the cause so the original traceback survives the wrap.
        raise ExportError(f"Excel export failed: {str(e)}") from e
|
||||
|
||||
def export_to_markdown(
    self,
    results: List[OCRResult],
    output_path: Path,
    combine: bool = True
) -> Path:
    """Export results as Markdown, either combined or one file per result.

    Args:
        results: OCR results to export.
        output_path: Output file path when combining, output directory
            when exporting separate files.
        combine: Merge all results into a single Markdown document.

    Returns:
        Path: The output file path (combine=True) or directory.

    Raises:
        ExportError: If the export fails for any reason.
    """
    try:
        if combine:
            combined_content = []

            for result in results:
                # Results with a missing Markdown artifact are silently skipped.
                if not result.markdown_path or not Path(result.markdown_path).exists():
                    continue

                markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")

                # H1 header identifying the source file, then its content,
                # then a horizontal-rule separator.
                combined_content.append(f"# {result.file.original_filename}\n")
                combined_content.append(markdown_content)
                combined_content.append("\n---\n")

            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text("\n".join(combined_content), encoding="utf-8")

            logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
            return output_path

        else:
            output_path.mkdir(parents=True, exist_ok=True)

            for result in results:
                if not result.markdown_path or not Path(result.markdown_path).exists():
                    continue

                src_path = Path(result.markdown_path)
                # NOTE(review): names derive from original_filename, so two
                # uploads with the same name overwrite each other here —
                # confirm whether filenames are unique per batch.
                dst_path = output_path / f"{result.file.original_filename}.md"
                dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")

            logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
            return output_path

    except Exception as e:
        # Chain the cause so the original traceback survives the wrap.
        raise ExportError(f"Markdown export failed: {str(e)}") from e
|
||||
|
||||
def export_to_pdf(
    self,
    result: OCRResult,
    output_path: Path,
    css_template: str = "default",
    metadata: Optional[Dict] = None
) -> Path:
    """Export a single result to a layout-preserving PDF.

    Args:
        result: OCR result to export.
        output_path: Destination PDF path.
        css_template: CSS template name or custom CSS passed to the
            PDF generator.
        metadata: Optional PDF metadata; a "title" defaulting to the
            original filename is added if absent.

    Returns:
        Path: The output PDF path.

    Raises:
        ExportError: If the Markdown source is missing or generation fails.
    """
    try:
        if not result.markdown_path or not Path(result.markdown_path).exists():
            raise ExportError(f"Markdown file not found for result {result.id}")

        markdown_path = Path(result.markdown_path)

        pdf_metadata = metadata or {}
        if "title" not in pdf_metadata:
            pdf_metadata["title"] = result.file.original_filename

        self.pdf_generator.generate_pdf(
            markdown_path=markdown_path,
            output_path=output_path,
            css_template=css_template,
            metadata=pdf_metadata
        )

        logger.info(f"Exported result {result.id} to PDF: {output_path}")
        return output_path

    except ExportError:
        # Re-raise our own errors untouched: previously the generic handler
        # below double-wrapped them ("PDF export failed: Markdown file not
        # found ..."), obscuring the real message.
        raise
    except PDFGenerationError as e:
        raise ExportError(f"PDF generation failed: {str(e)}") from e
    except Exception as e:
        raise ExportError(f"PDF export failed: {str(e)}") from e
|
||||
|
||||
def export_batch_to_zip(
    self,
    db: Session,
    batch_id: int,
    output_path: Path,
    include_formats: Optional[List[str]] = None
) -> Path:
    """Export an entire batch to a ZIP archive.

    Args:
        db: Database session.
        batch_id: Batch ID to export.
        output_path: Destination ZIP path.
        include_formats: Formats to include ("markdown", "json", "txt",
            "excel"); defaults to ["markdown", "json"].
            NOTE(review): "pdf" is mentioned by callers/docs but is not
            handled here — confirm whether per-result PDFs belong in the
            archive.

    Returns:
        Path: The output ZIP path.

    Raises:
        ExportError: If the batch is missing, has no completed results,
            or any export step fails.
    """
    try:
        include_formats = include_formats or ["markdown", "json"]

        batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
        if not batch:
            raise ExportError(f"Batch {batch_id} not found")

        # Only files that completed OCR contribute results.
        results = db.query(OCRResult).join(OCRFile).filter(
            OCRFile.batch_id == batch_id,
            OCRFile.status == FileStatus.COMPLETED
        ).all()

        if not results:
            raise ExportError(f"No completed results found for batch {batch_id}")

        # Stage everything in a temp directory next to the target ZIP,
        # then archive it and clean up.
        temp_dir = output_path.parent / f"temp_export_{batch_id}"
        temp_dir.mkdir(parents=True, exist_ok=True)

        try:
            if "markdown" in include_formats:
                self.export_to_markdown(results, temp_dir / "markdown", combine=False)

            if "json" in include_formats:
                self.export_to_json(results, temp_dir / "batch_results.json")

            if "txt" in include_formats:
                self.export_to_txt(results, temp_dir / "batch_results.txt")

            if "excel" in include_formats:
                self.export_to_excel(results, temp_dir / "batch_results.xlsx")

            output_path.parent.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for file_path in temp_dir.rglob('*'):
                    if file_path.is_file():
                        # Store paths relative to the staging dir.
                        zipf.write(file_path, file_path.relative_to(temp_dir))

            logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
            return output_path

        finally:
            # shutil is now imported at module level instead of inside
            # this finally block.
            shutil.rmtree(temp_dir, ignore_errors=True)

    except ExportError:
        # Re-raise our own errors untouched: previously "Batch X not found"
        # was re-wrapped as "Batch ZIP export failed: Batch X not found".
        raise
    except Exception as e:
        raise ExportError(f"Batch ZIP export failed: {str(e)}") from e
|
||||
|
||||
def apply_export_rule(
    self,
    db: Session,
    results: List[OCRResult],
    rule_id: int
) -> List[OCRResult]:
    """Apply a stored export rule's filters to a list of results.

    Args:
        db: Database session used to load the rule.
        results: OCR results to filter.
        rule_id: Primary key of the ExportRule to apply.

    Returns:
        List[OCRResult]: The results remaining after the rule's filters.

    Raises:
        ExportError: If no rule with the given ID exists.
    """
    rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
    if not rule:
        raise ExportError(f"Export rule {rule_id} not found")

    rule_config = rule.config_json

    # Only the "filters" section acts here; formatting options in the
    # rule are consumed by the individual export_to_* methods.
    if "filters" in rule_config:
        return self.apply_filters(results, rule_config["filters"])

    return results
|
||||
|
||||
def get_export_formats(self) -> Dict[str, str]:
    """Return the supported export formats.

    Returns:
        Dict[str, str]: Format code mapped to a human-readable description.
    """
    # Keep this table in sync with the export_to_* methods above.
    supported = {
        "txt": "純文本格式 (.txt)",
        "json": "JSON 格式 - 包含完整元數據 (.json)",
        "excel": "Excel 表格格式 (.xlsx)",
        "markdown": "Markdown 格式 (.md)",
        "pdf": "版面保留 PDF 格式 (.pdf)",
        "zip": "批次打包格式 (.zip)",
    }
    return supported
|
||||
Reference in New Issue
Block a user