beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions


@@ -0,0 +1,512 @@
"""
Tool_OCR - Export Service
Handles OCR result export in multiple formats with filtering and formatting rules
"""
import json
import logging
import shutil
import zipfile
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime, timezone
import pandas as pd
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
logger = logging.getLogger(__name__)
class ExportError(Exception):
"""Exception raised for export errors"""
pass
class ExportService:
"""
Export service for OCR results
Supported formats:
- TXT: Plain text export
- JSON: Full metadata export
- Excel: Tabular data export
- Markdown: Direct Markdown export
- PDF: Layout-preserved PDF export
- ZIP: Batch export archive
"""
def __init__(self):
"""Initialize export service"""
self.pdf_generator = PDFGenerator()
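# Illustrative usage sketch (not part of the original commit): a caller is
# assumed to have loaded OCRResult rows elsewhere; `results` and `out_dir`
# are hypothetical names.
#
#   service = ExportService()
#   service.export_to_txt(results, out_dir / "batch.txt")
#   service.export_to_json(results, out_dir / "batch.json")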
def apply_filters(
self,
results: List[OCRResult],
filters: Dict[str, Any]
) -> List[OCRResult]:
"""
Apply filters to OCR results
Args:
results: List of OCR results
filters: Filter configuration
- confidence_threshold: Minimum confidence (0.0-1.0)
- filename_pattern: Case-insensitive substring to match against filenames
- language: Filter by detected language
Returns:
List[OCRResult]: Filtered results
"""
filtered = results
# Confidence threshold filter
if "confidence_threshold" in filters:
threshold = filters["confidence_threshold"]
filtered = [r for r in filtered if r.average_confidence is not None and r.average_confidence >= threshold]
# Filename pattern filter (using simple substring match)
if "filename_pattern" in filters:
pattern = filters["filename_pattern"].lower()
filtered = [
r for r in filtered
if pattern in r.file.original_filename.lower()
]
# Language filter
if "language" in filters:
lang = filters["language"]
filtered = [r for r in filtered if r.detected_language == lang]
return filtered
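# Example (illustrative sketch): keep only results with at least 80% average
# confidence whose filenames contain "invoice".  `results` is an assumed,
# pre-loaded list of OCRResult rows.
#
#   service = ExportService()
#   filtered = service.apply_filters(
#       results,
#       {"confidence_threshold": 0.8, "filename_pattern": "invoice"},
#   )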
def export_to_txt(
self,
results: List[OCRResult],
output_path: Path,
formatting: Optional[Dict] = None
) -> Path:
"""
Export results to plain text file
Args:
results: List of OCR results
output_path: Output file path
formatting: Formatting options
- add_line_numbers: Add line numbers
- group_by_filename: Insert a separator line between files
- include_metadata: Add file metadata headers
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
formatting = formatting or {}
output_lines = []
for idx, result in enumerate(results, 1):
# Read Markdown file
if not result.markdown_path or not Path(result.markdown_path).exists():
logger.warning(f"Markdown file not found for result {result.id}")
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add metadata header if requested
if formatting.get("include_metadata", False):
output_lines.append(f"=" * 80)
output_lines.append(f"文件: {result.file.original_filename}")
output_lines.append(f"語言: {result.detected_language or '未知'}")
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
output_lines.append(f"=" * 80)
output_lines.append("")
# Add content with optional line numbers
if formatting.get("add_line_numbers", False):
for line_num, line in enumerate(markdown_content.split('\n'), 1):
output_lines.append(f"{line_num:4d} | {line}")
else:
output_lines.append(markdown_content)
# Add separator between files if grouping
if formatting.get("group_by_filename", False) and idx < len(results):
output_lines.append("\n" + "-" * 80 + "\n")
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(output_lines), encoding="utf-8")
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"TXT export failed: {e}") from e
def export_to_json(
self,
results: List[OCRResult],
output_path: Path,
include_layout: bool = True,
include_images: bool = True
) -> Path:
"""
Export results to JSON file with full metadata
Args:
results: List of OCR results
output_path: Output file path
include_layout: Include layout data
include_images: Include image metadata
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
export_data = {
"export_time": datetime.utcnow().isoformat(),
"total_files": len(results),
"results": []
}
for result in results:
result_data = {
"file_id": result.file.id,
"filename": result.file.original_filename,
"file_format": result.file.file_format,
"file_size": result.file.file_size,
"processing_time": result.file.processing_time,
"detected_language": result.detected_language,
"total_text_regions": result.total_text_regions,
"average_confidence": result.average_confidence,
"markdown_path": result.markdown_path,
}
# Include layout data if requested
if include_layout and result.layout_data:
result_data["layout_data"] = result.layout_data
# Include images metadata if requested
if include_images and result.images_metadata:
result_data["images_metadata"] = result.images_metadata
export_data["results"].append(result_data)
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(export_data, ensure_ascii=False, indent=2),
encoding="utf-8"
)
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"JSON export failed: {e}") from e
def export_to_excel(
self,
results: List[OCRResult],
output_path: Path,
include_confidence: bool = True,
include_processing_time: bool = True
) -> Path:
"""
Export results to Excel file
Args:
results: List of OCR results
output_path: Output file path
include_confidence: Include confidence scores
include_processing_time: Include processing time
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
rows = []
for result in results:
# Read Markdown content
text_content = ""
if result.markdown_path and Path(result.markdown_path).exists():
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
row = {
"文件名": result.file.original_filename,
"格式": result.file.file_format,
"大小(字節)": result.file.file_size,
"語言": result.detected_language or "未知",
"文本區域數": result.total_text_regions,
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
}
if include_confidence:
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
if include_processing_time:
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
rows.append(row)
# Create DataFrame and export
df = pd.DataFrame(rows)
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Excel export failed: {e}") from e
def export_to_markdown(
self,
results: List[OCRResult],
output_path: Path,
combine: bool = True
) -> Path:
"""
Export results to Markdown file(s)
Args:
results: List of OCR results
output_path: Output file path (or directory if not combining)
combine: Combine all results into one file
Returns:
Path: Output file/directory path
Raises:
ExportError: If export fails
"""
try:
if combine:
# Combine all Markdown files into one
combined_content = []
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add header
combined_content.append(f"# {result.file.original_filename}\n")
combined_content.append(markdown_content)
combined_content.append("\n---\n") # Separator
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(combined_content), encoding="utf-8")
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
return output_path
else:
# Export each result to separate file
output_path.mkdir(parents=True, exist_ok=True)
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
# Copy Markdown file to output directory
src_path = Path(result.markdown_path)
dst_path = output_path / f"{result.file.original_filename}.md"
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Markdown export failed: {e}") from e
def export_to_pdf(
self,
result: OCRResult,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Export single result to PDF with layout preservation
Args:
result: OCR result
output_path: Output PDF path
css_template: CSS template name or custom CSS
metadata: Optional PDF metadata
Returns:
Path: Output PDF path
Raises:
ExportError: If export fails
"""
try:
if not result.markdown_path or not Path(result.markdown_path).exists():
raise ExportError(f"Markdown file not found for result {result.id}")
markdown_path = Path(result.markdown_path)
# Prepare metadata
pdf_metadata = metadata or {}
if "title" not in pdf_metadata:
pdf_metadata["title"] = result.file.original_filename
# Generate PDF
self.pdf_generator.generate_pdf(
markdown_path=markdown_path,
output_path=output_path,
css_template=css_template,
metadata=pdf_metadata
)
logger.info(f"Exported result {result.id} to PDF: {output_path}")
return output_path
except ExportError:
raise
except PDFGenerationError as e:
raise ExportError(f"PDF generation failed: {e}") from e
except Exception as e:
raise ExportError(f"PDF export failed: {e}") from e
def export_batch_to_zip(
self,
db: Session,
batch_id: int,
output_path: Path,
include_formats: Optional[List[str]] = None
) -> Path:
"""
Export entire batch to ZIP archive
Args:
db: Database session
batch_id: Batch ID
output_path: Output ZIP path
include_formats: Formats to include: markdown, json, txt, excel (defaults to markdown and json)
Returns:
Path: Output ZIP path
Raises:
ExportError: If export fails
"""
try:
include_formats = include_formats or ["markdown", "json"]
# Get batch and results
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
raise ExportError(f"Batch {batch_id} not found")
results = db.query(OCRResult).join(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.COMPLETED
).all()
if not results:
raise ExportError(f"No completed results found for batch {batch_id}")
# Create temporary export directory
temp_dir = output_path.parent / f"temp_export_{batch_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
try:
# Export in requested formats
if "markdown" in include_formats:
md_dir = temp_dir / "markdown"
self.export_to_markdown(results, md_dir, combine=False)
if "json" in include_formats:
json_path = temp_dir / "batch_results.json"
self.export_to_json(results, json_path)
if "txt" in include_formats:
txt_path = temp_dir / "batch_results.txt"
self.export_to_txt(results, txt_path)
if "excel" in include_formats:
excel_path = temp_dir / "batch_results.xlsx"
self.export_to_excel(results, excel_path)
# Create ZIP archive
output_path.parent.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in temp_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(temp_dir)
zipf.write(file_path, arcname)
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
return output_path
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
except ExportError:
raise
except Exception as e:
raise ExportError(f"Batch ZIP export failed: {e}") from e
def apply_export_rule(
self,
db: Session,
results: List[OCRResult],
rule_id: int
) -> List[OCRResult]:
"""
Apply export rule to filter and format results
Args:
db: Database session
results: List of OCR results
rule_id: Export rule ID
Returns:
List[OCRResult]: Filtered results
Raises:
ExportError: If rule not found
"""
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
if not rule:
raise ExportError(f"Export rule {rule_id} not found")
config = rule.config_json
# Apply filters
if "filters" in config:
results = self.apply_filters(results, config["filters"])
# Note: Formatting options are applied in individual export methods
return results
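# Example (illustrative sketch): apply a stored rule before exporting, so the
# rule's filters decide which results are written.  Rule ID 1 is a
# hypothetical row in the export rules table.
#
#   selected = service.apply_export_rule(db, results, rule_id=1)
#   service.export_to_txt(selected, Path("exports/rule_1.txt"))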
def get_export_formats(self) -> Dict[str, str]:
"""
Get available export formats
Returns:
Dict mapping format codes to descriptions
"""
return {
"txt": "純文本格式 (.txt)",
"json": "JSON 格式 - 包含完整元數據 (.json)",
"excel": "Excel 表格格式 (.xlsx)",
"markdown": "Markdown 格式 (.md)",
"pdf": "版面保留 PDF 格式 (.pdf)",
"zip": "批次打包格式 (.zip)",
}
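# Example (illustrative sketch): the format map can drive a UI dropdown or
# validate a requested format before dispatching to an export method.
# `requested_format` is a hypothetical caller-supplied value.
#
#   formats = service.get_export_formats()
#   if requested_format not in formats:
#       raise ExportError(f"Unsupported format: {requested_format}")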