Files
OCR/backend/app/routers/export.py
beabigegg da700721fa first
2025-11-12 22:53:17 +08:00

339 lines
10 KiB
Python

"""
Tool_OCR - Export Router
Export results in multiple formats
"""
import logging
from typing import List
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.schemas.export import (
ExportRequest,
ExportRuleCreate,
ExportRuleUpdate,
ExportRuleResponse,
CSSTemplateResponse,
)
from app.services.export_service import ExportService, ExportError
from app.services.pdf_generator import PDFGenerator
# Module-level logger named after this module's import path
logger = logging.getLogger(__name__)
# Every route declared in this module is served under /api/v1/export
router = APIRouter(prefix="/api/v1/export", tags=["Export"])
# Initialize services
# Both objects are created once at import time and shared by all handlers below
export_service = ExportService()
pdf_generator = PDFGenerator()
@router.post("", summary="Export OCR results")
async def export_results(
    request: ExportRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Export OCR results in specified format

    Formats handled by this endpoint: txt, json, excel, markdown, zip.
    The export file is written under `uploads/batches/<batch_id>/exports`
    and returned as a download. `pdf` has no branch here and is rejected
    with 400; single-file PDFs are served by `GET /api/v1/export/pdf/{file_id}`.

    - **batch_id**: Batch ID to export (must belong to the current user)
    - **format**: Export format (txt, json, excel, markdown, zip)
    - **rule_id**: Optional export rule ID to apply filters
    - **css_template**: CSS template for PDF export (default, academic, business); not read by this endpoint
    - **include_formats**: Formats to include in ZIP export (defaults to markdown and json)

    Error responses:
    - **404**: batch not found for this user, no completed results, or the export rule could not be applied
    - **400**: unsupported export format
    - **500**: the export itself failed
    """
    # Verify batch ownership
    batch = db.query(OCRBatch).filter(
        OCRBatch.id == request.batch_id,
        OCRBatch.user_id == current_user.id
    ).first()
    if not batch:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Batch not found"
        )

    # Get completed results
    results = db.query(OCRResult).join(OCRFile).filter(
        OCRFile.batch_id == request.batch_id,
        OCRFile.status == FileStatus.COMPLETED
    ).all()
    if not results:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed results found for this batch"
        )

    # Apply export rule if specified
    if request.rule_id:
        try:
            results = export_service.apply_export_rule(db, results, request.rule_id)
        except ExportError as e:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=str(e)
            ) from e

    try:
        # Generate export based on format
        export_dir = Path(f"uploads/batches/{batch.id}/exports")
        export_dir.mkdir(parents=True, exist_ok=True)

        if request.format == "txt":
            output_path = export_dir / f"batch_{batch.id}_export.txt"
            export_service.export_to_txt(results, output_path)
        elif request.format == "json":
            output_path = export_dir / f"batch_{batch.id}_export.json"
            export_service.export_to_json(results, output_path)
        elif request.format == "excel":
            output_path = export_dir / f"batch_{batch.id}_export.xlsx"
            export_service.export_to_excel(results, output_path)
        elif request.format == "markdown":
            output_path = export_dir / f"batch_{batch.id}_export.md"
            export_service.export_to_markdown(results, output_path, combine=True)
        elif request.format == "zip":
            output_path = export_dir / f"batch_{batch.id}_export.zip"
            include_formats = request.include_formats or ["markdown", "json"]
            # NOTE(review): the ZIP exporter is given the batch ID rather than the
            # (possibly rule-filtered) `results` -- confirm whether export rules are
            # meant to apply to ZIP output.
            export_service.export_batch_to_zip(db, batch.id, output_path, include_formats)
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported export format: {request.format}"
            )

        logger.info(f"Exported batch {batch.id} to {request.format} format: {output_path}")

        # Return file for download
        return FileResponse(
            path=str(output_path),
            filename=output_path.name,
            media_type="application/octet-stream"
        )
    except HTTPException:
        # HTTPException is an Exception subclass; without this clause the
        # catch-all below would turn the 400 above into a generic 500.
        raise
    except ExportError as e:
        logger.error(f"Export error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        ) from e
    except Exception as e:
        # Keep the traceback in the log; the client only sees a generic message.
        logger.exception(f"Unexpected export error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Export failed"
        ) from e
@router.get("/pdf/{file_id}", summary="Generate PDF for single file")
async def generate_pdf(
    file_id: int,
    css_template: str = "default",
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Render one file's OCR result as a layout-preserved PDF and return it as a download

    - **file_id**: ID of the file to render
    - **css_template**: CSS template to apply (default, academic, business)
    """
    # Joining on the batch limits the lookup to files whose batch belongs to the caller
    owned_file_query = db.query(OCRFile).join(OCRBatch).filter(
        OCRFile.id == file_id,
        OCRBatch.user_id == current_user.id,
    )
    ocr_file = owned_file_query.first()
    if not ocr_file:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="File not found")

    # The PDF is built from the stored OCR result of that file
    ocr_result = db.query(OCRResult).filter(OCRResult.file_id == file_id).first()
    if not ocr_result:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="OCR result not found")

    try:
        # The PDF lives in the exports folder of the batch that owns the file
        output_path = Path(f"uploads/batches/{ocr_file.batch_id}/exports") / f"file_{file_id}_export.pdf"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        pdf_metadata = {"title": ocr_file.original_filename}
        export_service.export_to_pdf(
            result=ocr_result,
            output_path=output_path,
            css_template=css_template,
            metadata=pdf_metadata,
        )
        logger.info(f"Generated PDF for file {file_id}: {output_path}")

        # The download is named after the uploaded file, with a .pdf extension
        download_name = f"{Path(ocr_file.original_filename).stem}.pdf"
        return FileResponse(
            path=str(output_path),
            filename=download_name,
            media_type="application/pdf",
        )
    except ExportError as e:
        logger.error(f"PDF generation error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        )
@router.get("/rules", response_model=List[ExportRuleResponse], summary="List export rules")
async def list_export_rules(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Return every export rule saved by the current user
    """
    # Only rules whose user_id matches the authenticated user are selected
    owned_rules = db.query(ExportRule).filter(ExportRule.user_id == current_user.id)
    return owned_rules.all()
@router.post("/rules", response_model=ExportRuleResponse, summary="Create export rule")
async def create_export_rule(
    rule: ExportRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Save a reusable export configuration for the current user

    - **rule_name**: Rule name
    - **description**: Optional description
    - **config_json**: Rule configuration (filters, formatting, export_options)
    - **css_template**: Optional custom CSS for PDF export
    """
    # The owner always comes from the authenticated user, never from the payload
    rule_fields = {
        "user_id": current_user.id,
        "rule_name": rule.rule_name,
        "description": rule.description,
        "config_json": rule.config_json,
        "css_template": rule.css_template,
    }
    new_rule = ExportRule(**rule_fields)

    db.add(new_rule)
    db.commit()
    # Reload the row after the commit so the returned object is current
    db.refresh(new_rule)

    logger.info(f"Created export rule {new_rule.id} for user {current_user.id}")
    return new_rule
@router.put("/rules/{rule_id}", response_model=ExportRuleResponse, summary="Update export rule")
async def update_export_rule(
    rule_id: int,
    rule: ExportRuleUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Apply a partial update to one of the current user's export rules

    - **rule_id**: Rule ID to update
    - **rule_name**: Optional new rule name
    - **description**: Optional new description
    - **config_json**: Optional new configuration
    - **css_template**: Optional new CSS template
    """
    # Filtering on the owner as well means another user's rule reads as "not found"
    stored_rule = (
        db.query(ExportRule)
        .filter(ExportRule.id == rule_id, ExportRule.user_id == current_user.id)
        .first()
    )
    if not stored_rule:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Export rule not found",
        )

    # exclude_unset keeps fields the client did not send out of the update
    for field, value in rule.dict(exclude_unset=True).items():
        setattr(stored_rule, field, value)

    db.commit()
    db.refresh(stored_rule)

    logger.info(f"Updated export rule {rule_id}")
    return stored_rule
@router.delete("/rules/{rule_id}", summary="Delete export rule")
async def delete_export_rule(
    rule_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Remove one of the current user's export rules

    - **rule_id**: Rule ID to delete
    """
    # Filtering on the owner as well means another user's rule reads as "not found"
    stored_rule = (
        db.query(ExportRule)
        .filter(ExportRule.id == rule_id, ExportRule.user_id == current_user.id)
        .first()
    )
    if not stored_rule:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Export rule not found",
        )

    db.delete(stored_rule)
    db.commit()

    logger.info(f"Deleted export rule {rule_id}")
    return {"message": "Export rule deleted successfully"}
@router.get("/css-templates", response_model=List[CSSTemplateResponse], summary="List CSS templates")
async def list_css_templates():
    """
    Return the CSS templates available for PDF generation, each with its description
    """
    # The PDF generator reports its templates as a name -> description mapping
    return [
        {"name": template_name, "description": template_description}
        for template_name, template_description in pdf_generator.get_available_templates().items()
    ]