feat: add dual-track API endpoints for document processing

- Add ProcessingTrackEnum, ProcessingOptions, ProcessingMetadata schemas
- Add DocumentAnalysisResponse for document type detection
- Update /start endpoint with dual-track query parameters
- Add /analyze endpoint for document type detection with confidence scores
- Add /metadata endpoint for processing track information
- Add /download/unified endpoint for UnifiedDocument format export
- Update tasks.md to mark Section 6 API updates as completed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-19 09:38:12 +08:00
Parent: 8b9a364452
Commit: 1d0b63854a
3 changed files with 364 additions and 24 deletions
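
To make the new surface easier to review, here is a minimal client-side sketch of how the endpoints in this commit fit together. Everything outside the route paths and query parameters is an assumption: the base URL, bearer-token auth, and the POST method on `/start` are illustrative, and `TASK_ID` stands for an already-uploaded task.

```python
# Illustrative sketch only -- not part of this diff.
import requests

BASE = "http://localhost:8000/api/v2/tasks"          # assumed base URL
HEADERS = {"Authorization": "Bearer <token>"}         # assumed bearer-token auth
TASK_ID = "00000000-0000-0000-0000-000000000000"      # placeholder for an uploaded task

# 1. Ask the server which processing track it recommends for the document.
analysis = requests.post(f"{BASE}/{TASK_ID}/analyze", headers=HEADERS).json()
print(analysis["recommended_track"], analysis["confidence"], analysis["reason"])

# 2. Start processing with the new dual-track query parameters
#    (force_track may be omitted to let the server decide).
requests.post(
    f"{BASE}/{TASK_ID}/start",
    params={
        "use_dual_track": True,
        "force_track": analysis["recommended_track"],
        "language": "ch",
    },
    headers=HEADERS,
)

# 3. Once the task is completed, inspect the processing metadata and
#    fetch the unified-format result.
meta = requests.get(f"{BASE}/{TASK_ID}/metadata", headers=HEADERS).json()
unified = requests.get(f"{BASE}/{TASK_ID}/download/unified", headers=HEADERS)
```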


@@ -28,25 +28,50 @@ from app.schemas.task import (
    TaskStatsResponse,
    TaskStatusEnum,
    UploadResponse,
    ProcessingTrackEnum,
    ProcessingOptions,
    AnalyzeRequest,
    DocumentAnalysisResponse,
    ProcessingMetadata,
    TaskResponseWithMetadata,
    ExportOptions,
)
from app.services.task_service import task_service
from app.services.file_access_service import file_access_service
from app.services.ocr_service import OCRService

# Import dual-track components
try:
    from app.services.document_type_detector import DocumentTypeDetector
    DUAL_TRACK_AVAILABLE = True
except ImportError:
    DUAL_TRACK_AVAILABLE = False

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v2/tasks", tags=["Tasks"])


def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: str):
def process_task_ocr(
    task_id: str,
    task_db_id: int,
    file_path: str,
    filename: str,
    use_dual_track: bool = True,
    force_track: Optional[str] = None,
    language: str = 'ch'
):
    """
    Background task to process OCR for a task
    Background task to process OCR for a task with dual-track support

    Args:
        task_id: Task UUID string
        task_db_id: Task database ID
        file_path: Path to uploaded file
        filename: Original filename
        use_dual_track: Enable dual-track processing
        force_track: Force specific track ('ocr' or 'direct')
        language: OCR language code
    """
    from app.core.database import SessionLocal
    from app.models.task import Task
@@ -56,6 +81,7 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
    try:
        logger.info(f"Starting OCR processing for task {task_id}, file: {filename}")
        logger.info(f"Processing options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")

        # Get task directly by database ID (bypass user isolation for background task)
        task = db.query(Task).filter(Task.id == task_db_id).first()
@@ -70,13 +96,25 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
        result_dir = Path(settings.result_dir) / task_id
        result_dir.mkdir(parents=True, exist_ok=True)

        # Process the file with OCR
        ocr_result = ocr_service.process_image(
            image_path=Path(file_path),
            lang='ch',
            detect_layout=True,
            output_dir=result_dir
        )
        # Process the file with OCR (use dual-track if available)
        if use_dual_track and hasattr(ocr_service, 'process'):
            # Use new dual-track processing
            ocr_result = ocr_service.process(
                file_path=Path(file_path),
                lang=language,
                detect_layout=True,
                output_dir=result_dir,
                use_dual_track=use_dual_track,
                force_track=force_track
            )
        else:
            # Fall back to traditional processing
            ocr_result = ocr_service.process_image(
                image_path=Path(file_path),
                lang=language,
                detect_layout=True,
                output_dir=result_dir
            )

        # Calculate processing time
        processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
@@ -574,13 +612,19 @@ async def download_pdf(
async def start_task(
    task_id: str,
    background_tasks: BackgroundTasks,
    use_dual_track: bool = Query(True, description="Enable dual-track processing"),
    force_track: Optional[str] = Query(None, description="Force track: 'ocr' or 'direct'"),
    language: str = Query("ch", description="OCR language code"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Start processing a pending task
    Start processing a pending task with dual-track support

    - **task_id**: Task UUID
    - **use_dual_track**: Enable intelligent track selection (default: true)
    - **force_track**: Force specific processing track ('ocr' or 'direct')
    - **language**: OCR language code (default: 'ch')
    """
    try:
        # Get task details
@@ -619,16 +663,20 @@ async def start_task(
            status=TaskStatus.PROCESSING
        )

        # Start OCR processing in background
        # Start OCR processing in background with dual-track parameters
        background_tasks.add_task(
            process_task_ocr,
            task_id=task_id,
            task_db_id=task.id,
            file_path=task_file.stored_path,
            filename=task_file.original_name
            filename=task_file.original_name,
            use_dual_track=use_dual_track,
            force_track=force_track,
            language=language
        )

        logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
        logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")

        return task

    except HTTPException:
@@ -747,3 +795,226 @@ async def retry_task(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retry task: {str(e)}"
        )


# ===== Document Analysis Endpoints =====

@router.post("/{task_id}/analyze", response_model=DocumentAnalysisResponse, summary="Analyze document type")
async def analyze_document(
    task_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Analyze document to determine recommended processing track

    Returns document type analysis with recommended processing track
    (OCR for scanned documents, DIRECT for editable PDFs)

    - **task_id**: Task UUID
    """
    try:
        if not DUAL_TRACK_AVAILABLE:
            raise HTTPException(
                status_code=status.HTTP_501_NOT_IMPLEMENTED,
                detail="Dual-track processing not available"
            )

        # Get task details
        task = task_service.get_task_by_id(
            db=db,
            task_id=task_id,
            user_id=current_user.id
        )

        if not task:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Task not found"
            )

        # Get task file
        task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
        if not task_file:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Task file not found"
            )

        # Analyze document
        detector = DocumentTypeDetector()
        recommendation = detector.analyze(Path(task_file.stored_path))

        # Build response
        response = DocumentAnalysisResponse(
            task_id=task_id,
            filename=task_file.original_name or "",
            recommended_track=ProcessingTrackEnum(recommendation.track),
            confidence=recommendation.confidence,
            reason=recommendation.reason,
            document_info=recommendation.document_info or {},
            is_editable=recommendation.track == "direct",
            text_coverage=recommendation.document_info.get("text_coverage") if recommendation.document_info else None,
            page_count=recommendation.document_info.get("page_count") if recommendation.document_info else None
        )

        logger.info(f"Document analysis for task {task_id}: {recommendation.track} (confidence: {recommendation.confidence})")
        return response

    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Failed to analyze document for task {task_id}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to analyze document: {str(e)}"
        )


@router.get("/{task_id}/metadata", response_model=ProcessingMetadata, summary="Get processing metadata")
async def get_processing_metadata(
    task_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Get processing metadata for a completed task

    Returns detailed processing information including track used,
    element counts, and statistics.

    - **task_id**: Task UUID
    """
    try:
        # Get task details
        task = task_service.get_task_by_id(
            db=db,
            task_id=task_id,
            user_id=current_user.id
        )

        if not task:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Task not found"
            )

        if task.status != TaskStatus.COMPLETED:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Task not completed"
            )

        # Load JSON result to get metadata
        if not task.result_json_path:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Result JSON not found"
            )

        json_path = Path(task.result_json_path)
        if not json_path.exists():
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Result file not found"
            )

        with open(json_path, 'r', encoding='utf-8') as f:
            result_data = json.load(f)

        # Extract metadata
        metadata = result_data.get('metadata', {})
        statistics = result_data.get('statistics', {})

        response = ProcessingMetadata(
            processing_track=ProcessingTrackEnum(metadata.get('processing_track', 'ocr')),
            processing_time_seconds=metadata.get('processing_time', 0),
            language=metadata.get('language', 'ch'),
            page_count=statistics.get('page_count', 1),
            total_elements=statistics.get('total_elements', 0),
            total_text_regions=len(result_data.get('text_regions', [])) if 'text_regions' in result_data else statistics.get('total_elements', 0),
            total_tables=statistics.get('total_tables', 0),
            total_images=statistics.get('total_images', 0),
            average_confidence=result_data.get('average_confidence'),
            unified_format=metadata.get('processing_info', {}).get('export_format') == 'unified_document_v1'
        )

        return response

    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Failed to get metadata for task {task_id}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get metadata: {str(e)}"
        )


@router.get("/{task_id}/download/unified", summary="Download unified format")
async def download_unified(
    task_id: str,
    include_metadata: bool = Query(True, description="Include processing metadata"),
    include_statistics: bool = Query(True, description="Include document statistics"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Download results in unified document format

    Returns JSON with full UnifiedDocument structure including
    all elements, coordinates, and metadata.

    - **task_id**: Task UUID
    - **include_metadata**: Include processing metadata
    - **include_statistics**: Include document statistics
    """
    try:
        # Get task details
        task = task_service.get_task_by_id(
            db=db,
            task_id=task_id,
            user_id=current_user.id
        )

        if not task:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Task not found"
            )

        if task.status != TaskStatus.COMPLETED:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Task not completed"
            )

        # Get JSON result path
        if not task.result_json_path:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Result JSON not found"
            )

        json_path = Path(task.result_json_path)
        if not json_path.exists():
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Result file not found"
            )

        # Return the unified format JSON
        return FileResponse(
            path=str(json_path),
            filename=f"{task_id}_unified.json",
            media_type="application/json"
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Failed to download unified format for task {task_id}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to download: {str(e)}"
        )
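
For reference, the /metadata endpoint above reads only a few keys from the result JSON. A minimal illustrative payload, with key names taken from the handler's `.get()` calls and made-up values, could look like this:

```python
# Illustrative only: the smallest result JSON the /metadata handler can consume.
minimal_result = {
    "metadata": {
        "processing_track": "direct",      # "ocr" | "direct" | "hybrid" | "auto"
        "processing_time": 2.4,            # seconds
        "language": "ch",
        "processing_info": {"export_format": "unified_document_v1"},
    },
    "statistics": {
        "page_count": 3,
        "total_elements": 42,
        "total_tables": 1,
        "total_images": 2,
    },
    "text_regions": [],                    # optional; handler falls back to total_elements
    "average_confidence": 0.97,            # optional
}
```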


@@ -16,6 +16,14 @@ class TaskStatusEnum(str, Enum):
    FAILED = "failed"


class ProcessingTrackEnum(str, Enum):
    """Processing track enumeration for dual-track processing"""
    OCR = "ocr"        # PaddleOCR PP-StructureV3 for scanned documents
    DIRECT = "direct"  # PyMuPDF for editable PDFs
    HYBRID = "hybrid"  # Mixed processing
    AUTO = "auto"      # Auto-detect best track


class TaskCreate(BaseModel):
    """Task creation request"""
    filename: Optional[str] = Field(None, description="Original filename")
@@ -117,3 +125,64 @@ class UploadResponse(BaseModel):
    file_size: int = Field(..., description="File size in bytes")
    file_type: str = Field(..., description="File MIME type")
    status: TaskStatusEnum = Field(..., description="Initial task status")


# ===== Dual-Track Processing Schemas =====

class ProcessingOptions(BaseModel):
    """Processing options for dual-track OCR"""
    use_dual_track: bool = Field(default=True, description="Enable dual-track processing")
    force_track: Optional[ProcessingTrackEnum] = Field(None, description="Force specific track (ocr/direct)")
    language: str = Field(default="ch", description="OCR language code")
    include_layout: bool = Field(default=True, description="Include layout analysis")
    include_images: bool = Field(default=True, description="Extract and save images")
    confidence_threshold: Optional[float] = Field(None, ge=0, le=1, description="OCR confidence threshold")


class AnalyzeRequest(BaseModel):
    """Document analysis request"""
    use_dual_track: bool = Field(default=True, description="Enable dual-track processing")
    force_track: Optional[ProcessingTrackEnum] = Field(None, description="Force specific track")
    language: str = Field(default="ch", description="OCR language")
    include_layout: bool = Field(default=True, description="Include layout analysis")


class DocumentAnalysisResponse(BaseModel):
    """Document type analysis response"""
    task_id: str
    filename: str
    recommended_track: ProcessingTrackEnum
    confidence: float = Field(..., ge=0, le=1, description="Detection confidence")
    reason: str = Field(..., description="Reason for recommendation")
    document_info: dict = Field(default_factory=dict, description="Document metadata")
    is_editable: bool = Field(..., description="Whether document has extractable text")
    text_coverage: Optional[float] = Field(None, description="Percentage of text coverage")
    page_count: Optional[int] = Field(None, description="Number of pages")


class ProcessingMetadata(BaseModel):
    """Processing metadata included in responses"""
    processing_track: ProcessingTrackEnum
    processing_time_seconds: float
    language: str
    page_count: int
    total_elements: int
    total_text_regions: int
    total_tables: int
    total_images: int
    average_confidence: Optional[float] = None
    unified_format: bool = True


class TaskResponseWithMetadata(TaskResponse):
    """Extended task response with processing metadata"""
    processing_track: Optional[ProcessingTrackEnum] = None
    processing_metadata: Optional[ProcessingMetadata] = None


class ExportOptions(BaseModel):
    """Export format options"""
    format: str = Field(default="json", description="Export format: json, markdown, pdf, unified")
    include_metadata: bool = Field(default=True, description="Include processing metadata")
    include_statistics: bool = Field(default=True, description="Include document statistics")
    legacy_format: bool = Field(default=False, description="Use legacy JSON format for compatibility")
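
A short, illustrative sketch (not part of the diff) of how the new schemas behave under validation: confidence is constrained to the range [0, 1], string values are coerced to ProcessingTrackEnum members, and ProcessingOptions falls back to its documented defaults.

```python
# Illustrative only: exercising the new schemas defined above.
from pydantic import ValidationError

from app.schemas.task import (
    DocumentAnalysisResponse,
    ProcessingOptions,
    ProcessingTrackEnum,
)

opts = ProcessingOptions(force_track=ProcessingTrackEnum.DIRECT, language="en")
print(opts.use_dual_track)          # True -- dual-track stays on by default

try:
    DocumentAnalysisResponse(
        task_id="abc",
        filename="scan.pdf",
        recommended_track="ocr",    # string input is coerced to ProcessingTrackEnum.OCR
        confidence=1.7,             # rejected: confidence must be <= 1
        reason="mostly raster pages",
        is_editable=False,
    )
except ValidationError as e:
    print(e)
```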