fix: resolve E2E test failures and add Office direct extraction design
- Fix MySQL connection timeout by creating fresh DB session after OCR
- Fix /analyze endpoint attribute errors (detect vs analyze, metadata)
- Add processing_track field extraction to TaskDetailResponse
- Update E2E tests to use POST for /analyze endpoint
- Increase Office document timeout to 300s
- Add Section 2.4 tasks for Office document direct extraction
- Document Office → PDF → Direct track strategy in design.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -127,6 +127,17 @@ def process_task_ocr(
|
||||
source_file_path=Path(file_path)
|
||||
)
|
||||
|
||||
# Close old session and create fresh one to avoid MySQL timeout
|
||||
# (long OCR processing may cause connection to become stale)
|
||||
db.close()
|
||||
db = SessionLocal()
|
||||
|
||||
# Re-fetch task with fresh connection
|
||||
task = db.query(Task).filter(Task.id == task_db_id).first()
|
||||
if not task:
|
||||
logger.error(f"Task {task_id} not found after OCR processing")
|
||||
return
|
||||
|
||||
# Update task with results (direct database update)
|
||||
task.result_json_path = str(json_path) if json_path else None
|
||||
task.result_markdown_path = str(markdown_path) if markdown_path else None
|
||||
@@ -304,7 +315,25 @@ async def get_task(
|
||||
detail="Task not found"
|
||||
)
|
||||
|
||||
return task
|
||||
# Extract processing_track from result JSON metadata if available
|
||||
processing_track = None
|
||||
if task.result_json_path:
|
||||
try:
|
||||
import json
|
||||
from pathlib import Path
|
||||
result_path = Path(task.result_json_path)
|
||||
if result_path.exists():
|
||||
with open(result_path) as f:
|
||||
result_data = json.load(f)
|
||||
metadata = result_data.get("metadata", {})
|
||||
processing_track = metadata.get("processing_track")
|
||||
except Exception:
|
||||
pass # Silently ignore errors reading the result file
|
||||
|
||||
# Create response with processing_track
|
||||
response = TaskDetailResponse.model_validate(task)
|
||||
response.processing_track = processing_track
|
||||
return response
|
||||
|
||||
|
||||
@router.patch("/{task_id}", response_model=TaskResponse)
|
||||
@@ -841,9 +870,9 @@ async def analyze_document(
|
||||
detail="Task file not found"
|
||||
)
|
||||
|
||||
# Analyze document
|
||||
# Analyze document (using detect method)
|
||||
detector = DocumentTypeDetector()
|
||||
recommendation = detector.analyze(Path(task_file.stored_path))
|
||||
recommendation = detector.detect(Path(task_file.stored_path))
|
||||
|
||||
# Build response
|
||||
response = DocumentAnalysisResponse(
|
||||
@@ -852,10 +881,10 @@ async def analyze_document(
|
||||
recommended_track=ProcessingTrackEnum(recommendation.track),
|
||||
confidence=recommendation.confidence,
|
||||
reason=recommendation.reason,
|
||||
document_info=recommendation.document_info or {},
|
||||
document_info=recommendation.metadata or {},
|
||||
is_editable=recommendation.track == "direct",
|
||||
text_coverage=recommendation.document_info.get("text_coverage") if recommendation.document_info else None,
|
||||
page_count=recommendation.document_info.get("page_count") if recommendation.document_info else None
|
||||
text_coverage=recommendation.metadata.get("text_coverage") if recommendation.metadata else None,
|
||||
page_count=recommendation.metadata.get("total_pages") if recommendation.metadata else None
|
||||
)
|
||||
|
||||
logger.info(f"Document analysis for task {task_id}: {recommendation.track} (confidence: {recommendation.confidence})")
|
||||
|
||||
Reference in New Issue
Block a user