fix: resolve E2E test failures and add Office direct extraction design

- Fix MySQL connection timeout by closing the stale DB session and creating a fresh one after OCR
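- Fix /analyze endpoint attribute errors (call detect() instead of analyze(); read recommendation.metadata instead of document_info, with page count taken from "total_pages")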
- Populate the processing_track field of TaskDetailResponse from the result JSON metadata
- Update E2E tests to use POST for /analyze endpoint
- Increase Office document timeout to 300s
- Add Section 2.4 tasks for Office document direct extraction
- Document Office → PDF → Direct track strategy in design.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-20 12:13:18 +08:00
parent c50a5e9d2b
commit 0974fc3a54
7 changed files with 746 additions and 9 deletions

View File

@@ -127,6 +127,17 @@ def process_task_ocr(
source_file_path=Path(file_path)
)
# Close old session and create fresh one to avoid MySQL timeout
# (long OCR processing may cause connection to become stale)
db.close()
db = SessionLocal()
# Re-fetch task with fresh connection
task = db.query(Task).filter(Task.id == task_db_id).first()
if not task:
logger.error(f"Task {task_id} not found after OCR processing")
return
# Update task with results (direct database update)
task.result_json_path = str(json_path) if json_path else None
task.result_markdown_path = str(markdown_path) if markdown_path else None
@@ -304,7 +315,25 @@ async def get_task(
detail="Task not found"
)
return task
# Extract processing_track from result JSON metadata if available
processing_track = None
if task.result_json_path:
try:
import json
from pathlib import Path
result_path = Path(task.result_json_path)
if result_path.exists():
with open(result_path) as f:
result_data = json.load(f)
metadata = result_data.get("metadata", {})
processing_track = metadata.get("processing_track")
except Exception:
pass # Silently ignore errors reading the result file
# Create response with processing_track
response = TaskDetailResponse.model_validate(task)
response.processing_track = processing_track
return response
@router.patch("/{task_id}", response_model=TaskResponse)
@@ -841,9 +870,9 @@ async def analyze_document(
detail="Task file not found"
)
# Analyze document
# Analyze document (using detect method)
detector = DocumentTypeDetector()
recommendation = detector.analyze(Path(task_file.stored_path))
recommendation = detector.detect(Path(task_file.stored_path))
# Build response
response = DocumentAnalysisResponse(
@@ -852,10 +881,10 @@ async def analyze_document(
recommended_track=ProcessingTrackEnum(recommendation.track),
confidence=recommendation.confidence,
reason=recommendation.reason,
document_info=recommendation.document_info or {},
document_info=recommendation.metadata or {},
is_editable=recommendation.track == "direct",
text_coverage=recommendation.document_info.get("text_coverage") if recommendation.document_info else None,
page_count=recommendation.document_info.get("page_count") if recommendation.document_info else None
text_coverage=recommendation.metadata.get("text_coverage") if recommendation.metadata else None,
page_count=recommendation.metadata.get("total_pages") if recommendation.metadata else None
)
logger.info(f"Document analysis for task {task_id}: {recommendation.track} (confidence: {recommendation.confidence})")