# NOTE(review): the following commit-message / paste metadata was accidentally
# embedded at the top of this file; preserved as a comment so the module
# remains importable.
# - Add debug_font_path, demo_docs_dir, e2e_api_base_url to config.py
# - Fix hardcoded paths in pp_structure_debug.py, create_demo_images.py
# - Fix hardcoded paths in test files
# - Update .env.example with new configuration options
# - Update .gitignore to exclude AI development files
# - Add production startup script (start-prod.sh), README.md,
#   and 1panel Docker deployment files
"""
|
|
End-to-End tests for PP-StructureV3 parameter customization
|
|
Tests full workflow: Upload → Set params → Process → Verify results
|
|
"""
|
|
|
|
import pytest
|
|
import requests
|
|
import time
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional, Dict
|
|
|
|
# Test configuration - use environment variable or settings
|
|
from app.core.config import settings
|
|
|
|
API_BASE_URL = settings.e2e_api_base_url
|
|
TEST_USER_EMAIL = os.getenv("E2E_TEST_USER_EMAIL", "test@example.com")
|
|
TEST_USER_PASSWORD = os.getenv("E2E_TEST_USER_PASSWORD", "testpassword")
|
|
|
|
# Test documents (assuming these exist in demo_docs/)
|
|
TEST_DOCUMENTS = {
|
|
'simple_text': 'demo_docs/simple_text.pdf',
|
|
'complex_diagram': 'demo_docs/complex_diagram.pdf',
|
|
'small_text': 'demo_docs/small_text.pdf',
|
|
}
|
|
|
|
|
|
class TestClient:
    """Helper class for API testing with authentication.

    Wraps a ``requests.Session`` and, after a successful ``login``,
    sends the bearer token on every subsequent call. All helpers swallow
    exceptions and report failure via their return value (bool / None),
    printing the error so a failing E2E run stays diagnosable.
    """

    def __init__(self, base_url: str = API_BASE_URL):
        self.base_url = base_url
        self.session = requests.Session()
        self.access_token: Optional[str] = None

    def login(self, email: str, password: str) -> bool:
        """Authenticate and store the access token; True on success."""
        try:
            resp = self.session.post(
                f"{self.base_url}/auth/login",
                json={"email": email, "password": password},
            )
            resp.raise_for_status()
            self.access_token = resp.json()['access_token']
            # Attach the token to the session so every later call is authed.
            self.session.headers.update(
                {'Authorization': f'Bearer {self.access_token}'}
            )
        except Exception as e:
            print(f"Login failed: {e}")
            return False
        return True

    def create_task(self, filename: str, file_type: str) -> Optional[str]:
        """Create a task; returns its task_id, or None on failure."""
        try:
            resp = self.session.post(
                f"{self.base_url}/tasks",
                json={"filename": filename, "file_type": file_type},
            )
            resp.raise_for_status()
        except Exception as e:
            print(f"Create task failed: {e}")
            return None
        return resp.json()['task_id']

    def upload_file(self, task_id: str, file_path: Path) -> bool:
        """Upload a local file to the given task; True on success."""
        try:
            with file_path.open('rb') as fh:
                # NOTE(review): content type is always PDF here — the E2E
                # suite only exercises PDF documents.
                multipart = {'file': (file_path.name, fh, 'application/pdf')}
                resp = self.session.post(
                    f"{self.base_url}/upload/{task_id}",
                    files=multipart,
                )
                resp.raise_for_status()
        except Exception as e:
            print(f"Upload failed: {e}")
            return False
        return True

    def start_task(self, task_id: str, pp_structure_params: Optional[Dict] = None) -> bool:
        """Start processing, optionally with custom PP-Structure params."""
        payload: Dict = {
            "use_dual_track": True,
            "language": "ch",
        }
        if pp_structure_params:
            payload["pp_structure_params"] = pp_structure_params

        try:
            resp = self.session.post(
                f"{self.base_url}/tasks/{task_id}/start",
                json=payload,
            )
            resp.raise_for_status()
        except Exception as e:
            print(f"Start task failed: {e}")
            return False
        return True

    def get_task_status(self, task_id: str) -> Optional[Dict]:
        """Fetch the current task record; None on failure."""
        try:
            resp = self.session.get(f"{self.base_url}/tasks/{task_id}")
            resp.raise_for_status()
        except Exception as e:
            print(f"Get task status failed: {e}")
            return None
        return resp.json()

    def wait_for_completion(self, task_id: str, timeout: int = 300) -> Optional[Dict]:
        """Poll every 2s until the task is terminal; None if it times out."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            snapshot = self.get_task_status(task_id)
            if snapshot and snapshot['status'] in ('completed', 'failed'):
                return snapshot
            time.sleep(2)
        return None

    def download_result_json(self, task_id: str) -> Optional[Dict]:
        """Download and parse the task's JSON result; None on failure."""
        try:
            resp = self.session.get(
                f"{self.base_url}/tasks/{task_id}/download/json"
            )
            resp.raise_for_status()
        except Exception as e:
            print(f"Download result failed: {e}")
            return None
        return resp.json()
|
|
|
|
|
|
@pytest.fixture(scope="module")
|
|
def client():
|
|
"""Create authenticated test client"""
|
|
client = TestClient()
|
|
if not client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD):
|
|
pytest.skip("Authentication failed - check credentials or server")
|
|
return client
|
|
|
|
|
|
@pytest.mark.e2e
class TestPPStructureParamsE2E:
    """End-to-end tests for PP-StructureV3 parameter customization.

    Each test drives the full pipeline: create task → upload document →
    start processing (with/without custom parameters) → wait → verify
    the downloadable JSON result.
    """

    @staticmethod
    def _find_test_document() -> Path:
        """Return the first existing demo document, or skip the test.

        Centralizes the document-discovery loop that was previously
        duplicated verbatim in every test method.
        """
        for doc_path in TEST_DOCUMENTS.values():
            candidate = Path(doc_path)
            if candidate.exists():
                return candidate
        pytest.skip("No test documents found")

    @staticmethod
    def _count_elements(result_json: Optional[Dict]) -> int:
        """Count detected elements, tolerating either result schema.

        The backend may emit 'text_regions' or 'elements'; returns 0 when
        the payload is missing or has neither key.
        """
        if not result_json:
            return 0
        if 'text_regions' in result_json:
            return len(result_json['text_regions'])
        if 'elements' in result_json:
            return len(result_json['elements'])
        return 0

    def test_default_parameters_workflow(self, client: TestClient):
        """Test complete workflow with default parameters"""
        test_doc = self._find_test_document()

        # Step 1: Create task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None, "Failed to create task"
        print(f"✓ Created task: {task_id}")

        # Step 2: Upload file
        success = client.upload_file(task_id, test_doc)
        assert success, "Failed to upload file"
        print(f"✓ Uploaded file: {test_doc.name}")

        # Step 3: Start processing (no custom params)
        success = client.start_task(task_id, pp_structure_params=None)
        assert success, "Failed to start task"
        print("✓ Started processing with default parameters")

        # Step 4: Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Step 5: Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None, "Failed to download results"
        assert 'text_regions' in result_json or 'elements' in result_json
        print(f"✓ Results verified (default parameters)")

    def test_high_quality_preset_workflow(self, client: TestClient):
        """Test workflow with high-quality preset parameters"""
        test_doc = self._find_test_document()

        # High-quality preset: lower thresholds keep more (smaller/fainter)
        # detections at the cost of processing time.
        high_quality_params = {
            "layout_detection_threshold": 0.1,
            "layout_nms_threshold": 0.15,
            "text_det_thresh": 0.1,
            "text_det_box_thresh": 0.2,
            "layout_merge_bboxes_mode": "small"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")

        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with custom parameters
        success = client.start_task(task_id, pp_structure_params=high_quality_params)
        assert success, "Failed to start task with custom params"
        print("✓ Started processing with HIGH-QUALITY preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print(f"✓ Results verified (high-quality preset)")

    def test_fast_preset_workflow(self, client: TestClient):
        """Test workflow with fast preset parameters"""
        test_doc = self._find_test_document()

        # Fast preset: higher thresholds drop marginal detections for speed.
        fast_params = {
            "layout_detection_threshold": 0.3,
            "layout_nms_threshold": 0.3,
            "text_det_thresh": 0.3,
            "text_det_box_thresh": 0.4,
            "layout_merge_bboxes_mode": "large"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")

        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with fast parameters
        success = client.start_task(task_id, pp_structure_params=fast_params)
        assert success
        print("✓ Started processing with FAST preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None
        assert result['status'] == 'completed'
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print(f"✓ Results verified (fast preset)")

    def test_compare_default_vs_custom_params(self, client: TestClient):
        """Compare results between default and custom parameters"""
        test_doc = self._find_test_document()

        print(f"\n=== Comparing Default vs Custom Parameters ===")
        print(f"Document: {test_doc.name}\n")

        # Test 1: Default parameters
        task_id_default = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_default, test_doc)
        client.start_task(task_id_default, pp_structure_params=None)

        result_default = client.wait_for_completion(task_id_default, timeout=180)
        assert result_default and result_default['status'] == 'completed'

        result_json_default = client.download_result_json(task_id_default)
        # Robustness fix: use .get() like the other tests so a missing
        # 'processing_time_ms' doesn't raise KeyError here.
        time_default = result_default.get('processing_time_ms', 0) / 1000
        elements_default = self._count_elements(result_json_default)

        print(f"DEFAULT PARAMS:")
        print(f" Processing time: {time_default:.2f}s")
        print(f" Elements detected: {elements_default}")

        # Test 2: High-quality parameters
        custom_params = {
            "layout_detection_threshold": 0.15,
            "text_det_thresh": 0.15
        }

        task_id_custom = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_custom, test_doc)
        client.start_task(task_id_custom, pp_structure_params=custom_params)

        result_custom = client.wait_for_completion(task_id_custom, timeout=180)
        assert result_custom and result_custom['status'] == 'completed'

        result_json_custom = client.download_result_json(task_id_custom)
        time_custom = result_custom.get('processing_time_ms', 0) / 1000
        elements_custom = self._count_elements(result_json_custom)

        print(f"\nCUSTOM PARAMS (lower thresholds):")
        print(f" Processing time: {time_custom:.2f}s")
        print(f" Elements detected: {elements_custom}")

        print(f"\nDIFFERENCE:")
        print(f" Time delta: {abs(time_custom - time_default):.2f}s")
        print(f" Element delta: {abs(elements_custom - elements_default)} elements")
        print(f" Custom detected {elements_custom - elements_default:+d} more elements")

        # Both should complete successfully
        assert result_default['status'] == 'completed'
        assert result_custom['status'] == 'completed'

        # Custom params with lower thresholds should detect more elements
        # (this might not always be true, but it's the expected behavior)
        print(f"\n✓ Comparison complete")
|
|
|
|
|
|
@pytest.mark.e2e
@pytest.mark.slow
class TestPPStructureParamsPerformance:
    """Performance tests for PP-StructureV3 parameters."""

    def test_parameter_initialization_overhead(self, client: TestClient):
        """Measure overhead of creating engine with custom parameters.

        Runs the full pipeline 3x with default params (cached engine) and
        3x with custom params (fresh engine), then compares average
        wall-clock time.
        """
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break

        if not test_doc:
            pytest.skip("No test documents found")

        print(f"\n=== Testing Parameter Initialization Overhead ===")

        def _timed_runs(label: str, params: Optional[Dict]) -> list:
            """Run the pipeline 3x; return durations of successful runs."""
            durations = []
            for i in range(3):
                task_id = client.create_task(test_doc.name, "application/pdf")
                client.upload_file(task_id, test_doc)

                start = time.time()
                client.start_task(task_id, pp_structure_params=params)
                result = client.wait_for_completion(task_id, timeout=180)
                end = time.time()

                if result and result['status'] == 'completed':
                    durations.append(end - start)
                    print(f" {label} run {i+1}: {end - start:.2f}s")
            return durations

        # Measure default (cached engine)
        times_default = _timed_runs("Default", None)
        avg_default = sum(times_default) / len(times_default) if times_default else 0

        # Measure custom params (no cache)
        custom_params = {"layout_detection_threshold": 0.15}
        times_custom = _timed_runs("Custom", custom_params)
        avg_custom = sum(times_custom) / len(times_custom) if times_custom else 0

        print(f"\nRESULTS:")
        print(f" Average time (default): {avg_default:.2f}s")
        print(f" Average time (custom): {avg_custom:.2f}s")

        # BUG FIX: the percentage print previously divided by avg_default
        # BEFORE the avg_default > 0 guard, raising ZeroDivisionError when
        # every default run failed. Compute it only inside the guard.
        if avg_default > 0:
            overhead_percent = (avg_custom - avg_default) / avg_default * 100
            print(f" Overhead: {avg_custom - avg_default:.2f}s ({overhead_percent:.1f}%)")

            # Overhead should be reasonable (< 50%)
            assert overhead_percent < 50, f"Custom parameter overhead too high: {overhead_percent:.1f}%"
            print(f"✓ Overhead within acceptable range")
        else:
            print(" Overhead: n/a (no successful default runs)")
|
|
|
|
|
|
if __name__ == '__main__':
    # Run with: pytest backend/tests/e2e/test_ppstructure_params_e2e.py -v -s -m e2e
    pytest_args = [__file__, '-v', '-s', '-m', 'e2e']
    pytest.main(pytest_args)
|