OCR/backend/tests/archived/test_ppstructure_params_e2e.py
"""
End-to-End tests for PP-StructureV3 parameter customization
Tests full workflow: Upload → Set params → Process → Verify results
"""
import pytest
import requests
import time
import os
from pathlib import Path
from typing import Optional, Dict

# Test configuration - use environment variables or settings
from app.core.config import settings

API_BASE_URL = settings.e2e_api_base_url
TEST_USER_EMAIL = os.getenv("E2E_TEST_USER_EMAIL", "test@example.com")
TEST_USER_PASSWORD = os.getenv("E2E_TEST_USER_PASSWORD", "testpassword")

# Test documents (assuming these exist in demo_docs/)
TEST_DOCUMENTS = {
    'simple_text': 'demo_docs/simple_text.pdf',
    'complex_diagram': 'demo_docs/complex_diagram.pdf',
    'small_text': 'demo_docs/small_text.pdf',
}
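
# Example (illustrative values, not checked in): to point the suite at another
# deployment, override the env vars read above before invoking pytest:
#
#   E2E_TEST_USER_EMAIL=qa@example.com \
#   E2E_TEST_USER_PASSWORD=secret \
#   pytest backend/tests/archived/test_ppstructure_params_e2e.py -v -s -m e2e
#
# The base URL itself comes from settings.e2e_api_base_url, i.e. the app's
# .env, rather than a test-local variable.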

class TestClient:
    """Helper class for API testing with authentication"""

    def __init__(self, base_url: str = API_BASE_URL):
        self.base_url = base_url
        self.session = requests.Session()
        self.access_token: Optional[str] = None

    def login(self, email: str, password: str) -> bool:
        """Login and get access token"""
        try:
            response = self.session.post(
                f"{self.base_url}/auth/login",
                json={"email": email, "password": password}
            )
            response.raise_for_status()
            data = response.json()
            self.access_token = data['access_token']
            self.session.headers.update({
                'Authorization': f'Bearer {self.access_token}'
            })
            return True
        except Exception as e:
            print(f"Login failed: {e}")
            return False

    def create_task(self, filename: str, file_type: str) -> Optional[str]:
        """Create a task and return task_id"""
        try:
            response = self.session.post(
                f"{self.base_url}/tasks",
                json={"filename": filename, "file_type": file_type}
            )
            response.raise_for_status()
            return response.json()['task_id']
        except Exception as e:
            print(f"Create task failed: {e}")
            return None

    def upload_file(self, task_id: str, file_path: Path) -> bool:
        """Upload file to task"""
        try:
            with open(file_path, 'rb') as f:
                files = {'file': (file_path.name, f, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/upload/{task_id}",
                    files=files
                )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Upload failed: {e}")
            return False

    def start_task(self, task_id: str, pp_structure_params: Optional[Dict] = None) -> bool:
        """Start task processing with optional custom parameters"""
        try:
            body = {
                "use_dual_track": True,
                "language": "ch"
            }
            if pp_structure_params:
                body["pp_structure_params"] = pp_structure_params
            response = self.session.post(
                f"{self.base_url}/tasks/{task_id}/start",
                json=body
            )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Start task failed: {e}")
            return False

    def get_task_status(self, task_id: str) -> Optional[Dict]:
        """Get task status"""
        try:
            response = self.session.get(f"{self.base_url}/tasks/{task_id}")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Get task status failed: {e}")
            return None

    def wait_for_completion(self, task_id: str, timeout: int = 300) -> Optional[Dict]:
        """Poll until the task completes or fails (at most timeout seconds)"""
        start_time = time.time()
        while time.time() - start_time < timeout:
            task = self.get_task_status(task_id)
            if task and task['status'] in ['completed', 'failed']:
                return task
            time.sleep(2)
        return None

    def download_result_json(self, task_id: str) -> Optional[Dict]:
        """Download and parse result JSON"""
        try:
            response = self.session.get(f"{self.base_url}/tasks/{task_id}/download/json")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Download result failed: {e}")
            return None
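
# Minimal standalone usage sketch for TestClient (assumes a running server and
# valid credentials; the pytest fixture below is the normal entry point):
#
#   client = TestClient()
#   if client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD):
#       task_id = client.create_task("simple_text.pdf", "application/pdf")
#       client.upload_file(task_id, Path("demo_docs/simple_text.pdf"))
#       client.start_task(task_id, pp_structure_params={"text_det_thresh": 0.2})
#       print(client.wait_for_completion(task_id))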

@pytest.fixture(scope="module")
def client():
    """Create authenticated test client"""
    client = TestClient()
    if not client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD):
        pytest.skip("Authentication failed - check credentials or server")
    return client

@pytest.mark.e2e
class TestPPStructureParamsE2E:
    """End-to-end tests for PP-StructureV3 parameter customization"""

    def test_default_parameters_workflow(self, client: TestClient):
        """Test complete workflow with default parameters"""
        # Find a test document
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # Step 1: Create task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None, "Failed to create task"
        print(f"✓ Created task: {task_id}")

        # Step 2: Upload file
        success = client.upload_file(task_id, test_doc)
        assert success, "Failed to upload file"
        print(f"✓ Uploaded file: {test_doc.name}")

        # Step 3: Start processing (no custom params)
        success = client.start_task(task_id, pp_structure_params=None)
        assert success, "Failed to start task"
        print("✓ Started processing with default parameters")

        # Step 4: Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Step 5: Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None, "Failed to download results"
        assert 'text_regions' in result_json or 'elements' in result_json
        print("✓ Results verified (default parameters)")

    def test_high_quality_preset_workflow(self, client: TestClient):
        """Test workflow with high-quality preset parameters"""
        # Find a test document
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # High-quality preset
        high_quality_params = {
            "layout_detection_threshold": 0.1,
            "layout_nms_threshold": 0.15,
            "text_det_thresh": 0.1,
            "text_det_box_thresh": 0.2,
            "layout_merge_bboxes_mode": "small"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")
        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with custom parameters
        success = client.start_task(task_id, pp_structure_params=high_quality_params)
        assert success, "Failed to start task with custom params"
        print("✓ Started processing with HIGH-QUALITY preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print("✓ Results verified (high-quality preset)")

    def test_fast_preset_workflow(self, client: TestClient):
        """Test workflow with fast preset parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # Fast preset
        fast_params = {
            "layout_detection_threshold": 0.3,
            "layout_nms_threshold": 0.3,
            "text_det_thresh": 0.3,
            "text_det_box_thresh": 0.4,
            "layout_merge_bboxes_mode": "large"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")
        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with fast parameters
        success = client.start_task(task_id, pp_structure_params=fast_params)
        assert success
        print("✓ Started processing with FAST preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None
        assert result['status'] == 'completed'
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print("✓ Results verified (fast preset)")

    def test_compare_default_vs_custom_params(self, client: TestClient):
        """Compare results between default and custom parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        print("\n=== Comparing Default vs Custom Parameters ===")
        print(f"Document: {test_doc.name}\n")

        # Test 1: Default parameters
        task_id_default = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_default, test_doc)
        client.start_task(task_id_default, pp_structure_params=None)
        result_default = client.wait_for_completion(task_id_default, timeout=180)
        assert result_default and result_default['status'] == 'completed'
        result_json_default = client.download_result_json(task_id_default)
        assert result_json_default is not None
        time_default = result_default.get('processing_time_ms', 0) / 1000

        # Count elements
        elements_default = 0
        if 'text_regions' in result_json_default:
            elements_default = len(result_json_default['text_regions'])
        elif 'elements' in result_json_default:
            elements_default = len(result_json_default['elements'])
        print("DEFAULT PARAMS:")
        print(f"  Processing time: {time_default:.2f}s")
        print(f"  Elements detected: {elements_default}")

        # Test 2: Custom (lower-threshold) parameters
        custom_params = {
            "layout_detection_threshold": 0.15,
            "text_det_thresh": 0.15
        }
        task_id_custom = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_custom, test_doc)
        client.start_task(task_id_custom, pp_structure_params=custom_params)
        result_custom = client.wait_for_completion(task_id_custom, timeout=180)
        assert result_custom and result_custom['status'] == 'completed'
        result_json_custom = client.download_result_json(task_id_custom)
        assert result_json_custom is not None
        time_custom = result_custom.get('processing_time_ms', 0) / 1000

        # Count elements
        elements_custom = 0
        if 'text_regions' in result_json_custom:
            elements_custom = len(result_json_custom['text_regions'])
        elif 'elements' in result_json_custom:
            elements_custom = len(result_json_custom['elements'])
        print("\nCUSTOM PARAMS (lower thresholds):")
        print(f"  Processing time: {time_custom:.2f}s")
        print(f"  Elements detected: {elements_custom}")

        print("\nDIFFERENCE:")
        print(f"  Time delta: {abs(time_custom - time_default):.2f}s")
        print(f"  Element delta: {elements_custom - elements_default:+d} (custom vs default)")

        # Both should complete successfully
        assert result_default['status'] == 'completed'
        assert result_custom['status'] == 'completed'
        # Lower thresholds are expected to detect more elements, but this is not
        # guaranteed for every document, so it is reported rather than asserted.
        print("\n✓ Comparison complete")

@pytest.mark.e2e
@pytest.mark.slow
class TestPPStructureParamsPerformance:
    """Performance tests for PP-StructureV3 parameters"""

    def test_parameter_initialization_overhead(self, client: TestClient):
        """Measure overhead of creating an engine with custom parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        print("\n=== Testing Parameter Initialization Overhead ===")

        # Measure default parameters (cached engine)
        times_default = []
        for i in range(3):
            task_id = client.create_task(test_doc.name, "application/pdf")
            client.upload_file(task_id, test_doc)
            start = time.time()
            client.start_task(task_id, pp_structure_params=None)
            result = client.wait_for_completion(task_id, timeout=180)
            end = time.time()
            if result and result['status'] == 'completed':
                times_default.append(end - start)
                print(f"  Default run {i + 1}: {end - start:.2f}s")
        avg_default = sum(times_default) / len(times_default) if times_default else 0

        # Measure custom params (no cache)
        times_custom = []
        custom_params = {"layout_detection_threshold": 0.15}
        for i in range(3):
            task_id = client.create_task(test_doc.name, "application/pdf")
            client.upload_file(task_id, test_doc)
            start = time.time()
            client.start_task(task_id, pp_structure_params=custom_params)
            result = client.wait_for_completion(task_id, timeout=180)
            end = time.time()
            if result and result['status'] == 'completed':
                times_custom.append(end - start)
                print(f"  Custom run {i + 1}: {end - start:.2f}s")
        avg_custom = sum(times_custom) / len(times_custom) if times_custom else 0

        print("\nRESULTS:")
        print(f"  Average time (default): {avg_default:.2f}s")
        print(f"  Average time (custom): {avg_custom:.2f}s")

        # Only compute the percentage when a default run completed (avoids
        # division by zero); overhead should stay within a loose bound (< 50%)
        if avg_default > 0:
            overhead = avg_custom - avg_default
            overhead_percent = overhead / avg_default * 100
            print(f"  Overhead: {overhead:.2f}s ({overhead_percent:.1f}%)")
            assert overhead_percent < 50, f"Custom parameter overhead too high: {overhead_percent:.1f}%"
            print("✓ Overhead within acceptable range")

if __name__ == '__main__':
    # Run with: pytest backend/tests/archived/test_ppstructure_params_e2e.py -v -s -m e2e
    pytest.main([__file__, '-v', '-s', '-m', 'e2e'])