""" End-to-End tests for PP-StructureV3 parameter customization Tests full workflow: Upload → Set params → Process → Verify results """ import pytest import requests import time import json from pathlib import Path from typing import Optional, Dict # Test configuration API_BASE_URL = "http://localhost:8000/api/v2" TEST_USER_EMAIL = "ymirliu@panjit.com.tw" TEST_USER_PASSWORD = "4RFV5tgb6yhn" # Test documents (assuming these exist in demo_docs/) TEST_DOCUMENTS = { 'simple_text': 'demo_docs/simple_text.pdf', 'complex_diagram': 'demo_docs/complex_diagram.pdf', 'small_text': 'demo_docs/small_text.pdf', } class TestClient: """Helper class for API testing with authentication""" def __init__(self, base_url: str = API_BASE_URL): self.base_url = base_url self.session = requests.Session() self.access_token: Optional[str] = None def login(self, email: str, password: str) -> bool: """Login and get access token""" try: response = self.session.post( f"{self.base_url}/auth/login", json={"email": email, "password": password} ) response.raise_for_status() data = response.json() self.access_token = data['access_token'] self.session.headers.update({ 'Authorization': f'Bearer {self.access_token}' }) return True except Exception as e: print(f"Login failed: {e}") return False def create_task(self, filename: str, file_type: str) -> Optional[str]: """Create a task and return task_id""" try: response = self.session.post( f"{self.base_url}/tasks", json={"filename": filename, "file_type": file_type} ) response.raise_for_status() return response.json()['task_id'] except Exception as e: print(f"Create task failed: {e}") return None def upload_file(self, task_id: str, file_path: Path) -> bool: """Upload file to task""" try: with open(file_path, 'rb') as f: files = {'file': (file_path.name, f, 'application/pdf')} response = self.session.post( f"{self.base_url}/upload/{task_id}", files=files ) response.raise_for_status() return True except Exception as e: print(f"Upload failed: {e}") return False def start_task(self, task_id: str, pp_structure_params: Optional[Dict] = None) -> bool: """Start task processing with optional custom parameters""" try: body = { "use_dual_track": True, "language": "ch" } if pp_structure_params: body["pp_structure_params"] = pp_structure_params response = self.session.post( f"{self.base_url}/tasks/{task_id}/start", json=body ) response.raise_for_status() return True except Exception as e: print(f"Start task failed: {e}") return False def get_task_status(self, task_id: str) -> Optional[Dict]: """Get task status""" try: response = self.session.get(f"{self.base_url}/tasks/{task_id}") response.raise_for_status() return response.json() except Exception as e: print(f"Get task status failed: {e}") return None def wait_for_completion(self, task_id: str, timeout: int = 300) -> Optional[Dict]: """Wait for task to complete (max timeout seconds)""" start_time = time.time() while time.time() - start_time < timeout: task = self.get_task_status(task_id) if task and task['status'] in ['completed', 'failed']: return task time.sleep(2) return None def download_result_json(self, task_id: str) -> Optional[Dict]: """Download and parse result JSON""" try: response = self.session.get(f"{self.base_url}/tasks/{task_id}/download/json") response.raise_for_status() return response.json() except Exception as e: print(f"Download result failed: {e}") return None @pytest.fixture(scope="module") def client(): """Create authenticated test client""" client = TestClient() if not client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD): pytest.skip("Authentication failed - check credentials or server") return client @pytest.mark.e2e class TestPPStructureParamsE2E: """End-to-end tests for PP-StructureV3 parameter customization""" def test_default_parameters_workflow(self, client: TestClient): """Test complete workflow with default parameters""" # Find a test document test_doc = None for doc_path in TEST_DOCUMENTS.values(): if Path(doc_path).exists(): test_doc = Path(doc_path) break if not test_doc: pytest.skip("No test documents found") # Step 1: Create task task_id = client.create_task(test_doc.name, "application/pdf") assert task_id is not None, "Failed to create task" print(f"✓ Created task: {task_id}") # Step 2: Upload file success = client.upload_file(task_id, test_doc) assert success, "Failed to upload file" print(f"✓ Uploaded file: {test_doc.name}") # Step 3: Start processing (no custom params) success = client.start_task(task_id, pp_structure_params=None) assert success, "Failed to start task" print("✓ Started processing with default parameters") # Step 4: Wait for completion result = client.wait_for_completion(task_id, timeout=180) assert result is not None, "Task did not complete in time" assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}" print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s") # Step 5: Verify results result_json = client.download_result_json(task_id) assert result_json is not None, "Failed to download results" assert 'text_regions' in result_json or 'elements' in result_json print(f"✓ Results verified (default parameters)") def test_high_quality_preset_workflow(self, client: TestClient): """Test workflow with high-quality preset parameters""" # Find a test document test_doc = None for doc_path in TEST_DOCUMENTS.values(): if Path(doc_path).exists(): test_doc = Path(doc_path) break if not test_doc: pytest.skip("No test documents found") # High-quality preset high_quality_params = { "layout_detection_threshold": 0.1, "layout_nms_threshold": 0.15, "text_det_thresh": 0.1, "text_det_box_thresh": 0.2, "layout_merge_bboxes_mode": "small" } # Create and process task task_id = client.create_task(test_doc.name, "application/pdf") assert task_id is not None print(f"✓ Created task: {task_id}") client.upload_file(task_id, test_doc) print(f"✓ Uploaded file: {test_doc.name}") # Start with custom parameters success = client.start_task(task_id, pp_structure_params=high_quality_params) assert success, "Failed to start task with custom params" print("✓ Started processing with HIGH-QUALITY preset") # Wait for completion result = client.wait_for_completion(task_id, timeout=180) assert result is not None, "Task did not complete in time" assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}" print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s") # Verify results result_json = client.download_result_json(task_id) assert result_json is not None print(f"✓ Results verified (high-quality preset)") def test_fast_preset_workflow(self, client: TestClient): """Test workflow with fast preset parameters""" test_doc = None for doc_path in TEST_DOCUMENTS.values(): if Path(doc_path).exists(): test_doc = Path(doc_path) break if not test_doc: pytest.skip("No test documents found") # Fast preset fast_params = { "layout_detection_threshold": 0.3, "layout_nms_threshold": 0.3, "text_det_thresh": 0.3, "text_det_box_thresh": 0.4, "layout_merge_bboxes_mode": "large" } # Create and process task task_id = client.create_task(test_doc.name, "application/pdf") assert task_id is not None print(f"✓ Created task: {task_id}") client.upload_file(task_id, test_doc) print(f"✓ Uploaded file: {test_doc.name}") # Start with fast parameters success = client.start_task(task_id, pp_structure_params=fast_params) assert success print("✓ Started processing with FAST preset") # Wait for completion result = client.wait_for_completion(task_id, timeout=180) assert result is not None assert result['status'] == 'completed' print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s") # Verify results result_json = client.download_result_json(task_id) assert result_json is not None print(f"✓ Results verified (fast preset)") def test_compare_default_vs_custom_params(self, client: TestClient): """Compare results between default and custom parameters""" test_doc = None for doc_path in TEST_DOCUMENTS.values(): if Path(doc_path).exists(): test_doc = Path(doc_path) break if not test_doc: pytest.skip("No test documents found") print(f"\n=== Comparing Default vs Custom Parameters ===") print(f"Document: {test_doc.name}\n") # Test 1: Default parameters task_id_default = client.create_task(test_doc.name, "application/pdf") client.upload_file(task_id_default, test_doc) client.start_task(task_id_default, pp_structure_params=None) result_default = client.wait_for_completion(task_id_default, timeout=180) assert result_default and result_default['status'] == 'completed' result_json_default = client.download_result_json(task_id_default) time_default = result_default['processing_time_ms'] / 1000 # Count elements elements_default = 0 if 'text_regions' in result_json_default: elements_default = len(result_json_default['text_regions']) elif 'elements' in result_json_default: elements_default = len(result_json_default['elements']) print(f"DEFAULT PARAMS:") print(f" Processing time: {time_default:.2f}s") print(f" Elements detected: {elements_default}") # Test 2: High-quality parameters custom_params = { "layout_detection_threshold": 0.15, "text_det_thresh": 0.15 } task_id_custom = client.create_task(test_doc.name, "application/pdf") client.upload_file(task_id_custom, test_doc) client.start_task(task_id_custom, pp_structure_params=custom_params) result_custom = client.wait_for_completion(task_id_custom, timeout=180) assert result_custom and result_custom['status'] == 'completed' result_json_custom = client.download_result_json(task_id_custom) time_custom = result_custom['processing_time_ms'] / 1000 # Count elements elements_custom = 0 if 'text_regions' in result_json_custom: elements_custom = len(result_json_custom['text_regions']) elif 'elements' in result_json_custom: elements_custom = len(result_json_custom['elements']) print(f"\nCUSTOM PARAMS (lower thresholds):") print(f" Processing time: {time_custom:.2f}s") print(f" Elements detected: {elements_custom}") print(f"\nDIFFERENCE:") print(f" Time delta: {abs(time_custom - time_default):.2f}s") print(f" Element delta: {abs(elements_custom - elements_default)} elements") print(f" Custom detected {elements_custom - elements_default:+d} more elements") # Both should complete successfully assert result_default['status'] == 'completed' assert result_custom['status'] == 'completed' # Custom params with lower thresholds should detect more elements # (this might not always be true, but it's the expected behavior) print(f"\n✓ Comparison complete") @pytest.mark.e2e @pytest.mark.slow class TestPPStructureParamsPerformance: """Performance tests for PP-StructureV3 parameters""" def test_parameter_initialization_overhead(self, client: TestClient): """Measure overhead of creating engine with custom parameters""" test_doc = None for doc_path in TEST_DOCUMENTS.values(): if Path(doc_path).exists(): test_doc = Path(doc_path) break if not test_doc: pytest.skip("No test documents found") print(f"\n=== Testing Parameter Initialization Overhead ===") # Measure default (cached engine) times_default = [] for i in range(3): task_id = client.create_task(test_doc.name, "application/pdf") client.upload_file(task_id, test_doc) start = time.time() client.start_task(task_id, pp_structure_params=None) result = client.wait_for_completion(task_id, timeout=180) end = time.time() if result and result['status'] == 'completed': times_default.append(end - start) print(f" Default run {i+1}: {end - start:.2f}s") avg_default = sum(times_default) / len(times_default) if times_default else 0 # Measure custom params (no cache) times_custom = [] custom_params = {"layout_detection_threshold": 0.15} for i in range(3): task_id = client.create_task(test_doc.name, "application/pdf") client.upload_file(task_id, test_doc) start = time.time() client.start_task(task_id, pp_structure_params=custom_params) result = client.wait_for_completion(task_id, timeout=180) end = time.time() if result and result['status'] == 'completed': times_custom.append(end - start) print(f" Custom run {i+1}: {end - start:.2f}s") avg_custom = sum(times_custom) / len(times_custom) if times_custom else 0 print(f"\nRESULTS:") print(f" Average time (default): {avg_default:.2f}s") print(f" Average time (custom): {avg_custom:.2f}s") print(f" Overhead: {avg_custom - avg_default:.2f}s ({(avg_custom - avg_default) / avg_default * 100:.1f}%)") # Overhead should be reasonable (< 20%) if avg_default > 0: overhead_percent = (avg_custom - avg_default) / avg_default * 100 assert overhead_percent < 50, f"Custom parameter overhead too high: {overhead_percent:.1f}%" print(f"✓ Overhead within acceptable range") if __name__ == '__main__': # Run with: pytest backend/tests/e2e/test_ppstructure_params_e2e.py -v -s -m e2e pytest.main([__file__, '-v', '-s', '-m', 'e2e'])