OCR/backend/tests/archived/test_ppstructure_params_e2e.py
"""
End-to-End tests for PP-StructureV3 parameter customization
Tests full workflow: Upload → Set params → Process → Verify results
"""
import pytest
import requests
import time
import os
from pathlib import Path
from typing import Optional, Dict

# Test configuration - use environment variables or settings
from app.core.config import settings

API_BASE_URL = settings.e2e_api_base_url
TEST_USER_EMAIL = os.getenv("E2E_TEST_USER_EMAIL", "test@example.com")
TEST_USER_PASSWORD = os.getenv("E2E_TEST_USER_PASSWORD", "testpassword")

# Test documents (assuming these exist in demo_docs/)
TEST_DOCUMENTS = {
    'simple_text': 'demo_docs/simple_text.pdf',
    'complex_diagram': 'demo_docs/complex_diagram.pdf',
    'small_text': 'demo_docs/small_text.pdf',
}
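
# Example (illustrative values, not checked in): to point the suite at another
# deployment, override the env vars read above before invoking pytest:
#
#   E2E_TEST_USER_EMAIL=qa@example.com \
#   E2E_TEST_USER_PASSWORD=secret \
#   pytest backend/tests/archived/test_ppstructure_params_e2e.py -v -s -m e2e
#
# The base URL itself comes from settings.e2e_api_base_url, i.e. the app's
# .env, rather than a test-local variable.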

class TestClient:
    """Helper class for API testing with authentication"""

    def __init__(self, base_url: str = API_BASE_URL):
        self.base_url = base_url
        self.session = requests.Session()
        self.access_token: Optional[str] = None

    def login(self, email: str, password: str) -> bool:
        """Login and get access token"""
        try:
            response = self.session.post(
                f"{self.base_url}/auth/login",
                json={"email": email, "password": password}
            )
            response.raise_for_status()
            data = response.json()
            self.access_token = data['access_token']
            self.session.headers.update({
                'Authorization': f'Bearer {self.access_token}'
            })
            return True
        except Exception as e:
            print(f"Login failed: {e}")
            return False

    def create_task(self, filename: str, file_type: str) -> Optional[str]:
        """Create a task and return task_id"""
        try:
            response = self.session.post(
                f"{self.base_url}/tasks",
                json={"filename": filename, "file_type": file_type}
            )
            response.raise_for_status()
            return response.json()['task_id']
        except Exception as e:
            print(f"Create task failed: {e}")
            return None

    def upload_file(self, task_id: str, file_path: Path) -> bool:
        """Upload file to task"""
        try:
            with open(file_path, 'rb') as f:
                files = {'file': (file_path.name, f, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/upload/{task_id}",
                    files=files
                )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Upload failed: {e}")
            return False

    def start_task(self, task_id: str, pp_structure_params: Optional[Dict] = None) -> bool:
        """Start task processing with optional custom parameters"""
        try:
            body = {
                "use_dual_track": True,
                "language": "ch"
            }
            if pp_structure_params:
                body["pp_structure_params"] = pp_structure_params
            response = self.session.post(
                f"{self.base_url}/tasks/{task_id}/start",
                json=body
            )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Start task failed: {e}")
            return False

    def get_task_status(self, task_id: str) -> Optional[Dict]:
        """Get task status"""
        try:
            response = self.session.get(f"{self.base_url}/tasks/{task_id}")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Get task status failed: {e}")
            return None

    def wait_for_completion(self, task_id: str, timeout: int = 300) -> Optional[Dict]:
        """Poll until the task completes or fails (at most timeout seconds)"""
        start_time = time.time()
        while time.time() - start_time < timeout:
            task = self.get_task_status(task_id)
            if task and task['status'] in ['completed', 'failed']:
                return task
            time.sleep(2)
        return None

    def download_result_json(self, task_id: str) -> Optional[Dict]:
        """Download and parse result JSON"""
        try:
            response = self.session.get(f"{self.base_url}/tasks/{task_id}/download/json")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Download result failed: {e}")
            return None
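
# Minimal standalone usage sketch for TestClient (assumes a running server and
# valid credentials; the pytest fixture below is the normal entry point):
#
#   client = TestClient()
#   if client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD):
#       task_id = client.create_task("simple_text.pdf", "application/pdf")
#       client.upload_file(task_id, Path("demo_docs/simple_text.pdf"))
#       client.start_task(task_id, pp_structure_params={"text_det_thresh": 0.2})
#       print(client.wait_for_completion(task_id))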

@pytest.fixture(scope="module")
def client():
    """Create authenticated test client"""
    client = TestClient()
    if not client.login(TEST_USER_EMAIL, TEST_USER_PASSWORD):
        pytest.skip("Authentication failed - check credentials or server")
    return client

@pytest.mark.e2e
class TestPPStructureParamsE2E:
    """End-to-end tests for PP-StructureV3 parameter customization"""

    def test_default_parameters_workflow(self, client: TestClient):
        """Test complete workflow with default parameters"""
        # Find a test document
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # Step 1: Create task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None, "Failed to create task"
        print(f"✓ Created task: {task_id}")

        # Step 2: Upload file
        success = client.upload_file(task_id, test_doc)
        assert success, "Failed to upload file"
        print(f"✓ Uploaded file: {test_doc.name}")

        # Step 3: Start processing (no custom params)
        success = client.start_task(task_id, pp_structure_params=None)
        assert success, "Failed to start task"
        print("✓ Started processing with default parameters")

        # Step 4: Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Step 5: Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None, "Failed to download results"
        assert 'text_regions' in result_json or 'elements' in result_json
        print("✓ Results verified (default parameters)")

    def test_high_quality_preset_workflow(self, client: TestClient):
        """Test workflow with high-quality preset parameters"""
        # Find a test document
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # High-quality preset
        high_quality_params = {
            "layout_detection_threshold": 0.1,
            "layout_nms_threshold": 0.15,
            "text_det_thresh": 0.1,
            "text_det_box_thresh": 0.2,
            "layout_merge_bboxes_mode": "small"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")
        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with custom parameters
        success = client.start_task(task_id, pp_structure_params=high_quality_params)
        assert success, "Failed to start task with custom params"
        print("✓ Started processing with HIGH-QUALITY preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None, "Task did not complete in time"
        assert result['status'] == 'completed', f"Task failed: {result.get('error_message')}"
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print("✓ Results verified (high-quality preset)")

    def test_fast_preset_workflow(self, client: TestClient):
        """Test workflow with fast preset parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        # Fast preset
        fast_params = {
            "layout_detection_threshold": 0.3,
            "layout_nms_threshold": 0.3,
            "text_det_thresh": 0.3,
            "text_det_box_thresh": 0.4,
            "layout_merge_bboxes_mode": "large"
        }

        # Create and process task
        task_id = client.create_task(test_doc.name, "application/pdf")
        assert task_id is not None
        print(f"✓ Created task: {task_id}")
        client.upload_file(task_id, test_doc)
        print(f"✓ Uploaded file: {test_doc.name}")

        # Start with fast parameters
        success = client.start_task(task_id, pp_structure_params=fast_params)
        assert success
        print("✓ Started processing with FAST preset")

        # Wait for completion
        result = client.wait_for_completion(task_id, timeout=180)
        assert result is not None
        assert result['status'] == 'completed'
        print(f"✓ Task completed in {result.get('processing_time_ms', 0) / 1000:.2f}s")

        # Verify results
        result_json = client.download_result_json(task_id)
        assert result_json is not None
        print("✓ Results verified (fast preset)")

    def test_compare_default_vs_custom_params(self, client: TestClient):
        """Compare results between default and custom parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        print("\n=== Comparing Default vs Custom Parameters ===")
        print(f"Document: {test_doc.name}\n")

        # Test 1: Default parameters
        task_id_default = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_default, test_doc)
        client.start_task(task_id_default, pp_structure_params=None)
        result_default = client.wait_for_completion(task_id_default, timeout=180)
        assert result_default and result_default['status'] == 'completed'
        result_json_default = client.download_result_json(task_id_default)
        assert result_json_default is not None
        time_default = result_default.get('processing_time_ms', 0) / 1000

        # Count elements
        elements_default = 0
        if 'text_regions' in result_json_default:
            elements_default = len(result_json_default['text_regions'])
        elif 'elements' in result_json_default:
            elements_default = len(result_json_default['elements'])
        print("DEFAULT PARAMS:")
        print(f"  Processing time: {time_default:.2f}s")
        print(f"  Elements detected: {elements_default}")

        # Test 2: Custom (lower-threshold) parameters
        custom_params = {
            "layout_detection_threshold": 0.15,
            "text_det_thresh": 0.15
        }
        task_id_custom = client.create_task(test_doc.name, "application/pdf")
        client.upload_file(task_id_custom, test_doc)
        client.start_task(task_id_custom, pp_structure_params=custom_params)
        result_custom = client.wait_for_completion(task_id_custom, timeout=180)
        assert result_custom and result_custom['status'] == 'completed'
        result_json_custom = client.download_result_json(task_id_custom)
        assert result_json_custom is not None
        time_custom = result_custom.get('processing_time_ms', 0) / 1000

        # Count elements
        elements_custom = 0
        if 'text_regions' in result_json_custom:
            elements_custom = len(result_json_custom['text_regions'])
        elif 'elements' in result_json_custom:
            elements_custom = len(result_json_custom['elements'])
        print("\nCUSTOM PARAMS (lower thresholds):")
        print(f"  Processing time: {time_custom:.2f}s")
        print(f"  Elements detected: {elements_custom}")

        print("\nDIFFERENCE:")
        print(f"  Time delta: {abs(time_custom - time_default):.2f}s")
        print(f"  Element delta: {elements_custom - elements_default:+d} (custom vs default)")

        # Both should complete successfully
        assert result_default['status'] == 'completed'
        assert result_custom['status'] == 'completed'
        # Lower thresholds are expected to detect more elements, but this is not
        # guaranteed for every document, so it is reported rather than asserted.
        print("\n✓ Comparison complete")

@pytest.mark.e2e
@pytest.mark.slow
class TestPPStructureParamsPerformance:
    """Performance tests for PP-StructureV3 parameters"""

    def test_parameter_initialization_overhead(self, client: TestClient):
        """Measure overhead of creating an engine with custom parameters"""
        test_doc = None
        for doc_path in TEST_DOCUMENTS.values():
            if Path(doc_path).exists():
                test_doc = Path(doc_path)
                break
        if not test_doc:
            pytest.skip("No test documents found")

        print("\n=== Testing Parameter Initialization Overhead ===")

        # Measure default parameters (cached engine)
        times_default = []
        for i in range(3):
            task_id = client.create_task(test_doc.name, "application/pdf")
            client.upload_file(task_id, test_doc)
            start = time.time()
            client.start_task(task_id, pp_structure_params=None)
            result = client.wait_for_completion(task_id, timeout=180)
            end = time.time()
            if result and result['status'] == 'completed':
                times_default.append(end - start)
                print(f"  Default run {i + 1}: {end - start:.2f}s")
        avg_default = sum(times_default) / len(times_default) if times_default else 0

        # Measure custom params (no cache)
        times_custom = []
        custom_params = {"layout_detection_threshold": 0.15}
        for i in range(3):
            task_id = client.create_task(test_doc.name, "application/pdf")
            client.upload_file(task_id, test_doc)
            start = time.time()
            client.start_task(task_id, pp_structure_params=custom_params)
            result = client.wait_for_completion(task_id, timeout=180)
            end = time.time()
            if result and result['status'] == 'completed':
                times_custom.append(end - start)
                print(f"  Custom run {i + 1}: {end - start:.2f}s")
        avg_custom = sum(times_custom) / len(times_custom) if times_custom else 0

        print("\nRESULTS:")
        print(f"  Average time (default): {avg_default:.2f}s")
        print(f"  Average time (custom): {avg_custom:.2f}s")

        # Only compute the percentage when a default run completed (avoids
        # division by zero); overhead should stay within a loose bound (< 50%)
        if avg_default > 0:
            overhead = avg_custom - avg_default
            overhead_percent = overhead / avg_default * 100
            print(f"  Overhead: {overhead:.2f}s ({overhead_percent:.1f}%)")
            assert overhead_percent < 50, f"Custom parameter overhead too high: {overhead_percent:.1f}%"
            print("✓ Overhead within acceptable range")

if __name__ == '__main__':
    # Run with: pytest backend/tests/archived/test_ppstructure_params_e2e.py -v -s -m e2e
    pytest.main([__file__, '-v', '-s', '-m', 'e2e'])