OCR/backend/tests/e2e/test_pdf_layout_restoration.py

"""
End-to-end tests for PDF layout restoration (Phase 1-3).

Tests verify:
- Task 1.3: Image rendering in PDF output
- Task 2.4: Table rendering in PDF output
- Task 4.4: Track-specific rendering quality

Run with: pytest backend/tests/e2e/test_pdf_layout_restoration.py -v -s
"""

import pytest
import requests
import time
import os
from pathlib import Path
from typing import Optional
import json

# Configuration
# Backend location: TOOL_OCR_E2E_API_BASE_URL, when set, replaces the whole
# base URL; otherwise http://localhost:<BACKEND_PORT> is used, with the port
# defaulting to "8000".
_default_backend_port = os.getenv("BACKEND_PORT", "8000")
_default_base_url = f"http://localhost:{_default_backend_port}"
# Trailing slashes are stripped so the "/api/v2" suffix joins cleanly.
_api_base = os.getenv("TOOL_OCR_E2E_API_BASE_URL", _default_base_url).rstrip("/")
API_BASE_URL = f"{_api_base}/api/v2"
# Directory holding the sample documents: TOOL_OCR_DEMO_DOCS_DIR when it is set
# and non-empty, else "demo_docs" under the directory three levels above the
# one containing this file.
DEMO_DOCS_PATH = Path(
    os.getenv("TOOL_OCR_DEMO_DOCS_DIR")
    or (Path(__file__).resolve().parents[3] / "demo_docs")
)

# Test credentials must be provided via environment variables.
# Both are None when unset, in which case the auth fixture skips the tests.
TEST_USERNAME = os.getenv("TOOL_OCR_E2E_USERNAME")
TEST_PASSWORD = os.getenv("TOOL_OCR_E2E_PASSWORD")


class TestBase:
    """Base class for layout restoration tests.

    Bundles the authentication fixtures and the HTTP helpers (upload, status
    polling, downloads) that the concrete test classes share.
    """

    # Upper bound in seconds for every single HTTP call. Without a timeout,
    # requests waits indefinitely on an unresponsive backend and the whole
    # test run hangs instead of failing.
    REQUEST_TIMEOUT = 60

    @pytest.fixture(scope="class")
    def auth_token(self):
        """Authenticate and get access token.

        Skips the requesting tests when the credentials are not configured or
        when the login request does not answer with HTTP 200.
        """
        if not TEST_USERNAME or not TEST_PASSWORD:
            pytest.skip("Set TOOL_OCR_E2E_USERNAME and TOOL_OCR_E2E_PASSWORD to run E2E tests")

        response = requests.post(
            f"{API_BASE_URL}/auth/login",
            json={
                "username": TEST_USERNAME,
                "password": TEST_PASSWORD,
            },
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            pytest.skip(f"Authentication failed: {response.text}")

        data = response.json()
        return data["access_token"]

    @pytest.fixture
    def headers(self, auth_token):
        """Get authorization headers."""
        return {"Authorization": f"Bearer {auth_token}"}

    def wait_for_task_completion(
        self,
        task_id: str,
        headers: dict,
        timeout: int = 120,
        poll_interval: int = 2
    ) -> dict:
        """Wait for task to complete or fail.

        Args:
            task_id: Identifier of the task to poll.
            headers: Authorization headers sent with every status request.
            timeout: Overall number of seconds to keep polling.
            poll_interval: Seconds to sleep between two status requests.

        Returns:
            The task JSON once its status is "completed".

        Raises:
            Exception: If a status request does not return HTTP 200 or the
                task reports the status "failed".
            TimeoutError: If the task is not completed within ``timeout``.
        """
        # A monotonic clock keeps the deadline unaffected by wall-clock changes.
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            response = requests.get(
                f"{API_BASE_URL}/tasks/{task_id}",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                raise Exception(f"Failed to get task status: {response.text}")

            task = response.json()
            status = task.get("status")

            if status == "completed":
                return task
            if status == "failed":
                raise Exception(f"Task failed: {task.get('error_message')}")

            time.sleep(poll_interval)

        raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

    def upload_and_process(
        self,
        file_path: Path,
        headers: dict,
        force_track: Optional[str] = None
    ) -> str:
        """Upload file and start processing. Returns task_id.

        Args:
            file_path: Document to upload.
            headers: Authorization headers for both requests.
            force_track: Optional value for the ``force_track`` query
                parameter of the start request; omitted when falsy.

        Raises:
            Exception: If the upload or the start request does not return
                HTTP 200.
        """
        # Upload file
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{API_BASE_URL}/upload",
                files={"file": (file_path.name, f)},
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )

        if response.status_code != 200:
            raise Exception(f"Upload failed: {response.text}")

        task_id = response.json()["task_id"]

        # Start processing
        params = {"use_dual_track": True}
        if force_track:
            params["force_track"] = force_track

        response = requests.post(
            f"{API_BASE_URL}/tasks/{task_id}/start",
            headers=headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise Exception(f"Start processing failed: {response.text}")

        return task_id

    def download_pdf(self, task_id: str, headers: dict, output_path: Path) -> Path:
        """Download generated PDF.

        The response body is written to ``output_path`` (parent directories
        are created as needed) and that path is returned.

        Raises:
            Exception: If the download request does not return HTTP 200.
        """
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/download/pdf",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise Exception(f"PDF download failed: {response.text}")

        # Save PDF for inspection
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(response.content)

        return output_path

    def get_unified_document(self, task_id: str, headers: dict) -> dict:
        """Get UnifiedDocument JSON.

        Raises:
            Exception: If the download request does not return HTTP 200.
        """
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/download/unified",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise Exception(f"UnifiedDocument download failed: {response.text}")

        return response.json()


class TestImageRendering(TestBase):
    """Task 1.3: Test image rendering in PDF output."""

    # Element types counted as images when summarising a UnifiedDocument.
    _IMAGE_TYPES = ("image", "figure", "chart", "diagram")

    @staticmethod
    def _count_elements(unified_doc: dict, element_types) -> int:
        """Count the elements on all pages whose "type" is in *element_types*."""
        return sum(
            1
            for page in unified_doc.get("pages", [])
            for element in page.get("elements", [])
            if element.get("type") in element_types
        )

    @staticmethod
    def _assert_is_pdf(pdf_path: Path) -> None:
        """Assert that *pdf_path* exists, is non-empty and starts with b"%PDF"."""
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        # Check PDF magic bytes
        with open(pdf_path, "rb") as f:
            assert f.read(4) == b"%PDF", "Output is not a valid PDF"

    def test_1_3_1_ocr_track_image_rendering(self, headers):
        """Test 1.3.1: Verify images appear in OCR track PDF output."""
        # Use scan.pdf which should have images detected by OCR
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.1] OCR Track Image Rendering")
        print(f"Processing: {file_path.name}")

        # Upload and process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        print(f"Task ID: {task_id}")

        # Wait for completion
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"ocr_images_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)
        print(f"PDF saved to: {pdf_path}")

        # Get UnifiedDocument to check image count
        unified_doc = self.get_unified_document(task_id, headers)
        total_images = self._count_elements(unified_doc, self._IMAGE_TYPES)
        print(f"Total images detected: {total_images}")

        # Verify PDF exists, has content and carries the PDF signature
        self._assert_is_pdf(pdf_path)

        print(f"[PASS] OCR track image rendering - PDF generated with {total_images} images")

    def test_1_3_2_direct_track_image_rendering(self, headers):
        """Test 1.3.2: Verify images appear in Direct track PDF output."""
        # Use edit.pdf which may contain embedded images
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.2] Direct Track Image Rendering")
        print(f"Processing: {file_path.name}")

        # Upload and process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        print(f"Task ID: {task_id}")

        # Wait for completion
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"direct_images_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)
        print(f"PDF saved to: {pdf_path}")

        # Get UnifiedDocument to check image count
        unified_doc = self.get_unified_document(task_id, headers)
        total_images = self._count_elements(unified_doc, self._IMAGE_TYPES)
        print(f"Total images detected: {total_images}")

        # Same validity checks as the OCR track test, including the PDF
        # signature, so both tracks are held to one standard.
        self._assert_is_pdf(pdf_path)

        print(f"[PASS] Direct track image rendering - PDF generated with {total_images} images")

    def test_1_3_3_verify_image_paths(self, headers):
        """Test 1.3.3: Verify images are saved and referenced correctly."""
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.3] Image Path Verification")

        # Process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Get UnifiedDocument
        unified_doc = self.get_unified_document(task_id, headers)

        images_with_paths = []
        for page in unified_doc.get("pages", []):
            for element in page.get("elements", []):
                if element.get("type") not in ("image", "figure"):
                    continue

                # "content" may be present but null (or not a mapping) in the
                # JSON; such elements simply carry no path.
                content = element.get("content")
                if not isinstance(content, dict):
                    continue

                # Check for saved_path, path, or image_path
                path = (
                    content.get("saved_path")
                    or content.get("path")
                    or content.get("image_path")
                )

                if path:
                    images_with_paths.append({
                        "element_id": element.get("element_id"),
                        "path": path,
                        "type": element.get("type"),
                    })

        print(f"Images with paths: {len(images_with_paths)}")
        for img in images_with_paths[:5]:  # Print first 5
            print(f"  - {img['element_id']}: {img['path']}")

        # Informational only: no minimum is asserted because the count
        # may be 0 if PP-Structure doesn't extract images from this specific PDF
        print(f"[INFO] Found {len(images_with_paths)} images with saved paths")
        print("[PASS] Image path verification complete")


class TestTableRendering(TestBase):
    """Task 2.4: Test table rendering in PDF output."""

    @staticmethod
    def _iter_tables(unified_doc: dict):
        """Yield every element of type "table" from all pages of *unified_doc*."""
        for page in unified_doc.get("pages", []):
            for element in page.get("elements", []):
                if element.get("type") == "table":
                    yield element

    def test_2_4_1_simple_tables(self, headers):
        """Test 2.4.1: Verify simple tables render correctly."""
        # Use a document with simple tables
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.1] Simple Table Rendering")

        # Process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"simple_tables_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get UnifiedDocument to count tables
        unified_doc = self.get_unified_document(task_id, headers)
        total_tables = sum(1 for _ in self._iter_tables(unified_doc))

        print(f"Total tables detected: {total_tables}")
        print(f"PDF saved to: {pdf_path}")

        assert pdf_path.exists()
        print(f"[PASS] Simple table rendering - {total_tables} tables in PDF")

    def test_2_4_2_complex_tables(self, headers):
        """Test 2.4.2: Verify complex multi-column tables render correctly."""
        # Use scan.pdf which may have complex tables
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.2] Complex Table Rendering")

        # Process with OCR track (better for detecting tables in scanned docs)
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"complex_tables_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get UnifiedDocument to check table structure
        unified_doc = self.get_unified_document(task_id, headers)

        complex_tables = []
        for element in self._iter_tables(unified_doc):
            # A null or non-mapping "content", or null "rows"/"cols", counts as
            # an empty table instead of crashing the comparison below.
            content = element.get("content")
            if not isinstance(content, dict):
                content = {}
            rows = content.get("rows") or 0
            cols = content.get("cols") or 0

            # Consider complex if >= 3 columns or >= 5 rows
            if cols >= 3 or rows >= 5:
                complex_tables.append({
                    "rows": rows,
                    "cols": cols,
                    "element_id": element.get("element_id"),
                })

        print(f"Complex tables found: {len(complex_tables)}")
        for table in complex_tables[:3]:  # Print first 3
            print(f"  - {table['element_id']}: {table['rows']}x{table['cols']}")

        print(f"PDF saved to: {pdf_path}")
        assert pdf_path.exists()
        print(f"[PASS] Complex table rendering - {len(complex_tables)} complex tables")

    def test_2_4_3_tables_both_tracks(self, headers):
        """Test 2.4.3: Compare table rendering between OCR and Direct tracks."""
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.3] Table Rendering - Both Tracks Comparison")

        results = {}

        for track in ["ocr", "direct"]:
            print(f"\nProcessing with {track.upper()} track...")

            task_id = self.upload_and_process(file_path, headers, force_track=track)
            task = self.wait_for_task_completion(task_id, headers, timeout=180)
            assert task["status"] == "completed"

            # Download PDF
            output_path = Path(__file__).parent / "test_output" / f"tables_{track}_{task_id}.pdf"
            pdf_path = self.download_pdf(task_id, headers, output_path)

            # Get table count
            unified_doc = self.get_unified_document(task_id, headers)
            table_count = sum(1 for _ in self._iter_tables(unified_doc))

            results[track] = {
                "task_id": task_id,
                "table_count": table_count,
                "pdf_path": pdf_path,
                "pdf_size": pdf_path.stat().st_size,
            }

            print(f"  {track.upper()} - Tables: {table_count}, PDF size: {results[track]['pdf_size']} bytes")

        print("\nComparison:")
        print(f"  OCR track:    {results['ocr']['table_count']} tables, {results['ocr']['pdf_size']} bytes")
        print(f"  Direct track: {results['direct']['table_count']} tables, {results['direct']['pdf_size']} bytes")

        # Both tracks should generate valid PDFs
        assert results['ocr']['pdf_path'].exists()
        assert results['direct']['pdf_path'].exists()

        print("[PASS] Table rendering comparison complete")


class TestTrackSpecificRendering(TestBase):
    """Task 4.4: Test track-specific rendering quality."""

    # Upper bound in seconds for the metadata request, so an unresponsive
    # backend fails the test instead of hanging it.
    _METADATA_TIMEOUT = 60

    def _get_metadata(self, task_id: str, headers: dict) -> dict:
        """Fetch the metadata JSON of a task.

        Fails the test immediately when the endpoint does not return HTTP 200;
        the processing-track assertions that follow could not pass without
        metadata anyway, and this reports the actual cause.
        """
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/metadata",
            headers=headers,
            timeout=self._METADATA_TIMEOUT,
        )
        assert response.status_code == 200, (
            f"Metadata request failed ({response.status_code}): {response.text}"
        )
        return response.json()

    def test_4_4_1_compare_direct_with_original(self, headers):
        """Test 4.4.1: Compare Direct track output with original PDF."""
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 4.4.1] Direct Track Quality Comparison")

        # Process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download generated PDF
        output_path = Path(__file__).parent / "test_output" / f"direct_quality_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get metadata
        metadata = self._get_metadata(task_id, headers)
        # A null value must not break the ":.2f" format below.
        processing_time = metadata.get("processing_time_seconds") or 0

        print(f"Original file: {file_path.name} ({file_path.stat().st_size} bytes)")
        print(f"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)")
        print(f"Processing track: {metadata.get('processing_track')}")
        print(f"Processing time: {processing_time:.2f}s")

        # Verify it's Direct track
        assert metadata.get("processing_track") == "direct"

        # Get UnifiedDocument to check preservation
        unified_doc = self.get_unified_document(task_id, headers)

        stats = {
            "pages": len(unified_doc.get("pages", [])),
            "text_elements": 0,
            "images": 0,
            "tables": 0,
            "with_style": 0,
            "with_spans": 0,
        }

        for page in unified_doc.get("pages", []):
            for element in page.get("elements", []):
                el_type = element.get("type")

                if el_type in ("text", "paragraph", "title", "header"):
                    stats["text_elements"] += 1
                    if element.get("style"):
                        stats["with_style"] += 1
                    if element.get("children"):
                        stats["with_spans"] += 1
                elif el_type in ("image", "figure"):
                    stats["images"] += 1
                elif el_type == "table":
                    stats["tables"] += 1

        print("\nDocument structure:")
        print(f"  Pages: {stats['pages']}")
        print(f"  Text elements: {stats['text_elements']} ({stats['with_style']} with style)")
        print(f"  Span children: {stats['with_spans']} elements")
        print(f"  Images: {stats['images']}")
        print(f"  Tables: {stats['tables']}")

        # The structure statistics above are informational; only the presence
        # of the generated PDF is asserted.
        assert pdf_path.exists()
        print("[PASS] Direct track quality check complete")

    def test_4_4_2_verify_ocr_quality(self, headers):
        """Test 4.4.2: Verify OCR track maintains quality."""
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 4.4.2] OCR Track Quality Verification")

        # Process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download generated PDF
        output_path = Path(__file__).parent / "test_output" / f"ocr_quality_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get metadata
        metadata = self._get_metadata(task_id, headers)
        # A null value must not break the ":.2f" format below.
        processing_time = metadata.get("processing_time_seconds") or 0

        print(f"Original file: {file_path.name} ({file_path.stat().st_size} bytes)")
        print(f"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)")
        print(f"Processing track: {metadata.get('processing_track')}")
        print(f"Processing time: {processing_time:.2f}s")

        # Verify it's OCR track
        assert metadata.get("processing_track") == "ocr"

        # The UnifiedDocument itself is not inspected here; the call is kept
        # because it raises when the download endpoint does not return 200.
        self.get_unified_document(task_id, headers)

        # Null counts are treated as 0 so the comparison below cannot raise.
        text_regions = metadata.get("total_text_regions") or 0
        total_tables = metadata.get("total_tables") or 0
        total_images = metadata.get("total_images") or 0

        print("\nOCR results:")
        print(f"  Text regions: {text_regions}")
        print(f"  Tables: {total_tables}")
        print(f"  Images: {total_images}")

        # OCR track should extract content
        assert pdf_path.exists()
        assert text_regions > 0 or total_images > 0, "OCR should extract some content"

        print("[PASS] OCR track quality check complete")


if __name__ == "__main__":
    # pytest.main() returns the run's exit code; propagate it so that running
    # this file as a script exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))