""" End-to-end tests for PDF layout restoration (Phase 1-3). Tests verify: - Task 1.3: Image rendering in PDF output - Task 2.4: Table rendering in PDF output - Task 4.4: Track-specific rendering quality Run with: pytest backend/tests/e2e/test_pdf_layout_restoration.py -v -s """ import pytest import requests import time import os from pathlib import Path from typing import Optional import json # Configuration _default_backend_port = os.getenv("BACKEND_PORT", "8000") _default_base_url = f"http://localhost:{_default_backend_port}" _api_base = os.getenv("TOOL_OCR_E2E_API_BASE_URL", _default_base_url).rstrip("/") API_BASE_URL = f"{_api_base}/api/v2" DEMO_DOCS_PATH = Path( os.getenv("TOOL_OCR_DEMO_DOCS_DIR") or (Path(__file__).resolve().parents[3] / "demo_docs") ) # Test credentials must be provided via environment variables TEST_USERNAME = os.getenv("TOOL_OCR_E2E_USERNAME") TEST_PASSWORD = os.getenv("TOOL_OCR_E2E_PASSWORD") class TestBase: """Base class for layout restoration tests.""" @pytest.fixture(scope="class") def auth_token(self): """Authenticate and get access token.""" if not TEST_USERNAME or not TEST_PASSWORD: pytest.skip("Set TOOL_OCR_E2E_USERNAME and TOOL_OCR_E2E_PASSWORD to run E2E tests") response = requests.post( f"{API_BASE_URL}/auth/login", json={ "username": TEST_USERNAME, "password": TEST_PASSWORD } ) if response.status_code != 200: pytest.skip(f"Authentication failed: {response.text}") data = response.json() return data["access_token"] @pytest.fixture def headers(self, auth_token): """Get authorization headers.""" return {"Authorization": f"Bearer {auth_token}"} def wait_for_task_completion( self, task_id: str, headers: dict, timeout: int = 120, poll_interval: int = 2 ) -> dict: """Wait for task to complete or fail.""" start_time = time.time() while time.time() - start_time < timeout: response = requests.get( f"{API_BASE_URL}/tasks/{task_id}", headers=headers ) if response.status_code != 200: raise Exception(f"Failed to get task status: {response.text}") task = response.json() status = task.get("status") if status == "completed": return task elif status == "failed": raise Exception(f"Task failed: {task.get('error_message')}") time.sleep(poll_interval) raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") def upload_and_process( self, file_path: Path, headers: dict, force_track: Optional[str] = None ) -> str: """Upload file and start processing. Returns task_id.""" # Upload file with open(file_path, "rb") as f: files = {"file": (file_path.name, f)} response = requests.post( f"{API_BASE_URL}/upload", files=files, headers=headers ) if response.status_code != 200: raise Exception(f"Upload failed: {response.text}") upload_result = response.json() task_id = upload_result["task_id"] # Start processing params = {"use_dual_track": True} if force_track: params["force_track"] = force_track response = requests.post( f"{API_BASE_URL}/tasks/{task_id}/start", headers=headers, params=params ) if response.status_code != 200: raise Exception(f"Start processing failed: {response.text}") return task_id def download_pdf(self, task_id: str, headers: dict, output_path: Path): """Download generated PDF.""" response = requests.get( f"{API_BASE_URL}/tasks/{task_id}/download/pdf", headers=headers ) if response.status_code != 200: raise Exception(f"PDF download failed: {response.text}") # Save PDF for inspection output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "wb") as f: f.write(response.content) return output_path def get_unified_document(self, task_id: str, headers: dict) -> dict: """Get UnifiedDocument JSON.""" response = requests.get( f"{API_BASE_URL}/tasks/{task_id}/download/unified", headers=headers ) if response.status_code != 200: raise Exception(f"UnifiedDocument download failed: {response.text}") return response.json() class TestImageRendering(TestBase): """Task 1.3: Test image rendering in PDF output.""" def test_1_3_1_ocr_track_image_rendering(self, headers): """Test 1.3.1: Verify images appear in OCR track PDF output.""" # Use scan.pdf which should have images detected by OCR file_path = DEMO_DOCS_PATH / "scan.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 1.3.1] OCR Track Image Rendering") print(f"Processing: {file_path.name}") # Upload and process with OCR track task_id = self.upload_and_process(file_path, headers, force_track="ocr") print(f"Task ID: {task_id}") # Wait for completion task = self.wait_for_task_completion(task_id, headers, timeout=180) assert task["status"] == "completed" # Download PDF output_path = Path(__file__).parent / "test_output" / f"ocr_images_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) print(f"PDF saved to: {pdf_path}") # Get UnifiedDocument to check image count unified_doc = self.get_unified_document(task_id, headers) total_images = 0 for page in unified_doc.get("pages", []): for element in page.get("elements", []): if element.get("type") in ["image", "figure", "chart", "diagram"]: total_images += 1 print(f"Total images detected: {total_images}") # Verify PDF exists and has content assert pdf_path.exists() assert pdf_path.stat().st_size > 0 # Check PDF magic bytes with open(pdf_path, "rb") as f: header = f.read(4) assert header == b"%PDF", "Output is not a valid PDF" print(f"[PASS] OCR track image rendering - PDF generated with {total_images} images") def test_1_3_2_direct_track_image_rendering(self, headers): """Test 1.3.2: Verify images appear in Direct track PDF output.""" # Use edit.pdf which may contain embedded images file_path = DEMO_DOCS_PATH / "edit.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 1.3.2] Direct Track Image Rendering") print(f"Processing: {file_path.name}") # Upload and process with direct track task_id = self.upload_and_process(file_path, headers, force_track="direct") print(f"Task ID: {task_id}") # Wait for completion task = self.wait_for_task_completion(task_id, headers, timeout=120) assert task["status"] == "completed" # Download PDF output_path = Path(__file__).parent / "test_output" / f"direct_images_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) print(f"PDF saved to: {pdf_path}") # Get UnifiedDocument to check image count unified_doc = self.get_unified_document(task_id, headers) total_images = 0 for page in unified_doc.get("pages", []): for element in page.get("elements", []): if element.get("type") in ["image", "figure", "chart", "diagram"]: total_images += 1 print(f"Total images detected: {total_images}") # Verify PDF exists and has content assert pdf_path.exists() assert pdf_path.stat().st_size > 0 print(f"[PASS] Direct track image rendering - PDF generated with {total_images} images") def test_1_3_3_verify_image_paths(self, headers): """Test 1.3.3: Verify images are saved and referenced correctly.""" file_path = DEMO_DOCS_PATH / "scan.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 1.3.3] Image Path Verification") # Process with OCR track task_id = self.upload_and_process(file_path, headers, force_track="ocr") task = self.wait_for_task_completion(task_id, headers, timeout=180) assert task["status"] == "completed" # Get UnifiedDocument unified_doc = self.get_unified_document(task_id, headers) images_with_paths = [] for page in unified_doc.get("pages", []): for element in page.get("elements", []): if element.get("type") in ["image", "figure"]: content = element.get("content", {}) # Check for saved_path, path, or image_path path = (content.get("saved_path") or content.get("path") or content.get("image_path")) if path: images_with_paths.append({ "element_id": element.get("element_id"), "path": path, "type": element.get("type") }) print(f"Images with paths: {len(images_with_paths)}") for img in images_with_paths[:5]: # Print first 5 print(f" - {img['element_id']}: {img['path']}") # Verify at least some images have paths # Note: May be 0 if PP-Structure doesn't extract images from this specific PDF print(f"[INFO] Found {len(images_with_paths)} images with saved paths") print(f"[PASS] Image path verification complete") class TestTableRendering(TestBase): """Task 2.4: Test table rendering in PDF output.""" def test_2_4_1_simple_tables(self, headers): """Test 2.4.1: Verify simple tables render correctly.""" # Use a document with simple tables file_path = DEMO_DOCS_PATH / "edit.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 2.4.1] Simple Table Rendering") # Process with direct track task_id = self.upload_and_process(file_path, headers, force_track="direct") task = self.wait_for_task_completion(task_id, headers, timeout=120) assert task["status"] == "completed" # Download PDF output_path = Path(__file__).parent / "test_output" / f"simple_tables_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) # Get UnifiedDocument to count tables unified_doc = self.get_unified_document(task_id, headers) total_tables = 0 for page in unified_doc.get("pages", []): for element in page.get("elements", []): if element.get("type") == "table": total_tables += 1 print(f"Total tables detected: {total_tables}") print(f"PDF saved to: {pdf_path}") assert pdf_path.exists() print(f"[PASS] Simple table rendering - {total_tables} tables in PDF") def test_2_4_2_complex_tables(self, headers): """Test 2.4.2: Verify complex multi-column tables render correctly.""" # Use scan.pdf which may have complex tables file_path = DEMO_DOCS_PATH / "scan.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 2.4.2] Complex Table Rendering") # Process with OCR track (better for detecting tables in scanned docs) task_id = self.upload_and_process(file_path, headers, force_track="ocr") task = self.wait_for_task_completion(task_id, headers, timeout=180) assert task["status"] == "completed" # Download PDF output_path = Path(__file__).parent / "test_output" / f"complex_tables_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) # Get UnifiedDocument to check table structure unified_doc = self.get_unified_document(task_id, headers) complex_tables = [] for page in unified_doc.get("pages", []): for element in page.get("elements", []): if element.get("type") == "table": content = element.get("content", {}) rows = content.get("rows", 0) cols = content.get("cols", 0) # Consider complex if >= 3 columns or >= 5 rows if cols >= 3 or rows >= 5: complex_tables.append({ "rows": rows, "cols": cols, "element_id": element.get("element_id") }) print(f"Complex tables found: {len(complex_tables)}") for table in complex_tables[:3]: # Print first 3 print(f" - {table['element_id']}: {table['rows']}x{table['cols']}") print(f"PDF saved to: {pdf_path}") assert pdf_path.exists() print(f"[PASS] Complex table rendering - {len(complex_tables)} complex tables") def test_2_4_3_tables_both_tracks(self, headers): """Test 2.4.3: Compare table rendering between OCR and Direct tracks.""" file_path = DEMO_DOCS_PATH / "edit.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 2.4.3] Table Rendering - Both Tracks Comparison") results = {} for track in ["ocr", "direct"]: print(f"\nProcessing with {track.upper()} track...") task_id = self.upload_and_process(file_path, headers, force_track=track) task = self.wait_for_task_completion(task_id, headers, timeout=180) assert task["status"] == "completed" # Download PDF output_path = Path(__file__).parent / "test_output" / f"tables_{track}_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) # Get table count unified_doc = self.get_unified_document(task_id, headers) table_count = sum( 1 for page in unified_doc.get("pages", []) for element in page.get("elements", []) if element.get("type") == "table" ) results[track] = { "task_id": task_id, "table_count": table_count, "pdf_path": pdf_path, "pdf_size": pdf_path.stat().st_size } print(f" {track.upper()} - Tables: {table_count}, PDF size: {results[track]['pdf_size']} bytes") print(f"\nComparison:") print(f" OCR track: {results['ocr']['table_count']} tables, {results['ocr']['pdf_size']} bytes") print(f" Direct track: {results['direct']['table_count']} tables, {results['direct']['pdf_size']} bytes") # Both tracks should generate valid PDFs assert results['ocr']['pdf_path'].exists() assert results['direct']['pdf_path'].exists() print(f"[PASS] Table rendering comparison complete") class TestTrackSpecificRendering(TestBase): """Task 4.4: Test track-specific rendering quality.""" def test_4_4_1_compare_direct_with_original(self, headers): """Test 4.4.1: Compare Direct track output with original PDF.""" file_path = DEMO_DOCS_PATH / "edit.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 4.4.1] Direct Track Quality Comparison") # Process with direct track task_id = self.upload_and_process(file_path, headers, force_track="direct") task = self.wait_for_task_completion(task_id, headers, timeout=120) assert task["status"] == "completed" # Download generated PDF output_path = Path(__file__).parent / "test_output" / f"direct_quality_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) # Get metadata response = requests.get( f"{API_BASE_URL}/tasks/{task_id}/metadata", headers=headers ) metadata = response.json() if response.status_code == 200 else {} print(f"Original file: {file_path.name} ({file_path.stat().st_size} bytes)") print(f"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)") print(f"Processing track: {metadata.get('processing_track')}") print(f"Processing time: {metadata.get('processing_time_seconds', 0):.2f}s") # Verify it's Direct track assert metadata.get("processing_track") == "direct" # Get UnifiedDocument to check preservation unified_doc = self.get_unified_document(task_id, headers) stats = { "pages": len(unified_doc.get("pages", [])), "text_elements": 0, "images": 0, "tables": 0, "with_style": 0, "with_spans": 0 } for page in unified_doc.get("pages", []): for element in page.get("elements", []): el_type = element.get("type") if el_type in ["text", "paragraph", "title", "header"]: stats["text_elements"] += 1 if element.get("style"): stats["with_style"] += 1 if element.get("children"): stats["with_spans"] += 1 elif el_type in ["image", "figure"]: stats["images"] += 1 elif el_type == "table": stats["tables"] += 1 print(f"\nDocument structure:") print(f" Pages: {stats['pages']}") print(f" Text elements: {stats['text_elements']} ({stats['with_style']} with style)") print(f" Span children: {stats['with_spans']} elements") print(f" Images: {stats['images']}") print(f" Tables: {stats['tables']}") # Direct track should preserve styles assert pdf_path.exists() print(f"[PASS] Direct track quality check complete") def test_4_4_2_verify_ocr_quality(self, headers): """Test 4.4.2: Verify OCR track maintains quality.""" file_path = DEMO_DOCS_PATH / "scan.pdf" if not file_path.exists(): pytest.skip(f"Test file not found: {file_path}") print(f"\n[Test 4.4.2] OCR Track Quality Verification") # Process with OCR track task_id = self.upload_and_process(file_path, headers, force_track="ocr") task = self.wait_for_task_completion(task_id, headers, timeout=180) assert task["status"] == "completed" # Download generated PDF output_path = Path(__file__).parent / "test_output" / f"ocr_quality_{task_id}.pdf" pdf_path = self.download_pdf(task_id, headers, output_path) # Get metadata response = requests.get( f"{API_BASE_URL}/tasks/{task_id}/metadata", headers=headers ) metadata = response.json() if response.status_code == 200 else {} print(f"Original file: {file_path.name} ({file_path.stat().st_size} bytes)") print(f"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)") print(f"Processing track: {metadata.get('processing_track')}") print(f"Processing time: {metadata.get('processing_time_seconds', 0):.2f}s") # Verify it's OCR track assert metadata.get("processing_track") == "ocr" # Get UnifiedDocument unified_doc = self.get_unified_document(task_id, headers) text_regions = metadata.get("total_text_regions", 0) total_tables = metadata.get("total_tables", 0) total_images = metadata.get("total_images", 0) print(f"\nOCR results:") print(f" Text regions: {text_regions}") print(f" Tables: {total_tables}") print(f" Images: {total_images}") # OCR track should extract content assert pdf_path.exists() assert text_regions > 0 or total_images > 0, "OCR should extract some content" print(f"[PASS] OCR track quality check complete") if __name__ == "__main__": pytest.main([__file__, "-v", "-s"])