- Add debug_font_path, demo_docs_dir, e2e_api_base_url to config.py - Fix hardcoded paths in pp_structure_debug.py, create_demo_images.py - Fix hardcoded paths in test files - Update .env.example with new configuration options - Update .gitignore to exclude AI development files (.claude/, openspec/, AGENTS.md, CLAUDE.md) - Add production startup script (start-prod.sh) - Add README.md with project documentation - Add 1panel Docker deployment files (docker-compose.yml, Dockerfiles, nginx.conf) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
560 lines
20 KiB
Python
560 lines
20 KiB
Python
"""
End-to-end tests for PDF layout restoration (Phase 1-3).

Tests verify:
- Task 1.3: Image rendering in PDF output
- Task 2.4: Table rendering in PDF output
- Task 4.4: Track-specific rendering quality

Run with: pytest backend/tests/e2e/test_pdf_layout_restoration.py -v -s
"""
|
|
|
|
import pytest
|
|
import requests
|
|
import time
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import json
|
|
|
|
# Configuration
# Every setting falls back with `or`, so an environment variable that is set
# but blank behaves exactly like one that is missing.
_default_backend_port = os.getenv("BACKEND_PORT") or "8000"
_default_base_url = f"http://localhost:{_default_backend_port}"
_api_base = (os.getenv("TOOL_OCR_E2E_API_BASE_URL") or _default_base_url).rstrip("/")
API_BASE_URL = f"{_api_base}/api/v2"

# Default demo directory: "demo_docs" inside the directory three levels above
# this file's own directory (the project root when the file lives at
# backend/tests/e2e/). If the file is located somewhere shallower, fall back
# to the file's own directory instead of raising IndexError at import time.
_this_file = Path(__file__).resolve()
_docs_root = _this_file.parents[3] if len(_this_file.parents) > 3 else _this_file.parent
DEMO_DOCS_PATH = Path(os.getenv("TOOL_OCR_DEMO_DOCS_DIR") or (_docs_root / "demo_docs"))

# Test credentials must be provided via environment variables
TEST_USERNAME = os.getenv("TOOL_OCR_E2E_USERNAME")
TEST_PASSWORD = os.getenv("TOOL_OCR_E2E_PASSWORD")
|
|
|
|
|
|
class TestBase:
    """Base class for layout restoration tests.

    Supplies the authentication fixtures and the HTTP helpers (upload, start,
    poll, download) shared by the concrete test classes. Every helper talks
    to the API rooted at ``API_BASE_URL``.
    """

    # Timeout in seconds applied to every HTTP call. Without an explicit
    # timeout ``requests`` waits indefinitely, so an unresponsive backend
    # would hang the whole test run.
    REQUEST_TIMEOUT = 60

    @pytest.fixture(scope="class")
    def auth_token(self):
        """Authenticate and get access token.

        Skips the requesting tests (instead of failing them) when the
        credentials are not configured or the login request is rejected.
        """
        if not TEST_USERNAME or not TEST_PASSWORD:
            pytest.skip("Set TOOL_OCR_E2E_USERNAME and TOOL_OCR_E2E_PASSWORD to run E2E tests")

        response = requests.post(
            f"{API_BASE_URL}/auth/login",
            json={
                "username": TEST_USERNAME,
                "password": TEST_PASSWORD,
            },
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            pytest.skip(f"Authentication failed: {response.text}")

        return response.json()["access_token"]

    @pytest.fixture
    def headers(self, auth_token):
        """Get authorization headers."""
        return {"Authorization": f"Bearer {auth_token}"}

    def wait_for_task_completion(
        self,
        task_id: str,
        headers: dict,
        timeout: int = 120,
        poll_interval: int = 2
    ) -> dict:
        """Wait for task to complete or fail.

        Polls ``GET /tasks/{task_id}`` every ``poll_interval`` seconds.

        Args:
            task_id: Identifier returned by the upload endpoint.
            headers: Request headers carrying the bearer token.
            timeout: Maximum number of seconds to keep polling.
            poll_interval: Seconds to sleep between two polls.

        Returns:
            The task JSON once its ``status`` is ``"completed"``.

        Raises:
            RuntimeError: The status request did not return 200, or the task
                reported ``status == "failed"``.
            TimeoutError: The task did not complete within ``timeout``.
        """
        # A monotonic clock keeps the deadline immune to wall-clock jumps.
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            response = requests.get(
                f"{API_BASE_URL}/tasks/{task_id}",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                raise RuntimeError(f"Failed to get task status: {response.text}")

            task = response.json()
            status = task.get("status")

            if status == "completed":
                return task
            if status == "failed":
                raise RuntimeError(f"Task failed: {task.get('error_message')}")

            time.sleep(poll_interval)

        raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

    def upload_and_process(
        self,
        file_path: Path,
        headers: dict,
        force_track: Optional[str] = None
    ) -> str:
        """Upload file and start processing. Returns task_id.

        Args:
            file_path: Document to upload.
            headers: Request headers carrying the bearer token.
            force_track: Optional track name forwarded as the ``force_track``
                query parameter; omitted from the request when falsy.

        Raises:
            RuntimeError: The upload or the start request did not return 200.
        """
        # Upload file
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{API_BASE_URL}/upload",
                files={"file": (file_path.name, f)},
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )

        if response.status_code != 200:
            raise RuntimeError(f"Upload failed: {response.text}")

        task_id = response.json()["task_id"]

        # Start processing
        params = {"use_dual_track": True}
        if force_track:
            params["force_track"] = force_track

        response = requests.post(
            f"{API_BASE_URL}/tasks/{task_id}/start",
            headers=headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise RuntimeError(f"Start processing failed: {response.text}")

        return task_id

    def download_pdf(self, task_id: str, headers: dict, output_path: Path) -> Path:
        """Download generated PDF.

        The response body is written to ``output_path`` (parent directories
        are created on demand) so the file can be inspected after the run.

        Returns:
            ``output_path``.

        Raises:
            RuntimeError: The download request did not return 200.
        """
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/download/pdf",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise RuntimeError(f"PDF download failed: {response.text}")

        # Save PDF for inspection
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(response.content)

        return output_path

    def get_unified_document(self, task_id: str, headers: dict) -> dict:
        """Get UnifiedDocument JSON.

        Raises:
            RuntimeError: The download request did not return 200.
        """
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/download/unified",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            raise RuntimeError(f"UnifiedDocument download failed: {response.text}")

        return response.json()
|
|
|
|
|
|
class TestImageRendering(TestBase):
    """Task 1.3: Test image rendering in PDF output."""

    # Element types counted as images when summarising a UnifiedDocument.
    IMAGE_TYPES = ("image", "figure", "chart", "diagram")

    @staticmethod
    def _iter_elements(unified_doc: dict):
        """Yield every element of every page of a UnifiedDocument dict."""
        # `or []` tolerates keys that are present but null in the JSON.
        for page in unified_doc.get("pages") or []:
            yield from page.get("elements") or []

    @classmethod
    def _count_images(cls, unified_doc: dict) -> int:
        """Return how many elements have a type listed in ``IMAGE_TYPES``."""
        return sum(
            1 for element in cls._iter_elements(unified_doc)
            if element.get("type") in cls.IMAGE_TYPES
        )

    @staticmethod
    def _assert_valid_pdf(pdf_path: Path) -> None:
        """Assert that ``pdf_path`` exists, is non-empty and starts with ``%PDF``."""
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        # Check PDF magic bytes
        with open(pdf_path, "rb") as f:
            header = f.read(4)
        assert header == b"%PDF", "Output is not a valid PDF"

    def test_1_3_1_ocr_track_image_rendering(self, headers):
        """Test 1.3.1: Verify images appear in OCR track PDF output."""
        # Use scan.pdf which should have images detected by OCR
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.1] OCR Track Image Rendering")
        print(f"Processing: {file_path.name}")

        # Upload and process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        print(f"Task ID: {task_id}")

        # Wait for completion
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"ocr_images_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)
        print(f"PDF saved to: {pdf_path}")

        # Get UnifiedDocument to check image count
        unified_doc = self.get_unified_document(task_id, headers)
        total_images = self._count_images(unified_doc)
        print(f"Total images detected: {total_images}")

        # Verify PDF exists, has content and carries the PDF signature
        self._assert_valid_pdf(pdf_path)

        print(f"[PASS] OCR track image rendering - PDF generated with {total_images} images")

    def test_1_3_2_direct_track_image_rendering(self, headers):
        """Test 1.3.2: Verify images appear in Direct track PDF output."""
        # Use edit.pdf which may contain embedded images
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.2] Direct Track Image Rendering")
        print(f"Processing: {file_path.name}")

        # Upload and process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        print(f"Task ID: {task_id}")

        # Wait for completion
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"direct_images_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)
        print(f"PDF saved to: {pdf_path}")

        # Get UnifiedDocument to check image count
        unified_doc = self.get_unified_document(task_id, headers)
        total_images = self._count_images(unified_doc)
        print(f"Total images detected: {total_images}")

        # Same validity checks as the OCR-track test, including magic bytes
        self._assert_valid_pdf(pdf_path)

        print(f"[PASS] Direct track image rendering - PDF generated with {total_images} images")

    def test_1_3_3_verify_image_paths(self, headers):
        """Test 1.3.3: Verify images are saved and referenced correctly."""
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 1.3.3] Image Path Verification")

        # Process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Get UnifiedDocument
        unified_doc = self.get_unified_document(task_id, headers)

        images_with_paths = []
        for element in self._iter_elements(unified_doc):
            if element.get("type") not in ("image", "figure"):
                continue

            content = element.get("content", {})
            if not isinstance(content, dict):
                # Nothing to look up when the content is not a mapping.
                continue

            # Check for saved_path, path, or image_path
            path = (
                content.get("saved_path")
                or content.get("path")
                or content.get("image_path")
            )
            if path:
                images_with_paths.append({
                    "element_id": element.get("element_id"),
                    "path": path,
                    "type": element.get("type"),
                })

        print(f"Images with paths: {len(images_with_paths)}")
        for img in images_with_paths[:5]:  # Print first 5
            print(f"  - {img['element_id']}: {img['path']}")

        # Informational only: the count is reported, not asserted.
        # Note: May be 0 if PP-Structure doesn't extract images from this specific PDF
        print(f"[INFO] Found {len(images_with_paths)} images with saved paths")
        print("[PASS] Image path verification complete")
|
|
|
|
|
|
class TestTableRendering(TestBase):
    """Task 2.4: Test table rendering in PDF output."""

    @staticmethod
    def _iter_tables(unified_doc: dict):
        """Yield every element whose type is ``"table"`` from a UnifiedDocument dict."""
        # `or []` tolerates keys that are present but null in the JSON.
        for page in unified_doc.get("pages") or []:
            for element in page.get("elements") or []:
                if element.get("type") == "table":
                    yield element

    @staticmethod
    def _assert_pdf_written(pdf_path: Path) -> None:
        """Assert that the downloaded PDF exists and is not empty."""
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

    def test_2_4_1_simple_tables(self, headers):
        """Test 2.4.1: Verify simple tables render correctly."""
        # Use a document with simple tables
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.1] Simple Table Rendering")

        # Process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"simple_tables_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get UnifiedDocument to count tables
        unified_doc = self.get_unified_document(task_id, headers)
        total_tables = sum(1 for _ in self._iter_tables(unified_doc))

        print(f"Total tables detected: {total_tables}")
        print(f"PDF saved to: {pdf_path}")

        self._assert_pdf_written(pdf_path)
        print(f"[PASS] Simple table rendering - {total_tables} tables in PDF")

    def test_2_4_2_complex_tables(self, headers):
        """Test 2.4.2: Verify complex multi-column tables render correctly."""
        # Use scan.pdf which may have complex tables
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.2] Complex Table Rendering")

        # Process with OCR track (better for detecting tables in scanned docs)
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download PDF
        output_path = Path(__file__).parent / "test_output" / f"complex_tables_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get UnifiedDocument to check table structure
        unified_doc = self.get_unified_document(task_id, headers)

        complex_tables = []
        for element in self._iter_tables(unified_doc):
            content = element.get("content", {})
            if not isinstance(content, dict):
                # No row/column counts to read when the content is not a mapping.
                continue

            # Null counts are treated as 0 so the comparison below cannot
            # raise TypeError.
            rows = content.get("rows") or 0
            cols = content.get("cols") or 0

            # Consider complex if >= 3 columns or >= 5 rows
            if cols >= 3 or rows >= 5:
                complex_tables.append({
                    "rows": rows,
                    "cols": cols,
                    "element_id": element.get("element_id"),
                })

        print(f"Complex tables found: {len(complex_tables)}")
        for table in complex_tables[:3]:  # Print first 3
            print(f"  - {table['element_id']}: {table['rows']}x{table['cols']}")

        print(f"PDF saved to: {pdf_path}")
        self._assert_pdf_written(pdf_path)
        print(f"[PASS] Complex table rendering - {len(complex_tables)} complex tables")

    def test_2_4_3_tables_both_tracks(self, headers):
        """Test 2.4.3: Compare table rendering between OCR and Direct tracks."""
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 2.4.3] Table Rendering - Both Tracks Comparison")

        results = {}

        for track in ("ocr", "direct"):
            print(f"\nProcessing with {track.upper()} track...")

            task_id = self.upload_and_process(file_path, headers, force_track=track)
            task = self.wait_for_task_completion(task_id, headers, timeout=180)
            assert task["status"] == "completed"

            # Download PDF
            output_path = Path(__file__).parent / "test_output" / f"tables_{track}_{task_id}.pdf"
            pdf_path = self.download_pdf(task_id, headers, output_path)

            # Get table count
            unified_doc = self.get_unified_document(task_id, headers)
            table_count = sum(1 for _ in self._iter_tables(unified_doc))

            results[track] = {
                "task_id": task_id,
                "table_count": table_count,
                "pdf_path": pdf_path,
                "pdf_size": pdf_path.stat().st_size,
            }

            print(f"  {track.upper()} - Tables: {table_count}, PDF size: {results[track]['pdf_size']} bytes")

        print("\nComparison:")
        print(f"  OCR track: {results['ocr']['table_count']} tables, {results['ocr']['pdf_size']} bytes")
        print(f"  Direct track: {results['direct']['table_count']} tables, {results['direct']['pdf_size']} bytes")

        # Both tracks should generate valid PDFs
        self._assert_pdf_written(results['ocr']['pdf_path'])
        self._assert_pdf_written(results['direct']['pdf_path'])

        print("[PASS] Table rendering comparison complete")
|
|
|
|
|
|
class TestTrackSpecificRendering(TestBase):
    """Task 4.4: Test track-specific rendering quality."""

    # Timeout in seconds for the metadata request; without one ``requests``
    # would wait indefinitely on an unresponsive backend.
    METADATA_TIMEOUT = 60

    def _fetch_metadata(self, task_id: str, headers: dict) -> dict:
        """Return the task metadata JSON, or ``{}`` when the request is not a 200."""
        response = requests.get(
            f"{API_BASE_URL}/tasks/{task_id}/metadata",
            headers=headers,
            timeout=self.METADATA_TIMEOUT,
        )
        return response.json() if response.status_code == 200 else {}

    @staticmethod
    def _print_run_summary(file_path: Path, pdf_path: Path, metadata: dict) -> None:
        """Print input/output sizes plus the track and time reported in ``metadata``."""
        # `or 0` keeps the :.2f format from failing on a null/missing value.
        elapsed = metadata.get("processing_time_seconds") or 0

        print(f"Original file: {file_path.name} ({file_path.stat().st_size} bytes)")
        print(f"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)")
        print(f"Processing track: {metadata.get('processing_track')}")
        print(f"Processing time: {elapsed:.2f}s")

    def test_4_4_1_compare_direct_with_original(self, headers):
        """Test 4.4.1: Compare Direct track output with original PDF."""
        file_path = DEMO_DOCS_PATH / "edit.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 4.4.1] Direct Track Quality Comparison")

        # Process with direct track
        task_id = self.upload_and_process(file_path, headers, force_track="direct")
        task = self.wait_for_task_completion(task_id, headers, timeout=120)
        assert task["status"] == "completed"

        # Download generated PDF
        output_path = Path(__file__).parent / "test_output" / f"direct_quality_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get metadata
        metadata = self._fetch_metadata(task_id, headers)
        self._print_run_summary(file_path, pdf_path, metadata)

        # Verify it's Direct track
        assert metadata.get("processing_track") == "direct"

        # Get UnifiedDocument to check preservation
        unified_doc = self.get_unified_document(task_id, headers)
        pages = unified_doc.get("pages") or []

        stats = {
            "pages": len(pages),
            "text_elements": 0,
            "images": 0,
            "tables": 0,
            "with_style": 0,
            "with_spans": 0,
        }

        for page in pages:
            for element in page.get("elements") or []:
                el_type = element.get("type")

                if el_type in ("text", "paragraph", "title", "header"):
                    stats["text_elements"] += 1
                    if element.get("style"):
                        stats["with_style"] += 1
                    if element.get("children"):
                        stats["with_spans"] += 1
                elif el_type in ("image", "figure"):
                    stats["images"] += 1
                elif el_type == "table":
                    stats["tables"] += 1

        print("\nDocument structure:")
        print(f"  Pages: {stats['pages']}")
        print(f"  Text elements: {stats['text_elements']} ({stats['with_style']} with style)")
        print(f"  Span children: {stats['with_spans']} elements")
        print(f"  Images: {stats['images']}")
        print(f"  Tables: {stats['tables']}")

        # The structure statistics above are reported only; the sole hard
        # requirement is that the PDF was produced.
        assert pdf_path.exists()
        print("[PASS] Direct track quality check complete")

    def test_4_4_2_verify_ocr_quality(self, headers):
        """Test 4.4.2: Verify OCR track maintains quality."""
        file_path = DEMO_DOCS_PATH / "scan.pdf"

        if not file_path.exists():
            pytest.skip(f"Test file not found: {file_path}")

        print("\n[Test 4.4.2] OCR Track Quality Verification")

        # Process with OCR track
        task_id = self.upload_and_process(file_path, headers, force_track="ocr")
        task = self.wait_for_task_completion(task_id, headers, timeout=180)
        assert task["status"] == "completed"

        # Download generated PDF
        output_path = Path(__file__).parent / "test_output" / f"ocr_quality_{task_id}.pdf"
        pdf_path = self.download_pdf(task_id, headers, output_path)

        # Get metadata
        metadata = self._fetch_metadata(task_id, headers)
        self._print_run_summary(file_path, pdf_path, metadata)

        # Verify it's OCR track
        assert metadata.get("processing_track") == "ocr"

        # Fetch the UnifiedDocument purely as a smoke check: the helper
        # raises when the download does not succeed.
        self.get_unified_document(task_id, headers)

        # Null counts are treated as 0 so the comparison below cannot raise.
        text_regions = metadata.get("total_text_regions") or 0
        total_tables = metadata.get("total_tables") or 0
        total_images = metadata.get("total_images") or 0

        print("\nOCR results:")
        print(f"  Text regions: {text_regions}")
        print(f"  Tables: {total_tables}")
        print(f"  Images: {total_images}")

        # OCR track should extract content
        assert pdf_path.exists()
        assert text_regions > 0 or total_images > 0, "OCR should extract some content"

        print("[PASS] OCR track quality check complete")
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # failures to the calling shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))
|