test
This commit is contained in:
111
backend/tests/generate_test_pdf.py
Normal file
111
backend/tests/generate_test_pdf.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Generate test PDF to verify Phase 1 fixes
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from app.services.direct_extraction_engine import DirectExtractionEngine
|
||||
from app.services.pdf_generator_service import PDFGeneratorService
|
||||
from app.services.unified_document_exporter import UnifiedDocumentExporter
|
||||
|
||||
|
||||
def generate_test_pdf(input_pdf: str, output_dir: Path):
    """Extract a PDF with the Direct Track engine and render a layout PDF.

    The run has three stages: extraction (with summary statistics printed
    to stdout), export of the extracted document through
    ``UnifiedDocumentExporter.export_to_json``, and layout PDF generation
    through ``PDFGeneratorService.generate_from_unified_document``.

    Args:
        input_pdf: Location of the PDF to process.
        output_dir: Directory for all outputs; created (including parents)
            when it does not exist yet.

    Returns:
        The path of the generated ``<stem>_layout.pdf`` file, or ``None``
        when the PDF generation step raised an exception (the error and
        its traceback are printed in that case).
    """
    source = Path(input_pdf)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing: {source.name}")
    print(f"Output dir: {output_dir}")

    # Stage 1: Direct Track extraction
    extraction_options = {
        "enable_table_detection": True,
        "enable_image_extraction": True,
        "min_image_area": 200.0,  # Filter tiny images
        "enable_whiteout_detection": True,
        "enable_content_sanitization": True,
    }
    extractor = DirectExtractionEngine(**extraction_options)
    document = extractor.extract(source, output_dir=output_dir)

    print(f"\n=== Extraction Results ===")
    print(f"Document ID: {document.document_id}")
    print(f"Pages: {len(document.pages)}")

    # Tally tables, images and table cells across every page.
    n_tables = 0
    n_images = 0
    n_merged = 0
    n_cells = 0
    for page in document.pages:
        for element in page.elements:
            kind = element.type.value
            if kind == 'image':
                n_images += 1
                continue
            if kind != 'table':
                continue

            n_tables += 1
            table = element.content
            if not (table and hasattr(table, 'cells')):
                continue

            n_cells += len(table.cells)
            # A cell counts as merged when it spans more than one row or column.
            n_merged += sum(
                1
                for cell in table.cells
                if cell.row_span > 1 or cell.col_span > 1
            )

    print(f"Tables: {n_tables}")
    print(f" - Total cells: {n_cells}")
    print(f" - Merged cells: {n_merged}")
    print(f"Images: {n_images}")

    # Stage 2: JSON export
    json_target = output_dir / f"{source.stem}_result.json"
    UnifiedDocumentExporter().export_to_json(document, json_target)
    print(f"\nJSON saved: {json_target}")

    # Stage 3: layout PDF
    renderer = PDFGeneratorService()
    layout_pdf = output_dir / f"{source.stem}_layout.pdf"

    try:
        renderer.generate_from_unified_document(
            unified_doc=document,
            output_path=layout_pdf,
            source_file_path=source,
        )
        print(f"PDF saved: {layout_pdf}")
    except Exception as exc:
        # Best-effort step: report the failure and signal it with None
        # instead of aborting the caller.
        print(f"PDF generation error: {exc}")
        import traceback
        traceback.print_exc()
        return None

    return layout_pdf
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Sample inputs live in <repo>/demo_docs; outputs go under
    # <backend>/storage/test_phase1/<sample name>.
    backend_dir = Path(__file__).parent.parent
    demo_docs = backend_dir.parent / "demo_docs"
    output_base = backend_dir / "storage" / "test_phase1"

    # edit3.pdf has complex tables with merging; edit.pdf is processed
    # as well for comparison.
    for sample_name in ("edit3", "edit"):
        sample_pdf = demo_docs / f"{sample_name}.pdf"
        if not sample_pdf.exists():
            # Say so explicitly: a silent skip makes the run look successful
            # even though nothing was generated for this sample.
            print(f"Skipping {sample_pdf}: file not found")
            continue

        result = generate_test_pdf(str(sample_pdf), output_base / sample_name)
        if result:
            print(f"\n✓ Test PDF generated: {result}")

    print("\n=== Output Location ===")
    print(output_base)
|
||||
Reference in New Issue
Block a user