This commit is contained in:
egg
2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions

View File

@@ -0,0 +1,111 @@
"""
Generate test PDF to verify Phase 1 fixes
"""
import sys
import os
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.pdf_generator_service import PDFGeneratorService
from app.services.unified_document_exporter import UnifiedDocumentExporter
def generate_test_pdf(input_pdf: str, output_dir: Path):
    """Run the Direct Track pipeline on one PDF and write its test artifacts.

    The input is extracted with ``DirectExtractionEngine``, a short summary of
    the tables, cells and images found is printed, the extracted document is
    exported to ``<stem>_result.json`` and then rendered to
    ``<stem>_layout.pdf``. Both files are written into ``output_dir``.

    Args:
        input_pdf: Path of the source PDF, given as a string.
        output_dir: Directory for the generated files; it is created
            (including missing parents) if it does not exist yet.

    Returns:
        The path of the generated layout PDF, or ``None`` when PDF generation
        raised an exception (the error and its traceback are printed).
    """
    source = Path(input_pdf)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing: {source.name}")
    print(f"Output dir: {output_dir}")

    # Step 1: Direct Track extraction.
    extraction_options = {
        "enable_table_detection": True,
        "enable_image_extraction": True,
        "min_image_area": 200.0,  # Filter tiny images
        "enable_whiteout_detection": True,
        "enable_content_sanitization": True,
    }
    extractor = DirectExtractionEngine(**extraction_options)
    document = extractor.extract(source, output_dir=output_dir)

    # Summarise what the extraction found.
    print("\n=== Extraction Results ===")
    print(f"Document ID: {document.document_id}")
    print(f"Pages: {len(document.pages)}")

    n_tables = n_images = n_cells = n_merged = 0
    every_element = (el for page in document.pages for el in page.elements)
    for element in every_element:
        kind = element.type.value
        if kind != 'table':
            if kind == 'image':
                n_images += 1
            continue

        n_tables += 1
        table = element.content
        # Only tables whose content exposes a cell list contribute cell stats.
        if not (table and hasattr(table, 'cells')):
            continue
        n_cells += len(table.cells)
        # A cell counts as merged when it spans more than one row or column.
        n_merged += sum(
            1
            for cell in table.cells
            if cell.row_span > 1 or cell.col_span > 1
        )

    print(f"Tables: {n_tables}")
    print(f" - Total cells: {n_cells}")
    print(f" - Merged cells: {n_merged}")
    print(f"Images: {n_images}")

    # Step 2: JSON export of the extracted document.
    json_exporter = UnifiedDocumentExporter()
    json_path = output_dir / f"{source.stem}_result.json"
    json_exporter.export_to_json(document, json_path)
    print(f"\nJSON saved: {json_path}")

    # Step 3: layout PDF. The service is built outside the ``try`` so that
    # only errors from the generation call itself are caught and reported.
    renderer = PDFGeneratorService()
    pdf_path = output_dir / f"{source.stem}_layout.pdf"
    try:
        renderer.generate_from_unified_document(
            unified_doc=document,
            output_path=pdf_path,
            source_file_path=source,
        )
        print(f"PDF saved: {pdf_path}")
    except Exception as exc:
        print(f"PDF generation error: {exc}")
        import traceback
        traceback.print_exc()
        return None
    return pdf_path
if __name__ == "__main__":
    # Script entry point: run the generator on the demo documents that exist.
    backend_root = Path(__file__).parent.parent
    demo_docs = backend_root.parent / "demo_docs"
    output_base = backend_root / "storage" / "test_phase1"

    # (input file name, output sub-directory) pairs, processed in order.
    # edit3.pdf has complex tables with merging; edit.pdf is run as well
    # for comparison.
    samples = (
        ("edit3.pdf", "edit3"),
        ("edit.pdf", "edit"),
    )
    for pdf_name, subdir in samples:
        sample_pdf = demo_docs / pdf_name
        if not sample_pdf.exists():
            # Missing demo documents are skipped silently.
            continue
        generated = generate_test_pdf(str(sample_pdf), output_base / subdir)
        if generated:
            print(f"\n✓ Test PDF generated: {generated}")

    print("\n=== Output Location ===")
    print(output_base)