# OCR/backend/tests/debug_table_cells2.py

"""Debug PyMuPDF table structure - find merge info"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import fitz

# Sample document: demo_docs/edit3.pdf, located two directories above the
# directory that contains this script.
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"

# Open through the context manager so the document is closed even when one of
# the probes below raises (the original only closed it on the success path).
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]  # only the first page is inspected

    tables = page.find_tables()
    for idx, table in enumerate(tables.tables):
        print(f"\nTable {idx}:")

        # List every public attribute so we can spot anything merge-related.
        print(f"  Available attributes: {[a for a in dir(table) if not a.startswith('_')]}")

        # Header info, when this PyMuPDF version exposes it.
        if hasattr(table, 'header'):
            print(f"  header: {table.header}")

        # Flat list of cell rectangles detected for the table.
        cell_rects = table.cells
        print(f"  cells count: {len(cell_rects)}")

        # Extracted text grid. default=0 keeps the shape line working for a
        # table with no rows, where a bare max() would raise ValueError.
        data = table.extract()
        widest_row = max((len(r) for r in data), default=0)
        print(f"  extract() shape: {len(data)} x {widest_row}")

        # The pandas view might carry merge info. This is a best-effort probe:
        # any failure is reported and the remaining probes still run.
        try:
            df = table.to_pandas()
            print(f"  pandas shape: {df.shape}")
        except Exception as e:
            print(f"  pandas error: {e}")

        # Per-row cell boxes, limited to the first 3 rows and first 5 cells of
        # each row to keep the output readable.
        # NOTE(review): a None entry in row.cells presumably marks a grid slot
        # absorbed by a merged cell -- confirm against the PyMuPDF docs.
        if hasattr(table, 'rows'):
            rows = table.rows
            print(f"  rows: {len(rows)}")
            for ri, row in enumerate(rows[:3]):
                print(f"    row {ri}: {len(row.cells)} cells")
                for ci, cell in enumerate(row.cells[:5]):
                    print(f"      cell {ci}: bbox={cell}")