"""Debug PyMuPDF table structure - find merge info""" import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) import fitz pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf" doc = fitz.open(str(pdf_path)) page = doc[0] tables = page.find_tables() for idx, table in enumerate(tables.tables): print(f"\nTable {idx}:") # Check all available attributes print(f" Available attributes: {[a for a in dir(table) if not a.startswith('_')]}") # Try to get header info if hasattr(table, 'header'): print(f" header: {table.header}") # Check for cells info cell_rects = table.cells print(f" cells count: {len(cell_rects)}") # Get the extracted data data = table.extract() print(f" extract() shape: {len(data)} x {max(len(r) for r in data)}") # Check if there's a way to map cells to grid positions # Look at the pandas output which might have merge info try: df = table.to_pandas() print(f" pandas shape: {df.shape}") except Exception as e: print(f" pandas error: {e}") # Check the TableRow objects if available if hasattr(table, 'rows'): rows = table.rows print(f" rows: {len(rows)}") for ri, row in enumerate(rows[:3]): # first 3 rows print(f" row {ri}: {len(row.cells)} cells") for ci, cell in enumerate(row.cells[:5]): # first 5 cells print(f" cell {ci}: bbox={cell}") doc.close()