49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
"""Debug PyMuPDF table structure - find merge info"""
|
|
import sys
from pathlib import Path

# Put the parent of this script's directory on the import path. This stays
# ahead of ``import fitz`` exactly as in the original statement order.
sys.path.insert(0, str(Path(__file__).parent.parent))

import fitz

# Sample document: three directory levels above this file, in demo_docs/.
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"

doc = fitz.open(str(pdf_path))
try:
    # Only the first page of the document is inspected.
    page = doc[0]

    tables = page.find_tables()
    for idx, table in enumerate(tables.tables):
        print(f"\nTable {idx}:")

        # Check all available attributes (public names only)
        print(f" Available attributes: {[a for a in dir(table) if not a.startswith('_')]}")

        # Try to get header info
        if hasattr(table, 'header'):
            print(f" header: {table.header}")

        # Check for cells info
        cell_rects = table.cells
        print(f" cells count: {len(cell_rects)}")

        # Get the extracted data
        data = table.extract()
        # default=0 keeps the script running when extract() yields no rows;
        # a bare max() over an empty iterable would raise ValueError.
        widest_row = max((len(r) for r in data), default=0)
        print(f" extract() shape: {len(data)} x {widest_row}")

        # Check if there's a way to map cells to grid positions.
        # Look at the pandas output which might have merge info.
        # Best-effort on purpose: any failure here is reported and the
        # remaining checks still run.
        try:
            df = table.to_pandas()
            print(f" pandas shape: {df.shape}")
        except Exception as e:
            print(f" pandas error: {e}")

        # Check the TableRow objects if available
        if hasattr(table, 'rows'):
            rows = table.rows
            print(f" rows: {len(rows)}")
            for ri, row in enumerate(rows[:3]):  # first 3 rows
                print(f" row {ri}: {len(row.cells)} cells")
                for ci, cell in enumerate(row.cells[:5]):  # first 5 cells
                    print(f" cell {ci}: bbox={cell}")
finally:
    # Close the document even when one of the inspection steps above raises.
    doc.close()