test
This commit is contained in:
48
backend/tests/debug_table_cells2.py
Normal file
48
backend/tests/debug_table_cells2.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Debug PyMuPDF table structure - find merge info"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import fitz
|
||||
|
||||
# Debug probe: open the demo PDF, run PyMuPDF table detection on its first
# page, and print whatever structural information each detected table exposes
# (attributes, header, cell list, extracted grid, pandas shape, row objects).
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"

# Use the document as a context manager so the file handle is released even
# if one of the probes below raises part-way through.
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]

    tables = page.find_tables()
    for idx, table in enumerate(tables.tables):
        print(f"\nTable {idx}:")

        # Check all available (public) attributes
        public_attrs = [a for a in dir(table) if not a.startswith('_')]
        print(f" Available attributes: {public_attrs}")

        # Try to get header info
        if hasattr(table, 'header'):
            print(f" header: {table.header}")

        # Check for cells info
        cell_rects = table.cells
        print(f" cells count: {len(cell_rects)}")

        # Get the extracted data
        data = table.extract()
        # default=0 keeps the probe alive when extract() yields no rows;
        # a bare max() over an empty sequence raises ValueError.
        widest_row = max((len(r) for r in data), default=0)
        print(f" extract() shape: {len(data)} x {widest_row}")

        # Check if there's a way to map cells to grid positions
        # Look at the pandas output which might have merge info
        try:
            df = table.to_pandas()
        except Exception as e:
            # Best-effort probe: report the failure and keep going with the
            # remaining checks for this table.
            print(f" pandas error: {e}")
        else:
            print(f" pandas shape: {df.shape}")

        # Check the TableRow objects if available
        if hasattr(table, 'rows'):
            rows = table.rows
            print(f" rows: {len(rows)}")
            for ri, row in enumerate(rows[:3]):  # first 3 rows
                print(f" row {ri}: {len(row.cells)} cells")
                for ci, cell in enumerate(row.cells[:5]):  # first 5 cells
                    print(f" cell {ci}: bbox={cell}")
|
||||
Reference in New Issue
Block a user