Files
OCR/backend/tests/debug_table_cells.py
2025-12-04 18:00:37 +08:00

44 lines
1.5 KiB
Python

"""Debug PyMuPDF table.cells structure"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
# Sample PDF to inspect, located three directory levels above this file so the
# script does not depend on the current working directory.
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"

# Open the document as a context manager so the handle is always released,
# even if table detection or extraction raises part-way through.
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]  # only the first page is examined
    tables = page.find_tables()

    for idx, table in enumerate(tables.tables):
        data = table.extract()
        num_rows = len(data)
        # Use the widest extracted row as the column count; 0 when no rows.
        num_cols = max(len(row) for row in data) if data else 0
        grid_size = num_rows * num_cols

        print(f"Table {idx}:")
        print(f" table.extract() dimensions: {num_rows} rows x {num_cols} cols")
        print(f" Expected positions: {grid_size}")

        # Read `cells` defensively: the attribute may not exist on every
        # table object. Compare against None explicitly so that an empty
        # (but present) list is still reported with length 0 instead of being
        # mislabelled as "NOT AVAILABLE".
        cell_rects = getattr(table, 'cells', None)
        if cell_rects is None:
            print(" table.cells: NOT AVAILABLE")
        else:
            none_count = sum(1 for c in cell_rects if c is None)
            actual_count = len(cell_rects) - none_count

            print(f" table.cells length: {len(cell_rects)}")
            print(f" None cells: {none_count}")
            print(f" Actual cells: {actual_count}")

            # Check if cell_rects matches grid size
            if len(cell_rects) != grid_size:
                print(f" WARNING: cell_rects length ({len(cell_rects)}) != grid size ({grid_size})")

            # Show first few cells
            print(f" First 5 cells: {cell_rects[:5]}")

        # Check row_count and col_count ('N/A' when the attribute is absent)
        print(f" table.row_count: {getattr(table, 'row_count', 'N/A')}")
        print(f" table.col_count: {getattr(table, 'col_count', 'N/A')}")