This commit is contained in:
egg
2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions

View File

@@ -0,0 +1,48 @@
"""Debug PyMuPDF table structure - find merge info"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
# Debug driver: open a sample PDF, run PyMuPDF table detection on its first
# page, and print whatever structural information each detected table exposes
# (attributes, header, cell rectangles, extracted grid, pandas shape, rows).
# The sample file is looked up under "demo_docs" in the directory two levels
# above this script's own directory.
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
doc = fitz.open(str(pdf_path))
try:
    # Only the first page is inspected.
    page = doc[0]
    tables = page.find_tables()
    for idx, table in enumerate(tables.tables):
        print(f"\nTable {idx}:")

        # Check all available (public) attributes
        print(f" Available attributes: {[a for a in dir(table) if not a.startswith('_')]}")

        # Try to get header info
        if hasattr(table, 'header'):
            print(f" header: {table.header}")

        # Check for cells info
        cell_rects = table.cells
        print(f" cells count: {len(cell_rects)}")

        # Get the extracted data; default=0 keeps the width well-defined
        # when the table yields no rows (max() of an empty sequence raises).
        data = table.extract()
        print(f" extract() shape: {len(data)} x {max((len(r) for r in data), default=0)}")

        # Check if there's a way to map cells to grid positions.
        # Look at the pandas output which might have merge info.
        # Deliberately best-effort: any failure here is reported and the
        # remaining diagnostics for this table are still printed.
        try:
            df = table.to_pandas()
            print(f" pandas shape: {df.shape}")
        except Exception as e:
            print(f" pandas error: {e}")

        # Check the TableRow objects if available
        if hasattr(table, 'rows'):
            rows = table.rows
            print(f" rows: {len(rows)}")
            for ri, row in enumerate(rows[:3]):  # first 3 rows
                print(f" row {ri}: {len(row.cells)} cells")
                for ci, cell in enumerate(row.cells[:5]):  # first 5 cells
                    print(f" cell {ci}: bbox={cell}")
finally:
    # Release the document even if inspection fails part-way through.
    doc.close()