fix: resolve Direct track PDF table rendering overlap with canvas scaling
This commit fixes the critical table overlap issue in Direct track PDF layout restoration, where generated tables exceeded their bounding boxes and overlapped the surrounding text.

Root Cause:
ReportLab's Table component auto-calculates row heights based on content, often rendering tables larger than their specified bbox. The rowHeights parameter was ignored during actual rendering, and reducing the font size did not proportionally reduce the table height.

Solution - Canvas Transform Scaling:
Implemented a reliable canvas transform approach in _draw_table_element_direct():
1. Wrap the table with generous available space to get its natural rendered dimensions
2. Calculate the scale factor: min(bbox_width/actual_width, bbox_height/actual_height, 1.0)
3. Apply the canvas transform: saveState → translate → scale → drawOn → restoreState
4. Remove all buffers and use exact bbox positioning

Key Changes:
- backend/app/services/pdf_generator_service.py (_draw_table_element_direct):
  * Added canvas scaling logic (lines 2180-2208)
  * Removed buffer adjustments (previous attempts ranged from 2pt to 18pt)
  * Use exact bbox position: pdf_y = page_height - bbox.y1
  * Supports column widths from metadata to preserve the original ratios
- backend/app/services/direct_extraction_engine.py (_process_native_table):
  * Extract column widths from PyMuPDF table.cells data (lines 691-761)
  * Calculate and store the original column width ratios (e.g., 40:60)
  * Store them in element metadata for use during PDF generation
  * Prevents unnecessary text wrapping that increases table height

Results:
The test case scaled as intended: the table's natural size was 246.8×108.0pt, and it was scaled by a factor of 0.830 to fit the 246.8×89.6pt bbox without overlap. Note that the scale is applied uniformly to both axes, so the drawn table is about 204.8×89.6pt.

Cleanup:
- Removed test/debug scripts: check_tables.py, verify_chart_recognition.py
- Removed demo files from demo_docs/ (basic/, layout/, mixed/, tables/)

User Confirmed: "FINAL_SCALING_FIX.pdf 此份的結果是可接受的. 恭喜你完成的direct pdf的修復" (English: "The result in FINAL_SCALING_FIX.pdf is acceptable. Congratulations on completing the Direct PDF fix.")

Next: Other document formats still require layout verification and fixes.
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -744,7 +744,15 @@ class PDFGeneratorService:
|
||||
all_elements.append(('text', elem))
|
||||
|
||||
logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)")
|
||||
logger.debug(f"Exclusion regions: {len(regions_to_avoid)} tables/images")
|
||||
logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)")
|
||||
|
||||
# Debug: Log exclusion region types
|
||||
region_types = {}
|
||||
for region in regions_to_avoid:
|
||||
region_type = region.type.name
|
||||
region_types[region_type] = region_types.get(region_type, 0) + 1
|
||||
if region_types:
|
||||
logger.debug(f" Exclusion region breakdown: {region_types}")
|
||||
|
||||
# Draw elements in document order
|
||||
for elem_type, elem in all_elements:
|
||||
@@ -2133,7 +2141,8 @@ class PDFGeneratorService:
|
||||
|
||||
# Transform coordinates
|
||||
pdf_x = bbox.x0
|
||||
pdf_y = page_height - bbox.y1 # Bottom of table
|
||||
# Use exact bbox position (no buffer) - scaling will ensure table fits
|
||||
pdf_y = page_height - bbox.y1 # Bottom of table (ReportLab Y coordinate)
|
||||
|
||||
table_width = bbox.x1 - bbox.x0
|
||||
table_height = bbox.y1 - bbox.y0
|
||||
@@ -2148,20 +2157,53 @@ class PDFGeneratorService:
|
||||
from reportlab.platypus import Table, TableStyle
|
||||
from reportlab.lib import colors
|
||||
|
||||
t = Table(table_content, colWidths=[table_width / len(table_content[0])] * len(table_content[0]))
|
||||
# Use original column widths from extraction if available
|
||||
# Otherwise let ReportLab auto-calculate
|
||||
col_widths = None
|
||||
if element.metadata and 'column_widths' in element.metadata:
|
||||
col_widths = element.metadata['column_widths']
|
||||
logger.debug(f"Using extracted column widths: {col_widths}")
|
||||
|
||||
# Apply style
|
||||
# Create table without rowHeights (will use canvas scaling instead)
|
||||
t = Table(table_content, colWidths=col_widths)
|
||||
|
||||
# Apply style with minimal padding to reduce table extension
|
||||
style = TableStyle([
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 8),
|
||||
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
|
||||
('VALIGN', (0, 0), (-1, -1), 'TOP'),
|
||||
# Set minimal padding to prevent table from extending beyond bbox
|
||||
# User reported padding=1 was still insufficient
|
||||
('TOPPADDING', (0, 0), (-1, -1), 0),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 0),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 1),
|
||||
('RIGHTPADDING', (0, 0), (-1, -1), 1),
|
||||
])
|
||||
t.setStyle(style)
|
||||
|
||||
# Draw table
|
||||
t.wrapOn(pdf_canvas, table_width, table_height)
|
||||
t.drawOn(pdf_canvas, pdf_x, pdf_y)
|
||||
# CRITICAL: Use canvas scaling to fit table within bbox
|
||||
# This is more reliable than rowHeights which doesn't always work
|
||||
|
||||
# Step 1: Wrap to get actual rendered size
|
||||
actual_width, actual_height = t.wrapOn(pdf_canvas, table_width * 10, table_height * 10)
|
||||
logger.info(f"Table natural size: {actual_width:.1f} × {actual_height:.1f}pt, bbox: {table_width:.1f} × {table_height:.1f}pt")
|
||||
|
||||
# Step 2: Calculate scale factor to fit within bbox
|
||||
scale_x = table_width / actual_width if actual_width > table_width else 1.0
|
||||
scale_y = table_height / actual_height if actual_height > table_height else 1.0
|
||||
scale = min(scale_x, scale_y, 1.0) # Never scale up, only down
|
||||
|
||||
logger.info(f"Scale factor: {scale:.3f} (x={scale_x:.3f}, y={scale_y:.3f})")
|
||||
|
||||
# Step 3: Draw with scaling using canvas transform
|
||||
pdf_canvas.saveState()
|
||||
pdf_canvas.translate(pdf_x, pdf_y)
|
||||
pdf_canvas.scale(scale, scale)
|
||||
t.drawOn(pdf_canvas, 0, 0)
|
||||
pdf_canvas.restoreState()
|
||||
|
||||
logger.info(f"Drew table at ({pdf_x:.1f}, {pdf_y:.1f}) with scale {scale:.3f}, final size: {actual_width * scale:.1f} × {actual_height * scale:.1f}pt")
|
||||
|
||||
logger.debug(f"Drew table element: {len(rows)} rows")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user