test
This commit is contained in:
@@ -3371,18 +3371,21 @@ class PDFGeneratorService:
|
||||
"rows": 6,
|
||||
"cols": 2,
|
||||
"cells": [
|
||||
{"row": 0, "col": 0, "content": "..."},
|
||||
{"row": 0, "col": 0, "content": "...", "row_span": 1, "col_span": 2},
|
||||
{"row": 0, "col": 1, "content": "..."},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Returns format compatible with HTMLTableParser output:
|
||||
Returns format compatible with HTMLTableParser output (with colspan/rowspan/col):
|
||||
[
|
||||
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 0
|
||||
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 1
|
||||
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
|
||||
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
|
||||
...
|
||||
]
|
||||
|
||||
Note: This returns actual cells per row with their absolute column positions.
|
||||
The table renderer uses 'col' to place cells correctly in the grid.
|
||||
"""
|
||||
try:
|
||||
num_rows = content.get('rows', 0)
|
||||
@@ -3392,21 +3395,39 @@ class PDFGeneratorService:
|
||||
if not cells or num_rows == 0 or num_cols == 0:
|
||||
return []
|
||||
|
||||
# Initialize rows structure
|
||||
rows_data = []
|
||||
for _ in range(num_rows):
|
||||
rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
|
||||
|
||||
# Fill in cell content
|
||||
# Group cells by row
|
||||
cells_by_row = {}
|
||||
for cell in cells:
|
||||
row_idx = cell.get('row', 0)
|
||||
col_idx = cell.get('col', 0)
|
||||
cell_content = cell.get('content', '')
|
||||
if row_idx not in cells_by_row:
|
||||
cells_by_row[row_idx] = []
|
||||
cells_by_row[row_idx].append(cell)
|
||||
|
||||
if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
|
||||
rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
|
||||
# Sort cells within each row by column
|
||||
for row_idx in cells_by_row:
|
||||
cells_by_row[row_idx].sort(key=lambda c: c.get('col', 0))
|
||||
|
||||
logger.debug(f"Built {num_rows} rows from cells dict")
|
||||
# Build rows structure with colspan/rowspan info and absolute col position
|
||||
rows_data = []
|
||||
for row_idx in range(num_rows):
|
||||
row_cells = []
|
||||
if row_idx in cells_by_row:
|
||||
for cell in cells_by_row[row_idx]:
|
||||
cell_content = cell.get('content', '')
|
||||
row_span = cell.get('row_span', 1) or 1
|
||||
col_span = cell.get('col_span', 1) or 1
|
||||
col_idx = cell.get('col', 0)
|
||||
|
||||
row_cells.append({
|
||||
'text': str(cell_content) if cell_content else '',
|
||||
'rowspan': row_span,
|
||||
'colspan': col_span,
|
||||
'col': col_idx # Absolute column position
|
||||
})
|
||||
|
||||
rows_data.append({'cells': row_cells})
|
||||
|
||||
logger.debug(f"Built {num_rows} rows from cells dict with span info")
|
||||
return rows_data
|
||||
|
||||
except Exception as e:
|
||||
@@ -3471,19 +3492,115 @@ class PDFGeneratorService:
|
||||
table_width = bbox.x1 - bbox.x0
|
||||
table_height = bbox.y1 - bbox.y0
|
||||
|
||||
# Build table data for ReportLab
|
||||
table_content = []
|
||||
for row in rows:
|
||||
row_data = [cell['text'].strip() for cell in row['cells']]
|
||||
table_content.append(row_data)
|
||||
|
||||
# Create table
|
||||
from reportlab.platypus import Table, TableStyle
|
||||
from reportlab.lib import colors
|
||||
|
||||
# Determine number of rows and columns for cell_boxes calculation
|
||||
# Determine grid size from rows structure
|
||||
# Note: rows may have 'col' attribute for absolute positioning (from Direct extraction)
|
||||
# or may be sequential (from HTML parsing)
|
||||
num_rows = len(rows)
|
||||
max_cols = max(len(row['cells']) for row in rows) if rows else 0
|
||||
|
||||
# Check if cells have absolute column positions
|
||||
has_absolute_cols = any(
|
||||
'col' in cell
|
||||
for row in rows
|
||||
for cell in row['cells']
|
||||
)
|
||||
|
||||
# Calculate actual number of columns
|
||||
max_cols = 0
|
||||
if has_absolute_cols:
|
||||
# Use absolute col positions + colspan to find max column
|
||||
for row in rows:
|
||||
for cell in row['cells']:
|
||||
col = cell.get('col', 0)
|
||||
colspan = cell.get('colspan', 1)
|
||||
max_cols = max(max_cols, col + colspan)
|
||||
else:
|
||||
# Sequential cells: sum up colspans
|
||||
for row in rows:
|
||||
col_pos = 0
|
||||
for cell in row['cells']:
|
||||
colspan = cell.get('colspan', 1)
|
||||
col_pos += colspan
|
||||
max_cols = max(max_cols, col_pos)
|
||||
|
||||
# Build table data for ReportLab with proper grid structure
|
||||
# ReportLab needs a full grid with placeholders for spanned cells
|
||||
# and SPAN commands to merge them
|
||||
table_content = []
|
||||
span_commands = []
|
||||
covered = set() # Track cells covered by spans
|
||||
|
||||
# First pass: mark covered cells and collect SPAN commands
|
||||
for row_idx, row in enumerate(rows):
|
||||
if has_absolute_cols:
|
||||
# Use absolute column positions
|
||||
for cell in row['cells']:
|
||||
col_pos = cell.get('col', 0)
|
||||
colspan = cell.get('colspan', 1)
|
||||
rowspan = cell.get('rowspan', 1)
|
||||
|
||||
# Mark cells covered by this span
|
||||
if colspan > 1 or rowspan > 1:
|
||||
for r in range(row_idx, row_idx + rowspan):
|
||||
for c in range(col_pos, col_pos + colspan):
|
||||
if (r, c) != (row_idx, col_pos):
|
||||
covered.add((r, c))
|
||||
# Add SPAN command for ReportLab
|
||||
span_commands.append((
|
||||
'SPAN',
|
||||
(col_pos, row_idx),
|
||||
(col_pos + colspan - 1, row_idx + rowspan - 1)
|
||||
))
|
||||
else:
|
||||
# Sequential positioning
|
||||
col_pos = 0
|
||||
for cell in row['cells']:
|
||||
while (row_idx, col_pos) in covered:
|
||||
col_pos += 1
|
||||
|
||||
colspan = cell.get('colspan', 1)
|
||||
rowspan = cell.get('rowspan', 1)
|
||||
|
||||
if colspan > 1 or rowspan > 1:
|
||||
for r in range(row_idx, row_idx + rowspan):
|
||||
for c in range(col_pos, col_pos + colspan):
|
||||
if (r, c) != (row_idx, col_pos):
|
||||
covered.add((r, c))
|
||||
span_commands.append((
|
||||
'SPAN',
|
||||
(col_pos, row_idx),
|
||||
(col_pos + colspan - 1, row_idx + rowspan - 1)
|
||||
))
|
||||
col_pos += colspan
|
||||
|
||||
# Second pass: build content grid
|
||||
for row_idx in range(num_rows):
|
||||
row_data = [''] * max_cols
|
||||
|
||||
if row_idx < len(rows):
|
||||
if has_absolute_cols:
|
||||
# Place cells at their absolute positions
|
||||
for cell in rows[row_idx]['cells']:
|
||||
col_pos = cell.get('col', 0)
|
||||
if col_pos < max_cols:
|
||||
row_data[col_pos] = cell['text'].strip()
|
||||
else:
|
||||
# Sequential placement
|
||||
col_pos = 0
|
||||
for cell in rows[row_idx]['cells']:
|
||||
while col_pos < max_cols and (row_idx, col_pos) in covered:
|
||||
col_pos += 1
|
||||
if col_pos < max_cols:
|
||||
row_data[col_pos] = cell['text'].strip()
|
||||
colspan = cell.get('colspan', 1)
|
||||
col_pos += colspan
|
||||
|
||||
table_content.append(row_data)
|
||||
|
||||
logger.debug(f"Built table grid: {num_rows} rows × {max_cols} cols, {len(span_commands)} span commands (absolute_cols={has_absolute_cols})")
|
||||
|
||||
# Use original column widths from extraction if available
|
||||
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
|
||||
@@ -3517,7 +3634,7 @@ class PDFGeneratorService:
|
||||
# Apply style with minimal padding to reduce table extension
|
||||
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
|
||||
font_for_table = self.font_name if self.font_registered else 'Helvetica'
|
||||
style = TableStyle([
|
||||
style_commands = [
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
|
||||
('FONTNAME', (0, 0), (-1, -1), font_for_table),
|
||||
('FONTSIZE', (0, 0), (-1, -1), 8),
|
||||
@@ -3529,7 +3646,13 @@ class PDFGeneratorService:
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 0),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 1),
|
||||
('RIGHTPADDING', (0, 0), (-1, -1), 1),
|
||||
])
|
||||
]
|
||||
# Add span commands for merged cells
|
||||
style_commands.extend(span_commands)
|
||||
if span_commands:
|
||||
logger.info(f"Applied {len(span_commands)} SPAN commands for merged cells")
|
||||
|
||||
style = TableStyle(style_commands)
|
||||
t.setStyle(style)
|
||||
|
||||
# Use canvas scaling as fallback to fit table within bbox
|
||||
@@ -4350,33 +4473,100 @@ class PDFGeneratorService:
|
||||
# Replace newlines with <br/>
|
||||
safe_content = safe_content.replace('\n', '<br/>')
|
||||
|
||||
# Calculate font size from bbox height, but keep minimum 10pt
|
||||
font_size = max(box_height * 0.7, 10)
|
||||
font_size = min(font_size, 24) # Cap at 24pt
|
||||
# Get original font size from style info
|
||||
style_info = elem.get('style', {})
|
||||
original_font_size = style_info.get('font_size', 12.0)
|
||||
|
||||
# Create style for this element
|
||||
elem_style = ParagraphStyle(
|
||||
f'elem_{id(elem)}',
|
||||
parent=base_style,
|
||||
fontSize=font_size,
|
||||
leading=font_size * 1.2,
|
||||
# Detect vertical text (Y-axis labels, etc.)
|
||||
# Vertical text has aspect_ratio (height/width) > 2 and multiple characters
|
||||
is_vertical_text = (
|
||||
box_height > box_width * 2 and
|
||||
len(content.strip()) > 1
|
||||
)
|
||||
|
||||
# Create paragraph
|
||||
para = Paragraph(safe_content, elem_style)
|
||||
if is_vertical_text:
|
||||
# For vertical text, use original font size and rotate
|
||||
font_size = min(original_font_size, box_width * 0.9)
|
||||
font_size = max(font_size, 6) # Minimum 6pt
|
||||
|
||||
# Calculate available width and height
|
||||
available_width = box_width
|
||||
available_height = box_height * 2 # Allow overflow
|
||||
# Save canvas state for rotation
|
||||
pdf_canvas.saveState()
|
||||
|
||||
# Wrap the paragraph
|
||||
para_width, para_height = para.wrap(available_width, available_height)
|
||||
# Convert to PDF coordinates
|
||||
pdf_y_center = current_page_height - (y0 + y1) / 2
|
||||
x_center = (x0 + x1) / 2
|
||||
|
||||
# Convert to PDF coordinates (y from bottom)
|
||||
pdf_y = current_page_height - y0 - para_height
|
||||
# Translate to center, rotate, translate back
|
||||
pdf_canvas.translate(x_center, pdf_y_center)
|
||||
pdf_canvas.rotate(90)
|
||||
|
||||
# Draw the paragraph
|
||||
para.drawOn(pdf_canvas, x0, pdf_y)
|
||||
# Set font and draw text centered
|
||||
pdf_canvas.setFont(
|
||||
self.font_name if self.font_registered else 'Helvetica',
|
||||
font_size
|
||||
)
|
||||
# Draw text at origin (since we translated to center)
|
||||
text_width = pdf_canvas.stringWidth(
|
||||
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'),
|
||||
self.font_name if self.font_registered else 'Helvetica',
|
||||
font_size
|
||||
)
|
||||
pdf_canvas.drawString(-text_width / 2, -font_size / 3,
|
||||
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'))
|
||||
|
||||
pdf_canvas.restoreState()
|
||||
else:
|
||||
# For horizontal text, dynamically fit text within bbox
|
||||
# Start with original font size and reduce until text fits
|
||||
MIN_FONT_SIZE = 6
|
||||
MAX_FONT_SIZE = 14
|
||||
|
||||
if original_font_size > 0:
|
||||
start_font_size = min(original_font_size, MAX_FONT_SIZE)
|
||||
else:
|
||||
start_font_size = min(box_height * 0.7, MAX_FONT_SIZE)
|
||||
|
||||
font_size = max(start_font_size, MIN_FONT_SIZE)
|
||||
|
||||
# Try progressively smaller font sizes until text fits
|
||||
para = None
|
||||
para_height = box_height + 1 # Start with height > box to enter loop
|
||||
|
||||
while font_size >= MIN_FONT_SIZE and para_height > box_height:
|
||||
elem_style = ParagraphStyle(
|
||||
f'elem_{id(elem)}_{font_size}',
|
||||
parent=base_style,
|
||||
fontSize=font_size,
|
||||
leading=font_size * 1.15, # Tighter leading
|
||||
)
|
||||
|
||||
para = Paragraph(safe_content, elem_style)
|
||||
para_width, para_height = para.wrap(box_width, box_height * 3)
|
||||
|
||||
if para_height <= box_height:
|
||||
break # Text fits!
|
||||
|
||||
font_size -= 0.5 # Reduce font size and try again
|
||||
|
||||
# Ensure minimum font size
|
||||
if font_size < MIN_FONT_SIZE:
|
||||
font_size = MIN_FONT_SIZE
|
||||
elem_style = ParagraphStyle(
|
||||
f'elem_{id(elem)}_min',
|
||||
parent=base_style,
|
||||
fontSize=font_size,
|
||||
leading=font_size * 1.15,
|
||||
)
|
||||
para = Paragraph(safe_content, elem_style)
|
||||
para_width, para_height = para.wrap(box_width, box_height * 3)
|
||||
|
||||
# Convert to PDF coordinates (y from bottom)
|
||||
# Clip to bbox height to prevent overflow
|
||||
actual_height = min(para_height, box_height)
|
||||
pdf_y = current_page_height - y0 - actual_height
|
||||
|
||||
# Draw the paragraph
|
||||
para.drawOn(pdf_canvas, x0, pdf_y)
|
||||
|
||||
# Save PDF
|
||||
pdf_canvas.save()
|
||||
@@ -4451,13 +4641,47 @@ class PDFGeneratorService:
|
||||
pdf_y_bottom = page_height - ty1
|
||||
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
|
||||
|
||||
# Step 2: Draw cell borders using cell_boxes
|
||||
# Step 2: Get or calculate cell boxes
|
||||
cell_boxes = metadata.get('cell_boxes', [])
|
||||
if cell_boxes:
|
||||
# Normalize cell boxes for grid alignment
|
||||
if hasattr(self, '_normalize_cell_boxes_to_grid'):
|
||||
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
|
||||
|
||||
# If no cell_boxes, calculate from column_widths and row_heights
|
||||
if not cell_boxes:
|
||||
column_widths = metadata.get('column_widths', [])
|
||||
row_heights = metadata.get('row_heights', [])
|
||||
|
||||
if column_widths and row_heights:
|
||||
# Calculate cell positions from widths and heights
|
||||
cell_boxes = []
|
||||
rows = content.get('rows', len(row_heights)) if isinstance(content, dict) else len(row_heights)
|
||||
cols = content.get('cols', len(column_widths)) if isinstance(content, dict) else len(column_widths)
|
||||
|
||||
# Calculate cumulative positions
|
||||
x_positions = [tx0]
|
||||
for w in column_widths[:cols]:
|
||||
x_positions.append(x_positions[-1] + w)
|
||||
|
||||
y_positions = [ty0]
|
||||
for h in row_heights[:rows]:
|
||||
y_positions.append(y_positions[-1] + h)
|
||||
|
||||
# Create cell boxes for each cell (row-major order)
|
||||
for row_idx in range(rows):
|
||||
for col_idx in range(cols):
|
||||
if col_idx < len(x_positions) - 1 and row_idx < len(y_positions) - 1:
|
||||
cx0 = x_positions[col_idx]
|
||||
cy0 = y_positions[row_idx]
|
||||
cx1 = x_positions[col_idx + 1]
|
||||
cy1 = y_positions[row_idx + 1]
|
||||
cell_boxes.append([cx0, cy0, cx1, cy1])
|
||||
|
||||
logger.debug(f"Calculated {len(cell_boxes)} cell boxes from {cols} cols x {rows} rows")
|
||||
|
||||
# Normalize cell boxes for grid alignment
|
||||
if cell_boxes and hasattr(self, '_normalize_cell_boxes_to_grid'):
|
||||
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
|
||||
|
||||
# Draw cell borders
|
||||
if cell_boxes:
|
||||
pdf_canvas.setLineWidth(0.5)
|
||||
for box in cell_boxes:
|
||||
if len(box) >= 4:
|
||||
|
||||
Reference in New Issue
Block a user