feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -447,7 +447,8 @@ class PDFGeneratorService:
|
||||
'text': text_content,
|
||||
'bbox': bbox_polygon,
|
||||
'confidence': element.confidence or 1.0,
|
||||
'page': page_num
|
||||
'page': page_num,
|
||||
'element_type': element.type.value # Include element type for styling
|
||||
}
|
||||
|
||||
# Include style information if available (for Direct track)
|
||||
@@ -466,13 +467,24 @@ class PDFGeneratorService:
|
||||
else:
|
||||
html_content = str(element.content)
|
||||
|
||||
layout_elements.append({
|
||||
table_element = {
|
||||
'type': 'table',
|
||||
'content': html_content,
|
||||
'bbox': [element.bbox.x0, element.bbox.y0,
|
||||
element.bbox.x1, element.bbox.y1],
|
||||
'page': page_num - 1 # layout uses 0-based
|
||||
})
|
||||
}
|
||||
|
||||
# Preserve cell_boxes and embedded_images from metadata
|
||||
# These are extracted by PP-StructureV3 and used for accurate table rendering
|
||||
if element.metadata:
|
||||
if 'cell_boxes' in element.metadata:
|
||||
table_element['cell_boxes'] = element.metadata['cell_boxes']
|
||||
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
|
||||
if 'embedded_images' in element.metadata:
|
||||
table_element['embedded_images'] = element.metadata['embedded_images']
|
||||
|
||||
layout_elements.append(table_element)
|
||||
|
||||
# Add bbox to images_metadata for text overlap filtering
|
||||
# (no actual image file, just bbox for filtering)
|
||||
@@ -484,10 +496,10 @@ class PDFGeneratorService:
|
||||
'element_id': element.element_id
|
||||
})
|
||||
|
||||
# Handle image/visual elements
|
||||
# Handle image/visual elements (including stamps/seals)
|
||||
elif element.is_visual or element.type in [
|
||||
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
|
||||
ElementType.DIAGRAM, ElementType.LOGO
|
||||
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
|
||||
]:
|
||||
# Get image path using fallback logic
|
||||
image_path = self._get_image_path(element)
|
||||
@@ -729,13 +741,13 @@ class PDFGeneratorService:
|
||||
regions_to_avoid.append(element) # Tables are exclusion regions
|
||||
elif element.is_visual or element.type in [
|
||||
ElementType.IMAGE, ElementType.FIGURE,
|
||||
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
|
||||
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
|
||||
]:
|
||||
image_elements.append(element)
|
||||
# Only add real images to exclusion regions, NOT charts/diagrams
|
||||
# Charts often have large bounding boxes that include text labels
|
||||
# which should be rendered as selectable text on top
|
||||
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
|
||||
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
|
||||
regions_to_avoid.append(element)
|
||||
elif element.type == ElementType.LIST_ITEM:
|
||||
list_elements.append(element)
|
||||
@@ -934,11 +946,14 @@ class PDFGeneratorService:
|
||||
# Create PDF canvas with initial page size (will be updated per page)
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# Filter text regions to avoid overlap with tables/images
|
||||
regions_to_avoid = images_metadata
|
||||
# LAYERED RENDERING: Exclude tables from regions_to_avoid
|
||||
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
|
||||
# while table borders are drawn separately using cell_boxes
|
||||
# Only avoid overlap with actual images/figures/charts
|
||||
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
|
||||
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
|
||||
|
||||
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (含 {table_count} 個表格)")
|
||||
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
|
||||
|
||||
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
|
||||
|
||||
@@ -1042,7 +1057,8 @@ class PDFGeneratorService:
|
||||
for table_elem in page_table_regions:
|
||||
self.draw_table_region(
|
||||
pdf_canvas, table_elem, images_metadata,
|
||||
current_target_h, current_scale_w, current_scale_h
|
||||
current_target_h, current_scale_w, current_scale_h,
|
||||
result_dir=json_parent_dir
|
||||
)
|
||||
|
||||
# 3. Draw text (top layer)
|
||||
@@ -1542,8 +1558,8 @@ class PDFGeneratorService:
|
||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
|
||||
|
||||
# Set font with track-specific styling
|
||||
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
|
||||
style_info = region.get('style')
|
||||
element_type = region.get('element_type', 'text')
|
||||
is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or
|
||||
self.current_processing_track == ProcessingTrack.HYBRID)
|
||||
|
||||
@@ -1555,9 +1571,25 @@ class PDFGeneratorService:
|
||||
font_size = pdf_canvas._fontsize
|
||||
logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
|
||||
else:
|
||||
# OCR track or no style: Use simple font selection
|
||||
# OCR track or no style: Use simple font selection with element-type based styling
|
||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
|
||||
# Apply element-type specific styling (for OCR track)
|
||||
if element_type == 'title':
|
||||
# Titles: use larger, bold font
|
||||
font_size = min(font_size * 1.3, 36) # 30% larger, max 36pt
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
logger.debug(f"Applied title style: size={font_size:.1f}")
|
||||
elif element_type == 'header':
|
||||
# Headers: slightly larger
|
||||
font_size = min(font_size * 1.15, 24) # 15% larger, max 24pt
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
elif element_type == 'caption':
|
||||
# Captions: slightly smaller, italic if available
|
||||
font_size = max(font_size * 0.9, 6) # 10% smaller, min 6pt
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
else:
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
|
||||
# Handle line breaks (split text by newlines)
|
||||
# OCR track: simple left-aligned rendering
|
||||
@@ -1726,7 +1758,8 @@ class PDFGeneratorService:
|
||||
images_metadata: List[Dict],
|
||||
page_height: float,
|
||||
scale_w: float = 1.0,
|
||||
scale_h: float = 1.0
|
||||
scale_h: float = 1.0,
|
||||
result_dir: Optional[Path] = None
|
||||
):
|
||||
"""
|
||||
Draw a table region by parsing HTML and rebuilding with ReportLab Table
|
||||
@@ -1738,13 +1771,27 @@ class PDFGeneratorService:
|
||||
page_height: Height of page
|
||||
scale_w: Scale factor for X coordinates (PDF width / OCR width)
|
||||
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
|
||||
result_dir: Directory containing result files (for embedded images)
|
||||
"""
|
||||
try:
|
||||
html_content = table_element.get('content', '')
|
||||
if not html_content:
|
||||
return
|
||||
|
||||
# Parse HTML to extract table structure
|
||||
# Try to use cell_boxes for direct rendering first (more accurate)
|
||||
cell_boxes = table_element.get('cell_boxes', [])
|
||||
if cell_boxes:
|
||||
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
|
||||
success = self._draw_table_with_cell_boxes(
|
||||
pdf_canvas, table_element, page_height,
|
||||
scale_w, scale_h, result_dir
|
||||
)
|
||||
if success:
|
||||
return # Successfully rendered with cell_boxes
|
||||
|
||||
logger.info("[TABLE] Falling back to ReportLab Table")
|
||||
|
||||
# Fallback: Parse HTML to extract table structure and use ReportLab Table
|
||||
parser = HTMLTableParser()
|
||||
parser.feed(html_content)
|
||||
|
||||
@@ -1901,14 +1948,18 @@ class PDFGeneratorService:
|
||||
logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
|
||||
else:
|
||||
col_widths = [table_width / max_cols] * max_cols
|
||||
logger.info(f"[TABLE] Using equal distribution col widths")
|
||||
logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each")
|
||||
|
||||
# Row heights are used optionally (ReportLab can auto-size)
|
||||
row_heights = None
|
||||
# Row heights - ALWAYS use to ensure table fits bbox properly
|
||||
# Use computed heights from cell_boxes, or uniform distribution as fallback
|
||||
if computed_row_heights:
|
||||
# Scale row_heights to PDF coordinates
|
||||
row_heights = [h * scale_h for h in computed_row_heights]
|
||||
logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)")
|
||||
logger.info(f"[TABLE] Using cell_boxes row heights (scaled)")
|
||||
else:
|
||||
# Uniform distribution based on table bbox - ensures table fills its allocated space
|
||||
row_heights = [table_height / num_rows] * num_rows
|
||||
logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each")
|
||||
|
||||
# Create ReportLab Table
|
||||
# Use smaller font to fit content with auto-wrap
|
||||
@@ -1932,12 +1983,10 @@ class PDFGeneratorService:
|
||||
escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||
reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
|
||||
|
||||
# Create table with computed col widths
|
||||
# Note: We don't use row_heights even when available from cell_boxes because:
|
||||
# 1. ReportLab's auto-sizing handles content overflow better
|
||||
# 2. Fixed heights can cause text clipping when content exceeds cell size
|
||||
# 3. The col_widths from cell_boxes provide the main layout benefit
|
||||
table = Table(reportlab_data, colWidths=col_widths)
|
||||
# Create table with col widths and row heights
|
||||
# Always use row_heights to ensure table fits bbox properly
|
||||
table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights)
|
||||
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
|
||||
|
||||
# Apply table style
|
||||
style = TableStyle([
|
||||
@@ -1974,26 +2023,303 @@ class PDFGeneratorService:
|
||||
scale_y = table_height / actual_height if actual_height > table_height else 1.0
|
||||
scale_factor = min(scale_x, scale_y) # Use smaller scale to fit both dimensions
|
||||
|
||||
# Calculate the table top position in PDF coordinates
|
||||
# ReportLab uses bottom-left origin, so we need to position from TOP
|
||||
pdf_y_top = page_height - ocr_y_top # Top of table in PDF coords
|
||||
|
||||
# Calculate the actual bottom position based on scaled height
|
||||
# Table should be positioned so its TOP aligns with the bbox top
|
||||
scaled_height = actual_height * scale_factor
|
||||
pdf_y_bottom = pdf_y_top - scaled_height # Bottom of scaled table
|
||||
|
||||
logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}")
|
||||
|
||||
if scale_factor < 1.0:
|
||||
logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
|
||||
# Apply scaling transformation
|
||||
pdf_canvas.saveState()
|
||||
pdf_canvas.translate(pdf_x, pdf_y)
|
||||
pdf_canvas.translate(pdf_x, pdf_y_bottom)
|
||||
pdf_canvas.scale(scale_factor, scale_factor)
|
||||
# Draw at origin since we've already translated
|
||||
table.drawOn(pdf_canvas, 0, 0)
|
||||
pdf_canvas.restoreState()
|
||||
else:
|
||||
# Draw table at position without scaling
|
||||
table.drawOn(pdf_canvas, pdf_x, pdf_y)
|
||||
# pdf_y should be the bottom of the table
|
||||
table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom)
|
||||
|
||||
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
|
||||
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows")
|
||||
|
||||
# Draw embedded images (images detected inside the table region)
|
||||
embedded_images = table_element.get('embedded_images', [])
|
||||
if embedded_images and result_dir:
|
||||
logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
|
||||
for emb_img in embedded_images:
|
||||
self._draw_embedded_image(
|
||||
pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to draw table region: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def _draw_embedded_image(
    self,
    pdf_canvas: canvas.Canvas,
    emb_img: Dict,
    page_height: float,
    result_dir: Path,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Render a single image that was detected inside a table region.

    The image file is located under ``result_dir`` from the entry's
    ``saved_path`` (first as recorded, then by bare file name). The entry's
    ``bbox`` is read as [x0, y0, x1, y1], scaled by ``scale_w``/``scale_h``
    and flipped vertically against ``page_height`` before drawing.
    Every failure is logged as a warning and swallowed, so a broken image
    never aborts the surrounding table rendering.

    Args:
        pdf_canvas: ReportLab canvas to draw on
        emb_img: Embedded image dict; reads the 'saved_path' and 'bbox' keys
        page_height: Height of page in PDF coordinates
        result_dir: Directory containing result files (image lookup root)
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
    """
    try:
        rel_path = emb_img.get('saved_path', '')
        if not rel_path:
            return

        # Prefer the recorded relative path; otherwise look for the bare
        # file name directly inside result_dir.
        candidate = result_dir / rel_path
        if not candidate.exists():
            candidate = result_dir / Path(rel_path).name
        if not candidate.exists():
            logger.warning(f"Embedded image not found: {rel_path}")
            return

        box = emb_img.get('bbox', [])
        if not box or len(box) < 4:
            logger.warning(f"No bbox for embedded image: {rel_path}")
            return

        # Scale the four bbox edges into PDF units
        left = box[0] * scale_w
        top = box[1] * scale_h
        right = box[2] * scale_w
        bottom = box[3] * scale_h

        width = right - left
        height = bottom - top

        # ReportLab's origin is bottom-left, so anchor at the image's bottom edge
        pdf_x = left
        pdf_y = page_height - bottom

        from reportlab.lib.utils import ImageReader
        pdf_canvas.drawImage(
            ImageReader(str(candidate)),
            pdf_x,
            pdf_y,
            width=width,
            height=height,
            preserveAspectRatio=True,
            mask='auto',
        )

        logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}")

    except Exception as e:
        logger.warning(f"Failed to draw embedded image: {e}")
|
||||
|
||||
def _normalize_cell_boxes_to_grid(
    self,
    cell_boxes: List[List[float]],
    threshold: float = 10.0
) -> List[List[float]]:
    """
    Normalize cell boxes to create a proper aligned grid.

    All left/right edges (and, separately, all top/bottom edges) are sorted
    and grouped into clusters; every edge in a cluster is replaced by the
    cluster's average. This removes the small per-cell coordinate jitter
    that makes table borders look skewed.

    Clustering is chained: an edge joins the current cluster when it lies
    within ``threshold`` of the previous edge in sorted order, so a cluster
    may span more than ``threshold`` in total.

    If snapping would collapse a cell that originally had a positive extent
    on an axis (both of its edges landed on the same value, or swapped),
    that cell keeps its original coordinates on that axis so its border
    stays visible.

    Args:
        cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]
        threshold: Maximum distance to consider coordinates as "same line"

    Returns:
        Normalized cell_boxes with aligned coordinates. Inputs with fewer
        than two boxes are returned unchanged (same object).

    Raises:
        IndexError: If a box has fewer than four coordinates.
    """
    if not cell_boxes or len(cell_boxes) < 2:
        return cell_boxes

    def snap_axis(low_idx, high_idx):
        """Map (box index, is_low_edge) -> snapped coordinate for one axis."""
        edges = []
        for box_idx, box in enumerate(cell_boxes):
            edges.append((box[low_idx], box_idx, True))
            edges.append((box[high_idx], box_idx, False))
        edges.sort(key=lambda edge: edge[0])

        # Chain neighbouring edges into clusters
        clusters = [[edges[0]]]
        for edge in edges[1:]:
            if edge[0] - clusters[-1][-1][0] <= threshold:
                clusters[-1].append(edge)
            else:
                clusters.append([edge])

        # Every edge in a cluster snaps to the cluster average
        snapped = {}
        for cluster in clusters:
            centre = sum(value for value, _, _ in cluster) / len(cluster)
            for _, box_idx, is_low_edge in cluster:
                snapped[(box_idx, is_low_edge)] = centre
        return snapped

    x_mapping = snap_axis(0, 2)
    y_mapping = snap_axis(1, 3)

    normalized_boxes = []
    for i, box in enumerate(cell_boxes):
        x1_norm, x2_norm = x_mapping[(i, True)], x_mapping[(i, False)]
        y1_norm, y2_norm = y_mapping[(i, True)], y_mapping[(i, False)]

        # Never let snapping turn a real cell into a zero-size one: cells
        # narrower/shorter than the threshold keep their own edges.
        if x2_norm <= x1_norm and box[2] > box[0]:
            x1_norm, x2_norm = box[0], box[2]
        if y2_norm <= y1_norm and box[3] > box[1]:
            y1_norm, y2_norm = box[1], box[3]

        normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm])

    logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
    return normalized_boxes
|
||||
|
||||
def _draw_table_with_cell_boxes(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    result_dir: Optional[Path] = None
) -> bool:
    """
    Draw table borders using cell_boxes for accurate positioning.

    LAYERED RENDERING APPROACH:
    - This method ONLY draws cell borders and embedded images
    - Text is rendered separately using raw OCR positions (via GapFillingService)
    - This decouples visual structure (borders) from content (text)

    FALLBACK: If cell_boxes are incomplete, always draws the outer table
    border using the table's bbox to ensure table boundaries are visible.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict; reads 'cell_boxes', 'bbox'
            (list [x0, y0, x1, y1] or dict with x0/y0/x1/y1 keys) and
            'embedded_images'
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
        result_dir: Directory containing result files (for embedded images);
            embedded images are skipped when this is None

    Returns:
        True when the borders were drawn (including the outer-border-only
        case with no cell_boxes); False when an exception occurred. Anything
        drawn before the failure stays on the canvas.
    """

    def draw_embedded_images(announce: bool) -> None:
        """Draw the table's embedded images, if any and if result_dir is set."""
        embedded_images = table_element.get('embedded_images', [])
        if not (embedded_images and result_dir):
            return
        if announce:
            logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
        for emb_img in embedded_images:
            self._draw_embedded_image(
                pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
            )

    try:
        cell_boxes = table_element.get('cell_boxes', [])

        # Always draw outer table border first (fallback for incomplete cell_boxes)
        table_bbox = table_element.get('bbox', [])
        if table_bbox and len(table_bbox) >= 4:
            # Handle different bbox formats (list or dict)
            if isinstance(table_bbox, dict):
                tx1 = float(table_bbox.get('x0', 0))
                ty1 = float(table_bbox.get('y0', 0))
                tx2 = float(table_bbox.get('x1', 0))
                ty2 = float(table_bbox.get('y1', 0))
            else:
                tx1, ty1, tx2, ty2 = table_bbox[:4]

            # Apply scaling
            tx1_scaled = tx1 * scale_w
            ty1_scaled = ty1 * scale_h
            tx2_scaled = tx2 * scale_w
            ty2_scaled = ty2 * scale_h

            table_width = tx2_scaled - tx1_scaled
            table_height = ty2_scaled - ty1_scaled

            # Transform Y coordinate (PDF uses bottom-left origin)
            pdf_x = tx1_scaled
            pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords

            # Draw outer table border (slightly thicker for visibility)
            pdf_canvas.setStrokeColor(colors.black)
            pdf_canvas.setLineWidth(1.0)
            pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
            logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")

        if not cell_boxes:
            logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
            # Still draw embedded images even without cell borders
            draw_embedded_images(announce=False)
            return True  # Outer border drawn successfully

        # Normalize cell boxes to create aligned grid
        cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)

        logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")

        # Stroke settings are the same for every cell, so set them once
        pdf_canvas.setStrokeColor(colors.black)
        pdf_canvas.setLineWidth(0.5)

        for box in cell_boxes:
            x1, y1, x2, y2 = box[0], box[1], box[2], box[3]

            # Apply scaling
            x1_scaled = x1 * scale_w
            y1_scaled = y1 * scale_h
            x2_scaled = x2 * scale_w
            y2_scaled = y2 * scale_h

            cell_width = x2_scaled - x1_scaled
            cell_height = y2_scaled - y1_scaled

            # Transform Y coordinate (PDF uses bottom-left origin);
            # rect() is anchored at the cell's bottom-left corner
            pdf_x = x1_scaled
            pdf_y = page_height - y2_scaled

            # Draw cell border only (no fill, no text)
            pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)

        logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")

        draw_embedded_images(announce=True)

        return True

    except Exception as e:
        # Best-effort rendering: report through the logger (with traceback)
        # and let the caller fall back to another rendering path.
        logger.warning(f"[TABLE] Failed to draw cell borders: {e}", exc_info=True)
        return False
|
||||
|
||||
def draw_image_region(
|
||||
self,
|
||||
pdf_canvas: canvas.Canvas,
|
||||
@@ -2923,12 +3249,29 @@ class PDFGeneratorService:
|
||||
from reportlab.platypus import Table, TableStyle
|
||||
from reportlab.lib import colors
|
||||
|
||||
# Determine number of rows and columns for cell_boxes calculation
|
||||
num_rows = len(rows)
|
||||
max_cols = max(len(row['cells']) for row in rows) if rows else 0
|
||||
|
||||
# Use original column widths from extraction if available
|
||||
# Otherwise let ReportLab auto-calculate
|
||||
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
|
||||
col_widths = None
|
||||
if element.metadata and 'column_widths' in element.metadata:
|
||||
col_widths = element.metadata['column_widths']
|
||||
logger.debug(f"Using extracted column widths: {col_widths}")
|
||||
elif element.metadata and 'cell_boxes' in element.metadata:
|
||||
# Use cell_boxes from PP-StructureV3 for accurate column/row sizing
|
||||
cell_boxes = element.metadata['cell_boxes']
|
||||
cell_boxes_source = element.metadata.get('cell_boxes_source', 'unknown')
|
||||
table_bbox_list = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
|
||||
logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
|
||||
|
||||
computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
|
||||
cell_boxes, table_bbox_list, num_rows, max_cols
|
||||
)
|
||||
if computed_col_widths:
|
||||
col_widths = computed_col_widths
|
||||
logger.info(f"[TABLE] Computed {len(col_widths)} column widths from cell_boxes")
|
||||
|
||||
# NOTE: Don't use rowHeights from extraction - it causes content overlap
|
||||
# The extracted row heights are based on cell boundaries, not text content height.
|
||||
|
||||
Reference in New Issue
Block a user