feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions

View File

@@ -447,7 +447,8 @@ class PDFGeneratorService:
'text': text_content,
'bbox': bbox_polygon,
'confidence': element.confidence or 1.0,
'page': page_num
'page': page_num,
'element_type': element.type.value # Include element type for styling
}
# Include style information if available (for Direct track)
@@ -466,13 +467,24 @@ class PDFGeneratorService:
else:
html_content = str(element.content)
layout_elements.append({
table_element = {
'type': 'table',
'content': html_content,
'bbox': [element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1],
'page': page_num - 1 # layout uses 0-based
})
}
# Preserve cell_boxes and embedded_images from metadata
# These are extracted by PP-StructureV3 and used for accurate table rendering
if element.metadata:
if 'cell_boxes' in element.metadata:
table_element['cell_boxes'] = element.metadata['cell_boxes']
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
if 'embedded_images' in element.metadata:
table_element['embedded_images'] = element.metadata['embedded_images']
layout_elements.append(table_element)
# Add bbox to images_metadata for text overlap filtering
# (no actual image file, just bbox for filtering)
@@ -484,10 +496,10 @@ class PDFGeneratorService:
'element_id': element.element_id
})
# Handle image/visual elements
# Handle image/visual elements (including stamps/seals)
elif element.is_visual or element.type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# Get image path using fallback logic
image_path = self._get_image_path(element)
@@ -729,13 +741,13 @@ class PDFGeneratorService:
regions_to_avoid.append(element) # Tables are exclusion regions
elif element.is_visual or element.type in [
ElementType.IMAGE, ElementType.FIGURE,
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
image_elements.append(element)
# Only add real images to exclusion regions, NOT charts/diagrams
# Charts often have large bounding boxes that include text labels
# which should be rendered as selectable text on top
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
regions_to_avoid.append(element)
elif element.type == ElementType.LIST_ITEM:
list_elements.append(element)
@@ -934,11 +946,14 @@ class PDFGeneratorService:
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# Filter text regions to avoid overlap with tables/images
regions_to_avoid = images_metadata
# LAYERED RENDERING: Exclude tables from regions_to_avoid
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
# while table borders are drawn separately using cell_boxes
# Only avoid overlap with actual images/figures/charts
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 ( {table_count} 個表格)")
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
@@ -1042,7 +1057,8 @@ class PDFGeneratorService:
for table_elem in page_table_regions:
self.draw_table_region(
pdf_canvas, table_elem, images_metadata,
current_target_h, current_scale_w, current_scale_h
current_target_h, current_scale_w, current_scale_h,
result_dir=json_parent_dir
)
# 3. Draw text (top layer)
@@ -1542,8 +1558,8 @@ class PDFGeneratorService:
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
# Set font with track-specific styling
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
style_info = region.get('style')
element_type = region.get('element_type', 'text')
is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or
self.current_processing_track == ProcessingTrack.HYBRID)
@@ -1555,9 +1571,25 @@ class PDFGeneratorService:
font_size = pdf_canvas._fontsize
logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
else:
# OCR track or no style: Use simple font selection
# OCR track or no style: Use simple font selection with element-type based styling
font_name = self.font_name if self.font_registered else 'Helvetica'
pdf_canvas.setFont(font_name, font_size)
# Apply element-type specific styling (for OCR track)
if element_type == 'title':
# Titles: use larger, bold font
font_size = min(font_size * 1.3, 36) # 30% larger, max 36pt
pdf_canvas.setFont(font_name, font_size)
logger.debug(f"Applied title style: size={font_size:.1f}")
elif element_type == 'header':
# Headers: slightly larger
font_size = min(font_size * 1.15, 24) # 15% larger, max 24pt
pdf_canvas.setFont(font_name, font_size)
elif element_type == 'caption':
# Captions: slightly smaller, italic if available
font_size = max(font_size * 0.9, 6) # 10% smaller, min 6pt
pdf_canvas.setFont(font_name, font_size)
else:
pdf_canvas.setFont(font_name, font_size)
# Handle line breaks (split text by newlines)
# OCR track: simple left-aligned rendering
@@ -1726,7 +1758,8 @@ class PDFGeneratorService:
images_metadata: List[Dict],
page_height: float,
scale_w: float = 1.0,
scale_h: float = 1.0
scale_h: float = 1.0,
result_dir: Optional[Path] = None
):
"""
Draw a table region by parsing HTML and rebuilding with ReportLab Table
@@ -1738,13 +1771,27 @@ class PDFGeneratorService:
page_height: Height of page
scale_w: Scale factor for X coordinates (PDF width / OCR width)
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
result_dir: Directory containing result files (for embedded images)
"""
try:
html_content = table_element.get('content', '')
if not html_content:
return
# Parse HTML to extract table structure
# Try to use cell_boxes for direct rendering first (more accurate)
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
return # Successfully rendered with cell_boxes
logger.info("[TABLE] Falling back to ReportLab Table")
# Fallback: Parse HTML to extract table structure and use ReportLab Table
parser = HTMLTableParser()
parser.feed(html_content)
@@ -1901,14 +1948,18 @@ class PDFGeneratorService:
logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
else:
col_widths = [table_width / max_cols] * max_cols
logger.info(f"[TABLE] Using equal distribution col widths")
logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each")
# Row heights are used optionally (ReportLab can auto-size)
row_heights = None
# Row heights - ALWAYS use to ensure table fits bbox properly
# Use computed heights from cell_boxes, or uniform distribution as fallback
if computed_row_heights:
# Scale row_heights to PDF coordinates
row_heights = [h * scale_h for h in computed_row_heights]
logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)")
logger.info(f"[TABLE] Using cell_boxes row heights (scaled)")
else:
# Uniform distribution based on table bbox - ensures table fills its allocated space
row_heights = [table_height / num_rows] * num_rows
logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each")
# Create ReportLab Table
# Use smaller font to fit content with auto-wrap
@@ -1932,12 +1983,10 @@ class PDFGeneratorService:
escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
# Create table with computed col widths
# Note: We don't use row_heights even when available from cell_boxes because:
# 1. ReportLab's auto-sizing handles content overflow better
# 2. Fixed heights can cause text clipping when content exceeds cell size
# 3. The col_widths from cell_boxes provide the main layout benefit
table = Table(reportlab_data, colWidths=col_widths)
# Create table with col widths and row heights
# Always use row_heights to ensure table fits bbox properly
table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights)
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
# Apply table style
style = TableStyle([
@@ -1974,26 +2023,303 @@ class PDFGeneratorService:
scale_y = table_height / actual_height if actual_height > table_height else 1.0
scale_factor = min(scale_x, scale_y) # Use smaller scale to fit both dimensions
# Calculate the table top position in PDF coordinates
# ReportLab uses bottom-left origin, so we need to position from TOP
pdf_y_top = page_height - ocr_y_top # Top of table in PDF coords
# Calculate the actual bottom position based on scaled height
# Table should be positioned so its TOP aligns with the bbox top
scaled_height = actual_height * scale_factor
pdf_y_bottom = pdf_y_top - scaled_height # Bottom of scaled table
logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}")
if scale_factor < 1.0:
logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
# Apply scaling transformation
pdf_canvas.saveState()
pdf_canvas.translate(pdf_x, pdf_y)
pdf_canvas.translate(pdf_x, pdf_y_bottom)
pdf_canvas.scale(scale_factor, scale_factor)
# Draw at origin since we've already translated
table.drawOn(pdf_canvas, 0, 0)
pdf_canvas.restoreState()
else:
# Draw table at position without scaling
table.drawOn(pdf_canvas, pdf_x, pdf_y)
# pdf_y should be the bottom of the table
table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom)
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows")
# Draw embedded images (images detected inside the table region)
embedded_images = table_element.get('embedded_images', [])
if embedded_images and result_dir:
logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
)
except Exception as e:
logger.warning(f"Failed to draw table region: {e}")
import traceback
traceback.print_exc()
def _draw_embedded_image(
    self,
    pdf_canvas: canvas.Canvas,
    emb_img: Dict,
    page_height: float,
    result_dir: Path,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Render one image that was detected inside a table region.

    Best-effort: every failure is logged as a warning and swallowed, so a
    broken image never aborts rendering of the surrounding table.

    Args:
        pdf_canvas: ReportLab canvas to draw on
        emb_img: Image record; reads 'saved_path' and 'bbox' ([x0, y0, x1, y1])
        page_height: Page height used to flip the Y axis
        result_dir: Directory the saved image path is resolved against
        scale_w: Scale factor applied to X coordinates
        scale_h: Scale factor applied to Y coordinates
    """
    try:
        saved_path = emb_img.get('saved_path', '')
        if not saved_path:
            # Nothing was saved for this entry; silently skip it
            return

        # Resolve the file: first the path as recorded, then only its
        # file name placed directly under result_dir
        image_path = result_dir / saved_path
        if not image_path.exists():
            image_path = result_dir / Path(saved_path).name
            if not image_path.exists():
                logger.warning(f"Embedded image not found: {saved_path}")
                return

        bbox = emb_img.get('bbox', [])
        has_full_bbox = bool(bbox) and len(bbox) >= 4
        if not has_full_bbox:
            logger.warning(f"No bbox for embedded image: {saved_path}")
            return

        # Scale the four corners
        left = bbox[0] * scale_w
        top = bbox[1] * scale_h
        right = bbox[2] * scale_w
        bottom = bbox[3] * scale_h

        width = right - left
        height = bottom - top

        # ReportLab's origin is bottom-left, so anchor at the image's lower edge
        pdf_x = left
        pdf_y = page_height - bottom

        from reportlab.lib.utils import ImageReader
        pdf_canvas.drawImage(
            ImageReader(str(image_path)),
            pdf_x,
            pdf_y,
            width,
            height,
            preserveAspectRatio=True,
            mask='auto'
        )
        logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}")
    except Exception as e:
        logger.warning(f"Failed to draw embedded image: {e}")
def _normalize_cell_boxes_to_grid(
    self,
    cell_boxes: List[List[float]],
    threshold: float = 10.0
) -> List[List[float]]:
    """
    Normalize cell boxes to create a proper aligned grid.

    Edge coordinates are clustered per axis: after sorting, an edge joins
    the current cluster when it lies within ``threshold`` of the previous
    edge. Every edge in a cluster is replaced by the cluster mean, which
    removes the few-pixel variations that cause skewed tables.

    Because clustering is chained, both edges of a cell that is no wider
    (or taller) than ``threshold`` always land in the same cluster and
    would snap to the same value. When that happens to a box that
    originally had positive extent on that axis, the original coordinates
    are kept for that axis so the cell does not collapse to zero size.

    Args:
        cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]
        threshold: Maximum gap between consecutive sorted coordinates for
            them to be treated as the same grid line

    Returns:
        New list of [x1, y1, x2, y2] boxes with aligned coordinates. Inputs
        with fewer than two boxes are returned unchanged (same object).

    Raises:
        IndexError: If a box has fewer than four coordinates.
    """
    if not cell_boxes or len(cell_boxes) < 2:
        return cell_boxes

    # Each edge is recorded as (value, box_idx, is_first) where is_first
    # marks the left/top edge and False marks the right/bottom edge.
    x_edges = []
    y_edges = []
    for idx, box in enumerate(cell_boxes):
        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
        x_edges += [(x1, idx, True), (x2, idx, False)]
        y_edges += [(y1, idx, True), (y2, idx, False)]

    def snap_edges(edges):
        """Return {(box_idx, is_first): cluster mean} for one axis."""
        ordered = sorted(edges, key=lambda edge: edge[0])

        clusters = [[ordered[0]]]
        for edge in ordered[1:]:
            if edge[0] - clusters[-1][-1][0] <= threshold:
                clusters[-1].append(edge)
            else:
                clusters.append([edge])

        snapped = {}
        for cluster in clusters:
            mean = sum(value for value, _, _ in cluster) / len(cluster)
            for _, box_idx, is_first in cluster:
                snapped[(box_idx, is_first)] = mean
        return snapped

    def resolve(snapped, idx, low, high):
        """Pick snapped edges unless they would collapse a valid extent."""
        new_low = snapped[(idx, True)]
        new_high = snapped[(idx, False)]
        if new_low >= new_high and low < high:
            return low, high, True
        return new_low, new_high, False

    x_snapped = snap_edges(x_edges)
    y_snapped = snap_edges(y_edges)

    normalized_boxes = []
    preserved = 0
    for idx, box in enumerate(cell_boxes):
        x1_norm, x2_norm, x_kept = resolve(x_snapped, idx, box[0], box[2])
        y1_norm, y2_norm, y_kept = resolve(y_snapped, idx, box[1], box[3])
        if x_kept or y_kept:
            preserved += 1
        normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm])

    if preserved:
        logger.debug(
            f"[TABLE] Kept original edges for {preserved} cell boxes "
            f"that would have collapsed during grid normalization"
        )
    logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
    return normalized_boxes
def _draw_table_with_cell_boxes(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    result_dir: Optional[Path] = None
):
    """
    Draw table borders using cell_boxes for accurate positioning.

    LAYERED RENDERING APPROACH:
    - This method ONLY draws cell borders and embedded images
    - Text is rendered separately using raw OCR positions (via GapFillingService)
    - This decouples visual structure (borders) from content (text)

    FALLBACK: The outer table border is always drawn from the table's bbox
    (when one with at least four values is present), so the table boundary
    stays visible even if cell_boxes are missing or incomplete.

    The canvas stroke colour and line width are left as last set here;
    no saveState/restoreState is performed.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict; reads 'cell_boxes', 'bbox'
            (list [x0, y0, x1, y1] or dict with x0/y0/x1/y1 keys) and
            'embedded_images'
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
        result_dir: Directory containing result files (for embedded images);
            embedded images are skipped when this is not provided

    Returns:
        True when drawing ran to completion, False when an exception
        interrupted it (the failure is logged with its traceback).
    """
    try:
        cell_boxes = table_element.get('cell_boxes', [])
        embedded_images = table_element.get('embedded_images', [])

        # Always draw outer table border first (fallback for incomplete cell_boxes)
        table_bbox = table_element.get('bbox', [])
        if table_bbox and len(table_bbox) >= 4:
            # Handle different bbox formats (list or dict)
            if isinstance(table_bbox, dict):
                tx1 = float(table_bbox.get('x0', 0))
                ty1 = float(table_bbox.get('y0', 0))
                tx2 = float(table_bbox.get('x1', 0))
                ty2 = float(table_bbox.get('y1', 0))
            else:
                tx1, ty1, tx2, ty2 = table_bbox[:4]

            # Apply scaling
            tx1_scaled = tx1 * scale_w
            ty1_scaled = ty1 * scale_h
            tx2_scaled = tx2 * scale_w
            ty2_scaled = ty2 * scale_h

            table_width = tx2_scaled - tx1_scaled
            table_height = ty2_scaled - ty1_scaled

            # Transform Y coordinate (PDF uses bottom-left origin)
            pdf_x = tx1_scaled
            pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords

            # Draw outer table border (slightly thicker for visibility)
            pdf_canvas.setStrokeColor(colors.black)
            pdf_canvas.setLineWidth(1.0)
            pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
            logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")

        if not cell_boxes:
            logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
        else:
            # Normalize cell boxes to create aligned grid
            cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)

            logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")

            # Stroke settings are identical for every cell, so set them once
            pdf_canvas.setStrokeColor(colors.black)
            pdf_canvas.setLineWidth(0.5)

            for box in cell_boxes:
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]

                # Apply scaling
                x1_scaled = x1 * scale_w
                y1_scaled = y1 * scale_h
                x2_scaled = x2 * scale_w
                y2_scaled = y2 * scale_h

                cell_width = x2_scaled - x1_scaled
                cell_height = y2_scaled - y1_scaled

                # Transform Y coordinate (PDF uses bottom-left origin);
                # rect() is anchored at the cell's bottom-left corner
                pdf_canvas.rect(
                    x1_scaled, page_height - y2_scaled,
                    cell_width, cell_height,
                    stroke=1, fill=0
                )

            logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")

        # Embedded images are drawn whether or not cell borders were available
        if embedded_images and result_dir:
            logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
            for emb_img in embedded_images:
                self._draw_embedded_image(
                    pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
                )

        return True

    except Exception as e:
        # Route the traceback through the logger instead of printing to stderr
        logger.warning(f"[TABLE] Failed to draw cell borders: {e}", exc_info=True)
        return False
def draw_image_region(
self,
pdf_canvas: canvas.Canvas,
@@ -2923,12 +3249,29 @@ class PDFGeneratorService:
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
# Determine number of rows and columns for cell_boxes calculation
num_rows = len(rows)
max_cols = max(len(row['cells']) for row in rows) if rows else 0
# Use original column widths from extraction if available
# Otherwise let ReportLab auto-calculate
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
col_widths = None
if element.metadata and 'column_widths' in element.metadata:
col_widths = element.metadata['column_widths']
logger.debug(f"Using extracted column widths: {col_widths}")
elif element.metadata and 'cell_boxes' in element.metadata:
# Use cell_boxes from PP-StructureV3 for accurate column/row sizing
cell_boxes = element.metadata['cell_boxes']
cell_boxes_source = element.metadata.get('cell_boxes_source', 'unknown')
table_bbox_list = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, table_bbox_list, num_rows, max_cols
)
if computed_col_widths:
col_widths = computed_col_widths
logger.info(f"[TABLE] Computed {len(col_widths)} column widths from cell_boxes")
# NOTE: Don't use rowHeights from extraction - it causes content overlap
# The extracted row heights are based on cell boundaries, not text content height.