feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-30 13:21:50 +08:00
Parent: f5a2c8a750
Commit: 95ae1f1bdb

17 changed files with 1906 additions and 344 deletions

View File

@@ -104,7 +104,15 @@ class Settings(BaseSettings):
    # Now using None to let PaddleX use its optimized defaults.
    layout_detection_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
    layout_nms_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
-    layout_merge_mode: Optional[str] = Field(default=None)  # None = use PaddleX default
+    # layout_merge_bboxes_mode options:
+    #   - "large": keep the larger box when boxes overlap (default)
+    #   - "small": keep the smaller box when boxes overlap
+    #   - "union": keep all boxes (preserves overlapping tables/images)
+    # Using "union" to prevent adjacent tables from being merged together.
+    layout_merge_mode: Optional[str] = Field(
+        default="union",
+        description="How to handle overlapping detection boxes. 'union' preserves all detected regions."
+    )
    layout_unclip_ratio: Optional[float] = Field(default=None)  # None = use PaddleX default

    # Text Detection Parameters
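For context, the new default presumably reaches PaddleX when the structure engine is constructed. A minimal sketch, assuming PPStructureV3 accepts the layout_merge_bboxes_mode keyword named in the comment above (the project's actual wiring lives in the OCR service, not here):

    from paddleocr import PPStructureV3

    # Hedged sketch, not the project's actual call site:
    # "union" keeps overlapping layout boxes instead of merging them.
    engine = PPStructureV3(layout_merge_bboxes_mode="union")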
@@ -161,13 +169,8 @@ class Settings(BaseSettings):
description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy." description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
) )
# Table Cell Boxes Extraction - supplement PPStructureV3 with direct SLANeXt calls # Note: Table cell boxes are now extracted from table_res_list returned by PPStructureV3
# When enabled, directly invokes SLANeXt models to extract cell bounding boxes # No additional model calls needed - PPStructureV3 provides cell_box_list in table_res_list
# which are not exposed by the PPStructureV3 high-level API
enable_table_cell_boxes_extraction: bool = Field(
default=True,
description="Enable direct SLANeXt model calls to extract table cell bounding boxes for accurate PDF layout."
)
# Formula Recognition Model Configuration (Stage 4) # Formula Recognition Model Configuration (Stage 4)
# Available models: # Available models:

View File

@@ -40,6 +40,7 @@ from app.schemas.task import (
    PreprocessingPreviewRequest,
    PreprocessingPreviewResponse,
    ImageQualityMetrics,
+    TableDetectionConfig,
)
from app.services.task_service import task_service
from app.services.file_access_service import file_access_service
@@ -75,7 +76,8 @@ def process_task_ocr(
    language: str = 'ch',
    layout_model: Optional[str] = "chinese",
    preprocessing_mode: Optional[str] = "auto",
-    preprocessing_config: Optional[dict] = None
+    preprocessing_config: Optional[dict] = None,
+    table_detection_config: Optional[dict] = None
):
    """
    Background task to process OCR for a task with dual-track support.
@@ -94,6 +96,7 @@ def process_task_ocr(
        layout_model: Layout detection model ('chinese', 'default', 'cdla')
        preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
        preprocessing_config: Manual preprocessing config dict (contrast, sharpen, binarize)
+        table_detection_config: Table detection config dict (enable_wired_table, enable_wireless_table, enable_region_detection)
    """
    from app.core.database import SessionLocal
    from app.models.task import Task
@@ -106,6 +109,7 @@ def process_task_ocr(
logger.info(f"Starting OCR processing for task {task_id}, file: {filename}") logger.info(f"Starting OCR processing for task {task_id}, file: {filename}")
logger.info(f"Processing options: dual_track={use_dual_track}, force_track={force_track}, lang={language}") logger.info(f"Processing options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
logger.info(f"Preprocessing options: mode={preprocessing_mode}, config={preprocessing_config}") logger.info(f"Preprocessing options: mode={preprocessing_mode}, config={preprocessing_config}")
logger.info(f"Table detection options: {table_detection_config}")
# Convert preprocessing parameters to proper types # Convert preprocessing parameters to proper types
preprocess_mode_enum = None preprocess_mode_enum = None
@@ -122,6 +126,15 @@ def process_task_ocr(
            binarize=preprocessing_config.get("binarize", False)
        )

+    # Convert table detection config to object
+    table_det_config_obj = None
+    if table_detection_config:
+        table_det_config_obj = TableDetectionConfig(
+            enable_wired_table=table_detection_config.get("enable_wired_table", True),
+            enable_wireless_table=table_detection_config.get("enable_wireless_table", True),
+            enable_region_detection=table_detection_config.get("enable_region_detection", True)
+        )
+
    # Get task directly by database ID (bypass user isolation for background task)
    task = db.query(Task).filter(Task.id == task_db_id).first()
    if not task:
@@ -170,7 +183,8 @@ def process_task_ocr(
                force_track=force_track,
                layout_model=layout_model,
                preprocessing_mode=preprocess_mode_enum,
-                preprocessing_config=preprocess_config_obj
+                preprocessing_config=preprocess_config_obj,
+                table_detection_config=table_det_config_obj
            )
        else:
            # Fall back to traditional processing (no force_track support)
@@ -181,7 +195,8 @@ def process_task_ocr(
                output_dir=result_dir,
                layout_model=layout_model,
                preprocessing_mode=preprocess_mode_enum,
-                preprocessing_config=preprocess_config_obj
+                preprocessing_config=preprocess_config_obj,
+                table_detection_config=table_det_config_obj
            )

        # Calculate processing time
@@ -754,6 +769,7 @@ async def start_task(
    - **force_track**: Force specific processing track ('ocr' or 'direct')
    - **language**: OCR language code (default: 'ch')
    - **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
+    - **table_detection**: Table detection config (enable_wired_table, enable_wireless_table, enable_region_detection)
    """
    try:
        # Parse processing options with defaults
@@ -781,6 +797,16 @@ async def start_task(
            }
        logger.info(f"Preprocessing: mode={preprocessing_mode}, config={preprocessing_config}")

+        # Extract table detection options
+        table_detection_config = None
+        if options.table_detection:
+            table_detection_config = {
+                "enable_wired_table": options.table_detection.enable_wired_table,
+                "enable_wireless_table": options.table_detection.enable_wireless_table,
+                "enable_region_detection": options.table_detection.enable_region_detection
+            }
+            logger.info(f"Table detection: {table_detection_config}")
+
        # Get task details
        task = task_service.get_task_by_id(
            db=db,
@@ -829,11 +855,12 @@ async def start_task(
            language=language,
            layout_model=layout_model,
            preprocessing_mode=preprocessing_mode,
-            preprocessing_config=preprocessing_config
+            preprocessing_config=preprocessing_config,
+            table_detection_config=table_detection_config
        )

        logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
-        logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}")
+        logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}, table_detection={table_detection_config}")

        return task

    except HTTPException:
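A client call exercising the new option might look like this (a hedged sketch: the endpoint path, base URL, and auth header are assumptions based on the route handler and options parsing shown above):

    import requests

    # Hypothetical values; task_id and token come from earlier API calls.
    resp = requests.post(
        f"http://localhost:8000/api/tasks/{task_id}/start",
        headers={"Authorization": f"Bearer {token}"},
        json={
            "table_detection": {
                "enable_wired_table": True,
                "enable_wireless_table": False,
                "enable_region_detection": True,
            }
        },
    )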

View File

@@ -96,6 +96,35 @@ class PreprocessingConfig(BaseModel):
        default=False,
        description="Enable binarization (aggressive, for very low contrast). Not recommended for most documents."
    )
+    remove_scan_artifacts: bool = Field(
+        default=True,
+        description="Remove horizontal scan line artifacts. Recommended for scanned documents to prevent misdetection of scanner light bar lines as table borders."
+    )
+
+
+class TableDetectionConfig(BaseModel):
+    """Table detection configuration for PP-StructureV3.
+
+    Controls which table detection modes to enable. PP-StructureV3 uses specialized
+    models for different table types:
+    - Wired (bordered): tables with visible cell borders/grid lines
+    - Wireless (borderless): tables without visible borders, relying on alignment
+    - Region detection: detects table-like regions for better cell structure
+
+    Multiple options can be enabled simultaneously for comprehensive detection.
+    """
+    enable_wired_table: bool = Field(
+        default=True,
+        description="Enable wired (bordered) table detection. Best for tables with visible grid lines."
+    )
+    enable_wireless_table: bool = Field(
+        default=True,
+        description="Enable wireless (borderless) table detection. Best for tables without visible borders."
+    )
+    enable_region_detection: bool = Field(
+        default=True,
+        description="Enable region detection for better table structure inference."
+    )
+

class ImageQualityMetrics(BaseModel):
@@ -294,6 +323,12 @@ class ProcessingOptions(BaseModel):
description="Manual preprocessing config (only used when preprocessing_mode='manual')" description="Manual preprocessing config (only used when preprocessing_mode='manual')"
) )
# Table detection configuration (OCR track only)
table_detection: Optional[TableDetectionConfig] = Field(
None,
description="Table detection config. If None, all table detection modes are enabled."
)
class AnalyzeRequest(BaseModel): class AnalyzeRequest(BaseModel):
"""Document analysis request""" """Document analysis request"""

View File

@@ -0,0 +1,362 @@
"""
CV-based Table Line Detection Module
Uses OpenCV morphological operations to detect table lines and extract cell boundaries.
This is more reliable for wired/bordered tables than ML-based cell detection.
"""
import cv2
import numpy as np
from typing import List, Tuple, Optional
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class CVTableDetector:
"""
Detects table cell boundaries using computer vision techniques.
Works by detecting horizontal and vertical lines in the image.
"""
def __init__(
self,
min_line_length: int = 30,
line_thickness: int = 2,
min_cell_width: int = 20,
min_cell_height: int = 15
):
"""
Initialize the CV table detector.
Args:
min_line_length: Minimum length of lines to detect (in pixels)
line_thickness: Expected thickness of table lines
min_cell_width: Minimum width of a valid cell
min_cell_height: Minimum height of a valid cell
"""
self.min_line_length = min_line_length
self.line_thickness = line_thickness
self.min_cell_width = min_cell_width
self.min_cell_height = min_cell_height
def detect_cells(
self,
image: np.ndarray,
table_bbox: Optional[List[float]] = None
) -> List[List[float]]:
"""
Detect cell boundaries in a table image.
Args:
image: Input image (BGR format)
table_bbox: Optional [x1, y1, x2, y2] to crop table region first
Returns:
List of cell bounding boxes [[x1, y1, x2, y2], ...]
"""
# Crop to table region if bbox provided
offset_x, offset_y = 0, 0
if table_bbox:
x1, y1, x2, y2 = [int(v) for v in table_bbox]
offset_x, offset_y = x1, y1
image = image[y1:y2, x1:x2]
if image.size == 0:
logger.warning("Empty image after cropping")
return []
# Convert to grayscale
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Detect lines
horizontal_lines, vertical_lines = self._detect_lines(gray)
if horizontal_lines is None or vertical_lines is None:
logger.warning("Failed to detect table lines")
return []
# Find intersections to build grid
cells = self._build_cell_grid(horizontal_lines, vertical_lines, gray.shape)
# Convert to absolute coordinates
absolute_cells = []
for cell in cells:
abs_cell = [
cell[0] + offset_x,
cell[1] + offset_y,
cell[2] + offset_x,
cell[3] + offset_y
]
absolute_cells.append(abs_cell)
logger.info(f"[CV] Detected {len(absolute_cells)} cells from table lines")
return absolute_cells
def _detect_lines(
self,
gray: np.ndarray
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
"""
Detect horizontal and vertical lines using morphological operations.
Args:
gray: Grayscale image
Returns:
Tuple of (horizontal_lines_mask, vertical_lines_mask)
"""
# Adaptive threshold for better line detection
binary = cv2.adaptiveThreshold(
gray, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV,
11, 2
)
# Detect horizontal lines
h_kernel_length = max(self.min_line_length, gray.shape[1] // 30)
horizontal_kernel = cv2.getStructuringElement(
cv2.MORPH_RECT, (h_kernel_length, 1)
)
horizontal_lines = cv2.morphologyEx(
binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
)
# Detect vertical lines
v_kernel_length = max(self.min_line_length, gray.shape[0] // 30)
vertical_kernel = cv2.getStructuringElement(
cv2.MORPH_RECT, (1, v_kernel_length)
)
vertical_lines = cv2.morphologyEx(
binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
)
return horizontal_lines, vertical_lines
def _build_cell_grid(
self,
horizontal_mask: np.ndarray,
vertical_mask: np.ndarray,
image_shape: Tuple[int, int]
) -> List[List[float]]:
"""
Build cell grid from detected line masks.
Args:
horizontal_mask: Binary mask of horizontal lines
vertical_mask: Binary mask of vertical lines
image_shape: (height, width) of the image
Returns:
List of cell bounding boxes
"""
height, width = image_shape[:2]
# Combine masks to find table structure
table_mask = cv2.add(horizontal_mask, vertical_mask)
# Find contours (cells are enclosed regions)
contours, hierarchy = cv2.findContours(
table_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)
# Method 1: Use contours to find cells
cells_from_contours = self._cells_from_contours(contours, hierarchy)
# Method 2: Use line intersections to build grid
cells_from_grid = self._cells_from_line_intersections(
horizontal_mask, vertical_mask, height, width
)
# Use whichever method found more valid cells
if len(cells_from_grid) >= len(cells_from_contours):
return cells_from_grid
return cells_from_contours
def _cells_from_contours(
self,
contours,
hierarchy
) -> List[List[float]]:
"""Extract cell bounding boxes from contours."""
cells = []
for i, contour in enumerate(contours):
x, y, w, h = cv2.boundingRect(contour)
# Filter by minimum size
if w >= self.min_cell_width and h >= self.min_cell_height:
# Check if this is an inner contour (cell) not the outer table
if hierarchy is not None and hierarchy[0][i][3] != -1:
cells.append([float(x), float(y), float(x + w), float(y + h)])
return cells
def _cells_from_line_intersections(
self,
horizontal_mask: np.ndarray,
vertical_mask: np.ndarray,
height: int,
width: int
) -> List[List[float]]:
"""Build cells from line intersections (grid-based approach)."""
# Find horizontal line y-coordinates
h_projection = np.sum(horizontal_mask, axis=1)
h_lines = self._find_line_positions(h_projection, min_gap=self.min_cell_height)
# Find vertical line x-coordinates
v_projection = np.sum(vertical_mask, axis=0)
v_lines = self._find_line_positions(v_projection, min_gap=self.min_cell_width)
if len(h_lines) < 2 or len(v_lines) < 2:
logger.debug(f"Insufficient lines: {len(h_lines)} horizontal, {len(v_lines)} vertical")
return []
# Build cells from grid
cells = []
for i in range(len(h_lines) - 1):
for j in range(len(v_lines) - 1):
y1, y2 = h_lines[i], h_lines[i + 1]
x1, x2 = v_lines[j], v_lines[j + 1]
# Validate cell size
if (x2 - x1) >= self.min_cell_width and (y2 - y1) >= self.min_cell_height:
cells.append([float(x1), float(y1), float(x2), float(y2)])
return cells
def _find_line_positions(
self,
projection: np.ndarray,
min_gap: int
) -> List[int]:
"""
Find line positions from projection profile.
Args:
projection: 1D array of pixel sums
min_gap: Minimum gap between lines
Returns:
List of line positions
"""
# Threshold to find peaks (lines)
threshold = np.max(projection) * 0.3
peaks = projection > threshold
# Find transitions (line positions)
positions = []
in_peak = False
peak_start = 0
for i, is_peak in enumerate(peaks):
if is_peak and not in_peak:
peak_start = i
in_peak = True
elif not is_peak and in_peak:
# End of peak - use center
peak_center = (peak_start + i) // 2
if not positions or (peak_center - positions[-1]) >= min_gap:
positions.append(peak_center)
in_peak = False
return positions
def detect_and_merge_with_ml(
self,
image: np.ndarray,
table_bbox: List[float],
ml_cell_boxes: List[List[float]]
) -> List[List[float]]:
"""
Detect cells using CV and merge/validate with ML-detected boxes.
CV detection is used as the primary source for wired tables,
with ML boxes used to fill gaps or validate.
Args:
image: Input image
table_bbox: Table bounding box [x1, y1, x2, y2]
ml_cell_boxes: Cell boxes from ML model (RT-DETR-L)
Returns:
Merged/validated cell boxes
"""
cv_cells = self.detect_cells(image, table_bbox)
if not cv_cells:
# CV detection failed, fall back to ML
logger.info("[CV] No cells detected by CV, using ML cells")
return ml_cell_boxes
if not ml_cell_boxes:
# Only CV cells available
return cv_cells
# Validate: CV should find structured grid
# If CV found significantly fewer cells, there might be merged cells
cv_count = len(cv_cells)
ml_count = len(ml_cell_boxes)
logger.info(f"[CV] CV detected {cv_count} cells, ML detected {ml_count} cells")
# For wired tables, prefer CV detection (cleaner grid)
if cv_count >= ml_count * 0.5:
# CV found reasonable number of cells
return cv_cells
else:
# CV might have missed cells (possibly due to merged cells)
# Try to use ML boxes that don't overlap with CV cells
merged = list(cv_cells)
for ml_box in ml_cell_boxes:
if not self._has_significant_overlap(ml_box, cv_cells):
merged.append(ml_box)
return merged
def _has_significant_overlap(
self,
box: List[float],
boxes: List[List[float]],
threshold: float = 0.5
) -> bool:
"""Check if box significantly overlaps with any box in the list."""
for other in boxes:
iou = self._calculate_iou(box, other)
if iou > threshold:
return True
return False
def _calculate_iou(
self,
box1: List[float],
box2: List[float]
) -> float:
"""Calculate Intersection over Union of two boxes."""
x1 = max(box1[0], box2[0])
y1 = max(box1[1], box2[1])
x2 = min(box1[2], box2[2])
y2 = min(box1[3], box2[3])
if x2 <= x1 or y2 <= y1:
return 0.0
intersection = (x2 - x1) * (y2 - y1)
area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
union = area1 + area2 - intersection
return intersection / union if union > 0 else 0.0
def load_image(image_path: str) -> Optional[np.ndarray]:
"""Load image from path."""
path = Path(image_path)
if not path.exists():
logger.error(f"Image not found: {image_path}")
return None
return cv2.imread(str(path))
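Although the module is currently disabled in the pipeline, it can be exercised standalone (a sketch using only the API defined above; the image path and table bbox are hypothetical):

    detector = CVTableDetector(min_line_length=30, min_cell_width=20, min_cell_height=15)
    img = load_image("sample_page.png")  # hypothetical test image
    if img is not None:
        # Crop to a known table region and extract the cell grid
        cells = detector.detect_cells(img, table_bbox=[100.0, 200.0, 900.0, 700.0])
        print(f"{len(cells)} cells: {cells[:3]}")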

View File

@@ -212,7 +212,8 @@ class GapFillingService:
    def _is_region_covered(
        self,
        region: TextRegion,
-        pp_structure_elements: List[DocumentElement]
+        pp_structure_elements: List[DocumentElement],
+        skip_table_coverage: bool = True
    ) -> bool:
        """
        Check if a raw OCR region is covered by any PP-StructureV3 element.
@@ -220,6 +221,9 @@ class GapFillingService:
        Args:
            region: Raw OCR text region
            pp_structure_elements: List of PP-StructureV3 elements
+            skip_table_coverage: If True, don't consider TABLE elements as covering
+                                 (allows raw OCR text inside tables to pass through
+                                 for layered rendering)

        Returns:
            True if the region is covered
@@ -228,6 +232,12 @@ class GapFillingService:
        region_bbox = region.normalized_bbox

        for element in pp_structure_elements:
+            # Skip TABLE elements when checking coverage.
+            # This allows raw OCR text inside tables to be preserved;
+            # the PDF generator will render table borders plus raw text positions.
+            if skip_table_coverage and element.type == ElementType.TABLE:
+                continue
+
            elem_bbox = (
                element.bbox.x0, element.bbox.y0,
                element.bbox.x1, element.bbox.y1
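The effect of the new flag, in miniature (a hedged illustration; `service`, `region`, and `table_element` are hypothetical, and the overlap test itself lives in the unchanged remainder of the method):

    # A raw OCR region that sits inside a detected TABLE element:
    service._is_region_covered(region, [table_element])
    # -> False: the TABLE element is skipped, so the text passes through to the PDF text layer.

    service._is_region_covered(region, [table_element], skip_table_coverage=False)
    # -> True (pre-change behaviour): the table bbox swallows the text region.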

View File

@@ -184,6 +184,99 @@ class LayoutPreprocessingService:
        return normalized

+    def remove_scan_artifacts(
+        self,
+        image: np.ndarray,
+        line_thickness: int = 5,
+        min_line_length_ratio: float = 0.3,
+        faint_threshold: int = 30
+    ) -> np.ndarray:
+        """
+        Remove horizontal scan line artifacts from scanned documents.
+
+        Scanner light bar artifacts appear as FAINT horizontal lines across the image.
+        Key distinction from table borders:
+        - Scan artifacts are LIGHT/FAINT (close to the background color)
+        - Table borders are DARK/BOLD (high contrast)
+
+        Method:
+        1. Detect horizontal edges using a Sobel filter
+        2. Filter to keep only FAINT edges (low contrast)
+        3. Find continuous horizontal segments
+        4. Remove only faint horizontal lines while preserving bold table borders
+
+        Args:
+            image: Input image (BGR)
+            line_thickness: Maximum thickness of lines to remove (pixels)
+            min_line_length_ratio: Minimum line length as ratio of image width (0.0-1.0)
+            faint_threshold: Maximum edge strength for "faint" lines (0-255)
+
+        Returns:
+            Image with scan artifacts removed (BGR)
+        """
+        h, w = image.shape[:2]
+        min_line_length = int(w * min_line_length_ratio)
+
+        # Convert to grayscale for detection
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # Step 1: Detect horizontal edges using Sobel (vertical gradient).
+        # Scan artifacts have weak gradients; table borders have strong gradients.
+        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+        sobel_abs = np.abs(sobel_y).astype(np.uint8)
+
+        # Step 2: Keep only FAINT horizontal edges (low gradient magnitude).
+        # Strong edges (table borders) have high Sobel values;
+        # faint edges (scan artifacts) have low Sobel values.
+        faint_edges = (sobel_abs > 5) & (sobel_abs < faint_threshold)
+        faint_edges = faint_edges.astype(np.uint8) * 255
+
+        # Step 3: Use horizontal morphological operations to find continuous lines
+        horizontal_kernel = cv2.getStructuringElement(
+            cv2.MORPH_RECT,
+            (min_line_length, 1)
+        )
+        # Opening removes short segments, keeping only long horizontal lines
+        horizontal_lines = cv2.morphologyEx(
+            faint_edges, cv2.MORPH_OPEN, horizontal_kernel, iterations=1
+        )
+
+        # Dilate slightly to cover the full artifact width
+        dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, line_thickness))
+        line_mask = cv2.dilate(horizontal_lines, dilate_kernel, iterations=1)
+
+        # Check whether any artifacts were detected
+        artifact_pixels = np.sum(line_mask > 0)
+        if artifact_pixels < 100:
+            logger.debug("No faint scan artifacts detected")
+            return image
+
+        # Calculate artifact coverage
+        total_pixels = h * w
+        coverage_ratio = artifact_pixels / total_pixels
+
+        # Faint artifacts should cover a small portion of the image
+        if coverage_ratio > 0.05:  # More than 5% is suspicious
+            logger.debug(f"Faint artifact detection: coverage={coverage_ratio:.2%} (processing anyway)")
+
+        # Only process if coverage is not excessive
+        if coverage_ratio > 0.15:  # More than 15% is definitely too much
+            logger.debug(f"Artifact detection rejected: coverage too high ({coverage_ratio:.2%})")
+            return image
+
+        # Use inpainting to remove artifacts
+        result = cv2.inpaint(image, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
+
+        logger.info(
+            f"Scan artifacts removed: {artifact_pixels} pixels ({coverage_ratio:.2%}), faint_threshold={faint_threshold}"
+        )
+        return result
+
    def scale_for_layout_detection(
        self,
        image: np.ndarray,
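Called in isolation, the cleanup step looks like this (a sketch; the accessor name get_layout_preprocessing_service is taken from the import shown later in this commit, and the file names are hypothetical):

    import cv2

    svc = get_layout_preprocessing_service()
    image = cv2.imread("scanned_page.png")  # hypothetical scan with a faint light-bar line
    cleaned = svc.remove_scan_artifacts(image, faint_threshold=30)
    cv2.imwrite("scanned_page_clean.png", cleaned)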
@@ -346,9 +439,13 @@ class LayoutPreprocessingService:
        # Only enable for extremely low contrast (< 15) which indicates a scan quality issue
        binarize = False  # Disabled by default

+        # Scan artifact removal is always enabled in auto mode for scanned documents
+        remove_scan_artifacts = True
+
        logger.debug(
            f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
-            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
+            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}, "
+            f"remove_scan_artifacts={remove_scan_artifacts}"
        )

        return PreprocessingConfig(
@@ -356,7 +453,8 @@ class LayoutPreprocessingService:
            contrast_strength=round(contrast_strength, 2),
            sharpen=sharpen,
            sharpen_strength=round(sharpen_strength, 2),
-            binarize=binarize
+            binarize=binarize,
+            remove_scan_artifacts=remove_scan_artifacts
        )

    def apply_contrast_enhancement(
@@ -550,7 +648,8 @@ class LayoutPreprocessingService:
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
-                    binarize=False
+                    binarize=False,
+                    remove_scan_artifacts=False
                ),
                quality_metrics=metrics,
                was_processed=scaling_info.was_scaled,  # True if scaling was applied
@@ -568,6 +667,13 @@ class LayoutPreprocessingService:
        processed = scaled_image.copy()
        was_processed = scaling_info.was_scaled  # Start with True if already scaled

+        # Step 0: Remove scan artifacts BEFORE any enhancement.
+        # This prevents scanner light bar lines from being enhanced and misdetected as table borders.
+        if getattr(config, 'remove_scan_artifacts', True):  # Default True for backwards compatibility
+            processed = self.remove_scan_artifacts(processed)
+            was_processed = True
+            logger.debug("Applied scan artifact removal")
+
        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
            processed = self.apply_contrast_enhancement(

View File

@@ -30,7 +30,7 @@ from app.services.layout_preprocessing_service import (
    get_layout_preprocessing_service,
    LayoutPreprocessingService,
)
-from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
+from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig, TableDetectionConfig

# Import dual-track components
try:
@@ -454,7 +454,11 @@ class OCRService:
        return self.ocr_engines[lang]

-    def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
+    def _ensure_structure_engine(
+        self,
+        layout_model: Optional[str] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
+    ) -> PPStructureV3:
        """
        Get or create PP-Structure engine for layout analysis with GPU support.
        Supports layout model selection for different document types.
@@ -465,6 +469,10 @@ class OCRService:
- "default": PubLayNet-based (best for English documents) - "default": PubLayNet-based (best for English documents)
- "cdla": CDLA model (alternative for Chinese layout) - "cdla": CDLA model (alternative for Chinese layout)
- None: Use config default - None: Use config default
table_detection_config: Table detection configuration
- enable_wired_table: Enable bordered table detection
- enable_wireless_table: Enable borderless table detection
- enable_region_detection: Enable region detection
Returns: Returns:
PPStructure engine instance PPStructure engine instance
@@ -492,6 +500,19 @@ class OCRService:
logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine") logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
self.structure_engine = None # Force recreation self.structure_engine = None # Force recreation
# Check if we need to recreate the engine due to different table detection config
current_table_config = getattr(self, '_current_table_detection_config', None)
if self.structure_engine is not None and table_detection_config:
# Compare table detection settings
new_config_tuple = (
table_detection_config.enable_wired_table,
table_detection_config.enable_wireless_table,
table_detection_config.enable_region_detection
)
if current_table_config != new_config_tuple:
logger.info(f"Table detection config changed from {current_table_config} to {new_config_tuple}, recreating engine")
self.structure_engine = None # Force recreation
# Use cached engine or create new one # Use cached engine or create new one
if self.structure_engine is None: if self.structure_engine is None:
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})") logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
@@ -504,6 +525,15 @@ class OCRService:
            use_table = settings.enable_table_recognition
            use_seal = settings.enable_seal_recognition
            use_region = settings.enable_region_detection

+            # Apply table detection config overrides if provided
+            if table_detection_config:
+                # If both wired and wireless are disabled, disable table recognition entirely
+                if not table_detection_config.enable_wired_table and not table_detection_config.enable_wireless_table:
+                    use_table = False
+                use_region = table_detection_config.enable_region_detection
+                logger.info(f"Table detection config applied: wired={table_detection_config.enable_wired_table}, "
+                            f"wireless={table_detection_config.enable_wireless_table}, region={use_region}")
+
            layout_threshold = settings.layout_detection_threshold
            layout_nms = settings.layout_nms_threshold
            layout_merge = settings.layout_merge_mode
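The resolved flags presumably feed the engine constructor roughly as follows (a hedged sketch: the exact PPStructureV3 keyword names are assumptions, and the real call passes many more kwargs than shown):

    engine = PPStructureV3(
        use_table_recognition=use_table,   # False when both wired and wireless are disabled
        use_region_detection=use_region,   # overridden by table_detection_config when provided
        layout_threshold=layout_threshold,
        layout_merge_bboxes_mode=layout_merge,
    )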
@@ -538,6 +568,17 @@ class OCRService:
            formula_model = settings.formula_recognition_model_name
            chart_model = settings.chart_recognition_model_name

+            # Apply table detection config overrides for individual table types
+            if table_detection_config:
+                if not table_detection_config.enable_wired_table:
+                    wired_table_model = None
+                    wired_cell_det_model = None
+                    logger.info("Wired table detection disabled by config")
+                if not table_detection_config.enable_wireless_table:
+                    wireless_table_model = None
+                    wireless_cell_det_model = None
+                    logger.info("Wireless table detection disabled by config")
+
            # Text detection/recognition model configuration
            text_det_model = settings.text_detection_model_name
            text_rec_model = settings.text_recognition_model_name
@@ -641,6 +682,15 @@ class OCRService:
            # Track model loading for cache management
            self._model_last_used['structure'] = datetime.now()
            self._current_layout_model = layout_model  # Track current model for recreation check

+            # Track table detection config for recreation check
+            if table_detection_config:
+                self._current_table_detection_config = (
+                    table_detection_config.enable_wired_table,
+                    table_detection_config.enable_wireless_table,
+                    table_detection_config.enable_region_detection
+                )
+            else:
+                self._current_table_detection_config = None
+
            logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
@@ -712,6 +762,15 @@ class OCRService:
                self.structure_engine = PPStructureV3(**cpu_kwargs)
                self._current_layout_model = layout_model  # Track current model for recreation check

+                # Track table detection config for recreation check
+                if table_detection_config:
+                    self._current_table_detection_config = (
+                        table_detection_config.enable_wired_table,
+                        table_detection_config.enable_wireless_table,
+                        table_detection_config.enable_region_detection
+                    )
+                else:
+                    self._current_table_detection_config = None
+
                logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
            else:
                raise
@@ -956,7 +1015,8 @@ class OCRService:
        current_page: int = 0,
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
    ) -> Dict:
        """
        Process single image with OCR and layout analysis
@@ -971,6 +1031,7 @@ class OCRService:
            layout_model: Layout detection model ('chinese', 'default', 'cdla')
            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

        Returns:
            Dictionary with OCR results and metadata
@@ -1041,7 +1102,8 @@ class OCRService:
                    current_page=page_num - 1,  # Convert to 0-based page number for layout data
                    layout_model=layout_model,
                    preprocessing_mode=preprocessing_mode,
-                    preprocessing_config=preprocessing_config
+                    preprocessing_config=preprocessing_config,
+                    table_detection_config=table_detection_config
                )

                # Accumulate results
@@ -1189,7 +1251,8 @@ class OCRService:
                current_page=current_page,
                layout_model=layout_model,
                preprocessing_mode=preprocessing_mode,
-                preprocessing_config=preprocessing_config
+                preprocessing_config=preprocessing_config,
+                table_detection_config=table_detection_config
            )

            # Generate Markdown
@@ -1347,7 +1410,8 @@ class OCRService:
        current_page: int = 0,
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
    ) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1359,6 +1423,7 @@ class OCRService:
            layout_model: Layout detection model ('chinese', 'default', 'cdla')
            preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

        Returns:
            Tuple of (layout_data, images_metadata)
@@ -1376,7 +1441,7 @@ class OCRService:
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}" f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
) )
structure_engine = self._ensure_structure_engine(layout_model) structure_engine = self._ensure_structure_engine(layout_model, table_detection_config)
# Apply image preprocessing for layout detection # Apply image preprocessing for layout detection
# Preprocessing includes: # Preprocessing includes:
@@ -1432,10 +1497,19 @@ class OCRService:
        # Get scaling info for bbox coordinate restoration
        scaling_info = preprocessing_result.scaling_info if preprocessing_result else None

+        # CV table detection is disabled due to poor performance on complex tables.
+        # Issues: 1) detected boundaries are smaller than the cell content;
+        #         2) merged cells are incorrectly split.
+        # The ML-based RT-DETR-L detection is currently more reliable.
+        # TODO: Improve the CV algorithm with better line detection and grid alignment
+        use_cv_table_detection = False
+
        result = enhanced_processor.analyze_with_full_structure(
            image_path, output_dir, current_page,
            preprocessed_image=preprocessed_image,
-            scaling_info=scaling_info
+            scaling_info=scaling_info,
+            save_visualization=True,  # Save layout detection visualization images
+            use_cv_table_detection=use_cv_table_detection
        )

        if result.get('has_parsing_res_list'):
@@ -1673,7 +1747,8 @@ class OCRService:
        force_track: Optional[str] = None,
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
    ) -> Union[UnifiedDocument, Dict]:
        """
        Process document using dual-track approach.
@@ -1688,6 +1763,7 @@ class OCRService:
            layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

        Returns:
            UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1696,7 +1772,7 @@ class OCRService:
            # Fallback to traditional OCR processing
            return self.process_file_traditional(
                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
            )

        start_time = datetime.now()
@@ -1770,7 +1846,8 @@ class OCRService:
                confidence_threshold=confidence_threshold,
                output_dir=output_dir, layout_model=layout_model,
                preprocessing_mode=preprocessing_mode,
-                preprocessing_config=preprocessing_config
+                preprocessing_config=preprocessing_config,
+                table_detection_config=table_detection_config
            )

            # Convert OCR result to extract images
@@ -1804,7 +1881,7 @@ class OCRService:
logger.info("Using OCR track (PaddleOCR)") logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional( ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model, file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
preprocessing_mode, preprocessing_config preprocessing_mode, preprocessing_config, table_detection_config
) )
# Convert OCR result to UnifiedDocument using the converter # Convert OCR result to UnifiedDocument using the converter
@@ -1835,7 +1912,7 @@ class OCRService:
            # Fallback to traditional OCR
            return self.process_file_traditional(
                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
            )

    def _merge_ocr_images_into_direct(
@@ -1916,7 +1993,8 @@ class OCRService:
        output_dir: Optional[Path] = None,
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
    ) -> Dict:
        """
        Traditional OCR processing (legacy method).
@@ -1930,6 +2008,7 @@ class OCRService:
            layout_model: Layout detection model ('chinese', 'default', 'cdla')
            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

        Returns:
            Dictionary with OCR results in legacy format
@@ -1943,7 +2022,7 @@ class OCRService:
            for i, image_path in enumerate(image_paths):
                result = self.process_image(
                    image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
-                    preprocessing_mode, preprocessing_config
+                    preprocessing_mode, preprocessing_config, table_detection_config
                )
                all_results.append(result)
@@ -1960,7 +2039,7 @@ class OCRService:
            # Single image or other file
            return self.process_image(
                file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
            )

    def _combine_results(self, results: List[Dict]) -> Dict:
@@ -2047,7 +2126,8 @@ class OCRService:
        force_track: Optional[str] = None,
        layout_model: Optional[str] = None,
        preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
    ) -> Union[UnifiedDocument, Dict]:
        """
        Main processing method with dual-track support.
@@ -2063,6 +2143,7 @@ class OCRService:
            layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
            preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
            preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

        Returns:
            UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -2075,13 +2156,13 @@ class OCRService:
            # Use dual-track processing (or forced track)
            return self.process_with_dual_track(
                file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
            )
        else:
            # Use traditional OCR processing (no force_track support)
            return self.process_file_traditional(
                file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
            )

    def process_legacy(

View File

@@ -590,8 +590,17 @@ class OCRToUnifiedConverter:
            # Prepare content based on element type
            if element_type == ElementType.TABLE:
                # For tables, use TableData as content
+                # Pass cell_boxes for accurate cell positioning
                table_data = self._extract_table_data(elem_data)
                content = table_data if table_data else elem_data.get('content', '')
+
+                # Preserve cell_boxes and embedded_images in metadata for PDF generation.
+                # These are extracted by PP-StructureV3 and provide accurate cell positioning.
+                if 'cell_boxes' in elem_data:
+                    elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
+                    elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
+                if 'embedded_images' in elem_data:
+                    elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
            elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
                # For images, use metadata dict as content
                content = {
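After conversion, a table element therefore carries roughly this metadata shape (an illustration with made-up coordinate values; only the keys set above are assumed):

    element.metadata == {
        "cell_boxes": [[10.0, 12.0, 110.0, 40.0], ...],  # one [x1, y1, x2, y2] per cell
        "cell_boxes_source": "table_res_list",
        "embedded_images": [...],                        # only present when detected
    }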

View File

@@ -447,7 +447,8 @@ class PDFGeneratorService:
                    'text': text_content,
                    'bbox': bbox_polygon,
                    'confidence': element.confidence or 1.0,
-                    'page': page_num
+                    'page': page_num,
+                    'element_type': element.type.value  # Include element type for styling
                }

            # Include style information if available (for Direct track)
@@ -466,13 +467,24 @@ class PDFGeneratorService:
            else:
                html_content = str(element.content)

-            layout_elements.append({
+            table_element = {
                'type': 'table',
                'content': html_content,
                'bbox': [element.bbox.x0, element.bbox.y0,
                         element.bbox.x1, element.bbox.y1],
                'page': page_num - 1  # layout uses 0-based
-            })
+            }
+
+            # Preserve cell_boxes and embedded_images from metadata.
+            # These are extracted by PP-StructureV3 and used for accurate table rendering.
+            if element.metadata:
+                if 'cell_boxes' in element.metadata:
+                    table_element['cell_boxes'] = element.metadata['cell_boxes']
+                    table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
+                if 'embedded_images' in element.metadata:
+                    table_element['embedded_images'] = element.metadata['embedded_images']
+
+            layout_elements.append(table_element)

            # Add bbox to images_metadata for text overlap filtering
            # (no actual image file, just bbox for filtering)
@@ -484,10 +496,10 @@ class PDFGeneratorService:
                    'element_id': element.element_id
                })

-            # Handle image/visual elements
+            # Handle image/visual elements (including stamps/seals)
            elif element.is_visual or element.type in [
                ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
-                ElementType.DIAGRAM, ElementType.LOGO
+                ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
            ]:
                # Get image path using fallback logic
                image_path = self._get_image_path(element)
@@ -729,13 +741,13 @@ class PDFGeneratorService:
                regions_to_avoid.append(element)  # Tables are exclusion regions
            elif element.is_visual or element.type in [
                ElementType.IMAGE, ElementType.FIGURE,
-                ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
+                ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
            ]:
                image_elements.append(element)
                # Only add real images to exclusion regions, NOT charts/diagrams.
                # Charts often have large bounding boxes that include text labels
                # which should be rendered as selectable text on top.
-                if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
+                if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
                    regions_to_avoid.append(element)
            elif element.type == ElementType.LIST_ITEM:
                list_elements.append(element)
@@ -934,11 +946,14 @@ class PDFGeneratorService:
        # Create PDF canvas with initial page size (will be updated per page)
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

-        # Filter text regions to avoid overlap with tables/images
-        regions_to_avoid = images_metadata
+        # LAYERED RENDERING: exclude tables from regions_to_avoid.
+        # Text inside tables will be rendered at raw OCR positions (via GapFillingService)
+        # while table borders are drawn separately using cell_boxes.
+        # Only avoid overlap with actual images/figures/charts.
+        regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
        table_count = len([img for img in images_metadata if img.get('type') == 'table'])
-        logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 ({table_count} 個表格)")
+        logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")

        filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
@@ -1042,7 +1057,8 @@ class PDFGeneratorService:
            for table_elem in page_table_regions:
                self.draw_table_region(
                    pdf_canvas, table_elem, images_metadata,
-                    current_target_h, current_scale_w, current_scale_h
+                    current_target_h, current_scale_w, current_scale_h,
+                    result_dir=json_parent_dir
                )

            # 3. Draw text (top layer)
@@ -1542,8 +1558,8 @@ class PDFGeneratorService:
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}") logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
# Set font with track-specific styling # Set font with track-specific styling
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
style_info = region.get('style') style_info = region.get('style')
element_type = region.get('element_type', 'text')
is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or
self.current_processing_track == ProcessingTrack.HYBRID) self.current_processing_track == ProcessingTrack.HYBRID)
@@ -1555,8 +1571,24 @@ class PDFGeneratorService:
font_size = pdf_canvas._fontsize font_size = pdf_canvas._fontsize
logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}") logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
else: else:
# OCR track or no style: Use simple font selection # OCR track or no style: Use simple font selection with element-type based styling
font_name = self.font_name if self.font_registered else 'Helvetica' font_name = self.font_name if self.font_registered else 'Helvetica'
# Apply element-type specific styling (for OCR track)
if element_type == 'title':
# Titles: use larger, bold font
font_size = min(font_size * 1.3, 36) # 30% larger, max 36pt
pdf_canvas.setFont(font_name, font_size)
logger.debug(f"Applied title style: size={font_size:.1f}")
elif element_type == 'header':
# Headers: slightly larger
font_size = min(font_size * 1.15, 24) # 15% larger, max 24pt
pdf_canvas.setFont(font_name, font_size)
elif element_type == 'caption':
# Captions: slightly smaller, italic if available
font_size = max(font_size * 0.9, 6) # 10% smaller, min 6pt
pdf_canvas.setFont(font_name, font_size)
else:
pdf_canvas.setFont(font_name, font_size)
# Handle line breaks (split text by newlines)
@@ -1726,7 +1758,8 @@ class PDFGeneratorService:
images_metadata: List[Dict],
page_height: float,
scale_w: float = 1.0,
scale_h: float = 1.0,
result_dir: Optional[Path] = None
):
""" """
Draw a table region by parsing HTML and rebuilding with ReportLab Table Draw a table region by parsing HTML and rebuilding with ReportLab Table
@@ -1738,13 +1771,27 @@ class PDFGeneratorService:
page_height: Height of page
scale_w: Scale factor for X coordinates (PDF width / OCR width)
scale_h: Scale factor for Y coordinates (PDF height / OCR height)
result_dir: Directory containing result files (for embedded images)
"""
try:
html_content = table_element.get('content', '')
if not html_content:
return
# Try to use cell_boxes for direct rendering first (more accurate)
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
return # Successfully rendered with cell_boxes
logger.info("[TABLE] Falling back to ReportLab Table")
# Fallback: Parse HTML to extract table structure and use ReportLab Table
parser = HTMLTableParser()
parser.feed(html_content)
@@ -1901,14 +1948,18 @@ class PDFGeneratorService:
logger.info(f"[TABLE] Using cell_boxes col widths (scaled)") logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
else: else:
col_widths = [table_width / max_cols] * max_cols col_widths = [table_width / max_cols] * max_cols
logger.info(f"[TABLE] Using equal distribution col widths") logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each")
# Row heights are used optionally (ReportLab can auto-size) # Row heights - ALWAYS use to ensure table fits bbox properly
row_heights = None # Use computed heights from cell_boxes, or uniform distribution as fallback
if computed_row_heights: if computed_row_heights:
# Scale row_heights to PDF coordinates # Scale row_heights to PDF coordinates
row_heights = [h * scale_h for h in computed_row_heights] row_heights = [h * scale_h for h in computed_row_heights]
logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)") logger.info(f"[TABLE] Using cell_boxes row heights (scaled)")
else:
# Uniform distribution based on table bbox - ensures table fills its allocated space
row_heights = [table_height / num_rows] * num_rows
logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each")
# Create ReportLab Table
# Use smaller font to fit content with auto-wrap
@@ -1932,12 +1983,10 @@ class PDFGeneratorService:
escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
# Create table with col widths and row heights
# Always use row_heights to ensure table fits bbox properly
table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights)
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
# Apply table style
style = TableStyle([
@@ -1974,26 +2023,303 @@ class PDFGeneratorService:
scale_y = table_height / actual_height if actual_height > table_height else 1.0
scale_factor = min(scale_x, scale_y)  # Use smaller scale to fit both dimensions
# Calculate the table top position in PDF coordinates
# ReportLab uses bottom-left origin, so we need to position from TOP
pdf_y_top = page_height - ocr_y_top # Top of table in PDF coords
# Calculate the actual bottom position based on scaled height
# Table should be positioned so its TOP aligns with the bbox top
scaled_height = actual_height * scale_factor
pdf_y_bottom = pdf_y_top - scaled_height # Bottom of scaled table
logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}")
if scale_factor < 1.0: if scale_factor < 1.0:
logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)") logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
# Apply scaling transformation # Apply scaling transformation
pdf_canvas.saveState() pdf_canvas.saveState()
pdf_canvas.translate(pdf_x, pdf_y) pdf_canvas.translate(pdf_x, pdf_y_bottom)
pdf_canvas.scale(scale_factor, scale_factor) pdf_canvas.scale(scale_factor, scale_factor)
# Draw at origin since we've already translated # Draw at origin since we've already translated
table.drawOn(pdf_canvas, 0, 0) table.drawOn(pdf_canvas, 0, 0)
pdf_canvas.restoreState() pdf_canvas.restoreState()
else: else:
# Draw table at position without scaling # Draw table at position without scaling
table.drawOn(pdf_canvas, pdf_x, pdf_y) # pdf_y should be the bottom of the table
table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom)
logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows") logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows")
# Draw embedded images (images detected inside the table region)
embedded_images = table_element.get('embedded_images', [])
if embedded_images and result_dir:
logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
)
except Exception as e:
logger.warning(f"Failed to draw table region: {e}")
import traceback
traceback.print_exc()
def _draw_embedded_image(
self,
pdf_canvas: canvas.Canvas,
emb_img: Dict,
page_height: float,
result_dir: Path,
scale_w: float = 1.0,
scale_h: float = 1.0
):
"""Draw an embedded image inside a table region."""
try:
# Get image path
saved_path = emb_img.get('saved_path', '')
if not saved_path:
return
# Construct full path
image_path = result_dir / saved_path
if not image_path.exists():
image_path = result_dir / Path(saved_path).name
if not image_path.exists():
logger.warning(f"Embedded image not found: {saved_path}")
return
# Get bbox from embedded image data
bbox = emb_img.get('bbox', [])
if not bbox or len(bbox) < 4:
logger.warning(f"No bbox for embedded image: {saved_path}")
return
# Calculate position (bbox is [x0, y0, x1, y1])
x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
# Apply scaling
x0_scaled = x0 * scale_w
y0_scaled = y0 * scale_h
x1_scaled = x1 * scale_w
y1_scaled = y1 * scale_h
width = x1_scaled - x0_scaled
height = y1_scaled - y0_scaled
# Transform Y coordinate (ReportLab uses bottom-left origin)
pdf_x = x0_scaled
pdf_y = page_height - y1_scaled
# Draw the image
from reportlab.lib.utils import ImageReader
img_reader = ImageReader(str(image_path))
pdf_canvas.drawImage(
img_reader, pdf_x, pdf_y, width, height,
preserveAspectRatio=True, mask='auto'
)
logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}")
except Exception as e:
logger.warning(f"Failed to draw embedded image: {e}")
def _normalize_cell_boxes_to_grid(
self,
cell_boxes: List[List[float]],
threshold: float = 10.0
) -> List[List[float]]:
"""
Normalize cell boxes to create a proper aligned grid.
Groups nearby coordinates and snaps them to a common value,
eliminating the 2-11 pixel variations that cause skewed tables.
Args:
cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]
threshold: Maximum distance to consider coordinates as "same line"
Returns:
Normalized cell_boxes with aligned coordinates
"""
if not cell_boxes or len(cell_boxes) < 2:
return cell_boxes
# Collect all X and Y coordinates
x_coords = [] # (value, box_idx, is_x1)
y_coords = [] # (value, box_idx, is_y1)
for i, box in enumerate(cell_boxes):
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
x_coords.append((x1, i, True)) # x1 (left)
x_coords.append((x2, i, False)) # x2 (right)
y_coords.append((y1, i, True)) # y1 (top)
y_coords.append((y2, i, False)) # y2 (bottom)
def cluster_and_normalize(coords, threshold):
"""Cluster nearby coordinates and return mapping to normalized values."""
if not coords:
return {}
# Sort by value
sorted_coords = sorted(coords, key=lambda x: x[0])
# Cluster nearby values
clusters = []
current_cluster = [sorted_coords[0]]
for coord in sorted_coords[1:]:
if coord[0] - current_cluster[-1][0] <= threshold:
current_cluster.append(coord)
else:
clusters.append(current_cluster)
current_cluster = [coord]
clusters.append(current_cluster)
# Create mapping: (box_idx, is_first) -> normalized value
mapping = {}
for cluster in clusters:
# Use average of cluster as normalized value
avg_value = sum(c[0] for c in cluster) / len(cluster)
for _, box_idx, is_first in cluster:
mapping[(box_idx, is_first)] = avg_value
return mapping
x_mapping = cluster_and_normalize(x_coords, threshold)
y_mapping = cluster_and_normalize(y_coords, threshold)
# Create normalized cell boxes
normalized_boxes = []
for i, box in enumerate(cell_boxes):
x1_norm = x_mapping.get((i, True), box[0])
x2_norm = x_mapping.get((i, False), box[2])
y1_norm = y_mapping.get((i, True), box[1])
y2_norm = y_mapping.get((i, False), box[3])
normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm])
logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
return normalized_boxes
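To make the normalization concrete, here is a standalone sketch of the same snap-to-grid clustering on made-up coordinates (illustrative only, not part of the committed code):

```python
def snap(values, threshold=10.0):
    """Cluster 1-D coordinates within `threshold` and map each to its cluster mean."""
    values = sorted(values)
    clusters, current = [], [values[0]]
    for v in values[1:]:
        if v - current[-1] <= threshold:
            current.append(v)
        else:
            clusters.append(current)
            current = [v]
    clusters.append(current)
    return {v: sum(c) / len(c) for c in clusters for v in c}

# Left edges of five cells spread over two columns, with a few px of jitter:
print(snap([99.0, 100.0, 103.0, 250.0, 252.0]))
# -> 99/100/103 all map to ~100.7, and 250/252 both map to 251.0
```

Applied to both axes, this removes the 2-11 px variations so the drawn borders form straight grid lines.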
def _draw_table_with_cell_boxes(
self,
pdf_canvas: canvas.Canvas,
table_element: Dict,
page_height: float,
scale_w: float = 1.0,
scale_h: float = 1.0,
result_dir: Optional[Path] = None
):
"""
Draw table borders using cell_boxes for accurate positioning.
LAYERED RENDERING APPROACH:
- This method ONLY draws cell borders and embedded images
- Text is rendered separately using raw OCR positions (via GapFillingService)
- This decouples visual structure (borders) from content (text)
FALLBACK: If cell_boxes are incomplete, always draws the outer table
border using the table's bbox to ensure table boundaries are visible.
Args:
pdf_canvas: ReportLab canvas object
table_element: Table element dict with cell_boxes
page_height: Height of page in PDF coordinates
scale_w: Scale factor for X coordinates
scale_h: Scale factor for Y coordinates
result_dir: Directory containing result files (for embedded images)
"""
try:
cell_boxes = table_element.get('cell_boxes', [])
# Always draw outer table border first (fallback for incomplete cell_boxes)
table_bbox = table_element.get('bbox', [])
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
if not cell_boxes:
logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
# Still draw embedded images even without cell borders
embedded_images = table_element.get('embedded_images', [])
if embedded_images and result_dir:
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
)
return True # Outer border drawn successfully
# Normalize cell boxes to create aligned grid
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
# Draw each cell border
for box in cell_boxes:
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
# Apply scaling
x1_scaled = x1 * scale_w
y1_scaled = y1 * scale_h
x2_scaled = x2 * scale_w
y2_scaled = y2 * scale_h
cell_width = x2_scaled - x1_scaled
cell_height = y2_scaled - y1_scaled
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = x1_scaled
pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords
# Draw cell border only (no fill, no text)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
# Draw embedded images
embedded_images = table_element.get('embedded_images', [])
if embedded_images and result_dir:
logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
)
return True
except Exception as e:
logger.warning(f"[TABLE] Failed to draw cell borders: {e}")
import traceback
traceback.print_exc()
return False
def draw_image_region(
self,
pdf_canvas: canvas.Canvas,
@@ -2923,12 +3249,29 @@ class PDFGeneratorService:
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
# Determine number of rows and columns for cell_boxes calculation
num_rows = len(rows)
max_cols = max(len(row['cells']) for row in rows) if rows else 0
# Use original column widths from extraction if available
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
col_widths = None
if element.metadata and 'column_widths' in element.metadata:
col_widths = element.metadata['column_widths']
logger.debug(f"Using extracted column widths: {col_widths}")
elif element.metadata and 'cell_boxes' in element.metadata:
# Use cell_boxes from PP-StructureV3 for accurate column/row sizing
cell_boxes = element.metadata['cell_boxes']
cell_boxes_source = element.metadata.get('cell_boxes_source', 'unknown')
table_bbox_list = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, table_bbox_list, num_rows, max_cols
)
if computed_col_widths:
col_widths = computed_col_widths
logger.info(f"[TABLE] Computed {len(col_widths)} column widths from cell_boxes")
# NOTE: Don't use rowHeights from extraction - it causes content overlap
# The extracted row heights are based on cell boundaries, not text content height.

View File

@@ -26,9 +26,11 @@ import paddle
from paddleocr import PPStructureV3
from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
logger = logging.getLogger(__name__)
@@ -62,6 +64,7 @@ class PPStructureEnhanced:
'watermark': ElementType.WATERMARK,
'signature': ElementType.SIGNATURE,
'stamp': ElementType.STAMP,
'seal': ElementType.STAMP, # PP-StructureV3 may use 'seal' label
'logo': ElementType.LOGO,
'barcode': ElementType.BARCODE,
'qr-code': ElementType.QR_CODE,
@@ -80,183 +83,15 @@ class PPStructureEnhanced:
""" """
self.structure_engine = structure_engine self.structure_engine = structure_engine
# Lazy-loaded SLANeXt models for cell boxes extraction
# These are loaded on-demand when enable_table_cell_boxes_extraction is True
self._slanet_wired_model = None
self._slanet_wireless_model = None
self._table_cls_model = None
def _get_slanet_model(self, is_wired: bool = True):
"""
Get or create SLANeXt model for cell boxes extraction (lazy loading).
Args:
is_wired: True for wired (bordered) tables, False for wireless
Returns:
SLANeXt model instance or None if loading fails
"""
if not settings.enable_table_cell_boxes_extraction:
return None
try:
from paddlex import create_model
if is_wired:
if self._slanet_wired_model is None:
model_name = settings.wired_table_model_name or "SLANeXt_wired"
logger.info(f"Loading SLANeXt wired model: {model_name}")
self._slanet_wired_model = create_model(model_name)
return self._slanet_wired_model
else:
if self._slanet_wireless_model is None:
model_name = settings.wireless_table_model_name or "SLANeXt_wireless"
logger.info(f"Loading SLANeXt wireless model: {model_name}")
self._slanet_wireless_model = create_model(model_name)
return self._slanet_wireless_model
except Exception as e:
logger.error(f"Failed to load SLANeXt model: {e}")
return None
def _get_table_classifier(self):
"""
Get or create table classification model (lazy loading).
Returns:
Table classifier model instance or None if loading fails
"""
if not settings.enable_table_cell_boxes_extraction:
return None
try:
from paddlex import create_model
if self._table_cls_model is None:
model_name = settings.table_classification_model_name or "PP-LCNet_x1_0_table_cls"
logger.info(f"Loading table classification model: {model_name}")
self._table_cls_model = create_model(model_name)
return self._table_cls_model
except Exception as e:
logger.error(f"Failed to load table classifier: {e}")
return None
def _extract_cell_boxes_with_slanet(
self,
table_image: np.ndarray,
table_bbox: List[float],
is_wired: Optional[bool] = None
) -> Optional[List[List[float]]]:
"""
Extract cell bounding boxes using direct SLANeXt model call.
This supplements PPStructureV3 which doesn't expose cell boxes in its output.
Args:
table_image: Cropped table image as numpy array (BGR format)
table_bbox: Table bounding box in page coordinates [x1, y1, x2, y2]
is_wired: If None, auto-detect using classifier. True for bordered tables.
Returns:
List of cell bounding boxes in page coordinates [[x1,y1,x2,y2], ...],
or None if extraction fails
"""
if not settings.enable_table_cell_boxes_extraction:
return None
try:
# Auto-detect table type if not specified
if is_wired is None:
classifier = self._get_table_classifier()
if classifier:
try:
cls_result = classifier.predict(table_image)
# PP-LCNet returns classification result
for res in cls_result:
label_names = res.get('label_names', [])
if label_names:
is_wired = 'wired' in str(label_names[0]).lower()
logger.debug(f"Table classified as: {'wired' if is_wired else 'wireless'}")
break
except Exception as e:
logger.warning(f"Table classification failed, defaulting to wired: {e}")
is_wired = True
else:
is_wired = True # Default to wired if classifier unavailable
# Get appropriate SLANeXt model
model = self._get_slanet_model(is_wired=is_wired)
if model is None:
return None
# Run SLANeXt prediction
results = model.predict(table_image)
# Extract cell boxes from result
cell_boxes = []
table_x, table_y = table_bbox[0], table_bbox[1]
for result in results:
# SLANeXt returns 'bbox' with 8-point polygon format
# [[x1,y1,x2,y2,x3,y3,x4,y4], ...]
boxes = result.get('bbox', [])
for box in boxes:
if isinstance(box, (list, tuple)):
if len(box) >= 8:
# 8-point polygon: convert to 4-point rectangle
xs = [box[i] for i in range(0, 8, 2)]
ys = [box[i] for i in range(1, 8, 2)]
x1, y1 = min(xs), min(ys)
x2, y2 = max(xs), max(ys)
elif len(box) >= 4:
# Already 4-point rectangle
x1, y1, x2, y2 = box[:4]
else:
continue
# Convert to absolute page coordinates
abs_box = [
float(x1 + table_x),
float(y1 + table_y),
float(x2 + table_x),
float(y2 + table_y)
]
cell_boxes.append(abs_box)
logger.info(f"SLANeXt extracted {len(cell_boxes)} cell boxes (is_wired={is_wired})")
return cell_boxes if cell_boxes else None
except Exception as e:
logger.error(f"Cell boxes extraction with SLANeXt failed: {e}")
return None
def release_slanet_models(self):
"""Release SLANeXt models to free GPU memory."""
if self._slanet_wired_model is not None:
del self._slanet_wired_model
self._slanet_wired_model = None
logger.info("Released SLANeXt wired model")
if self._slanet_wireless_model is not None:
del self._slanet_wireless_model
self._slanet_wireless_model = None
logger.info("Released SLANeXt wireless model")
if self._table_cls_model is not None:
del self._table_cls_model
self._table_cls_model = None
logger.info("Released table classifier model")
gc.collect()
if TORCH_AVAILABLE:
torch.cuda.empty_cache()
def analyze_with_full_structure(
self,
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None,
save_visualization: bool = False,
use_cv_table_detection: bool = False
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Analyze document with full PP-StructureV3 capabilities. Analyze document with full PP-StructureV3 capabilities.
@@ -271,6 +106,10 @@ class PPStructureEnhanced:
scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
for layout detection, all bbox coordinates will be scaled back
to original image coordinates for proper cropping.
save_visualization: If True, save detection visualization images
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
use_cv_table_detection: If True, use CV-based line detection for wired tables
instead of ML-based cell detection (RT-DETR-L)
Returns:
Dictionary with complete structure information including:
@@ -278,6 +117,7 @@ class PPStructureEnhanced:
- reading_order: Reading order indices
- images: Extracted images with metadata
- tables: Extracted tables with structure
- visualization_dir: Path to visualization images (if save_visualization=True)
""" """
try: try:
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}") logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
@@ -313,9 +153,21 @@ class PPStructureEnhanced:
all_elements = []
all_images = []
all_tables = []
visualization_dir = None
# Process each page result
for page_idx, page_result in enumerate(results):
# Save visualization images if requested
if save_visualization and output_dir and hasattr(page_result, 'save_to_img'):
try:
vis_dir = output_dir / 'visualization'
vis_dir.mkdir(parents=True, exist_ok=True)
page_result.save_to_img(str(vis_dir))
visualization_dir = vis_dir
logger.info(f"Saved visualization images to {vis_dir}")
except Exception as e:
logger.warning(f"Failed to save visualization images: {e}")
# Try to access parsing_res_list and table_res_list (the complete structure)
parsing_res_list = None
table_res_list = None
@@ -369,6 +221,7 @@ class PPStructureEnhanced:
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements") logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
# Extract table_res_list which contains cell_box_list # Extract table_res_list which contains cell_box_list
layout_det_res = None
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
@@ -377,20 +230,40 @@ class PPStructureEnhanced:
if 'cell_box_list' in tbl:
logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes")
# Extract layout_det_res for Image-in-Table processing
if 'layout_det_res' in result_dict:
layout_det_res = result_dict['layout_det_res']
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list,  # Pass table_res_list for cell_box_list
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
)
all_elements.extend(elements)
# Extract tables and images from elements
table_bboxes = [] # Collect table bboxes for standalone image filtering
for elem in elements:
if elem['type'] == ElementType.TABLE:
all_tables.append(elem)
table_bboxes.append(elem.get('bbox', [0, 0, 0, 0]))
elif elem['type'] in [ElementType.IMAGE, ElementType.FIGURE]:
all_images.append(elem)
# Extract standalone images from layout_det_res (images NOT inside tables)
if layout_det_res and image_path and output_dir:
standalone_images = self._extract_standalone_images(
layout_det_res, table_bboxes, image_path, output_dir,
current_page, len(elements), scaling_info
)
if standalone_images:
all_elements.extend(standalone_images)
all_images.extend(standalone_images)
logger.info(f"Extracted {len(standalone_images)} standalone images from layout_det_res")
else:
# Fallback to markdown if parsing_res_list not available
logger.warning("parsing_res_list not found, falling back to markdown")
@@ -402,7 +275,7 @@ class PPStructureEnhanced:
# Create reading order based on element positions
reading_order = self._determine_reading_order(all_elements)
result = {
'elements': all_elements,
'total_elements': len(all_elements),
'reading_order': reading_order,
@@ -412,6 +285,12 @@ class PPStructureEnhanced:
'has_parsing_res_list': parsing_res_list is not None
}
# Add visualization directory if available
if visualization_dir:
result['visualization_dir'] = str(visualization_dir)
return result
except Exception as e:
logger.error(f"Enhanced PP-StructureV3 analysis error: {e}")
import traceback
@@ -446,7 +325,9 @@ class PPStructureEnhanced:
output_dir: Optional[Path],
source_image_path: Optional[Path] = None,
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None,
layout_det_res: Optional[Dict] = None,
use_cv_table_detection: bool = False
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
Process parsing_res_list to extract all elements. Process parsing_res_list to extract all elements.
@@ -458,6 +339,8 @@ class PPStructureEnhanced:
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
table_res_list: Optional list of table results containing cell_box_list
layout_det_res: Optional layout detection result for Image-in-Table processing
use_cv_table_detection: If True, use CV line detection for wired tables
Returns:
List of processed elements with normalized structure
@@ -628,53 +511,55 @@ class PPStructureEnhanced:
logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})") logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
cell_boxes_extracted = True cell_boxes_extracted = True
# Supplement with direct SLANeXt call if PPStructureV3 didn't provide boxes
if not cell_boxes_extracted and source_image_path and bbox != [0, 0, 0, 0]:
logger.info(f"[TABLE] No boxes from PPStructureV3, attempting SLANeXt extraction...")
try:
# Load source image and crop table region
source_img = Image.open(source_image_path)
source_array = np.array(source_img)
# Crop table region (bbox is in original image coordinates)
x1, y1, x2, y2 = [int(round(c)) for c in bbox]
# Ensure coordinates are within image bounds
h, w = source_array.shape[:2]
x1, y1 = max(0, x1), max(0, y1)
x2, y2 = min(w, x2), min(h, y2)
if x2 > x1 and y2 > y1:
table_crop = source_array[y1:y2, x1:x2]
# Convert RGB to BGR for SLANeXt
if len(table_crop.shape) == 3 and table_crop.shape[2] == 3:
table_crop_bgr = table_crop[:, :, ::-1]
else:
table_crop_bgr = table_crop
# Extract cell boxes using SLANeXt
slanet_boxes = self._extract_cell_boxes_with_slanet(
table_crop_bgr,
bbox, # Pass original bbox for coordinate offset
is_wired=None # Auto-detect
)
if slanet_boxes:
element['cell_boxes'] = slanet_boxes
element['cell_boxes_source'] = 'slanet'
cell_boxes_extracted = True
logger.info(f"[TABLE] SLANeXt extracted {len(slanet_boxes)} cell boxes")
else:
logger.warning(f"[TABLE] Invalid crop region: ({x1},{y1})-({x2},{y2})")
except Exception as e:
logger.error(f"[TABLE] SLANeXt extraction failed: {e}")
if not cell_boxes_extracted:
logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")
# 2.5 CV-based table line detection for wired tables
if use_cv_table_detection and source_image_path and source_image_path.exists():
try:
# Load image for CV processing
cv_image = cv2.imread(str(source_image_path))
if cv_image is not None:
cv_detector = CVTableDetector()
ml_cell_boxes = element.get('cell_boxes', [])
# Detect cells using CV line detection
cv_cells = cv_detector.detect_and_merge_with_ml(
cv_image,
bbox, # Table bbox
ml_cell_boxes
)
if cv_cells:
# Apply scaling if needed
if scaling_info and scaling_info.was_scaled:
cv_cells = [
[
c[0] * scaling_info.scale_x,
c[1] * scaling_info.scale_y,
c[2] * scaling_info.scale_x,
c[3] * scaling_info.scale_y
]
for c in cv_cells
]
element['cell_boxes'] = cv_cells
element['cell_boxes_source'] = 'cv_line_detection'
logger.info(f"[TABLE] CV line detection found {len(cv_cells)} cells (ML had {len(ml_cell_boxes)})")
except Exception as cv_error:
logger.warning(f"[TABLE] CV line detection failed: {cv_error}")
# 3. Image-in-Table handling: detect and embed images found inside the table
if layout_det_res and source_image_path and output_dir:
embedded_images = self._embed_images_in_table(
element, bbox, layout_det_res, source_image_path, output_dir
)
if embedded_images:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# Special handling for images/figures/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
if 'img_path' in item and output_dir:
saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])
@@ -704,6 +589,209 @@ class PPStructureEnhanced:
return elements
def _embed_images_in_table(
self,
table_element: Dict[str, Any],
table_bbox: List[float],
layout_det_res: Dict,
source_image_path: Path,
output_dir: Path
) -> List[Dict[str, Any]]:
"""
Detect and embed images that are inside a table region.
This handles the case where layout detection finds an image inside a table,
similar to how pp_demo embeds images in table HTML.
Args:
table_element: The table element being processed
table_bbox: Table bounding box [x1, y1, x2, y2]
layout_det_res: Layout detection result containing all detected boxes
source_image_path: Path to source image for cropping
output_dir: Output directory for saving cropped images
Returns:
List of embedded image info dicts with 'bbox', 'saved_path', 'html_tag'
"""
embedded_images = []
try:
boxes = layout_det_res.get('boxes', [])
table_x1, table_y1, table_x2, table_y2 = table_bbox
for box in boxes:
label = box.get('label', '').lower()
if label != 'image':
continue
# Get image bbox
img_coord = box.get('coordinate', [])
if len(img_coord) < 4:
continue
img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
# Check if image is inside table (with some tolerance)
tolerance = 5 # pixels
if (img_x1 >= table_x1 - tolerance and
img_y1 >= table_y1 - tolerance and
img_x2 <= table_x2 + tolerance and
img_y2 <= table_y2 + tolerance):
logger.info(f"[IMAGE-IN-TABLE] Found image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}] inside table")
# Crop and save the image
img_element_id = f"img_in_table_{int(img_x1)}_{int(img_y1)}_{int(img_x2)}_{int(img_y2)}"
cropped_path = self._crop_and_save_image(
source_image_path,
[img_x1, img_y1, img_x2, img_y2],
output_dir,
img_element_id
)
if cropped_path:
# Create relative path for HTML embedding
rel_path = f"imgs/{Path(cropped_path).name}"
# Create img tag similar to pp_demo
img_html = f'<div style="text-align: center;"><img src="{rel_path}" alt="Image" /></div>'
embedded_image = {
'bbox': [img_x1, img_y1, img_x2, img_y2],
'saved_path': str(cropped_path),
'relative_path': rel_path,
'html_tag': img_html,
'element_id': img_element_id
}
embedded_images.append(embedded_image)
# Try to insert image into HTML content
if 'html' in table_element and table_element['html']:
# Insert image reference at the end of HTML before </table>
original_html = table_element['html']
if '</tbody>' in original_html:
# Insert before </tbody> in a new row
new_html = original_html.replace(
'</tbody>',
f'<tr><td colspan="99" style="text-align:center;"><img src="{rel_path}" alt="Embedded Image" /></td></tr></tbody>'
)
table_element['html'] = new_html
logger.info(f"[IMAGE-IN-TABLE] Embedded image into table HTML")
except Exception as e:
logger.error(f"[IMAGE-IN-TABLE] Error processing images in table: {e}")
return embedded_images
def _extract_standalone_images(
self,
layout_det_res: Dict,
table_bboxes: List[List[float]],
source_image_path: Path,
output_dir: Path,
current_page: int,
start_index: int,
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Extract standalone images from layout_det_res that are NOT inside tables.
This handles images that PP-StructureV3 detects in layout_det_res but
doesn't include in parsing_res_list (non-table images).
Args:
layout_det_res: Layout detection result containing all detected boxes
table_bboxes: List of table bounding boxes to exclude images inside tables
source_image_path: Path to source image for cropping
output_dir: Output directory for saving cropped images
current_page: Current page number
start_index: Starting index for element IDs
scaling_info: Optional scaling info for coordinate restoration
Returns:
List of standalone image elements
"""
standalone_images = []
try:
boxes = layout_det_res.get('boxes', [])
logger.info(f"[STANDALONE-IMAGE] Checking {len(boxes)} boxes for standalone images")
for box_idx, box in enumerate(boxes):
label = box.get('label', '').lower()
if label != 'image':
continue
# Get image bbox
img_coord = box.get('coordinate', [])
if len(img_coord) < 4:
continue
img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
# Check if image is inside any table (skip if so)
is_inside_table = False
for table_bbox in table_bboxes:
if len(table_bbox) < 4:
continue
tx1, ty1, tx2, ty2 = table_bbox[:4]
tolerance = 5 # pixels
if (img_x1 >= tx1 - tolerance and
img_y1 >= ty1 - tolerance and
img_x2 <= tx2 + tolerance and
img_y2 <= ty2 + tolerance):
is_inside_table = True
logger.debug(f"[STANDALONE-IMAGE] Image at [{int(img_x1)},{int(img_y1)}] is inside table, skipping")
break
if is_inside_table:
continue
# Scale bbox back to original coordinates if needed
if scaling_info and scaling_info.was_scaled:
scale_factor = scaling_info.scale_factor
img_x1 *= scale_factor
img_y1 *= scale_factor
img_x2 *= scale_factor
img_y2 *= scale_factor
logger.debug(f"[STANDALONE-IMAGE] Scaled bbox by {scale_factor:.3f}")
logger.info(f"[STANDALONE-IMAGE] Found standalone image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}]")
# Crop and save the image
element_idx = start_index + len(standalone_images)
img_element_id = f"standalone_img_{current_page}_{element_idx}"
cropped_path = self._crop_and_save_image(
source_image_path,
[img_x1, img_y1, img_x2, img_y2],
output_dir,
img_element_id
)
if cropped_path:
element = {
'element_id': img_element_id,
'type': ElementType.IMAGE,
'original_type': 'image',
'content': '',
'page': current_page,
'bbox': [img_x1, img_y1, img_x2, img_y2],
'index': element_idx,
'confidence': box.get('score', 1.0),
'saved_path': cropped_path,
'img_path': cropped_path,
'source': 'layout_det_res'
}
standalone_images.append(element)
logger.info(f"[STANDALONE-IMAGE] Extracted and saved: {cropped_path}")
except Exception as e:
logger.error(f"[STANDALONE-IMAGE] Error extracting standalone images: {e}")
import traceback
traceback.print_exc()
return standalone_images
def _process_markdown_fallback(
self,
page_result: Any,

View File

@@ -0,0 +1,135 @@
"""
Test script for layered rendering approach.
Tests that table borders are drawn from cell_boxes
while text is rendered at raw OCR positions.
"""
import sys
sys.path.insert(0, '/home/egg/project/Tool_OCR/backend')
import json
from pathlib import Path
from app.services.pdf_generator_service import PDFGeneratorService
from app.services.gap_filling_service import GapFillingService
def test_layered_rendering():
"""Test the layered rendering approach."""
# Use existing test task
task_id = "84899366-f361-44f1-b989-5aba72419ca5"
result_dir = Path(f"/home/egg/project/Tool_OCR/backend/storage/results/{task_id}")
if not result_dir.exists():
print(f"[ERROR] Result directory not found: {result_dir}")
return False
# Load scan_result.json
scan_result_path = result_dir / "scan_result.json"
raw_ocr_path = result_dir / f"{task_id}_scan_page_1_raw_ocr_regions.json"
if not scan_result_path.exists():
print(f"[ERROR] scan_result.json not found")
return False
print(f"[INFO] Loading scan_result.json from {scan_result_path}")
with open(scan_result_path, 'r', encoding='utf-8') as f:
scan_result = json.load(f)
# Parse as UnifiedDocument using PDFGeneratorService's method
# scan_result IS the unified document (not nested under 'unified_document')
pdf_service = PDFGeneratorService()
unified_doc = pdf_service._json_to_unified_document(scan_result, result_dir)
if not unified_doc:
print(f"[ERROR] Failed to parse UnifiedDocument")
return False
print(f"[INFO] UnifiedDocument: {unified_doc.page_count} pages")
# Count elements
table_count = 0
text_count = 0
for page in unified_doc.pages:
for elem in page.elements:
if elem.type.value == 'table':
table_count += 1
# Check if cell_boxes are present (in metadata, not content)
cell_boxes = elem.metadata.get('cell_boxes', []) if elem.metadata else []
embedded_images = elem.metadata.get('embedded_images', []) if elem.metadata else []
print(f"[INFO] Table {elem.element_id}: {len(cell_boxes)} cell_boxes, {len(embedded_images)} embedded_images")
elif elem.type.value in ['text', 'paragraph', 'title']:
text_count += 1
print(f"[INFO] Tables: {table_count}, Text elements: {text_count}")
# Load raw OCR regions if available
raw_ocr_regions = []
if raw_ocr_path.exists():
print(f"[INFO] Loading raw OCR regions from {raw_ocr_path}")
with open(raw_ocr_path, 'r', encoding='utf-8') as f:
raw_ocr_data = json.load(f)
# Could be a list or dict with 'text_regions' key
if isinstance(raw_ocr_data, list):
raw_ocr_regions = raw_ocr_data
else:
raw_ocr_regions = raw_ocr_data.get('text_regions', [])
print(f"[INFO] Raw OCR regions: {len(raw_ocr_regions)}")
# Apply gap filling for each page
print(f"[INFO] Applying GapFillingService...")
gap_service = GapFillingService()
gap_filled_doc = unified_doc # Start with original
for page in unified_doc.pages:
page_num = page.page_number
page_dims = page.dimensions
# Get elements for this page
pp_elements = page.elements
# Apply gap filling
filled_elements, stats = gap_service.fill_gaps(
raw_ocr_regions=raw_ocr_regions,
pp_structure_elements=pp_elements,
page_number=page_num,
pp_dimensions=page_dims
)
# Update the page's elements
page.elements = filled_elements
print(f"[INFO] Page {page_num}: Added {stats.get('gaps_filled', 0)} gap-filled regions")
# Count elements after gap filling
final_text_count = 0
for page in gap_filled_doc.pages:
for elem in page.elements:
if elem.type.value in ['text', 'paragraph', 'title']:
final_text_count += 1
print(f"[INFO] After gap filling: {final_text_count} text elements (was {text_count})")
# Generate PDF
print(f"[INFO] Generating PDF with layered rendering...")
output_pdf = result_dir / "test_layered_rendering.pdf"
try:
success = pdf_service.generate_from_unified_document(
unified_doc=gap_filled_doc,
output_path=output_pdf
)
if success:
print(f"[SUCCESS] PDF generated: {output_pdf}")
print(f"[INFO] PDF size: {output_pdf.stat().st_size} bytes")
return True
else:
print(f"[ERROR] PDF generation returned False")
return False
except Exception as e:
print(f"[ERROR] PDF generation failed: {e}")
import traceback
traceback.print_exc()
return False
if __name__ == "__main__":
success = test_layered_rendering()
sys.exit(0 if success else 1)

View File

@@ -241,6 +241,25 @@ export default function PreprocessingSettings({
)}
</div>
{/* Scan Artifact Removal Toggle */}
<div className="space-y-2">
<label className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={config.remove_scan_artifacts}
onChange={(e) => handleConfigChange('remove_scan_artifacts', e.target.checked)}
disabled={disabled}
className="w-4 h-4 rounded border-gray-300 text-blue-600 focus:ring-blue-500"
/>
<span className="text-sm text-gray-700">
{t('processing.preprocessing.removeScanArtifacts')}
</span>
</label>
<p className="text-xs text-gray-500 pl-6">
{t('processing.preprocessing.removeScanArtifactsDesc')}
</p>
</div>
{/* Binarize Toggle - Hidden by default, shown only in advanced mode */}
<details className="pt-2">
<summary className="text-xs text-gray-500 cursor-pointer hover:text-gray-700">

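The preprocessing implementation behind this toggle is not part of this view; as a rough illustration of the technique the commit message describes (removing faint horizontal scan lines), here is a hedged OpenCV sketch - the kernel width, threshold, and inpainting choice are all assumptions, not the committed code:

```python
import cv2
import numpy as np

def remove_horizontal_lines(gray: np.ndarray) -> np.ndarray:
    """Suppress long, thin horizontal artifacts on a grayscale scan (illustrative)."""
    # A wide, 1 px tall kernel keeps only long horizontal structures
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    inverted = cv2.bitwise_not(gray)                      # dark lines become bright
    lines = cv2.morphologyEx(inverted, cv2.MORPH_OPEN, kernel)
    mask = cv2.threshold(lines, 30, 255, cv2.THRESH_BINARY)[1]
    # Fill the detected line pixels from their surroundings
    return cv2.inpaint(gray, mask, 3, cv2.INPAINT_TELEA)
```

The goal, per the locale strings further down, is to keep faint scanner-light streaks from being misread as table borders.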
View File

@@ -0,0 +1,124 @@
import { cn } from '@/lib/utils'
import { Checkbox } from '@/components/ui/checkbox'
import { Table, Grid3X3, Rows3 } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import type { TableDetectionConfig } from '@/types/apiV2'
interface TableDetectionSelectorProps {
value: TableDetectionConfig
onChange: (config: TableDetectionConfig) => void
disabled?: boolean
className?: string
}
interface DetectionOption {
key: keyof TableDetectionConfig
icon: React.ReactNode
labelKey: string
descKey: string
}
const DETECTION_OPTIONS: DetectionOption[] = [
{
key: 'enable_wired_table',
icon: <Grid3X3 className="w-5 h-5" />,
labelKey: 'processing.tableDetection.wired',
descKey: 'processing.tableDetection.wiredDesc',
},
{
key: 'enable_wireless_table',
icon: <Rows3 className="w-5 h-5" />,
labelKey: 'processing.tableDetection.wireless',
descKey: 'processing.tableDetection.wirelessDesc',
},
{
key: 'enable_region_detection',
icon: <Table className="w-5 h-5" />,
labelKey: 'processing.tableDetection.region',
descKey: 'processing.tableDetection.regionDesc',
},
]
export default function TableDetectionSelector({
value,
onChange,
disabled = false,
className,
}: TableDetectionSelectorProps) {
const { t } = useTranslation()
const handleOptionChange = (key: keyof TableDetectionConfig, checked: boolean) => {
onChange({
...value,
[key]: checked,
})
}
return (
<div className={cn('border rounded-lg p-4 bg-white', className)}>
{/* Header */}
<div className="flex items-center gap-2 mb-4">
<Table className="w-5 h-5 text-gray-600" />
<h3 className="text-lg font-semibold text-gray-900">{t('processing.tableDetection.title')}</h3>
</div>
{/* Detection Options */}
<div className="space-y-3">
{DETECTION_OPTIONS.map((option) => {
const isChecked = value[option.key]
return (
<label
key={option.key}
className={cn(
'flex items-start gap-4 p-4 rounded-lg border-2 transition-all cursor-pointer',
isChecked
? 'border-blue-500 bg-blue-50'
: 'border-gray-200 hover:border-gray-300 hover:bg-gray-50',
disabled && 'opacity-50 cursor-not-allowed'
)}
>
{/* Checkbox */}
<Checkbox
checked={isChecked}
onCheckedChange={(checked) => handleOptionChange(option.key, checked === true)}
disabled={disabled}
className="mt-0.5"
/>
{/* Icon */}
<div
className={cn(
'p-2 rounded-lg flex-shrink-0',
isChecked ? 'bg-blue-100 text-blue-600' : 'bg-gray-100 text-gray-500'
)}
>
{option.icon}
</div>
{/* Content */}
<div className="flex-1 min-w-0">
<span
className={cn(
'font-medium',
isChecked ? 'text-blue-700' : 'text-gray-900'
)}
>
{t(option.labelKey)}
</span>
<p className="text-sm text-gray-500 mt-1">{t(option.descKey)}</p>
</div>
</label>
)
})}
</div>
{/* Info Note */}
<div className="mt-4 p-3 bg-amber-50 border border-amber-200 rounded-md">
<p className="text-sm text-amber-800">
{t('processing.tableDetection.note')}
</p>
</div>
</div>
)
}

View File

@@ -64,6 +64,16 @@
"recommended": "推薦", "recommended": "推薦",
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。" "note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
}, },
"tableDetection": {
"title": "表格偵測模式",
"wired": "有框線表格",
"wiredDesc": "偵測有明顯格線邊框的表格,適用於正式表格文件",
"wireless": "無框線表格",
"wirelessDesc": "偵測無邊框的表格,透過對齊方式推斷表格結構",
"region": "區域偵測",
"regionDesc": "輔助偵測表格區域,改善複雜表格的儲存格識別",
"note": "可同時啟用多種偵測模式,系統會自動整合偵測結果。如果表格儲存格框線不正確,請嘗試調整偵測模式。"
},
"preprocessing": { "preprocessing": {
"title": "影像前處理", "title": "影像前處理",
"mode": { "mode": {
@@ -92,6 +102,8 @@
"strong": "強", "strong": "強",
"maximum": "最強" "maximum": "最強"
}, },
"removeScanArtifacts": "移除掃描瑕疵",
"removeScanArtifactsDesc": "移除掃描時光源產生的水平線痕,避免被誤判為表格框線",
"advanced": "進階選項", "advanced": "進階選項",
"binarize": "二值化處理", "binarize": "二值化處理",
"binarizeWarning": "不建議使用", "binarizeWarning": "不建議使用",

View File

@@ -12,9 +12,10 @@ import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2, Inf
import LayoutModelSelector from '@/components/LayoutModelSelector'
import PreprocessingSettings from '@/components/PreprocessingSettings'
import PreprocessingPreview from '@/components/PreprocessingPreview'
import TableDetectionSelector from '@/components/TableDetectionSelector'
import TaskNotFound from '@/components/TaskNotFound'
import { useTaskValidation } from '@/hooks/useTaskValidation'
import type { LayoutModel, ProcessingOptions, PreprocessingMode, PreprocessingConfig, TableDetectionConfig, DocumentAnalysisResponse } from '@/types/apiV2'
export default function ProcessingPage() {
const { t } = useTranslation()
@@ -44,9 +45,17 @@ export default function ProcessingPage() {
sharpen: true,
sharpen_strength: 1.0,
binarize: false,
remove_scan_artifacts: true,
})
const [showPreview, setShowPreview] = useState(false)
// Table detection state
const [tableDetectionConfig, setTableDetectionConfig] = useState<TableDetectionConfig>({
enable_wired_table: true,
enable_wireless_table: true,
enable_region_detection: true,
})
// Analyze document to determine if OCR is needed (only for pending tasks)
const { data: documentAnalysis, isLoading: isAnalyzing } = useQuery({
queryKey: ['documentAnalysis', taskId],
@@ -70,6 +79,7 @@ export default function ProcessingPage() {
layout_model: layoutModel,
preprocessing_mode: preprocessingMode,
preprocessing_config: preprocessingMode === 'manual' ? preprocessingConfig : undefined,
table_detection: tableDetectionConfig,
}
return apiClientV2.startTask(taskId!, options)
@@ -441,6 +451,13 @@ export default function ProcessingPage() {
disabled={processOCRMutation.isPending}
/>
{/* Table Detection Settings */}
<TableDetectionSelector
value={tableDetectionConfig}
onChange={setTableDetectionConfig}
disabled={processOCRMutation.isPending}
/>
{/* Preprocessing Settings */}
<PreprocessingSettings
mode={preprocessingMode}

View File

@@ -108,6 +108,20 @@ export interface PreprocessingConfig {
sharpen: boolean
sharpen_strength: number // 0.5-2.0, default 1.0
binarize: boolean
remove_scan_artifacts: boolean // Remove horizontal scan line artifacts
}
/**
* Table detection configuration for PP-StructureV3.
* Controls which table detection modes to enable.
* - enable_wired_table: Tables with visible cell borders/grid lines
* - enable_wireless_table: Tables without visible borders
* - enable_region_detection: Detect table-like regions for better cell structure
*/
export interface TableDetectionConfig {
enable_wired_table: boolean
enable_wireless_table: boolean
enable_region_detection: boolean
}

/**
@@ -147,6 +161,7 @@ export interface ProcessingOptions {
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
preprocessing_mode?: PreprocessingMode // Preprocessing mode (OCR track only)
preprocessing_config?: PreprocessingConfig // Manual preprocessing config
table_detection?: TableDetectionConfig // Table detection options (OCR track only)
}

export interface TaskCreate {

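For reference, a hedged sketch of the JSON body these options serialize to when the frontend starts a task - the field names are taken from the interfaces above, while the concrete values and the surrounding request are assumptions:

```python
# Hypothetical ProcessingOptions payload mirroring the TS interfaces above.
options = {
    "preprocessing_mode": "manual",
    "preprocessing_config": {
        "sharpen": True,
        "sharpen_strength": 1.0,
        "binarize": False,
        "remove_scan_artifacts": True,   # new in this commit
    },
    "table_detection": {                 # new in this commit
        "enable_wired_table": True,
        "enable_wireless_table": True,
        "enable_region_detection": True,
    },
}
```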
View File

@@ -1,62 +1,88 @@
# Tasks: Extract Table Cell Boxes

## Key Finding (2025-11-28)

**PPStructureV3 (PaddleX 3.3.9) does provide `table_res_list`**

The previous implementation assumed extra SLANeXt model calls were required, but deeper testing showed:
- `result.json['res']['table_res_list']` contains `cell_box_list` for every table
- No additional model calls are needed
- The redundant SLANeXt code has been removed

## Phase 1: Infrastructure (done)

### Task 1.1: Configuration
- [x] ~~Add the `enable_table_cell_boxes_extraction` setting~~ (removed, no longer needed)
- [x] Confirm PPStructureV3 provides `table_res_list`

### Task 1.2: Model caching
- [x] ~~Implement SLANeXt model caching~~ (removed, no longer needed)
- [x] Use the `table_res_list` built into PPStructureV3 directly

## Phase 2: Cell Boxes Extraction (done)

### Task 2.1: Extract from table_res_list
- [x] Read `cell_box_list` from `result.json['res']['table_res_list']`
- [x] Match tables by HTML content
- [x] Verify the coordinate format (already absolute coordinates)

### Task 2.2: Image-in-Table handling
- [x] Get image boxes from `layout_det_res`
- [x] Detect images inside tables
- [x] Crop and save the images
- [x] Embed them into the table HTML

## Phase 3: PDF Generation Optimization (done)

### Task 3.1: ~~Infer a grid from cell boxes~~ (abandoned)
- [x] ~~Modify `draw_table_region` to use cell_boxes~~
- [x] ~~Compute row heights and column widths from actual cell positions~~
- [x] Tested rendering → **problem found: the HTML structure does not match cell_boxes**

### Task 3.2: Plan B - Layered Rendering ✓ done

**Problem analysis (2025-11-30)**
- The HTML table structure does not match cell_boxes, so the grid cannot be inferred correctly
- Attempts to draw text inside cells failed (overflowing borders, wrong matches)

**Solution**: layered rendering - separate table borders from text drawing
- Layer 1: draw table borders from cell_boxes
- Layer 2: draw text at raw OCR positions (independent of table structure)
- Layer 3: draw embedded_images

**Implementation steps (2025-11-30)**
- [x] Modify `GapFillingService._is_region_covered()` - skip coverage checks for TABLE elements
- [x] Simplify `_draw_table_with_cell_boxes()` - draw borders + images only
- [x] Modify `regions_to_avoid` - exclude tables so text passes through table regions
- [x] Integration test: test_layered_rendering.py

### Task 3.3: Fallback
- [x] Use ReportLab Table when cell_boxes are unavailable
- [x] Ensure backward compatibility

## Phase 4: Testing & Verification (done)

### Task 4.1: Unit tests
- [x] Test cell_box_list extraction (29 cells succeeded)
- [x] Test Image-in-Table handling (1 image embedded)
- [x] Test error handling

### Task 4.2: Integration tests
- [x] Test the OCR track with a real PDF (test_layered_rendering.py)
- [x] Verify PDF layout reconstruction
- [x] Layered rendering test results:
  - 50 text elements (filled in from raw OCR; originally only 5)
  - 31 cell_boxes (8 + 23)
  - 1 embedded_image
  - PDF generated successfully (57,290 bytes)

## Phase 5: Cleanup (done)

### Task 5.1: Remove old code
- [x] Remove the SLANeXt model caching code
- [x] Remove `_get_slanet_model()`, `_get_table_classifier()`, `_extract_cell_boxes_with_slanet()`, `release_slanet_models()`
- [x] Remove the `enable_table_cell_boxes_extraction` setting
- [x] Clean up debug logging

---
@@ -66,32 +92,182 @@
 | File | Changes |
 |------|---------|
-| `backend/app/core/config.py` | Add the config option |
-| `backend/app/services/pp_structure_enhanced.py` | Main implementation |
-| `backend/app/services/pdf_generator_service.py` | Use cell_boxes |
+| `backend/app/core/config.py` | Remove `enable_table_cell_boxes_extraction` |
+| `backend/app/services/pp_structure_enhanced.py` | Use `table_res_list`, add `_embed_images_in_table()` |
+| `backend/app/services/pdf_generator_service.py` | Layered rendering: draw borders only, exclude table regions from text filtering |
+| `backend/app/services/gap_filling_service.py` | `_is_region_covered()` skips TABLE elements |
+| `backend/tests/test_layered_rendering.py` | Layered rendering integration test |
 
-### Dependencies
+### PPStructureV3 data structure
 ```python
-from paddlex import create_model
+result.json = {
+    'res': {
+        'parsing_res_list': [...],   # parsing results
+        'layout_det_res': {...},     # layout detection results
+        'table_res_list': [          # table recognition results
+            {
+                'cell_box_list': [[x1,y1,x2,y2], ...],  # ← the key field!
+                'pred_html': '<html>...',
+                'table_ocr_pred': {...}
+            }
+        ],
+        'overall_ocr_res': {...}
+    }
+}
 ```
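For reference, a minimal sketch (not the actual service code) of walking this structure to collect each table's HTML and cell boxes; the function name `extract_table_cell_boxes` is illustrative:

```python
def extract_table_cell_boxes(result) -> list:
    """Collect each table's HTML and absolute-coordinate cell boxes from one PPStructureV3 result."""
    tables = []
    for table_res in result.json['res'].get('table_res_list', []):
        tables.append({
            'html': table_res.get('pred_html', ''),
            # cell_box_list coordinates are already absolute page coordinates (see Task 2.1)
            'cell_boxes': [list(map(float, box)) for box in table_res.get('cell_box_list', [])],
        })
    return tables
```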
-### Test data
-- Task ID: `79a3d256-88f6-41d4-a7e9-3e358c85db40`
-- Table bbox: `[84, 269, 1174, 1508]`
-- Expected cell count: 29 (SLANeXt_wired)
+### Test results
+- Task ID: `442f9345-09ba-4a7d-949f-3bc88c2fa895`
+- cell_boxes: 29 cells (source: table_res_list)
+- embedded_images: 1 (img_in_table_935_838_1118_1031)
 
-### Implementation summary
-**Completed (715805b):**
-1. `config.py`: add the `enable_table_cell_boxes_extraction` setting
-2. `pp_structure_enhanced.py`:
-   - Add `_slanet_wired_model`, `_slanet_wireless_model`, `_table_cls_model` cache attributes
-   - Implement lazy loading via `_get_slanet_model()` and `_get_table_classifier()`
-   - Implement `_extract_cell_boxes_with_slanet()` to extract cell boxes from cropped images
-   - Implement `release_slanet_models()` to free GPU memory
-   - Modify the table processing logic to call SLANeXt when PPStructureV3 returns no boxes
-3. `pdf_generator_service.py`:
-   - Add `_compute_table_grid_from_cell_boxes()` to compute column widths and row heights
-   - Modify `draw_table_region()` to prefer cell_boxes when computing column widths
+### Local vs. cloud differences
+| Feature | Local PaddleX 3.3.9 | Cloud pp_demo |
+|------|-------------------|--------------|
+| `table_res_list` | ✓ provided | ✓ provided |
+| `cell_box_list` | ✓ 29 cells | ✓ 27+8 cells |
+| Layout recognition | 1 merged table | 2 separate tables |
+| Image-in-Table | must be handled manually | auto-embedded in the HTML |
+
+### Open issues
+
+1. **Layout recognition merges tables**: the local layout model merges multiple tables into one big table
+   - As a result, `table_res_list` contains only 1 table
+   - The cloud recognizes 2 separate tables
+   - May require tuning the layout model parameters or the post-processing logic
+
+---
+
+## Layered Rendering Technical Design (2025-11-30)
+
+### Root cause
+
+A ReportLab Table needs a regular rectangular grid, but PPStructureV3's cell_boxes reflect actual visual positions, which do not match the HTML's logical structure. Trying to draw text inside the cells caused:
+- text overflowing the cell borders
+- wrong text-to-cell matches
+- some text going missing
+
+### Solution: layered rendering
+
+Decouple table rendering into three independent layers:
+
+```
+┌─────────────────────────────────────────────┐
+│ Layer 3: Embedded Images                    │
+│ (from metadata['embedded_images'])          │
+├─────────────────────────────────────────────┤
+│ Layer 2: Text at Raw OCR Positions          │
+│ (raw OCR supplemented by GapFillingService) │
+├─────────────────────────────────────────────┤
+│ Layer 1: Table Cell Borders                 │
+│ (drawn from metadata['cell_boxes'])         │
+└─────────────────────────────────────────────┘
+```
+
+### Implementation details
+
+**1. GapFillingService change** (`_is_region_covered`):
+```python
+# Skip coverage checks for TABLE elements so text inside tables passes through
+if skip_table_coverage and element.type == ElementType.TABLE:
+    continue
+```
+
+**2. PDF generator change** (`regions_to_avoid`):
+```python
+# Exclude tables; only avoid overlapping with images
+regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
+```
+
+**3. Simplified `_draw_table_with_cell_boxes`**:
+```python
+def _draw_table_with_cell_boxes(...):
+    """Draw only the cell borders and images; text is handled elsewhere."""
+    # 1. Draw each cell's border
+    for box in cell_boxes:
+        pdf_canvas.rect(x, y, width, height, stroke=1, fill=0)
+    # 2. Draw the embedded_images
+    for img in embedded_images:
+        self._draw_embedded_image(...)
+```
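One detail the snippet above leaves implicit: cell_boxes are image coordinates with a top-left origin, while ReportLab's canvas origin is bottom-left, so the y axis must be flipped before calling `rect`. A minimal sketch of that conversion, with `draw_cell_borders`, `scale`, and `page_height` as assumed names and parameters rather than the actual implementation:

```python
from reportlab.pdfgen import canvas


def draw_cell_borders(pdf_canvas: "canvas.Canvas", cell_boxes, page_height: float, scale: float = 1.0):
    """Draw [x1, y1, x2, y2] image-space cell boxes onto a ReportLab canvas.

    Image coordinates grow downward from a top-left origin; ReportLab grows
    upward from a bottom-left origin, so the y axis is flipped. `scale` maps
    image pixels to PDF points.
    """
    for x1, y1, x2, y2 in cell_boxes:
        x = x1 * scale
        y = page_height - y2 * scale  # bottom edge of the box after the flip
        pdf_canvas.rect(x, y, (x2 - x1) * scale, (y2 - y1) * scale, stroke=1, fill=0)
```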
+### Advantages
+
+1. **Decoupled**: border rendering and text rendering are fully independent
+2. **Accurate**: text positions come straight from the OCR results; nothing is inferred
+3. **Stable**: unaffected by mismatches between cell_boxes and the HTML
+4. **Compatible**: the overall_ocr_res.png view from the visualization output can be reproduced directly
+
+### Test results
+- Task ID: `84899366-f361-44f1-b989-5aba72419ca5`
+- cell_boxes: 31 (8 + 23)
+- original text elements: 5
+- text elements after gap filling: 50 (supplemented from raw OCR)
+- PDF size: 57,290 bytes
+
+---
+
+## Hybrid Rendering Optimization (2025-11-30)
+
+### Problems found
+
+Issues remained after layered rendering:
+1. Skewed tables: cell_boxes carry coordinate deviations of 2-11 pixels
+2. Styles not applied to Title and similar elements: the OCR track applied no styles
+
+### Solution: hybrid rendering + grid alignment
+
+**1. Cell box grid alignment** (`_normalize_cell_boxes_to_grid`):
+```python
+def _normalize_cell_boxes_to_grid(self, cell_boxes, threshold=10.0):
+    """
+    Snap nearby coordinates to a shared value, removing the 2-11 px deviations.
+    - Collect all X/Y coordinates
+    - Cluster coordinates that fall within the threshold
+    - Use each cluster's mean as the aligned coordinate
+    """
+```
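The docstring amounts to 1-D coordinate clustering. A self-contained sketch of the idea, assuming gap-based clustering over sorted values (function names are illustrative, not the project's actual code):

```python
def snap_coordinates(values, threshold=10.0):
    """Map each 1-D coordinate to the mean of its cluster (gap-based clustering)."""
    mapping = {}
    cluster = []
    for v in sorted(set(values)):
        # Start a new cluster whenever the gap to the previous value exceeds the threshold
        if cluster and v - cluster[-1] > threshold:
            mean = sum(cluster) / len(cluster)
            mapping.update({c: mean for c in cluster})
            cluster = []
        cluster.append(v)
    if cluster:
        mean = sum(cluster) / len(cluster)
        mapping.update({c: mean for c in cluster})
    return mapping


def normalize_cell_boxes_to_grid(cell_boxes, threshold=10.0):
    """Snap every box edge to the shared grid lines found by clustering."""
    snap_x = snap_coordinates([c for x1, _, x2, _ in cell_boxes for c in (x1, x2)], threshold)
    snap_y = snap_coordinates([c for _, y1, _, y2 in cell_boxes for c in (y1, y2)], threshold)
    return [[snap_x[x1], snap_y[y1], snap_x[x2], snap_y[y2]] for x1, y1, x2, y2 in cell_boxes]
```

Clustering by gaps on sorted values keeps each grid line's coordinates within `threshold` of one another, which matches the 2-11 px deviations reported above.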
+**2. Element-type styles** (OCR track):
+```python
+# Check the element type in draw_text_region
+element_type = region.get('element_type', 'text')
+if element_type == 'title':
+    font_size = min(font_size * 1.3, 36)   # 30% larger
+elif element_type == 'header':
+    font_size = min(font_size * 1.15, 24)  # 15% larger
+elif element_type == 'caption':
+    font_size = max(font_size * 0.9, 6)    # 10% smaller
+```
+
+**3. Element-type propagation**:
+```python
+# Added in convert_unified_document_to_ocr_data
+text_region = {
+    'text': text_content,
+    'bbox': bbox_polygon,
+    'element_type': element.type.value  # new
+}
+```
+
+### Effect of the improvements
+
+| Item | Before | After |
+|------|--------|--------|
+| Table borders | skewed (2-11 px deviations) | grid-aligned |
+| Title style | none (same as body text) | enlarged 36 pt font |
+| Hybrid rendering | raw OCR only | PP-Structure + raw OCR |
+
+### Test results (2025-11-30)
+- Task ID: `3a3f350f-2d81-4af4-8a18-021ea09ac433`
+- Table 1: 8 cell_boxes → grid-aligned
+- Table 2: 23 cell_boxes → grid-aligned + 1 embedded image
+- Title: Applied title style: size=36.0
+- PDF size: 104,082 bytes