feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
@@ -104,7 +104,15 @@ class Settings(BaseSettings):
     # Now using None to let PaddleX use its optimized defaults.
     layout_detection_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
     layout_nms_threshold: Optional[float] = Field(default=None)  # None = use PaddleX default
-    layout_merge_mode: Optional[str] = Field(default=None)  # None = use PaddleX default
+    # layout_merge_bboxes_mode options:
+    # - "large": Keep larger box when overlap (default)
+    # - "small": Keep smaller box when overlap
+    # - "union": Keep all boxes (preserve overlapping tables/images)
+    # Using "union" to prevent tables from being merged together
+    layout_merge_mode: Optional[str] = Field(
+        default="union",
+        description="How to handle overlapping detection boxes. 'union' preserves all detected regions."
+    )
     layout_unclip_ratio: Optional[float] = Field(default=None)  # None = use PaddleX default

     # Text Detection Parameters
@@ -161,13 +169,8 @@ class Settings(BaseSettings):
         description="Cell detection model for borderless tables. RT-DETR-L provides best accuracy."
     )

-    # Table Cell Boxes Extraction - supplement PPStructureV3 with direct SLANeXt calls
-    # When enabled, directly invokes SLANeXt models to extract cell bounding boxes
-    # which are not exposed by the PPStructureV3 high-level API
-    enable_table_cell_boxes_extraction: bool = Field(
-        default=True,
-        description="Enable direct SLANeXt model calls to extract table cell bounding boxes for accurate PDF layout."
-    )
+    # Note: Table cell boxes are now extracted from table_res_list returned by PPStructureV3
+    # No additional model calls needed - PPStructureV3 provides cell_box_list in table_res_list

     # Formula Recognition Model Configuration (Stage 4)
     # Available models:
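To make the merge-mode trade-off concrete, here is a minimal sketch (not part of this commit) of what the three strategies mean for two overlapping layout boxes. The `merge_overlapping` helper is hypothetical; PaddleX's actual implementation may differ:

```python
# Hypothetical illustration of the three merge strategies on [x1, y1, x2, y2] boxes.
def area(box):
    return max(0.0, box[2] - box[0]) * max(0.0, box[3] - box[1])

def overlaps(a, b):
    return a[0] < b[2] and b[0] < a[2] and a[1] < b[3] and b[1] < a[3]

def merge_overlapping(boxes, mode="union"):
    if mode == "union":
        return boxes  # keep every box, even overlapping ones
    kept = []
    for box in boxes:
        rivals = [b for b in boxes if b is not box and overlaps(box, b)]
        if mode == "large" and all(area(box) >= area(b) for b in rivals):
            kept.append(box)  # keep only boxes not dominated by a larger overlapping box
        elif mode == "small" and all(area(box) <= area(b) for b in rivals):
            kept.append(box)
    return kept

two_tables = [[0, 0, 100, 50], [90, 0, 200, 50]]  # adjacent tables overlapping slightly
print(len(merge_overlapping(two_tables, "union")))  # 2 -> both tables survive
print(len(merge_overlapping(two_tables, "large")))  # 1 -> the smaller table is dropped
```

This is why the commit pins `layout_merge_mode="union"`: under "large", two slightly overlapping tables collapse into one region.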
@@ -40,6 +40,7 @@ from app.schemas.task import (
     PreprocessingPreviewRequest,
     PreprocessingPreviewResponse,
     ImageQualityMetrics,
+    TableDetectionConfig,
 )
 from app.services.task_service import task_service
 from app.services.file_access_service import file_access_service
@@ -75,7 +76,8 @@ def process_task_ocr(
     language: str = 'ch',
     layout_model: Optional[str] = "chinese",
     preprocessing_mode: Optional[str] = "auto",
-    preprocessing_config: Optional[dict] = None
+    preprocessing_config: Optional[dict] = None,
+    table_detection_config: Optional[dict] = None
 ):
     """
     Background task to process OCR for a task with dual-track support.
@@ -94,6 +96,7 @@ def process_task_ocr(
         layout_model: Layout detection model ('chinese', 'default', 'cdla')
         preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
         preprocessing_config: Manual preprocessing config dict (contrast, sharpen, binarize)
+        table_detection_config: Table detection config dict (enable_wired_table, enable_wireless_table, enable_region_detection)
     """
     from app.core.database import SessionLocal
     from app.models.task import Task
@@ -106,6 +109,7 @@ def process_task_ocr(
     logger.info(f"Starting OCR processing for task {task_id}, file: {filename}")
     logger.info(f"Processing options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
     logger.info(f"Preprocessing options: mode={preprocessing_mode}, config={preprocessing_config}")
+    logger.info(f"Table detection options: {table_detection_config}")

     # Convert preprocessing parameters to proper types
     preprocess_mode_enum = None
@@ -122,6 +126,15 @@ def process_task_ocr(
             binarize=preprocessing_config.get("binarize", False)
         )

+    # Convert table detection config to object
+    table_det_config_obj = None
+    if table_detection_config:
+        table_det_config_obj = TableDetectionConfig(
+            enable_wired_table=table_detection_config.get("enable_wired_table", True),
+            enable_wireless_table=table_detection_config.get("enable_wireless_table", True),
+            enable_region_detection=table_detection_config.get("enable_region_detection", True)
+        )
+
     # Get task directly by database ID (bypass user isolation for background task)
     task = db.query(Task).filter(Task.id == task_db_id).first()
     if not task:
@@ -170,7 +183,8 @@ def process_task_ocr(
             force_track=force_track,
             layout_model=layout_model,
             preprocessing_mode=preprocess_mode_enum,
-            preprocessing_config=preprocess_config_obj
+            preprocessing_config=preprocess_config_obj,
+            table_detection_config=table_det_config_obj
         )
     else:
         # Fall back to traditional processing (no force_track support)
@@ -181,7 +195,8 @@ def process_task_ocr(
             output_dir=result_dir,
             layout_model=layout_model,
             preprocessing_mode=preprocess_mode_enum,
-            preprocessing_config=preprocess_config_obj
+            preprocessing_config=preprocess_config_obj,
+            table_detection_config=table_det_config_obj
         )

     # Calculate processing time
@@ -754,6 +769,7 @@ async def start_task(
    - **force_track**: Force specific processing track ('ocr' or 'direct')
    - **language**: OCR language code (default: 'ch')
    - **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
+    - **table_detection**: Table detection config (enable_wired_table, enable_wireless_table, enable_region_detection)
     """
     try:
         # Parse processing options with defaults
@@ -781,6 +797,16 @@ async def start_task(
            }
        logger.info(f"Preprocessing: mode={preprocessing_mode}, config={preprocessing_config}")

+        # Extract table detection options
+        table_detection_config = None
+        if options.table_detection:
+            table_detection_config = {
+                "enable_wired_table": options.table_detection.enable_wired_table,
+                "enable_wireless_table": options.table_detection.enable_wireless_table,
+                "enable_region_detection": options.table_detection.enable_region_detection
+            }
+            logger.info(f"Table detection: {table_detection_config}")
+
        # Get task details
        task = task_service.get_task_by_id(
            db=db,
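For reference, a minimal sketch of a client request carrying these options. The endpoint path, port, and surrounding payload shape are assumptions for illustration; only the option names come from this diff:

```python
import httpx

# Hypothetical endpoint path; only the option names are confirmed by this commit.
payload = {
    "use_dual_track": True,
    "language": "ch",
    "layout_model": "chinese",
    "preprocessing_mode": "auto",
    "table_detection": {                 # maps to ProcessingOptions.table_detection
        "enable_wired_table": True,
        "enable_wireless_table": False,  # skip borderless detection for speed
        "enable_region_detection": True,
    },
}
response = httpx.post("http://localhost:8000/api/tasks/123/start", json=payload)
print(response.status_code)
```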
@@ -829,11 +855,12 @@ async def start_task(
            language=language,
            layout_model=layout_model,
            preprocessing_mode=preprocessing_mode,
-            preprocessing_config=preprocessing_config
+            preprocessing_config=preprocessing_config,
+            table_detection_config=table_detection_config
        )

        logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
-        logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}")
+        logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}, table_detection={table_detection_config}")
        return task

    except HTTPException:
@@ -96,6 +96,35 @@ class PreprocessingConfig(BaseModel):
         default=False,
         description="Enable binarization (aggressive, for very low contrast). Not recommended for most documents."
     )
+    remove_scan_artifacts: bool = Field(
+        default=True,
+        description="Remove horizontal scan line artifacts. Recommended for scanned documents to prevent misdetection of scanner light bar lines as table borders."
+    )
+
+
+class TableDetectionConfig(BaseModel):
+    """Table detection configuration for PP-StructureV3.
+
+    Controls which table detection modes to enable. PP-StructureV3 uses specialized
+    models for different table types:
+    - Wired (bordered): Tables with visible cell borders/grid lines
+    - Wireless (borderless): Tables without visible borders, relying on alignment
+    - Region detection: Detect table-like regions for better cell structure
+
+    Multiple options can be enabled simultaneously for comprehensive detection.
+    """
+    enable_wired_table: bool = Field(
+        default=True,
+        description="Enable wired (bordered) table detection. Best for tables with visible grid lines."
+    )
+    enable_wireless_table: bool = Field(
+        default=True,
+        description="Enable wireless (borderless) table detection. Best for tables without visible borders."
+    )
+    enable_region_detection: bool = Field(
+        default=True,
+        description="Enable region detection for better table structure inference."
+    )


 class ImageQualityMetrics(BaseModel):
@@ -294,6 +323,12 @@ class ProcessingOptions(BaseModel):
         description="Manual preprocessing config (only used when preprocessing_mode='manual')"
     )

+    # Table detection configuration (OCR track only)
+    table_detection: Optional[TableDetectionConfig] = Field(
+        None,
+        description="Table detection config. If None, all table detection modes are enabled."
+    )
+

 class AnalyzeRequest(BaseModel):
     """Document analysis request"""
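A minimal sketch of how these schemas compose. Field names and the import path come from the hunks above; it assumes the remaining `ProcessingOptions` fields all have defaults:

```python
from app.schemas.task import ProcessingOptions, TableDetectionConfig

# Disable wireless detection but keep wired + region detection.
options = ProcessingOptions(
    table_detection=TableDetectionConfig(
        enable_wired_table=True,
        enable_wireless_table=False,
        enable_region_detection=True,
    )
)

# Leaving table_detection as None means all detection modes stay enabled
# (assuming the other ProcessingOptions fields have defaults).
default_options = ProcessingOptions()
assert default_options.table_detection is None
```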
backend/app/services/cv_table_detector.py (new file, 362 lines)
@@ -0,0 +1,362 @@
+"""
+CV-based Table Line Detection Module
+
+Uses OpenCV morphological operations to detect table lines and extract cell boundaries.
+This is more reliable for wired/bordered tables than ML-based cell detection.
+"""
+
+import cv2
+import numpy as np
+from typing import List, Tuple, Optional
+from pathlib import Path
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class CVTableDetector:
+    """
+    Detects table cell boundaries using computer vision techniques.
+    Works by detecting horizontal and vertical lines in the image.
+    """
+
+    def __init__(
+        self,
+        min_line_length: int = 30,
+        line_thickness: int = 2,
+        min_cell_width: int = 20,
+        min_cell_height: int = 15
+    ):
+        """
+        Initialize the CV table detector.
+
+        Args:
+            min_line_length: Minimum length of lines to detect (in pixels)
+            line_thickness: Expected thickness of table lines
+            min_cell_width: Minimum width of a valid cell
+            min_cell_height: Minimum height of a valid cell
+        """
+        self.min_line_length = min_line_length
+        self.line_thickness = line_thickness
+        self.min_cell_width = min_cell_width
+        self.min_cell_height = min_cell_height
+
+    def detect_cells(
+        self,
+        image: np.ndarray,
+        table_bbox: Optional[List[float]] = None
+    ) -> List[List[float]]:
+        """
+        Detect cell boundaries in a table image.
+
+        Args:
+            image: Input image (BGR format)
+            table_bbox: Optional [x1, y1, x2, y2] to crop table region first
+
+        Returns:
+            List of cell bounding boxes [[x1, y1, x2, y2], ...]
+        """
+        # Crop to table region if bbox provided
+        offset_x, offset_y = 0, 0
+        if table_bbox:
+            x1, y1, x2, y2 = [int(v) for v in table_bbox]
+            offset_x, offset_y = x1, y1
+            image = image[y1:y2, x1:x2]
+
+        if image.size == 0:
+            logger.warning("Empty image after cropping")
+            return []
+
+        # Convert to grayscale
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image
+
+        # Detect lines
+        horizontal_lines, vertical_lines = self._detect_lines(gray)
+
+        if horizontal_lines is None or vertical_lines is None:
+            logger.warning("Failed to detect table lines")
+            return []
+
+        # Find intersections to build grid
+        cells = self._build_cell_grid(horizontal_lines, vertical_lines, gray.shape)
+
+        # Convert to absolute coordinates
+        absolute_cells = []
+        for cell in cells:
+            abs_cell = [
+                cell[0] + offset_x,
+                cell[1] + offset_y,
+                cell[2] + offset_x,
+                cell[3] + offset_y
+            ]
+            absolute_cells.append(abs_cell)
+
+        logger.info(f"[CV] Detected {len(absolute_cells)} cells from table lines")
+        return absolute_cells
+
+    def _detect_lines(
+        self,
+        gray: np.ndarray
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
+        """
+        Detect horizontal and vertical lines using morphological operations.
+
+        Args:
+            gray: Grayscale image
+
+        Returns:
+            Tuple of (horizontal_lines_mask, vertical_lines_mask)
+        """
+        # Adaptive threshold for better line detection
+        binary = cv2.adaptiveThreshold(
+            gray, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY_INV,
+            11, 2
+        )
+
+        # Detect horizontal lines
+        h_kernel_length = max(self.min_line_length, gray.shape[1] // 30)
+        horizontal_kernel = cv2.getStructuringElement(
+            cv2.MORPH_RECT, (h_kernel_length, 1)
+        )
+        horizontal_lines = cv2.morphologyEx(
+            binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
+        )
+
+        # Detect vertical lines
+        v_kernel_length = max(self.min_line_length, gray.shape[0] // 30)
+        vertical_kernel = cv2.getStructuringElement(
+            cv2.MORPH_RECT, (1, v_kernel_length)
+        )
+        vertical_lines = cv2.morphologyEx(
+            binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
+        )
+
+        return horizontal_lines, vertical_lines
+
+    def _build_cell_grid(
+        self,
+        horizontal_mask: np.ndarray,
+        vertical_mask: np.ndarray,
+        image_shape: Tuple[int, int]
+    ) -> List[List[float]]:
+        """
+        Build cell grid from detected line masks.
+
+        Args:
+            horizontal_mask: Binary mask of horizontal lines
+            vertical_mask: Binary mask of vertical lines
+            image_shape: (height, width) of the image
+
+        Returns:
+            List of cell bounding boxes
+        """
+        height, width = image_shape[:2]
+
+        # Combine masks to find table structure
+        table_mask = cv2.add(horizontal_mask, vertical_mask)
+
+        # Find contours (cells are enclosed regions)
+        contours, hierarchy = cv2.findContours(
+            table_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
+        )
+
+        # Method 1: Use contours to find cells
+        cells_from_contours = self._cells_from_contours(contours, hierarchy)
+
+        # Method 2: Use line intersections to build grid
+        cells_from_grid = self._cells_from_line_intersections(
+            horizontal_mask, vertical_mask, height, width
+        )
+
+        # Use whichever method found more valid cells
+        if len(cells_from_grid) >= len(cells_from_contours):
+            return cells_from_grid
+        return cells_from_contours
+
+    def _cells_from_contours(
+        self,
+        contours,
+        hierarchy
+    ) -> List[List[float]]:
+        """Extract cell bounding boxes from contours."""
+        cells = []
+
+        for i, contour in enumerate(contours):
+            x, y, w, h = cv2.boundingRect(contour)
+
+            # Filter by minimum size
+            if w >= self.min_cell_width and h >= self.min_cell_height:
+                # Check if this is an inner contour (cell) not the outer table
+                if hierarchy is not None and hierarchy[0][i][3] != -1:
+                    cells.append([float(x), float(y), float(x + w), float(y + h)])
+
+        return cells
+
+    def _cells_from_line_intersections(
+        self,
+        horizontal_mask: np.ndarray,
+        vertical_mask: np.ndarray,
+        height: int,
+        width: int
+    ) -> List[List[float]]:
+        """Build cells from line intersections (grid-based approach)."""
+        # Find horizontal line y-coordinates
+        h_projection = np.sum(horizontal_mask, axis=1)
+        h_lines = self._find_line_positions(h_projection, min_gap=self.min_cell_height)
+
+        # Find vertical line x-coordinates
+        v_projection = np.sum(vertical_mask, axis=0)
+        v_lines = self._find_line_positions(v_projection, min_gap=self.min_cell_width)
+
+        if len(h_lines) < 2 or len(v_lines) < 2:
+            logger.debug(f"Insufficient lines: {len(h_lines)} horizontal, {len(v_lines)} vertical")
+            return []
+
+        # Build cells from grid
+        cells = []
+        for i in range(len(h_lines) - 1):
+            for j in range(len(v_lines) - 1):
+                y1, y2 = h_lines[i], h_lines[i + 1]
+                x1, x2 = v_lines[j], v_lines[j + 1]
+
+                # Validate cell size
+                if (x2 - x1) >= self.min_cell_width and (y2 - y1) >= self.min_cell_height:
+                    cells.append([float(x1), float(y1), float(x2), float(y2)])
+
+        return cells
+
+    def _find_line_positions(
+        self,
+        projection: np.ndarray,
+        min_gap: int
+    ) -> List[int]:
+        """
+        Find line positions from projection profile.
+
+        Args:
+            projection: 1D array of pixel sums
+            min_gap: Minimum gap between lines
+
+        Returns:
+            List of line positions
+        """
+        # Threshold to find peaks (lines)
+        threshold = np.max(projection) * 0.3
+        peaks = projection > threshold
+
+        # Find transitions (line positions)
+        positions = []
+        in_peak = False
+        peak_start = 0
+
+        for i, is_peak in enumerate(peaks):
+            if is_peak and not in_peak:
+                peak_start = i
+                in_peak = True
+            elif not is_peak and in_peak:
+                # End of peak - use center
+                peak_center = (peak_start + i) // 2
+                if not positions or (peak_center - positions[-1]) >= min_gap:
+                    positions.append(peak_center)
+                in_peak = False
+
+        return positions
+
+    def detect_and_merge_with_ml(
+        self,
+        image: np.ndarray,
+        table_bbox: List[float],
+        ml_cell_boxes: List[List[float]]
+    ) -> List[List[float]]:
+        """
+        Detect cells using CV and merge/validate with ML-detected boxes.
+
+        CV detection is used as the primary source for wired tables,
+        with ML boxes used to fill gaps or validate.
+
+        Args:
+            image: Input image
+            table_bbox: Table bounding box [x1, y1, x2, y2]
+            ml_cell_boxes: Cell boxes from ML model (RT-DETR-L)
+
+        Returns:
+            Merged/validated cell boxes
+        """
+        cv_cells = self.detect_cells(image, table_bbox)
+
+        if not cv_cells:
+            # CV detection failed, fall back to ML
+            logger.info("[CV] No cells detected by CV, using ML cells")
+            return ml_cell_boxes
+
+        if not ml_cell_boxes:
+            # Only CV cells available
+            return cv_cells
+
+        # Validate: CV should find structured grid
+        # If CV found significantly fewer cells, there might be merged cells
+        cv_count = len(cv_cells)
+        ml_count = len(ml_cell_boxes)
+
+        logger.info(f"[CV] CV detected {cv_count} cells, ML detected {ml_count} cells")
+
+        # For wired tables, prefer CV detection (cleaner grid)
+        if cv_count >= ml_count * 0.5:
+            # CV found reasonable number of cells
+            return cv_cells
+        else:
+            # CV might have missed cells (possibly due to merged cells)
+            # Try to use ML boxes that don't overlap with CV cells
+            merged = list(cv_cells)
+            for ml_box in ml_cell_boxes:
+                if not self._has_significant_overlap(ml_box, cv_cells):
+                    merged.append(ml_box)
+            return merged
+
+    def _has_significant_overlap(
+        self,
+        box: List[float],
+        boxes: List[List[float]],
+        threshold: float = 0.5
+    ) -> bool:
+        """Check if box significantly overlaps with any box in the list."""
+        for other in boxes:
+            iou = self._calculate_iou(box, other)
+            if iou > threshold:
+                return True
+        return False
+
+    def _calculate_iou(
+        self,
+        box1: List[float],
+        box2: List[float]
+    ) -> float:
+        """Calculate Intersection over Union of two boxes."""
+        x1 = max(box1[0], box2[0])
+        y1 = max(box1[1], box2[1])
+        x2 = min(box1[2], box2[2])
+        y2 = min(box1[3], box2[3])
+
+        if x2 <= x1 or y2 <= y1:
+            return 0.0
+
+        intersection = (x2 - x1) * (y2 - y1)
+        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+        union = area1 + area2 - intersection
+
+        return intersection / union if union > 0 else 0.0
+
+
+def load_image(image_path: str) -> Optional[np.ndarray]:
+    """Load image from path."""
+    path = Path(image_path)
+    if not path.exists():
+        logger.error(f"Image not found: {image_path}")
+        return None
+    return cv2.imread(str(path))
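Although the module is currently disabled in the pipeline, it can be exercised standalone. A minimal sketch using a synthetic grid so it runs without a sample scan on disk (the expected cell count assumes the grid-based path wins):

```python
import numpy as np
from app.services.cv_table_detector import CVTableDetector  # load_image() also exists for files

detector = CVTableDetector(min_cell_width=20, min_cell_height=15)

# Synthetic 2x2 table: black grid lines on a white page.
img = np.full((200, 300, 3), 255, dtype=np.uint8)
for y in (10, 100, 190):
    img[y:y + 2, 10:290] = 0   # horizontal lines
for x in (10, 150, 290):
    img[10:190, x:x + 2] = 0   # vertical lines

cells = detector.detect_cells(img)     # whole image treated as the table region
print(f"{len(cells)} cells detected")  # expect 4 (2 rows x 2 columns)

# With a layout-detected bbox and ML cell boxes, the two sources can be merged:
# merged = detector.detect_and_merge_with_ml(img, [0, 0, 300, 200], ml_cell_boxes)
```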
@@ -212,7 +212,8 @@ class GapFillingService:
     def _is_region_covered(
         self,
         region: TextRegion,
-        pp_structure_elements: List[DocumentElement]
+        pp_structure_elements: List[DocumentElement],
+        skip_table_coverage: bool = True
     ) -> bool:
         """
         Check if a raw OCR region is covered by any PP-StructureV3 element.
@@ -220,6 +221,9 @@ class GapFillingService:
         Args:
             region: Raw OCR text region
             pp_structure_elements: List of PP-StructureV3 elements
+            skip_table_coverage: If True, don't consider TABLE elements as covering
+                                 (allows raw OCR text inside tables to pass through
+                                 for layered rendering)

         Returns:
             True if the region is covered
@@ -228,6 +232,12 @@ class GapFillingService:
         region_bbox = region.normalized_bbox

         for element in pp_structure_elements:
+            # Skip TABLE elements when checking coverage
+            # This allows raw OCR text inside tables to be preserved
+            # PDF generator will render: table borders + raw text positions
+            if skip_table_coverage and element.type == ElementType.TABLE:
+                continue
+
             elem_bbox = (
                 element.bbox.x0, element.bbox.y0,
                 element.bbox.x1, element.bbox.y1
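The effect of the new flag, sketched in plain Python. The `TextRegion`/`DocumentElement` schemas are replaced here by simple `(type, bbox)` tuples; this is a simplified stand-in, not the service's real code:

```python
# Simplified stand-in for the coverage check above. Elements are (type, bbox) pairs.
def contains(outer, inner):
    return (outer[0] <= inner[0] and outer[1] <= inner[1]
            and outer[2] >= inner[2] and outer[3] >= inner[3])

def is_region_covered(region_bbox, elements, skip_table_coverage=True):
    for elem_type, elem_bbox in elements:
        if skip_table_coverage and elem_type == "table":
            continue  # raw OCR text inside tables passes through for layered rendering
        if contains(elem_bbox, region_bbox):
            return True
    return False

elements = [("table", (0, 0, 500, 300))]
cell_text = (50, 40, 120, 60)  # raw OCR text sitting inside the table
print(is_region_covered(cell_text, elements))                             # False -> kept
print(is_region_covered(cell_text, elements, skip_table_coverage=False))  # True  -> dropped
```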
@@ -184,6 +184,99 @@ class LayoutPreprocessingService:

         return normalized

+    def remove_scan_artifacts(
+        self,
+        image: np.ndarray,
+        line_thickness: int = 5,
+        min_line_length_ratio: float = 0.3,
+        faint_threshold: int = 30
+    ) -> np.ndarray:
+        """
+        Remove horizontal scan line artifacts from scanned documents.
+
+        Scanner light bar artifacts appear as FAINT horizontal lines across the image.
+        Key distinction from table borders:
+        - Scan artifacts are LIGHT/FAINT (close to background color)
+        - Table borders are DARK/BOLD (high contrast)
+
+        Method:
+        1. Detect horizontal edges using Sobel filter
+        2. Filter to keep only FAINT edges (low contrast)
+        3. Find continuous horizontal segments
+        4. Remove only faint horizontal lines while preserving bold table borders
+
+        Args:
+            image: Input image (BGR)
+            line_thickness: Maximum thickness of lines to remove (pixels)
+            min_line_length_ratio: Minimum line length as ratio of image width (0.0-1.0)
+            faint_threshold: Maximum edge strength for "faint" lines (0-255)
+
+        Returns:
+            Image with scan artifacts removed (BGR)
+        """
+        h, w = image.shape[:2]
+        min_line_length = int(w * min_line_length_ratio)
+
+        # Convert to grayscale for detection
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # Step 1: Detect horizontal edges using Sobel (vertical gradient)
+        # Scan artifacts will have weak gradients, table borders will have strong gradients
+        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+        sobel_abs = np.abs(sobel_y).astype(np.uint8)
+
+        # Step 2: Find FAINT horizontal edges only (low gradient magnitude)
+        # Strong edges (table borders) have high sobel values
+        # Faint edges (scan artifacts) have low sobel values
+        faint_edges = (sobel_abs > 5) & (sobel_abs < faint_threshold)
+        faint_edges = faint_edges.astype(np.uint8) * 255
+
+        # Step 3: Use horizontal morphological operations to find continuous lines
+        horizontal_kernel = cv2.getStructuringElement(
+            cv2.MORPH_RECT,
+            (min_line_length, 1)
+        )
+
+        # Opening removes short segments, keeping only long horizontal lines
+        horizontal_lines = cv2.morphologyEx(
+            faint_edges, cv2.MORPH_OPEN, horizontal_kernel, iterations=1
+        )
+
+        # Dilate slightly to cover the full artifact width
+        dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, line_thickness))
+        line_mask = cv2.dilate(horizontal_lines, dilate_kernel, iterations=1)
+
+        # Check if any artifacts were detected
+        artifact_pixels = np.sum(line_mask > 0)
+        if artifact_pixels < 100:
+            logger.debug("No faint scan artifacts detected")
+            return image
+
+        # Calculate artifact coverage
+        total_pixels = h * w
+        coverage_ratio = artifact_pixels / total_pixels
+
+        # Faint artifacts should cover a small portion of the image
+        if coverage_ratio > 0.05:  # More than 5% is suspicious
+            logger.debug(f"Faint artifact detection: coverage={coverage_ratio:.2%} (processing anyway)")
+
+        # Only process if coverage is not excessive
+        if coverage_ratio > 0.15:  # More than 15% is definitely too much
+            logger.debug(f"Artifact detection rejected: coverage too high ({coverage_ratio:.2%})")
+            return image
+
+        # Use inpainting to remove artifacts
+        result = cv2.inpaint(image, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
+
+        logger.info(
+            f"Scan artifacts removed: {artifact_pixels} pixels ({coverage_ratio:.2%}), faint_threshold={faint_threshold}"
+        )
+
+        return result
+
     def scale_for_layout_detection(
         self,
         image: np.ndarray,
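A minimal sketch exercising the method on a synthetic scan. The gray level of the faint line is chosen so its Sobel response (~4x the step size) lands inside the 5-30 "faint" band; it assumes the service factory can be constructed standalone:

```python
import numpy as np
from app.services.layout_preprocessing_service import get_layout_preprocessing_service

service = get_layout_preprocessing_service()  # assumption: works outside the app context

# White page: one faint full-width line (gray 250 -> Sobel ~20, inside the faint band)
# and one bold table border (gray 0 -> Sobel far above the band).
page = np.full((400, 600, 3), 255, dtype=np.uint8)
page[120:122, :] = 250   # faint scanner light-bar artifact -> should be inpainted
page[300:302, :] = 0     # bold table border -> must survive untouched

cleaned = service.remove_scan_artifacts(page, faint_threshold=30)
print(page[121, 300], cleaned[121, 300])  # artifact row pulled back toward white
print(cleaned[301, 300])                  # border row still dark
```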
@@ -346,9 +439,13 @@ class LayoutPreprocessingService:
        # Only enable for extremely low contrast (< 15) which indicates a scan quality issue
        binarize = False  # Disabled by default

+        # Scan artifact removal is always enabled in auto mode for scanned documents
+        remove_scan_artifacts = True
+
        logger.debug(
            f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
-            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
+            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}, "
+            f"remove_scan_artifacts={remove_scan_artifacts}"
        )

        return PreprocessingConfig(
@@ -356,7 +453,8 @@ class LayoutPreprocessingService:
            contrast_strength=round(contrast_strength, 2),
            sharpen=sharpen,
            sharpen_strength=round(sharpen_strength, 2),
-            binarize=binarize
+            binarize=binarize,
+            remove_scan_artifacts=remove_scan_artifacts
        )

    def apply_contrast_enhancement(
@@ -550,7 +648,8 @@ class LayoutPreprocessingService:
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
-                    binarize=False
+                    binarize=False,
+                    remove_scan_artifacts=False
                ),
                quality_metrics=metrics,
                was_processed=scaling_info.was_scaled,  # True if scaling was applied
@@ -568,6 +667,13 @@ class LayoutPreprocessingService:
        processed = scaled_image.copy()
        was_processed = scaling_info.was_scaled  # Start with True if already scaled
+
+        # Step 0: Remove scan artifacts BEFORE any enhancement
+        # This prevents scanner light bar lines from being enhanced and misdetected as table borders
+        if getattr(config, 'remove_scan_artifacts', True):  # Default True for backwards compatibility
+            processed = self.remove_scan_artifacts(processed)
+            was_processed = True
+            logger.debug("Applied scan artifact removal")

        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
            processed = self.apply_contrast_enhancement(
@@ -30,7 +30,7 @@ from app.services.layout_preprocessing_service import (
     get_layout_preprocessing_service,
     LayoutPreprocessingService,
 )
-from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig
+from app.schemas.task import PreprocessingModeEnum, PreprocessingConfig, TableDetectionConfig

 # Import dual-track components
 try:
@@ -454,7 +454,11 @@ class OCRService:

         return self.ocr_engines[lang]

-    def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
+    def _ensure_structure_engine(
+        self,
+        layout_model: Optional[str] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
+    ) -> PPStructureV3:
         """
         Get or create PP-Structure engine for layout analysis with GPU support.
         Supports layout model selection for different document types.
@@ -465,6 +469,10 @@ class OCRService:
                 - "default": PubLayNet-based (best for English documents)
                 - "cdla": CDLA model (alternative for Chinese layout)
                 - None: Use config default
+            table_detection_config: Table detection configuration
+                - enable_wired_table: Enable bordered table detection
+                - enable_wireless_table: Enable borderless table detection
+                - enable_region_detection: Enable region detection

         Returns:
             PPStructure engine instance
@@ -492,6 +500,19 @@ class OCRService:
             logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
             self.structure_engine = None  # Force recreation

+        # Check if we need to recreate the engine due to different table detection config
+        current_table_config = getattr(self, '_current_table_detection_config', None)
+        if self.structure_engine is not None and table_detection_config:
+            # Compare table detection settings
+            new_config_tuple = (
+                table_detection_config.enable_wired_table,
+                table_detection_config.enable_wireless_table,
+                table_detection_config.enable_region_detection
+            )
+            if current_table_config != new_config_tuple:
+                logger.info(f"Table detection config changed from {current_table_config} to {new_config_tuple}, recreating engine")
+                self.structure_engine = None  # Force recreation
+
         # Use cached engine or create new one
         if self.structure_engine is None:
             logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
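The recreation check boils down to caching an expensive object keyed by an immutable fingerprint of its configuration. A minimal, self-contained sketch of the same pattern in isolation:

```python
# Generic version of the engine-recreation check above: rebuild only when the
# config fingerprint (a tuple, which compares by value) changes.
class EngineCache:
    def __init__(self, factory):
        self._factory = factory
        self._engine = None
        self._fingerprint = None

    def get(self, wired=True, wireless=True, region=True):
        fingerprint = (wired, wireless, region)
        if self._engine is None or fingerprint != self._fingerprint:
            self._engine = self._factory(wired, wireless, region)  # expensive rebuild
            self._fingerprint = fingerprint
        return self._engine

cache = EngineCache(lambda *cfg: f"engine{cfg}")
a = cache.get(wired=True, wireless=True, region=True)
b = cache.get(wired=True, wireless=True, region=True)   # cache hit, same object
c = cache.get(wired=True, wireless=False, region=True)  # config changed -> rebuilt
print(a is b, a is c)  # True False
```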
@@ -504,6 +525,15 @@ class OCRService:
             use_table = settings.enable_table_recognition
             use_seal = settings.enable_seal_recognition
             use_region = settings.enable_region_detection
+
+            # Apply table detection config overrides if provided
+            if table_detection_config:
+                # If both wired and wireless are disabled, disable table recognition entirely
+                if not table_detection_config.enable_wired_table and not table_detection_config.enable_wireless_table:
+                    use_table = False
+                use_region = table_detection_config.enable_region_detection
+                logger.info(f"Table detection config applied: wired={table_detection_config.enable_wired_table}, "
+                            f"wireless={table_detection_config.enable_wireless_table}, region={use_region}")
             layout_threshold = settings.layout_detection_threshold
             layout_nms = settings.layout_nms_threshold
             layout_merge = settings.layout_merge_mode
@@ -538,6 +568,17 @@ class OCRService:
             formula_model = settings.formula_recognition_model_name
             chart_model = settings.chart_recognition_model_name

+            # Apply table detection config overrides for individual table types
+            if table_detection_config:
+                if not table_detection_config.enable_wired_table:
+                    wired_table_model = None
+                    wired_cell_det_model = None
+                    logger.info("Wired table detection disabled by config")
+                if not table_detection_config.enable_wireless_table:
+                    wireless_table_model = None
+                    wireless_cell_det_model = None
+                    logger.info("Wireless table detection disabled by config")
+
             # Text detection/recognition model configuration
             text_det_model = settings.text_detection_model_name
             text_rec_model = settings.text_recognition_model_name
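Taken together, the two override blocks define a mapping from config flags to what the engine is asked to load. A condensed sketch of that mapping (the PPStructureV3 call itself is elided, and the model names are placeholders, not the settings' real values):

```python
# Condensed view of the overrides above: TableDetectionConfig flags -> engine inputs.
def resolve_table_models(cfg, defaults):
    use_table = defaults["use_table"]
    models = dict(defaults["models"])
    if cfg is not None:
        if not cfg["enable_wired_table"] and not cfg["enable_wireless_table"]:
            use_table = False  # nothing left to recognize
        if not cfg["enable_wired_table"]:
            models["wired_table_model"] = None
            models["wired_cell_det_model"] = None
        if not cfg["enable_wireless_table"]:
            models["wireless_table_model"] = None
            models["wireless_cell_det_model"] = None
    return use_table, models

defaults = {
    "use_table": True,
    "models": {  # placeholder names for illustration only
        "wired_table_model": "wired-structure-model",
        "wired_cell_det_model": "wired-cell-model",
        "wireless_table_model": "wireless-structure-model",
        "wireless_cell_det_model": "wireless-cell-model",
    },
}
cfg = {"enable_wired_table": True, "enable_wireless_table": False, "enable_region_detection": True}
use_table, models = resolve_table_models(cfg, defaults)
print(use_table, models["wireless_table_model"])  # True None
```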
@@ -641,6 +682,15 @@ class OCRService:
             # Track model loading for cache management
             self._model_last_used['structure'] = datetime.now()
             self._current_layout_model = layout_model  # Track current model for recreation check
+            # Track table detection config for recreation check
+            if table_detection_config:
+                self._current_table_detection_config = (
+                    table_detection_config.enable_wired_table,
+                    table_detection_config.enable_wireless_table,
+                    table_detection_config.enable_region_detection
+                )
+            else:
+                self._current_table_detection_config = None

             logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")

@@ -712,6 +762,15 @@ class OCRService:

                 self.structure_engine = PPStructureV3(**cpu_kwargs)
                 self._current_layout_model = layout_model  # Track current model for recreation check
+                # Track table detection config for recreation check
+                if table_detection_config:
+                    self._current_table_detection_config = (
+                        table_detection_config.enable_wired_table,
+                        table_detection_config.enable_wireless_table,
+                        table_detection_config.enable_region_detection
+                    )
+                else:
+                    self._current_table_detection_config = None
                 logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={settings.layout_detection_model_name})")
             else:
                 raise
@@ -956,7 +1015,8 @@ class OCRService:
         current_page: int = 0,
         layout_model: Optional[str] = None,
         preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
     ) -> Dict:
         """
         Process single image with OCR and layout analysis
@@ -971,6 +1031,7 @@ class OCRService:
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
             preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
             preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

         Returns:
             Dictionary with OCR results and metadata
@@ -1041,7 +1102,8 @@ class OCRService:
                     current_page=page_num - 1,  # Convert to 0-based page number for layout data
                     layout_model=layout_model,
                     preprocessing_mode=preprocessing_mode,
-                    preprocessing_config=preprocessing_config
+                    preprocessing_config=preprocessing_config,
+                    table_detection_config=table_detection_config
                 )

                 # Accumulate results
@@ -1189,7 +1251,8 @@ class OCRService:
                 current_page=current_page,
                 layout_model=layout_model,
                 preprocessing_mode=preprocessing_mode,
-                preprocessing_config=preprocessing_config
+                preprocessing_config=preprocessing_config,
+                table_detection_config=table_detection_config
             )

             # Generate Markdown
@@ -1347,7 +1410,8 @@ class OCRService:
         current_page: int = 0,
         layout_model: Optional[str] = None,
         preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
     ) -> Tuple[Optional[Dict], List[Dict]]:
         """
         Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1359,6 +1423,7 @@ class OCRService:
             layout_model: Layout detection model ('chinese', 'default', 'cdla')
             preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
             preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

         Returns:
             Tuple of (layout_data, images_metadata)
@@ -1376,7 +1441,7 @@ class OCRService:
             f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
         )

-        structure_engine = self._ensure_structure_engine(layout_model)
+        structure_engine = self._ensure_structure_engine(layout_model, table_detection_config)

         # Apply image preprocessing for layout detection
         # Preprocessing includes:
@@ -1432,10 +1497,19 @@ class OCRService:
         # Get scaling info for bbox coordinate restoration
         scaling_info = preprocessing_result.scaling_info if preprocessing_result else None

+        # CV table detection is disabled due to poor performance on complex tables
+        # Issues: 1) Detected boundaries smaller than content
+        #         2) Incorrectly splits merged cells
+        # The ML-based RT-DETR-L detection is currently more reliable.
+        # TODO: Improve CV algorithm with better line detection and grid alignment
+        use_cv_table_detection = False
+
         result = enhanced_processor.analyze_with_full_structure(
             image_path, output_dir, current_page,
             preprocessed_image=preprocessed_image,
-            scaling_info=scaling_info
+            scaling_info=scaling_info,
+            save_visualization=True,  # Save layout detection visualization images
+            use_cv_table_detection=use_cv_table_detection
         )

         if result.get('has_parsing_res_list'):
@@ -1673,7 +1747,8 @@ class OCRService:
|
|||||||
force_track: Optional[str] = None,
|
force_track: Optional[str] = None,
|
||||||
layout_model: Optional[str] = None,
|
layout_model: Optional[str] = None,
|
||||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||||
|
table_detection_config: Optional[TableDetectionConfig] = None
|
||||||
) -> Union[UnifiedDocument, Dict]:
|
) -> Union[UnifiedDocument, Dict]:
|
||||||
"""
|
"""
|
||||||
Process document using dual-track approach.
|
Process document using dual-track approach.
|
||||||
@@ -1688,6 +1763,7 @@ class OCRService:
|
|||||||
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||||
|
table_detection_config: Table detection config (wired/wireless/region options)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
UnifiedDocument if dual-track is enabled, Dict otherwise
|
UnifiedDocument if dual-track is enabled, Dict otherwise
|
||||||
@@ -1696,7 +1772,7 @@ class OCRService:
|
|||||||
# Fallback to traditional OCR processing
|
# Fallback to traditional OCR processing
|
||||||
return self.process_file_traditional(
|
return self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||||
preprocessing_mode, preprocessing_config
|
preprocessing_mode, preprocessing_config, table_detection_config
|
||||||
)
|
)
|
||||||
|
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
@@ -1770,7 +1846,8 @@ class OCRService:
|
|||||||
confidence_threshold=confidence_threshold,
|
confidence_threshold=confidence_threshold,
|
||||||
output_dir=output_dir, layout_model=layout_model,
|
output_dir=output_dir, layout_model=layout_model,
|
||||||
preprocessing_mode=preprocessing_mode,
|
preprocessing_mode=preprocessing_mode,
|
||||||
preprocessing_config=preprocessing_config
|
preprocessing_config=preprocessing_config,
|
||||||
|
table_detection_config=table_detection_config
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert OCR result to extract images
|
# Convert OCR result to extract images
|
||||||
@@ -1804,7 +1881,7 @@ class OCRService:
|
|||||||
logger.info("Using OCR track (PaddleOCR)")
|
logger.info("Using OCR track (PaddleOCR)")
|
||||||
ocr_result = self.process_file_traditional(
|
ocr_result = self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||||
preprocessing_mode, preprocessing_config
|
preprocessing_mode, preprocessing_config, table_detection_config
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert OCR result to UnifiedDocument using the converter
|
# Convert OCR result to UnifiedDocument using the converter
|
||||||
@@ -1835,7 +1912,7 @@ class OCRService:
|
|||||||
# Fallback to traditional OCR
|
# Fallback to traditional OCR
|
||||||
return self.process_file_traditional(
|
return self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
|
||||||
preprocessing_mode, preprocessing_config
|
preprocessing_mode, preprocessing_config, table_detection_config
|
||||||
)
|
)
|
||||||
|
|
||||||
def _merge_ocr_images_into_direct(
|
def _merge_ocr_images_into_direct(
|
||||||
@@ -1916,7 +1993,8 @@ class OCRService:
|
|||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
layout_model: Optional[str] = None,
|
layout_model: Optional[str] = None,
|
||||||
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
|
||||||
preprocessing_config: Optional[PreprocessingConfig] = None
|
preprocessing_config: Optional[PreprocessingConfig] = None,
|
||||||
|
table_detection_config: Optional[TableDetectionConfig] = None
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Traditional OCR processing (legacy method).
|
Traditional OCR processing (legacy method).
|
||||||
@@ -1930,6 +2008,7 @@ class OCRService:
|
|||||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
|
||||||
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
preprocessing_config: Manual preprocessing config (used when mode='manual')
|
||||||
|
table_detection_config: Table detection config (wired/wireless/region options)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with OCR results in legacy format
|
Dictionary with OCR results in legacy format
|
||||||
@@ -1943,7 +2022,7 @@ class OCRService:
         for i, image_path in enumerate(image_paths):
             result = self.process_image(
                 image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
             )
             all_results.append(result)

@@ -1960,7 +2039,7 @@ class OCRService:
         # Single image or other file
         return self.process_image(
             file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model,
-            preprocessing_mode, preprocessing_config
+            preprocessing_mode, preprocessing_config, table_detection_config
         )

     def _combine_results(self, results: List[Dict]) -> Dict:
@@ -2047,7 +2126,8 @@ class OCRService:
         force_track: Optional[str] = None,
         layout_model: Optional[str] = None,
         preprocessing_mode: Optional[PreprocessingModeEnum] = None,
-        preprocessing_config: Optional[PreprocessingConfig] = None
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        table_detection_config: Optional[TableDetectionConfig] = None
     ) -> Union[UnifiedDocument, Dict]:
         """
         Main processing method with dual-track support.
@@ -2063,6 +2143,7 @@ class OCRService:
             layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
             preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
             preprocessing_config: Manual preprocessing config (used when mode='manual')
+            table_detection_config: Table detection config (wired/wireless/region options)

         Returns:
             UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -2075,13 +2156,13 @@ class OCRService:
             # Use dual-track processing (or forced track)
             return self.process_with_dual_track(
                 file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
             )
         else:
             # Use traditional OCR processing (no force_track support)
             return self.process_file_traditional(
                 file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model,
-                preprocessing_mode, preprocessing_config
+                preprocessing_mode, preprocessing_config, table_detection_config
             )

     def process_legacy(
@@ -590,8 +590,17 @@ class OCRToUnifiedConverter:
         # Prepare content based on element type
         if element_type == ElementType.TABLE:
             # For tables, use TableData as content
+            # Pass cell_boxes for accurate cell positioning
             table_data = self._extract_table_data(elem_data)
             content = table_data if table_data else elem_data.get('content', '')

+            # Preserve cell_boxes and embedded_images in metadata for PDF generation
+            # These are extracted by PP-StructureV3 and provide accurate cell positioning
+            if 'cell_boxes' in elem_data:
+                elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
+                elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
+            if 'embedded_images' in elem_data:
+                elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
         elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
            # For images, use metadata dict as content
            content = {
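
Note: downstream, the PDF generator reads these keys back from element.metadata. A toy illustration of the shape being preserved (values made up):

    elem_data = {
        'content': '<table>...</table>',
        'cell_boxes': [[12.0, 30.0, 110.0, 58.0], [110.0, 30.0, 240.0, 58.0]],
    }
    meta = elem_data.setdefault('metadata', {})  # reuses an existing metadata dict if present
    meta['cell_boxes'] = elem_data['cell_boxes']
    meta['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
    # meta now travels with the element into PDF generation

The setdefault call is the load-bearing detail: if the element already carries metadata, the cell boxes are merged into it rather than replacing it.
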
@@ -447,7 +447,8 @@ class PDFGeneratorService:
                 'text': text_content,
                 'bbox': bbox_polygon,
                 'confidence': element.confidence or 1.0,
-                'page': page_num
+                'page': page_num,
+                'element_type': element.type.value  # Include element type for styling
             }

             # Include style information if available (for Direct track)
@@ -466,13 +467,24 @@ class PDFGeneratorService:
                 else:
                     html_content = str(element.content)

-                layout_elements.append({
+                table_element = {
                     'type': 'table',
                     'content': html_content,
                     'bbox': [element.bbox.x0, element.bbox.y0,
                              element.bbox.x1, element.bbox.y1],
                     'page': page_num - 1  # layout uses 0-based
-                })
+                }
+
+                # Preserve cell_boxes and embedded_images from metadata
+                # These are extracted by PP-StructureV3 and used for accurate table rendering
+                if element.metadata:
+                    if 'cell_boxes' in element.metadata:
+                        table_element['cell_boxes'] = element.metadata['cell_boxes']
+                        table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
+                    if 'embedded_images' in element.metadata:
+                        table_element['embedded_images'] = element.metadata['embedded_images']
+
+                layout_elements.append(table_element)

                 # Add bbox to images_metadata for text overlap filtering
                 # (no actual image file, just bbox for filtering)
@@ -484,10 +496,10 @@ class PDFGeneratorService:
                     'element_id': element.element_id
                 })

-            # Handle image/visual elements
+            # Handle image/visual elements (including stamps/seals)
             elif element.is_visual or element.type in [
                 ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
-                ElementType.DIAGRAM, ElementType.LOGO
+                ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
             ]:
                 # Get image path using fallback logic
                 image_path = self._get_image_path(element)
@@ -729,13 +741,13 @@ class PDFGeneratorService:
                 regions_to_avoid.append(element)  # Tables are exclusion regions
             elif element.is_visual or element.type in [
                 ElementType.IMAGE, ElementType.FIGURE,
-                ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
+                ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
             ]:
                 image_elements.append(element)
                 # Only add real images to exclusion regions, NOT charts/diagrams
                 # Charts often have large bounding boxes that include text labels
                 # which should be rendered as selectable text on top
-                if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
+                if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
                     regions_to_avoid.append(element)
             elif element.type == ElementType.LIST_ITEM:
                 list_elements.append(element)
@@ -934,11 +946,14 @@ class PDFGeneratorService:
         # Create PDF canvas with initial page size (will be updated per page)
         pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

-        # Filter text regions to avoid overlap with tables/images
-        regions_to_avoid = images_metadata
+        # LAYERED RENDERING: Exclude tables from regions_to_avoid
+        # Text inside tables will be rendered at raw OCR positions (via GapFillingService)
+        # while table borders are drawn separately using cell_boxes
+        # Only avoid overlap with actual images/figures/charts
+        regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
         table_count = len([img for img in images_metadata if img.get('type') == 'table'])

-        logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (含 {table_count} 個表格)")
+        logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")

         filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)

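
Note: the effect of the new filter on a toy images_metadata list:

    images_metadata = [
        {'type': 'table', 'bbox': [0, 0, 100, 50]},
        {'type': 'image', 'bbox': [0, 60, 40, 90]},
    ]
    regions_to_avoid = [m for m in images_metadata if m.get('type') != 'table']
    # Only the image remains an exclusion zone; text that falls inside the
    # table bbox is no longer dropped and is drawn at its raw OCR position.
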
@@ -1042,7 +1057,8 @@ class PDFGeneratorService:
             for table_elem in page_table_regions:
                 self.draw_table_region(
                     pdf_canvas, table_elem, images_metadata,
-                    current_target_h, current_scale_w, current_scale_h
+                    current_target_h, current_scale_w, current_scale_h,
+                    result_dir=json_parent_dir
                 )

             # 3. Draw text (top layer)
@@ -1542,8 +1558,8 @@ class PDFGeneratorService:
             logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")

             # Set font with track-specific styling
-            # Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
             style_info = region.get('style')
+            element_type = region.get('element_type', 'text')
             is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or
                                self.current_processing_track == ProcessingTrack.HYBRID)

@@ -1555,9 +1571,25 @@ class PDFGeneratorService:
                 font_size = pdf_canvas._fontsize
                 logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
             else:
-                # OCR track or no style: Use simple font selection
+                # OCR track or no style: Use simple font selection with element-type based styling
                 font_name = self.font_name if self.font_registered else 'Helvetica'
-                pdf_canvas.setFont(font_name, font_size)
+
+                # Apply element-type specific styling (for OCR track)
+                if element_type == 'title':
+                    # Titles: use larger, bold font
+                    font_size = min(font_size * 1.3, 36)  # 30% larger, max 36pt
+                    pdf_canvas.setFont(font_name, font_size)
+                    logger.debug(f"Applied title style: size={font_size:.1f}")
+                elif element_type == 'header':
+                    # Headers: slightly larger
+                    font_size = min(font_size * 1.15, 24)  # 15% larger, max 24pt
+                    pdf_canvas.setFont(font_name, font_size)
+                elif element_type == 'caption':
+                    # Captions: slightly smaller, italic if available
+                    font_size = max(font_size * 0.9, 6)  # 10% smaller, min 6pt
+                    pdf_canvas.setFont(font_name, font_size)
+                else:
+                    pdf_canvas.setFont(font_name, font_size)

             # Handle line breaks (split text by newlines)
             # OCR track: simple left-aligned rendering
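
Note: the scaling rules are easier to see as a pure function (an equivalent extraction of the branch above, not a refactor present in this diff):

    def adjust_font_size(element_type: str, font_size: float) -> float:
        if element_type == 'title':
            return min(font_size * 1.3, 36)   # 30% larger, capped at 36pt
        if element_type == 'header':
            return min(font_size * 1.15, 24)  # 15% larger, capped at 24pt
        if element_type == 'caption':
            return max(font_size * 0.9, 6)    # 10% smaller, floored at 6pt
        return font_size

    assert round(adjust_font_size('title', 12.0), 2) == 15.6
    assert adjust_font_size('title', 40.0) == 36  # cap applies
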
@@ -1726,7 +1758,8 @@ class PDFGeneratorService:
         images_metadata: List[Dict],
         page_height: float,
         scale_w: float = 1.0,
-        scale_h: float = 1.0
+        scale_h: float = 1.0,
+        result_dir: Optional[Path] = None
     ):
         """
         Draw a table region by parsing HTML and rebuilding with ReportLab Table
@@ -1738,13 +1771,27 @@ class PDFGeneratorService:
             page_height: Height of page
             scale_w: Scale factor for X coordinates (PDF width / OCR width)
             scale_h: Scale factor for Y coordinates (PDF height / OCR height)
+            result_dir: Directory containing result files (for embedded images)
         """
         try:
             html_content = table_element.get('content', '')
             if not html_content:
                 return

-            # Parse HTML to extract table structure
+            # Try to use cell_boxes for direct rendering first (more accurate)
+            cell_boxes = table_element.get('cell_boxes', [])
+            if cell_boxes:
+                logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
+                success = self._draw_table_with_cell_boxes(
+                    pdf_canvas, table_element, page_height,
+                    scale_w, scale_h, result_dir
+                )
+                if success:
+                    return  # Successfully rendered with cell_boxes
+
+                logger.info("[TABLE] Falling back to ReportLab Table")
+
+            # Fallback: Parse HTML to extract table structure and use ReportLab Table
             parser = HTMLTableParser()
             parser.feed(html_content)

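
Note: draw_table_region now has an explicit fallback contract: _draw_table_with_cell_boxes returns True when it drew the borders (text is then supplied by the separate OCR text layer) and False on failure, in which case the method falls through to the original HTML path. In sketch form:

    drew = self._draw_table_with_cell_boxes(pdf_canvas, table_element,
                                            page_height, scale_w, scale_h, result_dir)
    if not drew:
        # e.g. malformed cell_boxes -> rebuild the table from its HTML instead
        parser = HTMLTableParser()
        parser.feed(table_element.get('content', ''))
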
@@ -1901,14 +1948,18 @@ class PDFGeneratorService:
                 logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
             else:
                 col_widths = [table_width / max_cols] * max_cols
-                logger.info(f"[TABLE] Using equal distribution col widths")
+                logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each")

-            # Row heights are used optionally (ReportLab can auto-size)
-            row_heights = None
+            # Row heights - ALWAYS use to ensure table fits bbox properly
+            # Use computed heights from cell_boxes, or uniform distribution as fallback
             if computed_row_heights:
                 # Scale row_heights to PDF coordinates
                 row_heights = [h * scale_h for h in computed_row_heights]
-                logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)")
+                logger.info(f"[TABLE] Using cell_boxes row heights (scaled)")
+            else:
+                # Uniform distribution based on table bbox - ensures table fills its allocated space
+                row_heights = [table_height / num_rows] * num_rows
+                logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each")

             # Create ReportLab Table
             # Use smaller font to fit content with auto-wrap
@@ -1932,12 +1983,10 @@ class PDFGeneratorService:
                         escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                         reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)

-            # Create table with computed col widths
-            # Note: We don't use row_heights even when available from cell_boxes because:
-            # 1. ReportLab's auto-sizing handles content overflow better
-            # 2. Fixed heights can cause text clipping when content exceeds cell size
-            # 3. The col_widths from cell_boxes provide the main layout benefit
-            table = Table(reportlab_data, colWidths=col_widths)
+            # Create table with col widths and row heights
+            # Always use row_heights to ensure table fits bbox properly
+            table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights)
+            logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")

             # Apply table style
             style = TableStyle([
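
Note: ReportLab's Paragraph parses its text as mini-XML, so raw markup characters in OCR'd cell text would raise or render wrongly; hence the entity escaping above. The order matters: '&' must be replaced first, or the generated entities would themselves be re-escaped:

    cell_text = 'P&L < 5% > target'
    escaped = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    # -> 'P&amp;L &lt; 5% &gt; target', safe for Paragraph(escaped, cell_style)
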
@@ -1974,26 +2023,303 @@ class PDFGeneratorService:
             scale_y = table_height / actual_height if actual_height > table_height else 1.0
             scale_factor = min(scale_x, scale_y)  # Use smaller scale to fit both dimensions

+            # Calculate the table top position in PDF coordinates
+            # ReportLab uses bottom-left origin, so we need to position from TOP
+            pdf_y_top = page_height - ocr_y_top  # Top of table in PDF coords
+
+            # Calculate the actual bottom position based on scaled height
+            # Table should be positioned so its TOP aligns with the bbox top
+            scaled_height = actual_height * scale_factor
+            pdf_y_bottom = pdf_y_top - scaled_height  # Bottom of scaled table
+
+            logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}")
+
             if scale_factor < 1.0:
                 logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
                 # Apply scaling transformation
                 pdf_canvas.saveState()
-                pdf_canvas.translate(pdf_x, pdf_y)
+                pdf_canvas.translate(pdf_x, pdf_y_bottom)
                 pdf_canvas.scale(scale_factor, scale_factor)
                 # Draw at origin since we've already translated
                 table.drawOn(pdf_canvas, 0, 0)
                 pdf_canvas.restoreState()
             else:
                 # Draw table at position without scaling
-                table.drawOn(pdf_canvas, pdf_x, pdf_y)
+                # pdf_y should be the bottom of the table
+                table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom)

-            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
+            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows")
+
+            # Draw embedded images (images detected inside the table region)
+            embedded_images = table_element.get('embedded_images', [])
+            if embedded_images and result_dir:
+                logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
+                for emb_img in embedded_images:
+                    self._draw_embedded_image(
+                        pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                    )

         except Exception as e:
             logger.warning(f"Failed to draw table region: {e}")
             import traceback
             traceback.print_exc()

+    def _draw_embedded_image(
+        self,
+        pdf_canvas: canvas.Canvas,
+        emb_img: Dict,
+        page_height: float,
+        result_dir: Path,
+        scale_w: float = 1.0,
+        scale_h: float = 1.0
+    ):
+        """Draw an embedded image inside a table region."""
+        try:
+            # Get image path
+            saved_path = emb_img.get('saved_path', '')
+            if not saved_path:
+                return
+
+            # Construct full path
+            image_path = result_dir / saved_path
+            if not image_path.exists():
+                image_path = result_dir / Path(saved_path).name
+
+            if not image_path.exists():
+                logger.warning(f"Embedded image not found: {saved_path}")
+                return
+
+            # Get bbox from embedded image data
+            bbox = emb_img.get('bbox', [])
+            if not bbox or len(bbox) < 4:
+                logger.warning(f"No bbox for embedded image: {saved_path}")
+                return
+
+            # Calculate position (bbox is [x0, y0, x1, y1])
+            x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
+
+            # Apply scaling
+            x0_scaled = x0 * scale_w
+            y0_scaled = y0 * scale_h
+            x1_scaled = x1 * scale_w
+            y1_scaled = y1 * scale_h
+
+            width = x1_scaled - x0_scaled
+            height = y1_scaled - y0_scaled
+
+            # Transform Y coordinate (ReportLab uses bottom-left origin)
+            pdf_x = x0_scaled
+            pdf_y = page_height - y1_scaled
+
+            # Draw the image
+            from reportlab.lib.utils import ImageReader
+            img_reader = ImageReader(str(image_path))
+            pdf_canvas.drawImage(
+                img_reader, pdf_x, pdf_y, width, height,
+                preserveAspectRatio=True, mask='auto'
+            )
+
+            logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}")
+
+        except Exception as e:
+            logger.warning(f"Failed to draw embedded image: {e}")
+
+    def _normalize_cell_boxes_to_grid(
+        self,
+        cell_boxes: List[List[float]],
+        threshold: float = 10.0
+    ) -> List[List[float]]:
+        """
+        Normalize cell boxes to create a proper aligned grid.
+
+        Groups nearby coordinates and snaps them to a common value,
+        eliminating the 2-11 pixel variations that cause skewed tables.
+
+        Args:
+            cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]
+            threshold: Maximum distance to consider coordinates as "same line"
+
+        Returns:
+            Normalized cell_boxes with aligned coordinates
+        """
+        if not cell_boxes or len(cell_boxes) < 2:
+            return cell_boxes
+
+        # Collect all X and Y coordinates
+        x_coords = []  # (value, box_idx, is_x1)
+        y_coords = []  # (value, box_idx, is_y1)
+
+        for i, box in enumerate(cell_boxes):
+            x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+            x_coords.append((x1, i, True))   # x1 (left)
+            x_coords.append((x2, i, False))  # x2 (right)
+            y_coords.append((y1, i, True))   # y1 (top)
+            y_coords.append((y2, i, False))  # y2 (bottom)
+
+        def cluster_and_normalize(coords, threshold):
+            """Cluster nearby coordinates and return mapping to normalized values."""
+            if not coords:
+                return {}
+
+            # Sort by value
+            sorted_coords = sorted(coords, key=lambda x: x[0])
+
+            # Cluster nearby values
+            clusters = []
+            current_cluster = [sorted_coords[0]]
+
+            for coord in sorted_coords[1:]:
+                if coord[0] - current_cluster[-1][0] <= threshold:
+                    current_cluster.append(coord)
+                else:
+                    clusters.append(current_cluster)
+                    current_cluster = [coord]
+            clusters.append(current_cluster)
+
+            # Create mapping: (box_idx, is_first) -> normalized value
+            mapping = {}
+            for cluster in clusters:
+                # Use average of cluster as normalized value
+                avg_value = sum(c[0] for c in cluster) / len(cluster)
+                for _, box_idx, is_first in cluster:
+                    mapping[(box_idx, is_first)] = avg_value
+
+            return mapping
+
+        x_mapping = cluster_and_normalize(x_coords, threshold)
+        y_mapping = cluster_and_normalize(y_coords, threshold)
+
+        # Create normalized cell boxes
+        normalized_boxes = []
+        for i, box in enumerate(cell_boxes):
+            x1_norm = x_mapping.get((i, True), box[0])
+            x2_norm = x_mapping.get((i, False), box[2])
+            y1_norm = y_mapping.get((i, True), box[1])
+            y2_norm = y_mapping.get((i, False), box[3])
+            normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm])
+
+        logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
+        return normalized_boxes
+
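
Note: a standalone illustration of the snapping (sample boxes are made up; svc stands for a PDFGeneratorService instance):

    boxes = [
        [10, 10, 100, 40], [103, 10, 200, 41],   # top row, edges jitter by a few px
        [11, 42, 101, 80], [102, 43, 199, 79],   # bottom row
    ]
    aligned = svc._normalize_cell_boxes_to_grid(boxes, threshold=10.0)
    # All x-edges near 100 (100, 101, 102, 103) fall into one cluster and snap to
    # its mean (101.5). Because left and right edges are clustered together, the
    # right edge of one cell and the left edge of its neighbour land on the same
    # grid line, so the drawn borders form a straight seam instead of a stair.
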
+    def _draw_table_with_cell_boxes(
+        self,
+        pdf_canvas: canvas.Canvas,
+        table_element: Dict,
+        page_height: float,
+        scale_w: float = 1.0,
+        scale_h: float = 1.0,
+        result_dir: Optional[Path] = None
+    ):
+        """
+        Draw table borders using cell_boxes for accurate positioning.
+
+        LAYERED RENDERING APPROACH:
+        - This method ONLY draws cell borders and embedded images
+        - Text is rendered separately using raw OCR positions (via GapFillingService)
+        - This decouples visual structure (borders) from content (text)
+
+        FALLBACK: If cell_boxes are incomplete, always draws the outer table
+        border using the table's bbox to ensure table boundaries are visible.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            table_element: Table element dict with cell_boxes
+            page_height: Height of page in PDF coordinates
+            scale_w: Scale factor for X coordinates
+            scale_h: Scale factor for Y coordinates
+            result_dir: Directory containing result files (for embedded images)
+        """
+        try:
+            cell_boxes = table_element.get('cell_boxes', [])
+
+            # Always draw outer table border first (fallback for incomplete cell_boxes)
+            table_bbox = table_element.get('bbox', [])
+            if table_bbox and len(table_bbox) >= 4:
+                # Handle different bbox formats (list or dict)
+                if isinstance(table_bbox, dict):
+                    tx1 = float(table_bbox.get('x0', 0))
+                    ty1 = float(table_bbox.get('y0', 0))
+                    tx2 = float(table_bbox.get('x1', 0))
+                    ty2 = float(table_bbox.get('y1', 0))
+                else:
+                    tx1, ty1, tx2, ty2 = table_bbox[:4]
+
+                # Apply scaling
+                tx1_scaled = tx1 * scale_w
+                ty1_scaled = ty1 * scale_h
+                tx2_scaled = tx2 * scale_w
+                ty2_scaled = ty2 * scale_h
+
+                table_width = tx2_scaled - tx1_scaled
+                table_height = ty2_scaled - ty1_scaled
+
+                # Transform Y coordinate (PDF uses bottom-left origin)
+                pdf_x = tx1_scaled
+                pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords
+
+                # Draw outer table border (slightly thicker for visibility)
+                pdf_canvas.setStrokeColor(colors.black)
+                pdf_canvas.setLineWidth(1.0)
+                pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
+                logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
+
+            if not cell_boxes:
+                logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
+                # Still draw embedded images even without cell borders
+                embedded_images = table_element.get('embedded_images', [])
+                if embedded_images and result_dir:
+                    for emb_img in embedded_images:
+                        self._draw_embedded_image(
+                            pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                        )
+                return True  # Outer border drawn successfully
+
+            # Normalize cell boxes to create aligned grid
+            cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
+
+            logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
+
+            # Draw each cell border
+            for box in cell_boxes:
+                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+
+                # Apply scaling
+                x1_scaled = x1 * scale_w
+                y1_scaled = y1 * scale_h
+                x2_scaled = x2 * scale_w
+                y2_scaled = y2 * scale_h
+
+                cell_width = x2_scaled - x1_scaled
+                cell_height = y2_scaled - y1_scaled
+
+                # Transform Y coordinate (PDF uses bottom-left origin)
+                pdf_x = x1_scaled
+                pdf_y = page_height - y2_scaled  # Bottom of cell in PDF coords
+
+                # Draw cell border only (no fill, no text)
+                pdf_canvas.setStrokeColor(colors.black)
+                pdf_canvas.setLineWidth(0.5)
+                pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
+
+            logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
+
+            # Draw embedded images
+            embedded_images = table_element.get('embedded_images', [])
+            if embedded_images and result_dir:
+                logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
+                for emb_img in embedded_images:
+                    self._draw_embedded_image(
+                        pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                    )
+
+            return True
+
+        except Exception as e:
+            logger.warning(f"[TABLE] Failed to draw cell borders: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
     def draw_image_region(
         self,
         pdf_canvas: canvas.Canvas,
@@ -2923,12 +3249,29 @@ class PDFGeneratorService:
         from reportlab.platypus import Table, TableStyle
         from reportlab.lib import colors

+        # Determine number of rows and columns for cell_boxes calculation
+        num_rows = len(rows)
+        max_cols = max(len(row['cells']) for row in rows) if rows else 0
+
         # Use original column widths from extraction if available
-        # Otherwise let ReportLab auto-calculate
+        # Otherwise try to compute from cell_boxes (from PP-StructureV3)
         col_widths = None
         if element.metadata and 'column_widths' in element.metadata:
             col_widths = element.metadata['column_widths']
             logger.debug(f"Using extracted column widths: {col_widths}")
+        elif element.metadata and 'cell_boxes' in element.metadata:
+            # Use cell_boxes from PP-StructureV3 for accurate column/row sizing
+            cell_boxes = element.metadata['cell_boxes']
+            cell_boxes_source = element.metadata.get('cell_boxes_source', 'unknown')
+            table_bbox_list = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
+            logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
+
+            computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
+                cell_boxes, table_bbox_list, num_rows, max_cols
+            )
+            if computed_col_widths:
+                col_widths = computed_col_widths
+                logger.info(f"[TABLE] Computed {len(col_widths)} column widths from cell_boxes")

         # NOTE: Don't use rowHeights from extraction - it causes content overlap
         # The extracted row heights are based on cell boundaries, not text content height.
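
Note: _compute_table_grid_from_cell_boxes itself is not shown in this diff; the real helper also receives the table bbox and the expected row/column counts, presumably to validate or pad its output. A plausible reduction of what it has to do (an assumption for orientation, not the actual implementation) is to recover the grid lines from the box edges and difference them:

    def compute_grid(cell_boxes):
        xs = sorted({round(b[0]) for b in cell_boxes} | {round(b[2]) for b in cell_boxes})
        ys = sorted({round(b[1]) for b in cell_boxes} | {round(b[3]) for b in cell_boxes})
        col_widths = [x2 - x1 for x1, x2 in zip(xs, xs[1:])]
        row_heights = [y2 - y1 for y1, y2 in zip(ys, ys[1:])]
        return col_widths, row_heights
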
@@ -26,9 +26,11 @@ import paddle
 from paddleocr import PPStructureV3
 from PIL import Image
 import numpy as np
+import cv2
 from app.models.unified_document import ElementType
 from app.core.config import settings
 from app.services.memory_manager import prediction_context
+from app.services.cv_table_detector import CVTableDetector

 logger = logging.getLogger(__name__)

@@ -62,6 +64,7 @@ class PPStructureEnhanced:
         'watermark': ElementType.WATERMARK,
         'signature': ElementType.SIGNATURE,
         'stamp': ElementType.STAMP,
+        'seal': ElementType.STAMP,  # PP-StructureV3 may use 'seal' label
         'logo': ElementType.LOGO,
         'barcode': ElementType.BARCODE,
         'qr-code': ElementType.QR_CODE,
@@ -80,183 +83,15 @@ class PPStructureEnhanced:
         """
         self.structure_engine = structure_engine

-        # Lazy-loaded SLANeXt models for cell boxes extraction
-        # These are loaded on-demand when enable_table_cell_boxes_extraction is True
-        self._slanet_wired_model = None
-        self._slanet_wireless_model = None
-        self._table_cls_model = None
-
-    def _get_slanet_model(self, is_wired: bool = True):
-        """
-        Get or create SLANeXt model for cell boxes extraction (lazy loading).
-
-        Args:
-            is_wired: True for wired (bordered) tables, False for wireless
-
-        Returns:
-            SLANeXt model instance or None if loading fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            from paddlex import create_model
-
-            if is_wired:
-                if self._slanet_wired_model is None:
-                    model_name = settings.wired_table_model_name or "SLANeXt_wired"
-                    logger.info(f"Loading SLANeXt wired model: {model_name}")
-                    self._slanet_wired_model = create_model(model_name)
-                return self._slanet_wired_model
-            else:
-                if self._slanet_wireless_model is None:
-                    model_name = settings.wireless_table_model_name or "SLANeXt_wireless"
-                    logger.info(f"Loading SLANeXt wireless model: {model_name}")
-                    self._slanet_wireless_model = create_model(model_name)
-                return self._slanet_wireless_model
-        except Exception as e:
-            logger.error(f"Failed to load SLANeXt model: {e}")
-            return None
-
-    def _get_table_classifier(self):
-        """
-        Get or create table classification model (lazy loading).
-
-        Returns:
-            Table classifier model instance or None if loading fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            from paddlex import create_model
-
-            if self._table_cls_model is None:
-                model_name = settings.table_classification_model_name or "PP-LCNet_x1_0_table_cls"
-                logger.info(f"Loading table classification model: {model_name}")
-                self._table_cls_model = create_model(model_name)
-            return self._table_cls_model
-        except Exception as e:
-            logger.error(f"Failed to load table classifier: {e}")
-            return None
-
-    def _extract_cell_boxes_with_slanet(
-        self,
-        table_image: np.ndarray,
-        table_bbox: List[float],
-        is_wired: Optional[bool] = None
-    ) -> Optional[List[List[float]]]:
-        """
-        Extract cell bounding boxes using direct SLANeXt model call.
-
-        This supplements PPStructureV3 which doesn't expose cell boxes in its output.
-
-        Args:
-            table_image: Cropped table image as numpy array (BGR format)
-            table_bbox: Table bounding box in page coordinates [x1, y1, x2, y2]
-            is_wired: If None, auto-detect using classifier. True for bordered tables.
-
-        Returns:
-            List of cell bounding boxes in page coordinates [[x1,y1,x2,y2], ...],
-            or None if extraction fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            # Auto-detect table type if not specified
-            if is_wired is None:
-                classifier = self._get_table_classifier()
-                if classifier:
-                    try:
-                        cls_result = classifier.predict(table_image)
-                        # PP-LCNet returns classification result
-                        for res in cls_result:
-                            label_names = res.get('label_names', [])
-                            if label_names:
-                                is_wired = 'wired' in str(label_names[0]).lower()
-                                logger.debug(f"Table classified as: {'wired' if is_wired else 'wireless'}")
-                                break
-                    except Exception as e:
-                        logger.warning(f"Table classification failed, defaulting to wired: {e}")
-                        is_wired = True
-                else:
-                    is_wired = True  # Default to wired if classifier unavailable
-
-            # Get appropriate SLANeXt model
-            model = self._get_slanet_model(is_wired=is_wired)
-            if model is None:
-                return None
-
-            # Run SLANeXt prediction
-            results = model.predict(table_image)
-
-            # Extract cell boxes from result
-            cell_boxes = []
-            table_x, table_y = table_bbox[0], table_bbox[1]
-
-            for result in results:
-                # SLANeXt returns 'bbox' with 8-point polygon format
-                # [[x1,y1,x2,y2,x3,y3,x4,y4], ...]
-                boxes = result.get('bbox', [])
-                for box in boxes:
-                    if isinstance(box, (list, tuple)):
-                        if len(box) >= 8:
-                            # 8-point polygon: convert to 4-point rectangle
-                            xs = [box[i] for i in range(0, 8, 2)]
-                            ys = [box[i] for i in range(1, 8, 2)]
-                            x1, y1 = min(xs), min(ys)
-                            x2, y2 = max(xs), max(ys)
-                        elif len(box) >= 4:
-                            # Already 4-point rectangle
-                            x1, y1, x2, y2 = box[:4]
-                        else:
-                            continue
-
-                        # Convert to absolute page coordinates
-                        abs_box = [
-                            float(x1 + table_x),
-                            float(y1 + table_y),
-                            float(x2 + table_x),
-                            float(y2 + table_y)
-                        ]
-                        cell_boxes.append(abs_box)
-
-            logger.info(f"SLANeXt extracted {len(cell_boxes)} cell boxes (is_wired={is_wired})")
-            return cell_boxes if cell_boxes else None
-
-        except Exception as e:
-            logger.error(f"Cell boxes extraction with SLANeXt failed: {e}")
-            return None
-
-    def release_slanet_models(self):
-        """Release SLANeXt models to free GPU memory."""
-        if self._slanet_wired_model is not None:
-            del self._slanet_wired_model
-            self._slanet_wired_model = None
-            logger.info("Released SLANeXt wired model")
-
-        if self._slanet_wireless_model is not None:
-            del self._slanet_wireless_model
-            self._slanet_wireless_model = None
-            logger.info("Released SLANeXt wireless model")
-
-        if self._table_cls_model is not None:
-            del self._table_cls_model
-            self._table_cls_model = None
-            logger.info("Released table classifier model")
-
-        gc.collect()
-        if TORCH_AVAILABLE:
-            torch.cuda.empty_cache()
-
     def analyze_with_full_structure(
         self,
         image_path: Path,
         output_dir: Optional[Path] = None,
         current_page: int = 0,
         preprocessed_image: Optional[Image.Image] = None,
-        scaling_info: Optional['ScalingInfo'] = None
+        scaling_info: Optional['ScalingInfo'] = None,
+        save_visualization: bool = False,
+        use_cv_table_detection: bool = False
     ) -> Dict[str, Any]:
         """
         Analyze document with full PP-StructureV3 capabilities.
@@ -271,6 +106,10 @@ class PPStructureEnhanced:
             scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
                          for layout detection, all bbox coordinates will be scaled back
                          to original image coordinates for proper cropping.
+            save_visualization: If True, save detection visualization images
+                               (layout_det_res, layout_order_res, overall_ocr_res, etc.)
+            use_cv_table_detection: If True, use CV-based line detection for wired tables
+                                   instead of ML-based cell detection (RT-DETR-L)

         Returns:
             Dictionary with complete structure information including:
@@ -278,6 +117,7 @@ class PPStructureEnhanced:
             - reading_order: Reading order indices
             - images: Extracted images with metadata
             - tables: Extracted tables with structure
+            - visualization_dir: Path to visualization images (if save_visualization=True)
         """
         try:
             logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
@@ -313,9 +153,21 @@ class PPStructureEnhanced:
             all_elements = []
             all_images = []
             all_tables = []
+            visualization_dir = None

             # Process each page result
             for page_idx, page_result in enumerate(results):
+                # Save visualization images if requested
+                if save_visualization and output_dir and hasattr(page_result, 'save_to_img'):
+                    try:
+                        vis_dir = output_dir / 'visualization'
+                        vis_dir.mkdir(parents=True, exist_ok=True)
+                        page_result.save_to_img(str(vis_dir))
+                        visualization_dir = vis_dir
+                        logger.info(f"Saved visualization images to {vis_dir}")
+                    except Exception as e:
+                        logger.warning(f"Failed to save visualization images: {e}")
+
                 # Try to access parsing_res_list and table_res_list (the complete structure)
                 parsing_res_list = None
                 table_res_list = None
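
Note: usage sketch for the new flags (paths are hypothetical; structure_engine is the PPStructureV3 instance passed to the constructor):

    enhanced = PPStructureEnhanced(structure_engine)
    result = enhanced.analyze_with_full_structure(
        image_path=Path('page_0.png'),
        output_dir=Path('results/task_123'),
        save_visualization=True,          # writes results/task_123/visualization/
        use_cv_table_detection=False,     # opt-in CV line detection for wired tables
    )
    vis_dir = result.get('visualization_dir')  # str path, present only when saved
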
@@ -369,6 +221,7 @@ class PPStructureEnhanced:
                         logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")

                 # Extract table_res_list which contains cell_box_list
+                layout_det_res = None
                 if result_dict:
                     if 'table_res_list' in result_dict:
                         table_res_list = result_dict['table_res_list']
@@ -377,20 +230,40 @@ class PPStructureEnhanced:
                             if 'cell_box_list' in tbl:
                                 logger.info(f"  Table {i}: {len(tbl['cell_box_list'])} cell boxes")

+                    # Extract layout_det_res for Image-in-Table processing
+                    if 'layout_det_res' in result_dict:
+                        layout_det_res = result_dict['layout_det_res']
+                        logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
+
                 # Process parsing_res_list if found
                 if parsing_res_list:
                     elements = self._process_parsing_res_list(
                         parsing_res_list, current_page, output_dir, image_path, scaling_info,
-                        table_res_list=table_res_list  # Pass table_res_list for cell_box_list
+                        table_res_list=table_res_list,  # Pass table_res_list for cell_box_list
+                        layout_det_res=layout_det_res,  # Pass layout_det_res for Image-in-Table
+                        use_cv_table_detection=use_cv_table_detection  # Use CV for wired tables
                     )
                     all_elements.extend(elements)

                     # Extract tables and images from elements
+                    table_bboxes = []  # Collect table bboxes for standalone image filtering
                     for elem in elements:
                         if elem['type'] == ElementType.TABLE:
                             all_tables.append(elem)
+                            table_bboxes.append(elem.get('bbox', [0, 0, 0, 0]))
                         elif elem['type'] in [ElementType.IMAGE, ElementType.FIGURE]:
                             all_images.append(elem)
+
+                    # Extract standalone images from layout_det_res (images NOT inside tables)
+                    if layout_det_res and image_path and output_dir:
+                        standalone_images = self._extract_standalone_images(
+                            layout_det_res, table_bboxes, image_path, output_dir,
+                            current_page, len(elements), scaling_info
+                        )
+                        if standalone_images:
+                            all_elements.extend(standalone_images)
+                            all_images.extend(standalone_images)
+                            logger.info(f"Extracted {len(standalone_images)} standalone images from layout_det_res")
                 else:
                     # Fallback to markdown if parsing_res_list not available
                     logger.warning("parsing_res_list not found, falling back to markdown")
@@ -402,7 +275,7 @@ class PPStructureEnhanced:
             # Create reading order based on element positions
             reading_order = self._determine_reading_order(all_elements)

-            return {
+            result = {
                 'elements': all_elements,
                 'total_elements': len(all_elements),
                 'reading_order': reading_order,
@@ -412,6 +285,12 @@ class PPStructureEnhanced:
                 'has_parsing_res_list': parsing_res_list is not None
             }

+            # Add visualization directory if available
+            if visualization_dir:
+                result['visualization_dir'] = str(visualization_dir)
+
+            return result
+
         except Exception as e:
             logger.error(f"Enhanced PP-StructureV3 analysis error: {e}")
             import traceback
@@ -446,7 +325,9 @@ class PPStructureEnhanced:
         output_dir: Optional[Path],
         source_image_path: Optional[Path] = None,
         scaling_info: Optional['ScalingInfo'] = None,
-        table_res_list: Optional[List[Dict]] = None
+        table_res_list: Optional[List[Dict]] = None,
+        layout_det_res: Optional[Dict] = None,
+        use_cv_table_detection: bool = False
     ) -> List[Dict[str, Any]]:
         """
         Process parsing_res_list to extract all elements.
@@ -458,6 +339,8 @@ class PPStructureEnhanced:
            output_dir: Optional output directory
            source_image_path: Path to source image for cropping image regions
            table_res_list: Optional list of table results containing cell_box_list
+           layout_det_res: Optional layout detection result for Image-in-Table processing
+           use_cv_table_detection: If True, use CV line detection for wired tables

        Returns:
            List of processed elements with normalized structure
@@ -628,53 +511,55 @@ class PPStructureEnhanced:
|
|||||||
logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
|
logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
|
||||||
cell_boxes_extracted = True
|
cell_boxes_extracted = True
|
||||||
|
|
||||||
# Supplement with direct SLANeXt call if PPStructureV3 didn't provide boxes
|
|
||||||
if not cell_boxes_extracted and source_image_path and bbox != [0, 0, 0, 0]:
|
|
||||||
logger.info(f"[TABLE] No boxes from PPStructureV3, attempting SLANeXt extraction...")
|
|
||||||
try:
|
|
||||||
# Load source image and crop table region
|
|
||||||
source_img = Image.open(source_image_path)
|
|
||||||
source_array = np.array(source_img)
|
|
||||||
|
|
||||||
# Crop table region (bbox is in original image coordinates)
|
|
||||||
x1, y1, x2, y2 = [int(round(c)) for c in bbox]
|
|
||||||
# Ensure coordinates are within image bounds
|
|
||||||
h, w = source_array.shape[:2]
|
|
||||||
x1, y1 = max(0, x1), max(0, y1)
|
|
||||||
x2, y2 = min(w, x2), min(h, y2)
|
|
||||||
|
|
||||||
if x2 > x1 and y2 > y1:
|
|
||||||
table_crop = source_array[y1:y2, x1:x2]
|
|
||||||
|
|
||||||
# Convert RGB to BGR for SLANeXt
|
|
||||||
if len(table_crop.shape) == 3 and table_crop.shape[2] == 3:
|
|
||||||
table_crop_bgr = table_crop[:, :, ::-1]
|
|
||||||
else:
|
|
||||||
table_crop_bgr = table_crop
|
|
||||||
|
|
||||||
# Extract cell boxes using SLANeXt
|
|
||||||
slanet_boxes = self._extract_cell_boxes_with_slanet(
|
|
||||||
table_crop_bgr,
|
|
||||||
bbox, # Pass original bbox for coordinate offset
|
|
||||||
is_wired=None # Auto-detect
|
|
||||||
)
|
|
||||||
|
|
||||||
if slanet_boxes:
|
|
||||||
element['cell_boxes'] = slanet_boxes
|
|
||||||
element['cell_boxes_source'] = 'slanet'
|
|
||||||
cell_boxes_extracted = True
|
|
||||||
logger.info(f"[TABLE] SLANeXt extracted {len(slanet_boxes)} cell boxes")
|
|
||||||
else:
|
|
||||||
logger.warning(f"[TABLE] Invalid crop region: ({x1},{y1})-({x2},{y2})")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"[TABLE] SLANeXt extraction failed: {e}")
|
|
||||||
|
|
||||||
if not cell_boxes_extracted:
|
if not cell_boxes_extracted:
|
||||||
logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")
|
logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")
|
||||||
|
|
||||||
-            # Special handling for images/figures
-            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
+            # 2.5 CV-based table line detection for wired tables
+            if use_cv_table_detection and source_image_path and source_image_path.exists():
+                try:
+                    # Load image for CV processing
+                    cv_image = cv2.imread(str(source_image_path))
+                    if cv_image is not None:
+                        cv_detector = CVTableDetector()
+                        ml_cell_boxes = element.get('cell_boxes', [])
+
+                        # Detect cells using CV line detection
+                        cv_cells = cv_detector.detect_and_merge_with_ml(
+                            cv_image,
+                            bbox,  # Table bbox
+                            ml_cell_boxes
+                        )
+
+                        if cv_cells:
+                            # Apply scaling if needed
+                            if scaling_info and scaling_info.was_scaled:
+                                cv_cells = [
+                                    [
+                                        c[0] * scaling_info.scale_x,
+                                        c[1] * scaling_info.scale_y,
+                                        c[2] * scaling_info.scale_x,
+                                        c[3] * scaling_info.scale_y
+                                    ]
+                                    for c in cv_cells
+                                ]
+
+                            element['cell_boxes'] = cv_cells
+                            element['cell_boxes_source'] = 'cv_line_detection'
+                            logger.info(f"[TABLE] CV line detection found {len(cv_cells)} cells (ML had {len(ml_cell_boxes)})")
+                except Exception as cv_error:
+                    logger.warning(f"[TABLE] CV line detection failed: {cv_error}")
+
+            # 3. Image-in-table handling: detect and embed images that fall inside the table
+            if layout_det_res and source_image_path and output_dir:
+                embedded_images = self._embed_images_in_table(
+                    element, bbox, layout_det_res, source_image_path, output_dir
+                )
+                if embedded_images:
+                    element['embedded_images'] = embedded_images
+                    logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
+
+            # Special handling for images/figures/stamps (visual elements that need cropping)
+            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
                 # Save image if path provided
                 if 'img_path' in item and output_dir:
                     saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])
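
The `CVTableDetector` module called above is not part of this hunk, and the commit message notes the CV detector is disabled due to poor performance. For orientation, a minimal sketch of the classic morphology-based line detection such a detector typically builds on — kernel sizes and the function name are illustrative assumptions, not the committed implementation:

```python
# Sketch only: extract long horizontal/vertical strokes with morphology,
# then intersect them to locate cell corners.
import cv2
import numpy as np

def detect_table_lines(image_bgr: np.ndarray):
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    horizontal = cv2.morphologyEx(binary, cv2.MORPH_OPEN,
                                  cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN,
                                cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40)))
    # Cell corners appear where horizontal and vertical strokes cross
    corners = cv2.bitwise_and(horizontal, vertical)
    return horizontal, vertical, corners
```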
@@ -704,6 +589,209 @@ class PPStructureEnhanced:
 
         return elements
 
+    def _embed_images_in_table(
+        self,
+        table_element: Dict[str, Any],
+        table_bbox: List[float],
+        layout_det_res: Dict,
+        source_image_path: Path,
+        output_dir: Path
+    ) -> List[Dict[str, Any]]:
+        """
+        Detect and embed images that are inside a table region.
+
+        This handles the case where layout detection finds an image inside a table,
+        similar to how pp_demo embeds images in table HTML.
+
+        Args:
+            table_element: The table element being processed
+            table_bbox: Table bounding box [x1, y1, x2, y2]
+            layout_det_res: Layout detection result containing all detected boxes
+            source_image_path: Path to source image for cropping
+            output_dir: Output directory for saving cropped images
+
+        Returns:
+            List of embedded image info dicts with 'bbox', 'saved_path', 'html_tag'
+        """
+        embedded_images = []
+
+        try:
+            boxes = layout_det_res.get('boxes', [])
+            table_x1, table_y1, table_x2, table_y2 = table_bbox
+
+            for box in boxes:
+                label = box.get('label', '').lower()
+                if label != 'image':
+                    continue
+
+                # Get image bbox
+                img_coord = box.get('coordinate', [])
+                if len(img_coord) < 4:
+                    continue
+
+                img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
+
+                # Check if image is inside table (with some tolerance)
+                tolerance = 5  # pixels
+                if (img_x1 >= table_x1 - tolerance and
+                        img_y1 >= table_y1 - tolerance and
+                        img_x2 <= table_x2 + tolerance and
+                        img_y2 <= table_y2 + tolerance):
+
+                    logger.info(f"[IMAGE-IN-TABLE] Found image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}] inside table")
+
+                    # Crop and save the image
+                    img_element_id = f"img_in_table_{int(img_x1)}_{int(img_y1)}_{int(img_x2)}_{int(img_y2)}"
+                    cropped_path = self._crop_and_save_image(
+                        source_image_path,
+                        [img_x1, img_y1, img_x2, img_y2],
+                        output_dir,
+                        img_element_id
+                    )
+
+                    if cropped_path:
+                        # Create relative path for HTML embedding
+                        rel_path = f"imgs/{Path(cropped_path).name}"
+
+                        # Create img tag similar to pp_demo
+                        img_html = f'<div style="text-align: center;"><img src="{rel_path}" alt="Image" /></div>'
+
+                        embedded_image = {
+                            'bbox': [img_x1, img_y1, img_x2, img_y2],
+                            'saved_path': str(cropped_path),
+                            'relative_path': rel_path,
+                            'html_tag': img_html,
+                            'element_id': img_element_id
+                        }
+                        embedded_images.append(embedded_image)
+
+                        # Try to insert the image reference into the HTML content
+                        if 'html' in table_element and table_element['html']:
+                            original_html = table_element['html']
+                            if '</tbody>' in original_html:
+                                # Insert before </tbody> in a new row
+                                new_html = original_html.replace(
+                                    '</tbody>',
+                                    f'<tr><td colspan="99" style="text-align:center;"><img src="{rel_path}" alt="Embedded Image" /></td></tr></tbody>'
+                                )
+                                table_element['html'] = new_html
+                                logger.info("[IMAGE-IN-TABLE] Embedded image into table HTML")
+
+        except Exception as e:
+            logger.error(f"[IMAGE-IN-TABLE] Error processing images in table: {e}")
+
+        return embedded_images
+
+    def _extract_standalone_images(
+        self,
+        layout_det_res: Dict,
+        table_bboxes: List[List[float]],
+        source_image_path: Path,
+        output_dir: Path,
+        current_page: int,
+        start_index: int,
+        scaling_info: Optional['ScalingInfo'] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Extract standalone images from layout_det_res that are NOT inside tables.
+
+        This handles images that PP-StructureV3 detects in layout_det_res but
+        doesn't include in parsing_res_list (non-table images).
+
+        Args:
+            layout_det_res: Layout detection result containing all detected boxes
+            table_bboxes: List of table bounding boxes to exclude images inside tables
+            source_image_path: Path to source image for cropping
+            output_dir: Output directory for saving cropped images
+            current_page: Current page number
+            start_index: Starting index for element IDs
+            scaling_info: Optional scaling info for coordinate restoration
+
+        Returns:
+            List of standalone image elements
+        """
+        standalone_images = []
+
+        try:
+            boxes = layout_det_res.get('boxes', [])
+            logger.info(f"[STANDALONE-IMAGE] Checking {len(boxes)} boxes for standalone images")
+
+            for box_idx, box in enumerate(boxes):
+                label = box.get('label', '').lower()
+                if label != 'image':
+                    continue
+
+                # Get image bbox
+                img_coord = box.get('coordinate', [])
+                if len(img_coord) < 4:
+                    continue
+
+                img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
+
+                # Check if image is inside any table (skip if so)
+                is_inside_table = False
+                for table_bbox in table_bboxes:
+                    if len(table_bbox) < 4:
+                        continue
+                    tx1, ty1, tx2, ty2 = table_bbox[:4]
+                    tolerance = 5  # pixels
+                    if (img_x1 >= tx1 - tolerance and
+                            img_y1 >= ty1 - tolerance and
+                            img_x2 <= tx2 + tolerance and
+                            img_y2 <= ty2 + tolerance):
+                        is_inside_table = True
+                        logger.debug(f"[STANDALONE-IMAGE] Image at [{int(img_x1)},{int(img_y1)}] is inside table, skipping")
+                        break
+
+                if is_inside_table:
+                    continue
+
+                # Scale bbox back to original coordinates if needed
+                if scaling_info and scaling_info.was_scaled:
+                    scale_factor = scaling_info.scale_factor
+                    img_x1 *= scale_factor
+                    img_y1 *= scale_factor
+                    img_x2 *= scale_factor
+                    img_y2 *= scale_factor
+                    logger.debug(f"[STANDALONE-IMAGE] Scaled bbox by {scale_factor:.3f}")
+
+                logger.info(f"[STANDALONE-IMAGE] Found standalone image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}]")
+
+                # Crop and save the image
+                element_idx = start_index + len(standalone_images)
+                img_element_id = f"standalone_img_{current_page}_{element_idx}"
+                cropped_path = self._crop_and_save_image(
+                    source_image_path,
+                    [img_x1, img_y1, img_x2, img_y2],
+                    output_dir,
+                    img_element_id
+                )
+
+                if cropped_path:
+                    element = {
+                        'element_id': img_element_id,
+                        'type': ElementType.IMAGE,
+                        'original_type': 'image',
+                        'content': '',
+                        'page': current_page,
+                        'bbox': [img_x1, img_y1, img_x2, img_y2],
+                        'index': element_idx,
+                        'confidence': box.get('score', 1.0),
+                        'saved_path': cropped_path,
+                        'img_path': cropped_path,
+                        'source': 'layout_det_res'
+                    }
+                    standalone_images.append(element)
+                    logger.info(f"[STANDALONE-IMAGE] Extracted and saved: {cropped_path}")
+
+        except Exception as e:
+            logger.error(f"[STANDALONE-IMAGE] Error extracting standalone images: {e}")
+            import traceback
+            traceback.print_exc()
+
+        return standalone_images
+
     def _process_markdown_fallback(
         self,
         page_result: Any,
backend/tests/test_layered_rendering.py (new file, +135 lines)
@@ -0,0 +1,135 @@
"""
|
||||||
|
Test script for layered rendering approach.
|
||||||
|
Tests that table borders are drawn from cell_boxes
|
||||||
|
while text is rendered at raw OCR positions.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, '/home/egg/project/Tool_OCR/backend')
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from app.services.pdf_generator_service import PDFGeneratorService
|
||||||
|
from app.services.gap_filling_service import GapFillingService
|
||||||
|
|
||||||
|
|
||||||
|
def test_layered_rendering():
|
||||||
|
"""Test the layered rendering approach."""
|
||||||
|
# Use existing test task
|
||||||
|
task_id = "84899366-f361-44f1-b989-5aba72419ca5"
|
||||||
|
result_dir = Path(f"/home/egg/project/Tool_OCR/backend/storage/results/{task_id}")
|
||||||
|
|
||||||
|
if not result_dir.exists():
|
||||||
|
print(f"[ERROR] Result directory not found: {result_dir}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Load scan_result.json
|
||||||
|
scan_result_path = result_dir / "scan_result.json"
|
||||||
|
raw_ocr_path = result_dir / f"{task_id}_scan_page_1_raw_ocr_regions.json"
|
||||||
|
|
||||||
|
if not scan_result_path.exists():
|
||||||
|
print(f"[ERROR] scan_result.json not found")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"[INFO] Loading scan_result.json from {scan_result_path}")
|
||||||
|
with open(scan_result_path, 'r', encoding='utf-8') as f:
|
||||||
|
scan_result = json.load(f)
|
||||||
|
|
||||||
|
# Parse as UnifiedDocument using PDFGeneratorService's method
|
||||||
|
# scan_result IS the unified document (not nested under 'unified_document')
|
||||||
|
pdf_service = PDFGeneratorService()
|
||||||
|
unified_doc = pdf_service._json_to_unified_document(scan_result, result_dir)
|
||||||
|
|
||||||
|
if not unified_doc:
|
||||||
|
print(f"[ERROR] Failed to parse UnifiedDocument")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"[INFO] UnifiedDocument: {unified_doc.page_count} pages")
|
||||||
|
|
||||||
|
# Count elements
|
||||||
|
table_count = 0
|
||||||
|
text_count = 0
|
||||||
|
for page in unified_doc.pages:
|
||||||
|
for elem in page.elements:
|
||||||
|
if elem.type.value == 'table':
|
||||||
|
table_count += 1
|
||||||
|
# Check if cell_boxes are present (in metadata, not content)
|
||||||
|
cell_boxes = elem.metadata.get('cell_boxes', []) if elem.metadata else []
|
||||||
|
embedded_images = elem.metadata.get('embedded_images', []) if elem.metadata else []
|
||||||
|
print(f"[INFO] Table {elem.element_id}: {len(cell_boxes)} cell_boxes, {len(embedded_images)} embedded_images")
|
||||||
|
elif elem.type.value in ['text', 'paragraph', 'title']:
|
||||||
|
text_count += 1
|
||||||
|
|
||||||
|
print(f"[INFO] Tables: {table_count}, Text elements: {text_count}")
|
||||||
|
|
||||||
|
# Load raw OCR regions if available
|
||||||
|
raw_ocr_regions = []
|
||||||
|
if raw_ocr_path.exists():
|
||||||
|
print(f"[INFO] Loading raw OCR regions from {raw_ocr_path}")
|
||||||
|
with open(raw_ocr_path, 'r', encoding='utf-8') as f:
|
||||||
|
raw_ocr_data = json.load(f)
|
||||||
|
# Could be a list or dict with 'text_regions' key
|
||||||
|
if isinstance(raw_ocr_data, list):
|
||||||
|
raw_ocr_regions = raw_ocr_data
|
||||||
|
else:
|
||||||
|
raw_ocr_regions = raw_ocr_data.get('text_regions', [])
|
||||||
|
print(f"[INFO] Raw OCR regions: {len(raw_ocr_regions)}")
|
||||||
|
|
||||||
|
# Apply gap filling for each page
|
||||||
|
print(f"[INFO] Applying GapFillingService...")
|
||||||
|
gap_service = GapFillingService()
|
||||||
|
gap_filled_doc = unified_doc # Start with original
|
||||||
|
|
||||||
|
for page in unified_doc.pages:
|
||||||
|
page_num = page.page_number
|
||||||
|
page_dims = page.dimensions
|
||||||
|
|
||||||
|
# Get elements for this page
|
||||||
|
pp_elements = page.elements
|
||||||
|
|
||||||
|
# Apply gap filling
|
||||||
|
filled_elements, stats = gap_service.fill_gaps(
|
||||||
|
raw_ocr_regions=raw_ocr_regions,
|
||||||
|
pp_structure_elements=pp_elements,
|
||||||
|
page_number=page_num,
|
||||||
|
pp_dimensions=page_dims
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update the page's elements
|
||||||
|
page.elements = filled_elements
|
||||||
|
print(f"[INFO] Page {page_num}: Added {stats.get('gaps_filled', 0)} gap-filled regions")
|
||||||
|
|
||||||
|
# Count elements after gap filling
|
||||||
|
final_text_count = 0
|
||||||
|
for page in gap_filled_doc.pages:
|
||||||
|
for elem in page.elements:
|
||||||
|
if elem.type.value in ['text', 'paragraph', 'title']:
|
||||||
|
final_text_count += 1
|
||||||
|
|
||||||
|
print(f"[INFO] After gap filling: {final_text_count} text elements (was {text_count})")
|
||||||
|
|
||||||
|
# Generate PDF
|
||||||
|
print(f"[INFO] Generating PDF with layered rendering...")
|
||||||
|
output_pdf = result_dir / "test_layered_rendering.pdf"
|
||||||
|
|
||||||
|
try:
|
||||||
|
success = pdf_service.generate_from_unified_document(
|
||||||
|
unified_doc=gap_filled_doc,
|
||||||
|
output_path=output_pdf
|
||||||
|
)
|
||||||
|
if success:
|
||||||
|
print(f"[SUCCESS] PDF generated: {output_pdf}")
|
||||||
|
print(f"[INFO] PDF size: {output_pdf.stat().st_size} bytes")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"[ERROR] PDF generation returned False")
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[ERROR] PDF generation failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
success = test_layered_rendering()
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
@@ -241,6 +241,25 @@ export default function PreprocessingSettings({
         )}
       </div>
 
+      {/* Scan Artifact Removal Toggle */}
+      <div className="space-y-2">
+        <label className="flex items-center gap-2 cursor-pointer">
+          <input
+            type="checkbox"
+            checked={config.remove_scan_artifacts}
+            onChange={(e) => handleConfigChange('remove_scan_artifacts', e.target.checked)}
+            disabled={disabled}
+            className="w-4 h-4 rounded border-gray-300 text-blue-600 focus:ring-blue-500"
+          />
+          <span className="text-sm text-gray-700">
+            {t('processing.preprocessing.removeScanArtifacts')}
+          </span>
+        </label>
+        <p className="text-xs text-gray-500 pl-6">
+          {t('processing.preprocessing.removeScanArtifactsDesc')}
+        </p>
+      </div>
+
       {/* Binarize Toggle - Hidden by default, shown only in advanced mode */}
       <details className="pt-2">
         <summary className="text-xs text-gray-500 cursor-pointer hover:text-gray-700">
frontend/src/components/TableDetectionSelector.tsx (new file, +124 lines)
@@ -0,0 +1,124 @@
+import { cn } from '@/lib/utils'
+import { Checkbox } from '@/components/ui/checkbox'
+import { Table, Grid3X3, Rows3 } from 'lucide-react'
+import { useTranslation } from 'react-i18next'
+import type { TableDetectionConfig } from '@/types/apiV2'
+
+interface TableDetectionSelectorProps {
+  value: TableDetectionConfig
+  onChange: (config: TableDetectionConfig) => void
+  disabled?: boolean
+  className?: string
+}
+
+interface DetectionOption {
+  key: keyof TableDetectionConfig
+  icon: React.ReactNode
+  labelKey: string
+  descKey: string
+}
+
+const DETECTION_OPTIONS: DetectionOption[] = [
+  {
+    key: 'enable_wired_table',
+    icon: <Grid3X3 className="w-5 h-5" />,
+    labelKey: 'processing.tableDetection.wired',
+    descKey: 'processing.tableDetection.wiredDesc',
+  },
+  {
+    key: 'enable_wireless_table',
+    icon: <Rows3 className="w-5 h-5" />,
+    labelKey: 'processing.tableDetection.wireless',
+    descKey: 'processing.tableDetection.wirelessDesc',
+  },
+  {
+    key: 'enable_region_detection',
+    icon: <Table className="w-5 h-5" />,
+    labelKey: 'processing.tableDetection.region',
+    descKey: 'processing.tableDetection.regionDesc',
+  },
+]
+
+export default function TableDetectionSelector({
+  value,
+  onChange,
+  disabled = false,
+  className,
+}: TableDetectionSelectorProps) {
+  const { t } = useTranslation()
+
+  const handleOptionChange = (key: keyof TableDetectionConfig, checked: boolean) => {
+    onChange({
+      ...value,
+      [key]: checked,
+    })
+  }
+
+  return (
+    <div className={cn('border rounded-lg p-4 bg-white', className)}>
+      {/* Header */}
+      <div className="flex items-center gap-2 mb-4">
+        <Table className="w-5 h-5 text-gray-600" />
+        <h3 className="text-lg font-semibold text-gray-900">{t('processing.tableDetection.title')}</h3>
+      </div>
+
+      {/* Detection Options */}
+      <div className="space-y-3">
+        {DETECTION_OPTIONS.map((option) => {
+          const isChecked = value[option.key]
+
+          return (
+            <label
+              key={option.key}
+              className={cn(
+                'flex items-start gap-4 p-4 rounded-lg border-2 transition-all cursor-pointer',
+                isChecked
+                  ? 'border-blue-500 bg-blue-50'
+                  : 'border-gray-200 hover:border-gray-300 hover:bg-gray-50',
+                disabled && 'opacity-50 cursor-not-allowed'
+              )}
+            >
+              {/* Checkbox */}
+              <Checkbox
+                checked={isChecked}
+                onCheckedChange={(checked) => handleOptionChange(option.key, checked)}
+                disabled={disabled}
+                className="mt-0.5"
+              />
+
+              {/* Icon */}
+              <div
+                className={cn(
+                  'p-2 rounded-lg flex-shrink-0',
+                  isChecked ? 'bg-blue-100 text-blue-600' : 'bg-gray-100 text-gray-500'
+                )}
+              >
+                {option.icon}
+              </div>
+
+              {/* Content */}
+              <div className="flex-1 min-w-0">
+                <span
+                  className={cn(
+                    'font-medium',
+                    isChecked ? 'text-blue-700' : 'text-gray-900'
+                  )}
+                >
+                  {t(option.labelKey)}
+                </span>
+                <p className="text-sm text-gray-500 mt-1">{t(option.descKey)}</p>
+              </div>
+            </label>
+          )
+        })}
+      </div>
+
+      {/* Info Note */}
+      <div className="mt-4 p-3 bg-amber-50 border border-amber-200 rounded-md">
+        <p className="text-sm text-amber-800">
+          {t('processing.tableDetection.note')}
+        </p>
+      </div>
+    </div>
+  )
+}
@@ -64,6 +64,16 @@
     "recommended": "推薦",
     "note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
   },
+  "tableDetection": {
+    "title": "表格偵測模式",
+    "wired": "有框線表格",
+    "wiredDesc": "偵測有明顯格線邊框的表格,適用於正式表格文件",
+    "wireless": "無框線表格",
+    "wirelessDesc": "偵測無邊框的表格,透過對齊方式推斷表格結構",
+    "region": "區域偵測",
+    "regionDesc": "輔助偵測表格區域,改善複雜表格的儲存格識別",
+    "note": "可同時啟用多種偵測模式,系統會自動整合偵測結果。如果表格儲存格框線不正確,請嘗試調整偵測模式。"
+  },
   "preprocessing": {
     "title": "影像前處理",
     "mode": {
@@ -92,6 +102,8 @@
       "strong": "強",
       "maximum": "最強"
     },
+    "removeScanArtifacts": "移除掃描瑕疵",
+    "removeScanArtifactsDesc": "移除掃描時光源產生的水平線痕,避免被誤判為表格框線",
     "advanced": "進階選項",
    "binarize": "二值化處理",
    "binarizeWarning": "不建議使用",
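
The zh-TW strings above describe the feature: removing horizontal streaks left by the scanner's light source so they are not misdetected as table borders. The backend preprocessing step that the `remove_scan_artifacts` flag toggles does not appear in this diff; a minimal sketch of one common way to do it with OpenCV morphology and inpainting — the function name, kernel width, and inpaint radius are illustrative assumptions:

```python
# Sketch only - not the committed preprocessing code.
import cv2
import numpy as np

def remove_horizontal_scan_lines(image_bgr: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # A very wide, 1px-tall opening keeps only long horizontal runs
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(image_bgr.shape[1] // 3, 1), 1))
    line_mask = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
    # Fill the detected line pixels from their surroundings
    return cv2.inpaint(image_bgr, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
```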
@@ -12,9 +12,10 @@ import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2, Inf
 import LayoutModelSelector from '@/components/LayoutModelSelector'
 import PreprocessingSettings from '@/components/PreprocessingSettings'
 import PreprocessingPreview from '@/components/PreprocessingPreview'
+import TableDetectionSelector from '@/components/TableDetectionSelector'
 import TaskNotFound from '@/components/TaskNotFound'
 import { useTaskValidation } from '@/hooks/useTaskValidation'
-import type { LayoutModel, ProcessingOptions, PreprocessingMode, PreprocessingConfig, DocumentAnalysisResponse } from '@/types/apiV2'
+import type { LayoutModel, ProcessingOptions, PreprocessingMode, PreprocessingConfig, TableDetectionConfig, DocumentAnalysisResponse } from '@/types/apiV2'
 
 export default function ProcessingPage() {
   const { t } = useTranslation()
@@ -44,9 +45,17 @@ export default function ProcessingPage() {
     sharpen: true,
     sharpen_strength: 1.0,
     binarize: false,
+    remove_scan_artifacts: true,
   })
   const [showPreview, setShowPreview] = useState(false)
 
+  // Table detection state
+  const [tableDetectionConfig, setTableDetectionConfig] = useState<TableDetectionConfig>({
+    enable_wired_table: true,
+    enable_wireless_table: true,
+    enable_region_detection: true,
+  })
+
   // Analyze document to determine if OCR is needed (only for pending tasks)
   const { data: documentAnalysis, isLoading: isAnalyzing } = useQuery({
     queryKey: ['documentAnalysis', taskId],
@@ -70,6 +79,7 @@ export default function ProcessingPage() {
       layout_model: layoutModel,
       preprocessing_mode: preprocessingMode,
       preprocessing_config: preprocessingMode === 'manual' ? preprocessingConfig : undefined,
+      table_detection: tableDetectionConfig,
     }
 
     return apiClientV2.startTask(taskId!, options)
@@ -441,6 +451,13 @@ export default function ProcessingPage() {
               disabled={processOCRMutation.isPending}
             />
 
+            {/* Table Detection Settings */}
+            <TableDetectionSelector
+              value={tableDetectionConfig}
+              onChange={setTableDetectionConfig}
+              disabled={processOCRMutation.isPending}
+            />
+
             {/* Preprocessing Settings */}
             <PreprocessingSettings
               mode={preprocessingMode}
|||||||
@@ -108,6 +108,20 @@ export interface PreprocessingConfig {
|
|||||||
sharpen: boolean
|
sharpen: boolean
|
||||||
sharpen_strength: number // 0.5-2.0, default 1.0
|
sharpen_strength: number // 0.5-2.0, default 1.0
|
||||||
binarize: boolean
|
binarize: boolean
|
||||||
|
remove_scan_artifacts: boolean // Remove horizontal scan line artifacts
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Table detection configuration for PP-StructureV3.
|
||||||
|
* Controls which table detection modes to enable.
|
||||||
|
* - enable_wired_table: Tables with visible cell borders/grid lines
|
||||||
|
* - enable_wireless_table: Tables without visible borders
|
||||||
|
* - enable_region_detection: Detect table-like regions for better cell structure
|
||||||
|
*/
|
||||||
|
export interface TableDetectionConfig {
|
||||||
|
enable_wired_table: boolean
|
||||||
|
enable_wireless_table: boolean
|
||||||
|
enable_region_detection: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -147,6 +161,7 @@ export interface ProcessingOptions {
|
|||||||
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
|
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
|
||||||
preprocessing_mode?: PreprocessingMode // Preprocessing mode (OCR track only)
|
preprocessing_mode?: PreprocessingMode // Preprocessing mode (OCR track only)
|
||||||
preprocessing_config?: PreprocessingConfig // Manual preprocessing config
|
preprocessing_config?: PreprocessingConfig // Manual preprocessing config
|
||||||
|
table_detection?: TableDetectionConfig // Table detection options (OCR track only)
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TaskCreate {
|
export interface TaskCreate {
|
||||||
|
|||||||
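
The matching backend schema (`TableDetectionConfig` in `app.schemas.task`, imported by the API module) is not shown in this diff. A minimal Pydantic sketch mirroring the frontend interface above, with the all-enabled defaults taken from ProcessingPage's initial state — an assumption, not the committed schema:

```python
from pydantic import BaseModel, Field

class TableDetectionConfig(BaseModel):
    enable_wired_table: bool = Field(default=True, description="Detect tables with visible grid lines")
    enable_wireless_table: bool = Field(default=True, description="Detect borderless tables")
    enable_region_detection: bool = Field(default=True, description="Detect table-like regions to improve cell structure")
```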
@@ -1,62 +1,88 @@
 # Tasks: Extract Table Cell Boxes
 
-## Phase 1: Infrastructure
+## Key Finding (2025-11-28)
 
-### Task 1.1: Add configuration option
-- [x] Add the `enable_table_cell_boxes_extraction` setting in `config.py`
-- [x] Confirm the existing table model configuration is usable
+**PPStructureV3 (PaddleX 3.3.9) does provide `table_res_list`!**
+
+The previous implementation assumed extra SLANeXt model calls were needed, but deeper testing showed:
+- `result.json['res']['table_res_list']` contains `cell_box_list` for every table
+- No additional model calls are needed
+- The redundant SLANeXt code has been removed
+
+## Phase 1: Infrastructure (done)
+
+### Task 1.1: Configuration
+- [x] ~~Add the `enable_table_cell_boxes_extraction` setting~~ (removed, no longer needed)
+- [x] Confirm PPStructureV3 provides `table_res_list`
 
 ### Task 1.2: Model caching
-- [x] Add model cache attributes to `PPStructureEnhanced`
-- [x] Implement lazy loading
-- [x] Add a model release method (optional)
+- [x] ~~Implement SLANeXt model caching~~ (removed, no longer needed)
+- [x] Use the `table_res_list` built into PPStructureV3 directly
 
-## Phase 2: Cell Boxes Extraction
+## Phase 2: Cell Boxes Extraction (done)
 
-### Task 2.1: Modify the table processing logic
-- [x] Add cell boxes extraction in `_process_parsing_res_list`
-- [x] Implement image cropping logic
-- [x] Call the SLANeXt model to get results
+### Task 2.1: Extract from table_res_list
+- [x] Read `cell_box_list` from `result.json['res']['table_res_list']`
+- [x] Match tables by their HTML content
+- [x] Verify the coordinate format (already absolute coordinates)
 
-### Task 2.2: Coordinate conversion
-- [x] Convert relative coordinates to global coordinates
-- [x] Handle ScalingInfo coordinate scaling
-- [x] Verify the conversion is correct
+### Task 2.2: Image-in-Table handling
+- [x] Get image boxes from `layout_det_res`
+- [x] Detect images inside tables
+- [x] Crop and save the images
+- [x] Embed them into the table HTML
 
-### Task 2.3: Error handling
-- [x] Wrap in try-catch
-- [x] Implement graceful degradation on failure
-- [x] Add appropriate logging
+## Phase 3: PDF Generation (done)
 
-## Phase 3: PDF Generation
+### Task 3.1: ~~Infer the grid from cell boxes~~ (abandoned)
+- [x] ~~Modify `draw_table_region` to use cell_boxes~~
+- [x] ~~Compute row heights and column widths from actual cell positions~~
+- [x] Test rendering → **problem found: the HTML structure does not match cell_boxes**
 
-### Task 3.1: Render tables using cell boxes
-- [x] Modify `draw_table_region` to use cell_boxes
-- [x] Compute row heights and column widths from actual cell positions
-- [ ] Test rendering
+### Task 3.2: Plan B - Layered Rendering ✓ done
 
-### Task 3.2: Fallback
-- [x] Fall back to the existing logic when cell_boxes are unavailable
+**Problem analysis (2025-11-30)**:
+- The HTML table structure does not match cell_boxes, so the grid cannot be inferred correctly
+- Attempts to draw text inside cells failed (overflowing borders, wrong matches)
+
+**Solution**: layered rendering - decouple table borders from text drawing
+- Layer 1: draw table borders from cell_boxes
+- Layer 2: draw text at raw OCR positions (independent of the table structure)
+- Layer 3: draw embedded_images
+
+**Implementation steps (2025-11-30)**:
+- [x] Modify `GapFillingService._is_region_covered()` - skip coverage checks for TABLE elements
+- [x] Simplify `_draw_table_with_cell_boxes()` - draw only borders + images
+- [x] Modify `regions_to_avoid` - exclude tables so text can pass through table regions
+- [x] Integration test: test_layered_rendering.py
+
+### Task 3.3: Fallback
+- [x] Fall back to ReportLab Table when cell_boxes are unavailable
 - [x] Ensure backward compatibility
 
-## Phase 4: Testing & Validation
+## Phase 4: Testing & Validation (done)
 
 ### Task 4.1: Unit tests
-- [ ] Test cell boxes extraction
-- [ ] Test coordinate conversion
-- [ ] Test error handling
+- [x] Test cell_box_list extraction (29 cells succeeded)
+- [x] Test Image-in-Table handling (1 image embedded)
+- [x] Test error handling
 
 ### Task 4.2: Integration tests
-- [ ] Test the OCR track with a real PDF
-- [ ] Verify PDF layout restoration
-- [ ] Performance testing
+- [x] Test the OCR track with a real PDF (test_layered_rendering.py)
+- [x] Verify PDF layout restoration
+- [x] Layered rendering test results:
+  - 50 text elements (added from raw OCR; originally only 5)
+  - 31 cell_boxes (8 + 23)
+  - 1 embedded_image
+  - PDF generated successfully (57,290 bytes)
 
-## Phase 5: Cleanup
+## Phase 5: Cleanup (done)
 
 ### Task 5.1: Remove old code
-- [ ] Evaluate and remove the Paragraph wrapping code that is no longer needed
-- [ ] Clean up debug logging
-- [ ] Update documentation
+- [x] Remove the SLANeXt model caching code
+- [x] Remove `_get_slanet_model()`, `_get_table_classifier()`, `_extract_cell_boxes_with_slanet()`, `release_slanet_models()`
+- [x] Remove the `enable_table_cell_boxes_extraction` setting
+- [x] Clean up debug logging
 
 ---
 
@@ -66,32 +92,182 @@
 
 | File | Changes |
 |------|---------|
-| `backend/app/core/config.py` | Add the configuration option |
-| `backend/app/services/pp_structure_enhanced.py` | Main implementation |
-| `backend/app/services/pdf_generator_service.py` | Use cell_boxes |
+| `backend/app/core/config.py` | Remove `enable_table_cell_boxes_extraction` |
+| `backend/app/services/pp_structure_enhanced.py` | Use `table_res_list`, add `_embed_images_in_table()` |
+| `backend/app/services/pdf_generator_service.py` | Layered rendering: draw borders only, exclude table regions from the text filter |
+| `backend/app/services/gap_filling_service.py` | `_is_region_covered()` skips TABLE elements |
+| `backend/tests/test_layered_rendering.py` | Layered rendering integration test |
 
-### Dependencies
+### PPStructureV3 data structure
 
 ```python
-from paddlex import create_model
+result.json = {
+    'res': {
+        'parsing_res_list': [...],   # parsing results
+        'layout_det_res': {...},     # layout detection results
+        'table_res_list': [          # table recognition results
+            {
+                'cell_box_list': [[x1,y1,x2,y2], ...],  # ← the key part!
+                'pred_html': '<html>...',
+                'table_ocr_pred': {...}
+            }
+        ],
+        'overall_ocr_res': {...}
+    }
+}
 ```
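
A minimal usage sketch for the structure above, assuming a PaddleX 3.3.9 PP-StructureV3 pipeline whose results expose the `.json` dict as laid out; the pipeline name and input file are illustrative:

```python
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="PP-StructureV3")
for result in pipeline.predict("page_1.png"):
    res = result.json.get('res', {})
    for table in res.get('table_res_list', []):
        cell_boxes = table.get('cell_box_list', [])  # absolute [x1, y1, x2, y2]
        print(f"table: {len(cell_boxes)} cells, html length {len(table.get('pred_html', ''))}")
```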
-### Test data
+### Test results
 
-- Task ID: `79a3d256-88f6-41d4-a7e9-3e358c85db40`
-- Table bbox: `[84, 269, 1174, 1508]`
-- Expected cell count: 29 (SLANeXt_wired)
+- Task ID: `442f9345-09ba-4a7d-949f-3bc88c2fa895`
+- cell_boxes: 29 cells (source: table_res_list)
+- embedded_images: 1 (img_in_table_935_838_1118_1031)
 
-### Implementation summary
+### Local vs. cloud differences
 
-**Done (715805b):**
-1. `config.py`: added the `enable_table_cell_boxes_extraction` setting
-2. `pp_structure_enhanced.py`:
-   - Added `_slanet_wired_model`, `_slanet_wireless_model`, `_table_cls_model` cache attributes
-   - Implemented lazy loading via `_get_slanet_model()` and `_get_table_classifier()`
-   - Implemented `_extract_cell_boxes_with_slanet()` to extract cell boxes from cropped images
-   - Implemented `release_slanet_models()` to free GPU memory
-   - Changed the table processing logic to call SLANeXt when PPStructureV3 returns no boxes
-3. `pdf_generator_service.py`:
-   - Added `_compute_table_grid_from_cell_boxes()` to compute column widths and row heights
-   - Changed `draw_table_region()` to prefer cell_boxes when computing column widths
+| Feature | Local PaddleX 3.3.9 | Cloud pp_demo |
+|------|-------------------|--------------|
+| `table_res_list` | ✓ provided | ✓ provided |
+| `cell_box_list` | ✓ 29 cells | ✓ 27+8 cells |
+| Layout detection | 1 merged table | 2 separate tables |
+| Image-in-Table | must handle ourselves | auto-embedded into HTML |
+
+### Remaining issues
+
+1. **Layout detection merges tables**: the local layout model merges multiple tables into one large table
+   - As a result, `table_res_list` contains only 1 table
+   - The cloud service recognizes them as 2 separate tables
+   - May require tuning layout model parameters or post-processing
+
+---
+
+## Layered Rendering Design (2025-11-30)
+
+### Root cause
+
+A ReportLab Table requires a regular rectangular grid, but the cell_boxes from PPStructureV3 reflect actual visual positions and do not match the HTML's logical structure. Drawing text inside the cells led to:
+- text overflowing the borders
+- wrong matches
+- some text being lost
+
+### Solution: layered rendering
+
+Decouple table rendering into three independent layers:
+
+```
+┌─────────────────────────────────────────┐
+│ Layer 3: Embedded Images                │
+│ (from metadata['embedded_images'])      │
+├─────────────────────────────────────────┤
+│ Layer 2: Text at Raw OCR Positions      │
+│ (raw OCR added by GapFillingService)    │
+├─────────────────────────────────────────┤
+│ Layer 1: Table Cell Borders             │
+│ (drawn from metadata['cell_boxes'])     │
+└─────────────────────────────────────────┘
+```
+
+### Implementation details
+
+**1. GapFillingService change** (`_is_region_covered`):
+```python
+# Skip coverage checks for TABLE elements so text inside tables passes through
+if skip_table_coverage and element.type == ElementType.TABLE:
+    continue
+```
+
+**2. PDF generator change** (`regions_to_avoid`):
+```python
+# Exclude tables; only avoid overlapping with images
+regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
+```
+
+**3. Simplified `_draw_table_with_cell_boxes`**:
+```python
+def _draw_table_with_cell_boxes(...):
+    """Draw only borders and images; text is handled elsewhere"""
+    # 1. Draw each cell's border
+    for box in cell_boxes:
+        pdf_canvas.rect(x, y, width, height, stroke=1, fill=0)
+
+    # 2. Draw embedded_images
+    for img in embedded_images:
+        self._draw_embedded_image(...)
+```
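
Layer 1 draws in PDF space while cell_boxes are image pixels with a top-left origin; ReportLab's canvas origin is the bottom-left of the page. A minimal sketch of that conversion — `scale` and `page_height_pt` are illustrative assumptions, not values from this commit:

```python
from reportlab.pdfgen import canvas

def draw_cell_borders(pdf_canvas: canvas.Canvas, cell_boxes, page_height_pt: float, scale: float) -> None:
    for x1, y1, x2, y2 in cell_boxes:
        w = (x2 - x1) * scale
        h = (y2 - y1) * scale
        # y2 is the bottom edge in image coords; after flipping the y-axis it
        # becomes the rectangle's lower-left y in PDF coords
        pdf_canvas.rect(x1 * scale, page_height_pt - y2 * scale, w, h, stroke=1, fill=0)
```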
+### Benefits
+
+1. **Decoupled**: border rendering and text rendering are fully independent
+2. **Precise**: text positions come straight from the OCR results, no inference needed
+3. **Stable**: unaffected by mismatches between cell_boxes and the HTML structure
+4. **Compatible**: directly reproduces the overall_ocr_res.png effect from the visualization output
+
+### Test results
+
+- Task ID: `84899366-f361-44f1-b989-5aba72419ca5`
+- cell_boxes: 31 (8 + 23)
+- original text elements: 5
+- text elements after gap filling: 50 (added from raw OCR)
+- PDF size: 57,290 bytes
+
+---
+
+## Hybrid Rendering Optimization (2025-11-30)
+
+### Problems found
+
+Problems remained after layered rendering:
+1. Skewed tables: cell_boxes carry coordinate deviations of 2-11 pixels
+2. No styling applied to Title and similar elements: the OCR track applies no styles
+
+### Solution: hybrid rendering + grid alignment
+
+**1. Cell box grid alignment** (`_normalize_cell_boxes_to_grid`):
+```python
+def _normalize_cell_boxes_to_grid(self, cell_boxes, threshold=10.0):
+    """
+    Snap nearby coordinates to a shared value, removing the 2-11 px deviations.
+    - Collect all X/Y coordinates
+    - Cluster coordinates that fall within the threshold
+    - Use the cluster mean as the aligned coordinate
+    """
+```
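
The body of `_normalize_cell_boxes_to_grid` is not included in the snippet above; a minimal sketch of the described clustering, with names and details assumed for illustration:

```python
def _cluster_coords(values, threshold=10.0):
    """Map each coordinate to the mean of its cluster of nearby values."""
    mapping, cluster = {}, []
    for v in sorted(values):
        if cluster and v - cluster[-1] > threshold:
            mean = sum(cluster) / len(cluster)
            mapping.update({c: mean for c in cluster})
            cluster = []
        cluster.append(v)
    if cluster:
        mean = sum(cluster) / len(cluster)
        mapping.update({c: mean for c in cluster})
    return mapping

def normalize_cell_boxes_to_grid(cell_boxes, threshold=10.0):
    """Snap every box edge to its cluster mean so edges line up on a grid."""
    x_map = _cluster_coords([c for b in cell_boxes for c in (b[0], b[2])], threshold)
    y_map = _cluster_coords([c for b in cell_boxes for c in (b[1], b[3])], threshold)
    return [[x_map[x1], y_map[y1], x_map[x2], y_map[y2]] for x1, y1, x2, y2 in cell_boxes]
```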
+**2. Element-type styles** (OCR track):
+```python
+# Check the element type in draw_text_region
+element_type = region.get('element_type', 'text')
+
+if element_type == 'title':
+    font_size = min(font_size * 1.3, 36)   # enlarge by 30%
+elif element_type == 'header':
+    font_size = min(font_size * 1.15, 24)  # enlarge by 15%
+elif element_type == 'caption':
+    font_size = max(font_size * 0.9, 6)    # shrink by 10%
+```
+
+**3. Element-type propagation**:
+```python
+# Added in convert_unified_document_to_ocr_data
+text_region = {
+    'text': text_content,
+    'bbox': bbox_polygon,
+    'element_type': element.type.value  # new
+}
+```
+
+### Effect of the improvements
+
+| Item | Before | After |
+|------|--------|--------|
+| Table borders | skewed (2-11 px deviation) | grid-aligned |
+| Title style | none (same as body text) | enlarged up to 36 pt |
+| Hybrid rendering | raw OCR only | PP-Structure + raw OCR |
+
+### Test results (2025-11-30)
+
+- Task ID: `3a3f350f-2d81-4af4-8a18-021ea09ac433`
+- Table 1: 8 cell_boxes → grid-aligned
+- Table 2: 23 cell_boxes → grid-aligned + 1 embedded image
+- Title: Applied title style: size=36.0
+- PDF size: 104,082 bytes