feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -26,9 +26,11 @@ import paddle
|
||||
from paddleocr import PPStructureV3
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import cv2
|
||||
from app.models.unified_document import ElementType
|
||||
from app.core.config import settings
|
||||
from app.services.memory_manager import prediction_context
|
||||
from app.services.cv_table_detector import CVTableDetector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -62,6 +64,7 @@ class PPStructureEnhanced:
|
||||
'watermark': ElementType.WATERMARK,
|
||||
'signature': ElementType.SIGNATURE,
|
||||
'stamp': ElementType.STAMP,
|
||||
'seal': ElementType.STAMP, # PP-StructureV3 may use 'seal' label
|
||||
'logo': ElementType.LOGO,
|
||||
'barcode': ElementType.BARCODE,
|
||||
'qr-code': ElementType.QR_CODE,
|
||||
@@ -80,183 +83,15 @@ class PPStructureEnhanced:
|
||||
"""
|
||||
self.structure_engine = structure_engine
|
||||
|
||||
# Lazy-loaded SLANeXt models for cell boxes extraction
|
||||
# These are loaded on-demand when enable_table_cell_boxes_extraction is True
|
||||
self._slanet_wired_model = None
|
||||
self._slanet_wireless_model = None
|
||||
self._table_cls_model = None
|
||||
|
||||
def _get_slanet_model(self, is_wired: bool = True):
    """
    Return the cached SLANeXt model for the requested table style, loading it on first use.

    Args:
        is_wired: Truthy selects the wired (bordered) model, falsy the wireless one.

    Returns:
        The SLANeXt model instance, or None when cell-box extraction is
        disabled in settings or the model could not be loaded.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    # Each table style has its own lazily populated slot on the instance.
    cache_attr = "_slanet_wired_model" if is_wired else "_slanet_wireless_model"

    try:
        # Imported lazily so paddlex is only required when the feature is enabled.
        from paddlex import create_model

        cached_model = getattr(self, cache_attr)
        if cached_model is not None:
            return cached_model

        if is_wired:
            model_name = settings.wired_table_model_name or "SLANeXt_wired"
            logger.info(f"Loading SLANeXt wired model: {model_name}")
        else:
            model_name = settings.wireless_table_model_name or "SLANeXt_wireless"
            logger.info(f"Loading SLANeXt wireless model: {model_name}")

        loaded_model = create_model(model_name)
        setattr(self, cache_attr, loaded_model)
        return loaded_model
    except Exception as load_error:
        # Best effort: callers treat None as "no model available".
        logger.error(f"Failed to load SLANeXt model: {load_error}")
        return None
|
||||
|
||||
def _get_table_classifier(self):
    """
    Return the cached table classification model, loading it on first use.

    Returns:
        The classifier model instance, or None when cell-box extraction is
        disabled in settings or the model could not be loaded.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    try:
        # Imported lazily so paddlex is only required when the feature is enabled.
        from paddlex import create_model

        if self._table_cls_model is not None:
            return self._table_cls_model

        # Fall back to the stock classifier name when settings leave it empty.
        model_name = settings.table_classification_model_name or "PP-LCNet_x1_0_table_cls"
        logger.info(f"Loading table classification model: {model_name}")
        self._table_cls_model = create_model(model_name)
        return self._table_cls_model
    except Exception as load_error:
        # Best effort: callers treat None as "no classifier available".
        logger.error(f"Failed to load table classifier: {load_error}")
        return None
|
||||
|
||||
def _extract_cell_boxes_with_slanet(
    self,
    table_image: np.ndarray,
    table_bbox: List[float],
    is_wired: Optional[bool] = None
) -> Optional[List[List[float]]]:
    """
    Extract cell bounding boxes using a direct SLANeXt model call.

    This supplements PPStructureV3, which doesn't expose cell boxes in its output.

    Args:
        table_image: Cropped table image as numpy array (BGR format)
        table_bbox: Table bounding box in page coordinates [x1, y1, x2, y2];
            only the top-left corner is used, as the offset that is added
            to every box returned by the model.
        is_wired: If None, auto-detect using the classifier. True for bordered tables.
            Whenever auto-detection yields no usable answer (classifier
            unavailable, classifier raised, or no label returned) the table
            is treated as wired.

    Returns:
        List of cell bounding boxes in page coordinates [[x1,y1,x2,y2], ...],
        or None if the feature is disabled, no model is available, no box
        was produced, or extraction fails.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    try:
        # Auto-detect table type if not specified
        if is_wired is None:
            classifier = self._get_table_classifier()
            if classifier:
                try:
                    # Use the first result that carries a label.
                    for res in classifier.predict(table_image):
                        label_names = res.get('label_names', [])
                        if label_names:
                            is_wired = 'wired' in str(label_names[0]).lower()
                            logger.debug(f"Table classified as: {'wired' if is_wired else 'wireless'}")
                            break
                except Exception as e:
                    logger.warning(f"Table classification failed, defaulting to wired: {e}")
                    is_wired = True

            # Covers both "classifier unavailable" and "classifier returned
            # no label". Without this, None would fall through and select
            # the wireless model, contradicting the wired default.
            if is_wired is None:
                is_wired = True

        # Get appropriate SLANeXt model
        model = self._get_slanet_model(is_wired=is_wired)
        if model is None:
            return None

        # Run SLANeXt prediction
        results = model.predict(table_image)

        # Boxes come back relative to the crop; shift by the table origin.
        table_x, table_y = table_bbox[0], table_bbox[1]
        cell_boxes = []

        for result in results:
            # SLANeXt returns 'bbox' with 8-point polygon format
            # [[x1,y1,x2,y2,x3,y3,x4,y4], ...]
            for box in result.get('bbox', []):
                if not isinstance(box, (list, tuple)):
                    continue

                if len(box) >= 8:
                    # 8-value polygon: take its axis-aligned bounding rectangle
                    xs = box[0:8:2]
                    ys = box[1:8:2]
                    x1, y1 = min(xs), min(ys)
                    x2, y2 = max(xs), max(ys)
                elif len(box) >= 4:
                    # Already a 4-value rectangle
                    x1, y1, x2, y2 = box[:4]
                else:
                    continue

                # Convert to absolute page coordinates
                cell_boxes.append([
                    float(x1 + table_x),
                    float(y1 + table_y),
                    float(x2 + table_x),
                    float(y2 + table_y),
                ])

        logger.info(f"SLANeXt extracted {len(cell_boxes)} cell boxes (is_wired={is_wired})")
        return cell_boxes if cell_boxes else None

    except Exception as e:
        logger.error(f"Cell boxes extraction with SLANeXt failed: {e}")
        return None
|
||||
|
||||
def release_slanet_models(self):
    """
    Drop the lazily loaded SLANeXt and table classifier models.

    Each populated model slot is reset to None and a log line is emitted
    for it. Afterwards a garbage collection pass is forced and, when
    TORCH_AVAILABLE is set, the torch CUDA cache is emptied.
    """
    # (attribute holding the model, message logged once it is released)
    model_slots = (
        ("_slanet_wired_model", "Released SLANeXt wired model"),
        ("_slanet_wireless_model", "Released SLANeXt wireless model"),
        ("_table_cls_model", "Released table classifier model"),
    )

    for slot_name, released_message in model_slots:
        if getattr(self, slot_name) is None:
            continue
        # Remove the reference, then restore the slot so later lazy loads work.
        delattr(self, slot_name)
        setattr(self, slot_name, None)
        logger.info(released_message)

    gc.collect()
    if TORCH_AVAILABLE:
        torch.cuda.empty_cache()
|
||||
|
||||
def analyze_with_full_structure(
|
||||
self,
|
||||
image_path: Path,
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0,
|
||||
preprocessed_image: Optional[Image.Image] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None
|
||||
scaling_info: Optional['ScalingInfo'] = None,
|
||||
save_visualization: bool = False,
|
||||
use_cv_table_detection: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze document with full PP-StructureV3 capabilities.
|
||||
@@ -271,6 +106,10 @@ class PPStructureEnhanced:
|
||||
scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
|
||||
for layout detection, all bbox coordinates will be scaled back
|
||||
to original image coordinates for proper cropping.
|
||||
save_visualization: If True, save detection visualization images
|
||||
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
|
||||
use_cv_table_detection: If True, use CV-based line detection for wired tables
|
||||
instead of ML-based cell detection (RT-DETR-L)
|
||||
|
||||
Returns:
|
||||
Dictionary with complete structure information including:
|
||||
@@ -278,6 +117,7 @@ class PPStructureEnhanced:
|
||||
- reading_order: Reading order indices
|
||||
- images: Extracted images with metadata
|
||||
- tables: Extracted tables with structure
|
||||
- visualization_dir: Path to visualization images (if save_visualization=True)
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
|
||||
@@ -313,9 +153,21 @@ class PPStructureEnhanced:
|
||||
all_elements = []
|
||||
all_images = []
|
||||
all_tables = []
|
||||
visualization_dir = None
|
||||
|
||||
# Process each page result
|
||||
for page_idx, page_result in enumerate(results):
|
||||
# Save visualization images if requested
|
||||
if save_visualization and output_dir and hasattr(page_result, 'save_to_img'):
|
||||
try:
|
||||
vis_dir = output_dir / 'visualization'
|
||||
vis_dir.mkdir(parents=True, exist_ok=True)
|
||||
page_result.save_to_img(str(vis_dir))
|
||||
visualization_dir = vis_dir
|
||||
logger.info(f"Saved visualization images to {vis_dir}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to save visualization images: {e}")
|
||||
|
||||
# Try to access parsing_res_list and table_res_list (the complete structure)
|
||||
parsing_res_list = None
|
||||
table_res_list = None
|
||||
@@ -369,6 +221,7 @@ class PPStructureEnhanced:
|
||||
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
|
||||
|
||||
# Extract table_res_list which contains cell_box_list
|
||||
layout_det_res = None
|
||||
if result_dict:
|
||||
if 'table_res_list' in result_dict:
|
||||
table_res_list = result_dict['table_res_list']
|
||||
@@ -377,20 +230,40 @@ class PPStructureEnhanced:
|
||||
if 'cell_box_list' in tbl:
|
||||
logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes")
|
||||
|
||||
# Extract layout_det_res for Image-in-Table processing
|
||||
if 'layout_det_res' in result_dict:
|
||||
layout_det_res = result_dict['layout_det_res']
|
||||
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
|
||||
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
parsing_res_list, current_page, output_dir, image_path, scaling_info,
|
||||
table_res_list=table_res_list # Pass table_res_list for cell_box_list
|
||||
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
|
||||
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
|
||||
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
|
||||
)
|
||||
all_elements.extend(elements)
|
||||
|
||||
# Extract tables and images from elements
|
||||
table_bboxes = [] # Collect table bboxes for standalone image filtering
|
||||
for elem in elements:
|
||||
if elem['type'] == ElementType.TABLE:
|
||||
all_tables.append(elem)
|
||||
table_bboxes.append(elem.get('bbox', [0, 0, 0, 0]))
|
||||
elif elem['type'] in [ElementType.IMAGE, ElementType.FIGURE]:
|
||||
all_images.append(elem)
|
||||
|
||||
# Extract standalone images from layout_det_res (images NOT inside tables)
|
||||
if layout_det_res and image_path and output_dir:
|
||||
standalone_images = self._extract_standalone_images(
|
||||
layout_det_res, table_bboxes, image_path, output_dir,
|
||||
current_page, len(elements), scaling_info
|
||||
)
|
||||
if standalone_images:
|
||||
all_elements.extend(standalone_images)
|
||||
all_images.extend(standalone_images)
|
||||
logger.info(f"Extracted {len(standalone_images)} standalone images from layout_det_res")
|
||||
else:
|
||||
# Fallback to markdown if parsing_res_list not available
|
||||
logger.warning("parsing_res_list not found, falling back to markdown")
|
||||
@@ -402,7 +275,7 @@ class PPStructureEnhanced:
|
||||
# Create reading order based on element positions
|
||||
reading_order = self._determine_reading_order(all_elements)
|
||||
|
||||
return {
|
||||
result = {
|
||||
'elements': all_elements,
|
||||
'total_elements': len(all_elements),
|
||||
'reading_order': reading_order,
|
||||
@@ -412,6 +285,12 @@ class PPStructureEnhanced:
|
||||
'has_parsing_res_list': parsing_res_list is not None
|
||||
}
|
||||
|
||||
# Add visualization directory if available
|
||||
if visualization_dir:
|
||||
result['visualization_dir'] = str(visualization_dir)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Enhanced PP-StructureV3 analysis error: {e}")
|
||||
import traceback
|
||||
@@ -446,7 +325,9 @@ class PPStructureEnhanced:
|
||||
output_dir: Optional[Path],
|
||||
source_image_path: Optional[Path] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None,
|
||||
table_res_list: Optional[List[Dict]] = None
|
||||
table_res_list: Optional[List[Dict]] = None,
|
||||
layout_det_res: Optional[Dict] = None,
|
||||
use_cv_table_detection: bool = False
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process parsing_res_list to extract all elements.
|
||||
@@ -458,6 +339,8 @@ class PPStructureEnhanced:
|
||||
output_dir: Optional output directory
|
||||
source_image_path: Path to source image for cropping image regions
|
||||
table_res_list: Optional list of table results containing cell_box_list
|
||||
layout_det_res: Optional layout detection result for Image-in-Table processing
|
||||
use_cv_table_detection: If True, use CV line detection for wired tables
|
||||
|
||||
Returns:
|
||||
List of processed elements with normalized structure
|
||||
@@ -628,53 +511,55 @@ class PPStructureEnhanced:
|
||||
logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
|
||||
cell_boxes_extracted = True
|
||||
|
||||
# Supplement with direct SLANeXt call if PPStructureV3 didn't provide boxes
|
||||
if not cell_boxes_extracted and source_image_path and bbox != [0, 0, 0, 0]:
|
||||
logger.info(f"[TABLE] No boxes from PPStructureV3, attempting SLANeXt extraction...")
|
||||
try:
|
||||
# Load source image and crop table region
|
||||
source_img = Image.open(source_image_path)
|
||||
source_array = np.array(source_img)
|
||||
|
||||
# Crop table region (bbox is in original image coordinates)
|
||||
x1, y1, x2, y2 = [int(round(c)) for c in bbox]
|
||||
# Ensure coordinates are within image bounds
|
||||
h, w = source_array.shape[:2]
|
||||
x1, y1 = max(0, x1), max(0, y1)
|
||||
x2, y2 = min(w, x2), min(h, y2)
|
||||
|
||||
if x2 > x1 and y2 > y1:
|
||||
table_crop = source_array[y1:y2, x1:x2]
|
||||
|
||||
# Convert RGB to BGR for SLANeXt
|
||||
if len(table_crop.shape) == 3 and table_crop.shape[2] == 3:
|
||||
table_crop_bgr = table_crop[:, :, ::-1]
|
||||
else:
|
||||
table_crop_bgr = table_crop
|
||||
|
||||
# Extract cell boxes using SLANeXt
|
||||
slanet_boxes = self._extract_cell_boxes_with_slanet(
|
||||
table_crop_bgr,
|
||||
bbox, # Pass original bbox for coordinate offset
|
||||
is_wired=None # Auto-detect
|
||||
)
|
||||
|
||||
if slanet_boxes:
|
||||
element['cell_boxes'] = slanet_boxes
|
||||
element['cell_boxes_source'] = 'slanet'
|
||||
cell_boxes_extracted = True
|
||||
logger.info(f"[TABLE] SLANeXt extracted {len(slanet_boxes)} cell boxes")
|
||||
else:
|
||||
logger.warning(f"[TABLE] Invalid crop region: ({x1},{y1})-({x2},{y2})")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[TABLE] SLANeXt extraction failed: {e}")
|
||||
|
||||
if not cell_boxes_extracted:
|
||||
logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")
|
||||
|
||||
# Special handling for images/figures
|
||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
|
||||
# 2.5 CV-based table line detection for wired tables
|
||||
if use_cv_table_detection and source_image_path and source_image_path.exists():
|
||||
try:
|
||||
# Load image for CV processing
|
||||
cv_image = cv2.imread(str(source_image_path))
|
||||
if cv_image is not None:
|
||||
cv_detector = CVTableDetector()
|
||||
ml_cell_boxes = element.get('cell_boxes', [])
|
||||
|
||||
# Detect cells using CV line detection
|
||||
cv_cells = cv_detector.detect_and_merge_with_ml(
|
||||
cv_image,
|
||||
bbox, # Table bbox
|
||||
ml_cell_boxes
|
||||
)
|
||||
|
||||
if cv_cells:
|
||||
# Apply scaling if needed
|
||||
if scaling_info and scaling_info.was_scaled:
|
||||
cv_cells = [
|
||||
[
|
||||
c[0] * scaling_info.scale_x,
|
||||
c[1] * scaling_info.scale_y,
|
||||
c[2] * scaling_info.scale_x,
|
||||
c[3] * scaling_info.scale_y
|
||||
]
|
||||
for c in cv_cells
|
||||
]
|
||||
|
||||
element['cell_boxes'] = cv_cells
|
||||
element['cell_boxes_source'] = 'cv_line_detection'
|
||||
logger.info(f"[TABLE] CV line detection found {len(cv_cells)} cells (ML had {len(ml_cell_boxes)})")
|
||||
except Exception as cv_error:
|
||||
logger.warning(f"[TABLE] CV line detection failed: {cv_error}")
|
||||
|
||||
# 3. Image-in-Table 處理:檢測並嵌入表格內的圖片
|
||||
if layout_det_res and source_image_path and output_dir:
|
||||
embedded_images = self._embed_images_in_table(
|
||||
element, bbox, layout_det_res, source_image_path, output_dir
|
||||
)
|
||||
if embedded_images:
|
||||
element['embedded_images'] = embedded_images
|
||||
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
|
||||
|
||||
# Special handling for images/figures/stamps (visual elements that need cropping)
|
||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
|
||||
# Save image if path provided
|
||||
if 'img_path' in item and output_dir:
|
||||
saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])
|
||||
@@ -704,6 +589,209 @@ class PPStructureEnhanced:
|
||||
|
||||
return elements
|
||||
|
||||
def _embed_images_in_table(
    self,
    table_element: Dict[str, Any],
    table_bbox: List[float],
    layout_det_res: Dict,
    source_image_path: Path,
    output_dir: Path
) -> List[Dict[str, Any]]:
    """
    Detect and embed images that are inside a table region.

    Every box in layout_det_res labelled 'image' whose rectangle lies within
    the table rectangle (with a small pixel tolerance) is cropped from the
    source image and recorded. If the table element has non-empty 'html'
    containing '</tbody>', one extra row referencing the image is inserted
    before the last '</tbody>'; table_element['html'] is updated in place.

    Args:
        table_element: The table element being processed (its 'html' may be modified)
        table_bbox: Table bounding box [x1, y1, x2, y2]
        layout_det_res: Layout detection result containing all detected boxes
        source_image_path: Path to source image for cropping
        output_dir: Output directory for saving cropped images

    Returns:
        List of embedded image info dicts with 'bbox', 'saved_path',
        'relative_path', 'html_tag' and 'element_id'. Errors are logged and
        whatever was collected so far is returned.
    """
    embedded_images = []
    tolerance = 5  # pixels of slack allowed around the table rectangle

    try:
        boxes = layout_det_res.get('boxes', [])
        table_x1, table_y1, table_x2, table_y2 = table_bbox

        # NOTE(review): image coordinates are compared with table_bbox as-is;
        # this assumes both are in the same coordinate space — confirm for
        # inputs that were rescaled before layout detection.
        for box in boxes:
            if box.get('label', '').lower() != 'image':
                continue

            # Get image bbox
            img_coord = box.get('coordinate', [])
            if len(img_coord) < 4:
                continue
            img_x1, img_y1, img_x2, img_y2 = img_coord[:4]

            # Skip images that are not inside the table (with some tolerance)
            inside_table = (
                img_x1 >= table_x1 - tolerance
                and img_y1 >= table_y1 - tolerance
                and img_x2 <= table_x2 + tolerance
                and img_y2 <= table_y2 + tolerance
            )
            if not inside_table:
                continue

            logger.info(f"[IMAGE-IN-TABLE] Found image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}] inside table")

            # Crop and save the image
            img_element_id = f"img_in_table_{int(img_x1)}_{int(img_y1)}_{int(img_x2)}_{int(img_y2)}"
            cropped_path = self._crop_and_save_image(
                source_image_path,
                [img_x1, img_y1, img_x2, img_y2],
                output_dir,
                img_element_id
            )
            if not cropped_path:
                continue

            # Create relative path for HTML embedding
            rel_path = f"imgs/{Path(cropped_path).name}"

            # Create img tag similar to pp_demo
            img_html = f'<div style="text-align: center;"><img src="{rel_path}" alt="Image" /></div>'

            embedded_images.append({
                'bbox': [img_x1, img_y1, img_x2, img_y2],
                'saved_path': str(cropped_path),
                'relative_path': rel_path,
                'html_tag': img_html,
                'element_id': img_element_id
            })

            # Try to insert image into HTML content
            original_html = table_element.get('html')
            if not original_html:
                continue

            # Insert a new row before the LAST </tbody> only. A plain
            # str.replace would touch every occurrence and duplicate the row
            # when the HTML contains more than one </tbody>.
            head, closing_tag, tail = original_html.rpartition('</tbody>')
            if closing_tag:
                image_row = f'<tr><td colspan="99" style="text-align:center;"><img src="{rel_path}" alt="Embedded Image" /></td></tr>'
                table_element['html'] = f'{head}{image_row}{closing_tag}{tail}'
                logger.info("[IMAGE-IN-TABLE] Embedded image into table HTML")

    except Exception as e:
        logger.error(f"[IMAGE-IN-TABLE] Error processing images in table: {e}")

    return embedded_images
|
||||
|
||||
def _extract_standalone_images(
    self,
    layout_det_res: Dict,
    table_bboxes: List[List[float]],
    source_image_path: Path,
    output_dir: Path,
    current_page: int,
    start_index: int,
    scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
    """
    Extract standalone images from layout_det_res that are NOT inside tables.

    This handles images that PP-StructureV3 detects in layout_det_res but
    doesn't include in parsing_res_list (non-table images).

    Args:
        layout_det_res: Layout detection result containing all detected boxes
        table_bboxes: List of table bounding boxes to exclude images inside tables
        source_image_path: Path to source image for cropping
        output_dir: Output directory for saving cropped images
        current_page: Current page number
        start_index: Starting index for element IDs
        scaling_info: Optional scaling info for coordinate restoration. When
            it reports scaling, image coordinates are multiplied by its
            scale_factor before any comparison or cropping.

    Returns:
        List of standalone image elements. Errors are logged and whatever
        was collected so far is returned.
    """
    standalone_images = []
    tolerance = 5  # pixels of slack allowed around each table rectangle

    try:
        boxes = layout_det_res.get('boxes', [])
        logger.info(f"[STANDALONE-IMAGE] Checking {len(boxes)} boxes for standalone images")

        for box in boxes:
            if box.get('label', '').lower() != 'image':
                continue

            # Get image bbox
            img_coord = box.get('coordinate', [])
            if len(img_coord) < 4:
                continue
            img_x1, img_y1, img_x2, img_y2 = img_coord[:4]

            # Scale bbox back to original coordinates if needed. This must
            # happen BEFORE the table containment test: the table bboxes come
            # from processed elements, which are already in original-image
            # coordinates, so both rectangles have to share that space.
            if scaling_info and scaling_info.was_scaled:
                scale_factor = scaling_info.scale_factor
                img_x1 *= scale_factor
                img_y1 *= scale_factor
                img_x2 *= scale_factor
                img_y2 *= scale_factor
                logger.debug(f"[STANDALONE-IMAGE] Scaled bbox by {scale_factor:.3f}")

            # Check if image is inside any table (skip if so)
            is_inside_table = any(
                len(table_bbox) >= 4
                and img_x1 >= table_bbox[0] - tolerance
                and img_y1 >= table_bbox[1] - tolerance
                and img_x2 <= table_bbox[2] + tolerance
                and img_y2 <= table_bbox[3] + tolerance
                for table_bbox in table_bboxes
            )
            if is_inside_table:
                logger.debug(f"[STANDALONE-IMAGE] Image at [{int(img_x1)},{int(img_y1)}] is inside table, skipping")
                continue

            logger.info(f"[STANDALONE-IMAGE] Found standalone image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}]")

            # Crop and save the image. Indices continue from start_index and
            # only advance for images that were actually saved.
            element_idx = start_index + len(standalone_images)
            img_element_id = f"standalone_img_{current_page}_{element_idx}"
            cropped_path = self._crop_and_save_image(
                source_image_path,
                [img_x1, img_y1, img_x2, img_y2],
                output_dir,
                img_element_id
            )
            if not cropped_path:
                continue

            standalone_images.append({
                'element_id': img_element_id,
                'type': ElementType.IMAGE,
                'original_type': 'image',
                'content': '',
                'page': current_page,
                'bbox': [img_x1, img_y1, img_x2, img_y2],
                'index': element_idx,
                'confidence': box.get('score', 1.0),
                'saved_path': cropped_path,
                'img_path': cropped_path,
                'source': 'layout_det_res'
            })
            logger.info(f"[STANDALONE-IMAGE] Extracted and saved: {cropped_path}")

    except Exception as e:
        logger.error(f"[STANDALONE-IMAGE] Error extracting standalone images: {e}")
        import traceback
        traceback.print_exc()

    return standalone_images
|
||||
|
||||
def _process_markdown_fallback(
|
||||
self,
|
||||
page_result: Any,
|
||||
|
||||
Reference in New Issue
Block a user