feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -26,9 +26,11 @@ import paddle
 from paddleocr import PPStructureV3
 from PIL import Image
 import numpy as np
+import cv2
 from app.models.unified_document import ElementType
 from app.core.config import settings
 from app.services.memory_manager import prediction_context
+from app.services.cv_table_detector import CVTableDetector

 logger = logging.getLogger(__name__)

@@ -62,6 +64,7 @@ class PPStructureEnhanced:
        'watermark': ElementType.WATERMARK,
        'signature': ElementType.SIGNATURE,
        'stamp': ElementType.STAMP,
+        'seal': ElementType.STAMP,  # PP-StructureV3 may use 'seal' label
        'logo': ElementType.LOGO,
        'barcode': ElementType.BARCODE,
        'qr-code': ElementType.QR_CODE,
@@ -80,183 +83,15 @@ class PPStructureEnhanced:
        """
        self.structure_engine = structure_engine

-        # Lazy-loaded SLANeXt models for cell boxes extraction
-        # These are loaded on-demand when enable_table_cell_boxes_extraction is True
-        self._slanet_wired_model = None
-        self._slanet_wireless_model = None
-        self._table_cls_model = None
-
-    def _get_slanet_model(self, is_wired: bool = True):
-        """
-        Get or create SLANeXt model for cell boxes extraction (lazy loading).
-
-        Args:
-            is_wired: True for wired (bordered) tables, False for wireless
-
-        Returns:
-            SLANeXt model instance or None if loading fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            from paddlex import create_model
-
-            if is_wired:
-                if self._slanet_wired_model is None:
-                    model_name = settings.wired_table_model_name or "SLANeXt_wired"
-                    logger.info(f"Loading SLANeXt wired model: {model_name}")
-                    self._slanet_wired_model = create_model(model_name)
-                return self._slanet_wired_model
-            else:
-                if self._slanet_wireless_model is None:
-                    model_name = settings.wireless_table_model_name or "SLANeXt_wireless"
-                    logger.info(f"Loading SLANeXt wireless model: {model_name}")
-                    self._slanet_wireless_model = create_model(model_name)
-                return self._slanet_wireless_model
-        except Exception as e:
-            logger.error(f"Failed to load SLANeXt model: {e}")
-            return None
-
-    def _get_table_classifier(self):
-        """
-        Get or create table classification model (lazy loading).
-
-        Returns:
-            Table classifier model instance or None if loading fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            from paddlex import create_model
-
-            if self._table_cls_model is None:
-                model_name = settings.table_classification_model_name or "PP-LCNet_x1_0_table_cls"
-                logger.info(f"Loading table classification model: {model_name}")
-                self._table_cls_model = create_model(model_name)
-            return self._table_cls_model
-        except Exception as e:
-            logger.error(f"Failed to load table classifier: {e}")
-            return None
-
-    def _extract_cell_boxes_with_slanet(
-        self,
-        table_image: np.ndarray,
-        table_bbox: List[float],
-        is_wired: Optional[bool] = None
-    ) -> Optional[List[List[float]]]:
-        """
-        Extract cell bounding boxes using direct SLANeXt model call.
-
-        This supplements PPStructureV3 which doesn't expose cell boxes in its output.
-
-        Args:
-            table_image: Cropped table image as numpy array (BGR format)
-            table_bbox: Table bounding box in page coordinates [x1, y1, x2, y2]
-            is_wired: If None, auto-detect using classifier. True for bordered tables.
-
-        Returns:
-            List of cell bounding boxes in page coordinates [[x1,y1,x2,y2], ...],
-            or None if extraction fails
-        """
-        if not settings.enable_table_cell_boxes_extraction:
-            return None
-
-        try:
-            # Auto-detect table type if not specified
-            if is_wired is None:
-                classifier = self._get_table_classifier()
-                if classifier:
-                    try:
-                        cls_result = classifier.predict(table_image)
-                        # PP-LCNet returns classification result
-                        for res in cls_result:
-                            label_names = res.get('label_names', [])
-                            if label_names:
-                                is_wired = 'wired' in str(label_names[0]).lower()
-                                logger.debug(f"Table classified as: {'wired' if is_wired else 'wireless'}")
-                                break
-                    except Exception as e:
-                        logger.warning(f"Table classification failed, defaulting to wired: {e}")
-                        is_wired = True
-                else:
-                    is_wired = True  # Default to wired if classifier unavailable
-
-            # Get appropriate SLANeXt model
-            model = self._get_slanet_model(is_wired=is_wired)
-            if model is None:
-                return None
-
-            # Run SLANeXt prediction
-            results = model.predict(table_image)
-
-            # Extract cell boxes from result
-            cell_boxes = []
-            table_x, table_y = table_bbox[0], table_bbox[1]
-
-            for result in results:
-                # SLANeXt returns 'bbox' with 8-point polygon format
-                # [[x1,y1,x2,y2,x3,y3,x4,y4], ...]
-                boxes = result.get('bbox', [])
-                for box in boxes:
-                    if isinstance(box, (list, tuple)):
-                        if len(box) >= 8:
-                            # 8-point polygon: convert to 4-point rectangle
-                            xs = [box[i] for i in range(0, 8, 2)]
-                            ys = [box[i] for i in range(1, 8, 2)]
-                            x1, y1 = min(xs), min(ys)
-                            x2, y2 = max(xs), max(ys)
-                        elif len(box) >= 4:
-                            # Already 4-point rectangle
-                            x1, y1, x2, y2 = box[:4]
-                        else:
-                            continue
-
-                        # Convert to absolute page coordinates
-                        abs_box = [
-                            float(x1 + table_x),
-                            float(y1 + table_y),
-                            float(x2 + table_x),
-                            float(y2 + table_y)
-                        ]
-                        cell_boxes.append(abs_box)
-
-            logger.info(f"SLANeXt extracted {len(cell_boxes)} cell boxes (is_wired={is_wired})")
-            return cell_boxes if cell_boxes else None
-
-        except Exception as e:
-            logger.error(f"Cell boxes extraction with SLANeXt failed: {e}")
-            return None
-
-    def release_slanet_models(self):
-        """Release SLANeXt models to free GPU memory."""
-        if self._slanet_wired_model is not None:
-            del self._slanet_wired_model
-            self._slanet_wired_model = None
-            logger.info("Released SLANeXt wired model")
-
-        if self._slanet_wireless_model is not None:
-            del self._slanet_wireless_model
-            self._slanet_wireless_model = None
-            logger.info("Released SLANeXt wireless model")
-
-        if self._table_cls_model is not None:
-            del self._table_cls_model
-            self._table_cls_model = None
-            logger.info("Released table classifier model")
-
-        gc.collect()
-        if TORCH_AVAILABLE:
-            torch.cuda.empty_cache()
-
    def analyze_with_full_structure(
        self,
        image_path: Path,
        output_dir: Optional[Path] = None,
        current_page: int = 0,
        preprocessed_image: Optional[Image.Image] = None,
-        scaling_info: Optional['ScalingInfo'] = None
+        scaling_info: Optional['ScalingInfo'] = None,
+        save_visualization: bool = False,
+        use_cv_table_detection: bool = False
    ) -> Dict[str, Any]:
        """
        Analyze document with full PP-StructureV3 capabilities.
@@ -271,6 +106,10 @@ class PPStructureEnhanced:
            scaling_info: Optional ScalingInfo from preprocessing. If image was scaled
                         for layout detection, all bbox coordinates will be scaled back
                         to original image coordinates for proper cropping.
+            save_visualization: If True, save detection visualization images
+                               (layout_det_res, layout_order_res, overall_ocr_res, etc.)
+            use_cv_table_detection: If True, use CV-based line detection for wired tables
+                                   instead of ML-based cell detection (RT-DETR-L)

        Returns:
            Dictionary with complete structure information including:
@@ -278,6 +117,7 @@ class PPStructureEnhanced:
            - reading_order: Reading order indices
            - images: Extracted images with metadata
            - tables: Extracted tables with structure
+            - visualization_dir: Path to visualization images (if save_visualization=True)
        """
        try:
            logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
@@ -313,9 +153,21 @@ class PPStructureEnhanced:
            all_elements = []
            all_images = []
            all_tables = []
+            visualization_dir = None

            # Process each page result
            for page_idx, page_result in enumerate(results):
+                # Save visualization images if requested
+                if save_visualization and output_dir and hasattr(page_result, 'save_to_img'):
+                    try:
+                        vis_dir = output_dir / 'visualization'
+                        vis_dir.mkdir(parents=True, exist_ok=True)
+                        page_result.save_to_img(str(vis_dir))
+                        visualization_dir = vis_dir
+                        logger.info(f"Saved visualization images to {vis_dir}")
+                    except Exception as e:
+                        logger.warning(f"Failed to save visualization images: {e}")
+
                # Try to access parsing_res_list and table_res_list (the complete structure)
                parsing_res_list = None
                table_res_list = None
@@ -369,6 +221,7 @@ class PPStructureEnhanced:
                            logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")

                # Extract table_res_list which contains cell_box_list
+                layout_det_res = None
                if result_dict:
                    if 'table_res_list' in result_dict:
                        table_res_list = result_dict['table_res_list']
@@ -377,20 +230,40 @@ class PPStructureEnhanced:
                            if 'cell_box_list' in tbl:
                                logger.info(f"  Table {i}: {len(tbl['cell_box_list'])} cell boxes")

+                    # Extract layout_det_res for Image-in-Table processing
+                    if 'layout_det_res' in result_dict:
+                        layout_det_res = result_dict['layout_det_res']
+                        logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
+
                # Process parsing_res_list if found
                if parsing_res_list:
                    elements = self._process_parsing_res_list(
                        parsing_res_list, current_page, output_dir, image_path, scaling_info,
-                        table_res_list=table_res_list  # Pass table_res_list for cell_box_list
+                        table_res_list=table_res_list,  # Pass table_res_list for cell_box_list
+                        layout_det_res=layout_det_res,  # Pass layout_det_res for Image-in-Table
+                        use_cv_table_detection=use_cv_table_detection  # Use CV for wired tables
                    )
                    all_elements.extend(elements)

                    # Extract tables and images from elements
+                    table_bboxes = []  # Collect table bboxes for standalone image filtering
                    for elem in elements:
                        if elem['type'] == ElementType.TABLE:
                            all_tables.append(elem)
+                            table_bboxes.append(elem.get('bbox', [0, 0, 0, 0]))
                        elif elem['type'] in [ElementType.IMAGE, ElementType.FIGURE]:
                            all_images.append(elem)
+
+                    # Extract standalone images from layout_det_res (images NOT inside tables)
+                    if layout_det_res and image_path and output_dir:
+                        standalone_images = self._extract_standalone_images(
+                            layout_det_res, table_bboxes, image_path, output_dir,
+                            current_page, len(elements), scaling_info
+                        )
+                        if standalone_images:
+                            all_elements.extend(standalone_images)
+                            all_images.extend(standalone_images)
+                            logger.info(f"Extracted {len(standalone_images)} standalone images from layout_det_res")
                else:
                    # Fallback to markdown if parsing_res_list not available
                    logger.warning("parsing_res_list not found, falling back to markdown")
@@ -402,7 +275,7 @@ class PPStructureEnhanced:
            # Create reading order based on element positions
            reading_order = self._determine_reading_order(all_elements)

-            return {
+            result = {
                'elements': all_elements,
                'total_elements': len(all_elements),
                'reading_order': reading_order,
@@ -412,6 +285,12 @@ class PPStructureEnhanced:
                'has_parsing_res_list': parsing_res_list is not None
            }

+            # Add visualization directory if available
+            if visualization_dir:
+                result['visualization_dir'] = str(visualization_dir)
+
+            return result
+
        except Exception as e:
            logger.error(f"Enhanced PP-StructureV3 analysis error: {e}")
            import traceback
@@ -446,7 +325,9 @@ class PPStructureEnhanced:
        output_dir: Optional[Path],
        source_image_path: Optional[Path] = None,
        scaling_info: Optional['ScalingInfo'] = None,
-        table_res_list: Optional[List[Dict]] = None
+        table_res_list: Optional[List[Dict]] = None,
+        layout_det_res: Optional[Dict] = None,
+        use_cv_table_detection: bool = False
    ) -> List[Dict[str, Any]]:
        """
        Process parsing_res_list to extract all elements.
@@ -458,6 +339,8 @@ class PPStructureEnhanced:
            output_dir: Optional output directory
            source_image_path: Path to source image for cropping image regions
            table_res_list: Optional list of table results containing cell_box_list
+            layout_det_res: Optional layout detection result for Image-in-Table processing
+            use_cv_table_detection: If True, use CV line detection for wired tables

        Returns:
            List of processed elements with normalized structure
@@ -628,53 +511,55 @@ class PPStructureEnhanced:
                    logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
                    cell_boxes_extracted = True

-                # Supplement with direct SLANeXt call if PPStructureV3 didn't provide boxes
-                if not cell_boxes_extracted and source_image_path and bbox != [0, 0, 0, 0]:
-                    logger.info(f"[TABLE] No boxes from PPStructureV3, attempting SLANeXt extraction...")
-                    try:
-                        # Load source image and crop table region
-                        source_img = Image.open(source_image_path)
-                        source_array = np.array(source_img)
-
-                        # Crop table region (bbox is in original image coordinates)
-                        x1, y1, x2, y2 = [int(round(c)) for c in bbox]
-                        # Ensure coordinates are within image bounds
-                        h, w = source_array.shape[:2]
-                        x1, y1 = max(0, x1), max(0, y1)
-                        x2, y2 = min(w, x2), min(h, y2)
-
-                        if x2 > x1 and y2 > y1:
-                            table_crop = source_array[y1:y2, x1:x2]
-
-                            # Convert RGB to BGR for SLANeXt
-                            if len(table_crop.shape) == 3 and table_crop.shape[2] == 3:
-                                table_crop_bgr = table_crop[:, :, ::-1]
-                            else:
-                                table_crop_bgr = table_crop
-
-                            # Extract cell boxes using SLANeXt
-                            slanet_boxes = self._extract_cell_boxes_with_slanet(
-                                table_crop_bgr,
-                                bbox,  # Pass original bbox for coordinate offset
-                                is_wired=None  # Auto-detect
-                            )
-
-                            if slanet_boxes:
-                                element['cell_boxes'] = slanet_boxes
-                                element['cell_boxes_source'] = 'slanet'
-                                cell_boxes_extracted = True
-                                logger.info(f"[TABLE] SLANeXt extracted {len(slanet_boxes)} cell boxes")
-                        else:
-                            logger.warning(f"[TABLE] Invalid crop region: ({x1},{y1})-({x2},{y2})")
-
-                    except Exception as e:
-                        logger.error(f"[TABLE] SLANeXt extraction failed: {e}")
-
                if not cell_boxes_extracted:
                    logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")

-            # Special handling for images/figures
-            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
+                # 2.5 CV-based table line detection for wired tables
+                if use_cv_table_detection and source_image_path and source_image_path.exists():
+                    try:
+                        # Load image for CV processing
+                        cv_image = cv2.imread(str(source_image_path))
+                        if cv_image is not None:
+                            cv_detector = CVTableDetector()
+                            ml_cell_boxes = element.get('cell_boxes', [])
+
+                            # Detect cells using CV line detection
+                            cv_cells = cv_detector.detect_and_merge_with_ml(
+                                cv_image,
+                                bbox,  # Table bbox
+                                ml_cell_boxes
+                            )
+
+                            if cv_cells:
+                                # Apply scaling if needed
+                                if scaling_info and scaling_info.was_scaled:
+                                    cv_cells = [
+                                        [
+                                            c[0] * scaling_info.scale_x,
+                                            c[1] * scaling_info.scale_y,
+                                            c[2] * scaling_info.scale_x,
+                                            c[3] * scaling_info.scale_y
+                                        ]
+                                        for c in cv_cells
+                                    ]
+
+                                element['cell_boxes'] = cv_cells
+                                element['cell_boxes_source'] = 'cv_line_detection'
+                                logger.info(f"[TABLE] CV line detection found {len(cv_cells)} cells (ML had {len(ml_cell_boxes)})")
+                    except Exception as cv_error:
+                        logger.warning(f"[TABLE] CV line detection failed: {cv_error}")
+
+                # 3. Image-in-Table processing: detect and embed images located inside the table
+                if layout_det_res and source_image_path and output_dir:
+                    embedded_images = self._embed_images_in_table(
+                        element, bbox, layout_det_res, source_image_path, output_dir
+                    )
+                    if embedded_images:
+                        element['embedded_images'] = embedded_images
+                        logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
+
+            # Special handling for images/figures/stamps (visual elements that need cropping)
+            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
                # Save image if path provided
                if 'img_path' in item and output_dir:
                    saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])
@@ -704,6 +589,209 @@ class PPStructureEnhanced:

        return elements

+    def _embed_images_in_table(
+        self,
+        table_element: Dict[str, Any],
+        table_bbox: List[float],
+        layout_det_res: Dict,
+        source_image_path: Path,
+        output_dir: Path
+    ) -> List[Dict[str, Any]]:
+        """
+        Detect layout 'image' boxes lying inside a table and embed them in it.
+
+        Every box in ``layout_det_res['boxes']`` labelled ``image`` whose
+        coordinates fall within ``table_bbox`` (with a small pixel tolerance)
+        is cropped via ``self._crop_and_save_image``. For each successful crop
+        an info dict is collected and, when the table element carries
+        non-empty HTML containing ``</tbody>``, an extra row holding an
+        ``<img>`` tag is inserted before the last ``</tbody>``. HTML without a
+        ``</tbody>`` tag is left untouched.
+
+        Args:
+            table_element: The table element being processed. Its ``'html'``
+                entry is updated in place when an image row is inserted.
+            table_bbox: Table bounding box [x1, y1, x2, y2].
+            layout_det_res: Layout detection result; only its ``'boxes'`` list
+                (entries with ``'label'`` and ``'coordinate'``) is read.
+            source_image_path: Path to source image for cropping.
+            output_dir: Output directory for saving cropped images.
+
+        Returns:
+            List of embedded image info dicts with keys 'bbox', 'saved_path',
+            'relative_path', 'html_tag' and 'element_id'. Empty when nothing
+            was embedded; on error the images collected so far are returned.
+        """
+        embedded_images = []
+        tolerance = 5  # pixels of slack allowed around the table bbox
+
+        try:
+            # NOTE(review): table_bbox and the layout boxes are compared and
+            # cropped as-is, i.e. assumed to share one coordinate space -
+            # confirm this still holds when preprocessing scaled the image.
+            table_x1, table_y1, table_x2, table_y2 = table_bbox
+
+            for box in layout_det_res.get('boxes', []):
+                # A missing or None label is treated as "not an image" instead
+                # of raising and aborting the scan of the remaining boxes.
+                if str(box.get('label') or '').lower() != 'image':
+                    continue
+
+                # Get image bbox
+                img_coord = box.get('coordinate', [])
+                if len(img_coord) < 4:
+                    continue
+
+                img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
+
+                # Check if image is inside table (with some tolerance)
+                inside_table = (
+                    img_x1 >= table_x1 - tolerance and
+                    img_y1 >= table_y1 - tolerance and
+                    img_x2 <= table_x2 + tolerance and
+                    img_y2 <= table_y2 + tolerance
+                )
+                if not inside_table:
+                    continue
+
+                logger.info(f"[IMAGE-IN-TABLE] Found image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}] inside table")
+
+                # Crop and save the image; the id is derived from the bbox
+                img_element_id = f"img_in_table_{int(img_x1)}_{int(img_y1)}_{int(img_x2)}_{int(img_y2)}"
+                cropped_path = self._crop_and_save_image(
+                    source_image_path,
+                    [img_x1, img_y1, img_x2, img_y2],
+                    output_dir,
+                    img_element_id
+                )
+                if not cropped_path:
+                    continue
+
+                # Create relative path for HTML embedding
+                # NOTE(review): assumes the crop is written under an 'imgs'
+                # sub-directory - verify against _crop_and_save_image.
+                rel_path = f"imgs/{Path(cropped_path).name}"
+
+                # Create img tag similar to pp_demo
+                img_html = f'<div style="text-align: center;"><img src="{rel_path}" alt="Image" /></div>'
+
+                embedded_images.append({
+                    'bbox': [img_x1, img_y1, img_x2, img_y2],
+                    'saved_path': str(cropped_path),
+                    'relative_path': rel_path,
+                    'html_tag': img_html,
+                    'element_id': img_element_id
+                })
+
+                # Append the image as a new row before the LAST </tbody> only.
+                # str.replace() would rewrite every </tbody>, duplicating the
+                # row in tables with several (or nested) <tbody> sections.
+                original_html = table_element.get('html')
+                if original_html and '</tbody>' in original_html:
+                    head, closing_tag, tail = original_html.rpartition('</tbody>')
+                    image_row = (
+                        '<tr><td colspan="99" style="text-align:center;">'
+                        f'<img src="{rel_path}" alt="Embedded Image" /></td></tr>'
+                    )
+                    table_element['html'] = f"{head}{image_row}{closing_tag}{tail}"
+                    logger.info("[IMAGE-IN-TABLE] Embedded image into table HTML")
+
+        except Exception as e:
+            # Best effort: a failure here must not break table processing.
+            logger.error(f"[IMAGE-IN-TABLE] Error processing images in table: {e}")
+
+        return embedded_images
+
+    def _extract_standalone_images(
+        self,
+        layout_det_res: Dict,
+        table_bboxes: List[List[float]],
+        source_image_path: Path,
+        output_dir: Path,
+        current_page: int,
+        start_index: int,
+        scaling_info: Optional['ScalingInfo'] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Extract standalone images from layout_det_res that are NOT inside tables.
+
+        Every box in ``layout_det_res['boxes']`` labelled ``image`` that is not
+        contained (within a small pixel tolerance) in any of ``table_bboxes``
+        is cropped via ``self._crop_and_save_image`` and returned as an IMAGE
+        element. Boxes whose crop fails are skipped.
+
+        Args:
+            layout_det_res: Layout detection result; only its ``'boxes'`` list
+                (entries with ``'label'``, ``'coordinate'`` and optionally
+                ``'score'``) is read.
+            table_bboxes: Table bounding boxes; images inside any of them are
+                excluded. Entries with fewer than 4 values are ignored.
+            source_image_path: Path to source image for cropping.
+            output_dir: Output directory for saving cropped images.
+            current_page: Current page number, stored on each element and used
+                in its element id.
+            start_index: Starting index for element ids / ``'index'`` values.
+            scaling_info: Optional scaling info; when ``was_scaled`` is set the
+                image bbox is multiplied by ``scale_factor`` before cropping.
+
+        Returns:
+            List of standalone image elements. On error the elements collected
+            so far are returned and the exception is logged.
+        """
+        standalone_images = []
+        tolerance = 5  # pixels of slack allowed around each table bbox
+
+        try:
+            boxes = layout_det_res.get('boxes', [])
+            logger.info(f"[STANDALONE-IMAGE] Checking {len(boxes)} boxes for standalone images")
+
+            for box in boxes:
+                # A missing or None label is treated as "not an image" instead
+                # of raising and aborting the scan of the remaining boxes.
+                if str(box.get('label') or '').lower() != 'image':
+                    continue
+
+                # Get image bbox
+                img_coord = box.get('coordinate', [])
+                if len(img_coord) < 4:
+                    continue
+
+                img_x1, img_y1, img_x2, img_y2 = img_coord[:4]
+
+                # Check if image is inside any table (skip if so)
+                # NOTE(review): this test uses the unscaled layout coordinates;
+                # confirm table_bboxes are in the same coordinate space when
+                # scaling_info.was_scaled is set.
+                is_inside_table = any(
+                    len(table_bbox) >= 4 and
+                    img_x1 >= table_bbox[0] - tolerance and
+                    img_y1 >= table_bbox[1] - tolerance and
+                    img_x2 <= table_bbox[2] + tolerance and
+                    img_y2 <= table_bbox[3] + tolerance
+                    for table_bbox in table_bboxes
+                )
+                if is_inside_table:
+                    logger.debug(f"[STANDALONE-IMAGE] Image at [{int(img_x1)},{int(img_y1)}] is inside table, skipping")
+                    continue
+
+                # Scale bbox back to original coordinates if needed
+                # NOTE(review): a uniform scale_factor is applied here, whereas
+                # the CV cell-box path in _process_parsing_res_list uses
+                # scale_x / scale_y - confirm ScalingInfo exposes both and that
+                # they are meant to be equivalent.
+                if scaling_info and scaling_info.was_scaled:
+                    scale_factor = scaling_info.scale_factor
+                    img_x1 *= scale_factor
+                    img_y1 *= scale_factor
+                    img_x2 *= scale_factor
+                    img_y2 *= scale_factor
+                    logger.debug(f"[STANDALONE-IMAGE] Scaled bbox by {scale_factor:.3f}")
+
+                logger.info(f"[STANDALONE-IMAGE] Found standalone image at [{int(img_x1)},{int(img_y1)},{int(img_x2)},{int(img_y2)}]")
+
+                # Crop and save the image; ids continue after start_index and
+                # only advance for images that were actually saved.
+                element_idx = start_index + len(standalone_images)
+                img_element_id = f"standalone_img_{current_page}_{element_idx}"
+                cropped_path = self._crop_and_save_image(
+                    source_image_path,
+                    [img_x1, img_y1, img_x2, img_y2],
+                    output_dir,
+                    img_element_id
+                )
+                if not cropped_path:
+                    continue
+
+                standalone_images.append({
+                    'element_id': img_element_id,
+                    'type': ElementType.IMAGE,
+                    'original_type': 'image',
+                    'content': '',
+                    'page': current_page,
+                    'bbox': [img_x1, img_y1, img_x2, img_y2],
+                    'index': element_idx,
+                    'confidence': box.get('score', 1.0),
+                    'saved_path': cropped_path,
+                    'img_path': cropped_path,
+                    'source': 'layout_det_res'
+                })
+                logger.info(f"[STANDALONE-IMAGE] Extracted and saved: {cropped_path}")
+
+        except Exception as e:
+            # Best effort: log with traceback through the logging system
+            # (instead of printing to stderr) and return what was collected.
+            logger.exception(f"[STANDALONE-IMAGE] Error extracting standalone images: {e}")
+
+        return standalone_images
+
    def _process_markdown_fallback(
        self,
        page_result: Any,