fix: prevent text/table/image overlap by filtering text in all regions

Critical Fix for Overlapping Content: After fixing scale factors, overlapping became visible because text was being drawn on top of tables AND images. Previous code only filtered text inside tables, not images. Problem: 1. Text regions overlapped with table regions → duplicated content 2. Text regions overlapped with image regions → text on top of images 3. Old filter only checked tables from images_metadata 4. Old filter used simple point-in-bbox, couldn't handle polygons Solution: 1. Add _get_bbox_coords() helper: - Handles both polygon [[x,y],...] and rect [x1,y1,x2,y2] formats - Returns normalized [x_min, y_min, x_max, y_max] 2. Add _is_bbox_inside() with tolerance: - Uses _get_bbox_coords() for both inner and outer bbox - Checks if inner bbox is completely inside outer bbox - Supports 5px tolerance for edge cases 3. Add _filter_text_in_regions() (replaces old logic): - Filters text regions against ANY list of regions to avoid - Works with tables, images, or any other region type - Logs how many regions were filtered 4. Update generate_layout_pdf(): - Collect both table_regions and image_regions - Combine into regions_to_avoid list - Use new filter function instead of old inline logic Changes: - backend/app/services/pdf_generator_service.py: - Add Union to imports - Add _get_bbox_coords() helper (polygon + rect support) - Add _is_bbox_inside() (tolerance-based containment check) - Add _filter_text_in_regions() (generic region filter) - Replace old table-only filter with new multi-region filter - Filter text against both tables AND images Expected Results: ✓ No text drawn inside table regions ✓ No text drawn inside image regions ✓ Tables rendered as proper ReportLab tables ✓ Images rendered as embedded images ✓ No duplicate or overlapping content Additional: - Cleaned all Python cache files (__pycache__, *.pyc) - Cleaned test output directories - Cleaned uploads and results directories 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 08:16:19 +08:00
parent e839d68160
commit 92e326b3a3
1 changed files with 70 additions and 33 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -6,7 +6,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime

 from reportlab.lib.pagesizes import A4, letter
@@ -272,6 +272,68 @@ class PDFGeneratorService:

        return None

+    def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
+        """將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
+        try:
+            if isinstance(bbox[0], (list, tuple)):
+                # 處理多邊形 [[x, y], ...]
+                x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
+                y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
+                if not x_coords or not y_coords:
+                    return None
+                return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
+            elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
+                # 處理 [x1, y1, x2, y2]
+                return bbox[0], bbox[1], bbox[2], bbox[3]
+            else:
+                logger.warning(f"未知的 bbox 格式: {bbox}")
+                return None
+        except Exception as e:
+            logger.error(f"解析 bbox {bbox} 時出錯: {e}")
+            return None
+
+    def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
+        """
+        檢查 'inner_bbox' 是否在 'outer_bbox' 內部（帶有容錯）。
+        此版本可處理多邊形和矩形。
+        """
+        inner_coords = self._get_bbox_coords(inner_bbox_data.get('bbox'))
+        outer_coords = self._get_bbox_coords(outer_bbox_data.get('bbox'))
+
+        if not inner_coords or not outer_coords:
+            return False
+
+        inner_x1, inner_y1, inner_x2, inner_y2 = inner_coords
+        outer_x1, outer_y1, outer_x2, outer_y2 = outer_coords
+
+        # 檢查 inner 是否在 outer 內部 (加入 tolerance)
+        is_inside = (
+            (inner_x1 >= outer_x1 - tolerance) and
+            (inner_y1 >= outer_y1 - tolerance) and
+            (inner_x2 <= outer_x2 + tolerance) and
+            (inner_y2 <= outer_y2 + tolerance)
+        )
+        return is_inside
+
+    def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
+        """
+        過濾掉位於 'regions_to_avoid'（例如表格、圖片）內部的文字區域。
+        """
+        filtered_text = []
+        for text_region in text_regions:
+            is_inside_any_avoid_region = False
+            for avoid_region in regions_to_avoid:
+                if self._is_bbox_inside(text_region, avoid_region):
+                    is_inside_any_avoid_region = True
+                    logger.debug(f"過濾掉文字: {text_region.get('text', '')[:20]}...")
+                    break  # 找到一個包含它的區域就足夠了
+
+            if not is_inside_any_avoid_region:
+                filtered_text.append(text_region)
+
+        logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}")
+        return filtered_text
+
    def draw_text_region(
        self,
        pdf_canvas: canvas.Canvas,
@@ -629,40 +691,15 @@ class PDFGeneratorService:
            # Create PDF canvas with target dimensions
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

-            # Extract table bboxes to exclude text in those regions
-            table_bboxes = []
-            for img_meta in images_metadata:
-                img_path = img_meta.get('image_path', '')
-                if 'table' in img_path.lower():
-                    bbox = img_meta.get('bbox', [])
-                    if bbox and len(bbox) >= 4:
-                        table_bboxes.append(bbox)
+            # *** 關鍵修復：收集所有需要避免的區域（表格 + 圖片）***
+            table_regions = ocr_data.get('tables', [])
+            image_regions = ocr_data.get('image_regions', [])

-            # Helper function to check if a point is inside a bbox
-            def point_in_bbox(x, y, bbox):
-                x1, y1 = bbox[0]
-                x2, y2 = bbox[2]
-                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
+            # 建立一個包含「所有」要避免的區域的列表
+            regions_to_avoid = table_regions + image_regions

-            # Filter text regions to exclude those inside tables
-            filtered_text_regions = []
-            for region in text_regions:
-                bbox = region.get('bbox', [])
-                if not bbox or len(bbox) < 4:
-                    continue
-
-                # Check if text region center is inside any table bbox
-                center_x = (bbox[0][0] + bbox[2][0]) / 2
-                center_y = (bbox[0][1] + bbox[2][1]) / 2
-
-                is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)
-
-                if not is_in_table:
-                    filtered_text_regions.append(region)
-                else:
-                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
-
-            logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")
+            # 使用新的過濾函式過濾文字區域
+            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)

            # Group regions by page
            pages_data = {}