fix: prevent text/table/image overlap by filtering text in all regions

Critical Fix for Overlapping Content: After fixing scale factors, overlapping became visible because text was being drawn on top of tables AND images. Previous code only filtered text inside tables, not images. Problem: 1. Text regions overlapped with table regions → duplicated content 2. Text regions overlapped with image regions → text on top of images 3. Old filter only checked tables from images_metadata 4. Old filter used simple point-in-bbox, couldn't handle polygons Solution: 1. Add _get_bbox_coords() helper: - Handles both polygon [[x,y],...] and rect [x1,y1,x2,y2] formats - Returns normalized [x_min, y_min, x_max, y_max] 2. Add _is_bbox_inside() with tolerance: - Uses _get_bbox_coords() for both inner and outer bbox - Checks if inner bbox is completely inside outer bbox - Supports 5px tolerance for edge cases 3. Add _filter_text_in_regions() (replaces old logic): - Filters text regions against ANY list of regions to avoid - Works with tables, images, or any other region type - Logs how many regions were filtered 4. 
Update generate_layout_pdf(): - Collect both table_regions and image_regions - Combine into regions_to_avoid list - Use new filter function instead of old inline logic Changes: - backend/app/services/pdf_generator_service.py: - Add Union to imports - Add _get_bbox_coords() helper (polygon + rect support) - Add _is_bbox_inside() (tolerance-based containment check) - Add _filter_text_in_regions() (generic region filter) - Replace old table-only filter with new multi-region filter - Filter text against both tables AND images Expected Results: ✓ No text drawn inside table regions ✓ No text drawn inside image regions ✓ Tables rendered as proper ReportLab tables ✓ Images rendered as embedded images ✓ No duplicate or overlapping content Additional: - Cleaned all Python cache files (__pycache__, *.pyc) - Cleaned test output directories - Cleaned uploads and results directories 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 08:16:19 +08:00
parent e839d68160
commit 92e326b3a3
1 changed files with 70 additions and 33 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -6,7 +6,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime
 from reportlab.lib.pagesizes import A4, letter
@@ -272,6 +272,68 @@ class PDFGeneratorService:
        return None
    def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
        """
        Convert a bbox in either supported format to (x_min, y_min, x_max, y_max).

        Args:
            bbox: Either a polygon ``[[x, y], ...]`` or a rectangle
                ``[x1, y1, x2, y2]``.

        Returns:
            The axis-aligned bounds as a 4-tuple, or None when ``bbox`` is
            missing, empty, or not in a recognised format.
        """
        try:
            # A missing or empty bbox means "no geometry": return None quietly
            # instead of reporting it as a parse error.
            if bbox is None or len(bbox) == 0:
                return None

            first = bbox[0]
            if isinstance(first, (list, tuple)):
                # Polygon [[x, y], ...]: take the bounds over every point that
                # has at least two components; malformed points are ignored.
                points = [p for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
                if not points:
                    return None
                x_coords = [p[0] for p in points]
                y_coords = [p[1] for p in points]
                return min(x_coords), min(y_coords), max(x_coords), max(y_coords)

            if isinstance(first, (int, float)) and len(bbox) == 4:
                # Rectangle [x1, y1, x2, y2]: order each axis so the result is
                # always (min, min, max, max), even if the corners are swapped.
                x1, y1, x2, y2 = bbox
                return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)

            logger.warning(f"未知的 bbox 格式: {bbox}")
            return None
        except (TypeError, IndexError, KeyError, ValueError) as e:
            # Non-sequence input, non-comparable coordinates, etc.
            logger.error(f"解析 bbox {bbox} 時出錯: {e}")
            return None
    def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
        """
        檢查 'inner_bbox' 是否在 'outer_bbox' 內部（帶有容錯）。
        此版本可處理多邊形和矩形。
        """
        inner_coords = self._get_bbox_coords(inner_bbox_data.get('bbox'))
        outer_coords = self._get_bbox_coords(outer_bbox_data.get('bbox'))
        if not inner_coords or not outer_coords:
            return False
        inner_x1, inner_y1, inner_x2, inner_y2 = inner_coords
        outer_x1, outer_y1, outer_x2, outer_y2 = outer_coords
        # 檢查 inner 是否在 outer 內部 (加入 tolerance)
        is_inside = (
            (inner_x1 >= outer_x1 - tolerance) and
            (inner_y1 >= outer_y1 - tolerance) and
            (inner_x2 <= outer_x2 + tolerance) and
            (inner_y2 <= outer_y2 + tolerance)
        )
        return is_inside
    def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
        """
        過濾掉位於 'regions_to_avoid'（例如表格、圖片）內部的文字區域。
        """
        filtered_text = []
        for text_region in text_regions:
            is_inside_any_avoid_region = False
            for avoid_region in regions_to_avoid:
                if self._is_bbox_inside(text_region, avoid_region):
                    is_inside_any_avoid_region = True
                    logger.debug(f"過濾掉文字: {text_region.get('text', '')[:20]}...")
                    break  # 找到一個包含它的區域就足夠了
            if not is_inside_any_avoid_region:
                filtered_text.append(text_region)
        logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}")
        return filtered_text
    def draw_text_region(
        self,
        pdf_canvas: canvas.Canvas,
@@ -629,40 +691,15 @@ class PDFGeneratorService:
            # Create PDF canvas with target dimensions
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
-            # Extract table bboxes to exclude text in those regions
+            # *** 關鍵修復：收集所有需要避免的區域（表格 + 圖片）***
-            table_bboxes = []
+            table_regions = ocr_data.get('tables', [])
-            for img_meta in images_metadata:
+            image_regions = ocr_data.get('image_regions', [])
                img_path = img_meta.get('image_path', '')
                if 'table' in img_path.lower():
                    bbox = img_meta.get('bbox', [])
                    if bbox and len(bbox) >= 4:
                        table_bboxes.append(bbox)
-            # Helper function to check if a point is inside a bbox
+            # 建立一個包含「所有」要避免的區域的列表
-            def point_in_bbox(x, y, bbox):
+            regions_to_avoid = table_regions + image_regions
                x1, y1 = bbox[0]
                x2, y2 = bbox[2]
                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
-            # Filter text regions to exclude those inside tables
+            # 使用新的過濾函式過濾文字區域
-            filtered_text_regions = []
+            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
            for region in text_regions:
                bbox = region.get('bbox', [])
                if not bbox or len(bbox) < 4:
                    continue
                # Check if text region center is inside any table bbox
                center_x = (bbox[0][0] + bbox[2][0]) / 2
                center_y = (bbox[0][1] + bbox[2][1]) / 2
                is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)
                if not is_in_table:
                    filtered_text_regions.append(region)
                else:
                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
            logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")
            # Group regions by page
            pages_data = {}