fix: prevent text/table/image overlap by filtering text in all regions

Critical Fix for Overlapping Content:
After fixing scale factors, overlapping became visible because text was
being drawn on top of tables AND images. Previous code only filtered
text inside tables, not images.

Problem:
1. Text regions overlapped with table regions → duplicated content
2. Text regions overlapped with image regions → text on top of images
3. Old filter only checked tables from images_metadata
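4. Old filter used a simple center-point-in-bbox test that only read
   points 0 and 2 of a point-list bbox, so it couldn't handle other
   bbox layouts (flat [x1,y1,x2,y2] rects, arbitrary polygons)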

Solution:
1. Add _get_bbox_coords() helper:
   - Handles both polygon [[x,y],...] and rect [x1,y1,x2,y2] formats
   - Returns normalized [x_min, y_min, x_max, y_max]

2. Add _is_bbox_inside() with tolerance:
   - Uses _get_bbox_coords() for both inner and outer bbox
   - Checks if inner bbox is completely inside outer bbox
   - Supports 5px tolerance for edge cases

3. Add _filter_text_in_regions() (replaces old logic):
   - Filters text regions against ANY list of regions to avoid
   - Works with tables, images, or any other region type
   - Logs how many regions were filtered

4. Update generate_layout_pdf():
   - Collect both table_regions and image_regions
   - Combine into regions_to_avoid list
   - Use new filter function instead of old inline logic

Changes:
- backend/app/services/pdf_generator_service.py:
  - Add Union to imports
  - Add _get_bbox_coords() helper (polygon + rect support)
  - Add _is_bbox_inside() (tolerance-based containment check)
  - Add _filter_text_in_regions() (generic region filter)
  - Replace old table-only filter with new multi-region filter
  - Filter text against both tables AND images

Expected Results:
✓ No text drawn inside table regions
✓ No text drawn inside image regions
✓ Tables rendered as proper ReportLab tables
✓ Images rendered as embedded images
✓ No duplicate or overlapping content

Additional:
- Cleaned all Python cache files (__pycache__, *.pyc)
- Cleaned test output directories
- Cleaned uploads and results directories

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-18 08:16:19 +08:00
parent e839d68160
commit 92e326b3a3

View File

@@ -6,7 +6,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime
from reportlab.lib.pagesizes import A4, letter
@@ -272,6 +272,68 @@ class PDFGeneratorService:
return None
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
"""將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
try:
if isinstance(bbox[0], (list, tuple)):
# 處理多邊形 [[x, y], ...]
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
if not x_coords or not y_coords:
return None
return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
# 處理 [x1, y1, x2, y2]
return bbox[0], bbox[1], bbox[2], bbox[3]
else:
logger.warning(f"未知的 bbox 格式: {bbox}")
return None
except Exception as e:
logger.error(f"解析 bbox {bbox} 時出錯: {e}")
return None
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
"""
檢查 'inner_bbox' 是否在 'outer_bbox' 內部(帶有容錯)。
此版本可處理多邊形和矩形。
"""
inner_coords = self._get_bbox_coords(inner_bbox_data.get('bbox'))
outer_coords = self._get_bbox_coords(outer_bbox_data.get('bbox'))
if not inner_coords or not outer_coords:
return False
inner_x1, inner_y1, inner_x2, inner_y2 = inner_coords
outer_x1, outer_y1, outer_x2, outer_y2 = outer_coords
# 檢查 inner 是否在 outer 內部 (加入 tolerance)
is_inside = (
(inner_x1 >= outer_x1 - tolerance) and
(inner_y1 >= outer_y1 - tolerance) and
(inner_x2 <= outer_x2 + tolerance) and
(inner_y2 <= outer_y2 + tolerance)
)
return is_inside
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
    """Drop text regions that sit inside any of the regions to avoid.

    Args:
        text_regions: Candidate text region dicts.
        regions_to_avoid: Region dicts (e.g. tables, images) that text
            must not be drawn inside.

    Returns:
        A new list holding, in their original order, the text regions for
        which ``self._is_bbox_inside`` is false against every region in
        ``regions_to_avoid``.
    """
    kept = []
    for candidate in text_regions:
        # any() stops at the first containing region, so each dropped
        # text region is logged exactly once.
        covered = any(
            self._is_bbox_inside(candidate, blocker)
            for blocker in regions_to_avoid
        )
        if covered:
            logger.debug(f"過濾掉文字: {candidate.get('text', '')[:20]}...")
            continue
        kept.append(candidate)

    logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(kept)}")
    return kept
def draw_text_region(
self,
pdf_canvas: canvas.Canvas,
@@ -629,40 +691,15 @@ class PDFGeneratorService:
# Create PDF canvas with target dimensions
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# Extract table bboxes to exclude text in those regions
table_bboxes = []
for img_meta in images_metadata:
img_path = img_meta.get('image_path', '')
if 'table' in img_path.lower():
bbox = img_meta.get('bbox', [])
if bbox and len(bbox) >= 4:
table_bboxes.append(bbox)
# *** 關鍵修復:收集所有需要避免的區域(表格 + 圖片)***
table_regions = ocr_data.get('tables', [])
image_regions = ocr_data.get('image_regions', [])
# Helper function to check if a point is inside a bbox
def point_in_bbox(x, y, bbox):
x1, y1 = bbox[0]
x2, y2 = bbox[2]
return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
# 建立一個包含「所有」要避免的區域的列表
regions_to_avoid = table_regions + image_regions
# Filter text regions to exclude those inside tables
filtered_text_regions = []
for region in text_regions:
bbox = region.get('bbox', [])
if not bbox or len(bbox) < 4:
continue
# Check if text region center is inside any table bbox
center_x = (bbox[0][0] + bbox[2][0]) / 2
center_y = (bbox[0][1] + bbox[2][1]) / 2
is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)
if not is_in_table:
filtered_text_regions.append(region)
else:
logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")
# 使用新的過濾函式過濾文字區域
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
# Group regions by page
pages_data = {}