fix: prevent text/table/image overlap by filtering text in all regions
Critical Fix for Overlapping Content: After fixing scale factors, overlapping became visible because text was being drawn on top of tables AND images. Previous code only filtered text inside tables, not images. Problem: 1. Text regions overlapped with table regions → duplicated content 2. Text regions overlapped with image regions → text on top of images 3. Old filter only checked tables from images_metadata 4. Old filter used simple point-in-bbox, couldn't handle polygons Solution: 1. Add _get_bbox_coords() helper: - Handles both polygon [[x,y],...] and rect [x1,y1,x2,y2] formats - Returns normalized [x_min, y_min, x_max, y_max] 2. Add _is_bbox_inside() with tolerance: - Uses _get_bbox_coords() for both inner and outer bbox - Checks if inner bbox is completely inside outer bbox - Supports 5px tolerance for edge cases 3. Add _filter_text_in_regions() (replaces old logic): - Filters text regions against ANY list of regions to avoid - Works with tables, images, or any other region type - Logs how many regions were filtered 4. Update generate_layout_pdf(): - Collect both table_regions and image_regions - Combine into regions_to_avoid list - Use new filter function instead of old inline logic Changes: - backend/app/services/pdf_generator_service.py: - Add Union to imports - Add _get_bbox_coords() helper (polygon + rect support) - Add _is_bbox_inside() (tolerance-based containment check) - Add _filter_text_in_regions() (generic region filter) - Replace old table-only filter with new multi-region filter - Filter text against both tables AND images Expected Results: ✓ No text drawn inside table regions ✓ No text drawn inside image regions ✓ Tables rendered as proper ReportLab tables ✓ Images rendered as embedded images ✓ No duplicate or overlapping content Additional: - Cleaned all Python cache files (__pycache__, *.pyc) - Cleaned test output directories - Cleaned uploads and results directories 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from reportlab.lib.pagesizes import A4, letter
|
from reportlab.lib.pagesizes import A4, letter
|
||||||
@@ -272,6 +272,68 @@ class PDFGeneratorService:
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
|
||||||
|
"""將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
|
||||||
|
try:
|
||||||
|
if isinstance(bbox[0], (list, tuple)):
|
||||||
|
# 處理多邊形 [[x, y], ...]
|
||||||
|
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||||
|
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||||
|
if not x_coords or not y_coords:
|
||||||
|
return None
|
||||||
|
return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
|
||||||
|
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
|
||||||
|
# 處理 [x1, y1, x2, y2]
|
||||||
|
return bbox[0], bbox[1], bbox[2], bbox[3]
|
||||||
|
else:
|
||||||
|
logger.warning(f"未知的 bbox 格式: {bbox}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"解析 bbox {bbox} 時出錯: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
|
||||||
|
"""
|
||||||
|
檢查 'inner_bbox' 是否在 'outer_bbox' 內部(帶有容錯)。
|
||||||
|
此版本可處理多邊形和矩形。
|
||||||
|
"""
|
||||||
|
inner_coords = self._get_bbox_coords(inner_bbox_data.get('bbox'))
|
||||||
|
outer_coords = self._get_bbox_coords(outer_bbox_data.get('bbox'))
|
||||||
|
|
||||||
|
if not inner_coords or not outer_coords:
|
||||||
|
return False
|
||||||
|
|
||||||
|
inner_x1, inner_y1, inner_x2, inner_y2 = inner_coords
|
||||||
|
outer_x1, outer_y1, outer_x2, outer_y2 = outer_coords
|
||||||
|
|
||||||
|
# 檢查 inner 是否在 outer 內部 (加入 tolerance)
|
||||||
|
is_inside = (
|
||||||
|
(inner_x1 >= outer_x1 - tolerance) and
|
||||||
|
(inner_y1 >= outer_y1 - tolerance) and
|
||||||
|
(inner_x2 <= outer_x2 + tolerance) and
|
||||||
|
(inner_y2 <= outer_y2 + tolerance)
|
||||||
|
)
|
||||||
|
return is_inside
|
||||||
|
|
||||||
|
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
過濾掉位於 'regions_to_avoid'(例如表格、圖片)內部的文字區域。
|
||||||
|
"""
|
||||||
|
filtered_text = []
|
||||||
|
for text_region in text_regions:
|
||||||
|
is_inside_any_avoid_region = False
|
||||||
|
for avoid_region in regions_to_avoid:
|
||||||
|
if self._is_bbox_inside(text_region, avoid_region):
|
||||||
|
is_inside_any_avoid_region = True
|
||||||
|
logger.debug(f"過濾掉文字: {text_region.get('text', '')[:20]}...")
|
||||||
|
break # 找到一個包含它的區域就足夠了
|
||||||
|
|
||||||
|
if not is_inside_any_avoid_region:
|
||||||
|
filtered_text.append(text_region)
|
||||||
|
|
||||||
|
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}")
|
||||||
|
return filtered_text
|
||||||
|
|
||||||
def draw_text_region(
|
def draw_text_region(
|
||||||
self,
|
self,
|
||||||
pdf_canvas: canvas.Canvas,
|
pdf_canvas: canvas.Canvas,
|
||||||
@@ -629,40 +691,15 @@ class PDFGeneratorService:
|
|||||||
# Create PDF canvas with target dimensions
|
# Create PDF canvas with target dimensions
|
||||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||||
|
|
||||||
# Extract table bboxes to exclude text in those regions
|
# *** 關鍵修復:收集所有需要避免的區域(表格 + 圖片)***
|
||||||
table_bboxes = []
|
table_regions = ocr_data.get('tables', [])
|
||||||
for img_meta in images_metadata:
|
image_regions = ocr_data.get('image_regions', [])
|
||||||
img_path = img_meta.get('image_path', '')
|
|
||||||
if 'table' in img_path.lower():
|
|
||||||
bbox = img_meta.get('bbox', [])
|
|
||||||
if bbox and len(bbox) >= 4:
|
|
||||||
table_bboxes.append(bbox)
|
|
||||||
|
|
||||||
# Helper function to check if a point is inside a bbox
|
# 建立一個包含「所有」要避免的區域的列表
|
||||||
def point_in_bbox(x, y, bbox):
|
regions_to_avoid = table_regions + image_regions
|
||||||
x1, y1 = bbox[0]
|
|
||||||
x2, y2 = bbox[2]
|
|
||||||
return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
|
|
||||||
|
|
||||||
# Filter text regions to exclude those inside tables
|
# 使用新的過濾函式過濾文字區域
|
||||||
filtered_text_regions = []
|
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
|
||||||
for region in text_regions:
|
|
||||||
bbox = region.get('bbox', [])
|
|
||||||
if not bbox or len(bbox) < 4:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if text region center is inside any table bbox
|
|
||||||
center_x = (bbox[0][0] + bbox[2][0]) / 2
|
|
||||||
center_y = (bbox[0][1] + bbox[2][1]) / 2
|
|
||||||
|
|
||||||
is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)
|
|
||||||
|
|
||||||
if not is_in_table:
|
|
||||||
filtered_text_regions.append(region)
|
|
||||||
else:
|
|
||||||
logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
|
|
||||||
|
|
||||||
logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")
|
|
||||||
|
|
||||||
# Group regions by page
|
# Group regions by page
|
||||||
pages_data = {}
|
pages_data = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user