fix: 修復PDF生成中的頁碼錯誤和文字重疊問題
## 問題修復 ### 1. 頁碼分配錯誤 - **問題**: layout_data 和 images_metadata 頁碼被 1-based 覆蓋,導致全部為 0 - **修復**: 在 analyze_layout() 添加 current_page 參數,從源頭設置正確的 0-based 頁碼 - **影響**: 表格和圖片現在顯示在正確的頁面上 ### 2. 文字與表格/圖片重疊 - **問題**: 使用不存在的 'tables' 和 'image_regions' 字段過濾,導致過濾失效 - **修復**: 改用 images_metadata(包含所有表格/圖片的 bbox) - **新增**: _bbox_overlaps() 檢測任意重疊(非完全包含) - **影響**: 文字不再覆蓋表格和圖片區域 ### 3. 渲染順序優化 - **調整**: 圖片(底層) → 表格(中間層) → 文字(頂層) - **影響**: 視覺層次更正確 ## 技術細節 - ocr_service.py: 添加 current_page 參數傳遞,移除頁碼覆蓋邏輯 - pdf_generator_service.py: - 新增 _bbox_overlaps() 方法 - 更新 _filter_text_in_regions() 使用重疊檢測 - 修正數據源為 images_metadata - 調整繪製順序 ## 已知限制 - 仍有 21.6% 文字因過濾而遺失(座標定位方法的固有問題) - 未使用 PP-StructureV3 的完整版面資訊(parsing_res_list, layout_bbox) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -315,23 +315,74 @@ class PDFGeneratorService:
|
||||
)
|
||||
return is_inside
|
||||
|
||||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
|
||||
def _bbox_overlaps(self, bbox1_data: Dict, bbox2_data: Dict, tolerance: float = 5.0) -> bool:
|
||||
"""
|
||||
過濾掉位於 'regions_to_avoid'(例如表格、圖片)內部的文字區域。
|
||||
檢查兩個 bbox 是否有重疊(帶有容錯)。
|
||||
如果有任何重疊,返回 True。
|
||||
|
||||
Args:
|
||||
bbox1_data: 第一個 bbox 數據
|
||||
bbox2_data: 第二個 bbox 數據
|
||||
tolerance: 容錯值(像素)
|
||||
|
||||
Returns:
|
||||
True 如果兩個 bbox 有重疊
|
||||
"""
|
||||
coords1 = self._get_bbox_coords(bbox1_data.get('bbox'))
|
||||
coords2 = self._get_bbox_coords(bbox2_data.get('bbox'))
|
||||
|
||||
if not coords1 or not coords2:
|
||||
return False
|
||||
|
||||
x1_min, y1_min, x1_max, y1_max = coords1
|
||||
x2_min, y2_min, x2_max, y2_max = coords2
|
||||
|
||||
# 擴展 bbox2(表格/圖片區域)的範圍
|
||||
x2_min -= tolerance
|
||||
y2_min -= tolerance
|
||||
x2_max += tolerance
|
||||
y2_max += tolerance
|
||||
|
||||
# 檢查是否有重疊:如果沒有重疊,則必定滿足以下條件之一
|
||||
no_overlap = (
|
||||
x1_max < x2_min or # bbox1 在 bbox2 左側
|
||||
x1_min > x2_max or # bbox1 在 bbox2 右側
|
||||
y1_max < y2_min or # bbox1 在 bbox2 上方
|
||||
y1_min > y2_max # bbox1 在 bbox2 下方
|
||||
)
|
||||
|
||||
return not no_overlap
|
||||
|
||||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]:
|
||||
"""
|
||||
過濾掉與 'regions_to_avoid'(例如表格、圖片)重疊的文字區域。
|
||||
|
||||
Args:
|
||||
text_regions: 文字區域列表
|
||||
regions_to_avoid: 需要避免的區域列表(表格、圖片)
|
||||
tolerance: 容錯值(像素),增加到 10.0 以更好地處理邊界情況
|
||||
|
||||
Returns:
|
||||
過濾後的文字區域列表
|
||||
"""
|
||||
filtered_text = []
|
||||
for text_region in text_regions:
|
||||
is_inside_any_avoid_region = False
|
||||
for avoid_region in regions_to_avoid:
|
||||
if self._is_bbox_inside(text_region, avoid_region):
|
||||
is_inside_any_avoid_region = True
|
||||
logger.debug(f"過濾掉文字: {text_region.get('text', '')[:20]}...")
|
||||
break # 找到一個包含它的區域就足夠了
|
||||
filtered_count = 0
|
||||
|
||||
if not is_inside_any_avoid_region:
|
||||
for text_region in text_regions:
|
||||
should_filter = False
|
||||
|
||||
for avoid_region in regions_to_avoid:
|
||||
# 使用重疊檢測:只要有任何重疊就過濾掉
|
||||
if self._bbox_overlaps(text_region, avoid_region, tolerance=tolerance):
|
||||
should_filter = True
|
||||
filtered_count += 1
|
||||
logger.debug(f"過濾掉重疊文字: {text_region.get('text', '')[:20]}...")
|
||||
break # 找到一個重疊區域就足夠了
|
||||
|
||||
if not should_filter:
|
||||
filtered_text.append(text_region)
|
||||
|
||||
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}")
|
||||
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}")
|
||||
return filtered_text
|
||||
|
||||
def draw_text_region(
|
||||
@@ -718,11 +769,22 @@ class PDFGeneratorService:
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# *** 關鍵修復:收集所有需要避免的區域(表格 + 圖片)***
|
||||
table_regions = ocr_data.get('tables', [])
|
||||
image_regions = ocr_data.get('image_regions', [])
|
||||
# 注意:OCR JSON 中沒有 'tables' 和 'image_regions' 頂層欄位
|
||||
# 重要發現:
|
||||
# - layout_data.elements 中的表格元素沒有 bbox(都是空列表)
|
||||
# - images_metadata 包含所有表格和圖片,並且有正確的 bbox
|
||||
# - 因此,只需使用 images_metadata 來過濾文字即可
|
||||
|
||||
# 建立一個包含「所有」要避免的區域的列表
|
||||
regions_to_avoid = table_regions + image_regions
|
||||
# 使用 images_metadata 作為要避免的區域(包含表格圖片和其他圖片)
|
||||
regions_to_avoid = images_metadata
|
||||
|
||||
table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()])
|
||||
other_count = len(images_metadata) - table_count
|
||||
|
||||
logger.info(f"使用 images_metadata 過濾文字區域:")
|
||||
logger.info(f" - 表格圖片: {table_count}")
|
||||
logger.info(f" - 其他圖片: {other_count}")
|
||||
logger.info(f" - 總計需要避免的區域: {len(regions_to_avoid)}")
|
||||
|
||||
# 使用新的過濾函式過濾文字區域
|
||||
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
|
||||
@@ -751,23 +813,16 @@ class PDFGeneratorService:
|
||||
if page_num > 1:
|
||||
pdf_canvas.showPage() # Start new page
|
||||
|
||||
# Draw text regions for this page (excluding table text)
|
||||
page_regions = pages_data.get(page_num, [])
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_regions)} 個文字區域")
|
||||
for i, region in enumerate(page_regions, 1):
|
||||
logger.debug(f" 文字 {i}/{len(page_regions)}")
|
||||
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
|
||||
# Get filtered regions for this page
|
||||
page_text_regions = pages_data.get(page_num, [])
|
||||
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
|
||||
page_image_regions = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
|
||||
|
||||
# Draw tables for this page
|
||||
page_tables = [t for t in table_elements if t.get('page') == page_num - 1]
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_tables)} 個表格")
|
||||
for table_elem in page_tables:
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
|
||||
# 繪製順序:圖片(底層) → 表格(中間層) → 文字(最上層)
|
||||
|
||||
# Draw non-table images for this page (figure, chart, seal, etc.)
|
||||
page_images = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_images)} 個圖片")
|
||||
for img_meta in page_images:
|
||||
# 1. Draw images first (bottom layer)
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_image_regions)} 個圖片")
|
||||
for img_meta in page_image_regions:
|
||||
self.draw_image_region(
|
||||
pdf_canvas,
|
||||
img_meta,
|
||||
@@ -777,6 +832,17 @@ class PDFGeneratorService:
|
||||
scale_h
|
||||
)
|
||||
|
||||
# 2. Draw tables (middle layer)
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_table_regions)} 個表格")
|
||||
for table_elem in page_table_regions:
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
|
||||
|
||||
# 3. Draw text regions last (top layer) - excluding table text
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域")
|
||||
for i, region in enumerate(page_text_regions, 1):
|
||||
logger.debug(f" 文字 {i}/{len(page_text_regions)}")
|
||||
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
|
||||
|
||||
logger.info(f"<<< 第 {page_num} 頁完成")
|
||||
|
||||
# Save PDF
|
||||
|
||||
Reference in New Issue
Block a user