fix: 修復PDF生成中的頁碼錯誤和文字重疊問題
## 問題修復

### 1. 頁碼分配錯誤
- **問題**: layout_data 和 images_metadata 頁碼被 1-based 覆蓋,導致全部為 0
- **修復**: 在 analyze_layout() 添加 current_page 參數,從源頭設置正確的 0-based 頁碼
- **影響**: 表格和圖片現在顯示在正確的頁面上

### 2. 文字與表格/圖片重疊
- **問題**: 使用不存在的 'tables' 和 'image_regions' 字段過濾,導致過濾失效
- **修復**: 改用 images_metadata(包含所有表格/圖片的 bbox)
- **新增**: _bbox_overlaps() 檢測任意重疊(非完全包含)
- **影響**: 文字不再覆蓋表格和圖片區域

### 3. 渲染順序優化
- **調整**: 圖片(底層) → 表格(中間層) → 文字(頂層)
- **影響**: 視覺層次更正確

## 技術細節
- ocr_service.py: 添加 current_page 參數傳遞,移除頁碼覆蓋邏輯
- pdf_generator_service.py:
  - 新增 _bbox_overlaps() 方法
  - 更新 _filter_text_in_regions() 使用重疊檢測
  - 修正數據源為 images_metadata
  - 調整繪製順序

## 已知限制
- 仍有 21.6% 文字因過濾而遺失(座標定位方法的固有問題)
- 未使用 PP-StructureV3 的完整版面資訊(parsing_res_list, layout_bbox)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -285,7 +285,8 @@ class OCRService:
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -295,6 +296,8 @@ class OCRService:
|
||||
lang: Language for OCR
|
||||
detect_layout: Whether to perform layout analysis
|
||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
||||
output_dir: Optional output directory for saving extracted images
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
@@ -337,13 +340,14 @@ class OCRService:
|
||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||
|
||||
# Process each page
|
||||
# Process each page with correct page number (0-based for layout data)
|
||||
page_result = self.process_image(
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir
|
||||
output_dir=output_dir,
|
||||
current_page=page_num - 1 # Convert to 0-based page number for layout data
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -356,19 +360,13 @@ class OCRService:
|
||||
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
|
||||
total_valid_regions += page_result['total_text_regions']
|
||||
|
||||
# Accumulate layout data and update page numbers
|
||||
# Accumulate layout data (page numbers already set correctly in analyze_layout)
|
||||
if page_result.get('layout_data'):
|
||||
layout_data = page_result['layout_data']
|
||||
# Update page number for all layout elements
|
||||
if layout_data.get('elements'):
|
||||
for element in layout_data['elements']:
|
||||
element['page'] = page_num
|
||||
all_layout_data.append(layout_data)
|
||||
|
||||
# Accumulate images metadata and update page numbers
|
||||
# Accumulate images metadata (page numbers already set correctly in analyze_layout)
|
||||
if page_result.get('images_metadata'):
|
||||
for img_meta in page_result['images_metadata']:
|
||||
img_meta['page'] = page_num # Update page number for multi-page PDFs
|
||||
all_images_metadata.extend(page_result['images_metadata'])
|
||||
|
||||
# Store OCR dimensions for each page
|
||||
@@ -483,7 +481,8 @@ class OCRService:
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)
|
||||
# Pass current_page to analyze_layout for correct page numbering
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir, current_page=current_page)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
@@ -587,13 +586,14 @@ class OCRService:
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -633,7 +633,7 @@ class OCRService:
|
||||
'element_id': len(layout_elements),
|
||||
'type': 'table' if has_table else 'text',
|
||||
'content': markdown_texts,
|
||||
'page': page_idx,
|
||||
'page': current_page, # Use current_page parameter instead of page_idx
|
||||
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
|
||||
}
|
||||
|
||||
@@ -687,7 +687,7 @@ class OCRService:
|
||||
'element_id': len(layout_elements) + img_idx,
|
||||
'image_path': img_path,
|
||||
'type': 'image',
|
||||
'page': page_idx,
|
||||
'page': current_page, # Use current_page parameter instead of page_idx
|
||||
'bbox': bbox,
|
||||
})
|
||||
|
||||
|
||||
@@ -315,23 +315,74 @@ class PDFGeneratorService:
|
||||
)
|
||||
return is_inside
|
||||
|
||||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict]) -> List[Dict]:
|
||||
def _bbox_overlaps(self, bbox1_data: Dict, bbox2_data: Dict, tolerance: float = 5.0) -> bool:
|
||||
"""
|
||||
過濾掉位於 'regions_to_avoid'(例如表格、圖片)內部的文字區域。
|
||||
檢查兩個 bbox 是否有重疊(帶有容錯)。
|
||||
如果有任何重疊,返回 True。
|
||||
|
||||
Args:
|
||||
bbox1_data: 第一個 bbox 數據
|
||||
bbox2_data: 第二個 bbox 數據
|
||||
tolerance: 容錯值(像素)
|
||||
|
||||
Returns:
|
||||
True 如果兩個 bbox 有重疊
|
||||
"""
|
||||
coords1 = self._get_bbox_coords(bbox1_data.get('bbox'))
|
||||
coords2 = self._get_bbox_coords(bbox2_data.get('bbox'))
|
||||
|
||||
if not coords1 or not coords2:
|
||||
return False
|
||||
|
||||
x1_min, y1_min, x1_max, y1_max = coords1
|
||||
x2_min, y2_min, x2_max, y2_max = coords2
|
||||
|
||||
# 擴展 bbox2(表格/圖片區域)的範圍
|
||||
x2_min -= tolerance
|
||||
y2_min -= tolerance
|
||||
x2_max += tolerance
|
||||
y2_max += tolerance
|
||||
|
||||
# 檢查是否有重疊:如果沒有重疊,則必定滿足以下條件之一
|
||||
no_overlap = (
|
||||
x1_max < x2_min or # bbox1 在 bbox2 左側
|
||||
x1_min > x2_max or # bbox1 在 bbox2 右側
|
||||
y1_max < y2_min or # bbox1 在 bbox2 上方
|
||||
y1_min > y2_max # bbox1 在 bbox2 下方
|
||||
)
|
||||
|
||||
return not no_overlap
|
||||
|
||||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]:
|
||||
"""
|
||||
過濾掉與 'regions_to_avoid'(例如表格、圖片)重疊的文字區域。
|
||||
|
||||
Args:
|
||||
text_regions: 文字區域列表
|
||||
regions_to_avoid: 需要避免的區域列表(表格、圖片)
|
||||
tolerance: 容錯值(像素),增加到 10.0 以更好地處理邊界情況
|
||||
|
||||
Returns:
|
||||
過濾後的文字區域列表
|
||||
"""
|
||||
filtered_text = []
|
||||
for text_region in text_regions:
|
||||
is_inside_any_avoid_region = False
|
||||
for avoid_region in regions_to_avoid:
|
||||
if self._is_bbox_inside(text_region, avoid_region):
|
||||
is_inside_any_avoid_region = True
|
||||
logger.debug(f"過濾掉文字: {text_region.get('text', '')[:20]}...")
|
||||
break # 找到一個包含它的區域就足夠了
|
||||
filtered_count = 0
|
||||
|
||||
if not is_inside_any_avoid_region:
|
||||
for text_region in text_regions:
|
||||
should_filter = False
|
||||
|
||||
for avoid_region in regions_to_avoid:
|
||||
# 使用重疊檢測:只要有任何重疊就過濾掉
|
||||
if self._bbox_overlaps(text_region, avoid_region, tolerance=tolerance):
|
||||
should_filter = True
|
||||
filtered_count += 1
|
||||
logger.debug(f"過濾掉重疊文字: {text_region.get('text', '')[:20]}...")
|
||||
break # 找到一個重疊區域就足夠了
|
||||
|
||||
if not should_filter:
|
||||
filtered_text.append(text_region)
|
||||
|
||||
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}")
|
||||
logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}")
|
||||
return filtered_text
|
||||
|
||||
def draw_text_region(
|
||||
@@ -718,11 +769,22 @@ class PDFGeneratorService:
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# *** 關鍵修復:收集所有需要避免的區域(表格 + 圖片)***
|
||||
table_regions = ocr_data.get('tables', [])
|
||||
image_regions = ocr_data.get('image_regions', [])
|
||||
# 注意:OCR JSON 中沒有 'tables' 和 'image_regions' 頂層欄位
|
||||
# 重要發現:
|
||||
# - layout_data.elements 中的表格元素沒有 bbox(都是空列表)
|
||||
# - images_metadata 包含所有表格和圖片,並且有正確的 bbox
|
||||
# - 因此,只需使用 images_metadata 來過濾文字即可
|
||||
|
||||
# 建立一個包含「所有」要避免的區域的列表
|
||||
regions_to_avoid = table_regions + image_regions
|
||||
# 使用 images_metadata 作為要避免的區域(包含表格圖片和其他圖片)
|
||||
regions_to_avoid = images_metadata
|
||||
|
||||
table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()])
|
||||
other_count = len(images_metadata) - table_count
|
||||
|
||||
logger.info(f"使用 images_metadata 過濾文字區域:")
|
||||
logger.info(f" - 表格圖片: {table_count}")
|
||||
logger.info(f" - 其他圖片: {other_count}")
|
||||
logger.info(f" - 總計需要避免的區域: {len(regions_to_avoid)}")
|
||||
|
||||
# 使用新的過濾函式過濾文字區域
|
||||
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
|
||||
@@ -751,23 +813,16 @@ class PDFGeneratorService:
|
||||
if page_num > 1:
|
||||
pdf_canvas.showPage() # Start new page
|
||||
|
||||
# Draw text regions for this page (excluding table text)
|
||||
page_regions = pages_data.get(page_num, [])
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_regions)} 個文字區域")
|
||||
for i, region in enumerate(page_regions, 1):
|
||||
logger.debug(f" 文字 {i}/{len(page_regions)}")
|
||||
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
|
||||
# Get filtered regions for this page
|
||||
page_text_regions = pages_data.get(page_num, [])
|
||||
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
|
||||
page_image_regions = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
|
||||
|
||||
# Draw tables for this page
|
||||
page_tables = [t for t in table_elements if t.get('page') == page_num - 1]
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_tables)} 個表格")
|
||||
for table_elem in page_tables:
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
|
||||
# 繪製順序:圖片(底層) → 表格(中間層) → 文字(最上層)
|
||||
|
||||
# Draw non-table images for this page (figure, chart, seal, etc.)
|
||||
page_images = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_images)} 個圖片")
|
||||
for img_meta in page_images:
|
||||
# 1. Draw images first (bottom layer)
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_image_regions)} 個圖片")
|
||||
for img_meta in page_image_regions:
|
||||
self.draw_image_region(
|
||||
pdf_canvas,
|
||||
img_meta,
|
||||
@@ -777,6 +832,17 @@ class PDFGeneratorService:
|
||||
scale_h
|
||||
)
|
||||
|
||||
# 2. Draw tables (middle layer)
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_table_regions)} 個表格")
|
||||
for table_elem in page_table_regions:
|
||||
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
|
||||
|
||||
# 3. Draw text regions last (top layer) - excluding table text
|
||||
logger.info(f"第 {page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域")
|
||||
for i, region in enumerate(page_text_regions, 1):
|
||||
logger.debug(f" 文字 {i}/{len(page_text_regions)}")
|
||||
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
|
||||
|
||||
logger.info(f"<<< 第 {page_num} 頁完成")
|
||||
|
||||
# Save PDF
|
||||
|
||||
Reference in New Issue
Block a user