fix: 修復PDF生成中的頁碼錯誤和文字重疊問題
## 問題修復

### 1. 頁碼分配錯誤
- **問題**: layout_data 和 images_metadata 頁碼被 1-based 覆蓋,導致全部為 0
- **修復**: 在 analyze_layout() 添加 current_page 參數,從源頭設置正確的 0-based 頁碼
- **影響**: 表格和圖片現在顯示在正確的頁面上

### 2. 文字與表格/圖片重疊
- **問題**: 使用不存在的 'tables' 和 'image_regions' 字段過濾,導致過濾失效
- **修復**: 改用 images_metadata(包含所有表格/圖片的 bbox)
- **新增**: _bbox_overlaps() 檢測任意重疊(非完全包含)
- **影響**: 文字不再覆蓋表格和圖片區域

### 3. 渲染順序優化
- **調整**: 圖片(底層) → 表格(中間層) → 文字(頂層)
- **影響**: 視覺層次更正確

## 技術細節
- ocr_service.py: 添加 current_page 參數傳遞,移除頁碼覆蓋邏輯
- pdf_generator_service.py:
  - 新增 _bbox_overlaps() 方法
  - 更新 _filter_text_in_regions() 使用重疊檢測
  - 修正數據源為 images_metadata
  - 調整繪製順序

## 已知限制
- 仍有 21.6% 文字因過濾而遺失(座標定位方法的固有問題)
- 未使用 PP-StructureV3 的完整版面資訊(parsing_res_list, layout_bbox)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -285,7 +285,8 @@ class OCRService:
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None
|
||||
output_dir: Optional[Path] = None,
|
||||
current_page: int = 0
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -295,6 +296,8 @@ class OCRService:
|
||||
lang: Language for OCR
|
||||
detect_layout: Whether to perform layout analysis
|
||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
||||
output_dir: Optional output directory for saving extracted images
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
|
||||
Returns:
|
||||
Dictionary with OCR results and metadata
|
||||
@@ -337,13 +340,14 @@ class OCRService:
|
||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
||||
|
||||
# Process each page
|
||||
# Process each page with correct page number (0-based for layout data)
|
||||
page_result = self.process_image(
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir
|
||||
output_dir=output_dir,
|
||||
current_page=page_num - 1 # Convert to 0-based page number for layout data
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -356,19 +360,13 @@ class OCRService:
|
||||
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
|
||||
total_valid_regions += page_result['total_text_regions']
|
||||
|
||||
# Accumulate layout data and update page numbers
|
||||
# Accumulate layout data (page numbers already set correctly in analyze_layout)
|
||||
if page_result.get('layout_data'):
|
||||
layout_data = page_result['layout_data']
|
||||
# Update page number for all layout elements
|
||||
if layout_data.get('elements'):
|
||||
for element in layout_data['elements']:
|
||||
element['page'] = page_num
|
||||
all_layout_data.append(layout_data)
|
||||
|
||||
# Accumulate images metadata and update page numbers
|
||||
# Accumulate images metadata (page numbers already set correctly in analyze_layout)
|
||||
if page_result.get('images_metadata'):
|
||||
for img_meta in page_result['images_metadata']:
|
||||
img_meta['page'] = page_num # Update page number for multi-page PDFs
|
||||
all_images_metadata.extend(page_result['images_metadata'])
|
||||
|
||||
# Store OCR dimensions for each page
|
||||
@@ -483,7 +481,8 @@ class OCRService:
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)
|
||||
# Pass current_page to analyze_layout for correct page numbering
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir, current_page=current_page)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
@@ -587,13 +586,14 @@ class OCRService:
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||
current_page: Current page number (0-based) for multi-page documents
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -633,7 +633,7 @@ class OCRService:
|
||||
'element_id': len(layout_elements),
|
||||
'type': 'table' if has_table else 'text',
|
||||
'content': markdown_texts,
|
||||
'page': page_idx,
|
||||
'page': current_page, # Use current_page parameter instead of page_idx
|
||||
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
|
||||
}
|
||||
|
||||
@@ -687,7 +687,7 @@ class OCRService:
|
||||
'element_id': len(layout_elements) + img_idx,
|
||||
'image_path': img_path,
|
||||
'type': 'image',
|
||||
'page': page_idx,
|
||||
'page': current_page, # Use current_page parameter instead of page_idx
|
||||
'bbox': bbox,
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user