## 問題修復

### 1. 頁碼分配錯誤
- **問題**: layout_data 和 images_metadata 頁碼被 1-based 覆蓋,導致全部為 0
- **修復**: 在 analyze_layout() 添加 current_page 參數,從源頭設置正確的 0-based 頁碼
- **影響**: 表格和圖片現在顯示在正確的頁面上

### 2. 文字與表格/圖片重疊
- **問題**: 使用不存在的 'tables' 和 'image_regions' 字段過濾,導致過濾失效
- **修復**: 改用 images_metadata(包含所有表格/圖片的 bbox)
- **新增**: _bbox_overlaps() 檢測任意重疊(非完全包含)
- **影響**: 文字不再覆蓋表格和圖片區域

### 3. 渲染順序優化
- **調整**: 圖片(底層) → 表格(中間層) → 文字(頂層)
- **影響**: 視覺層次更正確

## 技術細節
- ocr_service.py: 添加 current_page 參數傳遞,移除頁碼覆蓋邏輯
- pdf_generator_service.py:
  - 新增 _bbox_overlaps() 方法
  - 更新 _filter_text_in_regions() 使用重疊檢測
  - 修正數據源為 images_metadata
  - 調整繪製順序

## 已知限制
- 仍有 21.6% 文字因過濾而遺失(座標定位方法的固有問題)
- 未使用 PP-StructureV3 的完整版面資訊(parsing_res_list, layout_bbox)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
864 lines
35 KiB
Python
864 lines
35 KiB
Python
"""
|
||
Layout-Preserving PDF Generation Service
|
||
Generates PDF files that preserve the original document layout using OCR JSON data
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional, Tuple, Union
|
||
from datetime import datetime
|
||
|
||
from reportlab.lib.pagesizes import A4, letter
|
||
from reportlab.lib.units import mm
|
||
from reportlab.pdfgen import canvas
|
||
from reportlab.pdfbase import pdfmetrics
|
||
from reportlab.pdfbase.ttfonts import TTFont
|
||
from reportlab.platypus import Table, TableStyle
|
||
from reportlab.lib import colors
|
||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
||
from reportlab.platypus import Paragraph
|
||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||
from PIL import Image
|
||
from html.parser import HTMLParser
|
||
|
||
from app.core.config import settings
|
||
|
||
# Module-level logger shared by the parser and the PDF generator service below.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class HTMLTableParser(HTMLParser):
    """Parse HTML table markup into plain row/cell dictionaries.

    After ``feed()``, ``self.tables`` holds one entry per non-empty ``<table>``::

        {'rows': [{'cells': [{'text': str, 'is_header': bool,
                              'colspan': int, 'rowspan': int}, ...]}, ...]}

    Cell text is the concatenation of every stripped text chunk inside the
    cell, each followed by a single space.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        # Parsing state: the table / row / cell currently being built.
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_table = False

    @staticmethod
    def _parse_span(raw_value) -> int:
        """Return a colspan/rowspan attribute as an int, or 1 if it is unusable.

        HTMLParser reports valueless attributes (``<td colspan>``) as ``None``
        and OCR-produced HTML may carry non-numeric values; neither should
        abort parsing of the whole table.
        """
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return 1

    def handle_starttag(self, tag, attrs):
        """Open a table, row or cell depending on ``tag``."""
        attrs_dict = dict(attrs)

        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}

        elif tag == 'tr' and self.in_table:
            self.current_row = {'cells': []}

        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1)),
            }

    def handle_endtag(self, tag):
        """Close the current cell, row or table and attach it to its parent."""
        if tag == 'table' and self.in_table:
            # Tables without any row are discarded.
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False

        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def handle_data(self, data):
        """Append text found inside the currently open cell."""
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '
|
||
|
||
|
||
class PDFGeneratorService:
|
||
"""Service for generating layout-preserving PDFs from OCR JSON data"""
|
||
|
||
def __init__(self):
    """Set the default font state, then attempt to register the Chinese font."""
    # Defaults describe the "nothing registered yet" state;
    # _register_chinese_font() updates them when registration succeeds.
    self.font_registered = False
    self.font_path = None
    self.font_name = 'NotoSansSC'

    self._register_chinese_font()
|
||
|
||
def _register_chinese_font(self):
    """Register the TrueType font named by ``settings.chinese_font_path``.

    On success ``self.font_path`` and ``self.font_registered`` are updated.
    A missing font file is logged and leaves the instance state untouched;
    any exception is logged and forces ``self.font_registered`` to False.
    """
    try:
        candidate = Path(settings.chinese_font_path)

        if not candidate.is_absolute():
            # Relative paths are resolved against the directory four levels
            # above this module (the original note says the configured path
            # starts with ./backend/ - presumably the project root; verify).
            base_dir = Path(__file__).resolve()
            for _ in range(4):
                base_dir = base_dir.parent
            candidate = base_dir / candidate

        if candidate.exists():
            pdfmetrics.registerFont(TTFont(self.font_name, str(candidate)))
            self.font_path = candidate
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {candidate}")
        else:
            logger.error(f"Chinese font not found at {candidate}")

    except Exception as exc:
        logger.error(f"Failed to register Chinese font: {exc}")
        self.font_registered = False
|
||
|
||
def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
    """
    Read an OCR result file and return its decoded JSON content.

    Args:
        json_path: Path to the UTF-8 encoded JSON file

    Returns:
        The parsed JSON data, or None when reading or parsing fails
        (the failure is logged, not raised)
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        logger.info(f"Loaded OCR JSON: {json_path.name}")
    except Exception as exc:
        logger.error(f"Failed to load JSON {json_path}: {exc}")
        return None
    else:
        return parsed
|
||
|
||
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
    """
    Infer the size of the coordinate space the OCR results are expressed in.

    OCR may have run on a high-resolution rendering of the page, so the
    extent is taken from the data itself: the largest x and y found in any
    region bbox. Both flat ``[x1, y1, x2, y2]`` boxes and polygon
    ``[[x, y], ...]`` boxes are understood. Entries that are not dicts or
    whose bbox cannot be read are skipped instead of aborting the scan.

    Args:
        ocr_data: Complete OCR data dictionary. Regions are collected from
            'text_regions', 'image_regions', 'tables', 'layout' and
            'layout_data'['elements'] when present.
        source_file_path: Optional path to the source file, used only as a
            fallback when no usable bbox exists

    Returns:
        Tuple of (width, height) in OCR coordinate units; if nothing can be
        inferred, the source file size, and failing that A4
    """

    def fallback_size() -> Tuple[float, float]:
        # Source file size when it can be read, otherwise A4.
        if source_file_path:
            dims = self.get_original_page_size(source_file_path)
            if dims:
                return dims
        return A4

    # Different OCR output versions use different field names, so every
    # list that may carry bbox-bearing regions is gathered.
    all_regions = []
    for key in ('text_regions', 'image_regions', 'tables', 'layout'):
        value = ocr_data.get(key)
        if isinstance(value, list):
            all_regions.extend(value)

    # PP-StructureV3 style output nests its elements under layout_data.
    layout_data = ocr_data.get('layout_data')
    if isinstance(layout_data, dict):
        elements = layout_data.get('elements', [])
        if isinstance(elements, list):
            all_regions.extend(elements)

    if not all_regions:
        logger.warning("JSON 中沒有找到 text_regions, image_regions, tables, layout 或 layout_data.elements,回退到原始檔案尺寸。")
        return fallback_size()

    max_x = 0
    max_y = 0
    region_count = 0

    for region in all_regions:
        # Guard before touching the region so the error path below never
        # refers to a bbox that was not read.
        if not isinstance(region, dict):
            logger.warning(f"Skipping non-dict region: {region!r}")
            continue

        bbox = region.get('bbox')
        if not bbox:
            continue

        region_count += 1

        try:
            if isinstance(bbox[0], (int, float)):
                # Flat [x1, y1, x2, y2]: read both values before updating so
                # a short list cannot update only one axis.
                right, bottom = bbox[2], bbox[3]
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
            elif isinstance(bbox[0], (list, tuple)):
                # Polygon [[x, y], ...]
                points = [p for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
                if points:
                    max_x = max(max_x, max(p[0] for p in points))
                    max_y = max(max_y, max(p[1] for p in points))
        except (IndexError, KeyError, TypeError, ValueError) as e:
            logger.warning(f"Error processing bbox {bbox}: {e}")

    if max_x > 0 and max_y > 0:
        logger.info(f"從 {region_count} 個區域中推斷出的 OCR 座標系尺寸: {max_x:.1f} x {max_y:.1f}")
        return (max_x, max_y)

    # Fall back only when no bbox yielded a usable extent.
    logger.warning("無法從 bbox 推斷尺寸,回退到原始檔案尺寸。")
    return fallback_size()
|
||
|
||
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
    """
    Extract page dimensions from the original source file.

    Images report their pixel size directly (1 pixel is treated as 1 point,
    matching how the OCR coordinates are used in this module). PDFs report
    the MediaBox size of their first page via PyPDF2, when it is installed.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Tuple of (width, height) in points, or None when the file is
        missing, of an unsupported type, or cannot be read
    """
    try:
        if not file_path.exists():
            return None

        suffix = file_path.suffix.lower()

        if suffix in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'):
            # The context manager closes the underlying file handle; only
            # the header is needed to obtain the size.
            with Image.open(file_path) as img:
                width_pt = float(img.width)
                height_pt = float(img.height)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return (width_pt, height_pt)

        if suffix == '.pdf':
            try:
                # Imported lazily so the module works without PyPDF2.
                from PyPDF2 import PdfReader
                reader = PdfReader(file_path)
                if len(reader.pages) > 0:
                    # MediaBox is [x1, y1, x2, y2] in points.
                    mediabox = reader.pages[0].mediabox
                    width_pt = float(mediabox.width)
                    height_pt = float(mediabox.height)
                    logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
                    return (width_pt, height_pt)
            except ImportError:
                logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
            except Exception as e:
                logger.warning(f"Failed to extract PDF dimensions: {e}")

    except Exception as e:
        logger.warning(f"Failed to get page size from {file_path}: {e}")

    return None
|
||
|
||
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
    """
    Normalise a bbox to ``(x_min, y_min, x_max, y_max)``.

    Args:
        bbox: Either a polygon ``[[x, y], ...]`` or a flat 4-item
            ``[x1, y1, x2, y2]`` sequence. None or an empty sequence is
            accepted and treated as "no bbox".

    Returns:
        The axis-aligned bounds, or None when the bbox is missing, empty or
        in an unknown format
    """
    try:
        # Missing or empty bboxes are a normal input (callers pass whatever
        # ``.get('bbox')`` returned), so they are not reported as errors.
        if bbox is None or len(bbox) == 0:
            return None

        first = bbox[0]

        if isinstance(first, (list, tuple)):
            # Polygon [[x, y], ...]: ignore malformed points.
            points = [p for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
            if not points:
                return None
            x_coords = [p[0] for p in points]
            y_coords = [p[1] for p in points]
            return min(x_coords), min(y_coords), max(x_coords), max(y_coords)

        if isinstance(first, (int, float)) and len(bbox) == 4:
            # Flat [x1, y1, x2, y2]: order the corners so callers can rely
            # on min/max semantics even if the box was given reversed.
            x1, y1, x2, y2 = bbox
            return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)

        logger.warning(f"未知的 bbox 格式: {bbox}")
        return None

    except Exception as e:
        logger.error(f"解析 bbox {bbox} 時出錯: {e}")
        return None
|
||
|
||
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
    """
    Tell whether the 'bbox' of ``inner_bbox_data`` lies within the 'bbox'
    of ``outer_bbox_data``, allowing ``tolerance`` units of slack per edge.

    Both bboxes go through ``_get_bbox_coords``, so polygons and rectangles
    are accepted. Returns False when either bbox cannot be resolved.
    """
    inner = self._get_bbox_coords(inner_bbox_data.get('bbox'))
    outer = self._get_bbox_coords(outer_bbox_data.get('bbox'))

    if not (inner and outer):
        return False

    in_left, in_top, in_right, in_bottom = inner
    out_left, out_top, out_right, out_bottom = outer

    # The inner box's near edges may not start before the outer box's
    # (minus slack) ...
    starts_within = all(
        inner_edge >= outer_edge - tolerance
        for inner_edge, outer_edge in ((in_left, out_left), (in_top, out_top))
    )
    if not starts_within:
        return False

    # ... and its far edges may not end after the outer box's (plus slack).
    return all(
        inner_edge <= outer_edge + tolerance
        for inner_edge, outer_edge in ((in_right, out_right), (in_bottom, out_bottom))
    )
|
||
|
||
def _bbox_overlaps(self, bbox1_data: Dict, bbox2_data: Dict, tolerance: float = 5.0) -> bool:
    """
    Tell whether two bboxes overlap at all, with some slack.

    The second box (the table / image region) is grown by ``tolerance`` on
    every side before the comparison; boxes that merely touch count as
    overlapping.

    Args:
        bbox1_data: Dict carrying the first 'bbox'
        bbox2_data: Dict carrying the second 'bbox' (the one that is grown)
        tolerance: Slack added around the second box, in bbox units

    Returns:
        True if the boxes overlap; False if they are disjoint or either
        bbox cannot be resolved
    """
    first = self._get_bbox_coords(bbox1_data.get('bbox'))
    second = self._get_bbox_coords(bbox2_data.get('bbox'))

    if not (first and second):
        return False

    a_left, a_top, a_right, a_bottom = first
    b_left, b_top, b_right, b_bottom = second

    # Bounds of the second box after growing it by the tolerance.
    grown_left = b_left - tolerance
    grown_top = b_top - tolerance
    grown_right = b_right + tolerance
    grown_bottom = b_bottom + tolerance

    # Two rectangles are disjoint exactly when one lies entirely to one
    # side of the other; check each side in turn.
    if a_right < grown_left:      # first box is left of the second
        return False
    if a_left > grown_right:      # first box is right of the second
        return False
    if a_bottom < grown_top:      # first box is above the second
        return False
    if a_top > grown_bottom:      # first box is below the second
        return False
    return True
|
||
|
||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]:
    """
    Drop every text region that overlaps one of ``regions_to_avoid``
    (e.g. tables or images).

    Args:
        text_regions: Text regions to screen
        regions_to_avoid: Regions whose area must stay free of text
        tolerance: Slack passed to the overlap check (default 10.0)

    Returns:
        The text regions that overlap none of the regions to avoid, in
        their original order
    """
    kept = []
    removed = 0

    for candidate in text_regions:
        # any() stops at the first overlapping region, which is all that is
        # needed to reject the candidate.
        overlaps_something = any(
            self._bbox_overlaps(candidate, blocked, tolerance=tolerance)
            for blocked in regions_to_avoid
        )

        if overlaps_something:
            removed += 1
            logger.debug(f"過濾掉重疊文字: {candidate.get('text', '')[:20]}...")
            continue

        kept.append(candidate)

    logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(kept)}, 移除: {removed}")
    return kept
|
||
|
||
def draw_text_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw one OCR text region at its scaled position on the canvas.

    The bbox may be a polygon ``[[x, y], ...]`` or a flat
    ``[x1, y1, x2, y2]`` box; it is reduced to its axis-aligned bounds via
    ``_get_bbox_coords``. The font size is derived from the bbox height and
    reduced further if the text would be wider than the bbox. Failures are
    logged and never raised.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Text region dict with 'text' and 'bbox'
        page_height: Height of page (for the Y-axis flip)
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    text = region.get('text', '')
    bbox = region.get('bbox', [])

    if not text or not bbox or len(bbox) < 4:
        return

    try:
        # OCR coordinates: origin (0,0) at top-left, Y increases downward.
        coords = self._get_bbox_coords(bbox)
        if coords is None:
            logger.warning(f"Skipping text region with unusable bbox: '{text[:20]}...'")
            return
        ocr_x_left, ocr_y_top, ocr_x_right, ocr_y_bottom = coords

        logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}")

        # Convert from OCR space to PDF space.
        scaled_x_left = ocr_x_left * scale_w
        scaled_y_top = ocr_y_top * scale_h
        scaled_x_right = ocr_x_right * scale_w
        scaled_y_bottom = ocr_y_bottom * scale_h

        logger.info(f"[文字] '{text[:20]}...' 縮放後(scale={scale_w:.3f},{scale_h:.3f}): L={scaled_x_left:.1f}, T={scaled_y_top:.1f}, R={scaled_x_right:.1f}, B={scaled_y_bottom:.1f}")

        bbox_width = abs(scaled_x_right - scaled_x_left)
        bbox_height = abs(scaled_y_bottom - scaled_y_top)

        # Heuristic: font size is 75% of the bbox height, clamped to 4-72pt.
        font_size = bbox_height * 0.75
        font_size = max(min(font_size, 72), 4)

        # Flip the Y axis: OCR is top-left origin, PDF is bottom-left origin.
        # The text baseline is placed on the bottom edge of the bbox.
        pdf_x = scaled_x_left
        pdf_y = page_height - scaled_y_bottom

        logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")

        font_name = self.font_name if self.font_registered else 'Helvetica'
        pdf_canvas.setFont(font_name, font_size)

        # Shrink the font when the rendered text would overflow the bbox;
        # the 0.95 factor leaves a small margin and 3pt is the floor.
        text_width = pdf_canvas.stringWidth(text, font_name, font_size)
        if text_width > bbox_width:
            scale_factor = bbox_width / text_width
            font_size = max(font_size * scale_factor * 0.95, 3)
            pdf_canvas.setFont(font_name, font_size)

        pdf_canvas.drawString(pdf_x, pdf_y, text)

        # Optional debug outline of the bbox.
        if settings.pdf_enable_bbox_debug:
            # Save/restore so the red stroke and thin line width do not
            # leak into whatever is drawn after this region.
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
                if isinstance(bbox[0], (list, tuple)):
                    # Polygon: outline the actual quad, scaled and flipped.
                    outline = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
                else:
                    # Flat box: outline its four corners.
                    top = page_height - scaled_y_top
                    bottom = page_height - scaled_y_bottom
                    outline = [
                        (scaled_x_left, top),
                        (scaled_x_right, top),
                        (scaled_x_right, bottom),
                        (scaled_x_left, bottom),
                    ]
                for i, (x1, y1) in enumerate(outline):
                    x2, y2 = outline[(i + 1) % len(outline)]
                    pdf_canvas.line(x1, y1, x2, y2)
            finally:
                pdf_canvas.restoreState()

    except Exception as e:
        logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
|
||
|
||
def draw_table_region(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    images_metadata: List[Dict],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw a table by parsing its HTML and rebuilding it as a ReportLab Table.

    The table position comes from the element's own 'bbox' when it has a
    usable one; otherwise from the first entry of ``images_metadata`` whose
    'image_path' contains "table", has a usable bbox and - when both sides
    carry a 'page' - belongs to the same page as the element. Failures are
    logged (with traceback) and never raised.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict with HTML in 'content'
        images_metadata: Image metadata entries used to locate the table bbox
        page_height: Height of page (for the Y-axis flip)
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    try:
        html_content = table_element.get('content', '')
        if not html_content:
            return

        parser = HTMLTableParser()
        parser.feed(html_content)

        if not parser.tables:
            logger.warning("No tables found in HTML content")
            return

        # Only the first table is used (the original note says
        # PP-StructureV3 usually provides one table per element).
        rows = parser.tables[0]['rows']
        if not rows:
            return

        max_cols = max(len(row['cells']) for row in rows)
        if max_cols == 0:
            # Rows without any cell would make the column width undefined.
            logger.warning("Table has no cells, skipping")
            return

        # Locate the table: own bbox first, then a matching table image.
        own_bbox = table_element.get('bbox')
        table_coords = self._get_bbox_coords(own_bbox) if own_bbox else None

        if table_coords is None:
            element_page = table_element.get('page')
            for img_meta in images_metadata:
                img_path = str(img_meta.get('image_path') or '')
                if 'table' not in img_path.lower():
                    continue
                meta_page = img_meta.get('page')
                # Never borrow the position of a table from another page.
                if element_page is not None and meta_page is not None and meta_page != element_page:
                    continue
                bbox = img_meta.get('bbox', [])
                if bbox and len(bbox) >= 4:
                    table_coords = self._get_bbox_coords(bbox)
                    if table_coords is not None:
                        break

        if table_coords is None:
            logger.warning("No bbox found for table")
            return

        ocr_x_left_raw, ocr_y_top_raw, ocr_x_right_raw, ocr_y_bottom_raw = table_coords

        logger.info(f"[表格] OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")

        # Convert from OCR space to PDF space.
        ocr_x_left = ocr_x_left_raw * scale_w
        ocr_y_top = ocr_y_top_raw * scale_h
        ocr_x_right = ocr_x_right_raw * scale_w
        ocr_y_bottom = ocr_y_bottom_raw * scale_h

        table_width = abs(ocr_x_right - ocr_x_left)
        table_height = abs(ocr_y_bottom - ocr_y_top)

        # Flip the Y axis; drawOn() positions the table by its bottom-left.
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom

        logger.info(f"[表格] {len(rows)}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")

        # Flatten the parsed structure into a rectangular 2D list, padding
        # short rows with empty strings. colspan/rowspan are not applied.
        reportlab_data = []
        for row in rows:
            row_data = [cell['text'].strip() for cell in row['cells']]
            row_data.extend([''] * (max_cols - len(row_data)))
            reportlab_data.append(row_data)

        # Equal-width columns across the bbox.
        col_widths = [table_width / max_cols] * max_cols

        # Font size: half the average row height, kept within 6-10pt.
        font_size = min(table_height / len(rows) * 0.5, 10)
        font_size = max(font_size, 6)

        table = Table(reportlab_data, colWidths=col_widths)

        style = TableStyle([
            ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('LEFTPADDING', (0, 0), (-1, -1), 2),
            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ])

        # Highlight the first row when it starts with a header cell.
        if rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
            style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
            style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)

        table.setStyle(style)

        # wrapOn() must run before drawOn() so the table computes its size.
        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")

    except Exception as e:
        # exc_info routes the traceback through logging instead of stderr.
        logger.warning(f"Failed to draw table region: {e}", exc_info=True)
|
||
|
||
def draw_image_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    result_dir: Path,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Embed an extracted image at its scaled bbox position.

    The bbox may be a polygon ``[[x, y], ...]`` or a flat
    ``[x1, y1, x2, y2]`` box; it is reduced to its axis-aligned bounds via
    ``_get_bbox_coords``. Regions without a path, an existing file or a
    usable bbox are skipped. Failures are logged and never raised.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Image metadata dict with 'image_path' and 'bbox'
        page_height: Height of page (for the Y-axis flip)
        result_dir: Directory that 'image_path' is resolved against
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    try:
        image_path_str = region.get('image_path', '')
        if not image_path_str:
            return

        image_path = result_dir / image_path_str
        if not image_path.exists():
            logger.warning(f"Image not found: {image_path}")
            return

        bbox = region.get('bbox', [])
        if not bbox or len(bbox) < 4:
            logger.warning(f"No bbox for image {image_path_str}")
            return

        # OCR coordinates: origin (0,0) at top-left, Y increases downward.
        coords = self._get_bbox_coords(bbox)
        if coords is None:
            logger.warning(f"Unusable bbox for image {image_path_str}")
            return
        ocr_x_left_raw, ocr_y_top_raw, ocr_x_right_raw, ocr_y_bottom_raw = coords

        logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")

        # Convert from OCR space to PDF space.
        ocr_x_left = ocr_x_left_raw * scale_w
        ocr_y_top = ocr_y_top_raw * scale_h
        ocr_x_right = ocr_x_right_raw * scale_w
        ocr_y_bottom = ocr_y_bottom_raw * scale_h

        bbox_width = abs(ocr_x_right - ocr_x_left)
        bbox_height = abs(ocr_y_bottom - ocr_y_top)

        # Flip the Y axis; drawImage() positions by the bottom-left corner.
        pdf_x_left = ocr_x_left
        pdf_y_bottom = page_height - ocr_y_bottom

        logger.info(f"[圖片] '{image_path_str}' → PDF位置: ({pdf_x_left:.1f}, {pdf_y_bottom:.1f}), 寬x高: {bbox_width:.0f}x{bbox_height:.0f}")

        pdf_canvas.drawImage(
            str(image_path),
            pdf_x_left,
            pdf_y_bottom,
            width=bbox_width,
            height=bbox_height,
            preserveAspectRatio=True,
            mask='auto'  # Handle transparency
        )

        logger.info(f"[圖片] ✓ 成功繪製 '{image_path_str}'")

    except Exception as e:
        logger.warning(f"Failed to draw image region: {e}")
|
||
|
||
def generate_layout_pdf(
    self,
    json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate a layout-preserving PDF from OCR JSON data.

    Steps: load the JSON, infer the OCR coordinate space, choose the target
    page size (source file size when available, else the OCR size), then
    for each page draw images, tables and finally text. Text that overlaps
    an image/table region of the same page is left out so it does not
    cover that region.

    Args:
        json_path: Path to OCR JSON file; extracted images are resolved
            relative to its parent directory
        output_path: Path to save generated PDF; an existing file is
            treated as a cache hit and left untouched
        source_file_path: Optional path to original source file for
            dimension extraction

    Returns:
        True if the PDF exists or was written, False otherwise
    """
    try:
        # An existing output acts as a cache.
        if output_path.exists():
            logger.info(f"PDF already exists: {output_path.name}")
            return True

        ocr_data = self.load_ocr_json(json_path)
        if not ocr_data:
            return False

        text_regions = ocr_data.get('text_regions', [])
        if not text_regions:
            logger.warning("No text regions found in JSON")
            return False

        # ``or`` also covers keys that are present but null.
        images_metadata = ocr_data.get('images_metadata') or []
        layout_data = ocr_data.get('layout_data') or {}

        # Step 1: size of the coordinate space OCR actually worked in,
        # inferred from the bboxes (deliberately without the source file).
        ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
        logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")

        if ocr_width <= 0 or ocr_height <= 0:
            logger.error(f"Invalid OCR dimensions: {ocr_width} x {ocr_height}")
            return False

        # Step 2: target PDF page size.
        target_dims = self.get_original_page_size(source_file_path) if source_file_path else None
        if target_dims:
            target_width, target_height = target_dims
            logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
        elif source_file_path:
            target_width, target_height = ocr_width, ocr_height
            logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
        else:
            target_width, target_height = ocr_width, ocr_height
            logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")

        # Step 3: factors that map OCR coordinates onto the PDF page.
        scale_w = target_width / ocr_width
        scale_h = target_height / ocr_height
        logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f} (OCR座標 → PDF座標)")

        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

        def is_table_image(meta: Dict) -> bool:
            return 'table' in str(meta.get('image_path') or '').lower()

        # images_metadata is the source of regions that text must avoid:
        # per the original notes it holds both table and figure images with
        # valid bboxes, while table elements in layout_data have none.
        table_count = len([img for img in images_metadata if is_table_image(img)])
        other_count = len(images_metadata) - table_count

        logger.info(f"使用 images_metadata 過濾文字區域:")
        logger.info(f"  - 表格圖片: {table_count}")
        logger.info(f"  - 其他圖片: {other_count}")
        logger.info(f"  - 總計需要避免的區域: {len(images_metadata)}")

        # Group text regions by their page number.
        # NOTE(review): text regions are looked up with the 1-based loop
        # index (default page 1) while tables/images are matched against
        # page_num - 1 - confirm both conventions against ocr_service.
        text_by_page = {}
        for region in text_regions:
            text_by_page.setdefault(region.get('page', 1), []).append(region)

        table_elements = [
            e for e in (layout_data.get('elements') or [])
            if e.get('type') == 'table'
        ]

        total_pages = ocr_data.get('total_pages', 1)
        logger.info(f"=" * 70)
        logger.info(f"開始處理 {total_pages} 頁 PDF")
        logger.info(f"=" * 70)

        for page_num in range(1, total_pages + 1):
            logger.info(f"\n>>> 處理第 {page_num}/{total_pages} 頁")
            if page_num > 1:
                pdf_canvas.showPage()  # Start new page

            layout_page = page_num - 1

            # Text is screened only against regions of this page (entries
            # without a page are kept as obstacles, as before); a region on
            # another page must not remove text here.
            regions_to_avoid = [
                img for img in images_metadata
                if img.get('page') is None or img.get('page') == layout_page
            ]
            page_text_regions = self._filter_text_in_regions(
                text_by_page.get(page_num, []), regions_to_avoid
            )

            page_table_regions = [t for t in table_elements if t.get('page') == layout_page]
            page_table_images = [img for img in regions_to_avoid if is_table_image(img)]
            page_image_regions = [
                img for img in images_metadata
                if img.get('page') == layout_page and not is_table_image(img)
            ]

            # Drawing order: images (bottom) -> tables (middle) -> text (top)

            # 1. Images
            logger.info(f"第 {page_num} 頁: 繪製 {len(page_image_regions)} 個圖片")
            for img_meta in page_image_regions:
                self.draw_image_region(
                    pdf_canvas,
                    img_meta,
                    target_height,
                    json_path.parent,
                    scale_w,
                    scale_h
                )

            # 2. Tables
            logger.info(f"第 {page_num} 頁: 繪製 {len(page_table_regions)} 個表格")
            paired = len(page_table_images) == len(page_table_regions)
            for index, table_elem in enumerate(page_table_regions):
                # draw_table_region() uses the first table image it is
                # given, so hand each table its own candidate instead of
                # the whole document's list.
                # NOTE(review): pairing by position assumes table elements
                # and table images are emitted in the same order - confirm.
                candidates = [page_table_images[index]] if paired else page_table_images
                self.draw_table_region(pdf_canvas, table_elem, candidates, target_height, scale_w, scale_h)

            # 3. Text (already stripped of text overlapping tables/images)
            logger.info(f"第 {page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域")
            for i, region in enumerate(page_text_regions, 1):
                logger.debug(f"  文字 {i}/{len(page_text_regions)}")
                self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)

            logger.info(f"<<< 第 {page_num} 頁完成")

        pdf_canvas.save()

        file_size = output_path.stat().st_size
        logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
        return True

    except Exception as e:
        # exc_info routes the traceback through logging instead of stderr.
        logger.error(f"Failed to generate PDF: {e}", exc_info=True)
        return False
|
||
|
||
|
||
# Singleton instance
|
||
# Created at import time: constructing the service runs __init__, which
# immediately attempts to register the Chinese font.
pdf_generator_service = PDFGeneratorService()
|