feat: update PDF generator to support UnifiedDocument directly

- Add generate_from_unified_document() method for direct UnifiedDocument processing
- Create convert_unified_document_to_ocr_data() for format conversion
- Extract _generate_pdf_from_data() as reusable core logic
- Support both OCR and DIRECT processing tracks in PDF generation
- Handle coordinate transformations (BoundingBox to polygon format)
- Update OCR service to use appropriate PDF generation method

Completes Section 4 (Unified Processing Pipeline) of the dual-track proposal.
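A minimal usage sketch of the call-site selection described above (it mirrors the OCRService hunk below; the import location of `pdf_generator_service` and the `render_pdf` wrapper are assumptions for illustration, not part of this change):

```python
from pathlib import Path

from app.models.unified_document import UnifiedDocument
from app.services.pdf_generator import pdf_generator_service  # assumed module path

def render_pdf(result, json_path: Path, pdf_path: Path, source_file_path: Path) -> bool:
    """Hypothetical caller: pick the PDF generation path based on the result type."""
    if isinstance(result, UnifiedDocument):
        # Direct path: generate straight from the in-memory UnifiedDocument
        return pdf_generator_service.generate_from_unified_document(
            unified_doc=result,
            output_path=pdf_path,
            source_file_path=source_file_path,
        )
    # Legacy path: fall back to the saved OCR JSON
    return pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_file_path,
    )
```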

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-19 08:48:25 +08:00
Parent: ab89a40e8d
Commit: ecdce961ca
3 changed files with 341 additions and 138 deletions


@@ -1223,11 +1223,21 @@ class OCRService:
logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file_path
)
# Use appropriate method based on result type
if isinstance(result, UnifiedDocument):
# Use direct UnifiedDocument generation for better accuracy
success = pdf_generator_service.generate_from_unified_document(
unified_doc=result,
output_path=pdf_path,
source_file_path=source_file_path
)
else:
# Legacy path: use JSON file
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file_path
)
if success:
logger.info(f"✓ PDF generated successfully: {pdf_path.name}")


@@ -24,6 +24,17 @@ from html.parser import HTMLParser
from app.core.config import settings
# Import UnifiedDocument for dual-track support
try:
from app.models.unified_document import (
UnifiedDocument, DocumentElement, ElementType,
BoundingBox, TableData, ProcessingTrack
)
UNIFIED_DOCUMENT_AVAILABLE = True
except ImportError:
UNIFIED_DOCUMENT_AVAILABLE = False
UnifiedDocument = None
logger = logging.getLogger(__name__)
@@ -138,6 +149,310 @@ class PDFGeneratorService:
logger.error(f"Failed to load JSON {json_path}: {e}")
return None
def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict:
"""
Convert UnifiedDocument to OCR data format for PDF generation.
This method transforms the UnifiedDocument structure into the legacy
OCR data format that the PDF generator expects, supporting both
OCR and DIRECT processing tracks.
Args:
unified_doc: UnifiedDocument object from either processing track
Returns:
Dictionary in OCR data format with text_regions, images_metadata, layout_data
"""
text_regions = []
images_metadata = []
layout_elements = []
for page in unified_doc.pages:
page_num = page.page_number # 1-based
for element in page.elements:
# Convert BoundingBox to polygon format [[x,y], [x,y], [x,y], [x,y]]
bbox_polygon = [
[element.bbox.x0, element.bbox.y0], # top-left
[element.bbox.x1, element.bbox.y0], # top-right
[element.bbox.x1, element.bbox.y1], # bottom-right
[element.bbox.x0, element.bbox.y1], # bottom-left
]
# Handle text elements
if element.is_text or element.type in [
ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.CAPTION,
ElementType.LIST_ITEM, ElementType.FOOTNOTE, ElementType.REFERENCE
]:
text_content = element.get_text()
if text_content:
text_regions.append({
'text': text_content,
'bbox': bbox_polygon,
'confidence': element.confidence or 1.0,
'page': page_num
})
# Handle table elements
elif element.type == ElementType.TABLE:
# Convert TableData to HTML for layout_data
if isinstance(element.content, TableData):
html_content = element.content.to_html()
elif isinstance(element.content, dict):
html_content = element.content.get('html', str(element.content))
else:
html_content = str(element.content)
layout_elements.append({
'type': 'table',
'content': html_content,
'bbox': [element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1],
'page': page_num - 1 # layout uses 0-based
})
# Also add to images_metadata for overlap filtering
# Tables are often rendered as images
table_id = element.element_id or f"table_{page_num}_{len(images_metadata)}"
images_metadata.append({
'image_path': f"table_{table_id}.png",
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based for images_metadata
'type': 'table'
})
# Handle image/visual elements
elif element.is_visual or element.type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO
]:
# Get image path from content or metadata
if isinstance(element.content, dict):
image_path = element.content.get('path', '')
else:
image_path = element.metadata.get('path', f"image_{element.element_id}.png")
images_metadata.append({
'image_path': image_path,
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based
'type': element.type.value
})
# Build OCR data structure
ocr_data = {
'text_regions': text_regions,
'images_metadata': images_metadata,
'layout_data': {
'elements': layout_elements,
'total_elements': len(layout_elements)
},
'total_pages': unified_doc.page_count,
'ocr_dimensions': {
'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0,
'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0
},
# Metadata for tracking
'_from_unified_document': True,
'_processing_track': unified_doc.metadata.processing_track.value
}
logger.info(f"Converted UnifiedDocument to OCR data: "
f"{len(text_regions)} text regions, "
f"{len(images_metadata)} images, "
f"{len(layout_elements)} layout elements, "
f"track={unified_doc.metadata.processing_track.value}")
return ocr_data
def generate_from_unified_document(
self,
unified_doc: 'UnifiedDocument',
output_path: Path,
source_file_path: Optional[Path] = None
) -> bool:
"""
Generate layout-preserving PDF directly from UnifiedDocument.
This method supports both OCR and DIRECT processing tracks,
preserving layout and coordinate information from either source.
Args:
unified_doc: UnifiedDocument object
output_path: Path to save generated PDF
source_file_path: Optional path to original source file
Returns:
True if successful, False otherwise
"""
if not UNIFIED_DOCUMENT_AVAILABLE:
logger.error("UnifiedDocument support not available")
return False
try:
# Convert UnifiedDocument to OCR data format
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Use internal generation with pre-loaded data
return self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
except Exception as e:
logger.error(f"Failed to generate PDF from UnifiedDocument: {e}")
import traceback
traceback.print_exc()
return False
def _generate_pdf_from_data(
self,
ocr_data: Dict,
output_path: Path,
source_file_path: Optional[Path] = None,
json_parent_dir: Optional[Path] = None
) -> bool:
"""
Internal method to generate PDF from OCR data dictionary.
This is the core generation logic extracted for reuse by both
JSON-based and UnifiedDocument-based generation paths.
Args:
ocr_data: OCR data dictionary
output_path: Path to save generated PDF
source_file_path: Optional path to original source file
json_parent_dir: Directory containing images (for JSON-based generation)
Returns:
True if successful, False otherwise
"""
try:
# Check if PDF already exists (caching)
if output_path.exists():
logger.info(f"PDF already exists: {output_path.name}")
return True
# Get text regions
text_regions = ocr_data.get('text_regions', [])
if not text_regions:
logger.warning("No text regions found in data")
# Don't fail - might have only tables/images
# Get images metadata
images_metadata = ocr_data.get('images_metadata', [])
# Get layout data
layout_data = ocr_data.get('layout_data', {})
# Step 1: Get OCR processing dimensions
ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
# Step 2: Get target PDF dimensions
if source_file_path:
target_dims = self.get_original_page_size(source_file_path)
if target_dims:
target_width, target_height = target_dims
logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
else:
target_width, target_height = ocr_width, ocr_height
logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標")
else:
target_width, target_height = ocr_width, ocr_height
logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
# Step 3: Calculate scale factors
scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")
# Create PDF canvas
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# Filter text regions to avoid overlap with tables/images
regions_to_avoid = images_metadata
table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()])
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
# Group regions by page
pages_data = {}
for region in filtered_text_regions:
page_num = region.get('page', 1)
if page_num not in pages_data:
pages_data[page_num] = []
pages_data[page_num].append(region)
# Get table elements from layout_data
table_elements = []
if layout_data and layout_data.get('elements'):
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
# Process each page
total_pages = ocr_data.get('total_pages', 1)
logger.info(f"開始處理 {total_pages} 頁 PDF")
# Determine image directory
if json_parent_dir is None:
json_parent_dir = output_path.parent
for page_num in range(1, total_pages + 1):
logger.info(f">>> 處理第 {page_num}/{total_pages}")
if page_num > 1:
pdf_canvas.showPage()
# Get regions for this page
page_text_regions = pages_data.get(page_num, [])
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
page_image_regions = [
img for img in images_metadata
if img.get('page') == page_num - 1
and 'table' not in img.get('image_path', '').lower()
]
# Draw in layers: images → tables → text
# 1. Draw images (bottom layer)
for img_meta in page_image_regions:
self.draw_image_region(
pdf_canvas, img_meta, target_height,
json_parent_dir, scale_w, scale_h
)
# 2. Draw tables (middle layer)
for table_elem in page_table_regions:
self.draw_table_region(
pdf_canvas, table_elem, images_metadata,
target_height, scale_w, scale_h
)
# 3. Draw text (top layer)
for region in page_text_regions:
self.draw_text_region(
pdf_canvas, region, target_height,
scale_w, scale_h
)
logger.info(f"<<< 第 {page_num} 頁完成")
# Save PDF
pdf_canvas.save()
file_size = output_path.stat().st_size
logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
return True
except Exception as e:
logger.error(f"Failed to generate PDF: {e}")
import traceback
traceback.print_exc()
return False
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
"""
Infer the actual page dimensions used during OCR processing from the OCR JSON data.
@@ -717,140 +1032,18 @@ class PDFGeneratorService:
True if successful, False otherwise
"""
try:
# Check if PDF already exists (caching)
if output_path.exists():
logger.info(f"PDF already exists: {output_path.name}")
return True
# Load JSON data
ocr_data = self.load_ocr_json(json_path)
if not ocr_data:
return False
# Get text regions
text_regions = ocr_data.get('text_regions', [])
if not text_regions:
logger.warning("No text regions found in JSON")
return False
# Get images metadata
images_metadata = ocr_data.get('images_metadata', [])
# Get layout data
layout_data = ocr_data.get('layout_data', {})
# Step 1: Get OCR processing dimensions (the large image OCR actually used)
# This comes from analyzing all bbox coordinates in the OCR data
ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")
# Step 2: Get target PDF dimensions (usually the original file size)
# This is what we want the final PDF size to be
if source_file_path:
target_dims = self.get_original_page_size(source_file_path)
if target_dims:
target_width, target_height = target_dims
logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
else:
# If we can't get original size, use OCR dimensions as target
target_width, target_height = ocr_width, ocr_height
logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
else:
# No source file, use OCR dimensions as target (1:1 mapping)
target_width, target_height = ocr_width, ocr_height
logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")
# Step 3: Calculate scale factors to convert OCR coordinates to PDF coordinates
scale_w = target_width / ocr_width
scale_h = target_height / ocr_height
logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f} (OCR座標 → PDF座標)")
# Create PDF canvas with target dimensions
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# *** Key fix: collect all regions that must be avoided (tables + images) ***
# Note: the OCR JSON has no top-level 'tables' or 'image_regions' fields
# Key findings:
# - table elements in layout_data.elements have no bbox (all empty lists)
# - images_metadata contains all tables and images, with correct bboxes
# - therefore, filtering text against images_metadata alone is sufficient
# Use images_metadata as the regions to avoid (table images plus other images)
regions_to_avoid = images_metadata
table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()])
other_count = len(images_metadata) - table_count
logger.info(f"使用 images_metadata 過濾文字區域:")
logger.info(f" - 表格圖片: {table_count}")
logger.info(f" - 其他圖片: {other_count}")
logger.info(f" - 總計需要避免的區域: {len(regions_to_avoid)}")
# 使用新的過濾函式過濾文字區域
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
# Group regions by page
pages_data = {}
for region in filtered_text_regions:
page_num = region.get('page', 1)
if page_num not in pages_data:
pages_data[page_num] = []
pages_data[page_num].append(region)
# Get table elements from layout_data
table_elements = []
if layout_data and layout_data.get('elements'):
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
# Process each page
total_pages = ocr_data.get('total_pages', 1)
logger.info(f"=" * 70)
logger.info(f"開始處理 {total_pages} 頁 PDF")
logger.info(f"=" * 70)
for page_num in range(1, total_pages + 1):
logger.info(f"\n>>> 處理第 {page_num}/{total_pages}")
if page_num > 1:
pdf_canvas.showPage() # Start new page
# Get filtered regions for this page
page_text_regions = pages_data.get(page_num, [])
page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1]
page_image_regions = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()]
# Drawing order: images (bottom layer) → tables (middle layer) → text (top layer)
# 1. Draw images first (bottom layer)
logger.info(f"{page_num} 頁: 繪製 {len(page_image_regions)} 個圖片")
for img_meta in page_image_regions:
self.draw_image_region(
pdf_canvas,
img_meta,
target_height,
json_path.parent,
scale_w,
scale_h
)
# 2. Draw tables (middle layer)
logger.info(f"{page_num} 頁: 繪製 {len(page_table_regions)} 個表格")
for table_elem in page_table_regions:
self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h)
# 3. Draw text regions last (top layer) - excluding table text
logger.info(f"{page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域")
for i, region in enumerate(page_text_regions, 1):
logger.debug(f" 文字 {i}/{len(page_text_regions)}")
self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h)
logger.info(f"<<< 第 {page_num} 頁完成")
# Save PDF
pdf_canvas.save()
file_size = output_path.stat().st_size
logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
return True
# Use internal generation with pre-loaded data
return self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path,
json_parent_dir=json_path.parent
)
except Exception as e:
logger.error(f"Failed to generate PDF: {e}")


@@ -63,10 +63,10 @@
- [x] 4.2.1 Define standardized JSON schema
- [x] 4.2.2 Include processing metadata
- [x] 4.2.3 Support both track outputs
- [ ] 4.3 Update PDF generator for UnifiedDocument
- [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
- [ ] 4.3.2 Preserve layout from both tracks
- [ ] 4.3.3 Handle coordinate transformations
- [x] 4.3 Update PDF generator for UnifiedDocument
- [x] 4.3.1 Adapt PDF generation to use UnifiedDocument
- [x] 4.3.2 Preserve layout from both tracks
- [x] 4.3.3 Handle coordinate transformations
## 5. Translation System Foundation
- [ ] 5.1 Create TranslationEngine interface