feat: implement Phase 1 of PDF layout restoration

Implement critical fixes for image and table rendering in PDF generation.

**Image Handling Fixes**:
- Implemented _save_image() in pp_structure_enhanced.py
  - Creates imgs/ subdirectory for saved images
  - Handles both file paths and numpy arrays
  - Returns relative path for reference
  - Adds proper error handling and logging
- Added saved_path field to image elements for path tracking
- Created _get_image_path() helper with fallback logic
  - Checks saved_path, path, image_path in content
  - Falls back to metadata fields
  - Returns None when no path is found; the caller logs a warning

**Table Rendering Fixes**:
- Fixed table rendering to use element's own bbox directly
  - No longer depends on fake table_*.png references
  - Supports both bbox and bbox_polygon formats
  - Inline conversion for different bbox formats
- Maintains backward compatibility with legacy approach
- Improved error handling for missing bbox data

**Status**:
- Phase 1 tasks 1.1 and 1.2: Completed
- Phase 1 tasks 2.1, 2.2, and 2.3: Completed
- Testing is pending until the backend is available

These fixes resolve the critical issues where images never appeared
and tables never rendered in generated PDFs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-24 07:16:31 +08:00
parent cf894b076e
commit 0aff468c51
3 changed files with 168 additions and 55 deletions

View File

@@ -149,6 +149,42 @@ class PDFGeneratorService:
logger.error(f"Failed to load JSON {json_path}: {e}")
return None
def _get_image_path(self, element) -> Optional[str]:
    """
    Get image path with fallback logic.

    Checks multiple locations in order and returns the first non-empty value:
    1. element.content["saved_path"] - Direct track saved path
    2. element.content["path"] - Legacy path
    3. element.content["image_path"] - Alternative path
    4. element.saved_path - Direct attribute
    5. element.metadata["path"] - Metadata fallback
    6. element.metadata["saved_path"] - Metadata fallback

    A key that is present but holds an empty value ('' or None) is skipped,
    so that a later location can still supply the path.

    Args:
        element: DocumentElement object

    Returns:
        Path to image file or None if not found
    """
    # Check content dictionary; content may also be a non-dict value.
    content = getattr(element, 'content', None)
    if isinstance(content, dict):
        for key in ('saved_path', 'path', 'image_path'):
            candidate = content.get(key)
            # Skip empty placeholders instead of returning them, otherwise
            # an empty 'path' would hide a usable 'image_path' or metadata.
            if candidate:
                return candidate

    # Check direct attribute
    saved_path = getattr(element, 'saved_path', None)
    if saved_path:
        return saved_path

    # Check metadata; tolerate elements without a metadata attribute.
    metadata = getattr(element, 'metadata', None)
    if isinstance(metadata, dict):
        for key in ('path', 'saved_path'):
            candidate = metadata.get(key)
            if candidate:
                return candidate

    return None
def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict:
"""
Convert UnifiedDocument to OCR data format for PDF generation.
@@ -227,18 +263,20 @@ class PDFGeneratorService:
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO
]:
# Get image path from content or metadata
if isinstance(element.content, dict):
image_path = element.content.get('path', '')
else:
image_path = element.metadata.get('path', f"image_{element.element_id}.png")
# Get image path using fallback logic
image_path = self._get_image_path(element)
images_metadata.append({
'image_path': image_path,
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based
'type': element.type.value
})
# Only add if we found a valid path
if image_path:
images_metadata.append({
'image_path': image_path,
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based
'type': element.type.value
})
logger.debug(f"Found image path: {image_path} for element {element.element_id}")
else:
logger.warning(f"No image path found for visual element {element.element_id}")
# Build OCR data structure
ocr_data = {
@@ -833,25 +871,55 @@ class PDFGeneratorService:
if not rows:
return
# Find corresponding table image to get bbox
table_bbox = None
for img_meta in images_metadata:
img_path = img_meta.get('image_path', '')
if 'table' in img_path.lower():
bbox = img_meta.get('bbox', [])
if bbox and len(bbox) >= 4:
table_bbox = bbox
break
# Get bbox directly from table element
table_bbox = table_element.get('bbox')
# If no bbox directly, check for bbox_polygon
if not table_bbox:
bbox_polygon = table_element.get('bbox_polygon')
if bbox_polygon and len(bbox_polygon) >= 4:
# Convert polygon format to simple bbox [x0, y0, x1, y1]
table_bbox = [
bbox_polygon[0][0], # x0
bbox_polygon[0][1], # y0
bbox_polygon[2][0], # x1
bbox_polygon[2][1] # y1
]
# Final fallback: check images_metadata (for backward compatibility)
if not table_bbox:
for img_meta in images_metadata:
img_path = img_meta.get('image_path', '')
if 'table' in img_path.lower() and img_meta.get('type') == 'table':
bbox = img_meta.get('bbox', [])
if bbox and len(bbox) >= 4:
table_bbox = bbox
break
if not table_bbox:
logger.warning("No bbox found for table")
logger.warning("No bbox found for table element")
return
# Extract bbox coordinates
ocr_x_left_raw = table_bbox[0][0]
ocr_y_top_raw = table_bbox[0][1]
ocr_x_right_raw = table_bbox[2][0]
ocr_y_bottom_raw = table_bbox[2][1]
# Handle different bbox formats
if isinstance(table_bbox, list) and len(table_bbox) == 4:
# Simple bbox format [x0, y0, x1, y1]
if isinstance(table_bbox[0], (int, float)):
ocr_x_left_raw = table_bbox[0]
ocr_y_top_raw = table_bbox[1]
ocr_x_right_raw = table_bbox[2]
ocr_y_bottom_raw = table_bbox[3]
# Polygon format [[x,y], [x,y], [x,y], [x,y]]
elif isinstance(table_bbox[0], list):
ocr_x_left_raw = table_bbox[0][0]
ocr_y_top_raw = table_bbox[0][1]
ocr_x_right_raw = table_bbox[2][0]
ocr_y_bottom_raw = table_bbox[2][1]
else:
logger.error(f"Unexpected bbox format: {table_bbox}")
return
else:
logger.error(f"Invalid table_bbox format: {table_bbox}")
return
logger.info(f"[表格] OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")