Added support for preserving and rendering inline style variations within text elements (e.g., bold/italic/color changes mid-line).

Span Extraction (direct_extraction_engine.py):
1. Parse PyMuPDF span data with font, size, flags, color per span
2. Create DocumentElement children for each span with StyleInfo
3. Store spans in element.children for downstream rendering
4. Extract span-specific bbox from PyMuPDF (lines 434-453)

Span Rendering (pdf_generator_service.py):
1. Implement _draw_text_with_spans() method (lines 1685-1734)
   - Iterate through span children
   - Apply per-span styling via _apply_text_style
   - Track X position and calculate widths
   - Return total rendered width
2. Integrate in _draw_text_element_direct() (lines 1822-1823, 1905-1914)
   - Check for element.children (has_spans flag)
   - Use span rendering for first line
   - Fall back to normal rendering for list items
3. Add span count to debug logging

Features:
- Inline font changes (Arial → Times → Courier)
- Inline size changes (12pt → 14pt → 10pt)
- Inline style changes (normal → bold → italic)
- Inline color changes (black → red → blue)

Limitations (future work):
- Currently renders all spans on first line only
- Multi-line span support requires line breaking logic
- List items use single-style rendering (compatibility)

Direct track only (OCR track has no span information).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2082 lines
82 KiB
Python
2082 lines
82 KiB
Python
"""
|
||
Layout-Preserving PDF Generation Service
|
||
Generates PDF files that preserve the original document layout using OCR JSON data
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional, Tuple, Union
|
||
from datetime import datetime
|
||
|
||
from reportlab.lib.pagesizes import A4, letter
|
||
from reportlab.lib.units import mm
|
||
from reportlab.pdfgen import canvas
|
||
from reportlab.pdfbase import pdfmetrics
|
||
from reportlab.pdfbase.ttfonts import TTFont
|
||
from reportlab.platypus import Table, TableStyle
|
||
from reportlab.lib import colors
|
||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
||
from reportlab.platypus import Paragraph
|
||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||
from PIL import Image
|
||
from html.parser import HTMLParser
|
||
|
||
from app.core.config import settings
|
||
|
||
# Import UnifiedDocument for dual-track support
|
||
try:
|
||
from app.models.unified_document import (
|
||
UnifiedDocument, DocumentElement, ElementType,
|
||
BoundingBox, TableData, ProcessingTrack
|
||
)
|
||
UNIFIED_DOCUMENT_AVAILABLE = True
|
||
except ImportError:
|
||
UNIFIED_DOCUMENT_AVAILABLE = False
|
||
UnifiedDocument = None
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class HTMLTableParser(HTMLParser):
    """Parse HTML table markup into a plain rows/cells structure.

    Feed HTML with ``feed()``; completed tables are collected in
    ``self.tables``. Each table is ``{'rows': [...]}``, each row is
    ``{'cells': [...]}`` and each cell is a dict with the keys ``text``,
    ``is_header``, ``colspan`` and ``rowspan``. Tables that end without any
    rows are dropped.

    Parsing state is flat: a nested ``<table>`` start tag replaces the table
    currently being built.
    """

    def __init__(self):
        super().__init__()
        self.tables = []           # finished tables, in document order
        self.current_table = None  # table being built, or None
        self.current_row = None    # row being built, or None
        self.current_cell = None   # cell being built, or None
        self.in_table = False      # True between <table> and </table>

    @staticmethod
    def _parse_span(raw_value) -> int:
        """Return a usable colspan/rowspan from a raw attribute value.

        Valueless attributes (``<td colspan>`` yields ``None``), non-numeric
        text and non-positive numbers all fall back to 1, so malformed markup
        cannot abort parsing of the whole table.
        """
        try:
            span = int(raw_value)
        except (TypeError, ValueError):
            return 1
        return max(span, 1)

    def handle_starttag(self, tag, attrs):
        """Open a table, row or cell depending on *tag*."""
        attrs_dict = dict(attrs)

        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}

        elif tag == 'tr' and self.in_table:
            self.current_row = {'cells': []}

        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1)),
            }

    def handle_endtag(self, tag):
        """Close the current cell, row or table and attach it to its parent."""
        if tag == 'table' and self.in_table:
            # Only keep tables that actually collected rows.
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False

        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def handle_data(self, data):
        """Append stripped text (plus one trailing space) to the open cell."""
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '
|
||
|
||
|
||
class PDFGeneratorService:
|
||
"""Service for generating layout-preserving PDFs from OCR JSON data"""
|
||
|
||
# Lookup table translating common desktop font families to the built-in
# PDF base font used in their place (Helvetica / Times / Courier families).
FONT_MAPPING = dict([
    ('Arial', 'Helvetica'),
    ('Arial Black', 'Helvetica-Bold'),
    ('Times New Roman', 'Times-Roman'),
    ('Times', 'Times-Roman'),
    ('Courier New', 'Courier'),
    ('Courier', 'Courier'),
    ('Calibri', 'Helvetica'),
    ('Cambria', 'Times-Roman'),
    ('Georgia', 'Times-Roman'),
    ('Verdana', 'Helvetica'),
    ('Tahoma', 'Helvetica'),
    ('Trebuchet MS', 'Helvetica'),
    ('Comic Sans MS', 'Helvetica'),
    ('Impact', 'Helvetica-Bold'),
    ('Lucida Console', 'Courier'),
    ('Palatino', 'Times-Roman'),
    ('Garamond', 'Times-Roman'),
    ('Bookman', 'Times-Roman'),
    ('Century Gothic', 'Helvetica'),
    ('Franklin Gothic', 'Helvetica'),
])

# Bit flags describing text formatting; combine with bitwise OR.
STYLE_FLAG_BOLD = 1 << 0
STYLE_FLAG_ITALIC = 1 << 1
STYLE_FLAG_UNDERLINE = 1 << 2
STYLE_FLAG_STRIKETHROUGH = 1 << 3
|
||
|
||
def __init__(self):
|
||
"""Initialize PDF generator with font configuration"""
|
||
self.font_name = 'NotoSansSC'
|
||
self.font_path = None
|
||
self.font_registered = False
|
||
self.current_processing_track = None # Track type for current document
|
||
|
||
self._register_chinese_font()
|
||
|
||
def _register_chinese_font(self):
    """Register the configured Chinese TrueType font with ReportLab.

    The font location comes from ``settings.chinese_font_path``. A relative
    path is resolved against the directory four levels above this file.
    On success ``font_path`` and ``font_registered`` are updated. A missing
    font file is logged and leaves the state untouched; any exception is
    logged and sets ``font_registered`` to ``False``.
    """
    try:
        configured_path = Path(settings.chinese_font_path)

        if configured_path.is_absolute():
            font_path = configured_path
        else:
            # Climb four directory levels from this module to reach the
            # directory the relative setting is anchored at
            # (presumably the project root - confirm against settings).
            anchor = Path(__file__).resolve()
            for _ in range(4):
                anchor = anchor.parent
            font_path = anchor / configured_path

        if font_path.exists():
            pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
            self.font_path = font_path
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {font_path}")
        else:
            logger.error(f"Chinese font not found at {font_path}")

    except Exception as e:
        logger.error(f"Failed to register Chinese font: {e}")
        self.font_registered = False
|
||
|
||
def _parse_color(self, color_value) -> Tuple[float, float, float]:
|
||
"""
|
||
Parse color value to RGB tuple.
|
||
|
||
Args:
|
||
color_value: Color as hex string (#RRGGBB), RGB tuple, or color name
|
||
|
||
Returns:
|
||
RGB tuple with values 0-1 for ReportLab
|
||
"""
|
||
if not color_value:
|
||
return (0, 0, 0) # Default to black
|
||
|
||
try:
|
||
# Handle hex color (#RRGGBB or #RGB)
|
||
if isinstance(color_value, str) and color_value.startswith('#'):
|
||
hex_color = color_value.lstrip('#')
|
||
|
||
# Expand short form (#RGB -> #RRGGBB)
|
||
if len(hex_color) == 3:
|
||
hex_color = ''.join([c*2 for c in hex_color])
|
||
|
||
if len(hex_color) == 6:
|
||
r = int(hex_color[0:2], 16) / 255.0
|
||
g = int(hex_color[2:4], 16) / 255.0
|
||
b = int(hex_color[4:6], 16) / 255.0
|
||
return (r, g, b)
|
||
|
||
# Handle RGB tuple or list
|
||
elif isinstance(color_value, (tuple, list)) and len(color_value) >= 3:
|
||
r, g, b = color_value[0:3]
|
||
# Normalize to 0-1 if values are 0-255
|
||
if any(v > 1 for v in [r, g, b]):
|
||
return (r/255.0, g/255.0, b/255.0)
|
||
return (r, g, b)
|
||
|
||
except (ValueError, TypeError) as e:
|
||
logger.warning(f"Failed to parse color {color_value}: {e}")
|
||
|
||
# Default to black
|
||
return (0, 0, 0)
|
||
|
||
def _map_font(self, font_name: Optional[str]) -> str:
|
||
"""
|
||
Map font name to PDF standard font.
|
||
|
||
Args:
|
||
font_name: Original font name
|
||
|
||
Returns:
|
||
PDF standard font name
|
||
"""
|
||
if not font_name:
|
||
return 'Helvetica'
|
||
|
||
# Direct lookup
|
||
if font_name in self.FONT_MAPPING:
|
||
return self.FONT_MAPPING[font_name]
|
||
|
||
# Case-insensitive lookup
|
||
font_lower = font_name.lower()
|
||
for orig_font, pdf_font in self.FONT_MAPPING.items():
|
||
if orig_font.lower() == font_lower:
|
||
return pdf_font
|
||
|
||
# Partial match for common patterns
|
||
if 'arial' in font_lower:
|
||
return 'Helvetica'
|
||
elif 'times' in font_lower:
|
||
return 'Times-Roman'
|
||
elif 'courier' in font_lower:
|
||
return 'Courier'
|
||
|
||
# Default fallback
|
||
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
|
||
return 'Helvetica'
|
||
|
||
def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12):
    """
    Configure the canvas font and fill color from a style description.

    Args:
        c: ReportLab canvas object
        style_info: Object exposing ``font`` / ``size`` / ``color`` /
            ``flags`` attributes, or a dict with those keys. Falsy values
            and unsupported types select the default style.
        default_size: Font size used when the style gives none (or a
            non-positive one)

    Any error raised while applying the style is logged and the default
    style (Helvetica, black, ``default_size``) is applied instead.
    """
    def use_default_style():
        c.setFont('Helvetica', default_size)
        c.setFillColorRGB(0, 0, 0)

    if not style_info:
        use_default_style()
        return

    try:
        # Pick an accessor matching the shape of style_info.
        if hasattr(style_info, '__dict__'):
            def read(key, fallback=None):
                return getattr(style_info, key, fallback)
        elif isinstance(style_info, dict):
            read = style_info.get
        else:
            # Unknown format, use defaults
            use_default_style()
            return

        font_family = read('font', None)
        font_size = read('size', default_size)
        color = read('color', None)
        flags = read('flags', 0)

        base_font = self._map_font(font_family) if font_family else 'Helvetica'

        # Swap in the bold / italic variant of the base family when the
        # style flags ask for it; unknown families keep their base font.
        if flags:
            is_bold = bool(flags & self.STYLE_FLAG_BOLD)
            is_italic = bool(flags & self.STYLE_FLAG_ITALIC)

            variant_table = {
                (True, True): {
                    'Helvetica': 'Helvetica-BoldOblique',
                    'Times': 'Times-BoldItalic',
                    'Courier': 'Courier-BoldOblique',
                },
                (True, False): {
                    'Helvetica': 'Helvetica-Bold',
                    'Times': 'Times-Bold',
                    'Courier': 'Courier-Bold',
                },
                (False, True): {
                    'Helvetica': 'Helvetica-Oblique',
                    'Times': 'Times-Italic',
                    'Courier': 'Courier-Oblique',
                },
            }
            variants = variant_table.get((is_bold, is_italic))
            if variants is not None:
                for family in ('Helvetica', 'Times', 'Courier'):
                    if family in base_font:
                        base_font = variants[family]
                        break

        # Font and size
        if font_size and font_size > 0:
            actual_size = font_size
        else:
            actual_size = default_size

        try:
            c.setFont(base_font, actual_size)
        except KeyError:
            # Font not available, fallback
            logger.warning(f"Font '{base_font}' not available, using Helvetica")
            c.setFont('Helvetica', actual_size)

        # Fill color (black unless the style names one)
        if color:
            red, green, blue = self._parse_color(color)
            c.setFillColorRGB(red, green, blue)
        else:
            c.setFillColorRGB(0, 0, 0)

    except Exception as e:
        logger.error(f"Failed to apply text style: {e}")
        use_default_style()
|
||
|
||
def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
    """
    Read an OCR result file and decode it as JSON.

    Args:
        json_path: Path to JSON file (read as UTF-8)

    Returns:
        Parsed JSON data, or None if reading or parsing failed
        (the failure is logged)
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        parsed = json.loads(raw_text)

        logger.info(f"Loaded OCR JSON: {json_path.name}")
        return parsed

    except Exception as e:
        logger.error(f"Failed to load JSON {json_path}: {e}")
        return None
|
||
|
||
def _get_image_path(self, element) -> Optional[str]:
|
||
"""
|
||
Get image path with fallback logic.
|
||
|
||
Checks multiple locations in order:
|
||
1. element.content["saved_path"] - Direct track saved path
|
||
2. element.content["path"] - Legacy path
|
||
3. element.content["image_path"] - Alternative path
|
||
4. element.saved_path - Direct attribute
|
||
5. element.metadata["path"] - Metadata fallback
|
||
|
||
Args:
|
||
element: DocumentElement object
|
||
|
||
Returns:
|
||
Path to image file or None if not found
|
||
"""
|
||
# Check content dictionary
|
||
if isinstance(element.content, dict):
|
||
for key in ['saved_path', 'path', 'image_path']:
|
||
if key in element.content:
|
||
return element.content[key]
|
||
|
||
# Check direct attribute
|
||
if hasattr(element, 'saved_path') and element.saved_path:
|
||
return element.saved_path
|
||
|
||
# Check metadata
|
||
if element.metadata and isinstance(element.metadata, dict):
|
||
if 'path' in element.metadata:
|
||
return element.metadata['path']
|
||
if 'saved_path' in element.metadata:
|
||
return element.metadata['saved_path']
|
||
|
||
return None
|
||
|
||
def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict:
    """
    Convert UnifiedDocument to OCR data format for PDF generation.

    This method transforms the UnifiedDocument structure into the legacy
    OCR data format that the PDF generator expects, supporting both
    OCR and DIRECT processing tracks.

    Page numbering differs per output list: ``text_regions`` keep the
    1-based page number, while ``layout_data`` elements and
    ``images_metadata`` use 0-based page indices.

    Args:
        unified_doc: UnifiedDocument object from either processing track

    Returns:
        Dictionary in OCR data format with text_regions, images_metadata,
        layout_data, total_pages, ocr_dimensions and tracking metadata
    """
    # Element type groups, built once instead of once per element.
    text_types = [
        ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
        ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.CAPTION,
        ElementType.LIST_ITEM, ElementType.FOOTNOTE, ElementType.REFERENCE
    ]
    visual_types = [
        ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
        ElementType.DIAGRAM, ElementType.LOGO
    ]

    text_regions = []
    images_metadata = []
    layout_elements = []

    for page in unified_doc.pages:
        page_num = page.page_number  # 1-based

        for element in page.elements:
            # Convert BoundingBox to polygon format [[x,y], [x,y], [x,y], [x,y]]
            bbox_polygon = [
                [element.bbox.x0, element.bbox.y0],  # top-left
                [element.bbox.x1, element.bbox.y0],  # top-right
                [element.bbox.x1, element.bbox.y1],  # bottom-right
                [element.bbox.x0, element.bbox.y1],  # bottom-left
            ]

            # Handle text elements
            if element.is_text or element.type in text_types:
                text_content = element.get_text()
                if text_content:
                    text_region = {
                        'text': text_content,
                        'bbox': bbox_polygon,
                        'confidence': element.confidence or 1.0,
                        'page': page_num
                    }

                    # Include style information if available (for Direct track)
                    if hasattr(element, 'style') and element.style:
                        text_region['style'] = element.style

                    text_regions.append(text_region)

            # Handle table elements
            elif element.type == ElementType.TABLE:
                # Convert TableData to HTML for layout_data
                if isinstance(element.content, TableData):
                    html_content = element.content.to_html()
                elif isinstance(element.content, dict):
                    html_content = element.content.get('html', str(element.content))
                else:
                    html_content = str(element.content)

                layout_elements.append({
                    'type': 'table',
                    'content': html_content,
                    'bbox': [element.bbox.x0, element.bbox.y0,
                             element.bbox.x1, element.bbox.y1],
                    'page': page_num - 1  # layout uses 0-based
                })

                # Add bbox to images_metadata for text overlap filtering
                # (no actual image file, just bbox for filtering)
                images_metadata.append({
                    'image_path': None,  # No fake table image
                    'bbox': bbox_polygon,
                    'page': page_num - 1,  # 0-based for images_metadata
                    'type': 'table',
                    'element_id': element.element_id
                })

            # Handle image/visual elements
            elif element.is_visual or element.type in visual_types:
                # Get image path using fallback logic
                image_path = self._get_image_path(element)

                # Only add if we found a valid path
                if image_path:
                    images_metadata.append({
                        'image_path': image_path,
                        'bbox': bbox_polygon,
                        'page': page_num - 1,  # 0-based
                        'type': element.type.value
                    })
                    logger.debug(f"Found image path: {image_path} for element {element.element_id}")
                else:
                    logger.warning(f"No image path found for visual element {element.element_id}")

    # Resolve the processing track defensively. The caller routes documents
    # with missing metadata, dict metadata or a plain-string track to the
    # OCR path, so none of those shapes may crash the conversion here.
    metadata = getattr(unified_doc, 'metadata', None)
    if isinstance(metadata, dict):
        track = metadata.get('processing_track')
    else:
        track = getattr(metadata, 'processing_track', None)
    # Enum members expose .value; plain strings / None are used as-is.
    track_value = getattr(track, 'value', track)

    first_page = unified_doc.pages[0] if unified_doc.pages else None

    # Build OCR data structure
    ocr_data = {
        'text_regions': text_regions,
        'images_metadata': images_metadata,
        'layout_data': {
            'elements': layout_elements,
            'total_elements': len(layout_elements)
        },
        'total_pages': unified_doc.page_count,
        'ocr_dimensions': {
            'width': first_page.dimensions.width if first_page else 0,
            'height': first_page.dimensions.height if first_page else 0
        },
        # Metadata for tracking
        '_from_unified_document': True,
        '_processing_track': track_value
    }

    logger.info(f"Converted UnifiedDocument to OCR data: "
                f"{len(text_regions)} text regions, "
                f"{len(images_metadata)} images, "
                f"{len(layout_elements)} layout elements, "
                f"track={track_value}")

    return ocr_data
|
||
|
||
def generate_from_unified_document(
    self,
    unified_doc: 'UnifiedDocument',
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate layout-preserving PDF directly from UnifiedDocument.

    The processing track is read from ``unified_doc.metadata`` (either an
    object with a ``processing_track`` attribute or a dict with that key).
    Direct-track documents go to ``_generate_direct_track_pdf``; everything
    else, including documents without track information, goes to
    ``_generate_ocr_track_pdf``.

    Args:
        unified_doc: UnifiedDocument object
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file

    Returns:
        True if successful, False otherwise
    """
    if not UNIFIED_DOCUMENT_AVAILABLE:
        logger.error("UnifiedDocument support not available")
        return False

    try:
        # Detect processing track for track-specific rendering
        processing_track = None
        if hasattr(unified_doc, 'metadata') and unified_doc.metadata:
            if hasattr(unified_doc.metadata, 'processing_track'):
                processing_track = unified_doc.metadata.processing_track
            elif isinstance(unified_doc.metadata, dict):
                processing_track = unified_doc.metadata.get('processing_track')

        # Accept both the raw string and the enum member.
        is_direct_track = (processing_track == 'direct' or
                           processing_track == ProcessingTrack.DIRECT)

        logger.info(f"Processing track: {processing_track}, using {'Direct' if is_direct_track else 'OCR'} track rendering")

        if is_direct_track:
            # Direct track: Rich formatting preservation
            renderer = self._generate_direct_track_pdf
        else:
            # OCR track: Simplified rendering (backward compatible)
            renderer = self._generate_ocr_track_pdf

        return renderer(
            unified_doc=unified_doc,
            output_path=output_path,
            source_file_path=source_file_path
        )

    except Exception as e:
        # logger.exception records the traceback through the logging
        # configuration instead of writing it straight to stderr.
        logger.exception(f"Failed to generate PDF from UnifiedDocument: {e}")
        return False
|
||
|
||
def _generate_direct_track_pdf(
    self,
    unified_doc: 'UnifiedDocument',
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate PDF with rich formatting preservation for Direct track.

    This method processes UnifiedDocument directly without converting to
    legacy OCR format. Elements of each page are split into images, tables,
    list items and text and drawn in that order. Each output page uses that
    page's own width/height (falling back to the first page's size when a
    page reports none), so mixed-size documents keep correct coordinates.

    Elements detected as list items by ``_is_list_item_fallback`` have
    their ``type`` rewritten to ``ElementType.LIST_ITEM`` in place.

    Args:
        unified_doc: UnifiedDocument from Direct extraction
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file
            (accepted for interface symmetry; not used here)

    Returns:
        True if successful, False otherwise
    """
    try:
        logger.info("=== Direct Track PDF Generation ===")
        logger.info(f"Total pages: {len(unified_doc.pages)}")

        # Set current track for helper methods (cleared in ``finally``)
        self.current_processing_track = 'direct'

        if not unified_doc.pages:
            logger.error("No pages in document")
            return False

        # The first page's size initialises the canvas and serves as the
        # fallback for pages that do not report a size.
        first_page = unified_doc.pages[0]
        page_width = first_page.width
        page_height = first_page.height

        logger.info(f"Page dimensions: {page_width} x {page_height}")

        # Create PDF canvas with source dimensions
        current_size = (page_width, page_height)
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=current_size)

        visual_types = [
            ElementType.IMAGE, ElementType.FIGURE,
            ElementType.CHART, ElementType.DIAGRAM
        ]
        text_types = [
            ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
            ElementType.FOOTER, ElementType.PARAGRAPH
        ]

        total_pages = len(unified_doc.pages)
        for page_idx, page in enumerate(unified_doc.pages):
            logger.info(f">>> Processing page {page_idx + 1}/{total_pages}")

            if page_idx > 0:
                pdf_canvas.showPage()

            # Use this page's own size; y coordinates are flipped against
            # the page height, so it must match the page being drawn.
            this_size = (
                getattr(page, 'width', None) or page_width,
                getattr(page, 'height', None) or page_height,
            )
            if this_size != current_size:
                pdf_canvas.setPageSize(this_size)
                current_size = this_size
            this_height = this_size[1]

            # Separate elements by type
            text_elements = []
            table_elements = []
            image_elements = []
            list_elements = []

            for element in page.elements:
                if element.type == ElementType.TABLE:
                    table_elements.append(element)
                elif element.is_visual or element.type in visual_types:
                    image_elements.append(element)
                elif element.type == ElementType.LIST_ITEM:
                    list_elements.append(element)
                elif self._is_list_item_fallback(element):
                    # Fallback detection: Check metadata and text patterns
                    list_elements.append(element)
                    # Mark as list item for downstream processing
                    element.type = ElementType.LIST_ITEM
                elif element.is_text or element.type in text_types:
                    text_elements.append(element)

            logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, "
                        f"{len(table_elements)} tables, {len(image_elements)} images, "
                        f"{len(list_elements)} list items")

            # Draw in layers: images -> tables -> lists -> text

            # 1. Draw images
            for img_elem in image_elements:
                self._draw_image_element_direct(pdf_canvas, img_elem, this_height, output_path.parent)

            # 2. Draw tables
            for table_elem in table_elements:
                self._draw_table_element_direct(pdf_canvas, table_elem, this_height)

            # 3. Draw lists with sequential numbering
            self._draw_list_elements_direct(pdf_canvas, list_elements, this_height)

            # 4. Draw text with line breaks and styling
            for text_elem in text_elements:
                self._draw_text_element_direct(pdf_canvas, text_elem, this_height)

        # Save PDF
        pdf_canvas.save()
        logger.info(f"Direct track PDF saved to {output_path}")
        return True

    except Exception as e:
        # Keep the traceback inside the logging pipeline.
        logger.exception(f"Failed to generate Direct track PDF: {e}")
        return False

    finally:
        # Reset track on every exit path.
        self.current_processing_track = None
|
||
|
||
def _generate_ocr_track_pdf(
    self,
    unified_doc: 'UnifiedDocument',
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate PDF with simplified rendering for OCR track.

    The document is converted to the legacy OCR data dictionary and handed
    to ``_generate_pdf_from_data`` for backward compatibility.
    ``current_processing_track`` is set to ``'ocr'`` for the duration of the
    call and cleared again on every exit path.

    Args:
        unified_doc: UnifiedDocument from OCR processing
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file

    Returns:
        True if successful, False otherwise
    """
    try:
        logger.info("=== OCR Track PDF Generation ===")

        # Set current track
        self.current_processing_track = 'ocr'

        # Convert UnifiedDocument to OCR data format (legacy)
        ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)

        # Use existing generation pipeline
        return self._generate_pdf_from_data(
            ocr_data=ocr_data,
            output_path=output_path,
            source_file_path=source_file_path
        )

    except Exception as e:
        # Keep the traceback inside the logging pipeline.
        logger.exception(f"Failed to generate OCR track PDF: {e}")
        return False

    finally:
        # Reset track on every exit path.
        self.current_processing_track = None
|
||
|
||
def _generate_pdf_from_data(
    self,
    ocr_data: Dict,
    output_path: Path,
    source_file_path: Optional[Path] = None,
    json_parent_dir: Optional[Path] = None
) -> bool:
    """
    Internal method to generate PDF from OCR data dictionary.

    This is the core generation logic extracted for reuse by both
    JSON-based and UnifiedDocument-based generation paths.

    Steps: infer the coordinate-system size used by the OCR data, pick the
    target page size (from the source file when available), derive X/Y
    scale factors, drop text overlapping tables/images, then draw each page
    in layers (images, tables, text). An already existing ``output_path``
    is treated as a cache hit and returned without regenerating.

    Args:
        ocr_data: OCR data dictionary
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file
        json_parent_dir: Directory containing images (for JSON-based
            generation); defaults to the output file's directory

    Returns:
        True if successful, False otherwise
    """
    try:
        # Check if PDF already exists (caching)
        if output_path.exists():
            logger.info(f"PDF already exists: {output_path.name}")
            return True

        # Get text regions
        text_regions = ocr_data.get('text_regions', [])
        if not text_regions:
            # Don't fail - might have only tables/images
            logger.warning("No text regions found in data")

        images_metadata = ocr_data.get('images_metadata', [])
        layout_data = ocr_data.get('layout_data', {})

        # Step 1: Get OCR processing dimensions. The source file is
        # deliberately not passed: this size must describe the coordinate
        # system of the OCR data, not the original document.
        ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None)
        logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}")

        # Step 2: Get target PDF dimensions
        if source_file_path:
            target_dims = self.get_original_page_size(source_file_path)
            if target_dims:
                target_width, target_height = target_dims
                logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}")
            else:
                target_width, target_height = ocr_width, ocr_height
                logger.warning("無法獲取原始文件尺寸,使用 OCR 尺寸作為目標")
        else:
            target_width, target_height = ocr_width, ocr_height
            logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}")

        # Step 3: Calculate scale factors
        scale_w = target_width / ocr_width if ocr_width > 0 else 1.0
        scale_h = target_height / ocr_height if ocr_height > 0 else 1.0
        logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}")

        # Create PDF canvas
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

        # Filter text regions to avoid overlap with tables/images
        regions_to_avoid = images_metadata
        table_count = sum(1 for img in images_metadata if img.get('type') == 'table')

        logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (含 {table_count} 個表格)")

        filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)

        # Group regions by page (text regions use 1-based page numbers)
        pages_data = {}
        for region in filtered_text_regions:
            pages_data.setdefault(region.get('page', 1), []).append(region)

        # Get table elements from layout_data
        table_elements = []
        if layout_data and layout_data.get('elements'):
            table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']

        # Process each page
        total_pages = ocr_data.get('total_pages', 1)
        logger.info(f"開始處理 {total_pages} 頁 PDF")

        # Determine image directory
        if json_parent_dir is None:
            json_parent_dir = output_path.parent

        for page_num in range(1, total_pages + 1):
            logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")
            if page_num > 1:
                pdf_canvas.showPage()

            # Tables and images carry 0-based page indices.
            page_index = page_num - 1
            page_text_regions = pages_data.get(page_num, [])
            page_table_regions = [t for t in table_elements if t.get('page') == page_index]
            page_image_regions = [
                img for img in images_metadata
                if img.get('page') == page_index
                and img.get('type') != 'table'
                and img.get('image_path') is not None  # Skip table placeholders
            ]

            # Draw in layers: images -> tables -> text

            # 1. Draw images (bottom layer)
            for img_meta in page_image_regions:
                self.draw_image_region(
                    pdf_canvas, img_meta, target_height,
                    json_parent_dir, scale_w, scale_h
                )

            # 2. Draw tables (middle layer)
            for table_elem in page_table_regions:
                self.draw_table_region(
                    pdf_canvas, table_elem, images_metadata,
                    target_height, scale_w, scale_h
                )

            # 3. Draw text (top layer)
            for region in page_text_regions:
                self.draw_text_region(
                    pdf_canvas, region, target_height,
                    scale_w, scale_h
                )

            logger.info(f"<<< 第 {page_num} 頁完成")

        # Save PDF
        pdf_canvas.save()

        file_size = output_path.stat().st_size
        logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
        return True

    except Exception as e:
        # Keep the traceback inside the logging pipeline.
        logger.exception(f"Failed to generate PDF: {e}")
        return False
|
||
|
||
def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]:
    """
    Infer the page size of the coordinate system used by the OCR data.

    This matters because OCR may have run on a high-resolution image, so
    the bbox coordinates need not match the original document's size. The
    size is taken as the largest x and y found in any bbox across
    ``text_regions``, ``image_regions``, ``tables``, ``layout`` and
    ``layout_data['elements']``.

    Args:
        ocr_data: Complete OCR data dictionary with text_regions and layout
        source_file_path: Optional path to source file (fallback only)

    Returns:
        Tuple of (width, height) in points. When no usable bbox exists the
        source file's size is returned if it can be read, otherwise A4.
    """
    def fallback_dimensions():
        # Source-file size when available, else A4.
        if source_file_path:
            dims = self.get_original_page_size(source_file_path)
            if dims:
                return dims
        return A4

    # Different OCR output versions store regions under different keys, so
    # collect every list that may carry bboxes.
    all_regions = []
    for key in ('text_regions', 'image_regions', 'tables', 'layout'):
        if key in ocr_data and isinstance(ocr_data[key], list):
            all_regions.extend(ocr_data[key])

    # layout_data.elements - PP-StructureV3 format
    if 'layout_data' in ocr_data and isinstance(ocr_data['layout_data'], dict):
        elements = ocr_data['layout_data'].get('elements', [])
        if elements:
            all_regions.extend(elements)

    if not all_regions:
        logger.warning("JSON 中沒有找到 text_regions, image_regions, tables, layout 或 layout_data.elements,回退到原始檔案尺寸。")
        return fallback_dimensions()

    max_x = 0
    max_y = 0
    region_count = 0

    for region in all_regions:
        # Fetch the bbox outside the try block so the warning below never
        # refers to an unbound or stale value; non-dict entries are skipped.
        bbox = region.get('bbox') if isinstance(region, dict) else None
        if not bbox:
            continue

        region_count += 1

        try:
            first = bbox[0]
            if isinstance(first, (int, float)):
                # Simple [x1, y1, x2, y2] format
                max_x = max(max_x, bbox[2])
                max_y = max(max_y, bbox[3])
            elif isinstance(first, (list, tuple)):
                # Polygon [[x, y], ...] format
                points = [p for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
                if points:
                    max_x = max(max_x, max(p[0] for p in points))
                    max_y = max(max_y, max(p[1] for p in points))
        except Exception as e:
            # Best effort: one malformed bbox must not abort the scan.
            logger.warning(f"Error processing bbox {bbox}: {e}")

    if max_x > 0 and max_y > 0:
        logger.info(f"從 {region_count} 個區域中推斷出的 OCR 座標系尺寸: {max_x:.1f} x {max_y:.1f}")
        return (max_x, max_y)

    # Only fall back when no bbox could be interpreted at all.
    logger.warning("無法從 bbox 推斷尺寸,回退到原始檔案尺寸。")
    return fallback_dimensions()
|
||
|
||
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
    """
    Extract page dimensions from the original source file.

    Images report their pixel dimensions unchanged (1 pixel is treated as
    1 point); PDFs report the MediaBox size of their first page.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Tuple of (width, height), or None when the file does not exist,
        has an unsupported suffix, or its dimensions cannot be read.
    """
    try:
        if not file_path.exists():
            return None

        suffix = file_path.suffix.lower()

        # For images, get dimensions from PIL
        if suffix in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff'):
            # The context manager closes the underlying file handle as soon
            # as the size has been read, instead of leaving it open.
            with Image.open(file_path) as img:
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return (width_pt, height_pt)

        # For PDFs, extract dimensions using PyPDF2
        if suffix == '.pdf':
            try:
                # Imported lazily so a missing PyPDF2 only disables this branch
                from PyPDF2 import PdfReader
                reader = PdfReader(file_path)
                if len(reader.pages) > 0:
                    page = reader.pages[0]
                    # MediaBox gives [x1, y1, x2, y2] in points
                    mediabox = page.mediabox
                    width_pt = float(mediabox.width)
                    height_pt = float(mediabox.height)
                    logger.info(f"Extracted dimensions from PDF: {width_pt:.1f} x {height_pt:.1f} points")
                    return (width_pt, height_pt)
            except ImportError:
                logger.warning("PyPDF2 not available, cannot extract PDF dimensions")
            except Exception as e:
                logger.warning(f"Failed to extract PDF dimensions: {e}")

    except Exception as e:
        logger.warning(f"Failed to get page size from {file_path}: {e}")

    return None
|
||
|
||
def _get_bbox_coords(self, bbox: Union[List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
|
||
"""將任何 bbox 格式 (多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
|
||
try:
|
||
if isinstance(bbox[0], (list, tuple)):
|
||
# 處理多邊形 [[x, y], ...]
|
||
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||
if not x_coords or not y_coords:
|
||
return None
|
||
return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
|
||
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
|
||
# 處理 [x1, y1, x2, y2]
|
||
return bbox[0], bbox[1], bbox[2], bbox[3]
|
||
else:
|
||
logger.warning(f"未知的 bbox 格式: {bbox}")
|
||
return None
|
||
except Exception as e:
|
||
logger.error(f"解析 bbox {bbox} 時出錯: {e}")
|
||
return None
|
||
|
||
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
    """
    Report whether the 'bbox' of one region lies within that of another.

    Both bboxes are normalized through _get_bbox_coords, so polygons and
    rectangles are accepted. The outer box is treated as if it were grown
    by `tolerance` on every side.

    Args:
        inner_bbox_data: Dict whose 'bbox' is the candidate inner box
        outer_bbox_data: Dict whose 'bbox' is the enclosing box
        tolerance: Slack allowed on each edge

    Returns:
        True if the inner box fits inside the (expanded) outer box;
        False otherwise, or when either bbox cannot be parsed.
    """
    inner = self._get_bbox_coords(inner_bbox_data.get('bbox'))
    outer = self._get_bbox_coords(outer_bbox_data.get('bbox'))

    if not inner or not outer:
        return False

    in_left, in_top, in_right, in_bottom = inner
    out_left, out_top, out_right, out_bottom = outer

    # Every edge of the inner box must stay within the tolerance-padded outer box
    edge_checks = (
        in_left >= out_left - tolerance,
        in_top >= out_top - tolerance,
        in_right <= out_right + tolerance,
        in_bottom <= out_bottom + tolerance,
    )
    return all(edge_checks)
|
||
|
||
def _bbox_overlaps(self, bbox1_data: Dict, bbox2_data: Dict, tolerance: float = 5.0) -> bool:
    """
    Report whether two regions' bboxes overlap, with some slack.

    The second bbox is grown by `tolerance` on every side before the
    comparison; any intersection at all counts as an overlap.

    Args:
        bbox1_data: Dict holding the first 'bbox'
        bbox2_data: Dict holding the second 'bbox' (the one that is expanded)
        tolerance: Amount by which the second bbox is expanded on each side

    Returns:
        True if the boxes overlap; False otherwise, or when either bbox
        cannot be parsed.
    """
    first = self._get_bbox_coords(bbox1_data.get('bbox'))
    second = self._get_bbox_coords(bbox2_data.get('bbox'))

    if not first or not second:
        return False

    a_left, a_top, a_right, a_bottom = first
    b_left, b_top, b_right, b_bottom = second

    # Grow the second box (the table/image region) by the tolerance
    b_left = b_left - tolerance
    b_top = b_top - tolerance
    b_right = b_right + tolerance
    b_bottom = b_bottom + tolerance

    # Two boxes are disjoint exactly when one lies wholly to one side of the other
    disjoint = (
        a_right < b_left        # first is left of second
        or a_left > b_right     # first is right of second
        or a_bottom < b_top     # first is above second
        or a_top > b_bottom     # first is below second
    )
    return not disjoint
|
||
|
||
def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], tolerance: float = 10.0) -> List[Dict]:
    """
    Drop text regions that overlap any region in `regions_to_avoid`.

    Args:
        text_regions: Candidate text regions
        regions_to_avoid: Regions (e.g. tables, images) text must not overlap
        tolerance: Slack forwarded to _bbox_overlaps

    Returns:
        The text regions that overlap none of the avoided regions,
        in their original order.
    """
    kept = []
    removed = 0

    for candidate in text_regions:
        for blocked in regions_to_avoid:
            # Any overlap at all is enough to discard the text region
            if self._bbox_overlaps(candidate, blocked, tolerance=tolerance):
                removed += 1
                logger.debug(f"過濾掉重疊文字: {candidate.get('text', '')[:20]}...")
                break
        else:
            # Inner loop finished without a hit: keep this region
            kept.append(candidate)

    logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(kept)}, 移除: {removed}")
    return kept
|
||
|
||
def draw_text_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw a text region at precise coordinates.

    The bbox may be a polygon [[x, y], ...] (corners read as top-left at
    index 0 and bottom-right at index 2) or a flat [x1, y1, x2, y2] list.
    Failures are logged as warnings and never propagate to the caller.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Text region dict with 'text', 'bbox' and optional 'style'
        page_height: Height of page (for coordinate transformation)
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    text = region.get('text', '')
    bbox = region.get('bbox', [])

    if not text or not bbox or len(bbox) < 4:
        return

    try:
        # OCR coordinates: origin (0,0) at top-left, Y increases downward
        if isinstance(bbox[0], (int, float)):
            # Flat [x1, y1, x2, y2] format
            ocr_x_left, ocr_y_top, ocr_x_right, ocr_y_bottom = bbox[:4]
            # Synthesize the four corners so the debug outline below works
            outline_points = [
                (ocr_x_left, ocr_y_top),
                (ocr_x_right, ocr_y_top),
                (ocr_x_right, ocr_y_bottom),
                (ocr_x_left, ocr_y_bottom),
            ]
        else:
            # Polygon [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            # Points: top-left, top-right, bottom-right, bottom-left
            ocr_x_left = bbox[0][0]    # Left X
            ocr_y_top = bbox[0][1]     # Top Y in OCR coordinates
            ocr_x_right = bbox[2][0]   # Right X
            ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates
            outline_points = bbox

        logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}")

        # Apply scale factors to convert from OCR space to PDF space
        scaled_x_left = ocr_x_left * scale_w
        scaled_y_top = ocr_y_top * scale_h
        scaled_x_right = ocr_x_right * scale_w
        scaled_y_bottom = ocr_y_bottom * scale_h

        logger.info(f"[文字] '{text[:20]}...' 縮放後(scale={scale_w:.3f},{scale_h:.3f}): L={scaled_x_left:.1f}, T={scaled_y_top:.1f}, R={scaled_x_right:.1f}, B={scaled_y_bottom:.1f}")

        # Calculate bbox dimensions (after scaling)
        bbox_width = abs(scaled_x_right - scaled_x_left)
        bbox_height = abs(scaled_y_bottom - scaled_y_top)

        # Heuristic: font size is taken as 75% of the bbox height,
        # clamped between 4pt and 72pt
        font_size = bbox_height * 0.75
        font_size = max(min(font_size, 72), 4)

        # Transform coordinates: OCR (top-left origin) -> PDF (bottom-left origin)
        pdf_x = scaled_x_left
        pdf_y = page_height - scaled_y_bottom  # Flip Y-axis using bottom coordinate

        logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")

        # Set font with track-specific styling
        style_info = region.get('style')
        is_direct_track = (self.current_processing_track == 'direct' or
                           self.current_processing_track == ProcessingTrack.DIRECT)

        if style_info and is_direct_track:
            # Direct track: apply rich styling, then read back whatever font
            # the canvas ended up with so width calculations match
            self._apply_text_style(pdf_canvas, style_info, default_size=font_size)
            font_name = pdf_canvas._fontname
            font_size = pdf_canvas._fontsize
            logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
        else:
            # OCR track or no style: use simple font selection
            font_name = self.font_name if self.font_registered else 'Helvetica'
            pdf_canvas.setFont(font_name, font_size)

        # Handle line breaks (split text by newlines)
        lines = text.split('\n')
        line_height = font_size * 1.2  # 120% of font size for line spacing

        # Draw each line left-aligned
        for i, line in enumerate(lines):
            if not line.strip():
                continue  # Skip empty lines (their vertical slot is kept)

            line_y = pdf_y - (i * line_height)

            # Calculate text width to prevent overflow
            text_width = pdf_canvas.stringWidth(line, font_name, font_size)
            overflows = text_width > bbox_width

            # If text is too wide for bbox, scale down font for this line only
            if overflows:
                scale_factor = bbox_width / text_width
                current_font_size = font_size * scale_factor * 0.95  # 95% to add small margin
                current_font_size = max(current_font_size, 3)  # Minimum 3pt
                pdf_canvas.setFont(font_name, current_font_size)

            pdf_canvas.drawString(pdf_x, line_y, line)

            # Restore the regular size for the next line
            if overflows:
                pdf_canvas.setFont(font_name, font_size)

        # Debug: draw bounding box outline (optional)
        if settings.pdf_enable_bbox_debug:
            pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
            pdf_canvas.setLineWidth(0.5)
            # Transform all bbox points to PDF coordinates (apply scaling first)
            pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in outline_points]
            # Connect the first four points into a closed outline
            for i in range(4):
                x1, y1 = pdf_points[i]
                x2, y2 = pdf_points[(i + 1) % 4]
                pdf_canvas.line(x1, y1, x2, y2)

    except Exception as e:
        logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
|
||
|
||
def draw_table_region(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    images_metadata: List[Dict],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw a table region by parsing HTML and rebuilding with ReportLab Table.

    Only the first table found in the HTML is rendered. Columns share the
    bbox width equally. Failures are logged (with traceback) and never
    propagate to the caller.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict with HTML in 'content' and a
            'bbox' (flat or polygon) or 'bbox_polygon'
        images_metadata: Accepted for interface compatibility; not read
            by this method
        page_height: Height of page
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    try:
        html_content = table_element.get('content', '')
        if not html_content:
            return

        # Parse HTML to extract table structure
        parser = HTMLTableParser()
        parser.feed(html_content)

        if not parser.tables:
            logger.warning("No tables found in HTML content")
            return

        # Get the first table (PP-StructureV3 usually provides one table per element)
        table_data = parser.tables[0]
        rows = table_data['rows']

        if not rows:
            return

        # Get bbox directly from table element
        table_bbox = table_element.get('bbox')

        # If no bbox directly, check for bbox_polygon
        if not table_bbox:
            bbox_polygon = table_element.get('bbox_polygon')
            if bbox_polygon and len(bbox_polygon) >= 4:
                # Convert polygon format to simple bbox [x0, y0, x1, y1]
                table_bbox = [
                    bbox_polygon[0][0],  # x0
                    bbox_polygon[0][1],  # y0
                    bbox_polygon[2][0],  # x1
                    bbox_polygon[2][1]   # y1
                ]

        if not table_bbox:
            logger.warning("No bbox found for table element")
            return

        # Handle different bbox formats (lists and tuples alike)
        if isinstance(table_bbox, (list, tuple)) and len(table_bbox) == 4:
            if isinstance(table_bbox[0], (int, float)):
                # Simple bbox format [x0, y0, x1, y1]
                ocr_x_left_raw = table_bbox[0]
                ocr_y_top_raw = table_bbox[1]
                ocr_x_right_raw = table_bbox[2]
                ocr_y_bottom_raw = table_bbox[3]
            elif isinstance(table_bbox[0], (list, tuple)):
                # Polygon format [[x,y], [x,y], [x,y], [x,y]]
                ocr_x_left_raw = table_bbox[0][0]
                ocr_y_top_raw = table_bbox[0][1]
                ocr_x_right_raw = table_bbox[2][0]
                ocr_y_bottom_raw = table_bbox[2][1]
            else:
                logger.error(f"Unexpected bbox format: {table_bbox}")
                return
        else:
            logger.error(f"Invalid table_bbox format: {table_bbox}")
            return

        logger.info(f"[表格] OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")

        # Apply scaling
        ocr_x_left = ocr_x_left_raw * scale_w
        ocr_y_top = ocr_y_top_raw * scale_h
        ocr_x_right = ocr_x_right_raw * scale_w
        ocr_y_bottom = ocr_y_bottom_raw * scale_h

        table_width = abs(ocr_x_right - ocr_x_left)
        table_height = abs(ocr_y_bottom - ocr_y_top)

        # Transform coordinates (top-left origin -> bottom-left origin)
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom

        # Build table data for ReportLab: a rectangular 2D array of strings
        max_cols = max(len(row['cells']) for row in rows)
        if max_cols == 0:
            # Nothing to draw, and the width split below would divide by zero
            logger.warning("Table rows contain no cells; skipping table")
            return

        logger.info(f"[表格] {len(rows)}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")

        reportlab_data = []
        for row in rows:
            row_data = [cell['text'].strip() for cell in row['cells']]
            # Pad short rows so every row has max_cols entries
            row_data.extend([''] * (max_cols - len(row_data)))
            reportlab_data.append(row_data)

        # Calculate column widths (equal distribution)
        col_widths = [table_width / max_cols] * max_cols

        # Font size: half of the average row height, kept within 6-10pt
        font_size = min(table_height / len(rows) * 0.5, 10)
        font_size = max(font_size, 6)

        table = Table(reportlab_data, colWidths=col_widths)

        # Apply table style
        style = TableStyle([
            ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('LEFTPADDING', (0, 0), (-1, -1), 2),
            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ])

        # Add header style if the first cell of the first row is a header
        if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
            style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
            style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)

        table.setStyle(style)

        # Let ReportLab compute the table layout within the bbox
        table.wrapOn(pdf_canvas, table_width, table_height)

        # Draw table with its bottom-left corner at (pdf_x, pdf_y)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")

    except Exception as e:
        # exc_info routes the traceback through the logging configuration
        # instead of printing it straight to stderr
        logger.warning(f"Failed to draw table region: {e}", exc_info=True)
|
||
|
||
def draw_image_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    result_dir: Path,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw an image region by embedding the extracted image.

    Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.)
    The bbox may be a polygon [[x, y], ...] (top-left at index 0,
    bottom-right at index 2) or a flat [x1, y1, x2, y2] list. Failures are
    logged as warnings and never propagate to the caller.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Image metadata dict with 'image_path' and 'bbox'
        page_height: Height of page (for coordinate transformation)
        result_dir: Directory that 'image_path' is resolved against
        scale_w: Scale factor for X coordinates (PDF width / OCR width)
        scale_h: Scale factor for Y coordinates (PDF height / OCR height)
    """
    try:
        image_path_str = region.get('image_path', '')
        if not image_path_str:
            return

        # Construct full path to image
        image_path = result_dir / image_path_str

        if not image_path.exists():
            logger.warning(f"Image not found: {image_path}")
            return

        # Get bbox for positioning
        bbox = region.get('bbox', [])
        if not bbox or len(bbox) < 4:
            # If no bbox, skip for now
            logger.warning(f"No bbox for image {image_path_str}")
            return

        # OCR coordinates: origin (0,0) at top-left, Y increases downward
        if isinstance(bbox[0], (int, float)):
            # Flat [x1, y1, x2, y2] format
            ocr_x_left_raw, ocr_y_top_raw, ocr_x_right_raw, ocr_y_bottom_raw = bbox[:4]
        else:
            # Polygon [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            ocr_x_left_raw = bbox[0][0]
            ocr_y_top_raw = bbox[0][1]
            ocr_x_right_raw = bbox[2][0]
            ocr_y_bottom_raw = bbox[2][1]

        logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}")

        # Apply scaling
        ocr_x_left = ocr_x_left_raw * scale_w
        ocr_y_top = ocr_y_top_raw * scale_h
        ocr_x_right = ocr_x_right_raw * scale_w
        ocr_y_bottom = ocr_y_bottom_raw * scale_h

        # Calculate bbox dimensions (after scaling)
        bbox_width = abs(ocr_x_right - ocr_x_left)
        bbox_height = abs(ocr_y_bottom - ocr_y_top)

        # Transform coordinates: OCR (top-left origin) -> PDF (bottom-left origin)
        # For images, we position at bottom-left corner
        pdf_x_left = ocr_x_left
        pdf_y_bottom = page_height - ocr_y_bottom  # Flip Y-axis

        logger.info(f"[圖片] '{image_path_str}' → PDF位置: ({pdf_x_left:.1f}, {pdf_y_bottom:.1f}), 寬x高: {bbox_width:.0f}x{bbox_height:.0f}")

        # drawImage expects (path, x, y, width, height)
        # where (x, y) is the bottom-left corner of the image
        pdf_canvas.drawImage(
            str(image_path),
            pdf_x_left,
            pdf_y_bottom,
            width=bbox_width,
            height=bbox_height,
            preserveAspectRatio=True,
            mask='auto'  # Handle transparency
        )

        logger.info(f"[圖片] ✓ 成功繪製 '{image_path_str}'")

    except Exception as e:
        logger.warning(f"Failed to draw image region: {e}")
|
||
|
||
def generate_layout_pdf(
    self,
    json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate layout-preserving PDF from OCR JSON data.

    Loads the JSON via load_ocr_json and delegates the rendering to
    _generate_pdf_from_data, passing the JSON file's parent directory
    along with the loaded data.

    Args:
        json_path: Path to OCR JSON file
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file for dimension extraction

    Returns:
        The result of _generate_pdf_from_data; False when the JSON yields
        no data or any exception is raised.
    """
    try:
        # Load JSON data
        ocr_data = self.load_ocr_json(json_path)
        if not ocr_data:
            return False

        # Use internal generation with pre-loaded data
        return self._generate_pdf_from_data(
            ocr_data=ocr_data,
            output_path=output_path,
            source_file_path=source_file_path,
            json_parent_dir=json_path.parent
        )

    except Exception as e:
        # exc_info sends the traceback through the logging configuration
        # instead of printing it straight to stderr
        logger.error(f"Failed to generate PDF: {e}", exc_info=True)
        return False
|
||
|
||
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
    """
    Heuristically decide whether an element is a list item.

    Used for elements that were not tagged ElementType.LIST_ITEM. An
    element qualifies when its metadata carries list-related keys, or when
    its text starts with a numeric ("1." / "1)") or bullet marker.
    Table and image-like elements never qualify.

    Args:
        element: Document element to check

    Returns:
        True if element appears to be a list item
    """
    # Tables and image-like elements are never list items
    excluded_types = (
        ElementType.TABLE,
        ElementType.IMAGE,
        ElementType.FIGURE,
        ElementType.CHART,
        ElementType.DIAGRAM,
    )
    if element.type in excluded_types:
        return False

    # Metadata hints: explicit level, a parent item, or non-empty children
    meta = element.metadata
    if meta:
        if 'list_level' in meta or 'parent_item' in meta:
            return True
        if 'children' in meta and meta['children']:
            return True

    # Text hints: leading ordered or unordered list marker
    if element.is_text:
        leading_text = element.get_text().lstrip()
        marker_patterns = (
            r'^\d+[\.\)]\s',          # ordered: digits followed by . or )
            r'^[•·▪▫◦‣⁃\-\*]\s',      # unordered: bullet character
        )
        for pattern in marker_patterns:
            if re.match(pattern, leading_text):
                return True

    return False
|
||
|
||
def _draw_list_elements_direct(
    self,
    pdf_canvas: canvas.Canvas,
    list_elements: List['DocumentElement'],
    page_height: float
):
    """
    Render list items with sequential numbering and inter-item spacing.

    Items are ordered by position, split into groups (a new group starts
    when the vertical distance to the previous item exceeds 30 or the
    list level changes), and each group's type is taken from the marker
    on its first item. The computed marker and type are stored in each
    item's metadata ('_list_marker', '_list_type') before the item is
    handed to _draw_text_element_direct.

    Args:
        pdf_canvas: ReportLab canvas object
        list_elements: List of LIST_ITEM elements
        page_height: Page height for coordinate transformation
    """
    if not list_elements:
        return

    # Order items top to bottom, then left to right
    ordered_items = sorted(list_elements, key=lambda el: (el.bbox.y0, el.bbox.x0))

    # Partition the ordered items into lists by proximity and level
    groups = []
    bucket = []
    last_y = None
    last_level = None
    max_gap = 30  # Maximum vertical gap between items in same list (in points)

    for entry in ordered_items:
        level = entry.metadata.get('list_level', 0) if entry.metadata else 0
        top = entry.bbox.y0

        if bucket and last_y is not None:
            gap = abs(top - last_y)
            level_changed = last_level is not None and level != last_level
            # A large gap or a level change closes the current group
            if gap > max_gap or level_changed:
                groups.append(bucket)
                bucket = []

        bucket.append(entry)
        last_y = top
        last_level = level

    if bucket:
        groups.append(bucket)

    for group in groups:
        # The first item's marker decides the type of the whole group
        lead_text = group[0].get_text().lstrip()

        list_type = None
        list_counter = 1

        if re.match(r'^\d+[\.\)]\s', lead_text):
            list_type = 'ordered'
            # Numbering continues from the first item's own number
            number = re.match(r'^(\d+)[\.\)]\s', lead_text)
            if number:
                list_counter = int(number.group(1))
        elif re.match(r'^[•·▪▫◦‣⁃]\s', lead_text):
            list_type = 'unordered'

        # Accumulated Y shift used to enforce spacing_after between items
        running_offset = 0
        final_index = len(group) - 1

        for position, entry in enumerate(group):
            has_successor = position < final_index

            # Choose the marker for this item
            if list_type == 'ordered':
                marker = f"{list_counter}. "
                list_counter += 1
            elif list_type == 'unordered':
                marker = "• "
            else:
                marker = ""  # No marker if type unknown

            # Hand marker and type to _draw_text_element_direct via metadata
            if not entry.metadata:
                entry.metadata = {}
            entry.metadata['_list_marker'] = marker
            entry.metadata['_list_type'] = list_type

            # Default to 3 points after every item except the last
            # when no spacing was specified
            spacing_after = entry.metadata.get('spacing_after', 0)
            if spacing_after == 0 and has_successor:
                spacing_after = 3.0
                entry.metadata['spacing_after'] = spacing_after

            # Draw the list item with the accumulated Y offset
            self._draw_text_element_direct(pdf_canvas, entry, page_height, y_offset=running_offset)

            if not (has_successor and spacing_after > 0):
                continue

            following = group[position + 1]

            # Vertical gap between this item and the next, in document
            # coordinates (Y increases downward)
            actual_gap = following.bbox.y0 - entry.bbox.y1

            # Too little room: push the remaining items further down
            if actual_gap < spacing_after:
                additional_spacing = spacing_after - actual_gap
                running_offset -= additional_spacing  # Negative because PDF Y increases upward
                logger.debug(f"Adding {additional_spacing:.1f}pt spacing after list item {entry.element_id} "
                             f"(actual_gap={actual_gap:.1f}pt, desired={spacing_after:.1f}pt)")
|
||
|
||
def _draw_text_with_spans(
    self,
    pdf_canvas: canvas.Canvas,
    spans: List['DocumentElement'],
    line_x: float,
    line_y: float,
    default_font_size: float
) -> float:
    """
    Render a run of spans side by side, each with its own style.

    Spans are drawn left to right on a single baseline; each span starts
    where the previous one ended. Spans with empty text are skipped.

    Args:
        pdf_canvas: ReportLab canvas object
        spans: List of span DocumentElements
        line_x: Starting X position
        line_y: Y position
        default_font_size: Font size used when a span carries no style

    Returns:
        Total width of drawn text
    """
    cursor_x = line_x
    drawn_width = 0

    for piece in spans:
        piece_text = piece.get_text()
        if not piece_text:
            continue

        if not piece.style:
            # No style on this span: fall back to the default font
            fallback_font = self.font_name if self.font_registered else 'Helvetica'
            pdf_canvas.setFont(fallback_font, default_font_size)
        else:
            # Apply span-specific styling
            self._apply_text_style(pdf_canvas, piece.style, default_size=default_font_size)

        # Read back the font now active on the canvas for measuring
        active_font = pdf_canvas._fontname
        active_size = pdf_canvas._fontsize

        pdf_canvas.drawString(cursor_x, line_y, piece_text)

        # Advance the cursor by the width of what was just drawn
        piece_width = pdf_canvas.stringWidth(piece_text, active_font, active_size)
        cursor_x += piece_width
        drawn_width += piece_width

    return drawn_width
|
||
|
||
def _draw_text_element_direct(
    self,
    pdf_canvas: canvas.Canvas,
    element: 'DocumentElement',
    page_height: float,
    y_offset: float = 0
):
    """
    Draw text element with Direct track rich formatting.

    Handles line breaks, alignment, indentation, and applies StyleInfo.
    Supports span-level inline styling if element has children: for
    non-list elements with spans, all spans are drawn on the first line
    and later lines are not drawn separately. List items always use
    single-style rendering. Failures are logged and never propagate.

    Args:
        pdf_canvas: ReportLab canvas object
        element: DocumentElement with text content
        page_height: Page height for coordinate transformation
        y_offset: Optional Y coordinate offset (for list spacing), in PDF coordinates
    """
    try:
        text_content = element.get_text()
        if not text_content:
            return

        # Get bounding box
        bbox = element.bbox
        if not bbox:
            logger.warning(f"No bbox for text element {element.element_id}")
            return

        # Transform coordinates (top-left origin -> bottom-left origin)
        pdf_x = bbox.x0
        pdf_y = page_height - bbox.y1 + y_offset  # Use bottom of bbox + apply offset

        bbox_width = bbox.x1 - bbox.x0
        bbox_height = bbox.y1 - bbox.y0

        # Derive a font size from the bbox height, clamped to 4-72pt
        font_size = bbox_height * 0.75
        font_size = max(min(font_size, 72), 4)

        # Apply style if available
        alignment = 'left'  # Default alignment
        if hasattr(element, 'style') and element.style:
            self._apply_text_style(pdf_canvas, element.style, default_size=font_size)
            # Get alignment from style
            if hasattr(element.style, 'alignment') and element.style.alignment:
                alignment = element.style.alignment
        else:
            # Use default font
            font_name = self.font_name if self.font_registered else 'Helvetica'
            pdf_canvas.setFont(font_name, font_size)

        # Treat missing metadata as empty for the lookups below
        metadata = element.metadata if element.metadata else {}

        # Detect list items and extract list properties
        is_list_item = (element.type == ElementType.LIST_ITEM)
        list_level = metadata.get('list_level', 0)

        # Get pre-computed list marker from metadata (set by _draw_list_elements_direct)
        list_marker = metadata.get('_list_marker', '')
        list_type = metadata.get('_list_type')

        # A pre-computed marker replaces the marker embedded in the text,
        # so strip the original one to avoid drawing it twice
        if is_list_item and list_marker:
            text_stripped = text_content.lstrip()
            # Remove ordered list marker
            text_content = re.sub(r'^\d+[\.\)]\s', '', text_stripped)
            # Remove unordered list marker
            text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_content)

        # Get indentation from metadata (in points)
        indent = metadata.get('indent', 0)
        first_line_indent = metadata.get('first_line_indent', indent)

        # Apply list indentation (20pt per level)
        if is_list_item:
            list_indent = list_level * 20
            indent += list_indent
            first_line_indent += list_indent

        # Get paragraph spacing
        # spacing_before: Applied by adjusting starting Y position (pdf_y)
        # spacing_after: Applied via y_offset in _draw_list_elements_direct for list items
        paragraph_spacing_before = metadata.get('spacing_before', 0)
        paragraph_spacing_after = metadata.get('spacing_after', 0)

        # Check if element has span children for inline styling
        has_spans = element.children and len(element.children) > 0

        # Handle line breaks
        lines = text_content.split('\n')
        line_height = font_size * 1.2  # 120% of font size

        # Calculate list marker width for multi-line alignment
        marker_width = 0
        if is_list_item and list_marker:
            # Use current font to calculate marker width
            marker_width = pdf_canvas.stringWidth(list_marker, pdf_canvas._fontname, font_size)

        # Apply paragraph spacing before (shift starting position up)
        pdf_y += paragraph_spacing_before

        # Draw each line with alignment
        for i, line in enumerate(lines):
            if not line.strip():
                # Blank line: nothing to draw; its vertical slot is kept
                # because line_y is derived from the line index
                continue

            line_y = pdf_y - (i * line_height)

            # Remember the font in effect before any per-line shrinking so
            # it can be restored afterwards
            font_name = pdf_canvas._fontname
            base_font_size = pdf_canvas._fontsize
            current_font_size = base_font_size
            font_scaled = False

            # Calculate line indentation
            line_indent = first_line_indent if i == 0 else indent

            # For list items: align subsequent lines with text after marker
            if is_list_item and i > 0 and marker_width > 0:
                line_indent += marker_width

            # Prepend list marker to first line
            rendered_line = line
            if is_list_item and i == 0 and list_marker:
                rendered_line = list_marker + line

            # Calculate text width
            text_width = pdf_canvas.stringWidth(rendered_line, font_name, current_font_size)
            available_width = bbox_width - line_indent

            # Shrink the font for this line if it does not fit
            if text_width > available_width:
                scale_factor = available_width / text_width
                scaled_size = current_font_size * scale_factor * 0.95
                scaled_size = max(scaled_size, 3)
                pdf_canvas.setFont(font_name, scaled_size)
                text_width = pdf_canvas.stringWidth(rendered_line, font_name, scaled_size)
                current_font_size = scaled_size
                # text_width now reflects the shrunken size, so it cannot be
                # used later to tell whether a reset is needed; track it here
                font_scaled = True

            # Calculate X position based on alignment
            line_x = pdf_x + line_indent

            if alignment == 'center':
                line_x = pdf_x + (bbox_width - text_width) / 2
            elif alignment == 'right':
                line_x = pdf_x + bbox_width - text_width
            elif alignment == 'justify' and i < len(lines) - 1:
                # Justify: distribute extra space between words (except last line)
                words = rendered_line.split()
                if len(words) > 1:
                    total_word_width = sum(pdf_canvas.stringWidth(word, font_name, current_font_size) for word in words)
                    extra_space = available_width - total_word_width
                    word_spacing = extra_space / (len(words) - 1)

                    # Draw words with calculated spacing
                    x_pos = pdf_x + line_indent
                    for word in words:
                        pdf_canvas.drawString(x_pos, line_y, word)
                        word_width = pdf_canvas.stringWidth(word, font_name, current_font_size)
                        x_pos += word_width + word_spacing

                    # Restore the font for the next line and skip normal drawString
                    if font_scaled:
                        pdf_canvas.setFont(font_name, base_font_size)
                    continue
            # else: left alignment uses line_x as-is

            # Draw the line at calculated position
            if has_spans and not is_list_item:
                # Render with inline span styling.
                # Note: all spans are rendered on the first line; multi-line
                # span support would require line breaking logic, so
                # subsequent lines are not drawn again here.
                if i == 0:
                    total_width = self._draw_text_with_spans(
                        pdf_canvas, element.children, line_x, line_y, font_size
                    )
                    logger.debug(f"Drew {len(element.children)} spans, total width={total_width:.1f}pt")
            else:
                # Normal single-style rendering
                pdf_canvas.drawString(line_x, line_y, rendered_line)

            # Restore the pre-shrink font so one long line does not shrink
            # every line that follows it
            if font_scaled:
                pdf_canvas.setFont(font_name, base_font_size)

        # Calculate actual text height used
        actual_text_height = len(lines) * line_height
        bbox_bottom_margin = bbox_height - actual_text_height - paragraph_spacing_before

        # Note: For list items, spacing_after is applied via y_offset in _draw_list_elements_direct
        # For other elements, spacing is inherent in element positioning (bbox-based layout)
        list_info = f", list={list_type}, level={list_level}" if is_list_item else ""
        y_offset_info = f", y_offset={y_offset:.1f}pt" if y_offset != 0 else ""
        span_info = f", spans={len(element.children)}" if has_spans else ""
        logger.debug(f"Drew text element: {text_content[:30]}... "
                     f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}{y_offset_info}{span_info}, "
                     f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, "
                     f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})")

    except Exception as e:
        logger.error(f"Failed to draw text element {element.element_id}: {e}")
|
||
|
||
def _draw_table_element_direct(
    self,
    pdf_canvas: canvas.Canvas,
    element: 'DocumentElement',
    page_height: float
):
    """
    Draw table element with Direct track positioning.

    The table HTML is obtained from ``element.content`` (``TableData.to_html()``,
    the ``'html'`` key of a dict, or ``str()`` of anything else), parsed with
    ``HTMLTableParser``, and the first parsed table is rendered as a ReportLab
    ``Table`` whose columns share the bbox width equally. The table is drawn
    with its lower-left corner at the bottom-left of the element bbox.

    Any exception raised while drawing is logged and swallowed so that one
    bad table does not abort the rest of the page.

    Args:
        pdf_canvas: ReportLab canvas object
        element: DocumentElement with table content
        page_height: Page height for coordinate transformation
    """
    try:
        # Get table HTML content
        if isinstance(element.content, TableData):
            html_content = element.content.to_html()
        elif isinstance(element.content, dict):
            html_content = element.content.get('html', str(element.content))
        else:
            html_content = str(element.content)

        if not html_content:
            logger.warning(f"No HTML content for table {element.element_id}")
            return

        # Parse HTML
        parser = HTMLTableParser()
        parser.feed(html_content)

        if not parser.tables or not parser.tables[0]['rows']:
            logger.warning(f"No table data parsed for {element.element_id}")
            return

        table_data = parser.tables[0]
        rows = table_data['rows']

        # Get bbox
        bbox = element.bbox
        if not bbox:
            logger.warning(f"No bbox for table {element.element_id}")
            return

        # Transform coordinates (top-left origin -> PDF bottom-left origin)
        pdf_x = bbox.x0
        pdf_y = page_height - bbox.y1  # Bottom of table

        table_width = bbox.x1 - bbox.x0
        table_height = bbox.y1 - bbox.y0

        # Build table data for ReportLab
        table_content = [
            [cell['text'].strip() for cell in row['cells']]
            for row in rows
        ]

        # Size the grid from the widest row, not the first one: the first
        # row may be shorter than later rows (or empty), which previously
        # produced a too-short colWidths list or a division by zero.
        num_cols = max(len(row_data) for row_data in table_content)
        if num_cols == 0:
            logger.warning(f"Table {element.element_id} has no cells")
            return

        # Pad ragged rows so every row has exactly num_cols cells and the
        # result does not depend on ReportLab's short-row tolerance setting.
        for row_data in table_content:
            row_data.extend([''] * (num_cols - len(row_data)))

        # Create table
        from reportlab.platypus import Table, TableStyle
        from reportlab.lib import colors

        t = Table(table_content, colWidths=[table_width / num_cols] * num_cols)

        # Apply style
        style = TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ('FONTSIZE', (0, 0), (-1, -1), 8),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ])
        t.setStyle(style)

        # Draw table
        t.wrapOn(pdf_canvas, table_width, table_height)
        t.drawOn(pdf_canvas, pdf_x, pdf_y)

        logger.debug(f"Drew table element: {len(rows)} rows")

    except Exception as e:
        logger.error(f"Failed to draw table element {element.element_id}: {e}")
|
||
|
||
def _draw_image_element_direct(
    self,
    pdf_canvas: canvas.Canvas,
    element: 'DocumentElement',
    page_height: float,
    result_dir: Path
):
    """
    Place an image element on the canvas using its Direct track bbox.

    The image file is looked up via ``self._get_image_path(element)`` and
    resolved against ``result_dir``. The element is skipped with a warning
    when it has no image path, the file does not exist, or it has no bbox.
    Any exception raised along the way is logged and swallowed.

    Args:
        pdf_canvas: ReportLab canvas object
        element: DocumentElement with image content
        page_height: Page height for coordinate transformation
        result_dir: Directory containing image files
    """
    try:
        # Locate the image file for this element
        relative_path = self._get_image_path(element)
        if not relative_path:
            logger.warning(f"No image path for element {element.element_id}")
            return

        resolved_path = result_dir / relative_path
        if not resolved_path.exists():
            logger.warning(f"Image not found: {resolved_path}")
            return

        # The bbox gives both position and target size
        box = element.bbox
        if not box:
            logger.warning(f"No bbox for image {element.element_id}")
            return

        # drawImage is anchored at the lower-left corner, so the bbox's
        # bottom edge (y1) is flipped against the page height.
        left, bottom = box.x0, page_height - box.y1
        target_width, target_height = box.x1 - box.x0, box.y1 - box.y0

        pdf_canvas.drawImage(
            str(resolved_path),
            left,
            bottom,
            width=target_width,
            height=target_height,
            preserveAspectRatio=True,
        )

        logger.debug(f"Drew image: {relative_path}")

    except Exception as e:
        logger.error(f"Failed to draw image element {element.element_id}: {e}")
|
||
|
||
|
||
# Singleton instance
|
||
# Built with no constructor arguments when this statement runs; presumably
# importers are meant to reuse this one object rather than create their own
# PDFGeneratorService - confirm against call sites.
pdf_generator_service = PDFGeneratorService()
|