feat: add multi-column layout support for PDF extraction and generation
- Enable PyMuPDF sort=True for correct reading order in multi-column PDFs - Add column detection utilities (_sort_elements_for_reading_order, _detect_columns) - Preserve extraction order in PDF generation instead of re-sorting by Y position - Fix StyleInfo field names (font_name, font_size, text_color instead of font, size, color) - Fix Page.dimensions access (was incorrectly accessing Page.width directly) - Implement row-by-row reading order (top-to-bottom, left-to-right within each row) This fixes the issue where multi-column PDFs (e.g., technical data sheets) had incorrect element ordering, with title appearing at position 12 instead of first. PyMuPDF's built-in sort=True parameter provides optimal reading order for most multi-column layouts without requiring custom column detection. Resolves: Multi-column layout reading order issue reported by user Affects: Direct track PDF extraction and generation (Task 8) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -189,8 +189,8 @@ class DirectExtractionEngine:
|
||||
dpi=72 # PDF standard DPI
|
||||
)
|
||||
|
||||
# Extract text blocks with formatting
|
||||
text_dict = page.get_text("dict")
|
||||
# Extract text blocks with formatting (sort=True for reading order)
|
||||
text_dict = page.get_text("dict", sort=True)
|
||||
for block_idx, block in enumerate(text_dict.get("blocks", [])):
|
||||
if block.get("type") == 0: # Text block
|
||||
element = self._process_text_block(
|
||||
@@ -254,6 +254,11 @@ class DirectExtractionEngine:
|
||||
if drawings:
|
||||
logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
|
||||
|
||||
# PyMuPDF's sort=True already provides good reading order for multi-column layouts
|
||||
# (top-to-bottom, left-to-right within each row). We don't need to re-sort.
|
||||
# NOTE: If sort=True is not used in get_text(), uncomment the line below:
|
||||
# elements = self._sort_elements_for_reading_order(elements, dimensions)
|
||||
|
||||
# Post-process elements for header/footer detection and structure
|
||||
elements = self._detect_headers_footers(elements, dimensions)
|
||||
elements = self._build_section_hierarchy(elements)
|
||||
@@ -270,6 +275,113 @@ class DirectExtractionEngine:
|
||||
}
|
||||
)
|
||||
|
||||
def _sort_elements_for_reading_order(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
    """
    Sort elements into reading order, handling multi-column layouts.

    Behaviour depends on the detected layout:

    * Fewer than three text elements with a bounding box, or at most one
      detected column: elements are sorted top-to-bottom by ``bbox.y0``,
      with ``bbox.x0`` as the tie-breaker.
    * Two or more detected columns: newspaper-style order. Each element is
      assigned to the column whose x-position is nearest to its
      ``bbox.x0``; the result lists the whole first column top-to-bottom,
      then the next column, and so on.

    Elements without a bounding box are keyed as ``(0, 0)``, so they sort
    to the front (top of the first column in the multi-column case).
    Sorting is stable: elements with equal keys keep their input order.

    Args:
        elements: List of document elements
        dimensions: Page dimensions (only ``width`` is read here)

    Returns:
        List of elements in reading order. An empty input is returned
        unchanged; otherwise a new list is returned.
    """
    if not elements:
        return elements

    def top_left_key(elem):
        # Plain top-to-bottom, then left-to-right ordering.
        if elem.bbox:
            return (elem.bbox.y0, elem.bbox.x0)
        return (0, 0)

    # Only text elements that carry a bbox take part in layout detection.
    text_elements = [e for e in elements if e.bbox and e.is_text]
    if len(text_elements) < 3:
        # Too few elements to determine layout, just sort by position
        return sorted(elements, key=top_left_key)

    # Cluster left edges to find column start positions
    columns = self._detect_columns([e.bbox.x0 for e in text_elements], dimensions.width)

    if len(columns) <= 1:
        # Single column layout - simple top-to-bottom sort
        logger.debug("Detected single-column layout")
        return sorted(elements, key=top_left_key)

    logger.debug(f"Detected {len(columns)}-column layout at x positions: {[f'{x:.1f}' for x in columns]}")

    def column_then_top_key(elem):
        # Key is (index of nearest column, y0) so a whole column is emitted
        # before the next one starts.
        if not elem.bbox:
            return (0, 0)
        left = elem.bbox.x0
        # min() keeps the first (leftmost) column when distances tie.
        nearest = min(range(len(columns)), key=lambda i: abs(left - columns[i]))
        return (nearest, elem.bbox.y0)

    ordered = sorted(elements, key=column_then_top_key)

    logger.debug("Using newspaper-style column reading order (column by column, top to bottom)")
    return ordered
|
||||
|
||||
def _detect_columns(self, x_positions: List[float], page_width: float) -> List[float]:
|
||||
"""
|
||||
Detect column positions from x-coordinates of text elements.
|
||||
|
||||
Args:
|
||||
x_positions: List of x-coordinates (left edges of text)
|
||||
page_width: Page width in points
|
||||
|
||||
Returns:
|
||||
List of column x-positions (sorted left to right)
|
||||
"""
|
||||
if not x_positions:
|
||||
return []
|
||||
|
||||
# Cluster x-positions to find column starts
|
||||
# Use k-means-like approach: find groups of x-positions
|
||||
threshold = page_width * 0.15 # 15% of page width as clustering threshold
|
||||
|
||||
sorted_x = sorted(set(x_positions))
|
||||
if not sorted_x:
|
||||
return []
|
||||
|
||||
clusters = [[sorted_x[0]]]
|
||||
|
||||
for x in sorted_x[1:]:
|
||||
# Check if x belongs to current cluster
|
||||
cluster_center = sum(clusters[-1]) / len(clusters[-1])
|
||||
if abs(x - cluster_center) < threshold:
|
||||
clusters[-1].append(x)
|
||||
else:
|
||||
# Start new cluster
|
||||
clusters.append([x])
|
||||
|
||||
# Return average x position of each cluster (column start)
|
||||
column_positions = [sum(cluster) / len(cluster) for cluster in clusters]
|
||||
|
||||
# Filter out columns that are too close to each other
|
||||
min_column_width = page_width * 0.2 # Columns must be at least 20% of page width apart
|
||||
filtered_columns = [column_positions[0]]
|
||||
for col_x in column_positions[1:]:
|
||||
if col_x - filtered_columns[-1] >= min_column_width:
|
||||
filtered_columns.append(col_x)
|
||||
|
||||
return filtered_columns
|
||||
|
||||
def _detect_headers_footers(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
|
||||
"""Detect and mark header/footer elements based on page position"""
|
||||
page_height = dimensions.height
|
||||
|
||||
Reference in New Issue
Block a user