fix: complete layout analysis features for DirectExtractionEngine
Implements missing layout analysis capabilities:
- Add footer detection based on page position (bottom 10%)
- Build hierarchical section structure from font sizes
- Create nested list structure from indentation levels

All elements now have proper metadata for:
- section_level, parent_section, child_sections (headers)
- list_level, parent_item, children (list items)
- is_page_header, is_page_footer flags

Updates tasks.md to reflect accurate completion status.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -254,6 +254,11 @@ class DirectExtractionEngine:
|
||||
if drawings:
|
||||
logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
|
||||
|
||||
# Post-process elements for header/footer detection and structure
|
||||
elements = self._detect_headers_footers(elements, dimensions)
|
||||
elements = self._build_section_hierarchy(elements)
|
||||
elements = self._build_nested_lists(elements)
|
||||
|
||||
return Page(
|
||||
page_number=page_num,
|
||||
elements=elements,
|
||||
@@ -265,6 +270,134 @@ class DirectExtractionEngine:
|
||||
}
|
||||
)
|
||||
|
||||
def _detect_headers_footers(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
    """Tag short text elements lying in the top or bottom band of the page.

    A text element whose ``bbox.y1`` is at most 10% of the page height is
    re-typed as ``ElementType.HEADER`` and gets ``metadata['is_page_header']``;
    one whose ``bbox.y0`` is at least 90% of the page height is re-typed as
    ``ElementType.FOOTER`` and gets ``metadata['is_page_footer']``.  In both
    cases the element is only touched when its content is a string shorter
    than 200 characters.

    NOTE(review): treating small y as the *top* of the page assumes a
    top-left origin with y growing downward -- confirm against the bbox
    producer.

    Args:
        elements: Elements of a single page; modified in place.
        dimensions: Page dimensions; only ``height`` is read.

    Returns:
        The same ``elements`` list that was passed in.
    """
    top_band_end = dimensions.height * 0.1
    bottom_band_start = dimensions.height * 0.9

    for element in elements:
        # Only text elements are candidates.
        if not element.is_text:
            continue

        # Decide which band (if any) the element falls in.  The header test
        # takes precedence over the footer test.
        if element.bbox.y1 <= top_band_end:
            band_type, band_flag = ElementType.HEADER, 'is_page_header'
        elif element.bbox.y0 >= bottom_band_start:
            band_type, band_flag = ElementType.FOOTER, 'is_page_footer'
        else:
            continue

        # Long or non-string content is left untouched even inside a band.
        text = element.content
        if isinstance(text, str) and len(text) < 200:
            element.type = band_type
            element.metadata[band_flag] = True

    return elements
|
||||
|
||||
def _build_section_hierarchy(self, elements: List[DocumentElement]) -> List[DocumentElement]:
    """Derive section levels and parent/child links from heading font sizes.

    Elements typed ``TITLE`` or ``HEADER`` are treated as headings.  Each
    distinct heading font size is ranked, largest first, and that rank
    (starting at 1) is stored as ``metadata['section_level']`` alongside
    ``metadata['font_size']``.  Headings without a usable
    ``style.font_size`` are assumed to be 12.0.

    Walking the headings in list order, each one is linked to the closest
    earlier heading with a strictly smaller level number via
    ``metadata['parent_section']`` / ``metadata['child_sections']``.
    Finally every non-heading element (except ``FOOTER``) that follows a
    heading receives ``metadata['section_id']`` of the most recent heading.

    Args:
        elements: Page elements in reading order; modified in place.

    Returns:
        The same ``elements`` list that was passed in.
    """
    heading_types = (ElementType.TITLE, ElementType.HEADER)

    # Pair every heading with the font size used for ranking.
    sized_headings = []
    for element in elements:
        if element.type not in heading_types:
            continue
        if element.style and element.style.font_size:
            size = element.style.font_size
        else:
            size = 12.0  # Fallback when no size is recorded
        sized_headings.append((element, size))

    if not sized_headings:
        return elements

    # Largest font -> level 1, next largest -> level 2, and so on.
    distinct_sizes = sorted({size for _, size in sized_headings}, reverse=True)
    level_by_size = {size: rank for rank, size in enumerate(distinct_sizes, start=1)}

    for heading, size in sized_headings:
        heading.metadata['section_level'] = level_by_size[size]
        heading.metadata['font_size'] = size

    # Stack of (heading, level) for the sections that are still "open".
    open_sections = []
    for heading, _ in sized_headings:
        depth = heading.metadata['section_level']

        # Close every open section at the same or a deeper level
        # (i.e. same or smaller font); what remains on top is the parent.
        while open_sections and open_sections[-1][1] >= depth:
            open_sections.pop()

        if open_sections:
            enclosing = open_sections[-1][0]
            heading.metadata['parent_section'] = enclosing.element_id
            enclosing.metadata.setdefault('child_sections', []).append(heading.element_id)

        open_sections.append((heading, depth))

    # Attach body content to the most recent heading seen before it.
    active_heading = None
    for element in elements:
        if element.type in heading_types:
            active_heading = element
            continue
        if active_heading and element.type not in (ElementType.HEADER, ElementType.FOOTER):
            element.metadata['section_id'] = active_heading.element_id

    return elements
|
||||
|
||||
def _build_nested_lists(self, elements: List[DocumentElement], *, indent_unit: float = 20.0) -> List[DocumentElement]:
    """Infer list nesting from horizontal indentation of list items.

    All ``LIST_ITEM`` elements are ordered by ``(bbox.y0, bbox.x0)``.  The
    smallest ``bbox.x0`` among them is taken as the left margin, and each
    item's ``metadata['list_level']`` is its offset from that margin divided
    by ``indent_unit``, truncated to an integer (0 = outermost).

    Walking the ordered items, each one is linked to the closest preceding
    item with a strictly smaller level: the child gets
    ``metadata['parent_item']``, the parent gets the child's id appended to
    ``metadata['children']`` and the child element itself appended to its
    ``children`` list.

    Args:
        elements: Page elements; list items are modified in place.
        indent_unit: Horizontal distance that counts as one nesting step,
            in the same units as ``bbox.x0`` (the original code describes
            the default of 20 as a typical indent in points).

    Returns:
        The same ``elements`` list that was passed in (its order is not
        changed; only the internal working copy of list items is sorted).

    Raises:
        ValueError: If ``indent_unit`` is not positive.
    """
    if indent_unit <= 0:
        raise ValueError("indent_unit must be positive")

    # Work on a sorted copy so the caller's element order is untouched.
    list_items = sorted(
        (e for e in elements if e.type == ElementType.LIST_ITEM),
        key=lambda e: (e.bbox.y0, e.bbox.x0),
    )
    if not list_items:
        return elements

    # Indentation is measured relative to the left-most list item.
    left_margin = min(item.bbox.x0 for item in list_items)
    for item in list_items:
        item.metadata['list_level'] = int((item.bbox.x0 - left_margin) / indent_unit)

    # Stack of (item, level) holding the current chain of ancestors.
    ancestors = []
    for item in list_items:
        level = item.metadata['list_level']

        # Drop siblings and deeper items; the remaining top is the parent.
        while ancestors and ancestors[-1][1] >= level:
            ancestors.pop()

        if ancestors:
            parent = ancestors[-1][0]
            item.metadata['parent_item'] = parent.element_id
            parent.metadata.setdefault('children', []).append(item.element_id)
            # Mirror the relationship on the element tree itself.
            parent.children.append(item)

        ancestors.append((item, level))

    return elements
|
||||
|
||||
def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
|
||||
"""Process a text block into a DocumentElement"""
|
||||
# Calculate block bounding box
|
||||
|
||||
Reference in New Issue
Block a user