From 5bcf3dfd42a4ee1743b29cef2db1770db9289a88 Mon Sep 17 00:00:00 2001
From: egg
Date: Wed, 19 Nov 2025 08:15:11 +0800
Subject: [PATCH] fix: complete layout analysis features for DirectExtractionEngine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements missing layout analysis capabilities:
- Add footer detection based on page position (bottom 10%)
- Build hierarchical section structure from font sizes
- Create nested list structure from indentation levels

All elements now have proper metadata for:
- section_level, parent_section, child_sections (headers)
- list_level, parent_item, children (list items)
- is_page_header, is_page_footer flags

Updates tasks.md to reflect accurate completion status.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 .../app/services/direct_extraction_engine.py  | 172 ++++++++++++++++++
 .../dual-track-document-processing/tasks.md   |   1 +
 2 files changed, 173 insertions(+)

diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py
index 36eebbb..f2a2f01 100644
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -254,6 +254,11 @@ class DirectExtractionEngine:
         if drawings:
             logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
 
+        # Post-process elements for header/footer detection and structure
+        elements = self._detect_headers_footers(elements, dimensions)
+        elements = self._build_section_hierarchy(elements)
+        elements = self._build_nested_lists(elements)
+
         return Page(
             page_number=page_num,
             elements=elements,
@@ -265,6 +270,173 @@ class DirectExtractionEngine:
             }
         )
 
+    def _detect_headers_footers(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
+        """Mark short text elements in the page margins as header/footer.
+
+        A text element whose ``bbox.y1`` is within the top 10% of the page
+        height is retyped to ``ElementType.HEADER`` and flagged
+        ``is_page_header``; one whose ``bbox.y0`` is within the bottom 10% is
+        retyped to ``ElementType.FOOTER`` and flagged ``is_page_footer``. Only
+        string content shorter than 200 characters qualifies. The thresholds
+        assume y grows downward from the top of the page.
+
+        Args:
+            elements: Elements of one page; matches are modified in place.
+            dimensions: Page dimensions; only ``height`` is read.
+
+        Returns:
+            The same ``elements`` list.
+        """
+        page_height = dimensions.height
+        # With a zero height both thresholds are 0 and every short text element
+        # at y0 >= 0 would be marked as a footer; a missing height would raise.
+        # Leave such pages untouched.
+        if not page_height or page_height <= 0:
+            return elements
+
+        header_threshold = page_height * 0.1  # Top 10% of page
+        footer_threshold = page_height * 0.9  # Bottom 10% of page
+
+        for elem in elements:
+            # Skip non-text elements and content too long for a running header/footer
+            if not elem.is_text:
+                continue
+            if not isinstance(elem.content, str) or len(elem.content) >= 200:
+                continue
+
+            if elem.bbox.y1 <= header_threshold:
+                elem.type = ElementType.HEADER
+                elem.metadata['is_page_header'] = True
+            elif elem.bbox.y0 >= footer_threshold:
+                elem.type = ElementType.FOOTER
+                elem.metadata['is_page_footer'] = True
+
+        return elements
+
+    def _build_section_hierarchy(self, elements: List[DocumentElement]) -> List[DocumentElement]:
+        """Derive a section hierarchy from heading font sizes.
+
+        Every ``TITLE``/``HEADER`` element is ranked by font size (largest size
+        is level 1) and linked to its enclosing heading. ``elements`` is
+        presumably in reading order -- the linking relies on it.
+
+        Metadata written:
+            - headings: ``section_level``, ``font_size``, ``parent_section`` and,
+              on parents, ``child_sections`` (a list of element ids)
+            - other elements except footers: ``section_id`` of the nearest
+              preceding heading
+
+        NOTE(review): elements retyped to ``HEADER`` by page-header detection are
+        ranked here as headings too -- confirm that is intended.
+
+        Args:
+            elements: Elements of one page; metadata is updated in place.
+
+        Returns:
+            The same ``elements`` list.
+        """
+        heading_types = (ElementType.TITLE, ElementType.HEADER)
+
+        # Collect headings with their font size (12.0 when no style font size is set)
+        headers = []
+        for elem in elements:
+            if elem.type in heading_types:
+                font_size = 12.0  # Default
+                if elem.style and elem.style.font_size:
+                    font_size = elem.style.font_size
+                headers.append((elem, font_size))
+
+        if not headers:
+            return elements
+
+        # Rank distinct sizes, largest first. Sizes are compared after rounding to
+        # 0.1 so float noise between headings of one style does not create
+        # spurious extra levels.
+        ranked_sizes = sorted({round(size, 1) for _, size in headers}, reverse=True)
+        size_to_level = {size: level for level, size in enumerate(ranked_sizes, 1)}
+
+        # Assign section levels to headers
+        for elem, font_size in headers:
+            elem.metadata['section_level'] = size_to_level[round(font_size, 1)]
+            elem.metadata['font_size'] = font_size
+
+        # Build parent-child relationships between headers
+        header_stack = []  # Open headings as (element, level), outermost first
+        for elem, _ in headers:
+            level = elem.metadata['section_level']
+
+            # Close headings at the same or a deeper level (same or smaller font);
+            # whatever remains on top of the stack is the enclosing heading.
+            while header_stack and header_stack[-1][1] >= level:
+                header_stack.pop()
+
+            if header_stack:
+                parent = header_stack[-1][0]
+                elem.metadata['parent_section'] = parent.element_id
+                parent.metadata.setdefault('child_sections', []).append(elem.element_id)
+
+            header_stack.append((elem, level))
+
+        # Link every other element (footers excluded) to the nearest preceding
+        # heading, whatever its level
+        current_header = None
+        for elem in elements:
+            if elem.type in heading_types:
+                current_header = elem
+            elif current_header and elem.type != ElementType.FOOTER:
+                elem.metadata['section_id'] = current_header.element_id
+
+        return elements
+
+    def _build_nested_lists(self, elements: List[DocumentElement]) -> List[DocumentElement]:
+        """Infer list nesting from the indentation of flat list items.
+
+        ``LIST_ITEM`` elements are visited in ascending ``(y0, x0)`` order. An
+        item's ``list_level`` is its ``bbox.x0`` offset from the left-most item,
+        in whole 20-unit steps (0 = outermost); its parent is the closest
+        earlier item with a smaller level.
+
+        Metadata written: ``list_level`` on every item, ``parent_item`` on nested
+        items, ``children`` (element ids) on parents. Nested items are also
+        appended to the parent's ``children`` attribute.
+
+        Args:
+            elements: Elements of one page; list items are updated in place.
+
+        Returns:
+            The same ``elements`` list, order unchanged.
+        """
+        # Work on a sorted copy so the order of ``elements`` itself is untouched
+        list_items = sorted(
+            (e for e in elements if e.type == ElementType.LIST_ITEM),
+            key=lambda e: (e.bbox.y0, e.bbox.x0),
+        )
+        if not list_items:
+            return elements
+
+        min_x = min(item.bbox.x0 for item in list_items)
+        indent_unit = 20  # Typical indent size in points
+
+        item_stack = []  # Open ancestors as (element, level), outermost first
+        for item in list_items:
+            level = int((item.bbox.x0 - min_x) / indent_unit)
+            item.metadata['list_level'] = level
+
+            # Pop items at same or deeper level
+            while item_stack and item_stack[-1][1] >= level:
+                item_stack.pop()
+
+            if item_stack:
+                parent = item_stack[-1][0]
+                item.metadata['parent_item'] = parent.element_id
+                parent.metadata.setdefault('children', []).append(item.element_id)
+                # Also add to actual children list
+                parent.children.append(item)
+
+            item_stack.append((item, level))
+
+        return elements
+
     def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
         """Process a text block into a DocumentElement"""
         # Calculate block bounding box
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index 38eac9b..85fbfa7 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -11,6 +11,7 @@
   - [x] 1.2.2 Add DocumentElement model
   - [x] 1.2.3 Add DocumentMetadata model
   - [x] 1.2.4 Create converters for both OCR and direct extraction outputs
+    - Note: OCR converter complete; DirectExtractionEngine returns UnifiedDocument directly
 - [x] 1.3 Create DocumentTypeDetector service
   - [x] 1.3.1 Implement file type detection using python-magic
   - [x] 1.3.2 Add PDF editability checking logic