fix: complete layout analysis features for DirectExtractionEngine

Implements missing layout analysis capabilities:
- Add footer detection based on page position (bottom 10%)
- Build hierarchical section structure from font sizes
- Create nested list structure from indentation levels

All elements now have proper metadata for:
- section_level, parent_section, child_sections (headers)
- list_level, parent_item, children (list items)
- is_page_header, is_page_footer flags

Updates tasks.md to reflect accurate completion status.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 08:15:11 +08:00
parent a3a6fbe58b
commit 5bcf3dfd42
2 changed files with 134 additions and 0 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -254,6 +254,11 @@ class DirectExtractionEngine:
        if drawings:
            logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
        # Post-process elements for header/footer detection and structure
        elements = self._detect_headers_footers(elements, dimensions)
        elements = self._build_section_hierarchy(elements)
        elements = self._build_nested_lists(elements)
        return Page(
            page_number=page_num,
            elements=elements,
@@ -265,6 +270,134 @@ class DirectExtractionEngine:
            }
        )
    def _detect_headers_footers(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
        """Detect and mark header/footer elements based on page position"""
        page_height = dimensions.height
        header_threshold = page_height * 0.1  # Top 10% of page
        footer_threshold = page_height * 0.9  # Bottom 10% of page
        for elem in elements:
            # Skip non-text elements
            if not elem.is_text:
                continue
            # Check if element is in header region
            if elem.bbox.y1 <= header_threshold:
                # Only mark as header if it's short text
                if isinstance(elem.content, str) and len(elem.content) < 200:
                    elem.type = ElementType.HEADER
                    elem.metadata['is_page_header'] = True
            # Check if element is in footer region
            elif elem.bbox.y0 >= footer_threshold:
                # Short text in footer region
                if isinstance(elem.content, str) and len(elem.content) < 200:
                    elem.type = ElementType.FOOTER
                    elem.metadata['is_page_footer'] = True
        return elements
    def _build_section_hierarchy(self, elements: List[DocumentElement]) -> List[DocumentElement]:
        """Build hierarchical section structure based on font sizes"""
        # Collect all headers with their font sizes
        headers = []
        for elem in elements:
            if elem.type in [ElementType.TITLE, ElementType.HEADER]:
                # Get average font size from style
                font_size = 12.0  # Default
                if elem.style and elem.style.font_size:
                    font_size = elem.style.font_size
                headers.append((elem, font_size))
        if not headers:
            return elements
        # Sort headers by font size to determine hierarchy levels
        font_sizes = sorted(set(size for _, size in headers), reverse=True)
        size_to_level = {size: level for level, size in enumerate(font_sizes, 1)}
        # Assign section levels to headers
        for elem, font_size in headers:
            level = size_to_level.get(font_size, 1)
            elem.metadata['section_level'] = level
            elem.metadata['font_size'] = font_size
        # Build parent-child relationships between headers
        header_stack = []  # Stack of (element, level)
        for elem, font_size in headers:
            level = elem.metadata['section_level']
            # Pop headers that are at same or lower level (larger font)
            while header_stack and header_stack[-1][1] >= level:
                header_stack.pop()
            # Set parent header
            if header_stack:
                parent = header_stack[-1][0]
                elem.metadata['parent_section'] = parent.element_id
                if 'child_sections' not in parent.metadata:
                    parent.metadata['child_sections'] = []
                parent.metadata['child_sections'].append(elem.element_id)
            header_stack.append((elem, level))
        # Link content to nearest preceding header at same or higher level
        current_header = None
        for elem in elements:
            if elem.type in [ElementType.TITLE, ElementType.HEADER]:
                current_header = elem
            elif current_header and elem.type not in [ElementType.HEADER, ElementType.FOOTER]:
                elem.metadata['section_id'] = current_header.element_id
        return elements
    def _build_nested_lists(self, elements: List[DocumentElement]) -> List[DocumentElement]:
        """Build nested list structure from flat list items"""
        # Group list items
        list_items = [e for e in elements if e.type == ElementType.LIST_ITEM]
        if not list_items:
            return elements
        # Sort by position (top to bottom)
        list_items.sort(key=lambda e: (e.bbox.y0, e.bbox.x0))
        # Detect indentation levels based on x position
        x_positions = [item.bbox.x0 for item in list_items]
        if not x_positions:
            return elements
        min_x = min(x_positions)
        indent_unit = 20  # Typical indent size in points
        # Assign nesting levels
        for item in list_items:
            indent = item.bbox.x0 - min_x
            level = int(indent / indent_unit)
            item.metadata['list_level'] = level
        # Build parent-child relationships
        item_stack = []  # Stack of (element, level)
        for item in list_items:
            level = item.metadata.get('list_level', 0)
            # Pop items at same or deeper level
            while item_stack and item_stack[-1][1] >= level:
                item_stack.pop()
            # Set parent
            if item_stack:
                parent = item_stack[-1][0]
                item.metadata['parent_item'] = parent.element_id
                if 'children' not in parent.metadata:
                    parent.metadata['children'] = []
                parent.metadata['children'].append(item.element_id)
                # Also add to actual children list
                parent.children.append(item)
            item_stack.append((item, level))
        return elements
    def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
        """Process a text block into a DocumentElement"""
        # Calculate block bounding box
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -11,6 +11,7 @@
  - [x] 1.2.2 Add DocumentElement model
  - [x] 1.2.3 Add DocumentMetadata model
  - [x] 1.2.4 Create converters for both OCR and direct extraction outputs
    - Note: OCR converter complete; DirectExtractionEngine returns UnifiedDocument directly
 - [x] 1.3 Create DocumentTypeDetector service
  - [x] 1.3.1 Implement file type detection using python-magic
  - [x] 1.3.2 Add PDF editability checking logic