""" Direct Extraction Engine using PyMuPDF Handles direct text and structure extraction from editable PDFs without OCR. This provides much faster processing and perfect accuracy for documents with extractable text. """ import os import logging import fitz # PyMuPDF import uuid from pathlib import Path from typing import Dict, List, Optional, Tuple, Any, Union from datetime import datetime import re from ..models.unified_document import ( UnifiedDocument, DocumentElement, Page, DocumentMetadata, BoundingBox, StyleInfo, TableData, TableCell, Dimensions, ElementType, ProcessingTrack ) logger = logging.getLogger(__name__) class DirectExtractionEngine: """ Engine for direct text extraction from editable PDFs using PyMuPDF. This engine provides: - Fast text extraction with exact positioning - Font and style information preservation - Table structure detection - Image extraction with coordinates - Hyperlink and annotation extraction """ def __init__(self, enable_table_detection: bool = True, enable_image_extraction: bool = True, min_table_rows: int = 2, min_table_cols: int = 2): """ Initialize the extraction engine. Args: enable_table_detection: Whether to detect and extract tables enable_image_extraction: Whether to extract images min_table_rows: Minimum rows for table detection min_table_cols: Minimum columns for table detection """ self.enable_table_detection = enable_table_detection self.enable_image_extraction = enable_image_extraction self.min_table_rows = min_table_rows self.min_table_cols = min_table_cols def extract(self, file_path: Path, output_dir: Optional[Path] = None) -> UnifiedDocument: """ Extract content from PDF file to UnifiedDocument format. Args: file_path: Path to PDF file output_dir: Optional directory to save extracted images Returns: UnifiedDocument with extracted content """ start_time = datetime.now() document_id = str(uuid.uuid4()) try: doc = fitz.open(str(file_path)) # Extract document metadata metadata = self._extract_metadata(file_path, doc, start_time) # Extract pages pages = [] for page_num in range(len(doc)): logger.info(f"Extracting page {page_num + 1}/{len(doc)}") page = self._extract_page( doc[page_num], page_num + 1, document_id, output_dir ) pages.append(page) doc.close() # Calculate processing time processing_time = (datetime.now() - start_time).total_seconds() metadata.processing_time = processing_time logger.info(f"Direct extraction completed in {processing_time:.2f}s") return UnifiedDocument( document_id=document_id, metadata=metadata, pages=pages ) except Exception as e: logger.error(f"Error during direct extraction: {e}") # Return partial result with error information processing_time = (datetime.now() - start_time).total_seconds() if 'metadata' not in locals(): metadata = DocumentMetadata( filename=file_path.name, file_type="pdf", file_size=file_path.stat().st_size if file_path.exists() else 0, created_at=datetime.now(), processing_track=ProcessingTrack.DIRECT, processing_time=processing_time ) return UnifiedDocument( document_id=document_id, metadata=metadata, pages=pages if 'pages' in locals() else [], processing_errors=[{ "error": str(e), "type": type(e).__name__ }] ) def _extract_metadata(self, file_path: Path, doc: fitz.Document, start_time: datetime) -> DocumentMetadata: """Extract document metadata""" pdf_metadata = doc.metadata return DocumentMetadata( filename=file_path.name, file_type="pdf", file_size=file_path.stat().st_size, created_at=start_time, processing_track=ProcessingTrack.DIRECT, processing_time=0.0, # Will be updated later 

    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
        """Parse a PDF date string into a datetime"""
        if not date_str:
            return None

        try:
            # PDF date format: D:YYYYMMDDHHmmSSOHH'mm'
            # Example: D:20240101120000+09'00
            if date_str.startswith("D:"):
                date_str = date_str[2:]

            # Extract just the date/time part (first 14 characters)
            if len(date_str) >= 14:
                date_part = date_str[:14]
                return datetime.strptime(date_part, "%Y%m%d%H%M%S")
        except ValueError:
            pass

        return None

    def _extract_page(self, page: fitz.Page, page_num: int,
                      document_id: str, output_dir: Optional[Path]) -> Page:
        """Extract content from a single page"""
        elements = []
        element_counter = 0

        # Get page dimensions
        rect = page.rect
        dimensions = Dimensions(
            width=rect.width,
            height=rect.height,
            dpi=72  # PDF standard DPI
        )

        # Extract text blocks with formatting
        text_dict = page.get_text("dict")

        for block_idx, block in enumerate(text_dict.get("blocks", [])):
            if block.get("type") == 0:  # Text block
                element = self._process_text_block(
                    block, page_num, element_counter
                )
                if element:
                    elements.append(element)
                    element_counter += 1

        # Extract tables (if enabled)
        if self.enable_table_detection:
            try:
                # Try native table detection (PyMuPDF 1.23.0+)
                tables = page.find_tables()
                for table_idx, table in enumerate(tables):
                    element = self._process_native_table(
                        table, page_num, element_counter
                    )
                    if element:
                        elements.append(element)
                        element_counter += 1
            except AttributeError:
                # Fall back to positional table detection
                logger.debug("Native table detection not available, "
                             "using positional detection")
                table_elements = self._detect_tables_by_position(
                    page, page_num, element_counter
                )
                elements.extend(table_elements)
                element_counter += len(table_elements)

        # Extract images (if enabled)
        if self.enable_image_extraction:
            image_elements = self._extract_images(
                page, page_num, document_id, element_counter, output_dir
            )
            elements.extend(image_elements)
            element_counter += len(image_elements)

        # Extract hyperlinks
        links = page.get_links()
        for link_idx, link in enumerate(links):
            # Create a link annotation element if it has a URI
            if link.get("uri"):
                from_rect = link.get("from")
                if from_rect:
                    element = DocumentElement(
                        element_id=f"link_{page_num}_{element_counter}",
                        type=ElementType.REFERENCE,
                        content={"uri": link["uri"], "type": "hyperlink"},
                        bbox=BoundingBox(
                            x0=from_rect.x0, y0=from_rect.y0,
                            x1=from_rect.x1, y1=from_rect.y1
                        ),
                        metadata={"link_type": "external"
                                  if link["uri"].startswith("http") else "internal"}
                    )
                    elements.append(element)
                    element_counter += 1

        # Extract vector graphics (as metadata only)
        drawings = page.get_drawings()
        if drawings:
            logger.debug(f"Page {page_num} contains {len(drawings)} "
                         f"vector drawing commands")

        return Page(
            page_number=page_num,
            elements=elements,
            dimensions=dimensions,
            metadata={
                "has_drawings": len(drawings) > 0,
                "drawing_count": len(drawings),
                "link_count": len(links)
            }
        )
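
    # For reference, `page.get_text("dict")` returns nested blocks/lines/spans.
    # Trimmed sketch of the structure `_process_text_block` below walks
    # (values are illustrative):
    #
    #     {"width": 595.0, "height": 842.0, "blocks": [
    #         {"type": 0, "bbox": (72.0, 72.0, 523.0, 96.0), "lines": [
    #             {"spans": [
    #                 {"text": "Hello", "font": "Helvetica-Bold", "size": 18.0,
    #                  "flags": 16, "color": 0,
    #                  "bbox": (72.0, 72.0, 130.0, 96.0)},
    #             ]},
    #         ]},
    #     ]}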

    def _process_text_block(self, block: Dict, page_num: int,
                            counter: int) -> Optional[DocumentElement]:
        """Process a text block into a DocumentElement"""
        # Calculate the block bounding box
        bbox_data = block.get("bbox", [0, 0, 0, 0])
        bbox = BoundingBox(
            x0=bbox_data[0],
            y0=bbox_data[1],
            x1=bbox_data[2],
            y1=bbox_data[3]
        )

        # Extract text content
        text_parts = []
        styles = []

        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "")
                if text:
                    text_parts.append(text)

                    # Extract style information; bit 4 of the span flags marks
                    # bold, bit 1 marks italic
                    style = StyleInfo(
                        font_name=span.get("font"),
                        font_size=span.get("size"),
                        font_weight="bold" if span.get("flags", 0) & 2**4 else "normal",
                        font_style="italic" if span.get("flags", 0) & 2**1 else "normal",
                        text_color=span.get("color")
                    )
                    styles.append(style)

        if not text_parts:
            return None

        full_text = "".join(text_parts)

        # Determine the element type based on content and style
        element_type = self._infer_element_type(full_text, styles)

        # Use the first span's style for the block
        if styles:
            block_style = styles[0]  # Could be improved with style merging
        else:
            block_style = None

        return DocumentElement(
            element_id=f"text_{page_num}_{counter}",
            type=element_type,
            content=full_text,
            bbox=bbox,
            style=block_style,
            confidence=1.0  # Direct extraction has perfect confidence
        )

    def _infer_element_type(self, text: str,
                            styles: List[StyleInfo]) -> ElementType:
        """Infer the element type from text content and styling"""
        text_lower = text.lower().strip()

        # Short text with a large font is likely a title or header
        if len(text_lower) < 100 and styles:
            avg_size = sum(s.font_size or 12 for s in styles) / len(styles)
            if avg_size > 16:
                return ElementType.TITLE
            elif avg_size > 14:
                return ElementType.HEADER

        # Check for list patterns (bullet characters or "1." / "1)" numbering)
        if re.match(r'^(?:[•·▪▫◦‣⁃]|\d+[.)])\s', text_lower):
            return ElementType.LIST_ITEM

        # Check for page numbers
        if re.match(r'^page\s+\d+|^\d+\s*$|^-\s*\d+\s*-$', text_lower):
            return ElementType.PAGE_NUMBER

        # Check for footnote patterns such as "[1]" or "1)"
        if re.match(r'^\[\d+\]|^\d+\)', text_lower):
            return ElementType.FOOTNOTE

        # Default to paragraph for longer text, plain text for shorter
        return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT

    def _process_native_table(self, table, page_num: int,
                              counter: int) -> Optional[DocumentElement]:
        """Process a natively detected table"""
        try:
            # Extract table data
            data = table.extract()
            if not data or len(data) < self.min_table_rows:
                return None

            # Get the table bounding box
            bbox_data = table.bbox
            bbox = BoundingBox(
                x0=bbox_data[0],
                y0=bbox_data[1],
                x1=bbox_data[2],
                y1=bbox_data[3]
            )

            # Create table cells
            cells = []
            for row_idx, row in enumerate(data):
                for col_idx, cell_text in enumerate(row):
                    if cell_text:
                        cells.append(TableCell(
                            row=row_idx,
                            col=col_idx,
                            content=str(cell_text)
                        ))

            # Create table data
            table_data = TableData(
                rows=len(data),
                cols=max(len(row) for row in data) if data else 0,
                cells=cells,
                headers=data[0] if data else None  # Assume the first row is a header
            )

            return DocumentElement(
                element_id=f"table_{page_num}_{counter}",
                type=ElementType.TABLE,
                content=table_data,
                bbox=bbox,
                confidence=1.0
            )

        except Exception as e:
            logger.error(f"Error processing native table: {e}")
            return None
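
    # Illustrative sketch of the positional fallback below (values hypothetical):
    # a row of words at x-positions [72, 180, 288, 396] has spacings
    # [108, 108, 108], all within 30% of their average, so it is accepted as a
    # candidate table row. A prose line at x-positions [72, 95, 160, 171]
    # (spacings [23, 65, 11], average 33) deviates by more than 9.9 points,
    # so it is rejected and ends any table in progress.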

    def _detect_tables_by_position(self, page: fitz.Page, page_num: int,
                                   counter: int) -> List[DocumentElement]:
        """Detect tables by analyzing text positioning"""
        tables = []

        # Get all words with positions
        words = page.get_text("words")  # Returns (x0, y0, x1, y1, "word", block_no, line_no, word_no)

        if not words:
            return tables

        # Group words by approximate row (y-coordinate)
        rows = {}
        for word in words:
            y = round(word[1] / 5) * 5  # Round to the nearest 5 points
            if y not in rows:
                rows[y] = []
            rows[y].append({
                'x0': word[0],
                'y0': word[1],
                'x1': word[2],
                'y1': word[3],
                'text': word[4],
                'block': word[5] if len(word) > 5 else 0
            })

        # Sort rows by y-coordinate
        sorted_rows = sorted(rows.items(), key=lambda x: x[0])

        # Find potential tables (consecutive rows with multiple columns)
        current_table_rows = []
        tables_found = []

        for y, words_in_row in sorted_rows:
            words_in_row.sort(key=lambda w: w['x0'])

            if len(words_in_row) >= self.min_table_cols:
                # Check whether this could be a table row
                x_positions = [w['x0'] for w in words_in_row]

                # Check for somewhat regular spacing
                if self._has_regular_spacing(x_positions):
                    current_table_rows.append((y, words_in_row))
                else:
                    # End the current table if one exists
                    if len(current_table_rows) >= self.min_table_rows:
                        tables_found.append(current_table_rows)
                    current_table_rows = []
            else:
                # End the current table if one exists
                if len(current_table_rows) >= self.min_table_rows:
                    tables_found.append(current_table_rows)
                current_table_rows = []

        # Don't forget the last table
        if len(current_table_rows) >= self.min_table_rows:
            tables_found.append(current_table_rows)

        # Convert detected tables to DocumentElements
        for table_idx, table_rows in enumerate(tables_found):
            if not table_rows:
                continue

            # Calculate the table bounding box
            all_words = []
            for _, row_words in table_rows:
                all_words.extend(row_words)

            min_x = min(w['x0'] for w in all_words)
            min_y = min(w['y0'] for w in all_words)
            max_x = max(w['x1'] for w in all_words)
            max_y = max(w['y1'] for w in all_words)

            bbox = BoundingBox(x0=min_x, y0=min_y, x1=max_x, y1=max_y)

            # Create table cells
            cells = []
            for row_idx, (y, row_words) in enumerate(table_rows):
                # Group words into columns
                columns = self._group_into_columns(row_words, table_rows)
                for col_idx, col_text in enumerate(columns):
                    if col_text:
                        cells.append(TableCell(
                            row=row_idx,
                            col=col_idx,
                            content=col_text
                        ))

            # Create table data
            table_data = TableData(
                rows=len(table_rows),
                cols=max(len(self._group_into_columns(row_words, table_rows))
                         for _, row_words in table_rows),
                cells=cells
            )

            element = DocumentElement(
                element_id=f"table_{page_num}_{counter + table_idx}",
                type=ElementType.TABLE,
                content=table_data,
                bbox=bbox,
                confidence=0.8,  # Lower confidence for positional detection
                metadata={"detection_method": "positional"}
            )
            tables.append(element)

        return tables

    def _has_regular_spacing(self, x_positions: List[float],
                             tolerance: float = 0.3) -> bool:
        """Check whether x positions have somewhat regular spacing"""
        if len(x_positions) < 3:
            return False

        spacings = [x_positions[i + 1] - x_positions[i]
                    for i in range(len(x_positions) - 1)]
        avg_spacing = sum(spacings) / len(spacings)

        # Check that every spacing is within tolerance of the average
        for spacing in spacings:
            if abs(spacing - avg_spacing) > avg_spacing * tolerance:
                return False

        return True

    def _group_into_columns(self, words: List[Dict],
                            all_rows: List) -> List[str]:
        """Group words into columns based on x-position"""
        if not words:
            return []

        # Find common column positions across all rows
        all_x_positions = []
        for _, row_words in all_rows:
            all_x_positions.extend([w['x0'] for w in row_words])

        # Cluster x-positions to find columns
        column_positions = self._cluster_positions(all_x_positions)

        # Assign each word to the closest column
        columns = [""] * len(column_positions)
        for word in words:
            closest_col = 0
            min_dist = float('inf')
            for col_idx, col_x in enumerate(column_positions):
                dist = abs(word['x0'] - col_x)
                if dist < min_dist:
                    min_dist = dist
                    closest_col = col_idx

            if columns[closest_col]:
                columns[closest_col] += " " + word['text']
            else:
                columns[closest_col] = word['text']

        return columns
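
    # Worked example for the clustering below: x-positions
    # [10, 12, 100, 103, 200] with the default threshold of 20 form three
    # clusters [10, 12], [100, 103], [200], whose averages
    # [11.0, 101.5, 200.0] serve as the column anchor positions.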

    def _cluster_positions(self, positions: List[float],
                           threshold: float = 20) -> List[float]:
        """Cluster positions to find common columns"""
        if not positions:
            return []

        sorted_pos = sorted(positions)
        clusters = [[sorted_pos[0]]]

        for pos in sorted_pos[1:]:
            # Check whether the position belongs to the current cluster
            if pos - clusters[-1][-1] < threshold:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])

        # Return the average position of each cluster
        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _extract_images(self, page: fitz.Page, page_num: int,
                        document_id: str, counter: int,
                        output_dir: Optional[Path]) -> List[DocumentElement]:
        """Extract images from a page"""
        elements = []
        image_list = page.get_images()

        for img_idx, img in enumerate(image_list):
            try:
                xref = img[0]

                # Get the image position(s)
                img_rects = page.get_image_rects(xref)
                if not img_rects:
                    continue

                rect = img_rects[0]  # Use the first occurrence
                bbox = BoundingBox(
                    x0=rect.x0, y0=rect.y0,
                    x1=rect.x1, y1=rect.y1
                )

                # Extract image data
                pix = fitz.Pixmap(page.parent, xref)

                image_data = {
                    "width": pix.width,
                    "height": pix.height,
                    "colorspace": pix.colorspace.name if pix.colorspace else "unknown",
                    "xref": xref
                }

                # Save the image if an output directory was provided
                if output_dir:
                    output_dir.mkdir(parents=True, exist_ok=True)
                    image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
                    image_path = output_dir / image_filename

                    # PNG cannot represent CMYK; convert such pixmaps to RGB first
                    if pix.colorspace and pix.colorspace.n > 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    pix.save(str(image_path))
                    image_data["saved_path"] = str(image_path)
                    logger.debug(f"Saved image to {image_path}")

                element = DocumentElement(
                    element_id=f"image_{page_num}_{counter + img_idx}",
                    type=ElementType.IMAGE,
                    content=image_data,
                    bbox=bbox,
                    confidence=1.0,
                    metadata={
                        "image_index": img_idx,
                        "xref": xref
                    }
                )
                elements.append(element)

                pix = None  # Free memory

            except Exception as e:
                logger.error(f"Error extracting image {img_idx}: {e}")

        return elements
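
if __name__ == "__main__":  # pragma: no cover
    # Minimal smoke-test sketch. Because of the relative imports above, this
    # only works when the module is run as part of its package, e.g.
    # `python -m <your_package>.direct_extraction input.pdf` (the package name
    # here is hypothetical -- adjust to your layout).
    import sys

    logging.basicConfig(level=logging.INFO)
    engine = DirectExtractionEngine()
    document = engine.extract(Path(sys.argv[1]), output_dir=Path("extracted_images"))
    print(f"Extracted {len(document.pages)} page(s) "
          f"from {document.metadata.filename} "
          f"in {document.metadata.processing_time:.2f}s")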