feat: implement core dual-track processing infrastructure
Added foundation for dual-track document processing: 1. UnifiedDocument Model (backend/app/models/unified_document.py) - Common output format for both OCR and direct extraction - Comprehensive element types (23+ types from PP-StructureV3) - BoundingBox, StyleInfo, TableData structures - Backward compatibility with legacy format 2. DocumentTypeDetector Service (backend/app/services/document_type_detector.py) - Intelligent document type detection using python-magic - PDF editability analysis using PyMuPDF - Processing track recommendation with confidence scores - Support for PDF, images, Office docs, and text files 3. DirectExtractionEngine Service (backend/app/services/direct_extraction_engine.py) - Fast extraction from editable PDFs using PyMuPDF - Preserves fonts, colors, and exact positioning - Native and positional table detection - Image extraction with coordinates - Hyperlink and metadata extraction 4. Dependencies - Added PyMuPDF>=1.23.0 for PDF extraction - Added pdfplumber>=0.10.0 as fallback - Added python-magic-bin>=0.4.14 for file detection Next: Integrate with OCR service for complete dual-track processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
633
backend/app/services/direct_extraction_engine.py
Normal file
633
backend/app/services/direct_extraction_engine.py
Normal file
@@ -0,0 +1,633 @@
|
||||
"""
|
||||
Direct Extraction Engine using PyMuPDF
|
||||
|
||||
Handles direct text and structure extraction from editable PDFs without OCR.
|
||||
This provides much faster processing and perfect accuracy for documents with
|
||||
extractable text.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import fitz # PyMuPDF
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Any, Union
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
from ..models.unified_document import (
|
||||
UnifiedDocument, DocumentElement, Page, DocumentMetadata,
|
||||
BoundingBox, StyleInfo, TableData, TableCell, Dimensions,
|
||||
ElementType, ProcessingTrack
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DirectExtractionEngine:
    """
    Engine for direct text extraction from editable PDFs using PyMuPDF.

    This engine provides:
    - Text extraction per text block, with the block's bounding box
    - Font name, size, weight, style and colour captured per span
    - Table structure detection (native PyMuPDF finder, with a positional
      fallback when the native finder is unavailable)
    - Image extraction with coordinates, optionally saving images to disk
    - Hyperlink extraction for links that carry a URI

    Vector drawings are only counted and reported in the page metadata;
    they are not converted into elements.
    """
|
||||
|
||||
def __init__(self,
|
||||
enable_table_detection: bool = True,
|
||||
enable_image_extraction: bool = True,
|
||||
min_table_rows: int = 2,
|
||||
min_table_cols: int = 2):
|
||||
"""
|
||||
Initialize the extraction engine.
|
||||
|
||||
Args:
|
||||
enable_table_detection: Whether to detect and extract tables
|
||||
enable_image_extraction: Whether to extract images
|
||||
min_table_rows: Minimum rows for table detection
|
||||
min_table_cols: Minimum columns for table detection
|
||||
"""
|
||||
self.enable_table_detection = enable_table_detection
|
||||
self.enable_image_extraction = enable_image_extraction
|
||||
self.min_table_rows = min_table_rows
|
||||
self.min_table_cols = min_table_cols
|
||||
|
||||
def extract(self,
            file_path: Path,
            output_dir: Optional[Path] = None) -> UnifiedDocument:
    """
    Extract content from PDF file to UnifiedDocument format.

    Failures do not propagate: any exception raised while opening or
    processing the document is logged and reported through the
    ``processing_errors`` field of the returned document, together with
    whatever pages were extracted before the failure.

    Args:
        file_path: Path to PDF file
        output_dir: Optional directory to save extracted images

    Returns:
        UnifiedDocument with extracted content
    """
    start_time = datetime.now()
    document_id = str(uuid.uuid4())

    # Bound up front so the error path can report partial results without
    # having to inspect locals().
    metadata = None
    pages = []
    doc = None

    try:
        doc = fitz.open(str(file_path))

        # Extract document metadata
        metadata = self._extract_metadata(file_path, doc, start_time)

        # Extract pages (page numbers handed to helpers are 1-based)
        page_count = len(doc)
        for page_index in range(page_count):
            logger.info(f"Extracting page {page_index + 1}/{page_count}")
            pages.append(
                self._extract_page(
                    doc[page_index],
                    page_index + 1,
                    document_id,
                    output_dir
                )
            )

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()
        metadata.processing_time = processing_time

        logger.info(f"Direct extraction completed in {processing_time:.2f}s")

        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages
        )

    except Exception as e:
        logger.error(f"Error during direct extraction: {e}")
        # Return partial result with error information
        processing_time = (datetime.now() - start_time).total_seconds()

        if metadata is None:
            # Metadata extraction never completed; build a minimal record.
            try:
                file_size = file_path.stat().st_size
            except OSError:
                file_size = 0  # file missing or unreadable
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type="pdf",
                file_size=file_size,
                created_at=datetime.now(),
                processing_track=ProcessingTrack.DIRECT,
                processing_time=processing_time
            )
        else:
            # Keep the elapsed time accurate for partially processed files.
            metadata.processing_time = processing_time

        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages,
            processing_errors=[{
                "error": str(e),
                "type": type(e).__name__
            }]
        )

    finally:
        # Release the document handle on both the success and error paths.
        if doc is not None:
            doc.close()
|
||||
|
||||
def _extract_metadata(self,
                      file_path: Path,
                      doc: fitz.Document,
                      start_time: datetime) -> DocumentMetadata:
    """
    Build the DocumentMetadata record for an opened PDF.

    Args:
        file_path: Path of the PDF on disk (used for name and size)
        doc: Opened PyMuPDF document
        start_time: Extraction start time, stored as ``created_at``

    Returns:
        DocumentMetadata with ``processing_time`` set to 0.0; the caller
        fills in the real value once extraction has finished.
    """
    # PyMuPDF may expose no metadata dict at all (e.g. for a document that
    # still needs a password), so fall back to an empty mapping.
    pdf_metadata = doc.metadata or {}

    # Keywords are stored as one comma-separated string; trim each entry
    # and drop blanks so "a, b," yields ["a", "b"].
    keywords = None
    raw_keywords = pdf_metadata.get("keywords")
    if raw_keywords:
        keywords = [kw.strip() for kw in raw_keywords.split(",") if kw.strip()] or None

    return DocumentMetadata(
        filename=file_path.name,
        file_type="pdf",
        file_size=file_path.stat().st_size,
        created_at=start_time,
        processing_track=ProcessingTrack.DIRECT,
        processing_time=0.0,  # Will be updated later
        title=pdf_metadata.get("title"),
        author=pdf_metadata.get("author"),
        subject=pdf_metadata.get("subject"),
        keywords=keywords,
        producer=pdf_metadata.get("producer"),
        creator=pdf_metadata.get("creator"),
        creation_date=self._parse_pdf_date(pdf_metadata.get("creationDate")),
        modification_date=self._parse_pdf_date(pdf_metadata.get("modDate"))
    )
|
||||
|
||||
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
|
||||
"""Parse PDF date string to datetime"""
|
||||
if not date_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
# PDF date format: D:YYYYMMDDHHmmSSOHH'mm
|
||||
# Example: D:20240101120000+09'00
|
||||
if date_str.startswith("D:"):
|
||||
date_str = date_str[2:]
|
||||
|
||||
# Extract just the date/time part (first 14 characters)
|
||||
if len(date_str) >= 14:
|
||||
date_part = date_str[:14]
|
||||
return datetime.strptime(date_part, "%Y%m%d%H%M%S")
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _extract_page(self,
                  page: fitz.Page,
                  page_num: int,
                  document_id: str,
                  output_dir: Optional[Path]) -> Page:
    """
    Convert one PDF page into a Page of DocumentElements.

    Elements are collected in this order: text blocks, tables (when
    enabled), images (when enabled), then hyperlinks. A single running
    index is shared by all of them and used to build element ids.

    Args:
        page: PyMuPDF page to process
        page_num: Page number passed through to element ids and the result
        document_id: Id of the owning document (used for image file names)
        output_dir: Optional directory where extracted images are saved

    Returns:
        Page holding the elements, the page dimensions and drawing/link
        counts in its metadata.
    """
    page_rect = page.rect
    dimensions = Dimensions(
        width=page_rect.width,
        height=page_rect.height,
        dpi=72  # PDF standard DPI
    )

    elements = []
    next_index = 0

    # --- Text blocks (block type 0 in the "dict" output) ---
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type") != 0:
            continue
        text_element = self._process_text_block(block, page_num, next_index)
        if text_element:
            elements.append(text_element)
            next_index += 1

    # --- Tables ---
    if self.enable_table_detection:
        try:
            # Try native table detection (PyMuPDF 1.23.0+)
            for native_table in page.find_tables():
                table_element = self._process_native_table(
                    native_table, page_num, next_index
                )
                if table_element:
                    elements.append(table_element)
                    next_index += 1
        except AttributeError:
            # Fallback to positional table detection
            logger.debug("Native table detection not available, using positional detection")
            positional_tables = self._detect_tables_by_position(page, page_num, next_index)
            elements.extend(positional_tables)
            next_index += len(positional_tables)

    # --- Images ---
    if self.enable_image_extraction:
        image_elements = self._extract_images(
            page, page_num, document_id, next_index, output_dir
        )
        elements.extend(image_elements)
        next_index += len(image_elements)

    # --- Hyperlinks: only links with both a URI and a source rectangle ---
    links = page.get_links()
    for link in links:
        uri = link.get("uri")
        if not uri:
            continue
        source_rect = link.get("from")
        if not source_rect:
            continue
        link_kind = "external" if uri.startswith("http") else "internal"
        elements.append(DocumentElement(
            element_id=f"link_{page_num}_{next_index}",
            type=ElementType.REFERENCE,
            content={"uri": uri, "type": "hyperlink"},
            bbox=BoundingBox(
                x0=source_rect.x0,
                y0=source_rect.y0,
                x1=source_rect.x1,
                y1=source_rect.y1
            ),
            metadata={"link_type": link_kind}
        ))
        next_index += 1

    # --- Vector graphics are only counted, not converted to elements ---
    drawings = page.get_drawings()
    drawing_count = len(drawings)
    if drawings:
        logger.debug(f"Page {page_num} contains {drawing_count} vector drawing commands")

    page_metadata = {
        "has_drawings": drawing_count > 0,
        "drawing_count": drawing_count,
        "link_count": len(links)
    }
    return Page(
        page_number=page_num,
        elements=elements,
        dimensions=dimensions,
        metadata=page_metadata
    )
|
||||
|
||||
def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
    """
    Process a text block into a DocumentElement.

    Spans belonging to the same line are concatenated as-is; separate
    lines are joined with a newline so the last word of one line does not
    fuse with the first word of the next.

    Args:
        block: One text block from ``page.get_text("dict")``
        page_num: Page number used in the element id
        counter: Running element index used in the element id

    Returns:
        The element for this block, or None when the block has no text.
    """
    line_texts = []
    styles = []

    for line in block.get("lines", []):
        span_texts = []
        for span in line.get("spans", []):
            text = span.get("text", "")
            if not text:
                continue
            span_texts.append(text)

            # Extract style information. Flag bit 4 (value 16) is read as
            # bold and bit 1 (value 2) as italic.
            flags = span.get("flags", 0)
            styles.append(StyleInfo(
                font_name=span.get("font"),
                font_size=span.get("size"),
                font_weight="bold" if flags & 2**4 else "normal",
                font_style="italic" if flags & 2**1 else "normal",
                text_color=span.get("color")
            ))
        if span_texts:
            line_texts.append("".join(span_texts))

    if not line_texts:
        return None

    full_text = "\n".join(line_texts)

    # Calculate block bounding box
    bbox_data = block.get("bbox", [0, 0, 0, 0])
    bbox = BoundingBox(
        x0=bbox_data[0],
        y0=bbox_data[1],
        x1=bbox_data[2],
        y1=bbox_data[3]
    )

    # Determine element type based on content and style
    element_type = self._infer_element_type(full_text, styles)

    # The first span's style represents the whole block; styles of later
    # spans are used only for type inference. Could be improved with
    # style merging.
    block_style = styles[0] if styles else None

    return DocumentElement(
        element_id=f"text_{page_num}_{counter}",
        type=element_type,
        content=full_text,
        bbox=bbox,
        style=block_style,
        confidence=1.0  # Direct extraction has perfect confidence
    )
|
||||
|
||||
def _infer_element_type(self, text: str, styles: List[StyleInfo]) -> ElementType:
    """
    Infer element type based on text content and styling.

    Checks run in this order and the first match wins:
    1. Short text (< 100 chars) with a large average font size:
       TITLE above 16pt, HEADER above 14pt.
    2. Leading digit or bullet character followed by whitespace: LIST_ITEM.
    3. Page-number shapes ("page 3", "12", "- 4 -"): PAGE_NUMBER.
    4. Leading "[12]" or "12)" marker: FOOTNOTE.
    5. Otherwise PARAGRAPH for text longer than 150 chars, else TEXT.

    Args:
        text: Full text of the block
        styles: Styles of the spans that make up the block

    Returns:
        The inferred ElementType.
    """
    text_lower = text.lower().strip()

    # Short text with large font might be title/header. Spans without a
    # font size count as 12pt.
    if len(text_lower) < 100 and styles:
        avg_size = sum(s.font_size or 12 for s in styles) / len(styles)
        if avg_size > 16:
            return ElementType.TITLE
        elif avg_size > 14:
            return ElementType.HEADER

    # Check for list patterns
    if re.match(r'^[\d•·▪▫◦‣⁃]\s', text_lower):
        return ElementType.LIST_ITEM

    # Check for page numbers
    if re.match(r'^page\s+\d+|^\d+\s*$|^-\s*\d+\s*-$', text_lower):
        return ElementType.PAGE_NUMBER

    # Check for footnote patterns: a bracketed number such as "[12]" or a
    # number followed by ")". The brackets must be escaped literals; as a
    # character class the pattern would match any text starting with a
    # digit.
    if re.match(r'^\[\d+\]|^\d+\)', text_lower):
        return ElementType.FOOTNOTE

    # Default to paragraph for longer text, text for shorter
    return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
|
||||
|
||||
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
|
||||
"""Process a natively detected table"""
|
||||
try:
|
||||
# Extract table data
|
||||
data = table.extract()
|
||||
if not data or len(data) < self.min_table_rows:
|
||||
return None
|
||||
|
||||
# Get table bounding box
|
||||
bbox_data = table.bbox
|
||||
bbox = BoundingBox(
|
||||
x0=bbox_data[0],
|
||||
y0=bbox_data[1],
|
||||
x1=bbox_data[2],
|
||||
y1=bbox_data[3]
|
||||
)
|
||||
|
||||
# Create table cells
|
||||
cells = []
|
||||
for row_idx, row in enumerate(data):
|
||||
for col_idx, cell_text in enumerate(row):
|
||||
if cell_text:
|
||||
cells.append(TableCell(
|
||||
row=row_idx,
|
||||
col=col_idx,
|
||||
content=str(cell_text) if cell_text else ""
|
||||
))
|
||||
|
||||
# Create table data
|
||||
table_data = TableData(
|
||||
rows=len(data),
|
||||
cols=max(len(row) for row in data) if data else 0,
|
||||
cells=cells,
|
||||
headers=data[0] if data else None # Assume first row is header
|
||||
)
|
||||
|
||||
return DocumentElement(
|
||||
element_id=f"table_{page_num}_{counter}",
|
||||
type=ElementType.TABLE,
|
||||
content=table_data,
|
||||
bbox=bbox,
|
||||
confidence=1.0
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing native table: {e}")
|
||||
return None
|
||||
|
||||
def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
    """
    Detect tables by analyzing text positioning.

    Words are bucketed into rows by their top y-coordinate (rounded to the
    nearest 5 points). A run of consecutive rows that each contain at
    least ``min_table_cols`` regularly spaced words, and that is at least
    ``min_table_rows`` long, is reported as one table.

    Args:
        page: PyMuPDF page to analyse
        page_num: Page number used in element ids
        counter: First running element index to use for element ids

    Returns:
        List of TABLE elements (confidence 0.8), possibly empty.
    """
    # Each entry: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
    words = page.get_text("words")
    if not words:
        return []

    # Group words by approximate row (y-coordinate)
    rows = {}
    for word in words:
        row_key = round(word[1] / 5) * 5  # Round to nearest 5 points
        rows.setdefault(row_key, []).append({
            'x0': word[0],
            'y0': word[1],
            'x1': word[2],
            'y1': word[3],
            'text': word[4],
            'block': word[5] if len(word) > 5 else 0
        })

    # Walk rows top to bottom, collecting runs of table-like rows
    tables_found = []
    current_table_rows = []
    for row_key in sorted(rows):
        row_words = sorted(rows[row_key], key=lambda w: w['x0'])
        is_table_row = (
            len(row_words) >= self.min_table_cols
            and self._has_regular_spacing([w['x0'] for w in row_words])
        )
        if is_table_row:
            current_table_rows.append((row_key, row_words))
            continue
        # A non-table row ends the current run
        if len(current_table_rows) >= self.min_table_rows:
            tables_found.append(current_table_rows)
        current_table_rows = []

    # Don't forget the last table
    if len(current_table_rows) >= self.min_table_rows:
        tables_found.append(current_table_rows)

    # Convert detected tables to DocumentElements
    tables = []
    for table_idx, table_rows in enumerate(tables_found):
        if not table_rows:
            continue

        # Calculate table bounding box from every word in the table
        all_words = [w for _, row_words in table_rows for w in row_words]
        bbox = BoundingBox(
            x0=min(w['x0'] for w in all_words),
            y0=min(w['y0'] for w in all_words),
            x1=max(w['x1'] for w in all_words),
            y1=max(w['y1'] for w in all_words)
        )

        # Resolve each row's column texts once and reuse the result for
        # both the cells and the column count.
        row_columns = [
            self._group_into_columns(row_words, table_rows)
            for _, row_words in table_rows
        ]

        cells = [
            TableCell(row=row_idx, col=col_idx, content=col_text)
            for row_idx, columns in enumerate(row_columns)
            for col_idx, col_text in enumerate(columns)
            if col_text
        ]

        table_data = TableData(
            rows=len(table_rows),
            cols=max(len(columns) for columns in row_columns),
            cells=cells
        )

        tables.append(DocumentElement(
            element_id=f"table_{page_num}_{counter + table_idx}",
            type=ElementType.TABLE,
            content=table_data,
            bbox=bbox,
            confidence=0.8,  # Lower confidence for positional detection
            metadata={"detection_method": "positional"}
        ))

    return tables
|
||||
|
||||
def _has_regular_spacing(self, x_positions: List[float], tolerance: float = 0.3) -> bool:
|
||||
"""Check if x positions have somewhat regular spacing"""
|
||||
if len(x_positions) < 3:
|
||||
return False
|
||||
|
||||
spacings = [x_positions[i+1] - x_positions[i] for i in range(len(x_positions)-1)]
|
||||
avg_spacing = sum(spacings) / len(spacings)
|
||||
|
||||
# Check if spacings are within tolerance of average
|
||||
for spacing in spacings:
|
||||
if abs(spacing - avg_spacing) > avg_spacing * tolerance:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _group_into_columns(self, words: List[Dict], all_rows: List) -> List[str]:
|
||||
"""Group words into columns based on x-position"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Find common column positions across all rows
|
||||
all_x_positions = []
|
||||
for _, row_words in all_rows:
|
||||
all_x_positions.extend([w['x0'] for w in row_words])
|
||||
|
||||
# Cluster x-positions to find columns
|
||||
column_positions = self._cluster_positions(all_x_positions)
|
||||
|
||||
# Assign words to columns
|
||||
columns = [""] * len(column_positions)
|
||||
for word in words:
|
||||
# Find closest column
|
||||
closest_col = 0
|
||||
min_dist = float('inf')
|
||||
for col_idx, col_x in enumerate(column_positions):
|
||||
dist = abs(word['x0'] - col_x)
|
||||
if dist < min_dist:
|
||||
min_dist = dist
|
||||
closest_col = col_idx
|
||||
|
||||
if columns[closest_col]:
|
||||
columns[closest_col] += " " + word['text']
|
||||
else:
|
||||
columns[closest_col] = word['text']
|
||||
|
||||
return columns
|
||||
|
||||
def _cluster_positions(self, positions: List[float], threshold: float = 20) -> List[float]:
|
||||
"""Cluster positions to find common columns"""
|
||||
if not positions:
|
||||
return []
|
||||
|
||||
sorted_pos = sorted(positions)
|
||||
clusters = [[sorted_pos[0]]]
|
||||
|
||||
for pos in sorted_pos[1:]:
|
||||
# Check if position belongs to current cluster
|
||||
if pos - clusters[-1][-1] < threshold:
|
||||
clusters[-1].append(pos)
|
||||
else:
|
||||
clusters.append([pos])
|
||||
|
||||
# Return average position of each cluster
|
||||
return [sum(cluster) / len(cluster) for cluster in clusters]
|
||||
|
||||
def _extract_images(self,
                    page: fitz.Page,
                    page_num: int,
                    document_id: str,
                    counter: int,
                    output_dir: Optional[Path]) -> List[DocumentElement]:
    """
    Extract images from page.

    Each image referenced by the page becomes one IMAGE element positioned
    at the image's first occurrence on the page. When ``output_dir`` is
    given, the image is also written there as a PNG and the file path is
    recorded under ``saved_path`` in the element content.

    A failure on one image is logged and does not affect the others.

    Args:
        page: PyMuPDF page to scan
        page_num: Page number used in element ids and file names
        document_id: Document id used as file-name prefix
        counter: First running element index to use for element ids
        output_dir: Optional directory for saved PNG files

    Returns:
        List of IMAGE elements, possibly empty.
    """
    elements = []

    for img_idx, img in enumerate(page.get_images()):
        pix = None
        try:
            xref = img[0]

            # Get image position(s); skip images that are not placed
            img_rects = page.get_image_rects(xref)
            if not img_rects:
                continue

            rect = img_rects[0]  # Use first occurrence
            bbox = BoundingBox(
                x0=rect.x0,
                y0=rect.y0,
                x1=rect.x1,
                y1=rect.y1
            )

            # Extract image data. Properties are recorded before any
            # conversion so they describe the image as stored in the PDF.
            pix = fitz.Pixmap(page.parent, xref)
            image_data = {
                "width": pix.width,
                "height": pix.height,
                "colorspace": pix.colorspace.name if pix.colorspace else "unknown",
                "xref": xref
            }

            # Save image if output directory provided
            if output_dir:
                output_dir.mkdir(parents=True, exist_ok=True)
                image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
                image_path = output_dir / image_filename
                # PNG can only hold gray or RGB data; pixmaps with more
                # colour components (e.g. CMYK) must be converted first.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(str(image_path))
                image_data["saved_path"] = str(image_path)
                logger.debug(f"Saved image to {image_path}")

            elements.append(DocumentElement(
                element_id=f"image_{page_num}_{counter + img_idx}",
                type=ElementType.IMAGE,
                content=image_data,
                bbox=bbox,
                confidence=1.0,
                metadata={
                    "image_index": img_idx,
                    "xref": xref
                }
            ))

        except Exception as e:
            logger.error(f"Error extracting image {img_idx}: {e}")

        finally:
            pix = None  # Free memory, on failure as well as success

    return elements
|
||||
397
backend/app/services/document_type_detector.py
Normal file
397
backend/app/services/document_type_detector.py
Normal file
@@ -0,0 +1,397 @@
|
||||
"""
|
||||
Document Type Detector Service
|
||||
|
||||
Intelligently determines the optimal processing track for documents based on
|
||||
file type, content analysis, and editability checks.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import magic
|
||||
import fitz # PyMuPDF
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple, List
|
||||
from enum import Enum
|
||||
import statistics
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentType(str, Enum):
    """Closed set of document categories the detector can report."""

    # PDF variants
    PDF_EDITABLE = "pdf_editable"    # text can be extracted directly
    PDF_SCANNED = "pdf_scanned"      # image-only / scanned content
    PDF_MIXED = "pdf_mixed"          # some text pages, some scanned pages

    # Raster images (PNG, JPG, etc.)
    IMAGE = "image"

    # Office formats
    OFFICE_WORD = "office_word"      # Word documents
    OFFICE_EXCEL = "office_excel"    # Excel spreadsheets
    OFFICE_POWERPOINT = "office_ppt" # PowerPoint presentations

    # Plain text files
    TEXT = "text"

    # Anything that could not be classified
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class ProcessingTrackRecommendation:
    """Outcome of document analysis: which track to use, and why."""

    def __init__(self,
                 track: str,
                 confidence: float,
                 reason: str,
                 document_type: DocumentType,
                 metadata: Optional[Dict] = None):
        """
        Args:
            track: Recommended processing track, "ocr" or "direct"
            confidence: Confidence in the recommendation, 0.0 to 1.0
            reason: Human-readable explanation of the choice
            document_type: Detected document category
            metadata: Optional analysis details; an empty dict when omitted
        """
        self.track = track
        self.confidence = confidence
        self.reason = reason
        self.document_type = document_type
        # A falsy value (None or an empty dict) is replaced by a fresh dict.
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return the recommendation as a plain dict, with the document type as its string value."""
        payload = {}
        payload["recommended_track"] = self.track
        payload["confidence"] = self.confidence
        payload["reason"] = self.reason
        payload["document_type"] = self.document_type.value
        payload["metadata"] = self.metadata
        return payload
|
||||
|
||||
|
||||
class DocumentTypeDetector:
    """
    Service for detecting document types and recommending processing tracks.

    This service analyzes documents to determine:
    1. The document type (PDF, image, Office, etc.)
    2. Whether the document contains extractable text
    3. The recommended processing track (OCR vs Direct)

    Routing is driven by the file's detected MIME type. Only PDFs are
    opened and inspected; images, Office documents and text files are
    classified from the MIME type alone.
    """

    # MIME type mappings
    # Image MIME types; files matching any of these are routed to OCR.
    IMAGE_MIMES = {
        'image/png', 'image/jpeg', 'image/jpg', 'image/gif',
        'image/bmp', 'image/tiff', 'image/webp'
    }

    # Office MIME types (OOXML and legacy binary formats) mapped to the
    # DocumentType reported for them.
    OFFICE_MIMES = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': DocumentType.OFFICE_WORD,
        'application/msword': DocumentType.OFFICE_WORD,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': DocumentType.OFFICE_EXCEL,
        'application/vnd.ms-excel': DocumentType.OFFICE_EXCEL,
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': DocumentType.OFFICE_POWERPOINT,
        'application/vnd.ms-powerpoint': DocumentType.OFFICE_POWERPOINT,
    }
|
||||
|
||||
def __init__(self,
|
||||
min_text_length: int = 100,
|
||||
sample_pages: int = 3,
|
||||
text_coverage_threshold: float = 0.9):
|
||||
"""
|
||||
Initialize the detector.
|
||||
|
||||
Args:
|
||||
min_text_length: Minimum text length to consider a page as having extractable text
|
||||
sample_pages: Number of pages to sample for PDF analysis
|
||||
text_coverage_threshold: Percentage of pages with text to classify as editable
|
||||
"""
|
||||
self.min_text_length = min_text_length
|
||||
self.sample_pages = sample_pages
|
||||
self.text_coverage_threshold = text_coverage_threshold
|
||||
|
||||
def detect(self, file_path: Path) -> ProcessingTrackRecommendation:
    """
    Detect document type and recommend processing track.

    The file's MIME type is detected and the file is handed to the
    matching analyser. Missing files, unrecognised MIME types and
    detection errors all yield an OCR recommendation with reduced
    confidence instead of raising.

    Args:
        file_path: Path to the document file

    Returns:
        ProcessingTrackRecommendation with track selection and metadata
    """
    if not file_path.exists():
        logger.error(f"File not found: {file_path}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason="File not found, defaulting to OCR",
            document_type=DocumentType.UNKNOWN
        )

    try:
        # Detect MIME type
        mime_type = magic.from_file(str(file_path), mime=True)
        logger.info(f"Detected MIME type: {mime_type} for {file_path.name}")

        # Route based on file type; the first matching category wins
        if mime_type == 'application/pdf':
            return self._analyze_pdf(file_path)
        if mime_type in self.IMAGE_MIMES:
            return self._analyze_image(file_path, mime_type)
        if mime_type in self.OFFICE_MIMES:
            return self._analyze_office(file_path, mime_type)
        if mime_type.startswith('text/'):
            return self._analyze_text(file_path, mime_type)

        logger.warning(f"Unknown MIME type: {mime_type}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason=f"Unknown file type ({mime_type}), defaulting to OCR",
            document_type=DocumentType.UNKNOWN
        )

    except Exception as e:
        logger.error(f"Error detecting document type: {e}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.3,
            reason=f"Error during detection: {str(e)}",
            document_type=DocumentType.UNKNOWN
        )
|
||||
|
||||
def _analyze_pdf(self, file_path: Path) -> ProcessingTrackRecommendation:
    """
    Analyze PDF to determine if it's editable or scanned.

    The first ``sample_pages`` pages are inspected. A page counts as a
    text page when its stripped text is at least ``min_text_length``
    characters long. The share of text pages then decides the result:
    at or above ``text_coverage_threshold`` the PDF is editable (direct
    track), at or below 10% it is scanned (OCR), and anything in between
    is mixed (OCR).

    Args:
        file_path: Path to PDF file

    Returns:
        Processing track recommendation. On any error an OCR
        recommendation with confidence 0.5 is returned instead of raising.
    """
    try:
        doc = fitz.open(str(file_path))
        try:
            total_pages = len(doc)

            # Sample pages for analysis (the leading pages of the file)
            pages_to_check = min(self.sample_pages, total_pages)
            text_pages = []
            page_details = []

            for page_num in range(pages_to_check):
                page = doc[page_num]

                # Extract text
                text_length = len(page.get_text().strip())

                # Check for images
                images = page.get_images()
                image_count = len(images)

                # Calculate page area covered by images. Overlapping or
                # repeated images are summed, so the ratio can exceed 1.
                page_rect = page.rect
                page_area = page_rect.width * page_rect.height
                image_area = 0

                for img in images:
                    try:
                        for rect in page.get_image_rects(img[0]):
                            image_area += rect.width * rect.height
                    except Exception as rect_error:
                        # Coverage is informational only; skip this image.
                        logger.debug(f"Could not measure image on page {page_num + 1}: {rect_error}")

                image_coverage = image_area / page_area if page_area > 0 else 0

                # Determine if page has meaningful text
                has_text = text_length >= self.min_text_length

                text_pages.append(has_text)
                page_details.append({
                    "page": page_num + 1,
                    "text_length": text_length,
                    "has_text": has_text,
                    "image_count": image_count,
                    "image_coverage": image_coverage
                })

                logger.debug(f"Page {page_num + 1}: text_length={text_length}, "
                             f"images={image_count}, image_coverage={image_coverage:.2%}")
        finally:
            # Always release the document, even when a page fails.
            doc.close()

        # Calculate text coverage (0 when no page was sampled)
        text_coverage = sum(text_pages) / len(text_pages) if text_pages else 0

        # Determine document type and track
        metadata = {
            "total_pages": total_pages,
            "sampled_pages": pages_to_check,
            "text_coverage": text_coverage,
            "page_details": page_details
        }

        if text_coverage >= self.text_coverage_threshold:
            # Mostly text-based PDF
            return ProcessingTrackRecommendation(
                track="direct",
                confidence=0.95,
                reason=f"PDF has extractable text on {text_coverage:.0%} of sampled pages",
                document_type=DocumentType.PDF_EDITABLE,
                metadata=metadata
            )
        elif text_coverage <= 0.1:
            # Mostly scanned/image PDF
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.95,
                reason=f"PDF appears to be scanned (only {text_coverage:.0%} pages have text)",
                document_type=DocumentType.PDF_SCANNED,
                metadata=metadata
            )
        else:
            # Mixed content
            # For mixed PDFs, we could implement page-level track selection in the future
            # For now, use OCR to ensure we don't miss scanned content
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.7,
                reason=f"PDF has mixed content ({text_coverage:.0%} text pages), using OCR for completeness",
                document_type=DocumentType.PDF_MIXED,
                metadata=metadata
            )

    except Exception as e:
        logger.error(f"Error analyzing PDF: {e}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason=f"Error analyzing PDF: {str(e)}",
            document_type=DocumentType.PDF_SCANNED
        )
|
||||
|
||||
def _analyze_image(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for an image file.

    Images always require OCR processing; the file is not opened, only
    its size and extension are recorded.

    Args:
        file_path: Path to the image file
        mime_type: MIME type detected for the file

    Returns:
        OCR recommendation with confidence 1.0.
    """
    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )
    return ProcessingTrackRecommendation(
        track="ocr",
        confidence=1.0,
        reason="Image files require OCR processing",
        document_type=DocumentType.IMAGE,
        metadata=details
    )
|
||||
|
||||
def _analyze_office(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for an Office document.

    Currently routes all Office documents to OCR track.
    Future enhancement: implement direct extraction for Office files.

    Args:
        file_path: Path to the Office file
        mime_type: MIME type detected for the file; looked up in
            OFFICE_MIMES, with UNKNOWN as the fallback document type

    Returns:
        OCR recommendation with confidence 0.9.
    """
    office_kind = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)

    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )

    # TODO: In future, we could implement direct extraction for Office files
    # using python-docx, openpyxl, python-pptx
    return ProcessingTrackRecommendation(
        track="ocr",
        confidence=0.9,
        reason="Office documents currently processed via OCR (direct extraction planned)",
        document_type=office_kind,
        metadata=details
    )
|
||||
|
||||
def _analyze_text(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for a plain text file.

    Plain text files can be directly processed without OCR; the file is
    not read, only its size and extension are recorded.

    Args:
        file_path: Path to the text file
        mime_type: MIME type detected for the file

    Returns:
        Direct-track recommendation with confidence 1.0.
    """
    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )
    return ProcessingTrackRecommendation(
        track="direct",
        confidence=1.0,
        reason="Plain text files can be directly processed",
        document_type=DocumentType.TEXT,
        metadata=details
    )
|
||||
|
||||
def analyze_batch(self, file_paths: List[Path]) -> Dict[str, ProcessingTrackRecommendation]:
    """
    Run detection over several files.

    A failure on one file is logged and recorded as a low-confidence OCR
    recommendation for that file; the remaining files are still analysed.

    Args:
        file_paths: List of file paths to analyze

    Returns:
        Dictionary mapping each path (as a string) to its recommendation
    """
    outcome = {}

    for path in file_paths:
        try:
            verdict = self.detect(path)
            outcome[str(path)] = verdict
            logger.info(f"Analyzed {path.name}: {verdict.track} "
                        f"(confidence: {verdict.confidence:.2f})")
        except Exception as e:
            logger.error(f"Error analyzing {path}: {e}")
            # Replaces any entry stored above before the failure occurred.
            outcome[str(path)] = ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.3,
                reason=f"Error during analysis: {str(e)}",
                document_type=DocumentType.UNKNOWN
            )

    return outcome
|
||||
|
||||
def get_statistics(self, recommendations: Dict[str, "ProcessingTrackRecommendation"]) -> Dict:
    """
    Calculate statistics from batch analysis results.

    Args:
        recommendations: Dictionary of file recommendations

    Returns:
        ``{"total": 0}`` for empty input. Otherwise a dict with:
        - ``total``: number of recommendations
        - ``by_track``: counts for the "ocr" and "direct" tracks
        - ``by_document_type``: count per document type value, keyed in
          order of first appearance
        - ``confidence``: mean, median, min and max confidence
    """
    if not recommendations:
        return {"total": 0}

    confidences = []
    track_counts = {"ocr": 0, "direct": 0}
    type_counts = {}

    # Single pass over the results. Counting into a dict keeps the key
    # order stable (first appearance) instead of depending on set order.
    for rec in recommendations.values():
        confidences.append(rec.confidence)
        if rec.track in track_counts:
            track_counts[rec.track] += 1
        doc_type = rec.document_type.value
        type_counts[doc_type] = type_counts.get(doc_type, 0) + 1

    return {
        "total": len(recommendations),
        "by_track": track_counts,
        "by_document_type": type_counts,
        "confidence": {
            "mean": statistics.mean(confidences),
            "median": statistics.median(confidences),
            "min": min(confidences),
            "max": max(confidences)
        }
    }
|
||||
Reference in New Issue
Block a user