feat: implement core dual-track processing infrastructure

Added foundation for dual-track document processing:

1. UnifiedDocument Model (backend/app/models/unified_document.py)
   - Common output format for both OCR and direct extraction
   - Comprehensive element types (23+ types from PP-StructureV3)
   - BoundingBox, StyleInfo, TableData structures
   - Backward compatibility with legacy format

2. DocumentTypeDetector Service (backend/app/services/document_type_detector.py)
   - Intelligent document type detection using python-magic
   - PDF editability analysis using PyMuPDF
   - Processing track recommendation with confidence scores
   - Support for PDF, images, Office docs, and text files

3. DirectExtractionEngine Service (backend/app/services/direct_extraction_engine.py)
   - Fast extraction from editable PDFs using PyMuPDF
   - Preserves fonts, colors, and exact positioning
   - Native and positional table detection
   - Image extraction with coordinates
   - Hyperlink and metadata extraction

4. Dependencies
   - Added PyMuPDF>=1.23.0 for PDF extraction
   - Added pdfplumber>=0.10.0 as fallback
   - Added python-magic-bin>=0.4.14 for file detection

Next: Integrate with OCR service for complete dual-track processing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-18 20:17:50 +08:00
parent cd3cbea49d
commit 2d50c128f7
4 changed files with 1729 additions and 0 deletions

View File

@@ -0,0 +1,633 @@
"""
Direct Extraction Engine using PyMuPDF
Handles direct text and structure extraction from editable PDFs without OCR.
This provides much faster processing and perfect accuracy for documents with
extractable text.
"""
import os
import logging
import fitz # PyMuPDF
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Union
from datetime import datetime
import re
from ..models.unified_document import (
UnifiedDocument, DocumentElement, Page, DocumentMetadata,
BoundingBox, StyleInfo, TableData, TableCell, Dimensions,
ElementType, ProcessingTrack
)
logger = logging.getLogger(__name__)
class DirectExtractionEngine:
    """
    PyMuPDF-backed engine that reads editable PDFs directly, without OCR.

    What the engine offers:
    - Fast text extraction with exact positioning
    - Font and style information preservation
    - Table structure detection
    - Image extraction with coordinates
    - Hyperlink and annotation extraction
    """

    def __init__(self,
                 enable_table_detection: bool = True,
                 enable_image_extraction: bool = True,
                 min_table_rows: int = 2,
                 min_table_cols: int = 2):
        """
        Store the extraction settings on the instance.

        Args:
            enable_table_detection: Whether to detect and extract tables
            enable_image_extraction: Whether to extract images
            min_table_rows: Minimum rows for table detection
            min_table_cols: Minimum columns for table detection
        """
        # Feature toggles consulted while a page is being extracted.
        self.enable_table_detection, self.enable_image_extraction = (
            enable_table_detection,
            enable_image_extraction,
        )
        # Size thresholds used by the table detection helpers.
        self.min_table_rows, self.min_table_cols = min_table_rows, min_table_cols
def extract(self,
            file_path: Path,
            output_dir: Optional[Path] = None) -> UnifiedDocument:
    """
    Extract content from PDF file to UnifiedDocument format.

    Any exception raised while opening or reading the document is caught
    and reported through ``processing_errors`` on the returned document,
    together with whatever pages had been extracted up to that point.

    Args:
        file_path: Path to PDF file
        output_dir: Optional directory to save extracted images

    Returns:
        UnifiedDocument with extracted content (possibly partial, with
        ``processing_errors`` populated, when extraction failed)
    """
    start_time = datetime.now()
    document_id = str(uuid.uuid4())

    # Initialised up front so the error path can tell what was produced
    # without probing locals().
    doc = None
    metadata = None
    pages = []

    try:
        doc = fitz.open(str(file_path))

        # Extract document metadata
        metadata = self._extract_metadata(file_path, doc, start_time)

        # Extract pages
        total_pages = len(doc)
        for page_index in range(total_pages):
            logger.info(f"Extracting page {page_index + 1}/{total_pages}")
            pages.append(
                self._extract_page(
                    doc[page_index],
                    page_index + 1,
                    document_id,
                    output_dir
                )
            )

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()
        metadata.processing_time = processing_time
        logger.info(f"Direct extraction completed in {processing_time:.2f}s")

        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages
        )
    except Exception as e:
        logger.error(f"Error during direct extraction: {e}")

        # Return partial result with error information
        processing_time = (datetime.now() - start_time).total_seconds()
        if metadata is None:
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type="pdf",
                file_size=file_path.stat().st_size if file_path.exists() else 0,
                created_at=datetime.now(),
                processing_track=ProcessingTrack.DIRECT,
                processing_time=processing_time
            )
        else:
            # Metadata was built before the failure; record the elapsed
            # time instead of leaving the 0.0 placeholder in place.
            metadata.processing_time = processing_time

        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages,
            processing_errors=[{
                "error": str(e),
                "type": type(e).__name__
            }]
        )
    finally:
        # Release the document handle on both the success and error paths.
        if doc is not None:
            doc.close()
def _extract_metadata(self,
                      file_path: Path,
                      doc: fitz.Document,
                      start_time: datetime) -> DocumentMetadata:
    """
    Build the DocumentMetadata record for an opened PDF.

    Args:
        file_path: Path of the PDF on disk (used for name and size)
        doc: Opened PyMuPDF document whose ``metadata`` mapping is read
        start_time: Timestamp stored as ``created_at``

    Returns:
        DocumentMetadata with ``processing_time`` set to 0.0; the caller
        is expected to fill in the real value afterwards.
    """
    # Fall back to an empty mapping so a document without metadata does
    # not raise on the .get() calls below.
    pdf_metadata = doc.metadata or {}

    # Keywords are split on commas; surrounding whitespace and empty
    # entries are dropped so "a, b," yields ["a", "b"].
    keywords = None
    raw_keywords = pdf_metadata.get("keywords")
    if raw_keywords:
        keywords = [kw.strip() for kw in raw_keywords.split(",") if kw.strip()] or None

    return DocumentMetadata(
        filename=file_path.name,
        file_type="pdf",
        file_size=file_path.stat().st_size,
        created_at=start_time,
        processing_track=ProcessingTrack.DIRECT,
        processing_time=0.0,  # Will be updated later
        title=pdf_metadata.get("title"),
        author=pdf_metadata.get("author"),
        subject=pdf_metadata.get("subject"),
        keywords=keywords,
        producer=pdf_metadata.get("producer"),
        creator=pdf_metadata.get("creator"),
        creation_date=self._parse_pdf_date(pdf_metadata.get("creationDate")),
        modification_date=self._parse_pdf_date(pdf_metadata.get("modDate"))
    )
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
"""Parse PDF date string to datetime"""
if not date_str:
return None
try:
# PDF date format: D:YYYYMMDDHHmmSSOHH'mm
# Example: D:20240101120000+09'00
if date_str.startswith("D:"):
date_str = date_str[2:]
# Extract just the date/time part (first 14 characters)
if len(date_str) >= 14:
date_part = date_str[:14]
return datetime.strptime(date_part, "%Y%m%d%H%M%S")
except:
pass
return None
def _extract_page(self,
                  page: fitz.Page,
                  page_num: int,
                  document_id: str,
                  output_dir: Optional[Path]) -> Page:
    """
    Turn one PDF page into a Page holding its extracted elements.

    Elements are gathered in this order: text blocks, tables (when
    enabled), images (when enabled), then hyperlinks that carry a URI.
    Vector drawings are only counted and reported in the page metadata.

    Args:
        page: PyMuPDF page to read
        page_num: Page number stored on the result and used in element ids
        document_id: Document identifier forwarded to image extraction
        output_dir: Optional directory forwarded to image extraction

    Returns:
        Page with elements, dimensions and drawing/link counts
    """
    collected = []
    next_id = 0  # running counter embedded in element ids

    # Page size in PDF points
    page_rect = page.rect
    dimensions = Dimensions(
        width=page_rect.width,
        height=page_rect.height,
        dpi=72  # PDF standard DPI
    )

    # Text blocks (block type 0) with their formatting
    for blk in page.get_text("dict").get("blocks", []):
        if blk.get("type") != 0:
            continue
        text_element = self._process_text_block(blk, page_num, next_id)
        if text_element:
            collected.append(text_element)
            next_id += 1

    # Tables
    if self.enable_table_detection:
        try:
            # Native table finder (PyMuPDF 1.23.0+)
            for native_table in page.find_tables():
                table_element = self._process_native_table(
                    native_table, page_num, next_id
                )
                if table_element:
                    collected.append(table_element)
                    next_id += 1
        except AttributeError:
            # No native finder on this page object: use the positional heuristic
            logger.debug("Native table detection not available, using positional detection")
            positional = self._detect_tables_by_position(page, page_num, next_id)
            collected.extend(positional)
            next_id += len(positional)

    # Images
    if self.enable_image_extraction:
        extracted_images = self._extract_images(
            page, page_num, document_id, next_id, output_dir
        )
        collected.extend(extracted_images)
        next_id += len(extracted_images)

    # Hyperlinks: only links with a URI and a source rectangle become elements
    links = page.get_links()
    for link in links:
        uri = link.get("uri")
        if not uri:
            continue
        source_rect = link.get("from")
        if not source_rect:
            continue
        link_kind = "external" if uri.startswith("http") else "internal"
        collected.append(DocumentElement(
            element_id=f"link_{page_num}_{next_id}",
            type=ElementType.REFERENCE,
            content={"uri": uri, "type": "hyperlink"},
            bbox=BoundingBox(
                x0=source_rect.x0,
                y0=source_rect.y0,
                x1=source_rect.x1,
                y1=source_rect.y1
            ),
            metadata={"link_type": link_kind}
        ))
        next_id += 1

    # Vector graphics are summarised in metadata only
    drawings = page.get_drawings()
    if drawings:
        logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")

    page_metadata = {
        "has_drawings": len(drawings) > 0,
        "drawing_count": len(drawings),
        "link_count": len(links)
    }
    return Page(
        page_number=page_num,
        elements=collected,
        dimensions=dimensions,
        metadata=page_metadata
    )
def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
    """
    Process a text block into a DocumentElement.

    Span texts are concatenated within each line, and the lines of the
    block are joined with a newline so that words at a line break are
    not fused together.

    Args:
        block: Text block dictionary with "bbox" and "lines" -> "spans"
        page_num: Page number used in the element id
        counter: Running element counter used in the element id

    Returns:
        DocumentElement for the block, or None when it contains no text
    """
    # Calculate block bounding box
    bbox_data = block.get("bbox", [0, 0, 0, 0])
    bbox = BoundingBox(
        x0=bbox_data[0],
        y0=bbox_data[1],
        x1=bbox_data[2],
        y1=bbox_data[3]
    )

    # Extract text content line by line, collecting one style per span
    line_texts = []
    styles = []
    for line in block.get("lines", []):
        span_texts = []
        for span in line.get("spans", []):
            text = span.get("text", "")
            if not text:
                continue
            span_texts.append(text)

            # Extract style information; bit 4 of the span flags is
            # treated as bold and bit 1 as italic.
            flags = span.get("flags", 0)
            styles.append(StyleInfo(
                font_name=span.get("font"),
                font_size=span.get("size"),
                font_weight="bold" if flags & 2**4 else "normal",
                font_style="italic" if flags & 2**1 else "normal",
                text_color=span.get("color")
            ))
        if span_texts:
            line_texts.append("".join(span_texts))

    if not line_texts:
        return None

    full_text = "\n".join(line_texts)

    # Determine element type based on content and style
    element_type = self._infer_element_type(full_text, styles)

    # The style of the first span represents the whole block
    # (could be improved with style merging).
    block_style = styles[0] if styles else None

    return DocumentElement(
        element_id=f"text_{page_num}_{counter}",
        type=element_type,
        content=full_text,
        bbox=bbox,
        style=block_style,
        confidence=1.0  # Direct extraction has perfect confidence
    )
def _infer_element_type(self, text: str, styles: List[StyleInfo]) -> ElementType:
    """
    Infer element type based on text content and styling.

    Checks run in this order and the first match wins:
    1. Short text (< 100 chars) with a large average font size:
       TITLE above 16, HEADER above 14.
    2. List item: a single digit or bullet character followed by whitespace.
    3. Page number: "page N", a bare number, or "- N -".
    4. Footnote: a "[N]" or "N)" marker at the start.
    5. Otherwise PARAGRAPH for text longer than 150 chars, else TEXT.

    Args:
        text: Text content of the element
        styles: Styles of the spans making up the text (may be empty)

    Returns:
        The inferred ElementType
    """
    text_lower = text.lower().strip()

    # Check for common patterns
    if len(text_lower) < 100 and styles:
        # Short text with large font might be title/header;
        # spans without a size count as 12.
        avg_size = sum(s.font_size or 12 for s in styles) / len(styles)
        if avg_size > 16:
            return ElementType.TITLE
        elif avg_size > 14:
            return ElementType.HEADER

    # Check for list patterns
    if re.match(r'^[\d•·▪▫◦‣]\s', text_lower):
        return ElementType.LIST_ITEM

    # Check for page numbers
    if re.match(r'^page\s+\d+|^\d+\s*$|^-\s*\d+\s*-$', text_lower):
        return ElementType.PAGE_NUMBER

    # Check for footnote patterns. The marker must be a bracketed number
    # such as "[12]" or a number followed by ")"; a character class here
    # would match any text that merely begins with a digit.
    if re.match(r'^\[\d+\]|^\d+\)', text_lower):
        return ElementType.FOOTNOTE

    # Default to paragraph for longer text, text for shorter
    return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
    """
    Process a natively detected table into a TABLE element.

    Tables with fewer rows than ``min_table_rows`` or fewer columns than
    ``min_table_cols`` are rejected. Empty cells are not emitted as
    TableCell entries. The first row is assumed to be the header row.

    Args:
        table: Table object exposing ``extract()`` and ``bbox``
        page_num: Page number used in the element id
        counter: Running element counter used in the element id

    Returns:
        DocumentElement for the table, or None when the table is too
        small or processing fails (the error is logged)
    """
    try:
        # Extract table data as a list of rows
        data = table.extract()
        if not data or len(data) < self.min_table_rows:
            return None

        # Apply the column threshold as well as the row threshold
        col_count = max(len(row) for row in data)
        if col_count < self.min_table_cols:
            return None

        # Get table bounding box
        bbox_data = table.bbox
        bbox = BoundingBox(
            x0=bbox_data[0],
            y0=bbox_data[1],
            x1=bbox_data[2],
            y1=bbox_data[3]
        )

        # Create table cells (empty / None cells are skipped)
        cells = [
            TableCell(row=row_idx, col=col_idx, content=str(cell_text))
            for row_idx, row in enumerate(data)
            for col_idx, cell_text in enumerate(row)
            if cell_text
        ]

        # Assume first row is header; missing header cells become ""
        # so the header list holds only strings.
        headers = ["" if cell is None else str(cell) for cell in data[0]]

        # Create table data
        table_data = TableData(
            rows=len(data),
            cols=col_count,
            cells=cells,
            headers=headers
        )

        return DocumentElement(
            element_id=f"table_{page_num}_{counter}",
            type=ElementType.TABLE,
            content=table_data,
            bbox=bbox,
            confidence=1.0
        )
    except Exception as e:
        logger.error(f"Error processing native table: {e}")
        return None
def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
    """
    Detect tables by analyzing text positioning.

    Words are bucketed into rows by rounding their top y-coordinate to
    the nearest 5 points. Consecutive rows that have at least
    ``min_table_cols`` words with regular horizontal spacing form a
    candidate table; candidates with at least ``min_table_rows`` rows
    become TABLE elements with confidence 0.8.

    Args:
        page: PyMuPDF page to analyze
        page_num: Page number used in element ids
        counter: First element counter value; table i gets counter + i

    Returns:
        List of detected table elements (empty when the page has no words)
    """
    tables = []

    # Get all words with positions
    # Each entry is (x0, y0, x1, y1, "word", block_no, line_no, word_no)
    page_words = page.get_text("words")
    if not page_words:
        return tables

    # Group words by approximate row (y-coordinate)
    rows = {}
    for word in page_words:
        y = round(word[1] / 5) * 5  # Round to nearest 5 points
        rows.setdefault(y, []).append({
            'x0': word[0],
            'y0': word[1],
            'x1': word[2],
            'y1': word[3],
            'text': word[4],
            'block': word[5] if len(word) > 5 else 0
        })

    # Find potential tables (consecutive rows with multiple columns),
    # walking the rows from top to bottom
    tables_found = []
    current_table_rows = []
    for y, words_in_row in sorted(rows.items(), key=lambda item: item[0]):
        words_in_row.sort(key=lambda w: w['x0'])

        is_table_row = (
            len(words_in_row) >= self.min_table_cols
            and self._has_regular_spacing([w['x0'] for w in words_in_row])
        )
        if is_table_row:
            current_table_rows.append((y, words_in_row))
            continue

        # A non-table row ends the current run; keep it if long enough
        if len(current_table_rows) >= self.min_table_rows:
            tables_found.append(current_table_rows)
        current_table_rows = []

    # Don't forget the last table
    if len(current_table_rows) >= self.min_table_rows:
        tables_found.append(current_table_rows)

    # Convert detected tables to DocumentElements
    for table_idx, table_rows in enumerate(tables_found):
        if not table_rows:
            continue

        # Calculate table bounding box
        all_words = [w for _, row_words in table_rows for w in row_words]
        bbox = BoundingBox(
            x0=min(w['x0'] for w in all_words),
            y0=min(w['y0'] for w in all_words),
            x1=max(w['x1'] for w in all_words),
            y1=max(w['y1'] for w in all_words)
        )

        # Group each row into columns exactly once; the result feeds both
        # the cell list and the column count.
        row_columns = [
            self._group_into_columns(row_words, table_rows)
            for _, row_words in table_rows
        ]

        # Create table cells (empty column slots are skipped)
        cells = [
            TableCell(row=row_idx, col=col_idx, content=col_text)
            for row_idx, columns in enumerate(row_columns)
            for col_idx, col_text in enumerate(columns)
            if col_text
        ]

        # Create table data
        table_data = TableData(
            rows=len(table_rows),
            cols=max(len(columns) for columns in row_columns),
            cells=cells
        )

        tables.append(DocumentElement(
            element_id=f"table_{page_num}_{counter + table_idx}",
            type=ElementType.TABLE,
            content=table_data,
            bbox=bbox,
            confidence=0.8,  # Lower confidence for positional detection
            metadata={"detection_method": "positional"}
        ))

    return tables
def _has_regular_spacing(self, x_positions: List[float], tolerance: float = 0.3) -> bool:
"""Check if x positions have somewhat regular spacing"""
if len(x_positions) < 3:
return False
spacings = [x_positions[i+1] - x_positions[i] for i in range(len(x_positions)-1)]
avg_spacing = sum(spacings) / len(spacings)
# Check if spacings are within tolerance of average
for spacing in spacings:
if abs(spacing - avg_spacing) > avg_spacing * tolerance:
return False
return True
def _group_into_columns(self, words: List[Dict], all_rows: List) -> List[str]:
"""Group words into columns based on x-position"""
if not words:
return []
# Find common column positions across all rows
all_x_positions = []
for _, row_words in all_rows:
all_x_positions.extend([w['x0'] for w in row_words])
# Cluster x-positions to find columns
column_positions = self._cluster_positions(all_x_positions)
# Assign words to columns
columns = [""] * len(column_positions)
for word in words:
# Find closest column
closest_col = 0
min_dist = float('inf')
for col_idx, col_x in enumerate(column_positions):
dist = abs(word['x0'] - col_x)
if dist < min_dist:
min_dist = dist
closest_col = col_idx
if columns[closest_col]:
columns[closest_col] += " " + word['text']
else:
columns[closest_col] = word['text']
return columns
def _cluster_positions(self, positions: List[float], threshold: float = 20) -> List[float]:
"""Cluster positions to find common columns"""
if not positions:
return []
sorted_pos = sorted(positions)
clusters = [[sorted_pos[0]]]
for pos in sorted_pos[1:]:
# Check if position belongs to current cluster
if pos - clusters[-1][-1] < threshold:
clusters[-1].append(pos)
else:
clusters.append([pos])
# Return average position of each cluster
return [sum(cluster) / len(cluster) for cluster in clusters]
def _extract_images(self,
                    page: fitz.Page,
                    page_num: int,
                    document_id: str,
                    counter: int,
                    output_dir: Optional[Path]) -> List[DocumentElement]:
    """
    Extract images from page.

    One IMAGE element is produced per image that has at least one
    placement rectangle on the page; the first rectangle is used as the
    bounding box. When ``output_dir`` is given, each image is also saved
    there as a PNG and the path is recorded under ``saved_path``.
    A failure on one image is logged and does not stop the others.

    Args:
        page: PyMuPDF page to read images from
        page_num: Page number used in element ids and file names
        document_id: Document identifier used in saved file names
        counter: First element counter value; image i gets counter + i
        output_dir: Optional directory to save extracted images

    Returns:
        List of image elements found on the page
    """
    elements = []
    image_list = page.get_images()

    for img_idx, img in enumerate(image_list):
        pix = None
        try:
            xref = img[0]

            # Get image position(s)
            img_rects = page.get_image_rects(xref)
            if not img_rects:
                continue

            rect = img_rects[0]  # Use first occurrence
            bbox = BoundingBox(
                x0=rect.x0,
                y0=rect.y0,
                x1=rect.x1,
                y1=rect.y1
            )

            # Extract image data
            pix = fitz.Pixmap(page.parent, xref)
            image_data = {
                "width": pix.width,
                "height": pix.height,
                "colorspace": pix.colorspace.name if pix.colorspace else "unknown",
                "xref": xref
            }

            # Save image if output directory provided
            if output_dir:
                output_dir.mkdir(parents=True, exist_ok=True)
                image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
                image_path = output_dir / image_filename

                # PNG output cannot hold CMYK data: pixmaps with four or
                # more colour components are converted to RGB first.
                png_pix = pix
                if pix.n - pix.alpha >= 4:
                    png_pix = fitz.Pixmap(fitz.csRGB, pix)
                png_pix.save(str(image_path))
                png_pix = None

                image_data["saved_path"] = str(image_path)
                logger.debug(f"Saved image to {image_path}")

            element = DocumentElement(
                element_id=f"image_{page_num}_{counter + img_idx}",
                type=ElementType.IMAGE,
                content=image_data,
                bbox=bbox,
                confidence=1.0,
                metadata={
                    "image_index": img_idx,
                    "xref": xref
                }
            )
            elements.append(element)
        except Exception as e:
            logger.error(f"Error extracting image {img_idx}: {e}")
        finally:
            pix = None  # Free memory, also when extraction failed

    return elements

View File

@@ -0,0 +1,397 @@
"""
Document Type Detector Service
Intelligently determines the optimal processing track for documents based on
file type, content analysis, and editability checks.
"""
import os
import logging
import magic
import fitz # PyMuPDF
from pathlib import Path
from typing import Dict, Optional, Tuple, List
from enum import Enum
import statistics
logger = logging.getLogger(__name__)
class DocumentType(str, Enum):
    """String-valued classification of the kinds of document the detector reports."""

    # PDF with extractable text
    PDF_EDITABLE = "pdf_editable"
    # PDF with images/scanned content
    PDF_SCANNED = "pdf_scanned"
    # PDF with both text and scanned pages
    PDF_MIXED = "pdf_mixed"
    # Image files (PNG, JPG, etc.)
    IMAGE = "image"
    # Word documents
    OFFICE_WORD = "office_word"
    # Excel spreadsheets
    OFFICE_EXCEL = "office_excel"
    # PowerPoint presentations
    OFFICE_POWERPOINT = "office_ppt"
    # Plain text files
    TEXT = "text"
    # Unknown format
    UNKNOWN = "unknown"
class ProcessingTrackRecommendation:
    """Outcome of track selection: the chosen track, its confidence and the reasoning."""

    def __init__(self,
                 track: str,
                 confidence: float,
                 reason: str,
                 document_type: DocumentType,
                 metadata: Optional[Dict] = None):
        # "ocr" or "direct"
        self.track = track
        # 0.0 to 1.0
        self.confidence = confidence
        self.reason = reason
        self.document_type = document_type
        # A missing (or empty) mapping is replaced by a fresh dict
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict:
        """Return the recommendation as a plain dict, with the document type as its string value."""
        return dict(
            recommended_track=self.track,
            confidence=self.confidence,
            reason=self.reason,
            document_type=self.document_type.value,
            metadata=self.metadata,
        )
class DocumentTypeDetector:
    """
    Detects document types and recommends a processing track for each file.

    For every document the service works out:
    1. The document type (PDF, image, Office, etc.)
    2. Whether the document contains extractable text
    3. The recommended processing track (OCR vs Direct)
    """

    # MIME types handled as images
    IMAGE_MIMES = {
        'image/png',
        'image/jpeg',
        'image/jpg',
        'image/gif',
        'image/bmp',
        'image/tiff',
        'image/webp',
    }

    # Office MIME type -> document type
    OFFICE_MIMES = {
        # Word
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': DocumentType.OFFICE_WORD,
        'application/msword': DocumentType.OFFICE_WORD,
        # Excel
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': DocumentType.OFFICE_EXCEL,
        'application/vnd.ms-excel': DocumentType.OFFICE_EXCEL,
        # PowerPoint
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': DocumentType.OFFICE_POWERPOINT,
        'application/vnd.ms-powerpoint': DocumentType.OFFICE_POWERPOINT,
    }

    def __init__(self,
                 min_text_length: int = 100,
                 sample_pages: int = 3,
                 text_coverage_threshold: float = 0.9):
        """
        Store the thresholds used during PDF analysis.

        Args:
            min_text_length: Minimum text length to consider a page as having extractable text
            sample_pages: Number of pages to sample for PDF analysis
            text_coverage_threshold: Percentage of pages with text to classify as editable
        """
        self.min_text_length, self.sample_pages = min_text_length, sample_pages
        self.text_coverage_threshold = text_coverage_threshold
def detect(self, file_path: Path) -> ProcessingTrackRecommendation:
    """
    Work out the document type of a file and the track it should take.

    The MIME type is sniffed from the file and dispatched to the
    matching analyzer. A missing file, an unrecognised MIME type or any
    error during detection produces an OCR recommendation with reduced
    confidence rather than an exception.

    Args:
        file_path: Path to the document file

    Returns:
        ProcessingTrackRecommendation with track selection and metadata
    """
    # Guard: nothing to inspect
    if not file_path.exists():
        logger.error(f"File not found: {file_path}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason="File not found, defaulting to OCR",
            document_type=DocumentType.UNKNOWN
        )

    try:
        # Sniff the MIME type from the file
        mime_type = magic.from_file(str(file_path), mime=True)
        logger.info(f"Detected MIME type: {mime_type} for {file_path.name}")

        # Dispatch on the detected type; each branch returns directly
        if mime_type == 'application/pdf':
            return self._analyze_pdf(file_path)
        if mime_type in self.IMAGE_MIMES:
            return self._analyze_image(file_path, mime_type)
        if mime_type in self.OFFICE_MIMES:
            return self._analyze_office(file_path, mime_type)
        if mime_type.startswith('text/'):
            return self._analyze_text(file_path, mime_type)

        # Nothing matched
        logger.warning(f"Unknown MIME type: {mime_type}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason=f"Unknown file type ({mime_type}), defaulting to OCR",
            document_type=DocumentType.UNKNOWN
        )
    except Exception as e:
        logger.error(f"Error detecting document type: {e}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.3,
            reason=f"Error during detection: {str(e)}",
            document_type=DocumentType.UNKNOWN
        )
def _analyze_pdf(self, file_path: Path) -> ProcessingTrackRecommendation:
    """
    Analyze PDF to determine if it's editable or scanned.

    The first ``sample_pages`` pages are inspected. A page counts as a
    text page when its stripped text has at least ``min_text_length``
    characters. The share of text pages decides the outcome:
    - at or above ``text_coverage_threshold``: editable, "direct" track
    - at or below 10%: scanned, "ocr" track
    - anything in between: mixed, "ocr" track
    Image counts and image area coverage are recorded per page in the
    metadata but do not influence the decision.

    Args:
        file_path: Path to PDF file

    Returns:
        Processing track recommendation; on any error an OCR
        recommendation with confidence 0.5 is returned instead.
    """
    doc = None
    try:
        doc = fitz.open(str(file_path))
        total_pages = len(doc)

        # Sample pages for analysis
        pages_to_check = min(self.sample_pages, total_pages)
        text_pages = []
        page_details = []

        for page_num in range(pages_to_check):
            page = doc[page_num]

            # Extract text
            text = page.get_text()
            text_length = len(text.strip())

            # Check for images
            images = page.get_images()
            image_count = len(images)

            # Calculate page area covered by images
            page_rect = page.rect
            page_area = page_rect.width * page_rect.height
            image_area = 0
            for img in images:
                try:
                    # Get image rectangles
                    xref = img[0]
                    for rect in page.get_image_rects(xref):
                        image_area += rect.width * rect.height
                except Exception as img_err:
                    # Coverage is informational only: skip the image,
                    # but leave a trace instead of failing silently.
                    logger.debug(f"Could not measure image on page {page_num + 1}: {img_err}")

            image_coverage = image_area / page_area if page_area > 0 else 0

            # Determine if page has meaningful text
            has_text = text_length >= self.min_text_length
            text_pages.append(has_text)

            page_details.append({
                "page": page_num + 1,
                "text_length": text_length,
                "has_text": has_text,
                "image_count": image_count,
                "image_coverage": image_coverage
            })

            logger.debug(f"Page {page_num + 1}: text_length={text_length}, "
                         f"images={image_count}, image_coverage={image_coverage:.2%}")

        # Calculate text coverage
        text_coverage = sum(text_pages) / len(text_pages) if text_pages else 0

        # Determine document type and track
        metadata = {
            "total_pages": total_pages,
            "sampled_pages": pages_to_check,
            "text_coverage": text_coverage,
            "page_details": page_details
        }

        if text_coverage >= self.text_coverage_threshold:
            # Mostly text-based PDF
            return ProcessingTrackRecommendation(
                track="direct",
                confidence=0.95,
                reason=f"PDF has extractable text on {text_coverage:.0%} of sampled pages",
                document_type=DocumentType.PDF_EDITABLE,
                metadata=metadata
            )
        elif text_coverage <= 0.1:
            # Mostly scanned/image PDF
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.95,
                reason=f"PDF appears to be scanned (only {text_coverage:.0%} pages have text)",
                document_type=DocumentType.PDF_SCANNED,
                metadata=metadata
            )
        else:
            # Mixed content
            # For mixed PDFs, we could implement page-level track selection in the future
            # For now, use OCR to ensure we don't miss scanned content
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.7,
                reason=f"PDF has mixed content ({text_coverage:.0%} text pages), using OCR for completeness",
                document_type=DocumentType.PDF_MIXED,
                metadata=metadata
            )
    except Exception as e:
        logger.error(f"Error analyzing PDF: {e}")
        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.5,
            reason=f"Error analyzing PDF: {str(e)}",
            document_type=DocumentType.PDF_SCANNED
        )
    finally:
        # Close the document on every path, including failures mid-analysis
        if doc is not None:
            doc.close()
def _analyze_image(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for an image file.

    Image files are always sent to the OCR track with full confidence;
    MIME type, file size and extension are attached as metadata.
    """
    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )
    return ProcessingTrackRecommendation(
        track="ocr",
        confidence=1.0,
        reason="Image files require OCR processing",
        document_type=DocumentType.IMAGE,
        metadata=details
    )
def _analyze_office(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for an Office document.

    Every Office document is currently routed to the OCR track. The
    concrete document type is looked up from the MIME type, falling back
    to UNKNOWN for unmapped types.
    Future enhancement: implement direct extraction for Office files.
    """
    office_kind = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)
    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )

    # TODO: In future, we could implement direct extraction for Office files
    # using python-docx, openpyxl, python-pptx
    return ProcessingTrackRecommendation(
        track="ocr",
        confidence=0.9,
        reason="Office documents currently processed via OCR (direct extraction planned)",
        document_type=office_kind,
        metadata=details
    )
def _analyze_text(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
    """
    Build the recommendation for a plain text file.

    Text files go to the direct track with full confidence; MIME type,
    file size and extension are attached as metadata.
    """
    details = dict(
        mime_type=mime_type,
        file_size=file_path.stat().st_size,
        file_extension=file_path.suffix,
    )
    return ProcessingTrackRecommendation(
        track="direct",
        confidence=1.0,
        reason="Plain text files can be directly processed",
        document_type=DocumentType.TEXT,
        metadata=details
    )
def analyze_batch(self, file_paths: List[Path]) -> Dict[str, ProcessingTrackRecommendation]:
    """
    Run detection over several files.

    A failure on one file is logged and recorded as a low-confidence OCR
    recommendation, so the remaining files are still analyzed.

    Args:
        file_paths: List of file paths to analyze

    Returns:
        Dictionary mapping file paths (as strings) to recommendations
    """
    outcome = {}
    for path in file_paths:
        key = str(path)
        try:
            rec = self.detect(path)
            outcome[key] = rec
            logger.info(f"Analyzed {path.name}: {rec.track} "
                        f"(confidence: {rec.confidence:.2f})")
        except Exception as e:
            logger.error(f"Error analyzing {path}: {e}")
            outcome[key] = ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.3,
                reason=f"Error during analysis: {str(e)}",
                document_type=DocumentType.UNKNOWN
            )
    return outcome
def get_statistics(self, recommendations: Dict[str, ProcessingTrackRecommendation]) -> Dict:
    """
    Summarise a batch of recommendations.

    Args:
        recommendations: Dictionary of file recommendations

    Returns:
        Statistics dictionary with the total count, counts per track
        ("ocr" / "direct"), counts per document type, and the mean,
        median, min and max confidence. An empty input yields
        ``{"total": 0}`` only.
    """
    if not recommendations:
        return {"total": 0}

    entries = list(recommendations.values())
    tracks = [rec.track for rec in entries]
    confidences = [rec.confidence for rec in entries]
    doc_types = [rec.document_type.value for rec in entries]

    return {
        "total": len(recommendations),
        "by_track": {
            "ocr": tracks.count("ocr"),
            "direct": tracks.count("direct")
        },
        # One entry per distinct document type value
        "by_document_type": {
            kind: doc_types.count(kind) for kind in set(doc_types)
        },
        "confidence": {
            "mean": statistics.mean(confidences),
            "median": statistics.median(confidences),
            "min": min(confidences),
            "max": max(confidences)
        }
    }