# Design: PDF Preprocessing Pipeline

## Architecture Overview

```
┌─────────────────────────────────────────────────────────────────────────┐
│                   DIRECT Track PDF Processing Pipeline                  │
├─────────────────────────────────────────────────────────────────────────┤
│                                                                         │
│                                Input PDF                                │
│                                    │                                    │
│                                    ▼                                    │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ Step 0: GS Distillation (Exception Handler)                        │ │
│ │ ─────────────────────────────────────────────────────────────────── │ │
│ │ Trigger: (cid:xxxx) garble detected OR mupdf structural errors      │ │
│ │ Action: gs -sDEVICE=pdfwrite -dDetectDuplicateImages=true           │ │
│ │ Status: DISABLED by default, auto-triggered on errors               │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│                                    │                                    │
│                                    ▼                                    │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ Step 1: Object-level Cleaning (P0 - Core)                           │ │
│ │ ─────────────────────────────────────────────────────────────────── │ │
│ │ 1.1 clean_contents(sanitize=True) - Fix malformed content stream    │ │
│ │ 1.2 Remove hidden OCG layers                                         │ │
│ │ 1.3 White-out detection & removal (IoU >= 80%)                       │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│                                    │                                    │
│                                    ▼                                    │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ Step 2: Layout Analysis (P1 - Rule-based)                            │ │
│ │ ─────────────────────────────────────────────────────────────────── │ │
│ │ 2.1 get_text("blocks", sort=True) - Column-aware sorting            │ │
│ │ 2.2 Classify elements (title/body/header/footer/page_number)        │ │
│ │ 2.3 Filter unwanted elements (page numbers, decorations)            │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│                                    │                                    │
│                                    ▼                                    │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ Step 3: Text Extraction (Enhanced)                                   │ │
│ │ ─────────────────────────────────────────────────────────────────── │ │
│ │ 3.1 Extract text with bbox coordinates preserved                     │ │
│ │ 3.2 Garble rate detection (cid:xxxx count / total chars)             │ │
│ │ 3.3 Auto-fallback: garble_rate > 10% → trigger Paddle OCR            │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│                                    │                                    │
│                                    ▼                                    │
│                UnifiedDocument (with bbox for debugging)                │
│                                                                         │
└─────────────────────────────────────────────────────────────────────────┘
```

---

## Step 0: GS Distillation (Exception Handler)

### Purpose
Repair structurally damaged PDFs that PyMuPDF cannot parse correctly.

### Trigger Conditions
```python
def should_trigger_gs_repair(page_text: str, mupdf_warnings: List[str]) -> bool:
    # Condition 1: High garble rate (cid:xxxx patterns)
    cid_pattern = r'\(cid:\d+\)'
    cid_count = len(re.findall(cid_pattern, page_text))
    total_chars = len(page_text)
    garble_rate = cid_count / max(total_chars, 1)

    if garble_rate > 0.1:  # >10% garbled
        return True

    # Condition 2: Severe structural errors
    severe_errors = ['error', 'invalid', 'corrupt', 'damaged']
    for warning in mupdf_warnings:
        if any(err in warning.lower() for err in severe_errors):
            return True

    return False
```

### GS Command
```bash
gs -dNOPAUSE -dBATCH -dSAFER \
   -sDEVICE=pdfwrite \
   -dPDFSETTINGS=/prepress \
   -dDetectDuplicateImages=true \
   -sOutputFile=repaired.pdf \
   input.pdf
```

### Implementation Notes
- **Default**: DISABLED
- **Execution**: Only when triggered by error detection
- **Fallback**: If GS also fails, route to Paddle OCR track
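
The repair step is only invoked when the trigger above fires. A minimal invocation sketch follows; the helper name `run_gs_repair` and the temp-file handling are illustrative (not part of the existing codebase) and assume a `gs` binary on PATH:

```python
import subprocess
import tempfile
from pathlib import Path


def run_gs_repair(input_pdf: Path) -> Path:
    """Rewrite a damaged PDF through Ghostscript's pdfwrite device (sketch only)."""
    output_pdf = Path(tempfile.mkdtemp()) / "repaired.pdf"
    cmd = [
        "gs", "-dNOPAUSE", "-dBATCH", "-dSAFER",
        "-sDEVICE=pdfwrite",
        "-dPDFSETTINGS=/prepress",
        "-dDetectDuplicateImages=true",
        f"-sOutputFile={output_pdf}",
        str(input_pdf),
    ]
    # Ghostscript exits non-zero on hard failures; let the exception propagate
    # so the caller can route the document to the Paddle OCR track instead.
    subprocess.run(cmd, check=True, capture_output=True, timeout=300)
    return output_pdf
```

If the subprocess raises, the caller catches the error and falls back to the OCR track, per the fallback note above.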

---

## Step 1: Object-level Cleaning (P0)

### 1.1 Content Stream Sanitization
```python
def sanitize_page(page: fitz.Page) -> None:
    """Fix malformed PDF content stream."""
    page.clean_contents(sanitize=True)
```

### 1.2 Hidden Layer (OCG) Removal
```python
def remove_hidden_layers(doc: fitz.Document) -> List[str]:
    """Remove content from hidden Optional Content Groups."""
    removed_layers = []

    ocgs = doc.get_ocgs()  # Get all OCG definitions
    for ocg_xref, ocg_info in ocgs.items():
        # Check if layer is hidden by default
        if ocg_info.get('on') is False:
            removed_layers.append(ocg_info.get('name', f'OCG_{ocg_xref}'))
            # Mark for removal during extraction

    return removed_layers
```
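
The helper above only identifies hidden layers; it does not yet remove their content. One possible follow-up, sketched below, pins those groups to OFF in the default optional-content configuration via `Document.set_layer()` before extraction. Whether `get_text()` then omits that content depends on the PyMuPDF/MuPDF version, so this behaviour should be verified on a known sample rather than assumed:

```python
import fitz  # PyMuPDF
from typing import List


def disable_hidden_layers(doc: fitz.Document) -> List[int]:
    """Pin default-hidden OCGs to OFF before extraction (sketch only)."""
    hidden_xrefs = [
        xref for xref, info in doc.get_ocgs().items()
        if info.get('on') is False
    ]
    if hidden_xrefs:
        # -1 addresses the default optional-content configuration.
        doc.set_layer(-1, off=hidden_xrefs)
    return hidden_xrefs
```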

### 1.3 White-out Detection (Core Algorithm)
```python
def detect_whiteout_covered_text(page: fitz.Page, iou_threshold: float = 0.8) -> List[dict]:
    """
    Detect text covered by white rectangles ("white-out" / "correction tape" effect).

    Returns list of text words that should be excluded from extraction.
    """
    covered_words = []

    # Get all white-filled rectangles
    drawings = page.get_drawings()
    white_rects = []
    for d in drawings:
        # Check for white fill (RGB all 1.0)
        fill_color = d.get('fill')
        if fill_color and fill_color == (1, 1, 1):
            rect = d.get('rect')
            if rect:
                white_rects.append(fitz.Rect(rect))

    if not white_rects:
        return covered_words

    # Get all text words with bounding boxes
    words = page.get_text("words")  # Returns list of (x0, y0, x1, y1, word, block_no, line_no, word_no)

    for word_info in words:
        word_rect = fitz.Rect(word_info[:4])
        word_text = word_info[4]

        for white_rect in white_rects:
            # Coverage ratio: intersected area over the word's own area (not a symmetric IoU)
            intersection = word_rect & white_rect  # Intersection
            if intersection.is_empty:
                continue

            intersection_area = intersection.width * intersection.height
            word_area = word_rect.width * word_rect.height

            if word_area > 0:
                coverage_ratio = intersection_area / word_area
                if coverage_ratio >= iou_threshold:
                    covered_words.append({
                        'text': word_text,
                        'bbox': tuple(word_rect),
                        'coverage': coverage_ratio
                    })
                    break  # Word is covered, no need to check other rects

    return covered_words
```
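
A quick way to sanity-check the detector is to build a synthetic page with PyMuPDF itself: insert a word, draw a white rectangle over it, and confirm it is reported. A minimal sketch (coordinates and strings are arbitrary):

```python
import fitz  # PyMuPDF

doc = fitz.open()          # new, empty PDF
page = doc.new_page()      # default page size

# One visible word and one word that we then cover with a white rectangle.
page.insert_text((72, 100), "visible")
page.insert_text((72, 140), "hidden")
page.draw_rect(fitz.Rect(70, 125, 140, 145), color=None, fill=(1, 1, 1))

covered = detect_whiteout_covered_text(page, iou_threshold=0.8)
print([w['text'] for w in covered])  # expected to contain "hidden"
```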

---

## Step 2: Layout Analysis (P1)

### 2.1 Column-aware Text Extraction
```python
def extract_with_reading_order(page: fitz.Page) -> List[dict]:
    """
    Extract text blocks with correct reading order.
    PyMuPDF's sort=True handles two-column layouts automatically.
    """
    # CRITICAL: sort=True enables column-aware sorting
    blocks = page.get_text("dict", sort=True)['blocks']
    return blocks
```

### 2.2 Element Classification
```python
def classify_element(block: dict, page_rect: fitz.Rect) -> str:
    """
    Classify text block by position and font size.

    Returns: 'title', 'body', 'header', 'footer', 'page_number',
    or 'image' for blocks without text lines.
    """
    if 'lines' not in block:
        return 'image'

    bbox = fitz.Rect(block['bbox'])
    page_height = page_rect.height
    page_width = page_rect.width

    # Relative position (0.0 = top, 1.0 = bottom)
    y_rel = bbox.y0 / page_height

    # Get average font size
    font_sizes = []
    for line in block.get('lines', []):
        for span in line.get('spans', []):
            font_sizes.append(span.get('size', 12))
    avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 12

    # Get text content for pattern matching
    text = ''.join(
        span.get('text', '')
        for line in block.get('lines', [])
        for span in line.get('spans', [])
    ).strip()

    # Classification rules

    # Header: top 5% of page
    if y_rel < 0.05:
        return 'header'

    # Footer: bottom 5% of page
    if y_rel > 0.95:
        return 'footer'

    # Page number: bottom 10% + numeric pattern
    if y_rel > 0.90 and _is_page_number(text):
        return 'page_number'

    # Title: large font (>14pt) or centered
    if avg_font_size > 14:
        return 'title'

    # Check if centered (for subtitles)
    x_center = (bbox.x0 + bbox.x1) / 2
    page_center = page_width / 2
    if abs(x_center - page_center) < page_width * 0.1 and len(text) < 100:
        if avg_font_size > 12:
            return 'title'

    return 'body'


def _is_page_number(text: str) -> bool:
    """Check if text is likely a page number."""
    text = text.strip()

    # Pure number
    if text.isdigit():
        return True

    # Common patterns: "Page 1", "- 1 -", "1/10"
    patterns = [
        r'^page\s*\d+$',
        r'^-?\s*\d+\s*-?$',
        r'^\d+\s*/\s*\d+$',
        r'^第\s*\d+\s*頁$',
        r'^第\s*\d+\s*页$',
    ]

    for pattern in patterns:
        if re.match(pattern, text, re.IGNORECASE):
            return True

    return False
```

### 2.3 Element Filtering
```python
def filter_elements(blocks: List[dict], page_rect: fitz.Rect) -> List[dict]:
    """Filter out unwanted elements (page numbers, headers, footers)."""
    filtered = []

    for block in blocks:
        element_type = classify_element(block, page_rect)

        # Skip page numbers and optionally headers/footers
        if element_type == 'page_number':
            continue

        # Keep with classification metadata
        block['_element_type'] = element_type
        filtered.append(block)

    return filtered
```

---

## Step 3: Text Extraction (Enhanced)

### 3.1 Garble Detection
```python
def calculate_garble_rate(text: str) -> float:
    """
    Calculate the rate of garbled characters: (cid:xxxx) patterns,
    U+FFFD replacement characters, and Private Use Area characters.

    Returns: float between 0.0 and 1.0
    """
    if not text:
        return 0.0

    # Count (cid:xxxx) patterns
    cid_pattern = r'\(cid:\d+\)'
    cid_matches = re.findall(cid_pattern, text)
    cid_char_count = sum(len(m) for m in cid_matches)

    # Count other garble indicators
    # - Replacement character U+FFFD
    # - Private Use Area characters
    replacement_count = text.count('\ufffd')
    pua_count = sum(1 for c in text if 0xE000 <= ord(c) <= 0xF8FF)

    total_garble = cid_char_count + replacement_count + pua_count
    total_chars = len(text)

    return total_garble / total_chars if total_chars > 0 else 0.0
```

### 3.2 Auto-fallback to OCR
```python
def should_fallback_to_ocr(page_text: str, garble_threshold: float = 0.1) -> bool:
    """
    Determine if page should be processed with OCR instead of direct extraction.

    Args:
        page_text: Extracted text from page
        garble_threshold: Maximum acceptable garble rate (default 10%)

    Returns:
        True if OCR fallback is recommended
    """
    garble_rate = calculate_garble_rate(page_text)

    if garble_rate > garble_threshold:
        logger.warning(
            f"High garble rate detected: {garble_rate:.1%}. "
            f"Recommending OCR fallback."
        )
        return True

    return False
```

---

## Integration Point

### Modified DirectExtractionEngine._extract_page()

```python
def _extract_page(self, page: fitz.Page, page_num: int, ...) -> Page:
    """Extract content from a single page with preprocessing pipeline."""
    page_metadata: dict = {}  # page-level metadata (full initialization elided here)

    # === Step 1: Object-level Cleaning ===

    # 1.1 Sanitize content stream
    page.clean_contents(sanitize=True)

    # 1.2 Detect white-out covered text
    covered_words = detect_whiteout_covered_text(page, iou_threshold=0.8)
    covered_bboxes = [fitz.Rect(w['bbox']) for w in covered_words]

    # === Step 2: Layout Analysis ===

    # 2.1 Extract with column-aware sorting
    blocks = page.get_text("dict", sort=True)['blocks']

    # 2.2 & 2.3 Classify and filter
    filtered_blocks = filter_elements(blocks, page.rect)

    # === Step 3: Text Extraction ===

    elements = []
    full_text = ""

    for block in filtered_blocks:
        # Skip if block overlaps with covered areas
        block_rect = fitz.Rect(block['bbox'])
        if any(block_rect.intersects(cr) for cr in covered_bboxes):
            continue

        # Extract text with bbox preserved
        element = self._block_to_element(block, page_num)
        if element:
            elements.append(element)
            full_text += element.get_text() + " "

    # 3.2 Check garble rate
    if should_fallback_to_ocr(full_text):
        # Mark page for OCR processing
        page_metadata['needs_ocr'] = True

    return Page(
        page_number=page_num,
        elements=elements,
        metadata=page_metadata
    )
```

---

## Configuration

```python
@dataclass
class PreprocessingConfig:
    """Configuration for PDF preprocessing pipeline."""

    # Step 0: GS Distillation
    gs_enabled: bool = False  # Disabled by default
    gs_garble_threshold: float = 0.1  # Trigger on >10% garble
    gs_detect_duplicate_images: bool = True

    # Step 1: Object Cleaning
    sanitize_content: bool = True
    remove_hidden_layers: bool = True
    whiteout_detection: bool = True
    whiteout_iou_threshold: float = 0.8

    # Step 2: Layout Analysis
    column_aware_sort: bool = True  # Use sort=True
    filter_page_numbers: bool = True
    filter_headers: bool = False  # Keep headers by default
    filter_footers: bool = False  # Keep footers by default

    # Step 3: Text Extraction
    preserve_bbox: bool = True  # For debugging
    garble_detection: bool = True
    ocr_fallback_threshold: float = 0.1  # Fallback on >10% garble
```
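
For illustration, overriding a few defaults might look like the following; `PdfPreprocessor` is a hypothetical consumer of the config, not a class defined in this design:

```python
# Enable the GS repair path and relax the white-out threshold for a noisy corpus.
config = PreprocessingConfig(
    gs_enabled=True,
    whiteout_iou_threshold=0.7,
    filter_headers=True,
)

# Hypothetical wiring: the preprocessing entry point would receive this config.
# preprocessor = PdfPreprocessor(config)
# document = preprocessor.process("input.pdf")
```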

---

## Testing Strategy

1. **Unit Tests**
   - White-out detection with synthetic PDFs
   - Garble rate calculation (see the sketch after this list)
   - Element classification accuracy

2. **Integration Tests**
   - Two-column document reading order
   - Hidden layer removal
   - GS fallback trigger conditions

3. **Regression Tests**
   - Existing task outputs should not change for clean PDFs
   - Performance benchmarks (should add <100ms per page)
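
As a concrete starting point for the garble-rate unit tests, a minimal pytest-style sketch; the `preprocessing` module path is a placeholder for wherever `calculate_garble_rate` ends up:

```python
from preprocessing import calculate_garble_rate  # placeholder import path


def test_clean_text_has_zero_garble():
    assert calculate_garble_rate("A perfectly normal sentence.") == 0.0


def test_cid_patterns_exceed_fallback_threshold():
    # Ten (cid:NNNN) tokens dominate a short string, so the rate clears 10%.
    text = "(cid:1234) " * 10 + "short tail"
    assert calculate_garble_rate(text) > 0.1


def test_empty_text_is_not_garbled():
    assert calculate_garble_rate("") == 0.0
```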