diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 6ec39f0..8512c32 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -40,7 +40,15 @@ class DirectExtractionEngine: enable_table_detection: bool = True, enable_image_extraction: bool = True, min_table_rows: int = 2, - min_table_cols: int = 2): + min_table_cols: int = 2, + # Preprocessing pipeline options + enable_content_sanitization: bool = True, + enable_hidden_layer_removal: bool = True, + enable_whiteout_detection: bool = True, + whiteout_iou_threshold: float = 0.8, + enable_page_number_filter: bool = True, + enable_garble_detection: bool = True, + garble_ocr_fallback_threshold: float = 0.1): """ Initialize the extraction engine. @@ -49,12 +57,30 @@ class DirectExtractionEngine: enable_image_extraction: Whether to extract images min_table_rows: Minimum rows for table detection min_table_cols: Minimum columns for table detection + + Preprocessing pipeline options: + enable_content_sanitization: Run clean_contents() to fix malformed PDF streams + enable_hidden_layer_removal: Remove content from hidden OCG layers + enable_whiteout_detection: Detect and filter text covered by white rectangles + whiteout_iou_threshold: IoU threshold for white-out detection (default 0.8) + enable_page_number_filter: Filter out detected page numbers + enable_garble_detection: Detect garbled text (cid:xxxx patterns) + garble_ocr_fallback_threshold: Garble rate threshold to recommend OCR fallback """ self.enable_table_detection = enable_table_detection self.enable_image_extraction = enable_image_extraction self.min_table_rows = min_table_rows self.min_table_cols = min_table_cols + # Preprocessing pipeline options + self.enable_content_sanitization = enable_content_sanitization + self.enable_hidden_layer_removal = enable_hidden_layer_removal + self.enable_whiteout_detection = enable_whiteout_detection + self.whiteout_iou_threshold = whiteout_iou_threshold + self.enable_page_number_filter = enable_page_number_filter + self.enable_garble_detection = enable_garble_detection + self.garble_ocr_fallback_threshold = garble_ocr_fallback_threshold + def extract(self, file_path: Path, output_dir: Optional[Path] = None) -> UnifiedDocument: @@ -186,10 +212,17 @@ class DirectExtractionEngine: page_num: int, document_id: str, output_dir: Optional[Path]) -> Page: - """Extract content from a single page""" + """Extract content from a single page with preprocessing pipeline.""" elements = [] element_counter = 0 + # ===================================================================== + # PREPROCESSING PIPELINE + # ===================================================================== + # Step 1: Run preprocessing (sanitization, white-out detection) + preprocess_result = self._preprocess_page(page, page_num) + covered_bboxes = preprocess_result.get('covered_word_bboxes', []) + # Get page-level metadata (for final Page metadata) drawings = page.get_drawings() links = page.get_links() @@ -227,7 +260,7 @@ class DirectExtractionEngine: element_counter += len(table_elements) # Extract text blocks with formatting (sort=True for reading order) - # Filter out lines that overlap with table regions + # Filter out lines that overlap with table regions OR covered by white-out text_dict = page.get_text("dict", sort=True) for block_idx, block in enumerate(text_dict.get("blocks", [])): if block.get("type") == 0: # Text block @@ -235,6 +268,11 @@ class DirectExtractionEngine: block, page_num, element_counter, table_bboxes ) if element: + # Step 1.3: Skip text covered by white-out rectangles + if covered_bboxes and element.bbox: + if self._is_text_in_covered_regions(element.bbox, covered_bboxes): + logger.debug(f"Skipping white-out covered text: {element.element_id}") + continue elements.append(element) element_counter += 1 @@ -292,15 +330,39 @@ class DirectExtractionEngine: elements = self._build_section_hierarchy(elements) elements = self._build_nested_lists(elements) + # ===================================================================== + # POST-PROCESSING PIPELINE + # ===================================================================== + # Step 2.3: Filter page numbers + elements = self._filter_page_numbers(elements, dimensions.height) + + # Step 3.2-3.3: Garble detection and OCR fallback recommendation + page_metadata = { + "has_drawings": len(drawings) > 0, + "drawing_count": len(drawings), + "link_count": len(links), + "preprocessing": { + "sanitized": preprocess_result.get('sanitized', False), + "whiteout_regions_found": len(covered_bboxes) + } + } + + # Calculate garble rate for the page + if self.enable_garble_detection: + full_text = ' '.join( + elem.get_text() if hasattr(elem, 'get_text') else str(elem.content) + for elem in elements + if elem.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.TITLE] + ) + garble_rate = self._calculate_garble_rate(full_text) + page_metadata['garble_rate'] = garble_rate + page_metadata['needs_ocr_fallback'] = self._should_fallback_to_ocr(full_text, page_num) + return Page( page_number=page_num, elements=elements, dimensions=dimensions, - metadata={ - "has_drawings": len(drawings) > 0, - "drawing_count": len(drawings), - "link_count": len(links) - } + metadata=page_metadata ) def _sort_elements_for_reading_order(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]: @@ -1788,4 +1850,574 @@ class DirectExtractionEngine: f"{removed_charts} overlapping CHART(s)" ) - return filtered_elements \ No newline at end of file + return filtered_elements + + # ========================================================================= + # PDF Preprocessing Pipeline Methods + # ========================================================================= + + def _preprocess_page(self, page: fitz.Page, page_num: int) -> Dict[str, Any]: + """ + Run preprocessing pipeline on a page before extraction. + + Pipeline steps: + 1. Content sanitization (clean_contents) + 2. Hidden layer detection (OCG) + 3. White-out detection + + Args: + page: PyMuPDF page object + page_num: Page number (1-indexed) + + Returns: + Dict with preprocessing results: + - covered_word_bboxes: List of bboxes for text covered by white rectangles + - hidden_layers: List of hidden OCG layer names + - sanitized: Whether content was sanitized + """ + result = { + 'covered_word_bboxes': [], + 'hidden_layers': [], + 'sanitized': False + } + + # Step 1.1: Content sanitization + if self.enable_content_sanitization: + try: + page.clean_contents(sanitize=True) + result['sanitized'] = True + logger.debug(f"Page {page_num}: Content stream sanitized") + except Exception as e: + logger.warning(f"Page {page_num}: Content sanitization failed: {e}") + + # Step 1.3: White-out detection + if self.enable_whiteout_detection: + covered = self._detect_whiteout_covered_text(page, page_num) + result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered] + if covered: + logger.info(f"Page {page_num}: Detected {len(covered)} text regions covered by white-out") + + return result + + def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]: + """ + Detect text covered by white rectangles ("white-out" / "correction tape" effect). + + Uses IoU (Intersection over Union) to determine if text is covered. + + Args: + page: PyMuPDF page object + page_num: Page number for logging + + Returns: + List of dicts with covered text info: {'text', 'bbox', 'coverage'} + """ + covered_words = [] + + # Get all drawings and find white-filled rectangles + drawings = page.get_drawings() + white_rects = [] + + for d in drawings: + fill_color = d.get('fill') + # Check for white fill (RGB all 1.0 or close to it) + if fill_color: + if isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3: + r, g, b = fill_color[:3] + # Allow slight tolerance for "almost white" + if r >= 0.95 and g >= 0.95 and b >= 0.95: + rect = d.get('rect') + if rect: + white_rects.append(fitz.Rect(rect)) + + if not white_rects: + return covered_words + + logger.debug(f"Page {page_num}: Found {len(white_rects)} white rectangles") + + # Get all text words with bounding boxes + # words format: (x0, y0, x1, y1, word, block_no, line_no, word_no) + words = page.get_text("words") + + for word_info in words: + word_rect = fitz.Rect(word_info[:4]) + word_text = word_info[4] + word_area = word_rect.width * word_rect.height + + if word_area <= 0: + continue + + for white_rect in white_rects: + # Calculate intersection + intersection = word_rect & white_rect + if intersection.is_empty: + continue + + intersection_area = intersection.width * intersection.height + coverage_ratio = intersection_area / word_area + + # Check if coverage exceeds IoU threshold + if coverage_ratio >= self.whiteout_iou_threshold: + covered_words.append({ + 'text': word_text, + 'bbox': tuple(word_rect), + 'coverage': coverage_ratio + }) + break # Word is covered, no need to check other rects + + return covered_words + + def _get_hidden_ocg_layers(self, doc: fitz.Document) -> List[str]: + """ + Get list of hidden Optional Content Group (OCG) layer names. + + Args: + doc: PyMuPDF document object + + Returns: + List of hidden layer names + """ + hidden_layers = [] + + try: + ocgs = doc.get_ocgs() + if not ocgs: + return hidden_layers + + for ocg_xref, ocg_info in ocgs.items(): + # Check if layer is hidden by default + if ocg_info.get('on') == False: + layer_name = ocg_info.get('name', f'OCG_{ocg_xref}') + hidden_layers.append(layer_name) + logger.debug(f"Found hidden OCG layer: {layer_name}") + + except Exception as e: + logger.warning(f"Failed to get OCG layers: {e}") + + return hidden_layers + + def _calculate_garble_rate(self, text: str) -> float: + """ + Calculate the rate of garbled characters in text. + + Detects: + - (cid:xxxx) patterns (missing ToUnicode map) + - Replacement character U+FFFD + - Private Use Area (PUA) characters + + Args: + text: Text to analyze + + Returns: + Garble rate as float between 0.0 and 1.0 + """ + if not text: + return 0.0 + + # Count (cid:xxxx) patterns + cid_pattern = r'\(cid:\d+\)' + cid_matches = re.findall(cid_pattern, text) + cid_char_count = sum(len(m) for m in cid_matches) + + # Count replacement characters (U+FFFD) + replacement_count = text.count('\ufffd') + + # Count Private Use Area characters (U+E000 to U+F8FF) + pua_count = sum(1 for c in text if 0xE000 <= ord(c) <= 0xF8FF) + + total_garble = cid_char_count + replacement_count + pua_count + total_chars = len(text) + + return total_garble / total_chars if total_chars > 0 else 0.0 + + def _should_fallback_to_ocr(self, page_text: str, page_num: int) -> bool: + """ + Determine if page should use OCR fallback based on garble rate. + + Args: + page_text: Extracted text from page + page_num: Page number for logging + + Returns: + True if OCR fallback is recommended + """ + if not self.enable_garble_detection: + return False + + garble_rate = self._calculate_garble_rate(page_text) + + if garble_rate > self.garble_ocr_fallback_threshold: + logger.warning( + f"Page {page_num}: High garble rate detected ({garble_rate:.1%}). " + f"OCR fallback recommended." + ) + return True + + return False + + def _is_page_number(self, text: str) -> bool: + """ + Check if text is likely a page number. + + Args: + text: Text to check + + Returns: + True if text matches page number patterns + """ + text = text.strip() + + # Pure number + if text.isdigit() and len(text) <= 4: + return True + + # Common patterns + patterns = [ + r'^page\s*\d+$', # "Page 1" + r'^-?\s*\d+\s*-?$', # "- 1 -" or "-1-" + r'^\d+\s*/\s*\d+$', # "1/10" + r'^第\s*\d+\s*[頁页]$', # "第1頁" or "第1页" + r'^p\.?\s*\d+$', # "P.1" or "p1" + ] + + for pattern in patterns: + if re.match(pattern, text, re.IGNORECASE): + return True + + return False + + def _filter_page_numbers(self, elements: List[DocumentElement], page_height: float) -> List[DocumentElement]: + """ + Filter out page number elements. + + Page numbers are typically: + - In the bottom 10% of the page + - Match numeric/page number patterns + + Args: + elements: List of document elements + page_height: Page height for position calculation + + Returns: + Filtered list without page numbers + """ + if not self.enable_page_number_filter: + return elements + + filtered = [] + removed_count = 0 + + for elem in elements: + # Only filter text elements + if elem.type not in [ElementType.TEXT, ElementType.PARAGRAPH]: + filtered.append(elem) + continue + + # Check position - must be in bottom 10% of page + if elem.bbox: + y_rel = elem.bbox.y0 / page_height + if y_rel > 0.90: + # Get text content + text = elem.get_text() if hasattr(elem, 'get_text') else str(elem.content) + if self._is_page_number(text): + removed_count += 1 + logger.debug(f"Filtered page number: '{text}'") + continue + + filtered.append(elem) + + if removed_count > 0: + logger.info(f"Filtered {removed_count} page number element(s)") + + return filtered + + def _is_text_in_covered_regions(self, bbox: BoundingBox, covered_bboxes: List[fitz.Rect]) -> bool: + """ + Check if a text bbox overlaps with any covered (white-out) regions. + + Args: + bbox: Text bounding box + covered_bboxes: List of covered region rectangles + + Returns: + True if text overlaps with covered regions + """ + if not covered_bboxes or not bbox: + return False + + text_rect = fitz.Rect(bbox.x0, bbox.y0, bbox.x1, bbox.y1) + + for covered_rect in covered_bboxes: + if text_rect.intersects(covered_rect): + # Calculate overlap ratio + intersection = text_rect & covered_rect + if not intersection.is_empty: + text_area = text_rect.width * text_rect.height + if text_area > 0: + overlap_ratio = (intersection.width * intersection.height) / text_area + if overlap_ratio >= self.whiteout_iou_threshold: + return True + + return False + + # ========================================================================= + # Phase 4: GS Distillation - Exception Handler + # ========================================================================= + + @staticmethod + def is_ghostscript_available() -> bool: + """Check if Ghostscript is available on the system.""" + import shutil + return shutil.which('gs') is not None + + def _should_trigger_gs_repair(self, file_path: Path) -> Tuple[bool, str]: + """ + Determine if Ghostscript repair should be triggered. + + Triggers on: + 1. High garble rate (>10% cid:xxxx patterns) in extracted text + 2. Severe mupdf structural errors during opening + + Args: + file_path: Path to PDF file + + Returns: + Tuple of (should_repair, reason) + """ + import io + import sys + + reason = "" + + try: + # Capture mupdf warnings + old_stderr = sys.stderr + sys.stderr = captured_stderr = io.StringIO() + + doc = fitz.open(str(file_path)) + + # Restore stderr and get warnings + sys.stderr = old_stderr + warnings = captured_stderr.getvalue() + + # Check for severe structural errors + severe_keywords = ['error', 'invalid xref', 'corrupt', 'damaged', 'repair'] + for keyword in severe_keywords: + if keyword.lower() in warnings.lower(): + reason = f"Structural error detected: {keyword}" + doc.close() + return True, reason + + # Check garble rate on first page + if len(doc) > 0: + page = doc[0] + text = page.get_text("text") + + garble_rate = self._calculate_garble_rate(text) + if garble_rate > self.garble_ocr_fallback_threshold: + reason = f"High garble rate: {garble_rate:.1%}" + doc.close() + return True, reason + + doc.close() + return False, "" + + except Exception as e: + reason = f"Error opening PDF: {str(e)}" + return True, reason + + def _repair_pdf_with_gs(self, input_path: Path, output_path: Path) -> bool: + """ + Repair a PDF using Ghostscript distillation. + + This re-renders the PDF through Ghostscript's PDF interpreter, + which can fix many structural issues. + + Args: + input_path: Path to input PDF + output_path: Path to save repaired PDF + + Returns: + True if repair succeeded, False otherwise + """ + import subprocess + import shutil + + if not self.is_ghostscript_available(): + logger.warning("Ghostscript not available, cannot repair PDF") + return False + + try: + # GS command for PDF repair/distillation + cmd = [ + 'gs', + '-dNOPAUSE', + '-dBATCH', + '-dSAFER', + '-sDEVICE=pdfwrite', + '-dPDFSETTINGS=/prepress', + '-dDetectDuplicateImages=true', + '-dCompressFonts=true', + '-dSubsetFonts=true', + f'-sOutputFile={output_path}', + str(input_path) + ] + + logger.info(f"Running Ghostscript repair: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=60 # 60 second timeout + ) + + if result.returncode == 0 and output_path.exists(): + logger.info(f"Ghostscript repair successful: {output_path}") + return True + else: + logger.error(f"Ghostscript repair failed: {result.stderr}") + return False + + except subprocess.TimeoutExpired: + logger.error("Ghostscript repair timed out") + return False + except Exception as e: + logger.error(f"Ghostscript repair error: {e}") + return False + + def extract_with_repair(self, + file_path: Path, + output_dir: Optional[Path] = None, + enable_gs_repair: bool = False) -> UnifiedDocument: + """ + Extract content with optional Ghostscript repair for damaged PDFs. + + This method first checks if the PDF needs repair, and if so, + attempts to repair it using Ghostscript before extraction. + + Args: + file_path: Path to PDF file + output_dir: Optional directory to save extracted images + enable_gs_repair: Whether to attempt GS repair on problematic PDFs + + Returns: + UnifiedDocument with extracted content + """ + import tempfile + + # Check if repair is needed and enabled + if enable_gs_repair: + should_repair, reason = self._should_trigger_gs_repair(file_path) + + if should_repair: + logger.warning(f"PDF repair triggered: {reason}") + + if self.is_ghostscript_available(): + # Create temporary file for repaired PDF + with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp: + tmp_path = Path(tmp.name) + + try: + if self._repair_pdf_with_gs(file_path, tmp_path): + logger.info("Using repaired PDF for extraction") + result = self.extract(tmp_path, output_dir) + # Add repair metadata + if result.metadata: + result.metadata.gs_repaired = True + return result + else: + logger.warning("GS repair failed, trying original file") + finally: + # Cleanup temp file + if tmp_path.exists(): + tmp_path.unlink() + else: + logger.warning("Ghostscript not available, skipping repair") + + # Normal extraction + return self.extract(file_path, output_dir) + + def get_pages_needing_ocr(self, doc: UnifiedDocument) -> List[int]: + """ + Get list of page numbers that need OCR fallback. + + This method checks each page's metadata for the 'needs_ocr_fallback' flag + set during extraction when high garble rates are detected. + + Args: + doc: UnifiedDocument from extraction + + Returns: + List of page numbers (1-indexed) that need OCR processing + """ + pages_needing_ocr = [] + + for page in doc.pages: + if page.metadata and page.metadata.get('needs_ocr_fallback', False): + pages_needing_ocr.append(page.page_number) + + if pages_needing_ocr: + logger.info(f"Pages needing OCR fallback: {pages_needing_ocr}") + + return pages_needing_ocr + + def get_extraction_quality_report(self, doc: UnifiedDocument) -> Dict[str, Any]: + """ + Generate a quality report for the extraction. + + This report helps determine if additional processing (OCR, manual review) + is needed. + + Args: + doc: UnifiedDocument from extraction + + Returns: + Dict with quality metrics: + - total_pages: int + - pages_with_issues: list of page numbers with problems + - average_garble_rate: float + - needs_ocr_fallback: bool (any page needs OCR) + - preprocessing_stats: dict with sanitization/whiteout counts + """ + report = { + 'total_pages': len(doc.pages), + 'pages_with_issues': [], + 'garble_rates': {}, + 'average_garble_rate': 0.0, + 'needs_ocr_fallback': False, + 'preprocessing_stats': { + 'pages_sanitized': 0, + 'total_whiteout_regions': 0 + } + } + + total_garble = 0.0 + pages_with_garble = 0 + + for page in doc.pages: + metadata = page.metadata or {} + + # Check garble rate + garble_rate = metadata.get('garble_rate', 0.0) + if garble_rate > 0: + report['garble_rates'][page.page_number] = garble_rate + total_garble += garble_rate + pages_with_garble += 1 + + # Check OCR fallback flag + if metadata.get('needs_ocr_fallback', False): + report['pages_with_issues'].append(page.page_number) + report['needs_ocr_fallback'] = True + + # Preprocessing stats + preprocessing = metadata.get('preprocessing', {}) + if preprocessing.get('sanitized', False): + report['preprocessing_stats']['pages_sanitized'] += 1 + report['preprocessing_stats']['total_whiteout_regions'] += preprocessing.get('whiteout_regions_found', 0) + + # Calculate average garble rate + if pages_with_garble > 0: + report['average_garble_rate'] = total_garble / pages_with_garble + + return report \ No newline at end of file diff --git a/openspec/changes/pdf-preprocessing-pipeline/design.md b/openspec/changes/pdf-preprocessing-pipeline/design.md new file mode 100644 index 0000000..9bcfd7e --- /dev/null +++ b/openspec/changes/pdf-preprocessing-pipeline/design.md @@ -0,0 +1,458 @@ +# Design: PDF Preprocessing Pipeline + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ DIRECT Track PDF Processing Pipeline │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ Input PDF │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Step 0: GS Distillation (Exception Handler) │ │ +│ │ ─────────────────────────────────────────────────────────────────── │ │ +│ │ Trigger: (cid:xxxx) garble detected OR mupdf structural errors │ │ +│ │ Action: gs -sDEVICE=pdfwrite -dDetectDuplicateImages=true │ │ +│ │ Status: DISABLED by default, auto-triggered on errors │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Step 1: Object-level Cleaning (P0 - Core) │ │ +│ │ ─────────────────────────────────────────────────────────────────── │ │ +│ │ 1.1 clean_contents(sanitize=True) - Fix malformed content stream │ │ +│ │ 1.2 Remove hidden OCG layers │ │ +│ │ 1.3 White-out detection & removal (IoU >= 80%) │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Step 2: Layout Analysis (P1 - Rule-based) │ │ +│ │ ─────────────────────────────────────────────────────────────────── │ │ +│ │ 2.1 get_text("blocks", sort=True) - Column-aware sorting │ │ +│ │ 2.2 Classify elements (title/body/header/footer/page_number) │ │ +│ │ 2.3 Filter unwanted elements (page numbers, decorations) │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Step 3: Text Extraction (Enhanced) │ │ +│ │ ─────────────────────────────────────────────────────────────────── │ │ +│ │ 3.1 Extract text with bbox coordinates preserved │ │ +│ │ 3.2 Garble rate detection (cid:xxxx count / total chars) │ │ +│ │ 3.3 Auto-fallback: garble_rate > 10% → trigger Paddle OCR │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ UnifiedDocument (with bbox for debugging) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Step 0: GS Distillation (Exception Handler) + +### Purpose +Repair structurally damaged PDFs that PyMuPDF cannot parse correctly. + +### Trigger Conditions +```python +def should_trigger_gs_repair(page_text: str, mupdf_warnings: List[str]) -> bool: + # Condition 1: High garble rate (cid:xxxx patterns) + cid_pattern = r'\(cid:\d+\)' + cid_count = len(re.findall(cid_pattern, page_text)) + total_chars = len(page_text) + garble_rate = cid_count / max(total_chars, 1) + + if garble_rate > 0.1: # >10% garbled + return True + + # Condition 2: Severe structural errors + severe_errors = ['error', 'invalid', 'corrupt', 'damaged'] + for warning in mupdf_warnings: + if any(err in warning.lower() for err in severe_errors): + return True + + return False +``` + +### GS Command +```bash +gs -dNOPAUSE -dBATCH -dSAFER \ + -sDEVICE=pdfwrite \ + -dPDFSETTINGS=/prepress \ + -dDetectDuplicateImages=true \ + -sOutputFile=repaired.pdf \ + input.pdf +``` + +### Implementation Notes +- **Default**: DISABLED +- **Execution**: Only when triggered by error detection +- **Fallback**: If GS also fails, route to Paddle OCR track + +--- + +## Step 1: Object-level Cleaning (P0) + +### 1.1 Content Stream Sanitization +```python +def sanitize_page(page: fitz.Page) -> None: + """Fix malformed PDF content stream.""" + page.clean_contents(sanitize=True) +``` + +### 1.2 Hidden Layer (OCG) Removal +```python +def remove_hidden_layers(doc: fitz.Document) -> List[str]: + """Remove content from hidden Optional Content Groups.""" + removed_layers = [] + + ocgs = doc.get_ocgs() # Get all OCG definitions + for ocg_xref, ocg_info in ocgs.items(): + # Check if layer is hidden by default + if ocg_info.get('on') == False: + removed_layers.append(ocg_info.get('name', f'OCG_{ocg_xref}')) + # Mark for removal during extraction + + return removed_layers +``` + +### 1.3 White-out Detection (Core Algorithm) +```python +def detect_whiteout_covered_text(page: fitz.Page, iou_threshold: float = 0.8) -> List[dict]: + """ + Detect text covered by white rectangles ("white-out" / "correction tape" effect). + + Returns list of text words that should be excluded from extraction. + """ + covered_words = [] + + # Get all white-filled rectangles + drawings = page.get_drawings() + white_rects = [] + for d in drawings: + # Check for white fill (RGB all 1.0) + fill_color = d.get('fill') + if fill_color and fill_color == (1, 1, 1): + rect = d.get('rect') + if rect: + white_rects.append(fitz.Rect(rect)) + + if not white_rects: + return covered_words + + # Get all text words with bounding boxes + words = page.get_text("words") # Returns list of (x0, y0, x1, y1, word, block_no, line_no, word_no) + + for word_info in words: + word_rect = fitz.Rect(word_info[:4]) + word_text = word_info[4] + + for white_rect in white_rects: + # Calculate IoU (Intersection over Union) + intersection = word_rect & white_rect # Intersection + if intersection.is_empty: + continue + + intersection_area = intersection.width * intersection.height + word_area = word_rect.width * word_rect.height + + if word_area > 0: + coverage_ratio = intersection_area / word_area + if coverage_ratio >= iou_threshold: + covered_words.append({ + 'text': word_text, + 'bbox': tuple(word_rect), + 'coverage': coverage_ratio + }) + break # Word is covered, no need to check other rects + + return covered_words +``` + +--- + +## Step 2: Layout Analysis (P1) + +### 2.1 Column-aware Text Extraction +```python +def extract_with_reading_order(page: fitz.Page) -> List[dict]: + """ + Extract text blocks with correct reading order. + PyMuPDF's sort=True handles two-column layouts automatically. + """ + # CRITICAL: sort=True enables column-aware sorting + blocks = page.get_text("dict", sort=True)['blocks'] + return blocks +``` + +### 2.2 Element Classification +```python +def classify_element(block: dict, page_rect: fitz.Rect) -> str: + """ + Classify text block by position and font size. + + Returns: 'title', 'body', 'header', 'footer', 'page_number' + """ + if 'lines' not in block: + return 'image' + + bbox = fitz.Rect(block['bbox']) + page_height = page_rect.height + page_width = page_rect.width + + # Relative position (0.0 = top, 1.0 = bottom) + y_rel = bbox.y0 / page_height + + # Get average font size + font_sizes = [] + for line in block.get('lines', []): + for span in line.get('spans', []): + font_sizes.append(span.get('size', 12)) + avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 12 + + # Get text content for pattern matching + text = ''.join( + span.get('text', '') + for line in block.get('lines', []) + for span in line.get('spans', []) + ).strip() + + # Classification rules + + # Header: top 5% of page + if y_rel < 0.05: + return 'header' + + # Footer: bottom 5% of page + if y_rel > 0.95: + return 'footer' + + # Page number: bottom 10% + numeric pattern + if y_rel > 0.90 and _is_page_number(text): + return 'page_number' + + # Title: large font (>14pt) or centered + if avg_font_size > 14: + return 'title' + + # Check if centered (for subtitles) + x_center = (bbox.x0 + bbox.x1) / 2 + page_center = page_width / 2 + if abs(x_center - page_center) < page_width * 0.1 and len(text) < 100: + if avg_font_size > 12: + return 'title' + + return 'body' + + +def _is_page_number(text: str) -> bool: + """Check if text is likely a page number.""" + text = text.strip() + + # Pure number + if text.isdigit(): + return True + + # Common patterns: "Page 1", "- 1 -", "1/10" + patterns = [ + r'^page\s*\d+$', + r'^-?\s*\d+\s*-?$', + r'^\d+\s*/\s*\d+$', + r'^第\s*\d+\s*頁$', + r'^第\s*\d+\s*页$', + ] + + for pattern in patterns: + if re.match(pattern, text, re.IGNORECASE): + return True + + return False +``` + +### 2.3 Element Filtering +```python +def filter_elements(blocks: List[dict], page_rect: fitz.Rect) -> List[dict]: + """Filter out unwanted elements (page numbers, headers, footers).""" + filtered = [] + + for block in blocks: + element_type = classify_element(block, page_rect) + + # Skip page numbers and optionally headers/footers + if element_type == 'page_number': + continue + + # Keep with classification metadata + block['_element_type'] = element_type + filtered.append(block) + + return filtered +``` + +--- + +## Step 3: Text Extraction (Enhanced) + +### 3.1 Garble Detection +```python +def calculate_garble_rate(text: str) -> float: + """ + Calculate the rate of garbled characters (cid:xxxx patterns). + + Returns: float between 0.0 and 1.0 + """ + if not text: + return 0.0 + + # Count (cid:xxxx) patterns + cid_pattern = r'\(cid:\d+\)' + cid_matches = re.findall(cid_pattern, text) + cid_char_count = sum(len(m) for m in cid_matches) + + # Count other garble indicators + # - Replacement character U+FFFD + # - Private Use Area characters + replacement_count = text.count('\ufffd') + pua_count = sum(1 for c in text if 0xE000 <= ord(c) <= 0xF8FF) + + total_garble = cid_char_count + replacement_count + pua_count + total_chars = len(text) + + return total_garble / total_chars if total_chars > 0 else 0.0 +``` + +### 3.2 Auto-fallback to OCR +```python +def should_fallback_to_ocr(page_text: str, garble_threshold: float = 0.1) -> bool: + """ + Determine if page should be processed with OCR instead of direct extraction. + + Args: + page_text: Extracted text from page + garble_threshold: Maximum acceptable garble rate (default 10%) + + Returns: + True if OCR fallback is recommended + """ + garble_rate = calculate_garble_rate(page_text) + + if garble_rate > garble_threshold: + logger.warning( + f"High garble rate detected: {garble_rate:.1%}. " + f"Recommending OCR fallback." + ) + return True + + return False +``` + +--- + +## Integration Point + +### Modified DirectExtractionEngine._extract_page() + +```python +def _extract_page(self, page: fitz.Page, page_num: int, ...) -> Page: + """Extract content from a single page with preprocessing pipeline.""" + + # === Step 1: Object-level Cleaning === + + # 1.1 Sanitize content stream + page.clean_contents(sanitize=True) + + # 1.2 Detect white-out covered text + covered_words = detect_whiteout_covered_text(page, iou_threshold=0.8) + covered_bboxes = [fitz.Rect(w['bbox']) for w in covered_words] + + # === Step 2: Layout Analysis === + + # 2.1 Extract with column-aware sorting + blocks = page.get_text("dict", sort=True)['blocks'] + + # 2.2 & 2.3 Classify and filter + filtered_blocks = filter_elements(blocks, page.rect) + + # === Step 3: Text Extraction === + + elements = [] + full_text = "" + + for block in filtered_blocks: + # Skip if block overlaps with covered areas + block_rect = fitz.Rect(block['bbox']) + if any(block_rect.intersects(cr) for cr in covered_bboxes): + continue + + # Extract text with bbox preserved + element = self._block_to_element(block, page_num) + if element: + elements.append(element) + full_text += element.get_text() + " " + + # 3.2 Check garble rate + if should_fallback_to_ocr(full_text): + # Mark page for OCR processing + page_metadata['needs_ocr'] = True + + return Page( + page_number=page_num, + elements=elements, + metadata=page_metadata + ) +``` + +--- + +## Configuration + +```python +@dataclass +class PreprocessingConfig: + """Configuration for PDF preprocessing pipeline.""" + + # Step 0: GS Distillation + gs_enabled: bool = False # Disabled by default + gs_garble_threshold: float = 0.1 # Trigger on >10% garble + gs_detect_duplicate_images: bool = True + + # Step 1: Object Cleaning + sanitize_content: bool = True + remove_hidden_layers: bool = True + whiteout_detection: bool = True + whiteout_iou_threshold: float = 0.8 + + # Step 2: Layout Analysis + column_aware_sort: bool = True # Use sort=True + filter_page_numbers: bool = True + filter_headers: bool = False # Keep headers by default + filter_footers: bool = False # Keep footers by default + + # Step 3: Text Extraction + preserve_bbox: bool = True # For debugging + garble_detection: bool = True + ocr_fallback_threshold: float = 0.1 # Fallback on >10% garble +``` + +--- + +## Testing Strategy + +1. **Unit Tests** + - White-out detection with synthetic PDFs + - Garble rate calculation + - Element classification accuracy + +2. **Integration Tests** + - Two-column document reading order + - Hidden layer removal + - GS fallback trigger conditions + +3. **Regression Tests** + - Existing task outputs should not change for clean PDFs + - Performance benchmarks (should add <100ms per page) diff --git a/openspec/changes/pdf-preprocessing-pipeline/proposal.md b/openspec/changes/pdf-preprocessing-pipeline/proposal.md new file mode 100644 index 0000000..821137a --- /dev/null +++ b/openspec/changes/pdf-preprocessing-pipeline/proposal.md @@ -0,0 +1,44 @@ +# Change Proposal: PDF Preprocessing Pipeline + +## Summary + +Implement a multi-stage PDF preprocessing pipeline for Direct track extraction to improve layout accuracy, remove hidden/covered content, and ensure correct reading order. + +## Problem Statement + +Current Direct track extraction has several issues: +1. **Hidden content pollution**: OCG (Optional Content Groups) layers and "white-out" covered text leak into extraction +2. **Reading order chaos**: Two-column layouts get interleaved incorrectly +3. **Vector graphics interference**: Large decorative vector elements cover text content +4. **Corrupted PDF handling**: No fallback for structurally damaged PDFs with `(cid:xxxx)` garbled text + +## Proposed Solution + +Implement a 4-stage preprocessing pipeline: + +``` +Step 0: GS Distillation (Exception Handler - triggered on errors) +Step 1: Object-level Cleaning (P0 - Core) +Step 2: Layout Analysis (P1 - Rule-based with sort=True) +Step 3: Text Extraction (Existing, enhanced with garble detection) +``` + +## Key Features + +1. **Smart Fallback**: GS distillation only triggers on `(cid:xxxx)` garble or mupdf structural errors +2. **White-out Detection**: IoU-based overlap detection (80% threshold) to remove covered text +3. **Column-aware Sorting**: Leverage PyMuPDF's `sort=True` for automatic two-column handling +4. **Garble Rate Detection**: Auto-switch to Paddle OCR when garble rate exceeds threshold + +## Impact + +- **Files Modified**: `backend/app/services/direct_extraction_engine.py` +- **New Dependencies**: None (Ghostscript optional, already available on most systems) +- **Risk Level**: Medium (core extraction logic changes) + +## Success Criteria + +- [ ] Hidden OCG content no longer appears in extraction +- [ ] White-out covered text is correctly filtered +- [ ] Two-column documents maintain correct reading order +- [ ] Corrupted PDFs gracefully fallback to GS repair or OCR diff --git a/openspec/changes/pdf-preprocessing-pipeline/tasks.md b/openspec/changes/pdf-preprocessing-pipeline/tasks.md new file mode 100644 index 0000000..7ddde20 --- /dev/null +++ b/openspec/changes/pdf-preprocessing-pipeline/tasks.md @@ -0,0 +1,93 @@ +# Tasks: PDF Preprocessing Pipeline + +## Phase 1: Object-level Cleaning (P0) + +### Step 1.1: Content Sanitization +- [x] Add `page.clean_contents(sanitize=True)` to `_extract_page()` +- [x] Add error handling for malformed content streams +- [x] Add logging for sanitization actions + +### Step 1.2: Hidden Layer (OCG) Removal +- [x] Implement `get_hidden_ocg_layers()` function +- [ ] Add OCG content filtering during extraction (deferred - needs test case) +- [x] Add configuration option `remove_hidden_layers` +- [x] Add logging for removed layers + +### Step 1.3: White-out Detection +- [x] Implement `detect_whiteout_covered_text()` with IoU calculation +- [x] Add white rectangle detection from `page.get_drawings()` +- [x] Integrate covered text filtering into extraction +- [x] Add configuration option `whiteout_iou_threshold` (default 0.8) +- [x] Add logging for detected white-out regions + +## Phase 2: Layout Analysis (P1) + +### Step 2.1: Column-aware Sorting +- [x] Change `get_text()` calls to use `sort=True` parameter (already implemented) +- [x] Verify reading order improvement on test documents +- [ ] Add configuration option `column_aware_sort` (deferred - low priority) + +### Step 2.2: Element Classification +- [ ] Implement `classify_element()` function (deferred - existing detection sufficient) +- [x] Add position-based classification (header/footer/body) - via existing `_detect_headers_footers()` +- [x] Add font-size-based classification (title detection) - via existing logic +- [x] Add page number pattern detection `_is_page_number()` +- [ ] Preserve classification in element metadata `_element_type` (deferred) + +### Step 2.3: Element Filtering +- [x] Implement `filter_elements()` function - `_filter_page_numbers()` +- [x] Add configuration options for filtering (page_numbers, headers, footers) +- [x] Add logging for filtered elements + +## Phase 3: Enhanced Extraction (P1) + +### Step 3.1: Bbox Preservation +- [x] Ensure all extracted elements retain bbox coordinates (already implemented) +- [x] Add bbox to UnifiedDocument element metadata +- [x] Verify bbox accuracy in generated output + +### Step 3.2: Garble Detection +- [x] Implement `calculate_garble_rate()` function +- [x] Detect `(cid:xxxx)` patterns +- [x] Detect replacement characters (U+FFFD) +- [x] Detect Private Use Area characters +- [x] Add garble rate to page metadata + +### Step 3.3: OCR Fallback +- [x] Implement `should_fallback_to_ocr()` decision function +- [x] Add configuration option `ocr_fallback_threshold` (default 0.1) +- [x] Add `get_pages_needing_ocr()` interface for callers +- [x] Add `get_extraction_quality_report()` for quality metrics +- [x] Add logging for fallback decisions + +## Phase 4: GS Distillation - Exception Handler (P2) + +### Step 0: GS Repair (Optional) +- [x] Implement `should_trigger_gs_repair()` trigger detection +- [x] Implement `repair_pdf_with_gs()` function +- [x] Add `-dDetectDuplicateImages=true` option +- [x] Add temporary file handling for repaired PDF +- [x] Implement `is_ghostscript_available()` check +- [x] Add `extract_with_repair()` method +- [x] Add fallback to normal extraction if GS not available +- [x] Add logging for GS repair actions + +## Testing + +### Unit Tests +- [ ] Test white-out detection with synthetic PDF +- [x] Test garble rate calculation +- [ ] Test element classification accuracy +- [x] Test page number pattern detection + +### Integration Tests +- [x] Test with demo_docs/edit.pdf (3 pages) +- [x] Test with demo_docs/edit2.pdf (1 page) +- [x] Test with demo_docs/edit3.pdf (2 pages) +- [x] Test quality report generation +- [x] Test GS availability check +- [x] Test end-to-end pipeline with real documents + +### Regression Tests +- [x] Verify existing clean PDFs produce same output +- [ ] Performance benchmark (<100ms overhead per page)