""" PP-StructureV3 Debug Service Provides debugging tools for visualizing and saving PP-StructureV3 results: - Save raw results as JSON for inspection - Generate visualization images showing detected bboxes - Compare raw OCR regions with PP-StructureV3 elements """ import json import logging from pathlib import Path from typing import Dict, List, Any, Optional, Tuple from datetime import datetime from PIL import Image, ImageDraw, ImageFont logger = logging.getLogger(__name__) # Color palette for different element types (RGB) ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = { 'text': (0, 128, 0), # Green 'title': (0, 0, 255), # Blue 'table': (255, 0, 0), # Red 'figure': (255, 165, 0), # Orange 'image': (255, 165, 0), # Orange 'header': (128, 0, 128), # Purple 'footer': (128, 0, 128), # Purple 'equation': (0, 255, 255), # Cyan 'chart': (255, 192, 203), # Pink 'list': (139, 69, 19), # Brown 'reference': (128, 128, 128), # Gray 'default': (255, 0, 255), # Magenta for unknown types } # Color for raw OCR regions RAW_OCR_COLOR = (255, 215, 0) # Gold class PPStructureDebug: """Debug service for PP-StructureV3 analysis results.""" def __init__(self, output_dir: Path): """ Initialize debug service. Args: output_dir: Directory to save debug outputs """ self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) def save_raw_results( self, pp_structure_results: Dict[str, Any], raw_ocr_regions: List[Dict[str, Any]], filename_prefix: str = "debug" ) -> Dict[str, Path]: """ Save raw PP-StructureV3 results and OCR regions as JSON files. Args: pp_structure_results: Raw PP-StructureV3 analysis results raw_ocr_regions: Raw OCR text regions filename_prefix: Prefix for output files Returns: Dictionary with paths to saved files """ saved_files = {} # Save PP-StructureV3 results pp_json_path = self.output_dir / f"{filename_prefix}_pp_structure_raw.json" try: # Convert any non-serializable types serializable_results = self._make_serializable(pp_structure_results) with open(pp_json_path, 'w', encoding='utf-8') as f: json.dump(serializable_results, f, ensure_ascii=False, indent=2) saved_files['pp_structure'] = pp_json_path logger.info(f"Saved PP-StructureV3 raw results to {pp_json_path}") except Exception as e: logger.error(f"Failed to save PP-StructureV3 results: {e}") # Save raw OCR regions ocr_json_path = self.output_dir / f"{filename_prefix}_raw_ocr_regions.json" try: serializable_ocr = self._make_serializable(raw_ocr_regions) with open(ocr_json_path, 'w', encoding='utf-8') as f: json.dump(serializable_ocr, f, ensure_ascii=False, indent=2) saved_files['raw_ocr'] = ocr_json_path logger.info(f"Saved raw OCR regions to {ocr_json_path}") except Exception as e: logger.error(f"Failed to save raw OCR regions: {e}") # Save summary comparison summary_path = self.output_dir / f"{filename_prefix}_debug_summary.json" try: summary = self._generate_summary(pp_structure_results, raw_ocr_regions) with open(summary_path, 'w', encoding='utf-8') as f: json.dump(summary, f, ensure_ascii=False, indent=2) saved_files['summary'] = summary_path logger.info(f"Saved debug summary to {summary_path}") except Exception as e: logger.error(f"Failed to save debug summary: {e}") return saved_files def generate_visualization( self, image_path: Path, pp_structure_elements: List[Dict[str, Any]], raw_ocr_regions: Optional[List[Dict[str, Any]]] = None, filename_prefix: str = "debug", show_labels: bool = True, show_raw_ocr: bool = True ) -> Optional[Path]: """ Generate visualization image showing detected elements. Args: image_path: Path to original image pp_structure_elements: PP-StructureV3 detected elements raw_ocr_regions: Optional raw OCR regions to overlay filename_prefix: Prefix for output file show_labels: Whether to show element type labels show_raw_ocr: Whether to show raw OCR regions Returns: Path to generated visualization image """ try: # Load original image img = Image.open(image_path) if img.mode != 'RGB': img = img.convert('RGB') # Create copy for drawing viz_img = img.copy() draw = ImageDraw.Draw(viz_img) # Try to load a font, fall back to default try: font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14) small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10) except (IOError, OSError): try: font = ImageFont.truetype("/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf", 14) small_font = ImageFont.truetype("/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf", 10) except (IOError, OSError): font = ImageFont.load_default() small_font = font # Draw raw OCR regions first (so PP-Structure boxes are on top) if show_raw_ocr and raw_ocr_regions: for idx, region in enumerate(raw_ocr_regions): bbox = self._normalize_bbox(region.get('bbox', [])) if bbox: # Draw with dashed style simulation (draw thin lines) x0, y0, x1, y1 = bbox draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1) # Add small label if show_labels: confidence = region.get('confidence', 0) label = f"OCR:{confidence:.2f}" draw.text((x0, y0 - 12), label, fill=RAW_OCR_COLOR, font=small_font) # Draw PP-StructureV3 elements for idx, elem in enumerate(pp_structure_elements): elem_type = elem.get('type', 'default') if hasattr(elem_type, 'value'): elem_type = elem_type.value elem_type = str(elem_type).lower() color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default']) bbox = self._normalize_bbox(elem.get('bbox', [])) if bbox: x0, y0, x1, y1 = bbox # Draw thicker rectangle for PP-Structure elements draw.rectangle([x0, y0, x1, y1], outline=color, width=3) # Add label if show_labels: label = f"{idx}:{elem_type}" # Draw label background text_bbox = draw.textbbox((x0, y0 - 18), label, font=font) draw.rectangle(text_bbox, fill=(255, 255, 255, 200)) draw.text((x0, y0 - 18), label, fill=color, font=font) # Add legend self._draw_legend(draw, img.width, font) # Add image info info_text = f"PP-Structure: {len(pp_structure_elements)} elements" if raw_ocr_regions: info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions" info_text += f" | Size: {img.width}x{img.height}" draw.text((10, img.height - 25), info_text, fill=(0, 0, 0), font=font) # Save visualization viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png" viz_img.save(viz_path, 'PNG') logger.info(f"Saved visualization to {viz_path}") return viz_path except Exception as e: logger.error(f"Failed to generate visualization: {e}") import traceback traceback.print_exc() return None def _draw_legend(self, draw: ImageDraw, img_width: int, font: ImageFont): """Draw a legend showing element type colors.""" legend_x = img_width - 150 legend_y = 10 # Draw legend background draw.rectangle( [legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(ELEMENT_COLORS) * 18 + 25], fill=(255, 255, 255, 230), outline=(0, 0, 0) ) draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font) legend_y += 20 for elem_type, color in ELEMENT_COLORS.items(): if elem_type == 'default': continue draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color) draw.text((legend_x + 18, legend_y), elem_type, fill=(0, 0, 0), font=font) legend_y += 18 # Add raw OCR legend entry draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=RAW_OCR_COLOR) draw.text((legend_x + 18, legend_y), "raw_ocr", fill=(0, 0, 0), font=font) def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]: """Normalize bbox to (x0, y0, x1, y1) format.""" if not bbox: return None try: # Handle nested list format [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] if isinstance(bbox, (list, tuple)) and len(bbox) >= 1: if isinstance(bbox[0], (list, tuple)): xs = [pt[0] for pt in bbox if len(pt) >= 2] ys = [pt[1] for pt in bbox if len(pt) >= 2] if xs and ys: return (min(xs), min(ys), max(xs), max(ys)) # Handle flat list [x0, y0, x1, y1] if isinstance(bbox, (list, tuple)) and len(bbox) == 4: return (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])) # Handle flat polygon [x1, y1, x2, y2, ...] if isinstance(bbox, (list, tuple)) and len(bbox) >= 8: xs = [bbox[i] for i in range(0, len(bbox), 2)] ys = [bbox[i] for i in range(1, len(bbox), 2)] return (min(xs), min(ys), max(xs), max(ys)) # Handle dict format if isinstance(bbox, dict): return ( float(bbox.get('x0', bbox.get('x_min', 0))), float(bbox.get('y0', bbox.get('y_min', 0))), float(bbox.get('x1', bbox.get('x_max', 0))), float(bbox.get('y1', bbox.get('y_max', 0))) ) except (TypeError, ValueError, IndexError) as e: logger.warning(f"Failed to normalize bbox {bbox}: {e}") return None def _generate_summary( self, pp_structure_results: Dict[str, Any], raw_ocr_regions: List[Dict[str, Any]] ) -> Dict[str, Any]: """Generate summary comparing PP-Structure and raw OCR.""" pp_elements = pp_structure_results.get('elements', []) # Count element types type_counts = {} for elem in pp_elements: elem_type = elem.get('type', 'unknown') if hasattr(elem_type, 'value'): elem_type = elem_type.value type_counts[str(elem_type)] = type_counts.get(str(elem_type), 0) + 1 # Calculate bounding box coverage pp_bbox_area = 0 ocr_bbox_area = 0 for elem in pp_elements: bbox = self._normalize_bbox(elem.get('bbox')) if bbox: pp_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for region in raw_ocr_regions: bbox = self._normalize_bbox(region.get('bbox')) if bbox: ocr_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) return { 'timestamp': datetime.now().isoformat(), 'pp_structure': { 'total_elements': len(pp_elements), 'element_types': type_counts, 'total_bbox_area': pp_bbox_area, 'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False) }, 'raw_ocr': { 'total_regions': len(raw_ocr_regions), 'total_bbox_area': ocr_bbox_area, 'avg_confidence': sum(r.get('confidence', 0) for r in raw_ocr_regions) / len(raw_ocr_regions) if raw_ocr_regions else 0 }, 'comparison': { 'element_count_ratio': len(pp_elements) / len(raw_ocr_regions) if raw_ocr_regions else 0, 'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0, 'potential_gap': len(raw_ocr_regions) - len(pp_elements) if raw_ocr_regions else 0 } } def _make_serializable(self, obj: Any) -> Any: """Convert object to JSON-serializable format.""" if obj is None: return None if isinstance(obj, (str, int, float, bool)): return obj if isinstance(obj, (list, tuple)): return [self._make_serializable(item) for item in obj] if isinstance(obj, dict): return {str(k): self._make_serializable(v) for k, v in obj.items()} if hasattr(obj, 'value'): return obj.value if hasattr(obj, '__dict__'): return self._make_serializable(obj.__dict__) if hasattr(obj, 'tolist'): # numpy array return obj.tolist() return str(obj)