Files
OCR/backend/app/services/pp_structure_debug.py
egg 86a6633000 feat: consolidate env config and add deployment files
- Add debug_font_path, demo_docs_dir, e2e_api_base_url to config.py
- Fix hardcoded paths in pp_structure_debug.py, create_demo_images.py
- Fix hardcoded paths in test files
- Update .env.example with new configuration options
- Update .gitignore to exclude AI development files (.claude/, openspec/, AGENTS.md, CLAUDE.md)
- Add production startup script (start-prod.sh)
- Add README.md with project documentation
- Add 1panel Docker deployment files (docker-compose.yml, Dockerfiles, nginx.conf)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 15:02:16 +08:00

359 lines
14 KiB
Python

"""
PP-StructureV3 Debug Service
Provides debugging tools for visualizing and saving PP-StructureV3 results:
- Save raw results as JSON for inspection
- Generate visualization images showing detected bboxes
- Compare raw OCR regions with PP-StructureV3 elements
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont
from app.utils.bbox_utils import normalize_bbox
from app.core.config import BACKEND_ROOT, settings
logger = logging.getLogger(__name__)

# RGB outline color for each PP-StructureV3 element type.
# Insertion order matters: the legend lists the entries in this order.
# The 'default' entry is the fallback for types that are not listed here.
ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = {
    'text': (0, 128, 0),            # Green
    'title': (0, 0, 255),           # Blue
    'table': (255, 0, 0),           # Red
    'figure': (255, 165, 0),        # Orange
    'image': (255, 165, 0),         # Orange (same as 'figure')
    'header': (128, 0, 128),        # Purple
    'footer': (128, 0, 128),        # Purple (same as 'header')
    'equation': (0, 255, 255),      # Cyan
    'chart': (255, 192, 203),       # Pink
    'list': (139, 69, 19),          # Brown
    'reference': (128, 128, 128),   # Gray
    'default': (255, 0, 255),       # Magenta for unknown types
}

# RGB outline color for raw OCR regions
RAW_OCR_COLOR: Tuple[int, int, int] = (255, 215, 0)  # Gold
class PPStructureDebug:
    """Debug service for PP-StructureV3 analysis results.

    Writes JSON dumps of the raw analysis data and a PNG overlay of the
    detected bounding boxes into a single output directory.
    """

    def __init__(self, output_dir: Path):
        """
        Initialize debug service.

        Args:
            output_dir: Directory to save debug outputs. It is created,
                including missing parents, if it does not exist yet.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def save_raw_results(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str = "debug"
    ) -> Dict[str, Path]:
        """
        Save raw PP-StructureV3 results and OCR regions as JSON files.

        Each file is written independently: a failure is logged and the
        corresponding key is simply missing from the returned mapping.

        Args:
            pp_structure_results: Raw PP-StructureV3 analysis results
            raw_ocr_regions: Raw OCR text regions
            filename_prefix: Prefix for output files

        Returns:
            Dictionary with paths to saved files, keyed by
            'pp_structure', 'raw_ocr' and 'summary'
        """
        saved_files: Dict[str, Path] = {}

        self._save_pp_structure_json(pp_structure_results, filename_prefix, saved_files)

        # Save raw OCR regions
        ocr_json_path = self.output_dir / f"{filename_prefix}_raw_ocr_regions.json"
        try:
            self._write_json(ocr_json_path, raw_ocr_regions)
        except Exception as e:
            logger.error(f"Failed to save raw OCR regions: {e}")
        else:
            saved_files['raw_ocr'] = ocr_json_path
            logger.info(f"Saved raw OCR regions to {ocr_json_path}")

        self._save_summary_json(
            pp_structure_results, raw_ocr_regions, filename_prefix, saved_files
        )
        return saved_files

    def save_debug_results(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str = "debug"
    ) -> Dict[str, Path]:
        """
        Save debug-only files (PP-Structure raw results and summary).

        Does NOT save raw_ocr_regions.json (that's handled separately).

        Args:
            pp_structure_results: Raw PP-StructureV3 analysis results
            raw_ocr_regions: Raw OCR text regions (for summary generation only)
            filename_prefix: Prefix for output files

        Returns:
            Dictionary with paths to saved files, keyed by
            'pp_structure' and 'summary'
        """
        saved_files: Dict[str, Path] = {}
        self._save_pp_structure_json(pp_structure_results, filename_prefix, saved_files)
        self._save_summary_json(
            pp_structure_results, raw_ocr_regions, filename_prefix, saved_files
        )
        return saved_files

    def _save_pp_structure_json(
        self,
        pp_structure_results: Dict[str, Any],
        filename_prefix: str,
        saved_files: Dict[str, Path]
    ) -> None:
        """Write the raw PP-StructureV3 results; record the path on success."""
        pp_json_path = self.output_dir / f"{filename_prefix}_pp_structure_raw.json"
        try:
            self._write_json(pp_json_path, pp_structure_results)
        except Exception as e:
            logger.error(f"Failed to save PP-StructureV3 results: {e}")
        else:
            saved_files['pp_structure'] = pp_json_path
            logger.info(f"Saved PP-StructureV3 raw results to {pp_json_path}")

    def _save_summary_json(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str,
        saved_files: Dict[str, Path]
    ) -> None:
        """Write the comparison summary; record the path on success."""
        summary_path = self.output_dir / f"{filename_prefix}_debug_summary.json"
        try:
            # Summary generation stays inside the try so that malformed
            # input is logged instead of propagating to the caller.
            summary = self._generate_summary(pp_structure_results, raw_ocr_regions)
            self._write_json(summary_path, summary)
        except Exception as e:
            logger.error(f"Failed to save debug summary: {e}")
        else:
            saved_files['summary'] = summary_path
            logger.info(f"Saved debug summary to {summary_path}")

    def _write_json(self, path: Path, payload: Any) -> None:
        """Convert payload to JSON-friendly data and write it as UTF-8 JSON."""
        # Convert before opening the file so that a conversion failure
        # cannot leave behind an empty or truncated file.
        serializable = self._make_serializable(payload)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(serializable, f, ensure_ascii=False, indent=2)

    def generate_visualization(
        self,
        image_path: Path,
        pp_structure_elements: List[Dict[str, Any]],
        raw_ocr_regions: Optional[List[Dict[str, Any]]] = None,
        filename_prefix: str = "debug",
        show_labels: bool = True,
        show_raw_ocr: bool = True
    ) -> Optional[Path]:
        """
        Generate visualization image showing detected elements.

        Args:
            image_path: Path to original image
            pp_structure_elements: PP-StructureV3 detected elements
            raw_ocr_regions: Optional raw OCR regions to overlay
            filename_prefix: Prefix for output file
            show_labels: Whether to show element type labels
            show_raw_ocr: Whether to show raw OCR regions

        Returns:
            Path to generated visualization image, or None when any step
            fails (the failure is logged together with its traceback)
        """
        try:
            # convert() returns an independent in-memory copy, so the source
            # file handle can be released as soon as the copy exists.
            with Image.open(image_path) as source_img:
                viz_img = source_img.convert('RGB')
            draw = ImageDraw.Draw(viz_img)
            font, small_font = self._load_fonts()

            # Draw raw OCR regions first (so PP-Structure boxes are on top)
            if show_raw_ocr and raw_ocr_regions:
                self._draw_raw_ocr_regions(draw, raw_ocr_regions, show_labels, small_font)
            self._draw_elements(draw, pp_structure_elements, show_labels, font)

            # Add legend
            self._draw_legend(draw, viz_img.width, font)

            # Add image info
            info_text = f"PP-Structure: {len(pp_structure_elements)} elements"
            if raw_ocr_regions:
                info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions"
            info_text += f" | Size: {viz_img.width}x{viz_img.height}"
            draw.text((10, viz_img.height - 25), info_text, fill=(0, 0, 0), font=font)

            # Save visualization
            viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png"
            viz_img.save(viz_path, 'PNG')
            logger.info(f"Saved visualization to {viz_path}")
            return viz_path
        except Exception as e:
            # logger.exception keeps the traceback in the log stream instead
            # of printing it to stderr.
            logger.exception(f"Failed to generate visualization: {e}")
            return None

    def _load_fonts(self) -> Tuple[Any, Any]:
        """Return (label font, small font), trying configured fonts first."""
        candidates = (
            settings.debug_font_path,
            str(BACKEND_ROOT / "fonts" / "NotoSansSC-Regular.ttf"),
        )
        for font_path in candidates:
            try:
                return (
                    ImageFont.truetype(font_path, 14),
                    ImageFont.truetype(font_path, 10),
                )
            except (IOError, OSError):
                continue
        # No usable TrueType file: use PIL's built-in font for both sizes
        fallback = ImageFont.load_default()
        return fallback, fallback

    def _draw_raw_ocr_regions(
        self,
        draw: "ImageDraw.ImageDraw",
        raw_ocr_regions: List[Dict[str, Any]],
        show_labels: bool,
        small_font: Any
    ) -> None:
        """Outline every raw OCR region with a thin box and optional label."""
        for region in raw_ocr_regions:
            bbox = self._normalize_bbox(region.get('bbox', []))
            if not bbox:
                continue
            x0, y0, x1, y1 = bbox
            draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1)
            if show_labels:
                # A missing or None confidence is shown as 0.00
                confidence = region.get('confidence') or 0
                label = f"OCR:{confidence:.2f}"
                # Clamp so labels of boxes at the top edge stay on the canvas
                draw.text(
                    (x0, max(y0 - 12, 0)), label, fill=RAW_OCR_COLOR, font=small_font
                )

    def _draw_elements(
        self,
        draw: "ImageDraw.ImageDraw",
        pp_structure_elements: List[Dict[str, Any]],
        show_labels: bool,
        font: Any
    ) -> None:
        """Outline every PP-StructureV3 element with a thick, typed box."""
        for idx, elem in enumerate(pp_structure_elements):
            elem_type = self._element_type_name(elem, 'default').lower()
            color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default'])
            bbox = self._normalize_bbox(elem.get('bbox', []))
            if not bbox:
                continue
            x0, y0, x1, y1 = bbox
            # Thicker rectangle than the raw OCR boxes
            draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
            if show_labels:
                label = f"{idx}:{elem_type}"
                # Clamp so labels of boxes at the top edge stay on the canvas
                label_pos = (x0, max(y0 - 18, 0))
                # White background keeps the label readable on busy pages
                text_bbox = draw.textbbox(label_pos, label, font=font)
                draw.rectangle(text_bbox, fill=(255, 255, 255, 200))
                draw.text(label_pos, label, fill=color, font=font)

    @staticmethod
    def _element_type_name(elem: Dict[str, Any], default: str) -> str:
        """Return the element's type as a string, unwrapping enum-like values."""
        raw_type = elem.get('type', default)
        return str(getattr(raw_type, 'value', raw_type))

    def _draw_legend(self, draw: "ImageDraw.ImageDraw", img_width: int, font: Any):
        """Draw a legend showing element type colors in the top-right corner."""
        legend_x = img_width - 150
        legend_y = 10

        # Draw legend background
        draw.rectangle(
            [legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(ELEMENT_COLORS) * 18 + 25],
            fill=(255, 255, 255, 230),
            outline=(0, 0, 0)
        )
        draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font)
        legend_y += 20

        for elem_type, color in ELEMENT_COLORS.items():
            # 'default' is the fallback color, not a real element type
            if elem_type == 'default':
                continue
            draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color)
            draw.text((legend_x + 18, legend_y), elem_type, fill=(0, 0, 0), font=font)
            legend_y += 18

        # Add raw OCR legend entry
        draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=RAW_OCR_COLOR)
        draw.text((legend_x + 18, legend_y), "raw_ocr", fill=(0, 0, 0), font=font)

    def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
        """Normalize bbox to (x0, y0, x1, y1) format. Uses shared bbox utility."""
        return normalize_bbox(bbox)

    def _total_bbox_area(self, items: List[Dict[str, Any]]) -> float:
        """Sum width * height over every item whose bbox can be normalized."""
        total = 0
        for item in items:
            bbox = self._normalize_bbox(item.get('bbox'))
            if bbox:
                total += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        return total

    def _generate_summary(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Generate summary comparing PP-Structure and raw OCR.

        Args:
            pp_structure_results: Raw results; elements are read from its
                'elements' key (missing or None counts as no elements)
            raw_ocr_regions: Raw OCR regions (None counts as no regions)

        Returns:
            Dictionary with 'timestamp', 'pp_structure', 'raw_ocr' and
            'comparison' sections. Ratios and averages are 0 when there
            are no OCR regions to divide by.
        """
        pp_elements = pp_structure_results.get('elements') or []
        ocr_regions = raw_ocr_regions or []

        # Count element types
        type_counts: Dict[str, int] = {}
        for elem in pp_elements:
            type_name = self._element_type_name(elem, 'unknown')
            type_counts[type_name] = type_counts.get(type_name, 0) + 1

        # Calculate bounding box coverage
        pp_bbox_area = self._total_bbox_area(pp_elements)
        ocr_bbox_area = self._total_bbox_area(ocr_regions)

        element_count = len(pp_elements)
        region_count = len(ocr_regions)
        # A None confidence counts as 0 instead of breaking the sum
        confidence_total = sum((r.get('confidence') or 0) for r in ocr_regions)

        return {
            'timestamp': datetime.now().isoformat(),
            'pp_structure': {
                'total_elements': element_count,
                'element_types': type_counts,
                'total_bbox_area': pp_bbox_area,
                'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False)
            },
            'raw_ocr': {
                'total_regions': region_count,
                'total_bbox_area': ocr_bbox_area,
                'avg_confidence': confidence_total / region_count if region_count else 0
            },
            'comparison': {
                'element_count_ratio': element_count / region_count if region_count else 0,
                'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0,
                'potential_gap': region_count - element_count if region_count else 0
            }
        }

    def _make_serializable(self, obj: Any) -> Any:
        """
        Convert object to JSON-serializable format.

        Containers are converted recursively and dict keys become strings.
        Objects with a ``value`` attribute (enum-like) are replaced by that
        value, objects with ``tolist()`` (array-like) by the resulting list,
        and other objects by their ``__dict__``. Anything else becomes
        ``str(obj)``.
        """
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj
        # Sets have no JSON form; emit them as lists like other sequences
        if isinstance(obj, (list, tuple, set, frozenset)):
            return [self._make_serializable(item) for item in obj]
        if isinstance(obj, dict):
            return {str(k): self._make_serializable(v) for k, v in obj.items()}
        if hasattr(obj, 'value'):
            # The wrapped value may itself need conversion (e.g. a tuple)
            return self._make_serializable(obj.value)
        # Checked before __dict__ so array-like objects that also carry
        # instance attributes are dumped as data, not as attribute dicts.
        if hasattr(obj, 'tolist'):
            return self._make_serializable(obj.tolist())
        if hasattr(obj, '__dict__'):
            return self._make_serializable(obj.__dict__)
        return str(obj)