Files
OCR/backend/app/services/pp_structure_debug.py
egg cfe65158a3 feat: enable document orientation detection for scanned PDFs
- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 17:13:46 +08:00

313 lines
12 KiB
Python

"""
PP-StructureV3 Debug Service
Provides debugging tools for visualizing and saving PP-StructureV3 results:
- Save raw results as JSON for inspection
- Generate visualization images showing detected bboxes
- Compare raw OCR regions with PP-StructureV3 elements
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont
from app.utils.bbox_utils import normalize_bbox
logger = logging.getLogger(__name__)

# Outline colours (RGB) keyed by lower-cased element type.
# Insertion order is significant: the legend lists entries in this order.
# The "default" entry is the fallback for types not listed here.
ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = {
    "text": (0, 128, 0),            # green
    "title": (0, 0, 255),           # blue
    "table": (255, 0, 0),           # red
    "figure": (255, 165, 0),        # orange (shared with "image")
    "image": (255, 165, 0),         # orange (shared with "figure")
    "header": (128, 0, 128),        # purple (shared with "footer")
    "footer": (128, 0, 128),        # purple (shared with "header")
    "equation": (0, 255, 255),      # cyan
    "chart": (255, 192, 203),       # pink
    "list": (139, 69, 19),          # brown
    "reference": (128, 128, 128),   # gray
    "default": (255, 0, 255),       # magenta, used for unknown types
}

# Outline colour (RGB) for raw OCR regions: gold.
RAW_OCR_COLOR: Tuple[int, int, int] = (255, 215, 0)
class PPStructureDebug:
    """Debug service for PP-StructureV3 analysis results.

    Saves raw analysis data as JSON and renders an annotated copy of the page
    image so PP-StructureV3 elements can be compared with raw OCR regions.
    All artefacts are written into ``output_dir``.
    """

    # File name of the bundled font tried when DejaVu Sans is not installed.
    _BUNDLED_FONT_NAME = "NotoSansSC-Regular.ttf"

    def __init__(self, output_dir: Path):
        """
        Initialize debug service.

        Args:
            output_dir: Directory to save debug outputs (created if missing)
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def save_raw_results(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str = "debug"
    ) -> Dict[str, Path]:
        """
        Save raw PP-StructureV3 results and OCR regions as JSON files.

        Each of the three outputs is written independently (best effort): a
        failure is logged and the remaining files are still attempted.

        Args:
            pp_structure_results: Raw PP-StructureV3 analysis results
            raw_ocr_regions: Raw OCR text regions
            filename_prefix: Prefix for output files

        Returns:
            Dictionary with paths to the files that were saved successfully,
            keyed by 'pp_structure', 'raw_ocr' and 'summary'
        """
        saved_files: Dict[str, Path] = {}

        # Save PP-StructureV3 results
        pp_json_path = self.output_dir / f"{filename_prefix}_pp_structure_raw.json"
        if self._write_json(
            pp_json_path,
            lambda: self._make_serializable(pp_structure_results),
            "PP-StructureV3 raw results",
            "PP-StructureV3 results",
        ):
            saved_files['pp_structure'] = pp_json_path

        # Save raw OCR regions
        ocr_json_path = self.output_dir / f"{filename_prefix}_raw_ocr_regions.json"
        if self._write_json(
            ocr_json_path,
            lambda: self._make_serializable(raw_ocr_regions),
            "raw OCR regions",
            "raw OCR regions",
        ):
            saved_files['raw_ocr'] = ocr_json_path

        # Save summary comparison. The summary is also passed through
        # _make_serializable because its numbers are derived from the raw
        # inputs and may not be native Python types.
        summary_path = self.output_dir / f"{filename_prefix}_debug_summary.json"
        if self._write_json(
            summary_path,
            lambda: self._make_serializable(
                self._generate_summary(pp_structure_results, raw_ocr_regions)
            ),
            "debug summary",
            "debug summary",
        ):
            saved_files['summary'] = summary_path

        return saved_files

    def _write_json(
        self,
        path: Path,
        build_payload: Any,
        saved_label: str,
        failed_label: str
    ) -> bool:
        """
        Build a payload and write it to ``path`` as UTF-8 JSON.

        The payload is serialized to a string before the file is opened, so a
        serialization error does not leave a truncated file behind.

        Args:
            path: Destination file
            build_payload: Zero-argument callable returning the object to dump
            saved_label: Description used in the success log message
            failed_label: Description used in the failure log message

        Returns:
            True when the file was written, False when any step failed
        """
        try:
            text = json.dumps(build_payload(), ensure_ascii=False, indent=2)
            path.write_text(text, encoding='utf-8')
        except Exception as e:
            # Deliberately broad: debug output must never break the caller.
            logger.error(f"Failed to save {failed_label}: {e}")
            return False
        logger.info(f"Saved {saved_label} to {path}")
        return True

    def generate_visualization(
        self,
        image_path: Path,
        pp_structure_elements: List[Dict[str, Any]],
        raw_ocr_regions: Optional[List[Dict[str, Any]]] = None,
        filename_prefix: str = "debug",
        show_labels: bool = True,
        show_raw_ocr: bool = True
    ) -> Optional[Path]:
        """
        Generate visualization image showing detected elements.

        Args:
            image_path: Path to original image
            pp_structure_elements: PP-StructureV3 detected elements
            raw_ocr_regions: Optional raw OCR regions to overlay
            filename_prefix: Prefix for output file
            show_labels: Whether to show element type labels
            show_raw_ocr: Whether to show raw OCR regions

        Returns:
            Path to generated visualization image, or None if anything failed
            (the error is logged with its traceback)
        """
        try:
            # Load the original image. convert() returns a converted copy, so
            # the source file can be closed as soon as the block exits.
            with Image.open(image_path) as source_img:
                viz_img = source_img.convert('RGB')
            width, height = viz_img.size
            draw = ImageDraw.Draw(viz_img)
            font, small_font = self._load_fonts()

            # Draw raw OCR regions first (so PP-Structure boxes are on top)
            if show_raw_ocr and raw_ocr_regions:
                for region in raw_ocr_regions:
                    bbox = self._normalize_bbox(region.get('bbox', []))
                    if not bbox:
                        continue
                    x0, y0, x1, y1 = bbox
                    # Thin outline to distinguish raw OCR from layout elements
                    draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1)
                    if show_labels:
                        # A missing or None confidence is shown as 0.00
                        confidence = region.get('confidence') or 0
                        label = f"OCR:{confidence:.2f}"
                        # Clamp so labels of boxes at the top edge stay visible
                        draw.text(
                            (x0, max(0, y0 - 12)),
                            label,
                            fill=RAW_OCR_COLOR,
                            font=small_font
                        )

            # Draw PP-StructureV3 elements
            for idx, elem in enumerate(pp_structure_elements):
                elem_type = elem.get('type', 'default')
                if hasattr(elem_type, 'value'):
                    # Enum-like type: use its underlying value
                    elem_type = elem_type.value
                elem_type = str(elem_type).lower()
                color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default'])

                bbox = self._normalize_bbox(elem.get('bbox', []))
                if not bbox:
                    continue
                x0, y0, x1, y1 = bbox
                # Thicker rectangle for PP-Structure elements
                draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
                if show_labels:
                    label = f"{idx}:{elem_type}"
                    # Clamp so labels of boxes at the top edge stay visible
                    label_pos = (x0, max(0, y0 - 18))
                    # White background keeps the label readable over content
                    text_bbox = draw.textbbox(label_pos, label, font=font)
                    draw.rectangle(text_bbox, fill=(255, 255, 255, 200))
                    draw.text(label_pos, label, fill=color, font=font)

            # Add legend
            self._draw_legend(draw, width, font)

            # Add image info
            info_text = f"PP-Structure: {len(pp_structure_elements)} elements"
            if raw_ocr_regions:
                info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions"
            info_text += f" | Size: {width}x{height}"
            draw.text((10, height - 25), info_text, fill=(0, 0, 0), font=font)

            # Save visualization
            viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png"
            viz_img.save(viz_path, 'PNG')
            logger.info(f"Saved visualization to {viz_path}")
            return viz_path
        except Exception as e:
            # logger.exception attaches the traceback to the log record
            # instead of printing it to stderr.
            logger.exception(f"Failed to generate visualization: {e}")
            return None

    def _load_fonts(self) -> Tuple[Any, Any]:
        """
        Load the label fonts.

        Returns:
            ``(label_font, small_font)`` at sizes 14 and 10 from the first
            candidate file that loads, or Pillow's default font for both when
            none does
        """
        candidates = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            # Bundled font resolved relative to this module rather than to a
            # machine-specific absolute path.
            # NOTE(review): assumes <backend>/app/services/<this file> with
            # fonts stored in <backend>/fonts - confirm against the repo layout.
            str(
                Path(__file__).resolve().parent.parent.parent
                / "fonts"
                / self._BUNDLED_FONT_NAME
            ),
            # Original hard-coded location, kept so existing setups that rely
            # on it keep working.
            "/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf",
        ]
        for font_path in candidates:
            try:
                return (
                    ImageFont.truetype(font_path, 14),
                    ImageFont.truetype(font_path, 10),
                )
            except (IOError, OSError):
                continue
        fallback = ImageFont.load_default()
        return fallback, fallback

    def _draw_legend(
        self,
        draw: "ImageDraw.ImageDraw",
        img_width: int,
        font: "ImageFont.FreeTypeFont | ImageFont.ImageFont"
    ) -> None:
        """
        Draw a legend showing element type colors in the top-right corner.

        Args:
            draw: Drawing context of the visualization image
            img_width: Width of the image, used to right-align the legend
            font: Font used for the legend text
        """
        legend_x = img_width - 150
        legend_y = 10

        # Legend background. The skipped 'default' entry is replaced by the
        # 'raw_ocr' row, so len(ELEMENT_COLORS) is the number of rows drawn.
        draw.rectangle(
            [legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(ELEMENT_COLORS) * 18 + 25],
            fill=(255, 255, 255, 230),
            outline=(0, 0, 0)
        )
        draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font)
        legend_y += 20

        for elem_type, color in ELEMENT_COLORS.items():
            if elem_type == 'default':
                continue
            draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color)
            draw.text((legend_x + 18, legend_y), elem_type, fill=(0, 0, 0), font=font)
            legend_y += 18

        # Add raw OCR legend entry
        draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=RAW_OCR_COLOR)
        draw.text((legend_x + 18, legend_y), "raw_ocr", fill=(0, 0, 0), font=font)

    def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
        """Normalize bbox to (x0, y0, x1, y1) format. Uses shared bbox utility."""
        return normalize_bbox(bbox)

    def _total_bbox_area(self, items: List[Dict[str, Any]]) -> float:
        """Sum width * height over every item whose 'bbox' normalizes to a truthy box."""
        total = 0
        for item in items:
            bbox = self._normalize_bbox(item.get('bbox'))
            if bbox:
                total += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        return total

    def _generate_summary(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Generate summary comparing PP-Structure and raw OCR.

        Args:
            pp_structure_results: Raw results; elements are read from its
                'elements' key (missing or None is treated as empty)
            raw_ocr_regions: Raw OCR regions (None is treated as empty)

        Returns:
            Dictionary with 'timestamp', 'pp_structure', 'raw_ocr' and
            'comparison' sections. Ratios and averages are 0 when their
            denominator would be zero.
        """
        pp_elements = pp_structure_results.get('elements') or []
        ocr_regions = raw_ocr_regions or []
        element_count = len(pp_elements)
        region_count = len(ocr_regions)

        # Count element types
        type_counts: Dict[str, int] = {}
        for elem in pp_elements:
            elem_type = elem.get('type', 'unknown')
            if hasattr(elem_type, 'value'):
                elem_type = elem_type.value
            type_key = str(elem_type)
            type_counts[type_key] = type_counts.get(type_key, 0) + 1

        # Total bounding box area. Overlapping boxes are counted more than
        # once, so these are sums of box areas, not covered page area.
        pp_bbox_area = self._total_bbox_area(pp_elements)
        ocr_bbox_area = self._total_bbox_area(ocr_regions)

        # A missing or None confidence contributes 0 to the average
        confidence_sum = sum(r.get('confidence') or 0 for r in ocr_regions)

        return {
            'timestamp': datetime.now().isoformat(),
            'pp_structure': {
                'total_elements': element_count,
                'element_types': type_counts,
                'total_bbox_area': pp_bbox_area,
                'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False)
            },
            'raw_ocr': {
                'total_regions': region_count,
                'total_bbox_area': ocr_bbox_area,
                'avg_confidence': confidence_sum / region_count if region_count else 0
            },
            'comparison': {
                'element_count_ratio': element_count / region_count if region_count else 0,
                'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0,
                'potential_gap': region_count - element_count if region_count else 0
            }
        }

    def _make_serializable(self, obj: Any) -> Any:
        """
        Convert object to JSON-serializable format.

        Conversion rules, applied in order:
        - None, str, int, float and bool are returned unchanged
        - lists, tuples, sets and frozensets become lists (recursively)
        - dicts get string keys and recursively converted values
        - objects with a ``value`` attribute (e.g. enums) yield that value
        - objects with ``tolist()`` (e.g. numpy arrays) yield that list
        - objects with ``__dict__`` yield their attribute dictionary
        - anything else becomes ``str(obj)``
        """
        if obj is None:
            return None
        if isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, (list, tuple, set, frozenset)):
            return [self._make_serializable(item) for item in obj]
        if isinstance(obj, dict):
            return {str(k): self._make_serializable(v) for k, v in obj.items()}
        if hasattr(obj, 'value'):
            # Recurse: the wrapped value is not guaranteed to be a primitive
            return self._make_serializable(obj.value)
        if hasattr(obj, 'tolist'):
            # Checked before __dict__ so array-likes that also carry instance
            # attributes are dumped as data, not as their internals.
            return self._make_serializable(obj.tolist())
        if hasattr(obj, '__dict__'):
            return self._make_serializable(obj.__dict__)
        return str(obj)