- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
313 lines
12 KiB
Python
313 lines
12 KiB
Python
"""
|
|
PP-StructureV3 Debug Service
|
|
|
|
Provides debugging tools for visualizing and saving PP-StructureV3 results:
|
|
- Save raw results as JSON for inspection
|
|
- Generate visualization images showing detected bboxes
|
|
- Compare raw OCR regions with PP-StructureV3 elements
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
from app.utils.bbox_utils import normalize_bbox
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Outline colours (RGB) keyed by lower-case element type name.
# Insertion order matters: the legend is drawn by iterating this mapping,
# and the 'default' entry is the fallback for types not listed here.
ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = dict(
    text=(0, 128, 0),          # green
    title=(0, 0, 255),         # blue
    table=(255, 0, 0),         # red
    figure=(255, 165, 0),      # orange
    image=(255, 165, 0),       # orange (same as figure)
    header=(128, 0, 128),      # purple
    footer=(128, 0, 128),      # purple (same as header)
    equation=(0, 255, 255),    # cyan
    chart=(255, 192, 203),     # pink
    list=(139, 69, 19),        # brown
    reference=(128, 128, 128), # gray
    default=(255, 0, 255),     # magenta, used for unknown types
)

# Outline colour (RGB) for raw OCR regions: gold.
RAW_OCR_COLOR = (255, 215, 0)
|
|
|
|
|
|
class PPStructureDebug:
    """Debug service for PP-StructureV3 analysis results.

    Writes JSON dumps of the raw analysis data and renders a PNG overlay of
    the detected bounding boxes. All outputs go into ``output_dir``.
    """

    # TrueType fonts tried in order before falling back to Pillow's default
    # font.
    # NOTE(review): the second entry is an absolute path on a developer
    # machine. It is kept for backward compatibility; consider resolving it
    # relative to the project instead.
    _FONT_CANDIDATES: Tuple[str, ...] = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf",
    )

    def __init__(self, output_dir: Path):
        """
        Initialize debug service.

        Args:
            output_dir: Directory to save debug outputs. It is created
                (including parents) when it does not exist yet.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def save_raw_results(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str = "debug"
    ) -> Dict[str, Path]:
        """
        Save raw PP-StructureV3 results and OCR regions as JSON files.

        Three files are attempted independently: the raw PP-StructureV3
        results, the raw OCR regions and a summary comparing both. A failure
        on one file is logged and does not prevent the others.

        Args:
            pp_structure_results: Raw PP-StructureV3 analysis results
            raw_ocr_regions: Raw OCR text regions
            filename_prefix: Prefix for output files

        Returns:
            Dictionary with paths to saved files, keyed by 'pp_structure',
            'raw_ocr' and 'summary'. Files that failed are absent.
        """
        # (result key, file suffix, label for success log, label for failure
        # log, payload builder). Builders are lazy so that an exception while
        # building one payload is handled like a write failure.
        outputs = (
            (
                'pp_structure', 'pp_structure_raw',
                'PP-StructureV3 raw results', 'PP-StructureV3 results',
                lambda: pp_structure_results,
            ),
            (
                'raw_ocr', 'raw_ocr_regions',
                'raw OCR regions', 'raw OCR regions',
                lambda: raw_ocr_regions,
            ),
            (
                'summary', 'debug_summary',
                'debug summary', 'debug summary',
                lambda: self._generate_summary(pp_structure_results, raw_ocr_regions),
            ),
        )

        saved_files: Dict[str, Path] = {}
        for key, suffix, saved_label, failed_label, build_payload in outputs:
            target = self.output_dir / f"{filename_prefix}_{suffix}.json"
            try:
                # Serialize fully in memory first so a serialization error
                # cannot leave a truncated JSON file behind.
                text = json.dumps(
                    self._make_serializable(build_payload()),
                    ensure_ascii=False,
                    indent=2,
                )
                target.write_text(text, encoding='utf-8')
            except Exception as e:
                # Best-effort debug output: log and continue with the rest.
                logger.exception("Failed to save %s: %s", failed_label, e)
                continue
            saved_files[key] = target
            logger.info("Saved %s to %s", saved_label, target)

        return saved_files

    def generate_visualization(
        self,
        image_path: Path,
        pp_structure_elements: List[Dict[str, Any]],
        raw_ocr_regions: Optional[List[Dict[str, Any]]] = None,
        filename_prefix: str = "debug",
        show_labels: bool = True,
        show_raw_ocr: bool = True
    ) -> Optional[Path]:
        """
        Generate visualization image showing detected elements.

        Args:
            image_path: Path to original image
            pp_structure_elements: PP-StructureV3 detected elements
            raw_ocr_regions: Optional raw OCR regions to overlay
            filename_prefix: Prefix for output file
            show_labels: Whether to show element type labels
            show_raw_ocr: Whether to show raw OCR regions

        Returns:
            Path to generated visualization image, or None when rendering
            failed (the error is logged).
        """
        try:
            # convert() returns a new, fully loaded image even when the mode
            # is already RGB, so the source file can be closed immediately.
            with Image.open(image_path) as source:
                canvas = source.convert('RGB')
            draw = ImageDraw.Draw(canvas)
            font, small_font = self._load_fonts()

            # Raw OCR regions go first so PP-Structure boxes end up on top.
            if show_raw_ocr and raw_ocr_regions:
                self._draw_raw_ocr_regions(draw, raw_ocr_regions, small_font, show_labels)

            self._draw_elements(draw, pp_structure_elements, font, show_labels)
            self._draw_legend(draw, canvas.width, font)

            # Footer line with element counts and image size.
            info_text = f"PP-Structure: {len(pp_structure_elements)} elements"
            if raw_ocr_regions:
                info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions"
            info_text += f" | Size: {canvas.width}x{canvas.height}"
            draw.text((10, canvas.height - 25), info_text, fill=(0, 0, 0), font=font)

            viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png"
            canvas.save(viz_path, 'PNG')
            logger.info(f"Saved visualization to {viz_path}")
            return viz_path

        except Exception as e:
            # logger.exception records the traceback through the logging
            # configuration instead of printing it to stderr.
            logger.exception("Failed to generate visualization: %s", e)
            return None

    def _load_fonts(self) -> Tuple[Any, Any]:
        """Return (label_font, small_font), preferring the TrueType candidates.

        Each candidate is loaded at 14pt and 10pt. When none can be loaded,
        Pillow's default font is used for both.
        """
        for font_path in self._FONT_CANDIDATES:
            try:
                return (
                    ImageFont.truetype(font_path, 14),
                    ImageFont.truetype(font_path, 10),
                )
            except (IOError, OSError):
                continue
        fallback = ImageFont.load_default()
        return fallback, fallback

    def _draw_raw_ocr_regions(
        self,
        draw: "ImageDraw.ImageDraw",
        regions: List[Dict[str, Any]],
        font: Any,
        show_labels: bool
    ) -> None:
        """Outline every raw OCR region with a thin line and optional confidence label."""
        for region in regions:
            bbox = self._normalize_bbox(region.get('bbox', []))
            if not bbox:
                continue
            x0, y0, x1, y1 = self._ordered_box(bbox)
            draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1)
            if show_labels:
                draw.text(
                    (x0, self._label_top(y0, 12)),
                    self._ocr_label(region),
                    fill=RAW_OCR_COLOR,
                    font=font,
                )

    def _draw_elements(
        self,
        draw: "ImageDraw.ImageDraw",
        elements: List[Dict[str, Any]],
        font: Any,
        show_labels: bool
    ) -> None:
        """Outline every PP-StructureV3 element with a thick, type-coloured box."""
        for idx, elem in enumerate(elements):
            elem_type = self._element_type_name(elem.get('type', 'default')).lower()
            color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default'])

            bbox = self._normalize_bbox(elem.get('bbox', []))
            if not bbox:
                continue

            x0, y0, x1, y1 = self._ordered_box(bbox)
            draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
            if not show_labels:
                continue

            label = f"{idx}:{elem_type}"
            origin = (x0, self._label_top(y0, 18))
            try:
                # White backdrop keeps the label readable on busy scans.
                draw.rectangle(
                    draw.textbbox(origin, label, font=font),
                    fill=(255, 255, 255, 200),
                )
            except ValueError:
                # Some Pillow versions only support textbbox() for TrueType
                # fonts; skip the backdrop rather than abort the whole render.
                pass
            draw.text(origin, label, fill=color, font=font)

    def _draw_legend(self, draw: "ImageDraw.ImageDraw", img_width: int, font: "ImageFont.ImageFont"):
        """Draw a legend showing element type colors in the top-right corner."""
        legend_x = img_width - 150
        legend_y = 10

        # One row per known element type (the 'default' fallback is not
        # listed) plus one row for raw OCR regions.
        rows = [(name, color) for name, color in ELEMENT_COLORS.items() if name != 'default']
        rows.append(("raw_ocr", RAW_OCR_COLOR))

        # Background sized for the title line plus 18px per row.
        draw.rectangle(
            [legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(rows) * 18 + 25],
            fill=(255, 255, 255, 230),
            outline=(0, 0, 0)
        )

        draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font)
        legend_y += 20

        for name, color in rows:
            draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color)
            draw.text((legend_x + 18, legend_y), name, fill=(0, 0, 0), font=font)
            legend_y += 18

    def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
        """Normalize bbox to (x0, y0, x1, y1) format. Uses shared bbox utility."""
        return normalize_bbox(bbox)

    @staticmethod
    def _ordered_box(bbox: Tuple[float, float, float, float]) -> Tuple[float, float, float, float]:
        """Return the box with x0 <= x1 and y0 <= y1.

        Recent Pillow releases raise ValueError for rectangles whose second
        corner precedes the first; ordering the corners keeps one malformed
        box from aborting the whole visualization.
        """
        x0, y0, x1, y1 = bbox
        return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)

    @staticmethod
    def _label_top(box_top: float, label_height: int) -> float:
        """Return the y coordinate for a label belonging to a box at ``box_top``.

        The label sits ``label_height`` pixels above the box when that is
        still on the canvas; otherwise it is placed at the box's top edge so
        labels for boxes near the top of the page stay visible.
        """
        above = box_top - label_height
        return above if above >= 0 else box_top

    @staticmethod
    def _ocr_label(region: Dict[str, Any]) -> str:
        """Return the 'OCR:<confidence>' label for a raw OCR region.

        A missing confidence is shown as 0.00; a value that cannot be
        converted to float (e.g. None) is shown as 'n/a'.
        """
        confidence = region.get('confidence', 0)
        try:
            return f"OCR:{float(confidence):.2f}"
        except (TypeError, ValueError):
            return "OCR:n/a"

    @staticmethod
    def _element_type_name(raw_type: Any) -> str:
        """Return the element type as a string, unwrapping enum-like ``.value``."""
        return str(getattr(raw_type, 'value', raw_type))

    def _generate_summary(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Generate summary comparing PP-Structure and raw OCR.

        Args:
            pp_structure_results: Result dict; the 'elements' and
                'has_parsing_res_list' keys are read.
            raw_ocr_regions: Raw OCR regions; 'bbox' and 'confidence' are read.

        Returns:
            Dict with 'timestamp', 'pp_structure', 'raw_ocr' and 'comparison'
            sections. Ratios are 0 when the denominator is empty or zero.
        """
        # Tolerate None inputs so the summary can still be produced.
        pp_structure_results = pp_structure_results or {}
        raw_ocr_regions = raw_ocr_regions or []
        pp_elements = pp_structure_results.get('elements') or []

        # Count element types
        type_counts: Dict[str, int] = {}
        for elem in pp_elements:
            type_name = self._element_type_name(elem.get('type', 'unknown'))
            type_counts[type_name] = type_counts.get(type_name, 0) + 1

        # Sum of individual box areas (overlaps are counted more than once).
        pp_bbox_area = self._total_bbox_area(pp_elements)
        ocr_bbox_area = self._total_bbox_area(raw_ocr_regions)

        region_count = len(raw_ocr_regions)
        # A None confidence counts as 0 instead of raising TypeError.
        confidence_total = sum(r.get('confidence') or 0 for r in raw_ocr_regions)

        return {
            'timestamp': datetime.now().isoformat(),
            'pp_structure': {
                'total_elements': len(pp_elements),
                'element_types': type_counts,
                'total_bbox_area': pp_bbox_area,
                'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False)
            },
            'raw_ocr': {
                'total_regions': region_count,
                'total_bbox_area': ocr_bbox_area,
                'avg_confidence': confidence_total / region_count if region_count else 0
            },
            'comparison': {
                'element_count_ratio': len(pp_elements) / region_count if region_count else 0,
                'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0,
                'potential_gap': region_count - len(pp_elements) if region_count else 0
            }
        }

    def _total_bbox_area(self, items: List[Dict[str, Any]]) -> float:
        """Return the summed area of every item's normalized 'bbox'."""
        total = 0
        for item in items:
            bbox = self._normalize_bbox(item.get('bbox'))
            if bbox:
                total += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        return total

    def _make_serializable(self, obj: Any, _active: Optional[set] = None) -> Any:
        """Convert object to JSON-serializable format.

        Args:
            obj: Arbitrary value (containers, enum-like objects, array-like
                objects exposing ``tolist()``, plain objects with ``__dict__``).
            _active: Internal; ids of the containers currently being
                converted, used to break reference cycles. Callers should
                leave it unset.

        Returns:
            A structure made only of None, str, int, float, bool, list and
            dict (with str keys). Anything else falls back to ``str(obj)``.
        """
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj

        if _active is None:
            _active = set()
        marker = id(obj)
        if marker in _active:
            # The object is one of its own ancestors: emit a placeholder
            # instead of recursing until RecursionError.
            return f"<circular reference: {type(obj).__name__}>"

        _active.add(marker)
        try:
            if isinstance(obj, (list, tuple)):
                return [self._make_serializable(item, _active) for item in obj]
            if isinstance(obj, dict):
                return {str(k): self._make_serializable(v, _active) for k, v in obj.items()}
            # Array-likes are checked before __dict__ so their data is dumped
            # rather than their instance attributes.
            if hasattr(obj, 'tolist'):
                return self._make_serializable(obj.tolist(), _active)
            if hasattr(obj, 'value'):
                value = obj.value
                # A callable 'value' is a method, not an enum-style payload.
                if not callable(value):
                    return self._make_serializable(value, _active)
            if hasattr(obj, '__dict__'):
                return self._make_serializable(obj.__dict__, _active)
            return str(obj)
        finally:
            # Only ancestors count as a cycle; shared siblings are fine.
            _active.discard(marker)
|