Files
OCR/backend/app/services/text_region_renderer.py
egg cfe65158a3 feat: enable document orientation detection for scanned PDFs
- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 17:13:46 +08:00

671 lines
23 KiB
Python

"""
Simple Text Region Renderer
Renders raw OCR text regions directly to PDF at their detected positions,
with rotation correction based on bbox quadrilateral geometry.
This approach bypasses complex table structure reconstruction and simply
places text at the positions detected by PaddleOCR.
"""
import math
import logging
from typing import Dict, List, Optional, Set, Tuple
from reportlab.pdfgen import canvas
from reportlab.lib.colors import black
from app.utils.bbox_utils import normalize_bbox
logger = logging.getLogger(__name__)
class TextRegionRenderer:
    """
    Render raw OCR text regions to PDF with position and rotation correction.

    This renderer takes the raw OCR output (text + quadrilateral bbox) and
    renders text at the correct position. Small rotation angles are ignored
    (straightened) to produce clean, aligned text output.
    """

    # Minimum font size to prevent illegible text
    MIN_FONT_SIZE = 6.0
    # Maximum font size to prevent oversized text
    MAX_FONT_SIZE = 72.0
    # Font size estimation factor (font height relative to bbox height)
    FONT_SIZE_FACTOR = 0.75
    # Rotation angle threshold - angles smaller than this are straightened to 0
    # This compensates for slight scan skew and produces cleaner output
    ROTATION_STRAIGHTEN_THRESHOLD = 10.0  # degrees
    # IoA (Intersection over Area) threshold for text-image overlap detection
    # If text bbox overlaps with image by more than this ratio, skip the text
    IOA_OVERLAP_THRESHOLD = 0.3  # 30% overlap
    # Fonts tried, in order, when setting ``font_name`` raises KeyError
    FALLBACK_FONTS = ('Helvetica', 'Times-Roman', 'Courier')

    def __init__(
        self,
        font_name: str = 'NotoSansSC',
        debug: bool = False,
        straighten_threshold: Optional[float] = None,
        ioa_threshold: Optional[float] = None
    ):
        """
        Initialize the text region renderer.

        Args:
            font_name: Name of the registered font to use
            debug: Enable debug logging
            straighten_threshold: Override rotation straightening threshold
                (degrees). ``None`` selects ROTATION_STRAIGHTEN_THRESHOLD;
                ``0.0`` is honoured and disables straightening.
            ioa_threshold: Override IoA overlap threshold for text-image
                avoidance. ``None`` selects IOA_OVERLAP_THRESHOLD.
        """
        self.font_name = font_name
        self.debug = debug
        # Compare against None explicitly: ``x or default`` would silently
        # discard a deliberate override of 0.0 because 0.0 is falsy.
        self.straighten_threshold = (
            self.ROTATION_STRAIGHTEN_THRESHOLD
            if straighten_threshold is None
            else straighten_threshold
        )
        self.ioa_threshold = (
            self.IOA_OVERLAP_THRESHOLD
            if ioa_threshold is None
            else ioa_threshold
        )

    def calculate_rotation(self, bbox: List[List[float]]) -> float:
        """
        Calculate text rotation angle from bbox quadrilateral.

        The bbox is a quadrilateral with 4 corner points in order:
        [top-left, top-right, bottom-right, bottom-left]

        Returns angle in degrees (counter-clockwise from horizontal).
        Positive angle means text is tilted upward to the right.

        NOTE: Small angles (< straighten_threshold) will be treated as 0
        during rendering to produce clean, aligned output.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Rotation angle in degrees (0.0 if fewer than 2 points are given)
        """
        if len(bbox) < 2:
            return 0.0

        # Top-left to top-right vector (top edge)
        dx = bbox[1][0] - bbox[0][0]
        dy = bbox[1][1] - bbox[0][1]

        # In image coordinates Y increases downward, so dy is negated to
        # obtain the conventional (counter-clockwise positive) angle.
        angle_deg = math.degrees(math.atan2(-dy, dx))

        if self.debug:
            logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°")
        return angle_deg

    def estimate_font_size(
        self,
        bbox: List[List[float]],
        text: str,
        scale_factor: float = 1.0
    ) -> float:
        """
        Estimate appropriate font size from bbox dimensions.

        Uses the bbox height as the primary indicator, with adjustment
        for the typical font-to-bbox ratio (FONT_SIZE_FACTOR), and clamps
        the result to [MIN_FONT_SIZE, MAX_FONT_SIZE].

        Args:
            bbox: List of 4 [x, y] coordinate pairs
            text: The text content (currently not used by the estimate)
            scale_factor: Coordinate scaling factor

        Returns:
            Estimated font size in points (12.0 if bbox has fewer than 4 points)
        """
        if len(bbox) < 4:
            return 12.0  # Default font size

        # Calculate bbox height (average of left and right edges)
        left_height = math.dist(bbox[0], bbox[3])
        right_height = math.dist(bbox[1], bbox[2])
        avg_height = (left_height + right_height) / 2

        # Apply scale factor and font size ratio
        font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR

        # Clamp to reasonable range
        font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))

        if self.debug:
            logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, "
                         f"scale={scale_factor:.3f}, font={font_size:.1f}pt")
        return font_size

    def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]:
        """
        Calculate the center point of a bbox quadrilateral.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (center_x, center_y); (0.0, 0.0) if fewer than 4 points
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        # Divide by the actual point count so the mean stays correct even if
        # a polygon with more than four vertices is passed in.
        count = len(bbox)
        center_x = sum(p[0] for p in bbox) / count
        center_y = sum(p[1] for p in bbox) / count
        return (center_x, center_y)

    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).

        Delegates to the shared ``normalize_bbox`` utility and falls back to
        an all-zero rectangle when that utility returns a falsy result.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates
        """
        result = normalize_bbox(bbox)
        return result if result else (0.0, 0.0, 0.0, 0.0)

    def get_bbox_left_baseline(
        self,
        bbox: List[List[float]]
    ) -> Tuple[float, float]:
        """
        Get the left baseline point for text rendering.

        For left-aligned text, we use the bottom-left corner as the
        baseline starting point (text baseline is at the bottom).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x, y) for the left baseline point;
            (0.0, 0.0) if fewer than 4 points
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        # bbox[3] is bottom-left in the standard ordering
        bottom_left = bbox[3]
        return (bottom_left[0], bottom_left[1])

    def calculate_ioa(
        self,
        text_rect: Tuple[float, float, float, float],
        image_rect: Tuple[float, float, float, float]
    ) -> float:
        """
        Calculate Intersection over Area (IoA) of text bbox with image bbox.

        IoA = intersection_area / text_area

        This measures how much of the text region overlaps with the image.

        Args:
            text_rect: Text bbox as (x0, y0, x1, y1)
            image_rect: Image bbox as (x0, y0, x1, y1)

        Returns:
            IoA ratio (0.0 to 1.0); 0.0 for a degenerate text rectangle
        """
        tx0, ty0, tx1, ty1 = text_rect
        ix0, iy0, ix1, iy1 = image_rect

        # A zero/negative text area would make the ratio meaningless
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0

        # Calculate intersection
        inter_x0 = max(tx0, ix0)
        inter_y0 = max(ty0, iy0)
        inter_x1 = min(tx1, ix1)
        inter_y1 = min(ty1, iy1)
        if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
            return 0.0  # No intersection

        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area

    def is_overlapping_exclusion_zones(
        self,
        bbox: List[List[float]],
        exclusion_zones: List[Tuple[float, float, float, float]]
    ) -> bool:
        """
        Check if text bbox overlaps significantly with any exclusion zone.

        Args:
            bbox: Text bbox as quadrilateral
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid

        Returns:
            True if text should be skipped due to overlap
            (IoA with some zone >= ioa_threshold)
        """
        if not exclusion_zones:
            return False

        text_rect = self.get_bbox_as_rect(bbox)
        for zone in exclusion_zones:
            ioa = self.calculate_ioa(text_rect, zone)
            if ioa >= self.ioa_threshold:
                if self.debug:
                    logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}")
                return True
        return False

    def is_inside_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        threshold: float = 0.5
    ) -> bool:
        """
        Check if text bbox is inside a zone (for collecting chart texts).

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            threshold: Minimum IoA to consider "inside"

        Returns:
            True if text is inside the zone
        """
        text_rect = self.get_bbox_as_rect(bbox)
        return self.calculate_ioa(text_rect, zone) >= threshold

    def is_axis_label(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 50.0
    ) -> bool:
        """
        Check if text bbox is an axis label for a chart/image zone.

        Axis labels are typically:
        - Vertical text to the LEFT of the chart (Y-axis label)
        - Horizontal text BELOW the chart (X-axis label)

        Args:
            bbox: Text bbox as quadrilateral
            zone: Chart/image zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone edge to be considered axis label

        Returns:
            True if text appears to be an axis label for this zone
        """
        if len(bbox) < 4:
            return False

        tx0, ty0, tx1, ty1 = self.get_bbox_as_rect(bbox)
        zx0, zy0, zx1, zy1 = zone

        # Calculate text dimensions
        text_width = tx1 - tx0
        text_height = ty1 - ty0

        # Y-axis label: vertical text to the LEFT of zone
        # - Text's right edge lies within `margin` of the zone's left edge
        # - Text's Y range overlaps with zone's Y range
        # - Text is more than twice as tall as it is wide
        is_left_of_zone = zx0 - margin <= tx1 <= zx0 + margin
        y_overlaps = not (ty1 < zy0 or ty0 > zy1)
        is_vertical_text = text_height > text_width * 2
        if is_left_of_zone and y_overlaps and is_vertical_text:
            if self.debug:
                logger.debug("Detected Y-axis label: text is left of zone, vertical")
            return True

        # X-axis label: horizontal text BELOW the zone
        # - Text's top edge lies within `margin` of the zone's bottom edge
        # - Text's X range overlaps with zone's X range
        # - Text is wider than tall (normal horizontal text)
        is_below_zone = zy1 - margin <= ty0 <= zy1 + margin
        x_overlaps = not (tx1 < zx0 or tx0 > zx1)
        is_horizontal_text = text_width > text_height
        if is_below_zone and x_overlaps and is_horizontal_text:
            if self.debug:
                logger.debug("Detected X-axis label: text is below zone, horizontal")
            return True

        return False

    def is_near_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 100.0
    ) -> bool:
        """
        Check if text bbox is near (within margin) of a zone.

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone to be considered "near"

        Returns:
            True if text is near the zone
        """
        if len(bbox) < 4:
            return False

        tx0, ty0, tx1, ty1 = self.get_bbox_as_rect(bbox)
        zx0, zy0, zx1, zy1 = zone

        # Expand zone by margin on every side
        ex0, ey0 = zx0 - margin, zy0 - margin
        ex1, ey1 = zx1 + margin, zy1 + margin

        # Text is "near" when its rectangle touches the expanded zone
        return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1)

    def collect_zone_texts(
        self,
        regions: List[Dict],
        zones: List[Tuple[float, float, float, float]],
        threshold: float = 0.5,
        include_axis_labels: bool = True
    ) -> Set[str]:
        """
        Collect text content from regions inside zones or identified as axis labels.

        This set is used during rendering for position-aware deduplication:
        - Text that matches this set AND is near a zone will be skipped
        - Text that matches but is far from zones will still be rendered

        Args:
            regions: List of raw OCR region dicts
            zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes)
            threshold: Minimum IoA to consider text as "inside" zone
            include_axis_labels: Also collect axis labels adjacent to zones

        Returns:
            Set of text strings found inside zones or as axis labels
        """
        zone_texts: Set[str] = set()
        for region in regions:
            # ``or`` guards against keys that are present but set to None
            text = (region.get('text') or '').strip()
            bbox = region.get('bbox') or []
            if not text or len(bbox) < 4:
                continue

            for zone in zones:
                if self.is_inside_zone(bbox, zone, threshold):
                    reason = 'inside'
                elif include_axis_labels and self.is_axis_label(bbox, zone):
                    reason = 'axis label'
                else:
                    continue
                zone_texts.add(text)
                if self.debug:
                    logger.debug(f"Collected zone text ({reason}): '{text}'")
                # One matching zone is enough for this region
                break
        return zone_texts

    def _set_font_with_fallback(self, pdf_canvas: "canvas.Canvas", font_size: float) -> bool:
        """Set ``font_name`` on the canvas, trying FALLBACK_FONTS on KeyError; return success."""
        try:
            pdf_canvas.setFont(self.font_name, font_size)
            return True
        except KeyError:
            # Font not registered, try fallback fonts
            pass

        for fallback in self.FALLBACK_FONTS:
            try:
                pdf_canvas.setFont(fallback, font_size)
            except KeyError:
                continue
            if self.debug:
                logger.debug(f"Using fallback font: {fallback}")
            return True
        return False

    def render_text_region(
        self,
        pdf_canvas: "canvas.Canvas",
        region: Dict,
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> Tuple[bool, str]:
        """
        Render a single OCR text region to the PDF canvas.

        Handles coordinate transformation from image coordinates (origin top-left)
        to PDF coordinates (origin bottom-left).

        Small rotation angles are straightened to produce clean output.
        Text overlapping with exclusion zones (images) is skipped.

        Deduplication logic (position-aware):
        - If text matches zone_texts AND is NEAR the zone (or is axis label),
          skip it to avoid duplicate chart labels
        - Text far from zones is rendered even if it matches zone content

        Args:
            pdf_canvas: ReportLab canvas to draw on
            region: Raw OCR region dict with 'text' and 'bbox'
            page_height: Height of the PDF page (for Y-flip)
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (dedupe only if near zone)

        Returns:
            Tuple of (success: bool, skip_reason: str)
            - success=True, skip_reason='' if rendered successfully
            - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped
        """
        # ``or`` guards against keys that are present but set to None
        text = (region.get('text') or '').strip()
        bbox = region.get('bbox') or []
        if not text or len(bbox) < 4:
            return (False, '')

        # Check if text overlaps with exclusion zones (images/charts)
        if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones):
            if self.debug:
                logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap")
            return (False, 'overlap')

        # Check if text should be deduplicated based on position
        # Only skip if text matches zone content AND is near a zone (or is axis label)
        if zone_texts and text in zone_texts and exclusion_zones:
            for zone in exclusion_zones:
                # Check if it's an axis label for this zone
                if self.is_axis_label(bbox, zone):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone")
                    return (False, 'dedupe')
                # Check if it's near this zone (for zone-internal text deduplication)
                if self.is_near_zone(bbox, zone, margin=100.0):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone")
                    return (False, 'dedupe')

        # Deliberately broad: one bad region must not abort the whole page.
        try:
            # Calculate text properties
            rotation = self.calculate_rotation(bbox)
            font_size = self.estimate_font_size(bbox, text, scale_y)

            # Straighten small rotations for cleaner output
            # Only apply rotation for significant angles (e.g., 90° rotated text)
            if abs(rotation) < self.straighten_threshold:
                rotation = 0.0

            # Left baseline point in image coordinates, scaled, then Y-flipped
            # into PDF coordinates (origin bottom-left)
            img_x, img_y = self.get_bbox_left_baseline(bbox)
            pdf_x = img_x * scale_x
            pdf_y = page_height - img_y * scale_y

            pdf_canvas.saveState()
            try:
                if not self._set_font_with_fallback(pdf_canvas, font_size):
                    logger.warning("No available font found, skipping region")
                    return (False, 'error')

                pdf_canvas.setFillColor(black)

                # Apply rotation if needed (only for significant angles like 90°)
                if abs(rotation) > 0.5:
                    pdf_canvas.translate(pdf_x, pdf_y)
                    pdf_canvas.rotate(rotation)
                    pdf_canvas.drawString(0, 0, text)
                else:
                    pdf_canvas.drawString(pdf_x, pdf_y, text)
            finally:
                # Always pair saveState with restoreState; otherwise a failed
                # draw would leave the translate/rotate active for every
                # region rendered afterwards.
                pdf_canvas.restoreState()

            if self.debug:
                logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), "
                             f"rot={rotation:.1f}°, size={font_size:.1f}pt")
            return (True, '')
        except Exception as e:
            logger.warning(f"Failed to render text region: {e}")
            return (False, 'error')

    def render_all_regions(
        self,
        pdf_canvas: "canvas.Canvas",
        regions: List[Dict],
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        page_filter: Optional[int] = None,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> int:
        """
        Render all OCR text regions to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas to draw on
            regions: List of raw OCR region dicts
            page_height: Height of the PDF page
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            page_filter: If set, only render regions for this page index
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (for position-aware deduplication)

        Returns:
            Number of regions successfully rendered
        """
        rendered_count = 0
        skipped_overlap = 0
        skipped_dedupe = 0
        # Regions that passed the page filter; used as the log denominator so
        # regions belonging to other pages are not counted.
        considered = 0

        for region in regions:
            # Filter by page if specified (regions without 'page' count as page 0)
            if page_filter is not None and region.get('page', 0) != page_filter:
                continue
            considered += 1

            success, skip_reason = self.render_text_region(
                pdf_canvas, region, page_height, scale_x, scale_y,
                exclusion_zones, zone_texts
            )
            if success:
                rendered_count += 1
            elif skip_reason == 'overlap':
                skipped_overlap += 1
            elif skip_reason == 'dedupe':
                skipped_dedupe += 1

        # Log results with skip counts
        total_processed = rendered_count + skipped_overlap + skipped_dedupe
        skip_parts = []
        if skipped_overlap > 0:
            skip_parts.append(f"{skipped_overlap} overlap")
        if skipped_dedupe > 0:
            skip_parts.append(f"{skipped_dedupe} dedupe")

        if skip_parts:
            logger.info(f"Rendered {rendered_count}/{total_processed} text regions "
                        f"(skipped: {', '.join(skip_parts)})")
        else:
            logger.info(f"Rendered {rendered_count}/{considered} text regions")
        return rendered_count
def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]:
    """
    Load raw OCR regions from the result directory.

    Looks for a file named
    ``{task_id}_{original_filename}_page_{page_num}_raw_ocr_regions.json``
    inside ``result_dir``. The original filename varies with the uploaded
    file, so it is matched with a glob wildcard.

    Args:
        result_dir: Path to the result directory
        task_id: Task ID
        page_num: Page number (1-indexed)

    Returns:
        List of raw OCR region dictionaries. An empty list is returned when
        no file matches, the file cannot be read or parsed, or its top-level
        JSON value is not a list.
    """
    import glob
    import json
    from pathlib import Path

    result_path = Path(result_dir)

    # Escape the task id so glob metacharacters in it are matched literally;
    # only the original-filename segment is meant to be a wildcard.
    glob_pattern = f"{glob.escape(task_id)}_*_page_{page_num}_raw_ocr_regions.json"

    # Sort so the chosen file is deterministic if several happen to match
    # (directory iteration order is not guaranteed).
    matching_files = sorted(result_path.glob(glob_pattern))

    if not matching_files:
        logger.warning(f"Raw OCR regions file not found for task {task_id} page {page_num}. "
                       f"Glob pattern: {glob_pattern}")
        return []

    # Use the first matching file (there should only be one per page)
    file_path = matching_files[0]
    if len(matching_files) > 1:
        logger.warning(f"Multiple raw OCR regions files match {glob_pattern}; "
                       f"using {file_path.name}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            regions = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        logger.error(f"Failed to load raw OCR regions from {file_path}: {e}")
        return []

    # Callers iterate the result as a list of dicts; reject any other shape
    if not isinstance(regions, list):
        logger.error(f"Failed to load raw OCR regions from {file_path}: "
                     f"expected a JSON list, got {type(regions).__name__}")
        return []

    logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path.name}")
    return regions