"""
Simple Text Region Renderer

Renders raw OCR text regions directly to PDF at their detected positions,
with rotation correction based on bbox quadrilateral geometry.

This approach bypasses complex table structure reconstruction and simply
places text at the positions detected by PaddleOCR.
"""

import json
import logging
import math
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

from reportlab.pdfgen import canvas
from reportlab.lib.colors import black

from app.utils.bbox_utils import normalize_bbox

logger = logging.getLogger(__name__)


class TextRegionRenderer:
    """
    Render raw OCR text regions to PDF with position and rotation correction.

    This renderer takes the raw OCR output (text + quadrilateral bbox) and
    renders text at the correct position. Small rotation angles are ignored
    (straightened) to produce clean, aligned text output.
    """

    # Minimum font size to prevent illegible text
    MIN_FONT_SIZE = 6.0

    # Maximum font size to prevent oversized text
    MAX_FONT_SIZE = 72.0

    # Font size estimation factor (font height relative to bbox height)
    FONT_SIZE_FACTOR = 0.75

    # Rotation angle threshold - angles smaller than this are straightened to 0
    # This compensates for slight scan skew and produces cleaner output
    ROTATION_STRAIGHTEN_THRESHOLD = 10.0  # degrees

    # IoA (Intersection over Area) threshold for text-image overlap detection
    # If text bbox overlaps with image by at least this ratio, skip the text
    IOA_OVERLAP_THRESHOLD = 0.3  # 30% overlap

    # Fonts tried, in order, when the configured font cannot be set
    _FALLBACK_FONTS = ('Helvetica', 'Times-Roman', 'Courier')

    def __init__(
        self,
        font_name: str = 'NotoSansSC',
        debug: bool = False,
        straighten_threshold: Optional[float] = None,
        ioa_threshold: Optional[float] = None
    ):
        """
        Initialize the text region renderer.

        Args:
            font_name: Name of the registered font to use
            debug: Enable debug logging
            straighten_threshold: Override rotation straightening threshold
                (degrees). ``None`` selects ROTATION_STRAIGHTEN_THRESHOLD;
                ``0`` disables straightening.
            ioa_threshold: Override IoA overlap threshold for text-image
                avoidance. ``None`` selects IOA_OVERLAP_THRESHOLD; ``0``
                means any actual overlap counts.
        """
        self.font_name = font_name
        self.debug = debug
        # Compare against None explicitly: `value or DEFAULT` would silently
        # discard a deliberate 0 / 0.0 override because it is falsy.
        self.straighten_threshold = (
            self.ROTATION_STRAIGHTEN_THRESHOLD
            if straighten_threshold is None
            else straighten_threshold
        )
        self.ioa_threshold = (
            self.IOA_OVERLAP_THRESHOLD
            if ioa_threshold is None
            else ioa_threshold
        )

    @staticmethod
    def _extract_text_and_bbox(region: Dict) -> Tuple[str, List[List[float]]]:
        """Return (stripped text, bbox) from a region, mapping missing/None values to empty."""
        # `dict.get(key, default)` does not apply the default when the key is
        # present with a None value (e.g. JSON null), hence the `or` fallbacks.
        text = (region.get('text') or '').strip()
        bbox = region.get('bbox') or []
        return text, bbox

    def calculate_rotation(self, bbox: List[List[float]]) -> float:
        """
        Calculate text rotation angle from bbox quadrilateral.

        The bbox is a quadrilateral with 4 corner points in order:
        [top-left, top-right, bottom-right, bottom-left]

        Returns angle in degrees (counter-clockwise from horizontal).
        Positive angle means text is tilted upward to the right.

        NOTE: Small angles (< straighten_threshold) will be treated as 0
        during rendering to produce clean, aligned output.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Rotation angle in degrees (0.0 if fewer than 2 points are given)
        """
        if len(bbox) < 2:
            return 0.0

        # Top-left to top-right vector (top edge)
        dx = bbox[1][0] - bbox[0][0]
        dy = bbox[1][1] - bbox[0][1]

        # In image coordinates Y increases downward, so dy is negated to get
        # the conventional counter-clockwise angle.
        angle_deg = math.degrees(math.atan2(-dy, dx))

        if self.debug:
            logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°")

        return angle_deg

    def estimate_font_size(
        self,
        bbox: List[List[float]],
        text: str,
        scale_factor: float = 1.0
    ) -> float:
        """
        Estimate appropriate font size from bbox dimensions.

        Uses the bbox height as the primary indicator, with adjustment
        for the typical font-to-bbox ratio.

        Args:
            bbox: List of 4 [x, y] coordinate pairs
            text: The text content (accepted for interface compatibility;
                not currently used in the estimate)
            scale_factor: Coordinate scaling factor

        Returns:
            Estimated font size in points, clamped to
            [MIN_FONT_SIZE, MAX_FONT_SIZE]; 12.0 if the bbox has fewer
            than 4 points
        """
        if len(bbox) < 4:
            return 12.0  # Default font size

        # Calculate bbox height (average of left and right edges)
        left_height = math.dist(bbox[0], bbox[3])
        right_height = math.dist(bbox[1], bbox[2])
        avg_height = (left_height + right_height) / 2

        # Apply scale factor and font size ratio
        font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR

        # Clamp to reasonable range
        font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))

        if self.debug:
            logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, "
                         f"scale={scale_factor:.3f}, font={font_size:.1f}pt")

        return font_size

    def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]:
        """
        Calculate the center point of a bbox quadrilateral.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (center_x, center_y); (0.0, 0.0) if fewer than 4 points
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        # Average exactly the four corners so that the divisor always matches
        # the number of points summed, even if extra points are supplied.
        corners = bbox[:4]
        center_x = sum(p[0] for p in corners) / 4
        center_y = sum(p[1] for p in corners) / 4
        return (center_x, center_y)

    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).

        Uses shared bbox utility.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates, or an all-zero
            rectangle when the shared utility returns a falsy result
        """
        result = normalize_bbox(bbox)
        return result if result else (0.0, 0.0, 0.0, 0.0)

    def get_bbox_left_baseline(
        self,
        bbox: List[List[float]]
    ) -> Tuple[float, float]:
        """
        Get the left baseline point for text rendering.

        For left-aligned text, we use the bottom-left corner as the
        baseline starting point (text baseline is at the bottom).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x, y) for the left baseline point; (0.0, 0.0) if
            fewer than 4 points
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        # bbox[3] is bottom-left in the standard ordering
        bottom_left = bbox[3]
        return (bottom_left[0], bottom_left[1])

    def calculate_ioa(
        self,
        text_rect: Tuple[float, float, float, float],
        image_rect: Tuple[float, float, float, float]
    ) -> float:
        """
        Calculate Intersection over Area (IoA) of text bbox with image bbox.

        IoA = intersection_area / text_area

        This measures how much of the text region overlaps with the image.

        Args:
            text_rect: Text bbox as (x0, y0, x1, y1)
            image_rect: Image bbox as (x0, y0, x1, y1)

        Returns:
            IoA ratio (0.0 to 1.0); 0.0 for a degenerate text rectangle or
            when the rectangles do not intersect
        """
        tx0, ty0, tx1, ty1 = text_rect
        ix0, iy0, ix1, iy1 = image_rect

        # Calculate text area
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0

        # Calculate intersection
        inter_x0 = max(tx0, ix0)
        inter_y0 = max(ty0, iy0)
        inter_x1 = min(tx1, ix1)
        inter_y1 = min(ty1, iy1)

        if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
            return 0.0  # No intersection

        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area

    def is_overlapping_exclusion_zones(
        self,
        bbox: List[List[float]],
        exclusion_zones: List[Tuple[float, float, float, float]]
    ) -> bool:
        """
        Check if text bbox overlaps significantly with any exclusion zone.

        Args:
            bbox: Text bbox as quadrilateral
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid

        Returns:
            True if text should be skipped due to overlap
        """
        if not exclusion_zones:
            return False

        text_rect = self.get_bbox_as_rect(bbox)

        for zone in exclusion_zones:
            ioa = self.calculate_ioa(text_rect, zone)
            # Require a real intersection so that a threshold of 0 means
            # "any overlap" instead of also matching disjoint rectangles.
            if ioa > 0.0 and ioa >= self.ioa_threshold:
                if self.debug:
                    logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}")
                return True

        return False

    def is_inside_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        threshold: float = 0.5
    ) -> bool:
        """
        Check if text bbox is inside a zone (for collecting chart texts).

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            threshold: Minimum IoA to consider "inside"

        Returns:
            True if text is inside the zone
        """
        text_rect = self.get_bbox_as_rect(bbox)
        return self.calculate_ioa(text_rect, zone) >= threshold

    def is_axis_label(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 50.0
    ) -> bool:
        """
        Check if text bbox is an axis label for a chart/image zone.

        Axis labels are typically:
        - Vertical text to the LEFT of the chart (Y-axis label)
        - Horizontal text BELOW the chart (X-axis label)

        Args:
            bbox: Text bbox as quadrilateral
            zone: Chart/image zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone edge to be considered axis label

        Returns:
            True if text appears to be an axis label for this zone
        """
        if len(bbox) < 4:
            return False

        tx0, ty0, tx1, ty1 = self.get_bbox_as_rect(bbox)
        zx0, zy0, zx1, zy1 = zone

        # Calculate text dimensions
        text_width = tx1 - tx0
        text_height = ty1 - ty0

        # Y-axis label: vertical text to the LEFT of zone
        # - Text's right edge lies within `margin` of the zone's left edge
        # - Text's Y range overlaps with zone's Y range
        # - Text is more than twice as tall as it is wide
        is_left_of_zone = zx0 - margin <= tx1 <= zx0 + margin
        y_overlaps = not (ty1 < zy0 or ty0 > zy1)
        is_vertical_text = text_height > text_width * 2

        if is_left_of_zone and y_overlaps and is_vertical_text:
            if self.debug:
                logger.debug("Detected Y-axis label: text is left of zone, vertical")
            return True

        # X-axis label: horizontal text BELOW the zone
        # - Text's top edge lies within `margin` of the zone's bottom edge
        # - Text's X range overlaps with zone's X range
        # - Text is wider than tall (normal horizontal text)
        is_below_zone = zy1 - margin <= ty0 <= zy1 + margin
        x_overlaps = not (tx1 < zx0 or tx0 > zx1)
        is_horizontal_text = text_width > text_height

        if is_below_zone and x_overlaps and is_horizontal_text:
            if self.debug:
                logger.debug("Detected X-axis label: text is below zone, horizontal")
            return True

        return False

    def is_near_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 100.0
    ) -> bool:
        """
        Check if text bbox is near (within margin) of a zone.

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone to be considered "near"

        Returns:
            True if text is near the zone
        """
        if len(bbox) < 4:
            return False

        tx0, ty0, tx1, ty1 = self.get_bbox_as_rect(bbox)
        zx0, zy0, zx1, zy1 = zone

        # Expand zone by margin on every side
        ex0, ey0 = zx0 - margin, zy0 - margin
        ex1, ey1 = zx1 + margin, zy1 + margin

        # Text is "near" when it overlaps (or touches) the expanded zone
        return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1)

    def collect_zone_texts(
        self,
        regions: List[Dict],
        zones: List[Tuple[float, float, float, float]],
        threshold: float = 0.5,
        include_axis_labels: bool = True
    ) -> Set[str]:
        """
        Collect text content from regions inside zones or identified as axis labels.

        This set is used during rendering for position-aware deduplication:
        - Text that matches this set AND is near a zone will be skipped
        - Text that matches but is far from zones will still be rendered

        Args:
            regions: List of raw OCR region dicts
            zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes)
            threshold: Minimum IoA to consider text as "inside" zone
            include_axis_labels: Also collect axis labels adjacent to zones

        Returns:
            Set of text strings found inside zones or as axis labels
        """
        zone_texts: Set[str] = set()

        for region in regions:
            text, bbox = self._extract_text_and_bbox(region)

            if not text or len(bbox) < 4:
                continue

            for zone in zones:
                # Check if inside zone
                if self.is_inside_zone(bbox, zone, threshold):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (inside): '{text}'")
                    break

                # Check if it's an axis label
                if include_axis_labels and self.is_axis_label(bbox, zone):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (axis label): '{text}'")
                    break

        return zone_texts

    def _set_font_with_fallback(self, pdf_canvas: canvas.Canvas, font_size: float) -> bool:
        """Set the configured font on the canvas, trying fallback fonts on KeyError; return success."""
        try:
            pdf_canvas.setFont(self.font_name, font_size)
            return True
        except KeyError:
            # Configured font could not be set; try the fallback fonts below
            pass

        for fallback in self._FALLBACK_FONTS:
            try:
                pdf_canvas.setFont(fallback, font_size)
            except KeyError:
                continue
            if self.debug:
                logger.debug(f"Using fallback font: {fallback}")
            return True

        return False

    def render_text_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> Tuple[bool, str]:
        """
        Render a single OCR text region to the PDF canvas.

        Handles coordinate transformation from image coordinates (origin top-left)
        to PDF coordinates (origin bottom-left).

        Small rotation angles are straightened to produce clean output.
        Text overlapping with exclusion zones (images) is skipped.

        Deduplication logic (position-aware):
        - If text matches zone_texts AND is NEAR the zone (or is axis label),
          skip it to avoid duplicate chart labels
        - Text far from zones is rendered even if it matches zone content

        Args:
            pdf_canvas: ReportLab canvas to draw on
            region: Raw OCR region dict with 'text' and 'bbox'
            page_height: Height of the PDF page (for Y-flip)
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (dedupe only if near zone)

        Returns:
            Tuple of (success: bool, skip_reason: str)
            - success=True, skip_reason='' if rendered successfully
            - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped
              ('' means the region had no text or an incomplete bbox)
        """
        text, bbox = self._extract_text_and_bbox(region)

        if not text or len(bbox) < 4:
            return (False, '')

        # Check if text overlaps with exclusion zones (images/charts)
        if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones):
            if self.debug:
                logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap")
            return (False, 'overlap')

        # Check if text should be deduplicated based on position
        # Only skip if text matches zone content AND is near a zone (or is axis label)
        if zone_texts and text in zone_texts and exclusion_zones:
            for zone in exclusion_zones:
                # Check if it's an axis label for this zone
                if self.is_axis_label(bbox, zone):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone")
                    return (False, 'dedupe')
                # Check if it's near this zone (for zone-internal text deduplication)
                if self.is_near_zone(bbox, zone, margin=100.0):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone")
                    return (False, 'dedupe')

        try:
            # Calculate text properties
            rotation = self.calculate_rotation(bbox)
            font_size = self.estimate_font_size(bbox, text, scale_y)

            # Straighten small rotations for cleaner output
            # Only apply rotation for significant angles (e.g., 90° rotated text)
            if abs(rotation) < self.straighten_threshold:
                rotation = 0.0

            # Get left baseline point in image coordinates
            img_x, img_y = self.get_bbox_left_baseline(bbox)

            # Apply scaling, then convert to PDF coordinates (flip Y axis)
            pdf_x = img_x * scale_x
            pdf_y = page_height - img_y * scale_y

            pdf_canvas.saveState()
            try:
                if not self._set_font_with_fallback(pdf_canvas, font_size):
                    logger.warning("No available font found, skipping region")
                    return (False, 'error')

                pdf_canvas.setFillColor(black)

                # Apply rotation if needed (only for significant angles like 90°)
                if abs(rotation) > 0.5:
                    pdf_canvas.translate(pdf_x, pdf_y)
                    pdf_canvas.rotate(rotation)
                    pdf_canvas.drawString(0, 0, text)
                else:
                    pdf_canvas.drawString(pdf_x, pdf_y, text)
            finally:
                # Always balance saveState(): without this, a failure while
                # drawing would leave the translate/rotate transform active
                # for every region rendered afterwards.
                pdf_canvas.restoreState()

            if self.debug:
                logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), "
                             f"rot={rotation:.1f}°, size={font_size:.1f}pt")

            return (True, '')

        except Exception as e:
            # Best-effort per region: one bad region must not abort the page
            logger.warning(f"Failed to render text region: {e}")
            return (False, 'error')

    def render_all_regions(
        self,
        pdf_canvas: canvas.Canvas,
        regions: List[Dict],
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        page_filter: Optional[int] = None,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> int:
        """
        Render all OCR text regions to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas to draw on
            regions: List of raw OCR region dicts
            page_height: Height of the PDF page
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            page_filter: If set, only render regions whose 'page' value
                (default 0 when absent) equals this value
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (for position-aware deduplication)

        Returns:
            Number of regions successfully rendered
        """
        rendered_count = 0
        considered_count = 0  # regions that passed the page filter
        skipped_overlap = 0
        skipped_dedupe = 0

        for region in regions:
            # Filter by page if specified
            if page_filter is not None and region.get('page', 0) != page_filter:
                continue

            considered_count += 1

            success, skip_reason = self.render_text_region(
                pdf_canvas, region, page_height, scale_x, scale_y,
                exclusion_zones, zone_texts
            )

            if success:
                rendered_count += 1
            elif skip_reason == 'overlap':
                skipped_overlap += 1
            elif skip_reason == 'dedupe':
                skipped_dedupe += 1

        # Log results with skip counts. The denominator is the number of
        # regions actually considered, so it stays meaningful with page_filter.
        skip_parts = []
        if skipped_overlap > 0:
            skip_parts.append(f"{skipped_overlap} overlap")
        if skipped_dedupe > 0:
            skip_parts.append(f"{skipped_dedupe} dedupe")

        if skip_parts:
            logger.info(f"Rendered {rendered_count}/{considered_count} text regions "
                        f"(skipped: {', '.join(skip_parts)})")
        else:
            logger.info(f"Rendered {rendered_count}/{considered_count} text regions")

        return rendered_count


def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]:
    """
    Load raw OCR regions from the result directory.

    Args:
        result_dir: Path to the result directory
        task_id: Task ID
        page_num: Page number (1-indexed)

    Returns:
        List of raw OCR region dictionaries; an empty list if no file
        matches, the file cannot be read or parsed, or its top-level JSON
        value is not a list
    """
    result_path = Path(result_dir)

    # Use glob pattern to find raw OCR regions file
    # Filename format: {task_id}_{original_filename}_page_{page_num}_raw_ocr_regions.json
    # The original_filename varies based on uploaded file (e.g., scan, document, etc.)
    glob_pattern = f"{task_id}_*_page_{page_num}_raw_ocr_regions.json"

    # glob() yields matches in arbitrary order; sort so that the chosen file
    # is deterministic if more than one happens to match.
    matching_files = sorted(result_path.glob(glob_pattern))

    if not matching_files:
        logger.warning(f"Raw OCR regions file not found for task {task_id} page {page_num}. "
                       f"Glob pattern: {glob_pattern}")
        return []

    # Use the first matching file (there should only be one per page)
    file_path = matching_files[0]

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            regions = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        logger.error(f"Failed to load raw OCR regions from {file_path}: {e}")
        return []

    if not isinstance(regions, list):
        logger.error(f"Failed to load raw OCR regions from {file_path}: "
                     f"expected a JSON list, got {type(regions).__name__}")
        return []

    logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path.name}")
    return regions