# OCR/backend/app/services/text_region_renderer.py

"""
Simple Text Region Renderer

Renders raw OCR text regions directly to PDF at their detected positions,
with rotation correction based on bbox quadrilateral geometry.

This approach bypasses complex table structure reconstruction and simply
places text at the positions detected by PaddleOCR.
"""

import math
import logging
from typing import Dict, List, Optional, Set, Tuple

from reportlab.pdfgen import canvas
from reportlab.lib.colors import black

from app.utils.bbox_utils import normalize_bbox

logger = logging.getLogger(__name__)


class TextRegionRenderer:
    """
    Render raw OCR text regions to PDF with position and rotation correction.

    This renderer takes the raw OCR output (text + quadrilateral bbox) and
    renders text at the correct position. Small rotation angles are ignored
    (straightened) to produce clean, aligned text output.
    """

    # Minimum font size to prevent illegible text
    MIN_FONT_SIZE = 6.0

    # Maximum font size to prevent oversized text
    MAX_FONT_SIZE = 72.0

    # Font size estimation factor (font height relative to bbox height)
    FONT_SIZE_FACTOR = 0.75

    # Rotation angle threshold - angles smaller than this are straightened to 0
    # This compensates for slight scan skew and produces cleaner output
    ROTATION_STRAIGHTEN_THRESHOLD = 10.0  # degrees

    # IoA (Intersection over Area) threshold for text-image overlap detection
    # If text bbox overlaps with image by more than this ratio, skip the text
    IOA_OVERLAP_THRESHOLD = 0.3  # 30% overlap

    def __init__(
        self,
        font_name: str = 'NotoSansSC',
        debug: bool = False,
        straighten_threshold: float = None,
        ioa_threshold: float = None
    ):
        """
        Initialize the text region renderer.

        Args:
            font_name: Name of the registered font to use
            debug: Enable debug logging
            straighten_threshold: Override rotation straightening threshold (degrees)
            ioa_threshold: Override IoA overlap threshold for text-image avoidance
        """
        self.font_name = font_name
        self.debug = debug
        self.straighten_threshold = straighten_threshold or self.ROTATION_STRAIGHTEN_THRESHOLD
        self.ioa_threshold = ioa_threshold or self.IOA_OVERLAP_THRESHOLD

    def calculate_rotation(self, bbox: List[List[float]]) -> float:
        """
        Calculate text rotation angle from bbox quadrilateral.

        The bbox is a quadrilateral with 4 corner points in order:
        [top-left, top-right, bottom-right, bottom-left]

        Returns angle in degrees (counter-clockwise from horizontal).
        Positive angle means text is tilted upward to the right.

        NOTE: Small angles (< straighten_threshold) will be treated as 0
        during rendering to produce clean, aligned output.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Rotation angle in degrees
        """
        if len(bbox) < 2:
            return 0.0

        # Top-left to top-right vector (top edge)
        dx = bbox[1][0] - bbox[0][0]
        dy = bbox[1][1] - bbox[0][1]

        # Calculate angle (atan2 returns radians, convert to degrees)
        # Note: In image coordinates, Y increases downward
        # We negate dy to get the conventional angle
        angle_rad = math.atan2(-dy, dx)
        angle_deg = math.degrees(angle_rad)

        if self.debug:
            logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°")

        return angle_deg

    def estimate_font_size(
        self,
        bbox: List[List[float]],
        text: str,
        scale_factor: float = 1.0
    ) -> float:
        """
        Estimate appropriate font size from bbox dimensions.

        Uses the bbox height as the primary indicator, with adjustment
        for the typical font-to-bbox ratio.

        Args:
            bbox: List of 4 [x, y] coordinate pairs
            text: The text content (for width-based adjustments)
            scale_factor: Coordinate scaling factor

        Returns:
            Estimated font size in points
        """
        if len(bbox) < 4:
            return 12.0  # Default font size

        # Calculate bbox height (average of left and right edges)
        left_height = math.dist(bbox[0], bbox[3])
        right_height = math.dist(bbox[1], bbox[2])
        avg_height = (left_height + right_height) / 2

        # Apply scale factor and font size ratio
        font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR

        # Clamp to reasonable range
        font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))

        if self.debug:
            logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, "
                        f"scale={scale_factor:.3f}, font={font_size:.1f}pt")

        return font_size

    def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]:
        """
        Calculate the center point of a bbox quadrilateral.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (center_x, center_y)
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        center_x = sum(p[0] for p in bbox) / 4
        center_y = sum(p[1] for p in bbox) / 4
        return (center_x, center_y)

    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).
        Uses shared bbox utility.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates
        """
        result = normalize_bbox(bbox)
        return result if result else (0.0, 0.0, 0.0, 0.0)

    def get_bbox_left_baseline(
        self,
        bbox: List[List[float]]
    ) -> Tuple[float, float]:
        """
        Get the left baseline point for text rendering.

        For left-aligned text, we use the bottom-left corner as the
        baseline starting point (text baseline is at the bottom).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x, y) for the left baseline point
        """
        if len(bbox) < 4:
            return (0.0, 0.0)

        # Use bottom-left corner for baseline
        # bbox[3] is bottom-left in the standard ordering
        x = bbox[3][0]
        y = bbox[3][1]

        return (x, y)

    def calculate_ioa(
        self,
        text_rect: Tuple[float, float, float, float],
        image_rect: Tuple[float, float, float, float]
    ) -> float:
        """
        Calculate Intersection over Area (IoA) of text bbox with image bbox.

        IoA = intersection_area / text_area

        This measures how much of the text region overlaps with the image.

        Args:
            text_rect: Text bbox as (x0, y0, x1, y1)
            image_rect: Image bbox as (x0, y0, x1, y1)

        Returns:
            IoA ratio (0.0 to 1.0)
        """
        tx0, ty0, tx1, ty1 = text_rect
        ix0, iy0, ix1, iy1 = image_rect

        # Calculate text area
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0

        # Calculate intersection
        inter_x0 = max(tx0, ix0)
        inter_y0 = max(ty0, iy0)
        inter_x1 = min(tx1, ix1)
        inter_y1 = min(ty1, iy1)

        if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
            return 0.0  # No intersection

        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area

    def is_overlapping_exclusion_zones(
        self,
        bbox: List[List[float]],
        exclusion_zones: List[Tuple[float, float, float, float]]
    ) -> bool:
        """
        Check if text bbox overlaps significantly with any exclusion zone.

        Args:
            bbox: Text bbox as quadrilateral
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid

        Returns:
            True if text should be skipped due to overlap
        """
        if not exclusion_zones:
            return False

        text_rect = self.get_bbox_as_rect(bbox)

        for zone in exclusion_zones:
            ioa = self.calculate_ioa(text_rect, zone)
            if ioa >= self.ioa_threshold:
                if self.debug:
                    logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}")
                return True

        return False

    def is_inside_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        threshold: float = 0.5
    ) -> bool:
        """
        Check if text bbox is inside a zone (for collecting chart texts).

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            threshold: Minimum IoA to consider "inside"

        Returns:
            True if text is inside the zone
        """
        text_rect = self.get_bbox_as_rect(bbox)
        ioa = self.calculate_ioa(text_rect, zone)
        return ioa >= threshold

    def is_axis_label(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 50.0
    ) -> bool:
        """
        Check if text bbox is an axis label for a chart/image zone.

        Axis labels are typically:
        - Vertical text to the LEFT of the chart (Y-axis label)
        - Horizontal text BELOW the chart (X-axis label)

        Args:
            bbox: Text bbox as quadrilateral
            zone: Chart/image zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone edge to be considered axis label

        Returns:
            True if text appears to be an axis label for this zone
        """
        if len(bbox) < 4:
            return False

        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone

        # Calculate text dimensions
        text_width = tx1 - tx0
        text_height = ty1 - ty0

        # Check for Y-axis label: vertical text to the LEFT of zone
        # - Text is to the left of zone (tx1 <= zx0 + small overlap)
        # - Text's Y range overlaps with zone's Y range
        # - Text is taller than wide (aspect ratio > 2) OR very narrow
        is_left_of_zone = tx1 <= zx0 + margin and tx1 >= zx0 - margin
        y_overlaps = not (ty1 < zy0 or ty0 > zy1)
        is_vertical_text = text_height > text_width * 2

        if is_left_of_zone and y_overlaps and is_vertical_text:
            if self.debug:
                logger.debug(f"Detected Y-axis label: text is left of zone, vertical")
            return True

        # Check for X-axis label: horizontal text BELOW the zone
        # - Text is below zone (ty0 >= zy1 - small overlap)
        # - Text's X range overlaps with zone's X range
        # - Text is wider than tall (normal horizontal text)
        is_below_zone = ty0 >= zy1 - margin and ty0 <= zy1 + margin
        x_overlaps = not (tx1 < zx0 or tx0 > zx1)
        is_horizontal_text = text_width > text_height

        if is_below_zone and x_overlaps and is_horizontal_text:
            if self.debug:
                logger.debug(f"Detected X-axis label: text is below zone, horizontal")
            return True

        return False

    def is_near_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 100.0
    ) -> bool:
        """
        Check if text bbox is near (within margin) of a zone.

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone to be considered "near"

        Returns:
            True if text is near the zone
        """
        if len(bbox) < 4:
            return False

        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone

        # Expand zone by margin
        expanded_zone = (zx0 - margin, zy0 - margin, zx1 + margin, zy1 + margin)

        # Check if text overlaps with expanded zone
        ex0, ey0, ex1, ey1 = expanded_zone
        return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1)

    def collect_zone_texts(
        self,
        regions: List[Dict],
        zones: List[Tuple[float, float, float, float]],
        threshold: float = 0.5,
        include_axis_labels: bool = True
    ) -> Set[str]:
        """
        Collect text content from regions inside zones or identified as axis labels.

        This set is used during rendering for position-aware deduplication:
        - Text that matches this set AND is near a zone will be skipped
        - Text that matches but is far from zones will still be rendered

        Args:
            regions: List of raw OCR region dicts
            zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes)
            threshold: Minimum IoA to consider text as "inside" zone
            include_axis_labels: Also collect axis labels adjacent to zones

        Returns:
            Set of text strings found inside zones or as axis labels
        """
        zone_texts = set()

        for region in regions:
            text = region.get('text', '').strip()
            bbox = region.get('bbox', [])

            if not text or len(bbox) < 4:
                continue

            for zone in zones:
                # Check if inside zone
                if self.is_inside_zone(bbox, zone, threshold):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (inside): '{text}'")
                    break

                # Check if it's an axis label
                if include_axis_labels and self.is_axis_label(bbox, zone):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (axis label): '{text}'")
                    break

        return zone_texts

    def render_text_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        exclusion_zones: List[Tuple[float, float, float, float]] = None,
        zone_texts: Set[str] = None
    ) -> Tuple[bool, str]:
        """
        Render a single OCR text region to the PDF canvas.

        Handles coordinate transformation from image coordinates (origin top-left)
        to PDF coordinates (origin bottom-left).

        Small rotation angles are straightened to produce clean output.
        Text overlapping with exclusion zones (images) is skipped.

        Deduplication logic (position-aware):
        - If text matches zone_texts AND is NEAR the zone (or is axis label),
          skip it to avoid duplicate chart labels
        - Text far from zones is rendered even if it matches zone content

        Args:
            pdf_canvas: ReportLab canvas to draw on
            region: Raw OCR region dict with 'text' and 'bbox'
            page_height: Height of the PDF page (for Y-flip)
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (dedupe only if near zone)

        Returns:
            Tuple of (success: bool, skip_reason: str)
            - success=True, skip_reason='' if rendered successfully
            - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped
        """
        text = region.get('text', '').strip()
        bbox = region.get('bbox', [])

        if not text or len(bbox) < 4:
            return (False, '')

        # Check if text overlaps with exclusion zones (images/charts)
        if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones):
            if self.debug:
                logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap")
            return (False, 'overlap')

        # Check if text should be deduplicated based on position
        # Only skip if text matches zone content AND is near a zone (or is axis label)
        if zone_texts and text in zone_texts and exclusion_zones:
            for zone in exclusion_zones:
                # Check if it's an axis label for this zone
                if self.is_axis_label(bbox, zone):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone")
                    return (False, 'dedupe')
                # Check if it's near this zone (for zone-internal text deduplication)
                if self.is_near_zone(bbox, zone, margin=100.0):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone")
                    return (False, 'dedupe')

        try:
            # Calculate text properties
            rotation = self.calculate_rotation(bbox)
            font_size = self.estimate_font_size(bbox, text, scale_y)

            # Straighten small rotations for cleaner output
            # Only apply rotation for significant angles (e.g., 90° rotated text)
            if abs(rotation) < self.straighten_threshold:
                rotation = 0.0

            # Get left baseline point in image coordinates
            img_x, img_y = self.get_bbox_left_baseline(bbox)

            # Apply scaling
            scaled_x = img_x * scale_x
            scaled_y = img_y * scale_y

            # Convert to PDF coordinates (flip Y axis)
            pdf_x = scaled_x
            pdf_y = page_height - scaled_y

            # Save canvas state
            pdf_canvas.saveState()

            # Try to set font with fallback
            try:
                pdf_canvas.setFont(self.font_name, font_size)
            except KeyError:
                # Font not registered, try fallback fonts
                fallback_fonts = ['Helvetica', 'Times-Roman', 'Courier']
                font_set = False
                for fallback in fallback_fonts:
                    try:
                        pdf_canvas.setFont(fallback, font_size)
                        font_set = True
                        if self.debug:
                            logger.debug(f"Using fallback font: {fallback}")
                        break
                    except KeyError:
                        continue
                if not font_set:
                    logger.warning(f"No available font found, skipping region")
                    pdf_canvas.restoreState()
                    return (False, 'error')

            pdf_canvas.setFillColor(black)

            # Apply rotation if needed (only for significant angles like 90°)
            if abs(rotation) > 0.5:
                pdf_canvas.translate(pdf_x, pdf_y)
                pdf_canvas.rotate(rotation)
                pdf_canvas.drawString(0, 0, text)
            else:
                pdf_canvas.drawString(pdf_x, pdf_y, text)

            # Restore canvas state
            pdf_canvas.restoreState()

            if self.debug:
                logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), "
                            f"rot={rotation:.1f}°, size={font_size:.1f}pt")

            return (True, '')

        except Exception as e:
            logger.warning(f"Failed to render text region: {e}")
            return (False, 'error')

    def render_all_regions(
        self,
        pdf_canvas: canvas.Canvas,
        regions: List[Dict],
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        page_filter: Optional[int] = None,
        exclusion_zones: List[Tuple[float, float, float, float]] = None,
        zone_texts: Set[str] = None
    ) -> int:
        """
        Render all OCR text regions to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas to draw on
            regions: List of raw OCR region dicts
            page_height: Height of the PDF page
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            page_filter: If set, only render regions for this page index
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (for position-aware deduplication)

        Returns:
            Number of regions successfully rendered
        """
        rendered_count = 0
        skipped_overlap = 0
        skipped_dedupe = 0

        for region in regions:
            # Filter by page if specified
            if page_filter is not None:
                region_page = region.get('page', 0)
                if region_page != page_filter:
                    continue

            success, skip_reason = self.render_text_region(
                pdf_canvas, region, page_height, scale_x, scale_y,
                exclusion_zones, zone_texts
            )

            if success:
                rendered_count += 1
            elif skip_reason == 'overlap':
                skipped_overlap += 1
            elif skip_reason == 'dedupe':
                skipped_dedupe += 1

        # Log results with skip counts
        total_processed = rendered_count + skipped_overlap + skipped_dedupe
        skip_parts = []
        if skipped_overlap > 0:
            skip_parts.append(f"{skipped_overlap} overlap")
        if skipped_dedupe > 0:
            skip_parts.append(f"{skipped_dedupe} dedupe")

        if skip_parts:
            logger.info(f"Rendered {rendered_count}/{total_processed} text regions "
                       f"(skipped: {', '.join(skip_parts)})")
        else:
            logger.info(f"Rendered {rendered_count}/{len(regions)} text regions")

        return rendered_count


def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]:
    """
    Load raw OCR regions from the result directory.

    The file is located with a glob because its name embeds the original
    upload filename:
    ``{task_id}_{original_filename}_page_{page_num}_raw_ocr_regions.json``.
    If several files match, the lexicographically first path is used and a
    warning is logged, so the choice is deterministic.

    This loader is best-effort: a missing file, an unreadable or malformed
    file, or a JSON payload that is not a list all produce an empty list and
    a log entry instead of an exception.

    Args:
        result_dir: Path to the result directory
        task_id: Task ID
        page_num: Page number (1-indexed)

    Returns:
        List of raw OCR region dictionaries (empty on any failure)
    """
    from pathlib import Path
    import json

    result_path = Path(result_dir)

    # The original_filename part varies with the uploaded file (e.g., scan,
    # document, etc.), hence the wildcard.
    glob_pattern = f"{task_id}_*_page_{page_num}_raw_ocr_regions.json"

    # Path.glob yields matches in arbitrary order; sort so that the file
    # picked below does not depend on directory iteration order.
    matching_files = sorted(result_path.glob(glob_pattern))

    if not matching_files:
        logger.warning(f"Raw OCR regions file not found for task {task_id} page {page_num}. "
                       f"Glob pattern: {glob_pattern}")
        return []

    file_path = matching_files[0]
    if len(matching_files) > 1:
        # There should only be one file per page; make the ambiguity visible.
        logger.warning(f"Multiple raw OCR regions files match {glob_pattern}; "
                       f"using {file_path.name}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            regions = json.load(f)
    except Exception as e:
        # Deliberately broad: any read/parse failure degrades to "no regions".
        logger.error(f"Failed to load raw OCR regions from {file_path}: {e}")
        return []

    if not isinstance(regions, list):
        # Callers iterate the result as a list of dicts; reject other payloads
        # here instead of letting them fail later in rendering.
        logger.error(f"Failed to load raw OCR regions from {file_path}: "
                     f"expected a JSON list, got {type(regions).__name__}")
        return []

    logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path.name}")
    return regions