Problem:
- OCR processes images at smaller resolutions, but the coordinates were being used directly on larger PDF canvases
- This caused all text/tables/images to be drawn at the wrong scale in the bottom-left corner

Solution:
- Track OCR image dimensions in the JSON output (ocr_dimensions)
- Calculate proper scale factors: scale_w = pdf_width/ocr_width, scale_h = pdf_height/ocr_height
- Apply scaling to all coordinates before drawing on the PDF canvas
- Support per-page scaling for multi-page PDFs

Changes:
1. ocr_service.py:
   - Capture OCR image dimensions using PIL
   - Include ocr_dimensions in the JSON output for both single images and PDFs
2. pdf_generator_service.py:
   - Calculate scale factors from OCR dimensions vs. target PDF dimensions
   - Update all drawing methods (text, table, image) to accept and apply scale factors
   - Apply scaling to bbox coordinates before the coordinate transformation
3. test_pdf_scaling.py:
   - Add a test script to verify scaling works correctly
   - Test with OCR at 500x700 scaled to a PDF at 1000x1400 (2x scaling)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
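A minimal sketch of the coordinate scaling described above, using the values from the test scenario (500x700 OCR image onto a 1000x1400 PDF canvas). The variable names and the sample bbox are illustrative only, not the service's API:

```python
# Illustrative sketch of the scaling fix, not the service code itself.
# OCR ran at 500x700 px; the target PDF canvas is 1000x1400 pt (the 2x test case).
ocr_w, ocr_h = 500, 700
pdf_w, pdf_h = 1000, 1400

scale_w = pdf_w / ocr_w   # 2.0
scale_h = pdf_h / ocr_h   # 2.0

# A PaddleOCR-style bbox: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], top-left origin.
bbox = [[100, 50], [300, 50], [300, 90], [100, 90]]

# Scale into PDF space first, then flip the Y axis (PDF origin is bottom-left).
x_left = bbox[0][0] * scale_w                  # 200.0
y_bottom_pdf = pdf_h - bbox[2][1] * scale_h    # 1400 - 180 = 1220.0
print(x_left, y_bottom_pdf)
```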
686 lines
26 KiB
Python
"""
|
|
Layout-Preserving PDF Generation Service
|
|
Generates PDF files that preserve the original document layout using OCR JSON data
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
from datetime import datetime
|
|
|
|
from reportlab.lib.pagesizes import A4, letter
|
|
from reportlab.lib.units import mm
|
|
from reportlab.pdfgen import canvas
|
|
from reportlab.pdfbase import pdfmetrics
|
|
from reportlab.pdfbase.ttfonts import TTFont
|
|
from reportlab.platypus import Table, TableStyle
|
|
from reportlab.lib import colors
|
|
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
|
from reportlab.platypus import Paragraph
|
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
|
from PIL import Image
|
|
from html.parser import HTMLParser
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HTMLTableParser(HTMLParser):
    """Parse HTML table to extract structure and data"""

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_table = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)

        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}

        elif tag == 'tr' and self.in_table:
            self.current_row = {'cells': []}

        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            colspan = int(attrs_dict.get('colspan', 1))
            rowspan = int(attrs_dict.get('rowspan', 1))
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': colspan,
                'rowspan': rowspan
            }

    def handle_endtag(self, tag):
        if tag == 'table' and self.in_table:
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False

        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def handle_data(self, data):
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '


class PDFGeneratorService:
    """Service for generating layout-preserving PDFs from OCR JSON data"""

    def __init__(self):
        """Initialize PDF generator with font configuration"""
        self.font_name = 'NotoSansSC'
        self.font_path = None
        self.font_registered = False

        self._register_chinese_font()

    def _register_chinese_font(self):
        """Register Chinese font for PDF generation"""
        try:
            # Get font path from settings
            font_path = Path(settings.chinese_font_path)

            # Try relative path from project root
            if not font_path.is_absolute():
                # Adjust path - settings.chinese_font_path starts with ./backend/
                project_root = Path(__file__).resolve().parent.parent.parent.parent
                font_path = project_root / font_path

            if not font_path.exists():
                logger.error(f"Chinese font not found at {font_path}")
                return

            # Register font
            pdfmetrics.registerFont(TTFont(self.font_name, str(font_path)))
            self.font_path = font_path
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {font_path}")

        except Exception as e:
            logger.error(f"Failed to register Chinese font: {e}")
            self.font_registered = False

    def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
        """
        Load and parse OCR JSON result file

        Args:
            json_path: Path to JSON file

        Returns:
            Parsed JSON data or None if failed
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            logger.info(f"Loaded OCR JSON: {json_path.name}")
            return data

        except Exception as e:
            logger.error(f"Failed to load JSON {json_path}: {e}")
            return None

    def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
        """
        Calculate page dimensions from source file or text region bounding boxes

        Args:
            text_regions: List of text regions with bbox coordinates
            source_file_path: Optional path to source file for accurate dimensions

        Returns:
            Tuple of (width, height) in points
        """
        # First try to get dimensions from source file
        if source_file_path:
            dims = self.get_original_page_size(source_file_path)
            if dims:
                return dims

        if not text_regions:
            return A4  # Default to A4 size

        max_x = 0
        max_y = 0

        for region in text_regions:
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                continue

            # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            for point in bbox:
                if isinstance(point, (list, tuple)) and len(point) >= 2:
                    x, y = point[0], point[1]
                    max_x = max(max_x, x)
                    max_y = max(max_y, y)

        # OCR coordinates are in pixels, use them directly as points (1:1 mapping)
        # Do NOT add padding - this causes layout issues
        width = max_x if max_x > 0 else A4[0]
        height = max_y if max_y > 0 else A4[1]

        logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
        return (width, height)

    def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
        """
        Extract page dimensions from original source file

        Args:
            file_path: Path to original file (image or PDF)

        Returns:
            Tuple of (width, height) in points or None
        """
        try:
            if not file_path.exists():
                return None

            # For images, get dimensions from PIL
            if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
                img = Image.open(file_path)
                # Use pixel dimensions directly as points (1:1 mapping)
                # This matches how PaddleOCR reports coordinates
                width_pt = float(img.width)
                height_pt = float(img.height)
                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
                return (width_pt, height_pt)

            # For PDFs, would need PyPDF2 or similar
            # For now, return None to use calculated dimensions

        except Exception as e:
            logger.warning(f"Failed to get page size from {file_path}: {e}")

        return None

    def draw_text_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw a text region at precise coordinates

        Args:
            pdf_canvas: ReportLab canvas object
            region: Text region dict with text, bbox, confidence
            page_height: Height of page (for coordinate transformation)
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        text = region.get('text', '')
        bbox = region.get('bbox', [])
        confidence = region.get('confidence', 1.0)

        if not text or not bbox or len(bbox) < 4:
            return

        try:
            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            # Points: top-left, top-right, bottom-right, bottom-left
            # OCR coordinates: origin (0,0) at top-left, Y increases downward
            ocr_x_left = bbox[0][0]  # Left X
            ocr_y_top = bbox[0][1]  # Top Y in OCR coordinates
            ocr_x_right = bbox[2][0]  # Right X
            ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates

            # Apply scale factors to convert from OCR space to PDF space
            ocr_x_left = ocr_x_left * scale_w
            ocr_y_top = ocr_y_top * scale_h
            ocr_x_right = ocr_x_right * scale_w
            ocr_y_bottom = ocr_y_bottom * scale_h

            # Calculate bbox dimensions (after scaling)
            bbox_width = abs(ocr_x_right - ocr_x_left)
            bbox_height = abs(ocr_y_bottom - ocr_y_top)

            # Calculate font size using heuristics
            # Font size is typically 70-90% of bbox height
            # Testing shows 0.75 works well for most cases
            font_size = bbox_height * 0.75
            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt

            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
            pdf_x = ocr_x_left
            pdf_y = page_height - ocr_y_bottom  # Flip Y-axis using bottom coordinate

            # Set font
            font_name = self.font_name if self.font_registered else 'Helvetica'
            pdf_canvas.setFont(font_name, font_size)

            # Calculate text width to prevent overflow
            text_width = pdf_canvas.stringWidth(text, font_name, font_size)

            # If text is too wide for bbox, scale down font
            if text_width > bbox_width:
                scale_factor = bbox_width / text_width
                font_size = font_size * scale_factor * 0.95  # 95% to add small margin
                font_size = max(font_size, 3)  # Minimum 3pt
                pdf_canvas.setFont(font_name, font_size)

            # Draw text at calculated position
            pdf_canvas.drawString(pdf_x, pdf_y, text)

            # Debug: Draw bounding box (optional)
            if settings.pdf_enable_bbox_debug:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
                # Transform all bbox points to PDF coordinates (apply scaling first)
                pdf_points = [(p[0] * scale_w, page_height - p[1] * scale_h) for p in bbox]
                # Draw bbox rectangle
                for i in range(4):
                    x1, y1 = pdf_points[i]
                    x2, y2 = pdf_points[(i + 1) % 4]
                    pdf_canvas.line(x1, y1, x2, y2)

        except Exception as e:
            logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")

    def draw_table_region(
        self,
        pdf_canvas: canvas.Canvas,
        table_element: Dict,
        images_metadata: List[Dict],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw a table region by parsing HTML and rebuilding with ReportLab Table

        Args:
            pdf_canvas: ReportLab canvas object
            table_element: Table element dict with HTML content
            images_metadata: List of image metadata to find table bbox
            page_height: Height of page
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        try:
            html_content = table_element.get('content', '')
            if not html_content:
                return

            # Parse HTML to extract table structure
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return

            # Get the first table (PP-StructureV3 usually provides one table per element)
            table_data = parser.tables[0]
            rows = table_data['rows']

            if not rows:
                return

            # Find corresponding table image to get bbox
            table_bbox = None
            for img_meta in images_metadata:
                img_path = img_meta.get('image_path', '')
                if 'table' in img_path.lower():
                    bbox = img_meta.get('bbox', [])
                    if bbox and len(bbox) >= 4:
                        table_bbox = bbox
                        break

            if not table_bbox:
                logger.warning("No bbox found for table")
                return

            # Extract bbox coordinates and apply scaling
            ocr_x_left = table_bbox[0][0] * scale_w
            ocr_y_top = table_bbox[0][1] * scale_h
            ocr_x_right = table_bbox[2][0] * scale_w
            ocr_y_bottom = table_bbox[2][1] * scale_h

            table_width = abs(ocr_x_right - ocr_x_left)
            table_height = abs(ocr_y_bottom - ocr_y_top)

            # Transform coordinates
            pdf_x = ocr_x_left
            pdf_y = page_height - ocr_y_bottom

            # Build table data for ReportLab
            # Convert parsed structure to simple 2D array
            max_cols = max(len(row['cells']) for row in rows)
            reportlab_data = []

            for row in rows:
                row_data = []
                for cell in row['cells']:
                    text = cell['text'].strip()
                    row_data.append(text)
                # Pad row if needed
                while len(row_data) < max_cols:
                    row_data.append('')
                reportlab_data.append(row_data)

            # Calculate column widths (equal distribution)
            col_widths = [table_width / max_cols] * max_cols

            # Create ReportLab Table
            # Use smaller font size to fit in bbox
            font_size = min(table_height / len(rows) * 0.5, 10)
            font_size = max(font_size, 6)

            # Create table with font
            table = Table(reportlab_data, colWidths=col_widths)

            # Apply table style
            style = TableStyle([
                ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('LEFTPADDING', (0, 0), (-1, -1), 2),
                ('RIGHTPADDING', (0, 0), (-1, -1), 2),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ])

            # Add header style if first row has headers
            if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
                style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
                style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)

            table.setStyle(style)

            # Calculate table size
            table.wrapOn(pdf_canvas, table_width, table_height)

            # Draw table at position
            table.drawOn(pdf_canvas, pdf_x, pdf_y)

            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")

        except Exception as e:
            logger.warning(f"Failed to draw table region: {e}")
            import traceback
            traceback.print_exc()

    def draw_image_region(
        self,
        pdf_canvas: canvas.Canvas,
        region: Dict,
        page_height: float,
        result_dir: Path,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ):
        """
        Draw an image region by embedding the extracted image

        Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.)

        Args:
            pdf_canvas: ReportLab canvas object
            region: Image metadata dict with image_path and bbox
            page_height: Height of page (for coordinate transformation)
            result_dir: Directory containing result files
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
        """
        try:
            image_path_str = region.get('image_path', '')
            if not image_path_str:
                return

            # Construct full path to image
            image_path = result_dir / image_path_str

            if not image_path.exists():
                logger.warning(f"Image not found: {image_path}")
                return

            # Get bbox for positioning
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                # If no bbox, skip for now
                logger.warning(f"No bbox for image {image_path_str}")
                return

            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            # OCR coordinates: origin (0,0) at top-left, Y increases downward
            ocr_x_left = bbox[0][0] * scale_w
            ocr_y_top = bbox[0][1] * scale_h
            ocr_x_right = bbox[2][0] * scale_w
            ocr_y_bottom = bbox[2][1] * scale_h

            # Calculate bbox dimensions (after scaling)
            bbox_width = abs(ocr_x_right - ocr_x_left)
            bbox_height = abs(ocr_y_bottom - ocr_y_top)

            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
            # For images, we position at bottom-left corner
            pdf_x_left = ocr_x_left
            pdf_y_bottom = page_height - ocr_y_bottom  # Flip Y-axis

            # Draw image using ReportLab
            # drawImage expects: (path, x, y, width, height)
            # where (x, y) is the bottom-left corner of the image
            pdf_canvas.drawImage(
                str(image_path),
                pdf_x_left,
                pdf_y_bottom,
                width=bbox_width,
                height=bbox_height,
                preserveAspectRatio=True,
                mask='auto'  # Handle transparency
            )

            logger.info(f"Drew image: {image_path_str} at ({pdf_x_left:.0f}, {pdf_y_bottom:.0f}) size {bbox_width:.0f}x{bbox_height:.0f}")

        except Exception as e:
            logger.warning(f"Failed to draw image region: {e}")

    def generate_layout_pdf(
        self,
        json_path: Path,
        output_path: Path,
        source_file_path: Optional[Path] = None
    ) -> bool:
        """
        Generate layout-preserving PDF from OCR JSON data

        Args:
            json_path: Path to OCR JSON file
            output_path: Path to save generated PDF
            source_file_path: Optional path to original source file for dimension extraction

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if PDF already exists (caching)
            if output_path.exists():
                logger.info(f"PDF already exists: {output_path.name}")
                return True

            # Load JSON data
            ocr_data = self.load_ocr_json(json_path)
            if not ocr_data:
                return False

            # Get text regions
            text_regions = ocr_data.get('text_regions', [])
            if not text_regions:
                logger.warning("No text regions found in JSON")
                return False

            # Get images metadata
            images_metadata = ocr_data.get('images_metadata', [])

            # Get layout data
            layout_data = ocr_data.get('layout_data', {})

            # Get OCR dimensions (the dimensions of images as processed by OCR)
            ocr_dimensions = ocr_data.get('ocr_dimensions')

            # Determine page dimensions
            page_size = self.calculate_page_dimensions(text_regions, source_file_path)

            page_width, page_height = page_size

            # Calculate scale factors if OCR dimensions are available
            # Default to 1.0 if no OCR dimensions (backward compatibility)
            scale_w = 1.0
            scale_h = 1.0

            if ocr_dimensions:
                # For single image
                if isinstance(ocr_dimensions, dict):
                    ocr_width = ocr_dimensions.get('width', page_width)
                    ocr_height = ocr_dimensions.get('height', page_height)
                    scale_w = page_width / ocr_width
                    scale_h = page_height / ocr_height
                    logger.info(f"Scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f} (OCR: {ocr_width}x{ocr_height}, PDF: {page_width}x{page_height})")
                # For multi-page PDF - we'll handle per-page scaling below
                elif isinstance(ocr_dimensions, list) and ocr_dimensions:
                    # Use first page dimensions as default
                    ocr_width = ocr_dimensions[0].get('width', page_width)
                    ocr_height = ocr_dimensions[0].get('height', page_height)
                    scale_w = page_width / ocr_width
                    scale_h = page_height / ocr_height
                    logger.info(f"Default scale factors - X: {scale_w:.3f}, Y: {scale_h:.3f}")

            # Create PDF canvas
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

            # Extract table bboxes to exclude text in those regions
            table_bboxes = []
            for img_meta in images_metadata:
                img_path = img_meta.get('image_path', '')
                if 'table' in img_path.lower():
                    bbox = img_meta.get('bbox', [])
                    if bbox and len(bbox) >= 4:
                        table_bboxes.append(bbox)

            # Helper function to check if a point is inside a bbox
            def point_in_bbox(x, y, bbox):
                x1, y1 = bbox[0]
                x2, y2 = bbox[2]
                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)

            # Filter text regions to exclude those inside tables
            filtered_text_regions = []
            for region in text_regions:
                bbox = region.get('bbox', [])
                if not bbox or len(bbox) < 4:
                    continue

                # Check if text region center is inside any table bbox
                center_x = (bbox[0][0] + bbox[2][0]) / 2
                center_y = (bbox[0][1] + bbox[2][1]) / 2

                is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)

                if not is_in_table:
                    filtered_text_regions.append(region)
                else:
                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")

            logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")

            # Group regions by page
            pages_data = {}
            for region in filtered_text_regions:
                page_num = region.get('page', 1)
                if page_num not in pages_data:
                    pages_data[page_num] = []
                pages_data[page_num].append(region)

            # Get table elements from layout_data
            table_elements = []
            if layout_data and layout_data.get('elements'):
                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']

            # Process each page
            total_pages = ocr_data.get('total_pages', 1)
            for page_num in range(1, total_pages + 1):
                if page_num > 1:
                    pdf_canvas.showPage()  # Start new page

                # Get scale factors for this page (for multi-page PDFs)
                page_scale_w = scale_w
                page_scale_h = scale_h
                if isinstance(ocr_dimensions, list) and ocr_dimensions:
                    # Find dimensions for this specific page
                    for dim_info in ocr_dimensions:
                        if dim_info.get('page') == page_num:
                            ocr_width = dim_info.get('width', page_width)
                            ocr_height = dim_info.get('height', page_height)
                            page_scale_w = page_width / ocr_width
                            page_scale_h = page_height / ocr_height
                            logger.info(f"Page {page_num} scale factors - X: {page_scale_w:.3f}, Y: {page_scale_h:.3f}")
                            break

                # Draw text regions for this page (excluding table text)
                page_regions = pages_data.get(page_num, [])
                for region in page_regions:
                    self.draw_text_region(pdf_canvas, region, page_height, page_scale_w, page_scale_h)

                # Draw tables for this page
                for table_elem in table_elements:
                    if table_elem.get('page', 0) == page_num - 1:  # page is 0-indexed
                        self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height, page_scale_w, page_scale_h)

                # Draw non-table images for this page (figure, chart, seal, etc.)
                for img_meta in images_metadata:
                    if img_meta.get('page') == page_num - 1:  # page is 0-indexed
                        img_path = img_meta.get('image_path', '')
                        # Skip table images (they're now rendered as tables)
                        if 'table' not in img_path.lower():
                            self.draw_image_region(
                                pdf_canvas,
                                img_meta,
                                page_height,
                                json_path.parent,
                                page_scale_w,
                                page_scale_h
                            )

            # Save PDF
            pdf_canvas.save()

            file_size = output_path.stat().st_size
            logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
            return True

        except Exception as e:
            logger.error(f"Failed to generate PDF: {e}")
            import traceback
            traceback.print_exc()
            return False


# Singleton instance
pdf_generator_service = PDFGeneratorService()
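For reference, a hypothetical invocation of the singleton (the import path and file paths below are placeholders, not taken from the repo):

```python
from pathlib import Path

# Assumed module location; adjust to wherever pdf_generator_service.py lives in the app package.
from app.services.pdf_generator_service import pdf_generator_service

ok = pdf_generator_service.generate_layout_pdf(
    json_path=Path("results/sample/ocr_result.json"),   # OCR JSON produced by ocr_service.py (placeholder path)
    output_path=Path("results/sample/layout.pdf"),       # PDF to generate (placeholder path)
    source_file_path=Path("uploads/sample.png"),         # optional; used for accurate page dimensions
)
print("generated" if ok else "failed")
```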
|