feat: unify Direct Track PDF rendering and simplify export options

Backend changes:
- Apply background image + invisible text layer to all Direct Track PDFs
- Add CHART to regions_to_avoid for text extraction
- Improve visual fidelity for native PDFs and Office documents

Frontend changes:
- Remove JSON, UnifiedDocument, Markdown download buttons
- Simplify to 2-column layout with only Layout PDF and Reflow PDF
- Remove translation JSON download and Layout PDF option
- Keep only Reflow PDF for translated document downloads
- Clean up unused imports (FileJson, Database, FileOutput)

Archives two OpenSpec proposals:
- unify-direct-track-pdf-rendering
- simplify-frontend-export-options

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-12 07:50:43 +08:00
parent 53bfa88773
commit 24253ac15e
15 changed files with 891 additions and 195 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -709,7 +709,8 @@ class PDFGeneratorService:
        self,
        unified_doc: 'UnifiedDocument',
        output_path: Path,
-        source_file_path: Optional[Path] = None
+        source_file_path: Optional[Path] = None,
+        result_dir: Optional[Path] = None
    ) -> bool:
        """
        Generate layout-preserving PDF directly from UnifiedDocument.
@@ -721,6 +722,7 @@ class PDFGeneratorService:
            unified_doc: UnifiedDocument object
            output_path: Path to save generated PDF
            source_file_path: Optional path to original source file
+            result_dir: Optional path to result directory (for finding converted PDFs)

        Returns:
            True if successful, False otherwise
@@ -751,7 +753,8 @@ class PDFGeneratorService:
                return self._generate_direct_track_pdf(
                    unified_doc=unified_doc,
                    output_path=output_path,
-                    source_file_path=source_file_path
+                    source_file_path=source_file_path,
+                    result_dir=result_dir
                )
            else:
                # OCR track: Simplified rendering (backward compatible)
@@ -823,7 +826,8 @@ class PDFGeneratorService:
        self,
        unified_doc: 'UnifiedDocument',
        output_path: Path,
-        source_file_path: Optional[Path] = None
+        source_file_path: Optional[Path] = None,
+        result_dir: Optional[Path] = None
    ) -> bool:
        """
        Generate PDF with rich formatting preservation for Direct track.
@@ -836,6 +840,7 @@ class PDFGeneratorService:
            unified_doc: UnifiedDocument from Direct extraction
            output_path: Path to save generated PDF
            source_file_path: Optional path to original source file
+            result_dir: Optional path to result directory (for finding converted PDFs)

        Returns:
            True if successful, False otherwise
@@ -865,6 +870,55 @@ class PDFGeneratorService:
            from reportlab.pdfgen import canvas
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

+            # For ALL Direct Track documents, render source page as background image
+            # This preserves visual fidelity (vector graphics, charts, complex layouts)
+            # and overlays invisible text layer for searchability/translation
+            use_background_rendering = (
+                self.current_processing_track == ProcessingTrack.DIRECT or
+                self.current_processing_track == ProcessingTrack.HYBRID
+            )
+            source_pdf = None
+
+            if use_background_rendering:
+                # Find the source PDF for background rendering
+                # For Office documents: source_file_path points to .pptx/.docx, need converted PDF
+                # For native PDFs: source_file_path should be the PDF itself
+                actual_source_pdf = None
+
+                # Use provided result_dir, or fall back to output_path.parent
+                search_dir = result_dir if result_dir else output_path.parent
+                if search_dir.exists():
+                    # Look for PDF files that match the pattern: {task_id}_{name}.pdf
+                    pdf_files = list(search_dir.glob('*.pdf'))
+                    # Filter out layout/output PDFs
+                    source_pdfs = [
+                        f for f in pdf_files
+                        if not f.name.endswith('_layout.pdf')
+                        and not f.name.endswith('_reflow.pdf')
+                        and f.name != output_path.name
+                    ]
+                    if source_pdfs:
+                        actual_source_pdf = source_pdfs[0]
+                        logger.debug(f"Found converted PDF in result dir: {actual_source_pdf.name}")
+
+                # Fallback: use source_file_path if it's a PDF
+                if not actual_source_pdf and source_file_path and source_file_path.exists():
+                    if source_file_path.suffix.lower() == '.pdf':
+                        actual_source_pdf = source_file_path
+
+                if actual_source_pdf and actual_source_pdf.exists():
+                    try:
+                        import fitz
+                        source_pdf = fitz.open(str(actual_source_pdf))
+                        logger.info(f"Direct Track: will render source pages as background from: {actual_source_pdf.name}")
+                    except Exception as e:
+                        logger.warning(f"Failed to open source PDF for background rendering: {e}")
+                        use_background_rendering = False
+                        source_pdf = None
+                else:
+                    logger.warning(f"Direct Track: no source PDF found in {search_dir}, skipping background rendering")
+                    use_background_rendering = False
+
            # Process each page
            for page_idx, page in enumerate(unified_doc.pages):
                logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}")
@@ -880,6 +934,42 @@ class PDFGeneratorService:
                # Set page size for current page
                pdf_canvas.setPageSize((current_page_width, current_page_height))

+                # For Direct Track: render source page as background image
+                # This preserves all visual content (vector graphics, shapes, charts)
+                rendered_background = False
+                if use_background_rendering and source_pdf and page_idx < len(source_pdf):
+                    try:
+                        source_page = source_pdf[page_idx]
+                        # Render at 2x resolution for quality
+                        mat = fitz.Matrix(2.0, 2.0)
+                        pix = source_page.get_pixmap(matrix=mat, alpha=False)
+
+                        # Save to temporary file
+                        import tempfile
+                        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
+                            pix.save(tmp.name)
+                            temp_bg_path = tmp.name
+
+                        # Draw background image (full page)
+                        from reportlab.lib.utils import ImageReader
+                        bg_img = ImageReader(temp_bg_path)
+                        pdf_canvas.drawImage(
+                            bg_img,
+                            0, 0,
+                            width=current_page_width,
+                            height=current_page_height,
+                            preserveAspectRatio=False
+                        )
+                        rendered_background = True
+                        logger.info(f"  Rendered source page {page_idx + 1} as background image")
+
+                        # Clean up temp file
+                        import os
+                        os.unlink(temp_bg_path)
+                    except Exception as e:
+                        logger.warning(f"Failed to render background for page {page_idx + 1}: {e}")
+                        rendered_background = False
+
                # Separate elements by type
                text_elements = []
                table_elements = []
@@ -918,22 +1008,28 @@ class PDFGeneratorService:
                                    continue

                        image_elements.append(element)
-                        # Only add real images to exclusion regions, NOT charts/diagrams
-                        # Charts often have large bounding boxes that include text labels
-                        # which should be rendered as selectable text on top
-                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
-                            # Check if this is Direct track (text from PDF text layer, not OCR)
-                            is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or
-                                        self.current_processing_track == ProcessingTrack.HYBRID)

-                            if is_direct:
-                                # Direct track: text is from PDF text layer, not OCR'd from images
-                                # Don't exclude any images - text should be rendered on top
-                                # This is critical for Office documents with background images
+                        # Check if this is Direct track (text from PDF text layer, not OCR)
+                        is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or
+                                    self.current_processing_track == ProcessingTrack.HYBRID)
+
+                        # For Direct Track with background rendering:
+                        # - CHART regions should be excluded from text layer (chart text already in background)
+                        # - Other images don't need exclusion (text rendered as invisible overlay)
+                        if is_direct:
+                            if element.type == ElementType.CHART:
+                                # Add chart to exclusion regions - chart-internal text should NOT be
+                                # in the invisible text layer (already visible in background image)
+                                regions_to_avoid.append(element)
+                                logger.debug(f"Direct track: excluding CHART {element.element_id} - text inside chart not needed")
+                            else:
+                                # Other image types: don't exclude, text will be invisible overlay
                                logger.debug(f"Direct track: not excluding {element.element_id} from text regions")
-                                continue
+                            continue

-                            # OCR track: Skip full-page background images from exclusion regions
+                        # OCR track: Handle image exclusion for text rendered on images
+                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
+                            # Skip full-page background images from exclusion regions
                            # Smaller images that might contain OCR'd text should still be excluded
                            if element.bbox:
                                elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
@@ -965,23 +1061,20 @@ class PDFGeneratorService:
                           f"{len(table_elements)} tables, {len(image_elements)} images, "
                           f"{len(list_elements)} list items")

-                # Use original element order from extraction engine
-                # The extraction engine has already sorted elements by reading order,
-                # handling multi-column layouts correctly (top-to-bottom, left-to-right)
-                all_elements = []
+                # FIX: Render in proper z-order for Office/PPT documents
+                # Images (backgrounds) must be rendered FIRST, then tables, then text on top
+                # This ensures white text on dark backgrounds is visible

-                # Preserve original order by iterating through page.elements
-                for elem in page.elements:
-                    if elem in image_elements:
-                        all_elements.append(('image', elem))
-                    elif elem in table_elements:
-                        all_elements.append(('table', elem))
-                    elif elem in list_elements:
-                        all_elements.append(('list', elem))
-                    elif elem in text_elements:
-                        all_elements.append(('text', elem))
+                # Sort images by area (largest first = background images)
+                def get_element_area(elem):
+                    if elem.bbox:
+                        return (elem.bbox.x1 - elem.bbox.x0) * (elem.bbox.y1 - elem.bbox.y0)
+                    return 0

-                logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)")
+                sorted_images = sorted(image_elements, key=get_element_area, reverse=True)
+
+                logger.debug(f"Rendering order: {len(sorted_images)} images (largest first), "
+                           f"{len(table_elements)} tables, {len(text_elements)+len(list_elements)} text elements")
                logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)")

                # Debug: Log exclusion region types
@@ -992,29 +1085,61 @@ class PDFGeneratorService:
                if region_types:
                    logger.debug(f"  Exclusion region breakdown: {region_types}")

-                # Draw elements in document order
-                for elem_type, elem in all_elements:
-                    if elem_type == 'image':
+                # Step 1: Draw images (backgrounds)
+                # Skip if the source page was already rendered as the background (any Direct/Hybrid track document)
+                if rendered_background:
+                    logger.debug(f"  Skipping {len(sorted_images)} individual images - background already rendered")
+                else:
+                    # Larger images (backgrounds) are drawn first, smaller images on top
+                    for elem in sorted_images:
                        self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent)
-                    elif elem_type == 'table':
+
+                # When the source page was rendered as a full-page background (any Direct/Hybrid track document):
+                # - Skip tables (already visible in background image)
+                # - Draw text as INVISIBLE layer (for searchability/translation, but no visual overlap)
+                if rendered_background:
+                    logger.debug(f"  Skipping {len(table_elements)} tables - already in background")
+                    logger.debug(f"  Drawing {len(text_elements)+len(list_elements)} text elements as invisible layer")
+
+                    # Set text rendering mode to invisible (mode 3)
+                    # This makes text selectable/searchable but not visible
+                    pdf_canvas._code.append('3 Tr')  # Text render mode: invisible
+
+                    for elem in page.elements:
+                        if elem in list_elements or elem in text_elements:
+                            self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
+
+                    # Reset text rendering mode to normal
+                    pdf_canvas._code.append('0 Tr')  # Text render mode: fill
+                else:
+                    # Step 2: Draw tables
+                    for elem in table_elements:
                        self._draw_table_element_direct(pdf_canvas, elem, current_page_height)
-                    elif elem_type == 'list':
-                        # FIX: Check if list item overlaps with table/image
-                        if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
-                            self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
-                        else:
-                            logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
-                    elif elem_type == 'text':
-                        # FIX: Check if text overlaps with table/image before drawing
-                        if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
-                            self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
-                        else:
-                            logger.debug(f"Skipping text element {elem.element_id} inside table/image region")
+
+                    # Step 3: Draw text and list elements (on top of images/tables)
+                    # Use original document order for reading flow
+                    for elem in page.elements:
+                        if elem in list_elements:
+                            # Check if list item overlaps with table/image
+                            if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
+                                self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
+                            else:
+                                logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
+                        elif elem in text_elements:
+                            # Check if text overlaps with table/image before drawing
+                            if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
+                                self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
+                            else:
+                                logger.debug(f"Skipping text element {elem.element_id} inside table/image region")

            # Save PDF
            pdf_canvas.save()
            logger.info(f"Direct track PDF saved to {output_path}")

+            # Close source PDF if opened
+            if source_pdf:
+                source_pdf.close()
+
            # Reset track
            self.current_processing_track = None
            return True
@@ -1023,6 +1148,12 @@ class PDFGeneratorService:
            logger.error(f"Failed to generate Direct track PDF: {e}")
            import traceback
            traceback.print_exc()
+            # Clean up source PDF on error
+            if source_pdf:
+                try:
+                    source_pdf.close()
+                except:
+                    pass
            self.current_processing_track = None
            return False

@@ -3249,7 +3380,8 @@ class PDFGeneratorService:
                    return self.generate_from_unified_document(
                        unified_doc=unified_doc,
                        output_path=output_path,
-                        source_file_path=source_file_path
+                        source_file_path=source_file_path,
+                        result_dir=json_path.parent  # Pass result dir for finding converted PDFs
                    )
                else:
                    logger.error("Failed to convert JSON to UnifiedDocument")
@@ -3309,6 +3441,7 @@ class PDFGeneratorService:
                keywords=metadata_dict.get('keywords'),
                producer=metadata_dict.get('producer'),
                creator=metadata_dict.get('creator'),
+                original_filename=metadata_dict.get('original_filename'),  # For Office document detection
                creation_date=datetime.fromisoformat(metadata_dict['creation_date'].replace('Z', '+00:00')) if metadata_dict.get('creation_date') else None,
                modification_date=datetime.fromisoformat(metadata_dict['modification_date'].replace('Z', '+00:00')) if metadata_dict.get('modification_date') else None,
            )