feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions
--- a/backend/app/routers/tasks.py
+++ b/backend/app/routers/tasks.py
@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
        # Initialize OCR service
        ocr_service = OCRService()

+        # Create result directory before OCR processing (needed for saving extracted images)
+        result_dir = Path(settings.result_dir) / task_id
+        result_dir.mkdir(parents=True, exist_ok=True)
+
        # Process the file with OCR
        ocr_result = ocr_service.process_image(
            image_path=Path(file_path),
            lang='ch',
-            detect_layout=True
+            detect_layout=True,
+            output_dir=result_dir
        )

        # Calculate processing time
        processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)

-        # Create result directory
-        result_dir = Path(settings.result_dir) / task_id
-        result_dir.mkdir(parents=True, exist_ok=True)
-
-        # Save JSON result
-        json_path = result_dir / f"{Path(filename).stem}_result.json"
-        with open(json_path, 'w', encoding='utf-8') as f:
-            json.dump(ocr_result, f, ensure_ascii=False, indent=2)
-
-        # Save Markdown result
-        markdown_path = result_dir / f"{Path(filename).stem}_result.md"
-        markdown_content = ocr_result.get('markdown_content', '')
-        with open(markdown_path, 'w', encoding='utf-8') as f:
-            f.write(markdown_content)
+        # Save results using OCR service (includes JSON, Markdown, and PDF generation)
+        json_path, markdown_path, pdf_path = ocr_service.save_results(
+            result=ocr_result,
+            output_dir=result_dir,
+            file_id=Path(filename).stem,
+            source_file_path=Path(file_path)
+        )

        # Update task with results (direct database update)
-        task.result_json_path = str(json_path)
-        task.result_markdown_path = str(markdown_path)
+        task.result_json_path = str(json_path) if json_path else None
+        task.result_markdown_path = str(markdown_path) if markdown_path else None
+        task.result_pdf_path = str(pdf_path) if pdf_path else None
        task.processing_time_ms = processing_time_ms
        task.status = TaskStatus.COMPLETED
        task.completed_at = datetime.utcnow()
@@ -468,10 +467,16 @@ async def download_pdf(
    current_user: User = Depends(get_current_user)
 ):
    """
-    Download task result as searchable PDF file
+    Download task result as layout-preserving PDF file

    - **task_id**: Task UUID
+
+    Returns a PDF that preserves the original document layout using OCR results.
+    The PDF is generated from OCR JSON data and cached for subsequent requests.
    """
+    from pathlib import Path
+    from app.services.pdf_generator_service import pdf_generator_service
+
    # Get task
    task = task_service.get_task_by_id(
        db=db,
@@ -485,12 +490,69 @@ async def download_pdf(
            detail="Task not found"
        )

+    # Check if task is completed
+    if task.status.value != "completed":
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Task is not completed yet. Please wait for OCR processing to finish."
+        )
+
+    # Check if PDF path is stored in database
+    if task.result_pdf_path and Path(task.result_pdf_path).exists():
+        pdf_path = Path(task.result_pdf_path)
+        logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
+    else:
+        # Fallback: Try to generate PDF on-demand
+        result_dir = Path(settings.result_dir) / task_id
+
+        # Use stored JSON path, or fall back to searching the result directory
+        if task.result_json_path and Path(task.result_json_path).exists():
+            json_path = Path(task.result_json_path)
+        else:
+            # Try to find JSON file in result directory
+            json_files = list(result_dir.glob("*_result.json"))
+            if not json_files:
+                raise HTTPException(
+                    status_code=status.HTTP_404_NOT_FOUND,
+                    detail="OCR result JSON not found"
+                )
+            json_path = json_files[0]
+
+        # Construct PDF path based on JSON filename
+        pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
+        pdf_path = result_dir / pdf_filename
+
+        # Generate PDF if it doesn't exist
+        if not pdf_path.exists():
+            logger.info(f"Generating layout-preserving PDF for task {task_id}")
+
+            # Get source file path if available
+            source_file = None
+            task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
+            if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
+                source_file = Path(task_file.stored_path)
+
+            # Generate PDF
+            success = pdf_generator_service.generate_layout_pdf(
+                json_path=json_path,
+                output_path=pdf_path,
+                source_file_path=source_file
+            )
+
+            if not success:
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail="Failed to generate PDF. Please check server logs."
+                )
+
+            logger.info(f"PDF generated successfully: {pdf_path.name}")
+
    # Validate file access
    is_valid, error_msg = file_access_service.validate_file_access(
        db=db,
        user_id=current_user.id,
        task_id=task_id,
-        file_path=task.result_pdf_path
+        file_path=str(pdf_path)
    )

    if not is_valid:
@@ -502,7 +564,7 @@ async def download_pdf(
    # Return file
    filename = f"{task.filename or task_id}_result.pdf"
    return FileResponse(
-        path=task.result_pdf_path,
+        path=str(pdf_path),
        filename=filename,
        media_type="application/pdf"
    )