feat: implement layout-preserving PDF generation with table reconstruction
Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
|
||||
# Initialize OCR service
|
||||
ocr_service = OCRService()
|
||||
|
||||
# Create result directory before OCR processing (needed for saving extracted images)
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
result_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Process the file with OCR
|
||||
ocr_result = ocr_service.process_image(
|
||||
image_path=Path(file_path),
|
||||
lang='ch',
|
||||
detect_layout=True
|
||||
detect_layout=True,
|
||||
output_dir=result_dir
|
||||
)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
|
||||
|
||||
# Create result directory
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
result_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON result
|
||||
json_path = result_dir / f"{Path(filename).stem}_result.json"
|
||||
with open(json_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(ocr_result, f, ensure_ascii=False, indent=2)
|
||||
|
||||
# Save Markdown result
|
||||
markdown_path = result_dir / f"{Path(filename).stem}_result.md"
|
||||
markdown_content = ocr_result.get('markdown_content', '')
|
||||
with open(markdown_path, 'w', encoding='utf-8') as f:
|
||||
f.write(markdown_content)
|
||||
# Save results using OCR service (includes JSON, Markdown, and PDF generation)
|
||||
json_path, markdown_path, pdf_path = ocr_service.save_results(
|
||||
result=ocr_result,
|
||||
output_dir=result_dir,
|
||||
file_id=Path(filename).stem,
|
||||
source_file_path=Path(file_path)
|
||||
)
|
||||
|
||||
# Update task with results (direct database update)
|
||||
task.result_json_path = str(json_path)
|
||||
task.result_markdown_path = str(markdown_path)
|
||||
task.result_json_path = str(json_path) if json_path else None
|
||||
task.result_markdown_path = str(markdown_path) if markdown_path else None
|
||||
task.result_pdf_path = str(pdf_path) if pdf_path else None
|
||||
task.processing_time_ms = processing_time_ms
|
||||
task.status = TaskStatus.COMPLETED
|
||||
task.completed_at = datetime.utcnow()
|
||||
@@ -468,10 +467,16 @@ async def download_pdf(
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Download task result as searchable PDF file
|
||||
Download task result as layout-preserving PDF file
|
||||
|
||||
- **task_id**: Task UUID
|
||||
|
||||
Returns a PDF that preserves the original document layout using OCR results.
|
||||
The PDF is generated from OCR JSON data and cached for subsequent requests.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from app.services.pdf_generator_service import pdf_generator_service
|
||||
|
||||
# Get task
|
||||
task = task_service.get_task_by_id(
|
||||
db=db,
|
||||
@@ -485,12 +490,69 @@ async def download_pdf(
|
||||
detail="Task not found"
|
||||
)
|
||||
|
||||
# Check if task is completed
|
||||
if task.status.value != "completed":
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Task is not completed yet. Please wait for OCR processing to finish."
|
||||
)
|
||||
|
||||
# Check if PDF path is stored in database
|
||||
if task.result_pdf_path and Path(task.result_pdf_path).exists():
|
||||
pdf_path = Path(task.result_pdf_path)
|
||||
logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
|
||||
else:
|
||||
# Fallback: Try to generate PDF on-demand
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
|
||||
# Use stored JSON path or construct it
|
||||
if task.result_json_path and Path(task.result_json_path).exists():
|
||||
json_path = Path(task.result_json_path)
|
||||
else:
|
||||
# Try to find JSON file in result directory
|
||||
json_files = list(result_dir.glob("*_result.json"))
|
||||
if not json_files:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="OCR result JSON not found"
|
||||
)
|
||||
json_path = json_files[0]
|
||||
|
||||
# Construct PDF path based on JSON filename
|
||||
pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
|
||||
pdf_path = result_dir / pdf_filename
|
||||
|
||||
# Generate PDF if it doesn't exist
|
||||
if not pdf_path.exists():
|
||||
logger.info(f"Generating layout-preserving PDF for task {task_id}")
|
||||
|
||||
# Get source file path if available
|
||||
source_file = None
|
||||
task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
|
||||
if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
|
||||
source_file = Path(task_file.stored_path)
|
||||
|
||||
# Generate PDF
|
||||
success = pdf_generator_service.generate_layout_pdf(
|
||||
json_path=json_path,
|
||||
output_path=pdf_path,
|
||||
source_file_path=source_file
|
||||
)
|
||||
|
||||
if not success:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Failed to generate PDF. Please check server logs."
|
||||
)
|
||||
|
||||
logger.info(f"PDF generated successfully: {pdf_path.name}")
|
||||
|
||||
# Validate file access
|
||||
is_valid, error_msg = file_access_service.validate_file_access(
|
||||
db=db,
|
||||
user_id=current_user.id,
|
||||
task_id=task_id,
|
||||
file_path=task.result_pdf_path
|
||||
file_path=str(pdf_path)
|
||||
)
|
||||
|
||||
if not is_valid:
|
||||
@@ -502,7 +564,7 @@ async def download_pdf(
|
||||
# Return file
|
||||
filename = f"{task.filename or task_id}_result.pdf"
|
||||
return FileResponse(
|
||||
path=task.result_pdf_path,
|
||||
path=str(pdf_path),
|
||||
filename=filename,
|
||||
media_type="application/pdf"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user