feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Automatically filter out text regions that fall inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left origin) → PDF (bottom-left origin)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract each image's bbox from its filename

2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)

3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from the OCR result JSON when no stored PDF exists

4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads

2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download

3. main.tsx
   - Configure PDF.js worker via CDN

4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions

View File

@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
# Initialize OCR service
ocr_service = OCRService()
# Create result directory before OCR processing (needed for saving extracted images)
result_dir = Path(settings.result_dir) / task_id
result_dir.mkdir(parents=True, exist_ok=True)
# Process the file with OCR
ocr_result = ocr_service.process_image(
image_path=Path(file_path),
lang='ch',
detect_layout=True
detect_layout=True,
output_dir=result_dir
)
# Calculate processing time
processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
# Create result directory
result_dir = Path(settings.result_dir) / task_id
result_dir.mkdir(parents=True, exist_ok=True)
# Save JSON result
json_path = result_dir / f"{Path(filename).stem}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(ocr_result, f, ensure_ascii=False, indent=2)
# Save Markdown result
markdown_path = result_dir / f"{Path(filename).stem}_result.md"
markdown_content = ocr_result.get('markdown_content', '')
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
# Save results using OCR service (includes JSON, Markdown, and PDF generation)
json_path, markdown_path, pdf_path = ocr_service.save_results(
result=ocr_result,
output_dir=result_dir,
file_id=Path(filename).stem,
source_file_path=Path(file_path)
)
# Update task with results (direct database update)
task.result_json_path = str(json_path)
task.result_markdown_path = str(markdown_path)
task.result_json_path = str(json_path) if json_path else None
task.result_markdown_path = str(markdown_path) if markdown_path else None
task.result_pdf_path = str(pdf_path) if pdf_path else None
task.processing_time_ms = processing_time_ms
task.status = TaskStatus.COMPLETED
task.completed_at = datetime.utcnow()
@@ -468,10 +467,16 @@ async def download_pdf(
current_user: User = Depends(get_current_user)
):
"""
Download task result as searchable PDF file
Download task result as layout-preserving PDF file
- **task_id**: Task UUID
Returns a PDF that preserves the original document layout using OCR results.
The PDF is generated from OCR JSON data and cached for subsequent requests.
"""
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
# Get task
task = task_service.get_task_by_id(
db=db,
@@ -485,12 +490,69 @@ async def download_pdf(
detail="Task not found"
)
# Check if task is completed
if task.status.value != "completed":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Task is not completed yet. Please wait for OCR processing to finish."
)
# Check if PDF path is stored in database
if task.result_pdf_path and Path(task.result_pdf_path).exists():
pdf_path = Path(task.result_pdf_path)
logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
else:
# Fallback: Try to generate PDF on-demand
result_dir = Path(settings.result_dir) / task_id
# Use stored JSON path or construct it
if task.result_json_path and Path(task.result_json_path).exists():
json_path = Path(task.result_json_path)
else:
# Try to find JSON file in result directory
json_files = list(result_dir.glob("*_result.json"))
if not json_files:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="OCR result JSON not found"
)
json_path = json_files[0]
# Construct PDF path based on JSON filename
pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
pdf_path = result_dir / pdf_filename
# Generate PDF if it doesn't exist
if not pdf_path.exists():
logger.info(f"Generating layout-preserving PDF for task {task_id}")
# Get source file path if available
source_file = None
task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
source_file = Path(task_file.stored_path)
# Generate PDF
success = pdf_generator_service.generate_layout_pdf(
json_path=json_path,
output_path=pdf_path,
source_file_path=source_file
)
if not success:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to generate PDF. Please check server logs."
)
logger.info(f"PDF generated successfully: {pdf_path.name}")
# Validate file access
is_valid, error_msg = file_access_service.validate_file_access(
db=db,
user_id=current_user.id,
task_id=task_id,
file_path=task.result_pdf_path
file_path=str(pdf_path)
)
if not is_valid:
@@ -502,7 +564,7 @@ async def download_pdf(
# Return file
filename = f"{task.filename or task_id}_result.pdf"
return FileResponse(
path=task.result_pdf_path,
path=str(pdf_path),
filename=filename,
media_type="application/pdf"
)