feat: implement layout-preserving PDF generation with table reconstruction
Major Features: - Add PDF generation service with Chinese font support - Parse HTML tables from PP-StructureV3 and rebuild with ReportLab - Extract table text for translation purposes - Auto-filter text regions inside tables to avoid overlaps Backend Changes: 1. pdf_generator_service.py (NEW) - HTMLTableParser: Parse HTML tables to extract structure - PDFGeneratorService: Generate layout-preserving PDFs - Coordinate transformation: OCR (top-left) → PDF (bottom-left) - Font size heuristics: 75% of bbox height with width checking - Table reconstruction: Parse HTML → ReportLab Table - Image embedding: Extract bbox from filenames 2. ocr_service.py - Add _extract_table_text() for translation support - Add output_dir parameter to save images to result directory - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg) 3. tasks.py - Update process_task_ocr to use save_results() with PDF generation - Fix download_pdf endpoint to use database-stored PDF paths - Support on-demand PDF generation from JSON 4. config.py - Add chinese_font_path configuration - Add pdf_enable_bbox_debug flag Frontend Changes: 1. PDFViewer.tsx (NEW) - React PDF viewer with zoom and pagination - Memoized file config to prevent unnecessary reloads 2. TaskDetailPage.tsx & ResultsPage.tsx - Integrate PDF preview and download 3. main.tsx - Configure PDF.js worker via CDN 4. vite.config.ts - Add host: '0.0.0.0' for network access - Use VITE_API_URL environment variable for backend proxy Dependencies: - reportlab: PDF generation library - Noto Sans SC font: Chinese character support 🤖 Generated with Claude Code https://claude.com/claude-code Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -96,6 +96,11 @@ class Settings(BaseSettings):
|
||||
pdf_margin_left: int = Field(default=20)
|
||||
pdf_margin_right: int = Field(default=20)
|
||||
|
||||
# ===== Layout-Preserving PDF Configuration =====
|
||||
chinese_font_path: str = Field(default="./backend/fonts/NotoSansSC-Regular.ttf")
|
||||
pdf_font_size_base: int = Field(default=12)
|
||||
pdf_enable_bbox_debug: bool = Field(default=False) # Draw bounding boxes for debugging
|
||||
|
||||
# ===== Translation Configuration (Reserved) =====
|
||||
enable_translation: bool = Field(default=False)
|
||||
translation_engine: str = Field(default="offline")
|
||||
|
||||
@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
|
||||
# Initialize OCR service
|
||||
ocr_service = OCRService()
|
||||
|
||||
# Create result directory before OCR processing (needed for saving extracted images)
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
result_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Process the file with OCR
|
||||
ocr_result = ocr_service.process_image(
|
||||
image_path=Path(file_path),
|
||||
lang='ch',
|
||||
detect_layout=True
|
||||
detect_layout=True,
|
||||
output_dir=result_dir
|
||||
)
|
||||
|
||||
# Calculate processing time
|
||||
processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)
|
||||
|
||||
# Create result directory
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
result_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON result
|
||||
json_path = result_dir / f"{Path(filename).stem}_result.json"
|
||||
with open(json_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(ocr_result, f, ensure_ascii=False, indent=2)
|
||||
|
||||
# Save Markdown result
|
||||
markdown_path = result_dir / f"{Path(filename).stem}_result.md"
|
||||
markdown_content = ocr_result.get('markdown_content', '')
|
||||
with open(markdown_path, 'w', encoding='utf-8') as f:
|
||||
f.write(markdown_content)
|
||||
# Save results using OCR service (includes JSON, Markdown, and PDF generation)
|
||||
json_path, markdown_path, pdf_path = ocr_service.save_results(
|
||||
result=ocr_result,
|
||||
output_dir=result_dir,
|
||||
file_id=Path(filename).stem,
|
||||
source_file_path=Path(file_path)
|
||||
)
|
||||
|
||||
# Update task with results (direct database update)
|
||||
task.result_json_path = str(json_path)
|
||||
task.result_markdown_path = str(markdown_path)
|
||||
task.result_json_path = str(json_path) if json_path else None
|
||||
task.result_markdown_path = str(markdown_path) if markdown_path else None
|
||||
task.result_pdf_path = str(pdf_path) if pdf_path else None
|
||||
task.processing_time_ms = processing_time_ms
|
||||
task.status = TaskStatus.COMPLETED
|
||||
task.completed_at = datetime.utcnow()
|
||||
@@ -468,10 +467,16 @@ async def download_pdf(
|
||||
current_user: User = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Download task result as searchable PDF file
|
||||
Download task result as layout-preserving PDF file
|
||||
|
||||
- **task_id**: Task UUID
|
||||
|
||||
Returns a PDF that preserves the original document layout using OCR results.
|
||||
The PDF is generated from OCR JSON data and cached for subsequent requests.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from app.services.pdf_generator_service import pdf_generator_service
|
||||
|
||||
# Get task
|
||||
task = task_service.get_task_by_id(
|
||||
db=db,
|
||||
@@ -485,12 +490,69 @@ async def download_pdf(
|
||||
detail="Task not found"
|
||||
)
|
||||
|
||||
# Check if task is completed
|
||||
if task.status.value != "completed":
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="Task is not completed yet. Please wait for OCR processing to finish."
|
||||
)
|
||||
|
||||
# Check if PDF path is stored in database
|
||||
if task.result_pdf_path and Path(task.result_pdf_path).exists():
|
||||
pdf_path = Path(task.result_pdf_path)
|
||||
logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
|
||||
else:
|
||||
# Fallback: Try to generate PDF on-demand
|
||||
result_dir = Path(settings.result_dir) / task_id
|
||||
|
||||
# Use stored JSON path or construct it
|
||||
if task.result_json_path and Path(task.result_json_path).exists():
|
||||
json_path = Path(task.result_json_path)
|
||||
else:
|
||||
# Try to find JSON file in result directory
|
||||
json_files = list(result_dir.glob("*_result.json"))
|
||||
if not json_files:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail="OCR result JSON not found"
|
||||
)
|
||||
json_path = json_files[0]
|
||||
|
||||
# Construct PDF path based on JSON filename
|
||||
pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
|
||||
pdf_path = result_dir / pdf_filename
|
||||
|
||||
# Generate PDF if it doesn't exist
|
||||
if not pdf_path.exists():
|
||||
logger.info(f"Generating layout-preserving PDF for task {task_id}")
|
||||
|
||||
# Get source file path if available
|
||||
source_file = None
|
||||
task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
|
||||
if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
|
||||
source_file = Path(task_file.stored_path)
|
||||
|
||||
# Generate PDF
|
||||
success = pdf_generator_service.generate_layout_pdf(
|
||||
json_path=json_path,
|
||||
output_path=pdf_path,
|
||||
source_file_path=source_file
|
||||
)
|
||||
|
||||
if not success:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Failed to generate PDF. Please check server logs."
|
||||
)
|
||||
|
||||
logger.info(f"PDF generated successfully: {pdf_path.name}")
|
||||
|
||||
# Validate file access
|
||||
is_valid, error_msg = file_access_service.validate_file_access(
|
||||
db=db,
|
||||
user_id=current_user.id,
|
||||
task_id=task_id,
|
||||
file_path=task.result_pdf_path
|
||||
file_path=str(pdf_path)
|
||||
)
|
||||
|
||||
if not is_valid:
|
||||
@@ -502,7 +564,7 @@ async def download_pdf(
|
||||
# Return file
|
||||
filename = f"{task.filename or task_id}_result.pdf"
|
||||
return FileResponse(
|
||||
path=task.result_pdf_path,
|
||||
path=str(pdf_path),
|
||||
filename=filename,
|
||||
media_type="application/pdf"
|
||||
)
|
||||
|
||||
@@ -284,7 +284,8 @@ class OCRService:
|
||||
image_path: Path,
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -340,7 +341,8 @@ class OCRService:
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -458,7 +460,7 @@ class OCRService:
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path)
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
@@ -500,12 +502,71 @@ class OCRService:
|
||||
'processing_time': (datetime.now() - start_time).total_seconds(),
|
||||
}
|
||||
|
||||
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
def _extract_table_text(self, html_content: str) -> str:
|
||||
"""
|
||||
Extract text from HTML table content for translation purposes
|
||||
|
||||
Args:
|
||||
html_content: HTML content containing table
|
||||
|
||||
Returns:
|
||||
Extracted text from table cells
|
||||
"""
|
||||
try:
|
||||
from html.parser import HTMLParser
|
||||
|
||||
class TableTextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.text_parts = []
|
||||
self.in_table = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'table':
|
||||
self.in_table = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'table':
|
||||
self.in_table = False
|
||||
elif tag in ('td', 'th') and self.in_table:
|
||||
self.text_parts.append(' | ') # Cell separator
|
||||
elif tag == 'tr' and self.in_table:
|
||||
self.text_parts.append('\n') # Row separator
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_table:
|
||||
stripped = data.strip()
|
||||
if stripped:
|
||||
self.text_parts.append(stripped)
|
||||
|
||||
parser = TableTextExtractor()
|
||||
parser.feed(html_content)
|
||||
|
||||
# Clean up the extracted text
|
||||
extracted = ''.join(parser.text_parts)
|
||||
# Remove multiple separators
|
||||
import re
|
||||
extracted = re.sub(r'\s*\|\s*\|+\s*', ' | ', extracted)
|
||||
extracted = re.sub(r'\n+', '\n', extracted)
|
||||
extracted = extracted.strip()
|
||||
|
||||
return extracted
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract table text: {e}")
|
||||
# Fallback: just remove HTML tags
|
||||
import re
|
||||
text = re.sub(r'<[^>]+>', ' ', html_content)
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -548,16 +609,59 @@ class OCRService:
|
||||
'page': page_idx,
|
||||
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
|
||||
}
|
||||
|
||||
# Extract text from table for translation purposes
|
||||
if has_table:
|
||||
table_text = self._extract_table_text(markdown_texts)
|
||||
element['extracted_text'] = table_text
|
||||
logger.info(f"Extracted {len(table_text)} characters from table")
|
||||
|
||||
layout_elements.append(element)
|
||||
|
||||
# Add image metadata
|
||||
# Add image metadata and SAVE images to disk
|
||||
for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
|
||||
# Save image to disk
|
||||
try:
|
||||
# Determine base directory for saving images
|
||||
base_dir = output_dir if output_dir else image_path.parent
|
||||
|
||||
# Create full path for image file
|
||||
full_img_path = base_dir / img_path
|
||||
|
||||
# Create imgs/ subdirectory if it doesn't exist
|
||||
full_img_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save image object to disk
|
||||
if hasattr(img_obj, 'save'):
|
||||
# img_obj is PIL Image
|
||||
img_obj.save(str(full_img_path))
|
||||
logger.info(f"Saved extracted image to {full_img_path}")
|
||||
else:
|
||||
logger.warning(f"Image object for {img_path} does not have save() method, skipping")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to save image {img_path}: {str(e)}")
|
||||
# Continue processing even if image save fails
|
||||
|
||||
# Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
|
||||
bbox = []
|
||||
try:
|
||||
import re
|
||||
match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
|
||||
if match:
|
||||
x1, y1, x2, y2 = map(int, match.groups())
|
||||
# Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
|
||||
bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
|
||||
logger.info(f"Extracted bbox from filename: {bbox}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract bbox from {img_path}: {e}")
|
||||
|
||||
images_metadata.append({
|
||||
'element_id': len(layout_elements) + img_idx,
|
||||
'image_path': img_path,
|
||||
'type': 'image',
|
||||
'page': page_idx,
|
||||
'bbox': [],
|
||||
'bbox': bbox,
|
||||
})
|
||||
|
||||
if layout_elements:
|
||||
@@ -638,18 +742,20 @@ class OCRService:
|
||||
self,
|
||||
result: Dict,
|
||||
output_dir: Path,
|
||||
file_id: str
|
||||
) -> Tuple[Optional[Path], Optional[Path]]:
|
||||
file_id: str,
|
||||
source_file_path: Optional[Path] = None
|
||||
) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
|
||||
"""
|
||||
Save OCR results to JSON and Markdown files
|
||||
Save OCR results to JSON, Markdown, and layout-preserving PDF files
|
||||
|
||||
Args:
|
||||
result: OCR result dictionary
|
||||
output_dir: Output directory
|
||||
file_id: Unique file identifier
|
||||
source_file_path: Optional path to original source file for PDF generation
|
||||
|
||||
Returns:
|
||||
Tuple of (json_path, markdown_path)
|
||||
Tuple of (json_path, markdown_path, pdf_path)
|
||||
"""
|
||||
try:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -666,8 +772,37 @@ class OCRService:
|
||||
f.write(markdown_content)
|
||||
|
||||
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
|
||||
return json_path, markdown_path
|
||||
|
||||
# Generate layout-preserving PDF
|
||||
pdf_path = None
|
||||
try:
|
||||
from app.services.pdf_generator_service import pdf_generator_service
|
||||
|
||||
pdf_filename = f"{file_id}_layout.pdf"
|
||||
pdf_path = output_dir / pdf_filename
|
||||
|
||||
logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
|
||||
|
||||
success = pdf_generator_service.generate_layout_pdf(
|
||||
json_path=json_path,
|
||||
output_path=pdf_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
|
||||
else:
|
||||
logger.warning(f"✗ PDF generation failed for {file_id}")
|
||||
pdf_path = None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating PDF for {file_id}: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
pdf_path = None
|
||||
|
||||
return json_path, markdown_path, pdf_path
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving results: {str(e)}")
|
||||
return None, None
|
||||
return None, None, None
|
||||
|
||||
626
backend/app/services/pdf_generator_service.py
Normal file
626
backend/app/services/pdf_generator_service.py
Normal file
@@ -0,0 +1,626 @@
|
||||
"""
|
||||
Layout-Preserving PDF Generation Service
|
||||
Generates PDF files that preserve the original document layout using OCR JSON data
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
|
||||
from reportlab.lib.pagesizes import A4, letter
|
||||
from reportlab.lib.units import mm
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
from reportlab.platypus import Table, TableStyle
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
||||
from reportlab.platypus import Paragraph
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from PIL import Image
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLTableParser(HTMLParser):
    """Parse HTML table to extract structure and data

    After ``feed()``, ``self.tables`` contains one dict per table that had at
    least one row::

        {'rows': [{'cells': [{'text': str, 'is_header': bool,
                              'colspan': int, 'rowspan': int}, ...]}, ...]}

    Cell text is the concatenation of each stripped data fragment followed by
    a single space; callers are expected to ``strip()`` it.

    The parser tolerates omitted ``</td>``, ``</th>`` and ``</tr>`` end tags
    and malformed span attributes. A ``<table>`` start tag always begins a
    fresh table, so the enclosing table of a nested one is not captured.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_table = False

    @staticmethod
    def _span(attrs_dict, name):
        """Return span attribute *name* as an int >= 1 (1 if missing or malformed)."""
        try:
            # `or 1` covers valueless attributes (<td colspan>), which the
            # base parser reports as None.
            return max(int(attrs_dict.get(name) or 1), 1)
        except (TypeError, ValueError):
            return 1

    def _close_cell(self):
        """Append the open cell (if any) to the open row and clear it."""
        if self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None

    def _close_row(self):
        """Append the open row (if any), including its open cell, to the table."""
        self._close_cell()
        if self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            self.in_table = True
            self.current_table = {'rows': []}
            # Start from a clean slate so state left over from an enclosing
            # or unterminated table cannot leak into this one.
            self.current_row = None
            self.current_cell = None

        elif tag == 'tr' and self.in_table:
            self._close_row()  # implied </tr> of the previous row
            self.current_row = {'cells': []}

        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
            self._close_cell()  # implied </td> / </th> of the previous cell
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._span(attrs_dict, 'colspan'),
                'rowspan': self._span(attrs_dict, 'rowspan'),
            }

    def handle_endtag(self, tag):
        if tag == 'table' and self.in_table:
            self._close_row()  # keep a last row whose </tr> was omitted
            if self.current_table and self.current_table['rows']:
                self.tables.append(self.current_table)
            self.current_table = None
            self.in_table = False

        elif tag == 'tr':
            self._close_row()

        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data):
        if self.current_cell is not None:
            self.current_cell['text'] += data.strip() + ' '
|
||||
|
||||
|
||||
class PDFGeneratorService:
|
||||
"""Service for generating layout-preserving PDFs from OCR JSON data"""
|
||||
|
||||
def __init__(self):
    """Set default font state, then attempt to register the Chinese font."""
    # Registration outcome; both fields are updated by
    # _register_chinese_font() when registration succeeds.
    self.font_registered = False
    self.font_path = None
    # Name under which the TTF is registered with ReportLab's pdfmetrics.
    self.font_name = 'NotoSansSC'

    self._register_chinese_font()
|
||||
|
||||
def _register_chinese_font(self):
    """Register the TTF named by ``settings.chinese_font_path`` with ReportLab.

    A relative path is resolved against the directory four levels above this
    file. On success ``font_path`` and ``font_registered`` are updated; a
    missing file or any registration error is logged and leaves the service
    without a registered font.
    """
    try:
        configured = Path(settings.chinese_font_path)

        if configured.is_absolute():
            resolved = configured
        else:
            # The configured value starts with ./backend/, so climb from this
            # module up to the project root before joining.
            project_root = Path(__file__).resolve()
            for _ in range(4):
                project_root = project_root.parent
            resolved = project_root / configured

        if resolved.exists():
            pdfmetrics.registerFont(TTFont(self.font_name, str(resolved)))
            self.font_path = resolved
            self.font_registered = True
            logger.info(f"Chinese font registered: {self.font_name} from {resolved}")
        else:
            logger.error(f"Chinese font not found at {resolved}")

    except Exception as e:
        logger.error(f"Failed to register Chinese font: {e}")
        self.font_registered = False
|
||||
|
||||
def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
    """
    Read an OCR result file and decode it as JSON

    Args:
        json_path: Path to JSON file (opened as UTF-8)

    Returns:
        The decoded JSON value, or None when reading, decoding or logging
        raised; the failure is logged rather than propagated
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
        logger.info(f"Loaded OCR JSON: {json_path.name}")
    except Exception as e:
        logger.error(f"Failed to load JSON {json_path}: {e}")
        parsed = None

    return parsed
|
||||
|
||||
def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
    """
    Work out the PDF page size

    The source file is consulted first; when it yields no size, the page is
    sized to the largest x and y found among the text-region bbox points.

    Args:
        text_regions: List of text regions with bbox coordinates
        source_file_path: Optional path to source file for accurate dimensions

    Returns:
        Tuple of (width, height) in points; A4 when there are no regions, and
        the A4 value for any axis on which no positive coordinate was found
    """
    if source_file_path:
        source_size = self.get_original_page_size(source_file_path)
        if source_size:
            return source_size

    if not text_regions:
        return A4  # Default to A4 size

    def corner_points():
        # Yield (x, y) for every well-formed point of every usable bbox.
        # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        for region in text_regions:
            quad = region.get('bbox', [])
            if quad and len(quad) >= 4:
                for pt in quad:
                    if isinstance(pt, (list, tuple)) and len(pt) >= 2:
                        yield pt[0], pt[1]

    widest = 0
    tallest = 0
    for px, py in corner_points():
        if px > widest:
            widest = px
        if py > tallest:
            tallest = py

    # OCR coordinates are in pixels and are used directly as points (1:1).
    # No padding is added on purpose - padding causes layout issues.
    width = widest if widest > 0 else A4[0]
    height = tallest if tallest > 0 else A4[1]

    logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
    return (width, height)
|
||||
|
||||
def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
    """
    Extract page dimensions from original source file

    Only raster images are handled; their pixel dimensions are used directly
    as points (1:1 mapping), which matches how the OCR coordinates are used
    elsewhere in this service. Any other file type (including PDF) yields
    None so the caller falls back to dimensions computed from OCR data.

    Args:
        file_path: Path to original file (image or PDF)

    Returns:
        Tuple of (width, height) in points or None
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif')

    try:
        if not file_path.exists():
            return None

        if file_path.suffix.lower() in image_suffixes:
            # Context manager closes the underlying file handle; a bare
            # Image.open() keeps it open until garbage collection.
            with Image.open(file_path) as img:
                width_pt = float(img.width)
                height_pt = float(img.height)
            logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
            return (width_pt, height_pt)

        # For PDFs, would need PyPDF2 or similar.
        # For now, fall through and return None to use calculated dimensions.

    except Exception as e:
        logger.warning(f"Failed to get page size from {file_path}: {e}")

    return None
|
||||
|
||||
def draw_text_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float
):
    """
    Draw a text region at precise coordinates

    The font size starts at 75% of the bbox height (clamped to 4-72pt) and is
    shrunk further, down to 3pt, when the rendered string would be wider than
    the bbox. Regions without text or with fewer than four bbox points are
    skipped. Drawing errors are logged and swallowed so one bad region does
    not abort the page.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Text region dict with 'text' and 'bbox' keys
        page_height: Height of page (for coordinate transformation)
    """
    text = region.get('text', '')
    bbox = region.get('bbox', [])

    if not text or not bbox or len(bbox) < 4:
        return

    try:
        # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        # Points: top-left, top-right, bottom-right, bottom-left
        # OCR coordinates: origin (0,0) at top-left, Y increases downward
        ocr_x_left = bbox[0][0]
        ocr_y_top = bbox[0][1]
        ocr_x_right = bbox[2][0]
        ocr_y_bottom = bbox[2][1]

        bbox_width = abs(ocr_x_right - ocr_x_left)
        bbox_height = abs(ocr_y_bottom - ocr_y_top)

        # Heuristic: font size is typically 70-90% of bbox height.
        font_size = bbox_height * 0.75
        font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt

        # Transform coordinates: OCR (top-left origin) -> PDF (bottom-left
        # origin). The Y axis is flipped using the bbox's bottom coordinate.
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom

        font_name = self.font_name if self.font_registered else 'Helvetica'
        pdf_canvas.setFont(font_name, font_size)

        # If text is too wide for bbox, scale down font
        text_width = pdf_canvas.stringWidth(text, font_name, font_size)
        if text_width > bbox_width:
            scale_factor = bbox_width / text_width
            font_size = font_size * scale_factor * 0.95  # 95% to add small margin
            font_size = max(font_size, 3)  # Minimum 3pt
            pdf_canvas.setFont(font_name, font_size)

        pdf_canvas.drawString(pdf_x, pdf_y, text)

        # Debug: outline the bounding box (optional)
        if settings.pdf_enable_bbox_debug:
            # Isolate the debug stroke colour / line width so they do not
            # leak into whatever is drawn on the canvas afterwards.
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
                pdf_canvas.setLineWidth(0.5)
                # First four bbox points in PDF coordinates, joined as a closed loop.
                corners = [(p[0], page_height - p[1]) for p in bbox[:4]]
                for (x1, y1), (x2, y2) in zip(corners, corners[1:] + corners[:1]):
                    pdf_canvas.line(x1, y1, x2, y2)
            finally:
                pdf_canvas.restoreState()

    except Exception as e:
        logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
|
||||
|
||||
def draw_table_region(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    images_metadata: List[Dict],
    page_height: float
):
    """
    Draw a table region by parsing HTML and rebuilding with ReportLab Table

    Only the first table found in the element's HTML is drawn. Merged cells
    (colspan / rowspan) are reproduced with ReportLab SPAN commands so the
    remaining cells stay in their columns. Columns share the bbox width
    equally. Failures are logged (with traceback) and swallowed.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict with HTML content
        images_metadata: List of image metadata to find table bbox
        page_height: Height of page
    """
    try:
        html_content = table_element.get('content', '')
        if not html_content:
            return

        # Parse HTML to extract table structure
        parser = HTMLTableParser()
        parser.feed(html_content)

        if not parser.tables:
            logger.warning("No tables found in HTML content")
            return

        # Get the first table (PP-StructureV3 usually provides one table per element)
        rows = parser.tables[0]['rows']
        if not rows:
            return

        # Find a table image to take the bbox from.
        # NOTE(review): this picks the first image whose path contains
        # 'table' regardless of which table element is being drawn, so with
        # several tables they would all land on the same bbox - confirm.
        table_bbox = None
        for img_meta in images_metadata:
            if 'table' in img_meta.get('image_path', '').lower():
                candidate = img_meta.get('bbox', [])
                if candidate and len(candidate) >= 4:
                    table_bbox = candidate
                    break

        if not table_bbox:
            logger.warning("No bbox found for table")
            return

        # Extract bbox coordinates (points 0 and 2 are opposite corners)
        ocr_x_left = table_bbox[0][0]
        ocr_y_top = table_bbox[0][1]
        ocr_x_right = table_bbox[2][0]
        ocr_y_bottom = table_bbox[2][1]

        table_width = abs(ocr_x_right - ocr_x_left)
        table_height = abs(ocr_y_bottom - ocr_y_top)

        # Transform coordinates: OCR top-left origin -> PDF bottom-left origin
        pdf_x = ocr_x_left
        pdf_y = page_height - ocr_y_bottom

        # Build a rectangular grid plus SPAN commands for merged cells
        reportlab_data, span_commands = self._build_table_grid(rows)
        max_cols = len(reportlab_data[0])
        if max_cols == 0:
            logger.warning("Table has no cells")
            return

        # Calculate column widths (equal distribution)
        col_widths = [table_width / max_cols] * max_cols

        # Use smaller font size to fit in bbox
        font_size = min(table_height / len(rows) * 0.5, 10)
        font_size = max(font_size, 6)

        table = Table(reportlab_data, colWidths=col_widths)

        style = TableStyle([
            ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('LEFTPADDING', (0, 0), (-1, -1), 2),
            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
            ('TOPPADDING', (0, 0), (-1, -1), 2),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
        ])

        for command in span_commands:
            style.add(*command)

        # Add header style if first row has headers
        if rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
            style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
            style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)

        table.setStyle(style)

        # wrapOn must run before drawOn so the table computes its layout
        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")

    except Exception as e:
        # exc_info routes the traceback through logging instead of stderr
        logger.warning(f"Failed to draw table region: {e}", exc_info=True)

@staticmethod
def _build_table_grid(rows: List[Dict]) -> Tuple[List[List[str]], List[Tuple]]:
    """Expand parsed rows into a rectangular text grid plus ReportLab SPAN commands.

    Args:
        rows: Non-empty list of row dicts as produced by HTMLTableParser

    Returns:
        (grid, spans): grid is a list of equally long lists of cell text;
        spans is a list of ('SPAN', (col, row), (col, row)) style commands
    """
    last_row_index = len(rows) - 1
    occupied = set()  # (row, col) slots covered by a span anchored elsewhere
    grid = []
    spans = []

    for r, row in enumerate(rows):
        line = []
        for cell in row['cells']:
            # Step over slots already taken by a rowspan from an earlier row
            while (r, len(line)) in occupied:
                line.append('')
            c = len(line)

            colspan = max(int(cell.get('colspan', 1) or 1), 1)
            rowspan = max(int(cell.get('rowspan', 1) or 1), 1)
            end_c = c + colspan - 1
            # A rowspan may not reach past the last parsed row
            end_r = min(r + rowspan - 1, last_row_index)

            line.append(cell['text'].strip())
            line.extend([''] * (colspan - 1))

            if end_c > c or end_r > r:
                spans.append(('SPAN', (c, r), (end_c, end_r)))
                occupied.update(
                    (rr, cc)
                    for rr in range(r + 1, end_r + 1)
                    for cc in range(c, end_c + 1)
                )
        grid.append(line)

    # Pad every row to the widest one so the grid is rectangular
    width = max(len(line) for line in grid)
    for line in grid:
        line.extend([''] * (width - len(line)))

    return grid, spans
|
||||
|
||||
def draw_image_region(
    self,
    pdf_canvas: canvas.Canvas,
    region: Dict,
    page_height: float,
    result_dir: Path
):
    """
    Embed an extracted image at the position given by its bbox

    The image file is looked up relative to result_dir. Regions with no
    image path, a missing file, or fewer than four bbox points are skipped
    (the latter two with a warning). Any drawing error is logged and swallowed.

    Args:
        pdf_canvas: ReportLab canvas object
        region: Image metadata dict with image_path and bbox
        page_height: Height of page (for coordinate transformation)
        result_dir: Directory containing result files
    """
    try:
        rel_path = region.get('image_path', '')
        if not rel_path:
            return

        image_file = result_dir / rel_path
        if not image_file.exists():
            logger.warning(f"Image not found: {image_file}")
            return

        quad = region.get('bbox', [])
        if not quad or len(quad) < 4:
            # Without a bbox there is nowhere to place the image
            logger.warning(f"No bbox for image {rel_path}")
            return

        # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]], origin at
        # top-left with Y increasing downward; points 0 and 2 are used as
        # opposite corners.
        first_corner = quad[0]
        opposite_corner = quad[2]
        left = first_corner[0]
        top = first_corner[1]
        right = opposite_corner[0]
        bottom = opposite_corner[1]

        draw_width = abs(right - left)
        draw_height = abs(bottom - top)

        # Flip the Y axis: drawImage anchors at the bottom-left corner in a
        # bottom-left-origin coordinate system.
        anchor_x = left
        anchor_y = page_height - bottom

        pdf_canvas.drawImage(
            str(image_file),
            anchor_x,
            anchor_y,
            width=draw_width,
            height=draw_height,
            preserveAspectRatio=True,
            mask='auto'  # Handle transparency
        )

        logger.info(f"Drew image: {rel_path} at ({anchor_x:.0f}, {anchor_y:.0f}) size {draw_width:.0f}x{draw_height:.0f}")

    except Exception as e:
        logger.warning(f"Failed to draw image region: {e}")
|
||||
|
||||
def generate_layout_pdf(
    self,
    json_path: Path,
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate layout-preserving PDF from OCR JSON data.

    The JSON is read through ``self.load_ocr_json`` and the keys
    ``text_regions``, ``images_metadata``, ``layout_data`` and
    ``total_pages`` are consumed. Text regions whose centre falls inside a
    table image bbox on the same page are dropped so that the reconstructed
    table does not overlap loose OCR text.

    Page numbering as used by this method: a text region's ``page`` is
    1-based (default 1), whereas ``page`` on table elements and image
    metadata is compared against ``page_num - 1`` (0-based).

    Args:
        json_path: Path to OCR JSON file; its parent directory is used to
            resolve image paths.
        output_path: Path to save generated PDF. If it already exists the
            method returns True without regenerating it.
        source_file_path: Optional path to original source file, forwarded
            to ``self.calculate_page_dimensions``.

    Returns:
        True if the PDF exists or was generated, False if the JSON could not
        be loaded, contains no text regions, or any exception occurred.
    """
    try:
        # Check if PDF already exists (caching)
        if output_path.exists():
            logger.info(f"PDF already exists: {output_path.name}")
            return True

        # Load JSON data
        ocr_data = self.load_ocr_json(json_path)
        if not ocr_data:
            return False

        # Get text regions
        text_regions = ocr_data.get('text_regions', [])
        if not text_regions:
            logger.warning("No text regions found in JSON")
            return False

        # A JSON null would come back as None; normalise so the loops below are safe
        images_metadata = ocr_data.get('images_metadata') or []
        layout_data = ocr_data.get('layout_data') or {}

        # Determine page dimensions
        page_size = self.calculate_page_dimensions(text_regions, source_file_path)
        page_width, page_height = page_size

        # Create PDF canvas
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

        # Collect (page, bbox) for every table image. The page is converted to
        # the 1-based numbering used by text regions; None means "unknown" and
        # keeps the bbox applicable to every page.
        table_bboxes = []
        for img_meta in images_metadata:
            img_path = img_meta.get('image_path', '')
            if 'table' not in img_path.lower():
                continue
            bbox = img_meta.get('bbox', [])
            if bbox and len(bbox) >= 4:
                meta_page = img_meta.get('page')
                table_page = meta_page + 1 if isinstance(meta_page, int) else None
                table_bboxes.append((table_page, bbox))

        # Helper function to check if a point is inside a bbox
        def point_in_bbox(x, y, bbox):
            x1, y1 = bbox[0]
            x2, y2 = bbox[2]
            return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)

        # Filter text regions to exclude those inside tables on the same page
        filtered_text_regions = []
        for region in text_regions:
            bbox = region.get('bbox', [])
            if not bbox or len(bbox) < 4:
                continue

            region_page = region.get('page', 1)

            # Use the centre of the region (corner 0 and corner 2 are opposite corners)
            center_x = (bbox[0][0] + bbox[2][0]) / 2
            center_y = (bbox[0][1] + bbox[2][1]) / 2

            is_in_table = any(
                (table_page is None or table_page == region_page)
                and point_in_bbox(center_x, center_y, table_bbox)
                for table_page, table_bbox in table_bboxes
            )

            if not is_in_table:
                filtered_text_regions.append(region)
            else:
                logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")

        logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")

        # Group regions by page
        pages_data = {}
        for region in filtered_text_regions:
            pages_data.setdefault(region.get('page', 1), []).append(region)

        # Get table elements from layout_data
        table_elements = [
            e for e in (layout_data.get('elements') or [])
            if e.get('type') == 'table'
        ]

        # Process each page
        total_pages = ocr_data.get('total_pages', 1)
        for page_num in range(1, total_pages + 1):
            if page_num > 1:
                pdf_canvas.showPage()  # Start new page

            # Draw text regions for this page (excluding table text)
            for region in pages_data.get(page_num, []):
                self.draw_text_region(pdf_canvas, region, page_height)

            # Draw tables for this page
            for table_elem in table_elements:
                if table_elem.get('page', 0) == page_num - 1:  # page is 0-indexed
                    self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)

            # Draw non-table images for this page (figure, chart, seal, etc.)
            for img_meta in images_metadata:
                if img_meta.get('page') != page_num - 1:  # page is 0-indexed
                    continue
                img_path = img_meta.get('image_path', '')
                # Skip table images (they're now rendered as tables)
                if 'table' not in img_path.lower():
                    self.draw_image_region(
                        pdf_canvas,
                        img_meta,
                        page_height,
                        json_path.parent
                    )

        # Save PDF
        pdf_canvas.save()

        file_size = output_path.stat().st_size
        logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
        return True

    except Exception as e:
        # Top-level boundary: report the failure (with traceback) through the
        # logger and signal it to the caller via the return value.
        logger.exception(f"Failed to generate PDF: {e}")
        return False
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Built with no constructor arguments and bound to a module-level name.
pdf_generator_service = PDFGeneratorService()
|
||||
31
backend/download_fonts.sh
Executable file
31
backend/download_fonts.sh
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/bin/bash
# Download Noto Sans SC TrueType font for layout-preserving PDF generation.
#
# The font is stored in the "fonts" directory next to this script, so the
# result is the same regardless of the directory the script is invoked from.
# Exits with status 1 if the download fails.

set -e

# Resolve paths relative to this script rather than the caller's cwd
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FONT_DIR="$SCRIPT_DIR/fonts"
FONT_URL="https://github.com/notofonts/noto-cjk/raw/main/Sans/Variable/TTF/Subset/NotoSansSC-VF.ttf"
FONT_FILE="NotoSansSC-Regular.ttf"
FONT_PATH="$FONT_DIR/$FONT_FILE"
# Download target; only renamed to FONT_PATH after a successful transfer so an
# interrupted download is never mistaken for an installed font.
TMP_PATH="$FONT_PATH.part"

echo "🔤 Downloading Chinese font for PDF generation..."

# Create font directory
mkdir -p "$FONT_DIR"

# Download font if not exists (-s: exists AND is non-empty)
if [ -s "$FONT_PATH" ]; then
    echo "✓ Font already exists: $FONT_PATH"
else
    echo "Downloading from GitHub..."
    # wget is tested inside the `if`, so `set -e` does not abort before the
    # failure branch can clean up and report.
    if wget "$FONT_URL" -O "$TMP_PATH" && [ -s "$TMP_PATH" ]; then
        mv "$TMP_PATH" "$FONT_PATH"
        SIZE=$(du -h "$FONT_PATH" | cut -f1)
        echo "✓ Font downloaded successfully: $SIZE"
    else
        rm -f "$TMP_PATH"
        echo "✗ Font download failed"
        exit 1
    fi
fi

echo "✅ Font setup complete!"
|
||||
BIN
backend/fonts/NotoSansSC-Regular.ttf
Normal file
BIN
backend/fonts/NotoSansSC-Regular.ttf
Normal file
Binary file not shown.
62
backend/test_chinese_font.py
Normal file
62
backend/test_chinese_font.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Test script to verify ReportLab and Chinese font rendering
|
||||
"""
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
def test_chinese_rendering(font_path=None, output_path=None) -> bool:
    """Test if Chinese characters can be rendered in PDF.

    Args:
        font_path: Path to the TrueType font to register. Defaults to
            ``fonts/NotoSansSC-Regular.ttf`` in the directory containing
            this script.
        output_path: Where to write the test PDF. Defaults to
            ``test_chinese.pdf`` in the system temporary directory.

    Returns:
        True if the font was registered and a non-empty PDF was written,
        False if the font file is missing, the PDF is empty, or any error
        occurred.
    """
    import tempfile

    # Font path: resolved relative to this file so the check works on any machine
    if font_path is None:
        font_path = Path(__file__).resolve().parent / "fonts" / "NotoSansSC-Regular.ttf"
    font_path = Path(font_path)

    # Check if font file exists
    if not font_path.is_file():
        print(f"❌ Font file not found: {font_path}")
        return False

    print(f"✓ Font file found: {font_path}")

    if output_path is None:
        output_path = Path(tempfile.gettempdir()) / "test_chinese.pdf"
    test_pdf = Path(output_path)

    try:
        # Register Chinese font
        pdfmetrics.registerFont(TTFont('NotoSansSC', str(font_path)))
        print("✓ Font registered successfully")

        # Create test PDF
        c = canvas.Canvas(str(test_pdf))

        # Set Chinese font
        c.setFont('NotoSansSC', 14)

        # Draw test text
        c.drawString(100, 750, "測試中文字符渲染 - Test Chinese Character Rendering")
        c.drawString(100, 730, "HTD-S1 技術數據表")
        c.drawString(100, 710, "這是一個 PDF 生成測試")

        c.save()
        print(f"✓ Test PDF created: {test_pdf}")

        # Check file size
        file_size = test_pdf.stat().st_size
        print(f"✓ PDF file size: {file_size} bytes")

        if file_size > 0:
            print("\n✅ Chinese font rendering test PASSED")
            return True

        print("\n❌ PDF file is empty")
        return False

    except Exception as e:
        # Diagnostic script: report any failure and signal it via the return value
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
if __name__ == "__main__":
    # Map the boolean test outcome onto the process exit status:
    # 0 when rendering succeeded, 1 otherwise.
    exit_code = 0 if test_chinese_rendering() else 1
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user