feat: implement layout-preserving PDF generation with table reconstruction
Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -284,7 +284,8 @@ class OCRService:
|
||||
image_path: Path,
|
||||
lang: str = 'ch',
|
||||
detect_layout: bool = True,
|
||||
confidence_threshold: Optional[float] = None
|
||||
confidence_threshold: Optional[float] = None,
|
||||
output_dir: Optional[Path] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Process single image with OCR and layout analysis
|
||||
@@ -340,7 +341,8 @@ class OCRService:
|
||||
page_image_path,
|
||||
lang=lang,
|
||||
detect_layout=detect_layout,
|
||||
confidence_threshold=confidence_threshold
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir
|
||||
)
|
||||
|
||||
# Accumulate results
|
||||
@@ -458,7 +460,7 @@ class OCRService:
|
||||
images_metadata = []
|
||||
|
||||
if detect_layout:
|
||||
layout_data, images_metadata = self.analyze_layout(image_path)
|
||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)
|
||||
|
||||
# Generate Markdown
|
||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
||||
@@ -500,12 +502,71 @@ class OCRService:
|
||||
'processing_time': (datetime.now() - start_time).total_seconds(),
|
||||
}
|
||||
|
||||
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
def _extract_table_text(self, html_content: str) -> str:
|
||||
"""
|
||||
Extract text from HTML table content for translation purposes
|
||||
|
||||
Args:
|
||||
html_content: HTML content containing table
|
||||
|
||||
Returns:
|
||||
Extracted text from table cells
|
||||
"""
|
||||
try:
|
||||
from html.parser import HTMLParser
|
||||
|
||||
class TableTextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.text_parts = []
|
||||
self.in_table = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'table':
|
||||
self.in_table = True
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'table':
|
||||
self.in_table = False
|
||||
elif tag in ('td', 'th') and self.in_table:
|
||||
self.text_parts.append(' | ') # Cell separator
|
||||
elif tag == 'tr' and self.in_table:
|
||||
self.text_parts.append('\n') # Row separator
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.in_table:
|
||||
stripped = data.strip()
|
||||
if stripped:
|
||||
self.text_parts.append(stripped)
|
||||
|
||||
parser = TableTextExtractor()
|
||||
parser.feed(html_content)
|
||||
|
||||
# Clean up the extracted text
|
||||
extracted = ''.join(parser.text_parts)
|
||||
# Remove multiple separators
|
||||
import re
|
||||
extracted = re.sub(r'\s*\|\s*\|+\s*', ' | ', extracted)
|
||||
extracted = re.sub(r'\n+', '\n', extracted)
|
||||
extracted = extracted.strip()
|
||||
|
||||
return extracted
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract table text: {e}")
|
||||
# Fallback: just remove HTML tags
|
||||
import re
|
||||
text = re.sub(r'<[^>]+>', ' ', html_content)
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||
|
||||
Returns:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
@@ -548,16 +609,59 @@ class OCRService:
|
||||
'page': page_idx,
|
||||
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
|
||||
}
|
||||
|
||||
# Extract text from table for translation purposes
|
||||
if has_table:
|
||||
table_text = self._extract_table_text(markdown_texts)
|
||||
element['extracted_text'] = table_text
|
||||
logger.info(f"Extracted {len(table_text)} characters from table")
|
||||
|
||||
layout_elements.append(element)
|
||||
|
||||
# Add image metadata
|
||||
# Add image metadata and SAVE images to disk
|
||||
for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
|
||||
# Save image to disk
|
||||
try:
|
||||
# Determine base directory for saving images
|
||||
base_dir = output_dir if output_dir else image_path.parent
|
||||
|
||||
# Create full path for image file
|
||||
full_img_path = base_dir / img_path
|
||||
|
||||
# Create imgs/ subdirectory if it doesn't exist
|
||||
full_img_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save image object to disk
|
||||
if hasattr(img_obj, 'save'):
|
||||
# img_obj is PIL Image
|
||||
img_obj.save(str(full_img_path))
|
||||
logger.info(f"Saved extracted image to {full_img_path}")
|
||||
else:
|
||||
logger.warning(f"Image object for {img_path} does not have save() method, skipping")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to save image {img_path}: {str(e)}")
|
||||
# Continue processing even if image save fails
|
||||
|
||||
# Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
|
||||
bbox = []
|
||||
try:
|
||||
import re
|
||||
match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
|
||||
if match:
|
||||
x1, y1, x2, y2 = map(int, match.groups())
|
||||
# Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
|
||||
bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
|
||||
logger.info(f"Extracted bbox from filename: {bbox}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract bbox from {img_path}: {e}")
|
||||
|
||||
images_metadata.append({
|
||||
'element_id': len(layout_elements) + img_idx,
|
||||
'image_path': img_path,
|
||||
'type': 'image',
|
||||
'page': page_idx,
|
||||
'bbox': [],
|
||||
'bbox': bbox,
|
||||
})
|
||||
|
||||
if layout_elements:
|
||||
@@ -638,18 +742,20 @@ class OCRService:
|
||||
self,
|
||||
result: Dict,
|
||||
output_dir: Path,
|
||||
file_id: str
|
||||
) -> Tuple[Optional[Path], Optional[Path]]:
|
||||
file_id: str,
|
||||
source_file_path: Optional[Path] = None
|
||||
) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
|
||||
"""
|
||||
Save OCR results to JSON and Markdown files
|
||||
Save OCR results to JSON, Markdown, and layout-preserving PDF files
|
||||
|
||||
Args:
|
||||
result: OCR result dictionary
|
||||
output_dir: Output directory
|
||||
file_id: Unique file identifier
|
||||
source_file_path: Optional path to original source file for PDF generation
|
||||
|
||||
Returns:
|
||||
Tuple of (json_path, markdown_path)
|
||||
Tuple of (json_path, markdown_path, pdf_path)
|
||||
"""
|
||||
try:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -666,8 +772,37 @@ class OCRService:
|
||||
f.write(markdown_content)
|
||||
|
||||
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
|
||||
return json_path, markdown_path
|
||||
|
||||
# Generate layout-preserving PDF
|
||||
pdf_path = None
|
||||
try:
|
||||
from app.services.pdf_generator_service import pdf_generator_service
|
||||
|
||||
pdf_filename = f"{file_id}_layout.pdf"
|
||||
pdf_path = output_dir / pdf_filename
|
||||
|
||||
logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
|
||||
|
||||
success = pdf_generator_service.generate_layout_pdf(
|
||||
json_path=json_path,
|
||||
output_path=pdf_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
|
||||
else:
|
||||
logger.warning(f"✗ PDF generation failed for {file_id}")
|
||||
pdf_path = None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating PDF for {file_id}: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
pdf_path = None
|
||||
|
||||
return json_path, markdown_path, pdf_path
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving results: {str(e)}")
|
||||
return None, None
|
||||
return None, None, None
|
||||
|
||||
Reference in New Issue
Block a user