Files
OCR/backend/app/services/ocr_service.py
egg 80c091b89a fix: add PaddlePaddle 2.x/3.x API compatibility layer
PaddlePaddle 3.0.0b2 crashes with an "Illegal instruction" error on the current CPU.
Downgrade to the stable 2.6.2 release, which works but exposes a different API.

Changes:
- Auto-detect PaddlePaddle version at runtime
- Use 'device' parameter for 3.x (device="gpu:0" or "cpu")
- Use 'use_gpu' + 'gpu_mem' parameters for 2.x
- Apply to both get_ocr_engine() and get_structure_engine()
- Log PaddlePaddle version in initialization messages

Current setup:
- paddlepaddle-gpu==2.6.2 (stable, CUDA compiled)
- paddleocr==3.3.1
- paddlex==3.3.9

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 10:56:29 +08:00

725 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Tool_OCR - Core OCR Service
PaddleOCR and PP-StructureV3 integration for text and structure extraction
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import uuid
from paddleocr import PaddleOCR, PPStructureV3
from PIL import Image
from pdf2image import convert_from_path
import paddle
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
logger = logging.getLogger(__name__)
class OCRService:
"""
Core OCR service using PaddleOCR-VL
Handles text recognition and document structure analysis
"""
def __init__(self):
    """
    Prepare configuration, empty engine caches and the GPU state.

    No OCR model is loaded here: ``self.ocr_engines`` starts empty and
    ``self.structure_engine`` starts as None. GPU detection runs once at
    the end of construction.
    """
    # Values read from the application settings.
    self.ocr_languages = settings.ocr_languages_list
    self.confidence_threshold = settings.ocr_confidence_threshold

    # Per-language PaddleOCR engines, filled on demand.
    self.ocr_engines = dict()

    # PP-Structure engine for layout analysis, created on demand.
    self.structure_engine = None

    # Converter used for Office documents.
    self.office_converter = OfficeConverter()

    # CPU-only defaults; the detection call below may overwrite them.
    self.gpu_available = self.use_gpu = False
    self.gpu_info = dict()
    self._detect_and_configure_gpu()

    logger.info("OCR Service initialized")
def _detect_and_configure_gpu(self):
    """
    Detect whether a CUDA GPU can be used and record the outcome.

    Updates ``self.gpu_available``, ``self.use_gpu`` and ``self.gpu_info``.
    ``self.gpu_info`` always receives an ``'available'`` flag and, whenever
    the GPU is not going to be used, a human-readable ``'reason'``.

    Any exception raised while probing is logged and results in CPU mode;
    nothing is propagated to the caller.
    """
    try:
        # Explicit opt-out via configuration: skip probing entirely.
        if settings.force_cpu_mode:
            logger.info("GPU mode forced to CPU by configuration")
            self.use_gpu = False
            self.gpu_info = {
                'available': False,
                'reason': 'CPU mode forced by configuration',
            }
            return

        if not paddle.is_compiled_with_cuda():
            logger.info("PaddlePaddle not compiled with CUDA support")
            self.gpu_info = {
                'available': False,
                'reason': 'PaddlePaddle not compiled with CUDA',
            }
        else:
            gpu_count = paddle.device.cuda.device_count()
            if gpu_count > 0:
                self.gpu_available = True
                self.use_gpu = True

                # Only accept a configured id that addresses an existing
                # device; negative or too-large ids fall back to device 0.
                configured_id = settings.gpu_device_id
                if 0 <= configured_id < gpu_count:
                    device_id = configured_id
                else:
                    logger.warning(
                        f"Configured GPU device id {configured_id} is out of range "
                        f"(0-{gpu_count - 1}); using device 0"
                    )
                    device_id = 0

                gpu_props = paddle.device.cuda.get_device_properties(device_id)
                self.gpu_info = {
                    'available': True,
                    'device_count': gpu_count,
                    'device_id': device_id,
                    'device_name': gpu_props.name,
                    'total_memory': gpu_props.total_memory,
                    'compute_capability': f"{gpu_props.major}.{gpu_props.minor}",
                }

                try:
                    paddle.device.set_device(f'gpu:{device_id}')
                    logger.info(f"GPU {device_id} selected: {gpu_props.name}")
                    logger.info(f"GPU memory: {gpu_props.total_memory / (1024**3):.2f} GB")
                    logger.info(f"Compute capability: {gpu_props.major}.{gpu_props.minor}")
                    # NOTE: this only reports the configured value; no call in
                    # this method applies the fraction to the device.
                    logger.info(f"GPU memory fraction set to: {settings.gpu_memory_fraction}")
                except Exception as e:
                    # The device exists but could not be selected: run on CPU.
                    logger.warning(f"Failed to configure GPU device: {e}")
                    self.use_gpu = False
                    self.gpu_info['available'] = False
                    self.gpu_info['reason'] = f'GPU configuration failed: {str(e)}'
            else:
                logger.warning("CUDA is available but no GPU devices found")
                self.gpu_info = {
                    'available': False,
                    'reason': 'CUDA compiled but no GPU devices detected',
                }
    except Exception as e:
        logger.error(f"GPU detection failed: {e}")
        self.use_gpu = False
        self.gpu_info = {
            'available': False,
            'reason': f'GPU detection error: {str(e)}',
        }

    # Log final GPU status (not reached on the forced-CPU early return).
    if self.use_gpu:
        logger.info(f"✓ GPU acceleration ENABLED - Using {self.gpu_info.get('device_name', 'Unknown GPU')}")
    else:
        reason = self.gpu_info.get('reason', 'Unknown')
        logger.info(f" GPU acceleration DISABLED - {reason} - Using CPU mode")
def get_gpu_status(self) -> Dict:
    """
    Report the current GPU state.

    Returns:
        A dictionary holding ``gpu_enabled`` and ``gpu_available`` followed
        by every entry of ``self.gpu_info`` (entries with the same key
        override the two flags). When the GPU is both enabled and
        available, memory figures in MB and a utilization percentage are
        added; if querying them fails a warning is logged and the figures
        gathered so far are returned.
    """
    report = dict(gpu_enabled=self.use_gpu, gpu_available=self.gpu_available)
    report.update(self.gpu_info)

    # Memory figures only make sense while the GPU is actually in use.
    if not (self.use_gpu and self.gpu_available):
        return report

    bytes_per_mb = 1024**2
    try:
        device_id = self.gpu_info.get('device_id', 0)
        allocated = paddle.device.cuda.memory_allocated(device_id)
        reserved = paddle.device.cuda.memory_reserved(device_id)
        total = self.gpu_info.get('total_memory', 0)

        report['memory_allocated_mb'] = allocated / bytes_per_mb
        report['memory_reserved_mb'] = reserved / bytes_per_mb
        report['memory_total_mb'] = total / bytes_per_mb
        if total > 0:
            report['memory_utilization'] = allocated / total * 100
        else:
            report['memory_utilization'] = 0
    except Exception as e:
        logger.warning(f"Failed to get GPU memory info: {e}")

    return report
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
    """
    Get or create the OCR engine for the specified language.

    Engines are cached in ``self.ocr_engines`` keyed by language code. When
    creation fails while GPU mode is enabled, ``self.use_gpu`` is switched
    off and creation is retried once on CPU.

    Args:
        lang: Language code (ch, en, japan, korean, etc.)

    Returns:
        PaddleOCR engine instance

    Raises:
        Exception: whatever engine creation raised, when it fails in CPU
            mode (including a failed CPU retry).
    """
    if lang in self.ocr_engines:
        return self.ocr_engines[lang]

    logger.info(f"Initializing PaddleOCR engine for language: {lang} (GPU: {self.use_gpu})")

    # The keyword set is chosen from the installed PaddlePaddle version.
    # NOTE(review): which keywords the constructor accepts is presumably
    # decided by the installed paddleocr package rather than by paddle
    # itself - confirm that use_gpu/gpu_mem are accepted in the 2.x branch.
    paddle_version = paddle.__version__
    is_paddle_3x = paddle_version.startswith('3.')

    def build_engine(use_gpu: bool, with_gpu_mem: bool = True) -> PaddleOCR:
        """Create one engine with the keyword set matching the Paddle version."""
        kwargs = {'use_angle_cls': True, 'lang': lang}
        if is_paddle_3x:
            if use_gpu:
                # Prefer the device id validated during GPU detection so the
                # engine never targets an id that detection already rejected.
                device_id = self.gpu_info.get('device_id')
                if device_id is None:
                    device_id = settings.gpu_device_id
                kwargs['device'] = f"gpu:{device_id}"
            else:
                kwargs['device'] = "cpu"
        else:
            kwargs['use_gpu'] = use_gpu
            if with_gpu_mem:
                kwargs['gpu_mem'] = int(settings.gpu_memory_fraction * 1000) if use_gpu else 500
        return PaddleOCR(**kwargs)

    try:
        self.ocr_engines[lang] = build_engine(self.use_gpu)
        logger.info(f"PaddleOCR engine ready for {lang} (PaddlePaddle {paddle_version}, {'GPU' if self.use_gpu else 'CPU'} mode)")
    except Exception as e:
        # Already on CPU: there is nothing left to fall back to.
        if not self.use_gpu:
            raise
        logger.warning(f"GPU initialization failed, falling back to CPU: {e}")
        # Disable GPU for the whole service so later engines start on CPU.
        self.use_gpu = False
        self.ocr_engines[lang] = build_engine(False, with_gpu_mem=False)
        logger.info(f"PaddleOCR engine ready for {lang} (CPU mode - fallback)")

    return self.ocr_engines[lang]
def get_structure_engine(self) -> PPStructureV3:
    """
    Get or create the PP-StructureV3 engine used for layout analysis.

    The engine is created once and cached in ``self.structure_engine``.
    When creation fails while GPU mode is enabled, creation is retried once
    on CPU; ``self.use_gpu`` itself is left unchanged by this method.

    Returns:
        PPStructureV3 engine instance

    Raises:
        Exception: whatever engine creation raised, when it fails in CPU
            mode (including a failed CPU retry).
    """
    if self.structure_engine is not None:
        return self.structure_engine

    logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")

    # The keyword set is chosen from the installed PaddlePaddle version.
    # NOTE(review): which keywords the constructor accepts is presumably
    # decided by the installed paddleocr package rather than by paddle
    # itself - confirm that use_gpu/gpu_mem are accepted in the 2.x branch.
    paddle_version = paddle.__version__
    is_paddle_3x = paddle_version.startswith('3.')

    def build_engine(use_gpu: bool, with_gpu_mem: bool = True) -> PPStructureV3:
        """Create the engine with the keyword set matching the Paddle version."""
        kwargs = {
            'use_doc_orientation_classify': False,
            'use_doc_unwarping': False,
            'use_textline_orientation': False,
            'use_table_recognition': True,
            'use_formula_recognition': True,
            'layout_threshold': 0.5,
        }
        if is_paddle_3x:
            if use_gpu:
                # Prefer the device id validated during GPU detection so the
                # engine never targets an id that detection already rejected.
                device_id = self.gpu_info.get('device_id')
                if device_id is None:
                    device_id = settings.gpu_device_id
                kwargs['device'] = f"gpu:{device_id}"
            else:
                kwargs['device'] = "cpu"
        else:
            kwargs['use_gpu'] = use_gpu
            if with_gpu_mem:
                kwargs['gpu_mem'] = int(settings.gpu_memory_fraction * 1000) if use_gpu else 500
        return PPStructureV3(**kwargs)

    try:
        self.structure_engine = build_engine(self.use_gpu)
        logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle_version}, {'GPU' if self.use_gpu else 'CPU'} mode)")
    except Exception as e:
        # Already on CPU: there is nothing left to fall back to.
        if not self.use_gpu:
            raise
        logger.warning(f"GPU initialization failed for PP-Structure, falling back to CPU: {e}")
        self.structure_engine = build_engine(False, with_gpu_mem=False)
        logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")

    return self.structure_engine
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
    """
    Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory that receives the page images (created,
            including parents, when missing)

    Returns:
        Paths of the written images in page order, named
        ``<pdf stem>_page_<n>.png`` with ``n`` starting at 1

    Raises:
        Exception: any error raised during conversion or saving is logged
            and re-raised unchanged.
    """
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Converting PDF {pdf_path.name} to images")

        # Rendered at 300 DPI.
        rendered_pages = convert_from_path(str(pdf_path), dpi=300, fmt='png')

        saved_paths = []
        for page_number, page in enumerate(rendered_pages, start=1):
            target = output_dir / f"{pdf_path.stem}_page_{page_number}.png"
            page.save(str(target), 'PNG')
            saved_paths.append(target)
            logger.info(f"Saved page {page_number} to {target.name}")

        logger.info(f"Converted {len(saved_paths)} pages from PDF")
        return saved_paths
    except Exception as e:
        logger.error(f"PDF conversion error: {str(e)}")
        raise
def process_image(
    self,
    image_path: Path,
    lang: str = 'ch',
    detect_layout: bool = True,
    confidence_threshold: Optional[float] = None
) -> Dict:
    """
    Run OCR (and optionally layout analysis) on an image, PDF or Office file.

    Office documents are first converted to PDF. PDFs are rendered to one
    image per page, each page is processed by a recursive call and the
    per-page results are merged into a single result.

    Args:
        image_path: Path to the input file
        lang: Language for OCR
        detect_layout: Whether to perform layout analysis
        confidence_threshold: Minimum confidence for a text region to be
            kept (``self.confidence_threshold`` is used if None)

    Returns:
        On success a dictionary with ``status='success'`` and the keys
        ``file_name``, ``language``, ``text_regions``,
        ``total_text_regions``, ``average_confidence``, ``layout_data``,
        ``images_metadata``, ``markdown_content``, ``processing_time`` and
        ``timestamp``; PDF results additionally carry ``total_pages``.
        When processing raises, the error is logged and a dictionary with
        ``status='error'``, ``file_name``, ``error_message`` and
        ``processing_time`` is returned instead.
    """
    start_time = datetime.now()
    threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold

    try:
        # Office documents are converted to PDF and then follow the PDF path.
        if self.office_converter.is_office_document(image_path):
            logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
            try:
                pdf_path = self.office_converter.convert_to_pdf(image_path)
                logger.info(f"Office document converted to PDF: {pdf_path.name}")
                image_path = pdf_path
            except OfficeConverterError as e:
                logger.error(f"Office conversion failed: {str(e)}")
                raise

        is_pdf = image_path.suffix.lower() == '.pdf'
        if is_pdf:
            logger.info(f"Detected PDF file: {image_path.name}, converting to images")
            pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
            image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)

            all_text_regions = []
            total_confidence_sum = 0.0
            total_valid_regions = 0
            all_layout_data = []
            all_images_metadata = []

            for page_num, page_image_path in enumerate(image_paths, 1):
                logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
                page_result = self.process_image(
                    page_image_path,
                    lang=lang,
                    detect_layout=detect_layout,
                    confidence_threshold=confidence_threshold
                )

                if page_result['status'] != 'success':
                    # Make the gap visible: the merged result is still
                    # reported as a success without this page.
                    logger.warning(
                        f"Skipping PDF page {page_num}/{len(image_paths)}: "
                        f"{page_result.get('error_message', 'unknown error')}"
                    )
                    continue

                # Tag each region with its 1-based page number.
                for region in page_result['text_regions']:
                    region['page'] = page_num
                    all_text_regions.append(region)

                # Weight the page average by its region count so the overall
                # average is per region, not per page.
                total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
                total_valid_regions += page_result['total_text_regions']

                if page_result.get('layout_data'):
                    all_layout_data.append(page_result['layout_data'])
                if page_result.get('images_metadata'):
                    all_images_metadata.extend(page_result['images_metadata'])

            avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0

            # Combine layout data from all pages.
            # NOTE(review): element ids and page values are taken over
            # unchanged from the per-page results, so they are not unique
            # across pages - confirm whether consumers rely on them.
            combined_layout = None
            if all_layout_data:
                combined_elements = []
                for layout in all_layout_data:
                    if layout.get('elements'):
                        combined_elements.extend(layout['elements'])
                if combined_elements:
                    combined_layout = {
                        'elements': combined_elements,
                        'total_elements': len(combined_elements),
                        'reading_order': list(range(len(combined_elements))),
                    }

            markdown_content = self.generate_markdown(all_text_regions, combined_layout)
            processing_time = (datetime.now() - start_time).total_seconds()

            logger.info(
                f"PDF processing completed: {image_path.name} - "
                f"{len(image_paths)} pages, "
                f"{len(all_text_regions)} regions, "
                f"{avg_confidence:.2f} avg confidence, "
                f"{processing_time:.2f}s"
            )

            return {
                'status': 'success',
                'file_name': image_path.name,
                'language': lang,
                'text_regions': all_text_regions,
                'total_text_regions': len(all_text_regions),
                'average_confidence': avg_confidence,
                'layout_data': combined_layout,
                'images_metadata': all_images_metadata,
                'markdown_content': markdown_content,
                'processing_time': processing_time,
                'timestamp': datetime.utcnow().isoformat(),
                'total_pages': len(image_paths),
            }

        # Plain image: run the OCR engine directly.
        ocr_engine = self.get_ocr_engine(lang)
        logger.info(f"Processing image: {image_path.name}")
        ocr_results = ocr_engine.ocr(str(image_path))

        text_regions = []
        total_confidence = 0.0
        valid_regions = 0

        if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
            for page_result in ocr_results:
                # Only dict-shaped page results with the keys
                # rec_texts / rec_scores / rec_polys are understood here.
                if not isinstance(page_result, dict):
                    continue
                texts = page_result.get('rec_texts', [])
                scores = page_result.get('rec_scores', [])
                polys = page_result.get('rec_polys', [])

                for idx, text in enumerate(texts):
                    # Missing score defaults to full confidence, missing
                    # polygon to an empty bbox.
                    raw_confidence = scores[idx] if idx < len(scores) else 1.0
                    bbox = polys[idx] if idx < len(polys) else []

                    # Array-like bboxes become plain lists for JSON output.
                    if hasattr(bbox, 'tolist'):
                        bbox = bbox.tolist()

                    if raw_confidence >= threshold:
                        # Convert once to a builtin float so neither the
                        # region nor the running average carries a
                        # non-JSON-serializable scalar type.
                        confidence = float(raw_confidence)
                        text_regions.append({
                            'text': text,
                            'bbox': bbox,
                            'confidence': confidence,
                        })
                        total_confidence += confidence
                        valid_regions += 1

        avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
        logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")

        # Layout analysis (if requested)
        layout_data = None
        images_metadata = []
        if detect_layout:
            layout_data, images_metadata = self.analyze_layout(image_path)

        markdown_content = self.generate_markdown(text_regions, layout_data)
        processing_time = (datetime.now() - start_time).total_seconds()

        result = {
            'status': 'success',
            'file_name': image_path.name,
            'language': lang,
            'text_regions': text_regions,
            'total_text_regions': len(text_regions),
            'average_confidence': avg_confidence,
            'layout_data': layout_data,
            'images_metadata': images_metadata,
            'markdown_content': markdown_content,
            'processing_time': processing_time,
            'timestamp': datetime.utcnow().isoformat(),
        }

        logger.info(
            f"OCR completed: {image_path.name} - "
            f"{len(text_regions)} regions, "
            f"{avg_confidence:.2f} avg confidence, "
            f"{processing_time:.2f}s"
        )
        return result

    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
        return {
            'status': 'error',
            'file_name': image_path.name,
            'error_message': str(e),
            'processing_time': (datetime.now() - start_time).total_seconds(),
        }
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
    """
    Run the PP-StructureV3 engine on an image and summarize its output.

    For every page result exposing a dict-valued ``markdown`` attribute,
    the ``markdown_texts`` value (if non-empty) becomes one layout element
    typed ``'table'`` when it contains ``<table`` and ``'text'`` otherwise,
    and every key of ``markdown_images`` becomes one image metadata entry.

    Args:
        image_path: Path to image file

    Returns:
        Tuple of (layout_data, images_metadata). ``(None, [])`` is returned
        when no layout element was produced or when analysis raised; in the
        latter case the error and traceback are logged.
    """
    try:
        engine = self.get_structure_engine()
        logger.info(f"Running layout analysis on {image_path.name}")
        page_results = engine.predict(str(image_path))

        layout_elements = []
        images_metadata = []

        for page_idx, page_result in enumerate(page_results):
            if not hasattr(page_result, 'markdown'):
                continue
            markdown_dict = page_result.markdown
            logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")
            if not isinstance(markdown_dict, dict):
                continue

            markdown_texts = markdown_dict.get('markdown_texts', '')
            markdown_images = markdown_dict.get('markdown_images', {})

            if markdown_texts:
                # One element per page; no per-element bbox is recorded.
                is_table = '<table' in markdown_texts.lower()
                layout_elements.append({
                    'element_id': len(layout_elements),
                    'type': 'table' if is_table else 'text',
                    'content': markdown_texts,
                    'page': page_idx,
                    'bbox': [],
                })

            # Image ids continue after the layout elements collected so far.
            for img_idx, (img_path, _img_obj) in enumerate(markdown_images.items()):
                images_metadata.append({
                    'element_id': len(layout_elements) + img_idx,
                    'image_path': img_path,
                    'type': 'image',
                    'page': page_idx,
                    'bbox': [],
                })

        if not layout_elements:
            logger.warning("No layout elements detected")
            return None, []

        layout_data = {
            'elements': layout_elements,
            'total_elements': len(layout_elements),
            'reading_order': list(range(len(layout_elements))),
        }
        logger.info(f"Detected {len(layout_elements)} layout elements")
        return layout_data, images_metadata

    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
        return None, []
def generate_markdown(
    self,
    text_regions: List[Dict],
    layout_data: Optional[Dict] = None
) -> str:
    """
    Generate Markdown from OCR results.

    When ``layout_data`` contains elements, they are rendered in order:
    ``title`` as a level-1 heading, ``table`` as its raw content followed
    by a blank line, ``figure`` as an image link built from ``element_id``
    and anything else as a plain paragraph. Otherwise the text regions are
    emitted one per line, ordered by page (regions without a ``page`` key
    count as page 0) and then top to bottom by the first Y coordinate of
    their bbox.

    Args:
        text_regions: List of text regions with bbox and text
        layout_data: Optional layout structure information

    Returns:
        Markdown formatted string
    """
    markdown_lines = []

    if layout_data and layout_data.get('elements'):
        for element in layout_data['elements']:
            element_type = element.get('type', 'text')
            content = element.get('content', '')
            if element_type == 'title':
                markdown_lines.append(f"# {content}\n")
            elif element_type == 'table':
                # Table content is passed through as-is.
                markdown_lines.append(content)
                markdown_lines.append("")
            elif element_type == 'figure':
                element_id = element.get('element_id')
                markdown_lines.append(f"![Figure {element_id}](./images/img_{element_id}.jpg)\n")
            else:
                markdown_lines.append(f"{content}\n")
    else:
        def get_y_coord(region):
            """Safely extract the first Y coordinate from a region's bbox."""
            bbox = region.get('bbox', [])
            if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
                if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
                    return bbox[0][1]  # [[x1,y1], [x2,y2], ...] format
                elif len(bbox) > 1:
                    return bbox[1]  # [x1, y1, x2, y2, ...] format
            return 0  # Default to 0 if it cannot be extracted

        def reading_order(region):
            # Page first: Y coordinates restart on every page, so sorting by
            # Y alone would interleave the lines of different pages.
            return (region.get('page') or 0, get_y_coord(region))

        markdown_lines.extend(
            region['text'] for region in sorted(text_regions, key=reading_order)
        )

    return "\n".join(markdown_lines)
def save_results(
    self,
    result: Dict,
    output_dir: Path,
    file_id: str
) -> Tuple[Optional[Path], Optional[Path]]:
    """
    Save OCR results to a JSON file and a Markdown file.

    Writes ``<file_id>_result.json`` (the whole result) and
    ``<file_id>_output.md`` (the ``markdown_content`` entry, or an empty
    file when that entry is missing or None) into ``output_dir``, which is
    created if necessary.

    Args:
        result: OCR result dictionary
        output_dir: Output directory
        file_id: Unique file identifier

    Returns:
        Tuple of (json_path, markdown_path), or ``(None, None)`` when any
        step fails; the error is logged and not propagated.
    """
    def to_builtin(obj):
        # Values exposing tolist() (e.g. NumPy arrays and scalars) are
        # written as their plain Python equivalent; anything else is
        # rejected exactly as json would reject it by default.
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    try:
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_dir / f"{file_id}_result.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2, default=to_builtin)

        # Save Markdown; "or ''" also covers an explicit None value, which
        # f.write() would otherwise reject.
        markdown_path = output_dir / f"{file_id}_output.md"
        markdown_content = result.get('markdown_content') or ''
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
        return json_path, markdown_path
    except Exception as e:
        logger.error(f"Error saving results: {str(e)}")
        return None, None