diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py
index 0e24d7f..9b550f8 100644
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -1,12 +1,12 @@
 """
-Tool_OCR - Core OCR Service
-PaddleOCR-VL integration for text and structure extraction
+Tool_OCR - Core OCR Service with Dual-track Processing
+Supports both PaddleOCR (for scanned documents) and direct extraction (for editable PDFs)
 """
 
 import json
 import logging
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime
 import uuid
 
@@ -18,6 +18,20 @@ import paddle
 from app.core.config import settings
 from app.services.office_converter import OfficeConverter, OfficeConverterError
 
 logger = logging.getLogger(__name__)
 
+# Import dual-track components (logger is defined above so the except branch can log)
+try:
+    from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
+    from app.services.direct_extraction_engine import DirectExtractionEngine
+    from app.models.unified_document import (
+        UnifiedDocument, UnifiedDocumentConverter, DocumentMetadata,
+        ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
+        BoundingBox
+    )
+    DUAL_TRACK_AVAILABLE = True
+except ImportError as e:
+    logger.warning(f"Dual-track components not available: {e}")
+    DUAL_TRACK_AVAILABLE = False
+
 
@@ -28,7 +42,7 @@ class OCRService:
     """
 
     def __init__(self):
-        """Initialize PaddleOCR and PPStructure engines with GPU detection"""
+        """Initialize PaddleOCR and PPStructure engines with GPU detection and dual-track support"""
         self.ocr_languages = settings.ocr_languages_list
         self.confidence_threshold = settings.ocr_confidence_threshold
@@ -41,6 +55,25 @@ class OCRService:
         # Initialize Office document converter
         self.office_converter = OfficeConverter()
 
+        # Initialize dual-track components if available
+        if DUAL_TRACK_AVAILABLE:
+            self.document_detector = DocumentTypeDetector(
+                min_text_length=100,
+                sample_pages=3,
+                text_coverage_threshold=0.9
+            )
+            self.direct_extraction_engine = DirectExtractionEngine(
+                enable_table_detection=True,
+                enable_image_extraction=True
+            )
+            self.dual_track_enabled = True
+            logger.info("Dual-track processing enabled")
+        else:
+            self.document_detector = None
+            self.direct_extraction_engine = None
+            self.dual_track_enabled = False
+            logger.info("Dual-track processing not available, using OCR-only mode")
+
         # GPU Detection and Configuration
         self.gpu_available = False
         self.use_gpu = False
@@ -765,9 +798,301 @@ class OCRService:
 
         return "\n".join(markdown_lines)
 
+    def process_with_dual_track(
+        self,
+        file_path: Path,
+        lang: str = 'ch',
+        detect_layout: bool = True,
+        confidence_threshold: Optional[float] = None,
+        output_dir: Optional[Path] = None,
+        force_track: Optional[str] = None
+    ) -> Union[UnifiedDocument, Dict]:
+        """
+        Process document using dual-track approach.
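+
+        Routing follows the DocumentTypeDetector recommendation: editable PDFs
+        go to the direct-extraction track (PyMuPDF), while scanned documents
+        and images go to the OCR track (PaddleOCR).
+
+        Example (illustrative sketch only; the file name is hypothetical and
+        the dual-track components imported above must be installed)::
+
+            service = OCRService()
+            doc = service.process_with_dual_track(Path("manual.pdf"))
+            if isinstance(doc, UnifiedDocument):
+                print(doc.metadata.processing_track,
+                      doc.metadata.processing_time)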
+
+        Args:
+            file_path: Path to document file
+            lang: Language for OCR (if needed)
+            detect_layout: Whether to perform layout analysis
+            confidence_threshold: Minimum confidence threshold
+            output_dir: Optional output directory for extracted images
+            force_track: Force specific track ("ocr" or "direct"), None for auto-detection
+
+        Returns:
+            UnifiedDocument if dual-track is enabled, Dict otherwise
+        """
+        if not self.dual_track_enabled:
+            # Fall back to traditional OCR processing
+            return self.process_file_traditional(
+                file_path, lang, detect_layout, confidence_threshold, output_dir
+            )
+
+        start_time = datetime.now()
+        document_id = str(uuid.uuid4())
+
+        try:
+            # Detect document type and recommend a processing track
+            if force_track:
+                logger.info(f"Forced to use {force_track} track")
+                recommendation = ProcessingTrackRecommendation(
+                    track=force_track,
+                    confidence=1.0,
+                    reason="Forced by user",
+                    document_type=None
+                )
+            else:
+                recommendation = self.document_detector.detect(file_path)
+                logger.info(f"Recommended track: {recommendation.track} (confidence: {recommendation.confidence:.2f})")
+                logger.info(f"Reason: {recommendation.reason}")
+
+            # Route to the appropriate processing track
+            if recommendation.track == "direct":
+                # Use direct extraction for editable PDFs
+                logger.info("Using DIRECT extraction track (PyMuPDF)")
+                unified_doc = self.direct_extraction_engine.extract(file_path, output_dir)
+                unified_doc.document_id = document_id
+            else:
+                # Use OCR for scanned documents, images, etc.
+                logger.info("Using OCR track (PaddleOCR)")
+                ocr_result = self.process_file_traditional(
+                    file_path, lang, detect_layout, confidence_threshold, output_dir
+                )
+
+                # Convert OCR result to UnifiedDocument
+                metadata = DocumentMetadata(
+                    filename=file_path.name,
+                    file_type=file_path.suffix,
+                    file_size=file_path.stat().st_size,
+                    created_at=start_time,
+                    processing_track=ProcessingTrack.OCR,
+                    processing_time=(datetime.now() - start_time).total_seconds(),
+                    language=lang
+                )
+
+                unified_doc = UnifiedDocumentConverter.from_ocr_result(
+                    ocr_result, document_id, metadata
+                )
+
+            # Update processing track metadata
+            unified_doc.metadata.processing_track = (
+                ProcessingTrack.DIRECT if recommendation.track == "direct"
+                else ProcessingTrack.OCR
+            )
+
+            # Calculate total processing time
+            processing_time = (datetime.now() - start_time).total_seconds()
+            unified_doc.metadata.processing_time = processing_time
+
+            logger.info(f"Document processing completed in {processing_time:.2f}s using {recommendation.track} track")
+
+            return unified_doc
+
+        except Exception as e:
+            logger.error(f"Error in dual-track processing: {e}")
+            # Fall back to traditional OCR
+            return self.process_file_traditional(
+                file_path, lang, detect_layout, confidence_threshold, output_dir
+            )
+
+    def process_file_traditional(
+        self,
+        file_path: Path,
+        lang: str = 'ch',
+        detect_layout: bool = True,
+        confidence_threshold: Optional[float] = None,
+        output_dir: Optional[Path] = None
+    ) -> Dict:
+        """
+        Traditional OCR processing (legacy method).
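+
+        Sketch of the legacy result shape (keys as produced by
+        _combine_results below; the values shown are illustrative only)::
+
+            {
+                'status': 'success',
+                'filename': 'scan.pdf',
+                'text_regions': [
+                    {'text': '...', 'bbox': [...], 'confidence': 0.97, 'page': 1},
+                ],
+                'total_text_regions': 42,
+                'average_confidence': 0.95,
+                'layout_data': {'elements': [...]},
+                'images_metadata': [],
+                'markdown_content': '...',
+            }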
+ + Args: + file_path: Path to file + lang: Language for OCR + detect_layout: Whether to perform layout analysis + confidence_threshold: Minimum confidence threshold + output_dir: Optional output directory + + Returns: + Dictionary with OCR results in legacy format + """ + # Check if it's a PDF that needs conversion + if file_path.suffix.lower() == '.pdf': + image_paths = self.convert_pdf_to_images(file_path, output_dir or file_path.parent) + + # Process multiple pages + all_results = [] + for i, image_path in enumerate(image_paths): + result = self.process_image( + image_path, lang, detect_layout, confidence_threshold, output_dir, i + ) + all_results.append(result) + + # Combine results + combined_result = self._combine_results(all_results) + combined_result['filename'] = file_path.name + return combined_result + + else: + # Single image or other file + return self.process_image( + file_path, lang, detect_layout, confidence_threshold, output_dir, 0 + ) + + def _combine_results(self, results: List[Dict]) -> Dict: + """Combine multiple OCR results into one""" + if not results: + return {'status': 'error', 'error': 'No results to combine'} + + combined = { + 'status': 'success', + 'text_regions': [], + 'total_text_regions': 0, + 'average_confidence': 0.0, + 'processing_time': 0.0, + 'pages': [], + 'layout_data': {'elements': []}, + 'images_metadata': [] + } + + total_confidence = 0.0 + total_regions = 0 + + for page_num, result in enumerate(results): + if result['status'] == 'success': + # Add page number to text regions + for region in result.get('text_regions', []): + region['page'] = page_num + 1 + combined['text_regions'].append(region) + + # Accumulate statistics + total_regions += result.get('total_text_regions', 0) + total_confidence += result.get('average_confidence', 0) * result.get('total_text_regions', 0) + combined['processing_time'] += result.get('processing_time', 0) + + # Collect layout data + if result.get('layout_data'): + for elem in result['layout_data'].get('elements', []): + elem['page'] = page_num + combined['layout_data']['elements'].append(elem) + + # Collect images metadata + for img in result.get('images_metadata', []): + img['page'] = page_num + combined['images_metadata'].append(img) + + # Store page data + combined['pages'].append(result) + + combined['total_text_regions'] = total_regions + combined['average_confidence'] = total_confidence / total_regions if total_regions > 0 else 0.0 + combined['language'] = results[0].get('language', 'ch') if results else 'ch' + combined['gpu_used'] = results[0].get('gpu_used', False) if results else False + + # Generate markdown + combined['markdown_content'] = self.generate_markdown( + combined['text_regions'], combined['layout_data'] + ) + + return combined + + def process( + self, + file_path: Path, + lang: str = 'ch', + detect_layout: bool = True, + confidence_threshold: Optional[float] = None, + output_dir: Optional[Path] = None, + use_dual_track: bool = True, + force_track: Optional[str] = None + ) -> Union[UnifiedDocument, Dict]: + """ + Main processing method with dual-track support. 
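+
+        Dispatches to process_with_dual_track() when use_dual_track is True
+        and dual-track support is available; otherwise falls back to
+        process_file_traditional().
+
+        Example (illustrative; the paths are hypothetical)::
+
+            # Let the detector pick the track automatically
+            result = service.process(Path("input.pdf"))
+
+            # Force the OCR track regardless of the detector's recommendation
+            result = service.process(Path("input.pdf"), force_track="ocr")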
+ + Args: + file_path: Path to document file + lang: Language for OCR + detect_layout: Whether to perform layout analysis + confidence_threshold: Minimum confidence threshold + output_dir: Optional output directory + use_dual_track: Whether to use dual-track processing (default True) + force_track: Force specific track ("ocr" or "direct") + + Returns: + UnifiedDocument if dual-track is enabled and use_dual_track=True, + Dict with legacy format otherwise + """ + if use_dual_track and self.dual_track_enabled: + # Use dual-track processing + return self.process_with_dual_track( + file_path, lang, detect_layout, confidence_threshold, output_dir, force_track + ) + else: + # Use traditional OCR processing + return self.process_file_traditional( + file_path, lang, detect_layout, confidence_threshold, output_dir + ) + + def process_legacy( + self, + file_path: Path, + lang: str = 'ch', + detect_layout: bool = True, + confidence_threshold: Optional[float] = None, + output_dir: Optional[Path] = None + ) -> Dict: + """ + Legacy processing method that always returns Dict format. + Kept for backward compatibility. + + Args: + file_path: Path to document file + lang: Language for OCR + detect_layout: Whether to perform layout analysis + confidence_threshold: Minimum confidence threshold + output_dir: Optional output directory + + Returns: + Dictionary with OCR results in legacy format + """ + if self.dual_track_enabled: + # Use dual-track but convert to legacy format + result = self.process_with_dual_track( + file_path, lang, detect_layout, confidence_threshold, output_dir + ) + + # Convert UnifiedDocument to legacy format if needed + if isinstance(result, UnifiedDocument): + return result.to_legacy_format() + else: + return result + else: + # Use traditional processing + return self.process_file_traditional( + file_path, lang, detect_layout, confidence_threshold, output_dir + ) + + def get_track_recommendation(self, file_path: Path) -> Optional[ProcessingTrackRecommendation]: + """ + Get processing track recommendation for a file. 
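+
+        Example (illustrative)::
+
+            rec = service.get_track_recommendation(Path("report.pdf"))
+            if rec is not None:
+                print(rec.track, rec.confidence, rec.reason)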
+ + Args: + file_path: Path to document file + + Returns: + ProcessingTrackRecommendation if dual-track is enabled, None otherwise + """ + if not self.dual_track_enabled: + return None + + try: + return self.document_detector.detect(file_path) + except Exception as e: + logger.error(f"Error getting track recommendation: {e}") + return None + def save_results( self, - result: Dict, + result: Union[UnifiedDocument, Dict], output_dir: Path, file_id: str, source_file_path: Optional[Path] = None @@ -776,7 +1101,7 @@ class OCRService: Save OCR results to JSON, Markdown, and layout-preserving PDF files Args: - result: OCR result dictionary + result: OCR result (UnifiedDocument or dictionary) output_dir: Output directory file_id: Unique file identifier source_file_path: Optional path to original source file for PDF generation @@ -787,14 +1112,24 @@ class OCRService: try: output_dir.mkdir(parents=True, exist_ok=True) - # Save JSON + # Convert UnifiedDocument to dict if needed + if isinstance(result, UnifiedDocument): + result_dict = result.to_dict() + legacy_result = result.to_legacy_format() + markdown_content = result.extract_all_text() + else: + result_dict = result + legacy_result = result + markdown_content = result.get('markdown_content', '') + + # Save JSON (use dict format for compatibility) json_path = output_dir / f"{file_id}_result.json" with open(json_path, 'w', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=2) + json.dump(result_dict if isinstance(result, UnifiedDocument) else result, + f, ensure_ascii=False, indent=2) # Save Markdown markdown_path = output_dir / f"{file_id}_output.md" - markdown_content = result.get('markdown_content', '') with open(markdown_path, 'w', encoding='utf-8') as f: f.write(markdown_content) diff --git a/backend/app/services/ocr_service_original.py b/backend/app/services/ocr_service_original.py new file mode 100644 index 0000000..0e24d7f --- /dev/null +++ b/backend/app/services/ocr_service_original.py @@ -0,0 +1,835 @@ +""" +Tool_OCR - Core OCR Service +PaddleOCR-VL integration for text and structure extraction +""" + +import json +import logging +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from datetime import datetime +import uuid + +from paddleocr import PaddleOCR, PPStructureV3 +from PIL import Image +from pdf2image import convert_from_path +import paddle + +from app.core.config import settings +from app.services.office_converter import OfficeConverter, OfficeConverterError + +logger = logging.getLogger(__name__) + + +class OCRService: + """ + Core OCR service using PaddleOCR-VL + Handles text recognition and document structure analysis + """ + + def __init__(self): + """Initialize PaddleOCR and PPStructure engines with GPU detection""" + self.ocr_languages = settings.ocr_languages_list + self.confidence_threshold = settings.ocr_confidence_threshold + + # Initialize PaddleOCR engine (will be lazy-loaded per language) + self.ocr_engines = {} + + # Initialize PP-Structure for layout analysis + self.structure_engine = None + + # Initialize Office document converter + self.office_converter = OfficeConverter() + + # GPU Detection and Configuration + self.gpu_available = False + self.use_gpu = False + self.gpu_info = {} + + self._detect_and_configure_gpu() + + logger.info("OCR Service initialized") + + def _detect_and_configure_gpu(self): + """Detect GPU availability and configure usage""" + try: + # Check if forced CPU mode + if settings.force_cpu_mode: + logger.info("GPU mode forced to CPU by 
configuration") + self.use_gpu = False + self.gpu_info = { + 'available': False, + 'reason': 'CPU mode forced by configuration', + } + return + + # Check if PaddlePaddle is compiled with CUDA + if paddle.is_compiled_with_cuda(): + # Check if GPU devices are available + gpu_count = paddle.device.cuda.device_count() + + if gpu_count > 0: + self.gpu_available = True + self.use_gpu = True + + # Get GPU device information + device_id = settings.gpu_device_id if settings.gpu_device_id < gpu_count else 0 + gpu_props = paddle.device.cuda.get_device_properties(device_id) + + self.gpu_info = { + 'available': True, + 'device_count': gpu_count, + 'device_id': device_id, + 'device_name': gpu_props.name, + 'total_memory': gpu_props.total_memory, + 'compute_capability': f"{gpu_props.major}.{gpu_props.minor}", + } + + # Set GPU memory fraction + try: + paddle.device.set_device(f'gpu:{device_id}') + logger.info(f"GPU {device_id} selected: {gpu_props.name}") + logger.info(f"GPU memory: {gpu_props.total_memory / (1024**3):.2f} GB") + logger.info(f"Compute capability: {gpu_props.major}.{gpu_props.minor}") + logger.info(f"GPU memory fraction set to: {settings.gpu_memory_fraction}") + except Exception as e: + logger.warning(f"Failed to configure GPU device: {e}") + self.use_gpu = False + self.gpu_info['available'] = False + self.gpu_info['reason'] = f'GPU configuration failed: {str(e)}' + else: + logger.warning("CUDA is available but no GPU devices found") + self.gpu_info = { + 'available': False, + 'reason': 'CUDA compiled but no GPU devices detected', + } + else: + logger.info("PaddlePaddle not compiled with CUDA support") + self.gpu_info = { + 'available': False, + 'reason': 'PaddlePaddle not compiled with CUDA', + } + + except Exception as e: + logger.error(f"GPU detection failed: {e}") + self.use_gpu = False + self.gpu_info = { + 'available': False, + 'reason': f'GPU detection error: {str(e)}', + } + + # Log final GPU status + if self.use_gpu: + logger.info(f"✓ GPU acceleration ENABLED - Using {self.gpu_info.get('device_name', 'Unknown GPU')}") + else: + reason = self.gpu_info.get('reason', 'Unknown') + logger.info(f"ℹ GPU acceleration DISABLED - {reason} - Using CPU mode") + + def get_gpu_status(self) -> Dict: + """ + Get current GPU status and information + + Returns: + Dictionary with GPU status information + """ + status = { + 'gpu_enabled': self.use_gpu, + 'gpu_available': self.gpu_available, + **self.gpu_info, + } + + # Add current GPU memory usage if GPU is being used + if self.use_gpu and self.gpu_available: + try: + device_id = self.gpu_info.get('device_id', 0) + # Get memory info (returns allocated, total in bytes) + memory_allocated = paddle.device.cuda.memory_allocated(device_id) + memory_reserved = paddle.device.cuda.memory_reserved(device_id) + total_memory = self.gpu_info.get('total_memory', 0) + + status['memory_allocated_mb'] = memory_allocated / (1024**2) + status['memory_reserved_mb'] = memory_reserved / (1024**2) + status['memory_total_mb'] = total_memory / (1024**2) + status['memory_utilization'] = (memory_allocated / total_memory * 100) if total_memory > 0 else 0 + except Exception as e: + logger.warning(f"Failed to get GPU memory info: {e}") + + return status + + def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR: + """ + Get or create OCR engine for specified language with GPU support + + Args: + lang: Language code (ch, en, japan, korean, etc.) 
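+                  Engine instances are created lazily and cached in
+                  self.ocr_engines, so repeated calls with the same
+                  language reuse one engine.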
+ + Returns: + PaddleOCR engine instance + """ + if lang not in self.ocr_engines: + logger.info(f"Initializing PaddleOCR engine for language: {lang} (GPU: {self.use_gpu})") + + try: + # PaddleOCR 3.x: Device is set globally via paddle.set_device() + # No need to pass device/use_gpu/gpu_mem parameters + self.ocr_engines[lang] = PaddleOCR( + lang=lang, + use_textline_orientation=True, # Replaces deprecated use_angle_cls + ) + logger.info(f"PaddleOCR engine ready for {lang} (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)") + + except Exception as e: + # If GPU initialization fails, fall back to CPU + if self.use_gpu: + logger.warning(f"GPU initialization failed, falling back to CPU: {e}") + self.use_gpu = False + # Switch to CPU device globally + paddle.set_device('cpu') + self.ocr_engines[lang] = PaddleOCR( + lang=lang, + use_textline_orientation=True, + ) + logger.info(f"PaddleOCR engine ready for {lang} (CPU mode - fallback)") + else: + raise + + return self.ocr_engines[lang] + + def get_structure_engine(self) -> PPStructureV3: + """ + Get or create PP-Structure engine for layout analysis with GPU support + + Returns: + PPStructure engine instance + """ + if self.structure_engine is None: + logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})") + + try: + # PaddleOCR 3.x: Device is set globally via paddle.set_device() + # No need to pass device/use_gpu/gpu_mem parameters + self.structure_engine = PPStructureV3( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False, + use_table_recognition=True, + use_formula_recognition=True, + use_chart_recognition=True, # Enable chart recognition (requires PaddlePaddle >= 3.2.0 for fused_rms_norm_ext) + layout_threshold=0.5, + ) + logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)") + + except Exception as e: + # If GPU initialization fails, fall back to CPU + if self.use_gpu: + logger.warning(f"GPU initialization failed for PP-Structure, falling back to CPU: {e}") + self.use_gpu = False + # Switch to CPU device globally + paddle.set_device('cpu') + self.structure_engine = PPStructureV3( + use_doc_orientation_classify=False, + use_doc_unwarping=False, + use_textline_orientation=False, + use_table_recognition=True, + use_formula_recognition=True, + use_chart_recognition=True, # Enable chart recognition (CPU fallback mode) + layout_threshold=0.5, + ) + logger.info("PP-StructureV3 engine ready (CPU mode - fallback)") + else: + raise + + return self.structure_engine + + def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]: + """ + Convert PDF to images (one per page) + + Args: + pdf_path: Path to PDF file + output_dir: Directory to save converted images + + Returns: + List of paths to converted images + """ + try: + output_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"Converting PDF {pdf_path.name} to images") + + # Convert PDF to images (300 DPI for good quality) + images = convert_from_path( + str(pdf_path), + dpi=300, + fmt='png' + ) + + image_paths = [] + for i, image in enumerate(images): + # Save each page as PNG + image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png" + image.save(str(image_path), 'PNG') + image_paths.append(image_path) + logger.info(f"Saved page {i+1} to {image_path.name}") + + logger.info(f"Converted {len(image_paths)} pages from PDF") + return image_paths + + except Exception as e: + logger.error(f"PDF conversion error: {str(e)}") + 
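+            # Re-raise so callers (e.g. process_image's PDF branch) can report
+            # the failure in their own error handling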
raise + + def process_image( + self, + image_path: Path, + lang: str = 'ch', + detect_layout: bool = True, + confidence_threshold: Optional[float] = None, + output_dir: Optional[Path] = None, + current_page: int = 0 + ) -> Dict: + """ + Process single image with OCR and layout analysis + + Args: + image_path: Path to image file + lang: Language for OCR + detect_layout: Whether to perform layout analysis + confidence_threshold: Minimum confidence threshold (uses default if None) + output_dir: Optional output directory for saving extracted images + current_page: Current page number (0-based) for multi-page documents + + Returns: + Dictionary with OCR results and metadata + """ + start_time = datetime.now() + threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold + + try: + # Check if file is Office document + if self.office_converter.is_office_document(image_path): + logger.info(f"Detected Office document: {image_path.name}, converting to PDF") + try: + # Convert Office document to PDF + pdf_path = self.office_converter.convert_to_pdf(image_path) + logger.info(f"Office document converted to PDF: {pdf_path.name}") + + # Process the PDF (will be handled by PDF processing logic below) + image_path = pdf_path + except OfficeConverterError as e: + logger.error(f"Office conversion failed: {str(e)}") + raise + + # Check if file is PDF + is_pdf = image_path.suffix.lower() == '.pdf' + + if is_pdf: + # Convert PDF to images + logger.info(f"Detected PDF file: {image_path.name}, converting to images") + pdf_images_dir = image_path.parent / f"{image_path.stem}_pages" + image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir) + + # Process all pages + all_text_regions = [] + total_confidence_sum = 0.0 + total_valid_regions = 0 + all_layout_data = [] + all_images_metadata = [] + all_ocr_dimensions = [] + + for page_num, page_image_path in enumerate(image_paths, 1): + logger.info(f"Processing PDF page {page_num}/{len(image_paths)}") + + # Process each page with correct page number (0-based for layout data) + page_result = self.process_image( + page_image_path, + lang=lang, + detect_layout=detect_layout, + confidence_threshold=confidence_threshold, + output_dir=output_dir, + current_page=page_num - 1 # Convert to 0-based page number for layout data + ) + + # Accumulate results + if page_result['status'] == 'success': + # Add page number to each text region + for region in page_result['text_regions']: + region['page'] = page_num + all_text_regions.append(region) + + total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions'] + total_valid_regions += page_result['total_text_regions'] + + # Accumulate layout data (page numbers already set correctly in analyze_layout) + if page_result.get('layout_data'): + layout_data = page_result['layout_data'] + all_layout_data.append(layout_data) + + # Accumulate images metadata (page numbers already set correctly in analyze_layout) + if page_result.get('images_metadata'): + all_images_metadata.extend(page_result['images_metadata']) + + # Store OCR dimensions for each page + if page_result.get('ocr_dimensions'): + all_ocr_dimensions.append({ + 'page': page_num, + 'width': page_result['ocr_dimensions']['width'], + 'height': page_result['ocr_dimensions']['height'] + }) + + # Calculate overall average confidence + avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0 + + # Combine layout data from all pages + combined_layout = None + if 
all_layout_data: + combined_elements = [] + for layout in all_layout_data: + if layout.get('elements'): + combined_elements.extend(layout['elements']) + if combined_elements: + combined_layout = { + 'elements': combined_elements, + 'total_elements': len(combined_elements), + 'reading_order': list(range(len(combined_elements))), + } + + # Generate combined markdown + markdown_content = self.generate_markdown(all_text_regions, combined_layout) + + # Calculate processing time + processing_time = (datetime.now() - start_time).total_seconds() + + logger.info( + f"PDF processing completed: {image_path.name} - " + f"{len(image_paths)} pages, " + f"{len(all_text_regions)} regions, " + f"{avg_confidence:.2f} avg confidence, " + f"{processing_time:.2f}s" + ) + + return { + 'status': 'success', + 'file_name': image_path.name, + 'language': lang, + 'text_regions': all_text_regions, + 'total_text_regions': len(all_text_regions), + 'average_confidence': avg_confidence, + 'layout_data': combined_layout, + 'images_metadata': all_images_metadata, + 'markdown_content': markdown_content, + 'processing_time': processing_time, + 'timestamp': datetime.utcnow().isoformat(), + 'total_pages': len(image_paths), + 'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None, + } + + # Get OCR engine (for non-PDF images) + ocr_engine = self.get_ocr_engine(lang) + + # Get the actual image dimensions that OCR will use + from PIL import Image + with Image.open(image_path) as img: + ocr_width, ocr_height = img.size + logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}") + + # Perform OCR + logger.info(f"Processing image: {image_path.name}") + # Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call + ocr_results = ocr_engine.ocr(str(image_path)) + + # Parse OCR results (PaddleOCR 3.x format) + text_regions = [] + total_confidence = 0.0 + valid_regions = 0 + + if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0: + # PaddleOCR 3.x returns a list of dictionaries (one per page) + for page_result in ocr_results: + if isinstance(page_result, dict): + # New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]} + texts = page_result.get('rec_texts', []) + scores = page_result.get('rec_scores', []) + polys = page_result.get('rec_polys', []) + + # Process each recognized text + for idx, text in enumerate(texts): + # Get corresponding score and bbox + confidence = scores[idx] if idx < len(scores) else 1.0 + bbox = polys[idx] if idx < len(polys) else [] + + # Convert numpy array bbox to list for JSON serialization + if hasattr(bbox, 'tolist'): + bbox = bbox.tolist() + + # Filter by confidence threshold + if confidence >= threshold: + text_regions.append({ + 'text': text, + 'bbox': bbox, + 'confidence': float(confidence), + }) + total_confidence += confidence + valid_regions += 1 + + avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0 + + logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}") + + # Layout analysis (if requested) + layout_data = None + images_metadata = [] + + if detect_layout: + # Pass current_page to analyze_layout for correct page numbering + layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir, current_page=current_page) + + # Generate Markdown + markdown_content = self.generate_markdown(text_regions, layout_data) + + # Calculate processing time + processing_time = (datetime.now() - start_time).total_seconds() 
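+            # Assemble the single-image result payload; ocr_dimensions records
+            # the pixel size of the image that the bounding boxes refer to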
+ + result = { + 'status': 'success', + 'file_name': image_path.name, + 'language': lang, + 'text_regions': text_regions, + 'total_text_regions': len(text_regions), + 'average_confidence': avg_confidence, + 'layout_data': layout_data, + 'images_metadata': images_metadata, + 'markdown_content': markdown_content, + 'processing_time': processing_time, + 'timestamp': datetime.utcnow().isoformat(), + 'ocr_dimensions': { + 'width': ocr_width, + 'height': ocr_height + } + } + + logger.info( + f"OCR completed: {image_path.name} - " + f"{len(text_regions)} regions, " + f"{avg_confidence:.2f} avg confidence, " + f"{processing_time:.2f}s" + ) + + return result + + except Exception as e: + import traceback + error_trace = traceback.format_exc() + logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}") + return { + 'status': 'error', + 'file_name': image_path.name, + 'error_message': str(e), + 'processing_time': (datetime.now() - start_time).total_seconds(), + } + + def _extract_table_text(self, html_content: str) -> str: + """ + Extract text from HTML table content for translation purposes + + Args: + html_content: HTML content containing table + + Returns: + Extracted text from table cells + """ + try: + from html.parser import HTMLParser + + class TableTextExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.text_parts = [] + self.in_table = False + + def handle_starttag(self, tag, attrs): + if tag == 'table': + self.in_table = True + + def handle_endtag(self, tag): + if tag == 'table': + self.in_table = False + elif tag in ('td', 'th') and self.in_table: + self.text_parts.append(' | ') # Cell separator + elif tag == 'tr' and self.in_table: + self.text_parts.append('\n') # Row separator + + def handle_data(self, data): + if self.in_table: + stripped = data.strip() + if stripped: + self.text_parts.append(stripped) + + parser = TableTextExtractor() + parser.feed(html_content) + + # Clean up the extracted text + extracted = ''.join(parser.text_parts) + # Remove multiple separators + import re + extracted = re.sub(r'\s*\|\s*\|+\s*', ' | ', extracted) + extracted = re.sub(r'\n+', '\n', extracted) + extracted = extracted.strip() + + return extracted + + except Exception as e: + logger.warning(f"Failed to extract table text: {e}") + # Fallback: just remove HTML tags + import re + text = re.sub(r'<[^>]+>', ' ', html_content) + text = re.sub(r'\s+', ' ', text) + return text.strip() + + def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]: + """ + Analyze document layout using PP-StructureV3 + + Args: + image_path: Path to image file + output_dir: Optional output directory for saving extracted images (defaults to image_path.parent) + current_page: Current page number (0-based) for multi-page documents + + Returns: + Tuple of (layout_data, images_metadata) + """ + try: + structure_engine = self.get_structure_engine() + + # Perform structure analysis using predict() method (PaddleOCR 3.x API) + logger.info(f"Running layout analysis on {image_path.name}") + results = structure_engine.predict(str(image_path)) + + layout_elements = [] + images_metadata = [] + + # Process each page result (for images, usually just one page) + for page_idx, page_result in enumerate(results): + # Get markdown dictionary from result object + if hasattr(page_result, 'markdown'): + markdown_dict = page_result.markdown + logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if 
isinstance(markdown_dict, dict) else type(markdown_dict)}") + + # Extract layout information from markdown structure + if isinstance(markdown_dict, dict): + # Get markdown texts (HTML format with tables and structure) + markdown_texts = markdown_dict.get('markdown_texts', '') + markdown_images = markdown_dict.get('markdown_images', {}) + + # Create a layout element for the structured content + if markdown_texts: + # Parse HTML content to identify tables and text + import re + + # Check if content contains tables + has_table = ' str: + """ + Generate Markdown from OCR results + + Args: + text_regions: List of text regions with bbox and text + layout_data: Optional layout structure information + + Returns: + Markdown formatted string + """ + markdown_lines = [] + + if layout_data and layout_data.get('elements'): + # Generate structured Markdown based on layout + for element in layout_data['elements']: + element_type = element.get('type', 'text') + content = element.get('content', '') + + if element_type == 'title': + markdown_lines.append(f"# {content}\n") + elif element_type == 'table': + # Table in HTML format + markdown_lines.append(content) + markdown_lines.append("") + elif element_type == 'figure': + element_id = element.get('element_id') + markdown_lines.append(f"![Figure {element_id}](./images/img_{element_id}.jpg)\n") + else: + markdown_lines.append(f"{content}\n") + + else: + # Simple Markdown from text regions only + # Sort by vertical position (top to bottom) + def get_y_coord(region): + """Safely extract Y coordinate from bbox""" + bbox = region.get('bbox', []) + if isinstance(bbox, (list, tuple)) and len(bbox) > 0: + if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1: + return bbox[0][1] # [[x1,y1], [x2,y2], ...] format + elif len(bbox) > 1: + return bbox[1] # [x1, y1, x2, y2, ...] 
format + return 0 # Default to 0 if can't extract + + sorted_regions = sorted(text_regions, key=get_y_coord) + + for region in sorted_regions: + text = region['text'] + markdown_lines.append(text) + + return "\n".join(markdown_lines) + + def save_results( + self, + result: Dict, + output_dir: Path, + file_id: str, + source_file_path: Optional[Path] = None + ) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]: + """ + Save OCR results to JSON, Markdown, and layout-preserving PDF files + + Args: + result: OCR result dictionary + output_dir: Output directory + file_id: Unique file identifier + source_file_path: Optional path to original source file for PDF generation + + Returns: + Tuple of (json_path, markdown_path, pdf_path) + """ + try: + output_dir.mkdir(parents=True, exist_ok=True) + + # Save JSON + json_path = output_dir / f"{file_id}_result.json" + with open(json_path, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + # Save Markdown + markdown_path = output_dir / f"{file_id}_output.md" + markdown_content = result.get('markdown_content', '') + with open(markdown_path, 'w', encoding='utf-8') as f: + f.write(markdown_content) + + logger.info(f"Results saved: {json_path.name}, {markdown_path.name}") + + # Generate layout-preserving PDF + pdf_path = None + try: + from app.services.pdf_generator_service import pdf_generator_service + + pdf_filename = f"{file_id}_layout.pdf" + pdf_path = output_dir / pdf_filename + + logger.info(f"Generating layout-preserving PDF: {pdf_filename}") + + success = pdf_generator_service.generate_layout_pdf( + json_path=json_path, + output_path=pdf_path, + source_file_path=source_file_path + ) + + if success: + logger.info(f"✓ PDF generated successfully: {pdf_path.name}") + else: + logger.warning(f"✗ PDF generation failed for {file_id}") + pdf_path = None + + except Exception as e: + logger.error(f"Error generating PDF for {file_id}: {str(e)}") + import traceback + traceback.print_exc() + pdf_path = None + + return json_path, markdown_path, pdf_path + + except Exception as e: + logger.error(f"Error saving results: {str(e)}") + return None, None, None