refactor: remove unused code and migrate legacy API
Backend cleanup:
- Remove ocr_service_original.py (legacy OCR service, replaced by ocr_service.py)
- Remove preprocessor.py (unused, functionality absorbed by layout_preprocessing_service.py)
- Remove pdf_font_manager.py (unused, never referenced by any service)

Frontend cleanup:
- Remove MarkdownPreview.tsx (unused component)
- Remove ResultsTable.tsx (unused, replaced by TaskHistoryPage)
- Remove services/api.ts (legacy API client, migrated to apiV2)
- Remove types/api.ts (legacy types, migrated to apiV2.ts)

API migration:
- Add export rules CRUD methods to apiClientV2
- Update SettingsPage.tsx to use apiClientV2
- Update Layout.tsx to use only apiClientV2 for logout

This removes roughly 1,500 lines of redundant code and unifies the API client.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
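The export-rules CRUD methods mentioned above are added to apiClientV2 but do not appear in the hunks below. A minimal sketch of how such a surface might be called, assuming hypothetical method names and an illustrative ExportRule shape (not taken from the actual diff):

```typescript
import { apiClientV2 } from '@/services/apiV2'

// Illustrative only: field names are assumptions, not the project's real types.
interface ExportRule {
  id: number
  name: string
  enabled: boolean
}

// Hypothetical CRUD round trip against the new apiClientV2 surface.
async function exportRulesRoundTrip(): Promise<ExportRule[]> {
  const created = await apiClientV2.createExportRule({ name: 'markdown-only', enabled: true })
  await apiClientV2.updateExportRule(created.id, { enabled: false })
  await apiClientV2.deleteExportRule(created.id)
  return apiClientV2.listExportRules()
}
```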
ocr_service_original.py
@@ -1,835 +0,0 @@
|
|||||||
"""
|
|
||||||
Tool_OCR - Core OCR Service
|
|
||||||
PaddleOCR-VL integration for text and structure extraction
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, List, Optional, Tuple
|
|
||||||
from datetime import datetime
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
from paddleocr import PaddleOCR, PPStructureV3
|
|
||||||
from PIL import Image
|
|
||||||
from pdf2image import convert_from_path
|
|
||||||
import paddle
|
|
||||||
|
|
||||||
from app.core.config import settings
|
|
||||||
from app.services.office_converter import OfficeConverter, OfficeConverterError
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class OCRService:
|
|
||||||
"""
|
|
||||||
Core OCR service using PaddleOCR-VL
|
|
||||||
Handles text recognition and document structure analysis
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
"""Initialize PaddleOCR and PPStructure engines with GPU detection"""
|
|
||||||
self.ocr_languages = settings.ocr_languages_list
|
|
||||||
self.confidence_threshold = settings.ocr_confidence_threshold
|
|
||||||
|
|
||||||
# Initialize PaddleOCR engine (will be lazy-loaded per language)
|
|
||||||
self.ocr_engines = {}
|
|
||||||
|
|
||||||
# Initialize PP-Structure for layout analysis
|
|
||||||
self.structure_engine = None
|
|
||||||
|
|
||||||
# Initialize Office document converter
|
|
||||||
self.office_converter = OfficeConverter()
|
|
||||||
|
|
||||||
# GPU Detection and Configuration
|
|
||||||
self.gpu_available = False
|
|
||||||
self.use_gpu = False
|
|
||||||
self.gpu_info = {}
|
|
||||||
|
|
||||||
self._detect_and_configure_gpu()
|
|
||||||
|
|
||||||
logger.info("OCR Service initialized")
|
|
||||||
|
|
||||||
def _detect_and_configure_gpu(self):
|
|
||||||
"""Detect GPU availability and configure usage"""
|
|
||||||
try:
|
|
||||||
# Check if forced CPU mode
|
|
||||||
if settings.force_cpu_mode:
|
|
||||||
logger.info("GPU mode forced to CPU by configuration")
|
|
||||||
self.use_gpu = False
|
|
||||||
self.gpu_info = {
|
|
||||||
'available': False,
|
|
||||||
'reason': 'CPU mode forced by configuration',
|
|
||||||
}
|
|
||||||
return
|
|
||||||
|
|
||||||
# Check if PaddlePaddle is compiled with CUDA
|
|
||||||
if paddle.is_compiled_with_cuda():
|
|
||||||
# Check if GPU devices are available
|
|
||||||
gpu_count = paddle.device.cuda.device_count()
|
|
||||||
|
|
||||||
if gpu_count > 0:
|
|
||||||
self.gpu_available = True
|
|
||||||
self.use_gpu = True
|
|
||||||
|
|
||||||
# Get GPU device information
|
|
||||||
device_id = settings.gpu_device_id if settings.gpu_device_id < gpu_count else 0
|
|
||||||
gpu_props = paddle.device.cuda.get_device_properties(device_id)
|
|
||||||
|
|
||||||
self.gpu_info = {
|
|
||||||
'available': True,
|
|
||||||
'device_count': gpu_count,
|
|
||||||
'device_id': device_id,
|
|
||||||
'device_name': gpu_props.name,
|
|
||||||
'total_memory': gpu_props.total_memory,
|
|
||||||
'compute_capability': f"{gpu_props.major}.{gpu_props.minor}",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Set GPU memory fraction
|
|
||||||
try:
|
|
||||||
paddle.device.set_device(f'gpu:{device_id}')
|
|
||||||
logger.info(f"GPU {device_id} selected: {gpu_props.name}")
|
|
||||||
logger.info(f"GPU memory: {gpu_props.total_memory / (1024**3):.2f} GB")
|
|
||||||
logger.info(f"Compute capability: {gpu_props.major}.{gpu_props.minor}")
|
|
||||||
logger.info(f"GPU memory fraction set to: {settings.gpu_memory_fraction}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to configure GPU device: {e}")
|
|
||||||
self.use_gpu = False
|
|
||||||
self.gpu_info['available'] = False
|
|
||||||
self.gpu_info['reason'] = f'GPU configuration failed: {str(e)}'
|
|
||||||
else:
|
|
||||||
logger.warning("CUDA is available but no GPU devices found")
|
|
||||||
self.gpu_info = {
|
|
||||||
'available': False,
|
|
||||||
'reason': 'CUDA compiled but no GPU devices detected',
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
logger.info("PaddlePaddle not compiled with CUDA support")
|
|
||||||
self.gpu_info = {
|
|
||||||
'available': False,
|
|
||||||
'reason': 'PaddlePaddle not compiled with CUDA',
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"GPU detection failed: {e}")
|
|
||||||
self.use_gpu = False
|
|
||||||
self.gpu_info = {
|
|
||||||
'available': False,
|
|
||||||
'reason': f'GPU detection error: {str(e)}',
|
|
||||||
}
|
|
||||||
|
|
||||||
# Log final GPU status
|
|
||||||
if self.use_gpu:
|
|
||||||
logger.info(f"✓ GPU acceleration ENABLED - Using {self.gpu_info.get('device_name', 'Unknown GPU')}")
|
|
||||||
else:
|
|
||||||
reason = self.gpu_info.get('reason', 'Unknown')
|
|
||||||
logger.info(f"ℹ GPU acceleration DISABLED - {reason} - Using CPU mode")
|
|
||||||
|
|
||||||
def get_gpu_status(self) -> Dict:
|
|
||||||
"""
|
|
||||||
Get current GPU status and information
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with GPU status information
|
|
||||||
"""
|
|
||||||
status = {
|
|
||||||
'gpu_enabled': self.use_gpu,
|
|
||||||
'gpu_available': self.gpu_available,
|
|
||||||
**self.gpu_info,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add current GPU memory usage if GPU is being used
|
|
||||||
if self.use_gpu and self.gpu_available:
|
|
||||||
try:
|
|
||||||
device_id = self.gpu_info.get('device_id', 0)
|
|
||||||
# Get memory info (returns allocated, total in bytes)
|
|
||||||
memory_allocated = paddle.device.cuda.memory_allocated(device_id)
|
|
||||||
memory_reserved = paddle.device.cuda.memory_reserved(device_id)
|
|
||||||
total_memory = self.gpu_info.get('total_memory', 0)
|
|
||||||
|
|
||||||
status['memory_allocated_mb'] = memory_allocated / (1024**2)
|
|
||||||
status['memory_reserved_mb'] = memory_reserved / (1024**2)
|
|
||||||
status['memory_total_mb'] = total_memory / (1024**2)
|
|
||||||
status['memory_utilization'] = (memory_allocated / total_memory * 100) if total_memory > 0 else 0
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to get GPU memory info: {e}")
|
|
||||||
|
|
||||||
return status
|
|
||||||
|
|
||||||
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
|
|
||||||
"""
|
|
||||||
Get or create OCR engine for specified language with GPU support
|
|
||||||
|
|
||||||
Args:
|
|
||||||
lang: Language code (ch, en, japan, korean, etc.)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
PaddleOCR engine instance
|
|
||||||
"""
|
|
||||||
if lang not in self.ocr_engines:
|
|
||||||
logger.info(f"Initializing PaddleOCR engine for language: {lang} (GPU: {self.use_gpu})")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# PaddleOCR 3.x: Device is set globally via paddle.set_device()
|
|
||||||
# No need to pass device/use_gpu/gpu_mem parameters
|
|
||||||
self.ocr_engines[lang] = PaddleOCR(
|
|
||||||
lang=lang,
|
|
||||||
use_textline_orientation=True, # Replaces deprecated use_angle_cls
|
|
||||||
)
|
|
||||||
logger.info(f"PaddleOCR engine ready for {lang} (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
# If GPU initialization fails, fall back to CPU
|
|
||||||
if self.use_gpu:
|
|
||||||
logger.warning(f"GPU initialization failed, falling back to CPU: {e}")
|
|
||||||
self.use_gpu = False
|
|
||||||
# Switch to CPU device globally
|
|
||||||
paddle.set_device('cpu')
|
|
||||||
self.ocr_engines[lang] = PaddleOCR(
|
|
||||||
lang=lang,
|
|
||||||
use_textline_orientation=True,
|
|
||||||
)
|
|
||||||
logger.info(f"PaddleOCR engine ready for {lang} (CPU mode - fallback)")
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
return self.ocr_engines[lang]
|
|
||||||
|
|
||||||
def get_structure_engine(self) -> PPStructureV3:
|
|
||||||
"""
|
|
||||||
Get or create PP-Structure engine for layout analysis with GPU support
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
PPStructure engine instance
|
|
||||||
"""
|
|
||||||
if self.structure_engine is None:
|
|
||||||
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# PaddleOCR 3.x: Device is set globally via paddle.set_device()
|
|
||||||
# No need to pass device/use_gpu/gpu_mem parameters
|
|
||||||
self.structure_engine = PPStructureV3(
|
|
||||||
use_doc_orientation_classify=False,
|
|
||||||
use_doc_unwarping=False,
|
|
||||||
use_textline_orientation=False,
|
|
||||||
use_table_recognition=True,
|
|
||||||
use_formula_recognition=True,
|
|
||||||
use_chart_recognition=True, # Enable chart recognition (requires PaddlePaddle >= 3.2.0 for fused_rms_norm_ext)
|
|
||||||
layout_threshold=0.5,
|
|
||||||
)
|
|
||||||
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
# If GPU initialization fails, fall back to CPU
|
|
||||||
if self.use_gpu:
|
|
||||||
logger.warning(f"GPU initialization failed for PP-Structure, falling back to CPU: {e}")
|
|
||||||
self.use_gpu = False
|
|
||||||
# Switch to CPU device globally
|
|
||||||
paddle.set_device('cpu')
|
|
||||||
self.structure_engine = PPStructureV3(
|
|
||||||
use_doc_orientation_classify=False,
|
|
||||||
use_doc_unwarping=False,
|
|
||||||
use_textline_orientation=False,
|
|
||||||
use_table_recognition=True,
|
|
||||||
use_formula_recognition=True,
|
|
||||||
use_chart_recognition=True, # Enable chart recognition (CPU fallback mode)
|
|
||||||
layout_threshold=0.5,
|
|
||||||
)
|
|
||||||
logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
return self.structure_engine
|
|
||||||
|
|
||||||
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
|
|
||||||
"""
|
|
||||||
Convert PDF to images (one per page)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pdf_path: Path to PDF file
|
|
||||||
output_dir: Directory to save converted images
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of paths to converted images
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
logger.info(f"Converting PDF {pdf_path.name} to images")
|
|
||||||
|
|
||||||
# Convert PDF to images (300 DPI for good quality)
|
|
||||||
images = convert_from_path(
|
|
||||||
str(pdf_path),
|
|
||||||
dpi=300,
|
|
||||||
fmt='png'
|
|
||||||
)
|
|
||||||
|
|
||||||
image_paths = []
|
|
||||||
for i, image in enumerate(images):
|
|
||||||
# Save each page as PNG
|
|
||||||
image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
|
|
||||||
image.save(str(image_path), 'PNG')
|
|
||||||
image_paths.append(image_path)
|
|
||||||
logger.info(f"Saved page {i+1} to {image_path.name}")
|
|
||||||
|
|
||||||
logger.info(f"Converted {len(image_paths)} pages from PDF")
|
|
||||||
return image_paths
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"PDF conversion error: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def process_image(
|
|
||||||
self,
|
|
||||||
image_path: Path,
|
|
||||||
lang: str = 'ch',
|
|
||||||
detect_layout: bool = True,
|
|
||||||
confidence_threshold: Optional[float] = None,
|
|
||||||
output_dir: Optional[Path] = None,
|
|
||||||
current_page: int = 0
|
|
||||||
) -> Dict:
|
|
||||||
"""
|
|
||||||
Process single image with OCR and layout analysis
|
|
||||||
|
|
||||||
Args:
|
|
||||||
image_path: Path to image file
|
|
||||||
lang: Language for OCR
|
|
||||||
detect_layout: Whether to perform layout analysis
|
|
||||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
|
||||||
output_dir: Optional output directory for saving extracted images
|
|
||||||
current_page: Current page number (0-based) for multi-page documents
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with OCR results and metadata
|
|
||||||
"""
|
|
||||||
start_time = datetime.now()
|
|
||||||
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if file is Office document
|
|
||||||
if self.office_converter.is_office_document(image_path):
|
|
||||||
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
|
|
||||||
try:
|
|
||||||
# Convert Office document to PDF
|
|
||||||
pdf_path = self.office_converter.convert_to_pdf(image_path)
|
|
||||||
logger.info(f"Office document converted to PDF: {pdf_path.name}")
|
|
||||||
|
|
||||||
# Process the PDF (will be handled by PDF processing logic below)
|
|
||||||
image_path = pdf_path
|
|
||||||
except OfficeConverterError as e:
|
|
||||||
logger.error(f"Office conversion failed: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
# Check if file is PDF
|
|
||||||
is_pdf = image_path.suffix.lower() == '.pdf'
|
|
||||||
|
|
||||||
if is_pdf:
|
|
||||||
# Convert PDF to images
|
|
||||||
logger.info(f"Detected PDF file: {image_path.name}, converting to images")
|
|
||||||
pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
|
|
||||||
image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)
|
|
||||||
|
|
||||||
# Process all pages
|
|
||||||
all_text_regions = []
|
|
||||||
total_confidence_sum = 0.0
|
|
||||||
total_valid_regions = 0
|
|
||||||
all_layout_data = []
|
|
||||||
all_images_metadata = []
|
|
||||||
all_ocr_dimensions = []
|
|
||||||
|
|
||||||
for page_num, page_image_path in enumerate(image_paths, 1):
|
|
||||||
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
|
|
||||||
|
|
||||||
# Process each page with correct page number (0-based for layout data)
|
|
||||||
page_result = self.process_image(
|
|
||||||
page_image_path,
|
|
||||||
lang=lang,
|
|
||||||
detect_layout=detect_layout,
|
|
||||||
confidence_threshold=confidence_threshold,
|
|
||||||
output_dir=output_dir,
|
|
||||||
current_page=page_num - 1 # Convert to 0-based page number for layout data
|
|
||||||
)
|
|
||||||
|
|
||||||
# Accumulate results
|
|
||||||
if page_result['status'] == 'success':
|
|
||||||
# Add page number to each text region
|
|
||||||
for region in page_result['text_regions']:
|
|
||||||
region['page'] = page_num
|
|
||||||
all_text_regions.append(region)
|
|
||||||
|
|
||||||
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
|
|
||||||
total_valid_regions += page_result['total_text_regions']
|
|
||||||
|
|
||||||
# Accumulate layout data (page numbers already set correctly in analyze_layout)
|
|
||||||
if page_result.get('layout_data'):
|
|
||||||
layout_data = page_result['layout_data']
|
|
||||||
all_layout_data.append(layout_data)
|
|
||||||
|
|
||||||
# Accumulate images metadata (page numbers already set correctly in analyze_layout)
|
|
||||||
if page_result.get('images_metadata'):
|
|
||||||
all_images_metadata.extend(page_result['images_metadata'])
|
|
||||||
|
|
||||||
# Store OCR dimensions for each page
|
|
||||||
if page_result.get('ocr_dimensions'):
|
|
||||||
all_ocr_dimensions.append({
|
|
||||||
'page': page_num,
|
|
||||||
'width': page_result['ocr_dimensions']['width'],
|
|
||||||
'height': page_result['ocr_dimensions']['height']
|
|
||||||
})
|
|
||||||
|
|
||||||
# Calculate overall average confidence
|
|
||||||
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
|
|
||||||
|
|
||||||
# Combine layout data from all pages
|
|
||||||
combined_layout = None
|
|
||||||
if all_layout_data:
|
|
||||||
combined_elements = []
|
|
||||||
for layout in all_layout_data:
|
|
||||||
if layout.get('elements'):
|
|
||||||
combined_elements.extend(layout['elements'])
|
|
||||||
if combined_elements:
|
|
||||||
combined_layout = {
|
|
||||||
'elements': combined_elements,
|
|
||||||
'total_elements': len(combined_elements),
|
|
||||||
'reading_order': list(range(len(combined_elements))),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Generate combined markdown
|
|
||||||
markdown_content = self.generate_markdown(all_text_regions, combined_layout)
|
|
||||||
|
|
||||||
# Calculate processing time
|
|
||||||
processing_time = (datetime.now() - start_time).total_seconds()
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"PDF processing completed: {image_path.name} - "
|
|
||||||
f"{len(image_paths)} pages, "
|
|
||||||
f"{len(all_text_regions)} regions, "
|
|
||||||
f"{avg_confidence:.2f} avg confidence, "
|
|
||||||
f"{processing_time:.2f}s"
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'status': 'success',
|
|
||||||
'file_name': image_path.name,
|
|
||||||
'language': lang,
|
|
||||||
'text_regions': all_text_regions,
|
|
||||||
'total_text_regions': len(all_text_regions),
|
|
||||||
'average_confidence': avg_confidence,
|
|
||||||
'layout_data': combined_layout,
|
|
||||||
'images_metadata': all_images_metadata,
|
|
||||||
'markdown_content': markdown_content,
|
|
||||||
'processing_time': processing_time,
|
|
||||||
'timestamp': datetime.utcnow().isoformat(),
|
|
||||||
'total_pages': len(image_paths),
|
|
||||||
'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Get OCR engine (for non-PDF images)
|
|
||||||
ocr_engine = self.get_ocr_engine(lang)
|
|
||||||
|
|
||||||
# Get the actual image dimensions that OCR will use
|
|
||||||
from PIL import Image
|
|
||||||
with Image.open(image_path) as img:
|
|
||||||
ocr_width, ocr_height = img.size
|
|
||||||
logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")
|
|
||||||
|
|
||||||
# Perform OCR
|
|
||||||
logger.info(f"Processing image: {image_path.name}")
|
|
||||||
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
|
|
||||||
ocr_results = ocr_engine.ocr(str(image_path))
|
|
||||||
|
|
||||||
# Parse OCR results (PaddleOCR 3.x format)
|
|
||||||
text_regions = []
|
|
||||||
total_confidence = 0.0
|
|
||||||
valid_regions = 0
|
|
||||||
|
|
||||||
if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
|
|
||||||
# PaddleOCR 3.x returns a list of dictionaries (one per page)
|
|
||||||
for page_result in ocr_results:
|
|
||||||
if isinstance(page_result, dict):
|
|
||||||
# New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
|
|
||||||
texts = page_result.get('rec_texts', [])
|
|
||||||
scores = page_result.get('rec_scores', [])
|
|
||||||
polys = page_result.get('rec_polys', [])
|
|
||||||
|
|
||||||
# Process each recognized text
|
|
||||||
for idx, text in enumerate(texts):
|
|
||||||
# Get corresponding score and bbox
|
|
||||||
confidence = scores[idx] if idx < len(scores) else 1.0
|
|
||||||
bbox = polys[idx] if idx < len(polys) else []
|
|
||||||
|
|
||||||
# Convert numpy array bbox to list for JSON serialization
|
|
||||||
if hasattr(bbox, 'tolist'):
|
|
||||||
bbox = bbox.tolist()
|
|
||||||
|
|
||||||
# Filter by confidence threshold
|
|
||||||
if confidence >= threshold:
|
|
||||||
text_regions.append({
|
|
||||||
'text': text,
|
|
||||||
'bbox': bbox,
|
|
||||||
'confidence': float(confidence),
|
|
||||||
})
|
|
||||||
total_confidence += confidence
|
|
||||||
valid_regions += 1
|
|
||||||
|
|
||||||
avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
|
|
||||||
|
|
||||||
logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")
|
|
||||||
|
|
||||||
# Layout analysis (if requested)
|
|
||||||
layout_data = None
|
|
||||||
images_metadata = []
|
|
||||||
|
|
||||||
if detect_layout:
|
|
||||||
# Pass current_page to analyze_layout for correct page numbering
|
|
||||||
layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir, current_page=current_page)
|
|
||||||
|
|
||||||
# Generate Markdown
|
|
||||||
markdown_content = self.generate_markdown(text_regions, layout_data)
|
|
||||||
|
|
||||||
# Calculate processing time
|
|
||||||
processing_time = (datetime.now() - start_time).total_seconds()
|
|
||||||
|
|
||||||
result = {
|
|
||||||
'status': 'success',
|
|
||||||
'file_name': image_path.name,
|
|
||||||
'language': lang,
|
|
||||||
'text_regions': text_regions,
|
|
||||||
'total_text_regions': len(text_regions),
|
|
||||||
'average_confidence': avg_confidence,
|
|
||||||
'layout_data': layout_data,
|
|
||||||
'images_metadata': images_metadata,
|
|
||||||
'markdown_content': markdown_content,
|
|
||||||
'processing_time': processing_time,
|
|
||||||
'timestamp': datetime.utcnow().isoformat(),
|
|
||||||
'ocr_dimensions': {
|
|
||||||
'width': ocr_width,
|
|
||||||
'height': ocr_height
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"OCR completed: {image_path.name} - "
|
|
||||||
f"{len(text_regions)} regions, "
|
|
||||||
f"{avg_confidence:.2f} avg confidence, "
|
|
||||||
f"{processing_time:.2f}s"
|
|
||||||
)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
import traceback
|
|
||||||
error_trace = traceback.format_exc()
|
|
||||||
logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
|
|
||||||
return {
|
|
||||||
'status': 'error',
|
|
||||||
'file_name': image_path.name,
|
|
||||||
'error_message': str(e),
|
|
||||||
'processing_time': (datetime.now() - start_time).total_seconds(),
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_table_text(self, html_content: str) -> str:
|
|
||||||
"""
|
|
||||||
Extract text from HTML table content for translation purposes
|
|
||||||
|
|
||||||
Args:
|
|
||||||
html_content: HTML content containing table
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Extracted text from table cells
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
from html.parser import HTMLParser
|
|
||||||
|
|
||||||
class TableTextExtractor(HTMLParser):
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
self.text_parts = []
|
|
||||||
self.in_table = False
|
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
|
||||||
if tag == 'table':
|
|
||||||
self.in_table = True
|
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
|
||||||
if tag == 'table':
|
|
||||||
self.in_table = False
|
|
||||||
elif tag in ('td', 'th') and self.in_table:
|
|
||||||
self.text_parts.append(' | ') # Cell separator
|
|
||||||
elif tag == 'tr' and self.in_table:
|
|
||||||
self.text_parts.append('\n') # Row separator
|
|
||||||
|
|
||||||
def handle_data(self, data):
|
|
||||||
if self.in_table:
|
|
||||||
stripped = data.strip()
|
|
||||||
if stripped:
|
|
||||||
self.text_parts.append(stripped)
|
|
||||||
|
|
||||||
parser = TableTextExtractor()
|
|
||||||
parser.feed(html_content)
|
|
||||||
|
|
||||||
# Clean up the extracted text
|
|
||||||
extracted = ''.join(parser.text_parts)
|
|
||||||
# Remove multiple separators
|
|
||||||
import re
|
|
||||||
extracted = re.sub(r'\s*\|\s*\|+\s*', ' | ', extracted)
|
|
||||||
extracted = re.sub(r'\n+', '\n', extracted)
|
|
||||||
extracted = extracted.strip()
|
|
||||||
|
|
||||||
return extracted
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to extract table text: {e}")
|
|
||||||
# Fallback: just remove HTML tags
|
|
||||||
import re
|
|
||||||
text = re.sub(r'<[^>]+>', ' ', html_content)
|
|
||||||
text = re.sub(r'\s+', ' ', text)
|
|
||||||
return text.strip()
|
|
||||||
|
|
||||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
|
|
||||||
"""
|
|
||||||
Analyze document layout using PP-StructureV3
|
|
||||||
|
|
||||||
Args:
|
|
||||||
image_path: Path to image file
|
|
||||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
|
||||||
current_page: Current page number (0-based) for multi-page documents
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (layout_data, images_metadata)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
structure_engine = self.get_structure_engine()
|
|
||||||
|
|
||||||
# Perform structure analysis using predict() method (PaddleOCR 3.x API)
|
|
||||||
logger.info(f"Running layout analysis on {image_path.name}")
|
|
||||||
results = structure_engine.predict(str(image_path))
|
|
||||||
|
|
||||||
layout_elements = []
|
|
||||||
images_metadata = []
|
|
||||||
|
|
||||||
# Process each page result (for images, usually just one page)
|
|
||||||
for page_idx, page_result in enumerate(results):
|
|
||||||
# Get markdown dictionary from result object
|
|
||||||
if hasattr(page_result, 'markdown'):
|
|
||||||
markdown_dict = page_result.markdown
|
|
||||||
logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")
|
|
||||||
|
|
||||||
# Extract layout information from markdown structure
|
|
||||||
if isinstance(markdown_dict, dict):
|
|
||||||
# Get markdown texts (HTML format with tables and structure)
|
|
||||||
markdown_texts = markdown_dict.get('markdown_texts', '')
|
|
||||||
markdown_images = markdown_dict.get('markdown_images', {})
|
|
||||||
|
|
||||||
# Create a layout element for the structured content
|
|
||||||
if markdown_texts:
|
|
||||||
# Parse HTML content to identify tables and text
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Check if content contains tables
|
|
||||||
has_table = '<table' in markdown_texts.lower()
|
|
||||||
|
|
||||||
element = {
|
|
||||||
'element_id': len(layout_elements),
|
|
||||||
'type': 'table' if has_table else 'text',
|
|
||||||
'content': markdown_texts,
|
|
||||||
'page': current_page, # Use current_page parameter instead of page_idx
|
|
||||||
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
|
|
||||||
}
|
|
||||||
|
|
||||||
# Extract text from table for translation purposes
|
|
||||||
if has_table:
|
|
||||||
table_text = self._extract_table_text(markdown_texts)
|
|
||||||
element['extracted_text'] = table_text
|
|
||||||
logger.info(f"Extracted {len(table_text)} characters from table")
|
|
||||||
|
|
||||||
layout_elements.append(element)
|
|
||||||
|
|
||||||
# Add image metadata and SAVE images to disk
|
|
||||||
for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
|
|
||||||
# Save image to disk
|
|
||||||
try:
|
|
||||||
# Determine base directory for saving images
|
|
||||||
base_dir = output_dir if output_dir else image_path.parent
|
|
||||||
|
|
||||||
# Create full path for image file
|
|
||||||
full_img_path = base_dir / img_path
|
|
||||||
|
|
||||||
# Create imgs/ subdirectory if it doesn't exist
|
|
||||||
full_img_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Save image object to disk
|
|
||||||
if hasattr(img_obj, 'save'):
|
|
||||||
# img_obj is PIL Image
|
|
||||||
img_obj.save(str(full_img_path))
|
|
||||||
logger.info(f"Saved extracted image to {full_img_path}")
|
|
||||||
else:
|
|
||||||
logger.warning(f"Image object for {img_path} does not have save() method, skipping")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to save image {img_path}: {str(e)}")
|
|
||||||
# Continue processing even if image save fails
|
|
||||||
|
|
||||||
# Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
|
|
||||||
bbox = []
|
|
||||||
try:
|
|
||||||
import re
|
|
||||||
match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
|
|
||||||
if match:
|
|
||||||
x1, y1, x2, y2 = map(int, match.groups())
|
|
||||||
# Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
|
|
||||||
bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
|
|
||||||
logger.info(f"Extracted bbox from filename: {bbox}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to extract bbox from {img_path}: {e}")
|
|
||||||
|
|
||||||
images_metadata.append({
|
|
||||||
'element_id': len(layout_elements) + img_idx,
|
|
||||||
'image_path': img_path,
|
|
||||||
'type': 'image',
|
|
||||||
'page': current_page, # Use current_page parameter instead of page_idx
|
|
||||||
'bbox': bbox,
|
|
||||||
})
|
|
||||||
|
|
||||||
if layout_elements:
|
|
||||||
layout_data = {
|
|
||||||
'elements': layout_elements,
|
|
||||||
'total_elements': len(layout_elements),
|
|
||||||
'reading_order': list(range(len(layout_elements))),
|
|
||||||
}
|
|
||||||
logger.info(f"Detected {len(layout_elements)} layout elements")
|
|
||||||
return layout_data, images_metadata
|
|
||||||
else:
|
|
||||||
logger.warning("No layout elements detected")
|
|
||||||
return None, []
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
import traceback
|
|
||||||
error_trace = traceback.format_exc()
|
|
||||||
logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
|
|
||||||
return None, []
|
|
||||||
|
|
||||||
def generate_markdown(
|
|
||||||
self,
|
|
||||||
text_regions: List[Dict],
|
|
||||||
layout_data: Optional[Dict] = None
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Generate Markdown from OCR results
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text_regions: List of text regions with bbox and text
|
|
||||||
layout_data: Optional layout structure information
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Markdown formatted string
|
|
||||||
"""
|
|
||||||
markdown_lines = []
|
|
||||||
|
|
||||||
if layout_data and layout_data.get('elements'):
|
|
||||||
# Generate structured Markdown based on layout
|
|
||||||
for element in layout_data['elements']:
|
|
||||||
element_type = element.get('type', 'text')
|
|
||||||
content = element.get('content', '')
|
|
||||||
|
|
||||||
if element_type == 'title':
|
|
||||||
markdown_lines.append(f"# {content}\n")
|
|
||||||
elif element_type == 'table':
|
|
||||||
# Table in HTML format
|
|
||||||
markdown_lines.append(content)
|
|
||||||
markdown_lines.append("")
|
|
||||||
elif element_type == 'figure':
|
|
||||||
element_id = element.get('element_id')
|
|
||||||
markdown_lines.append(f"\n")
|
|
||||||
else:
|
|
||||||
markdown_lines.append(f"{content}\n")
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Simple Markdown from text regions only
|
|
||||||
# Sort by vertical position (top to bottom)
|
|
||||||
def get_y_coord(region):
|
|
||||||
"""Safely extract Y coordinate from bbox"""
|
|
||||||
bbox = region.get('bbox', [])
|
|
||||||
if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
|
|
||||||
if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
|
|
||||||
return bbox[0][1] # [[x1,y1], [x2,y2], ...] format
|
|
||||||
elif len(bbox) > 1:
|
|
||||||
return bbox[1] # [x1, y1, x2, y2, ...] format
|
|
||||||
return 0 # Default to 0 if can't extract
|
|
||||||
|
|
||||||
sorted_regions = sorted(text_regions, key=get_y_coord)
|
|
||||||
|
|
||||||
for region in sorted_regions:
|
|
||||||
text = region['text']
|
|
||||||
markdown_lines.append(text)
|
|
||||||
|
|
||||||
return "\n".join(markdown_lines)
|
|
||||||
|
|
||||||
def save_results(
|
|
||||||
self,
|
|
||||||
result: Dict,
|
|
||||||
output_dir: Path,
|
|
||||||
file_id: str,
|
|
||||||
source_file_path: Optional[Path] = None
|
|
||||||
) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
|
|
||||||
"""
|
|
||||||
Save OCR results to JSON, Markdown, and layout-preserving PDF files
|
|
||||||
|
|
||||||
Args:
|
|
||||||
result: OCR result dictionary
|
|
||||||
output_dir: Output directory
|
|
||||||
file_id: Unique file identifier
|
|
||||||
source_file_path: Optional path to original source file for PDF generation
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (json_path, markdown_path, pdf_path)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Save JSON
|
|
||||||
json_path = output_dir / f"{file_id}_result.json"
|
|
||||||
with open(json_path, 'w', encoding='utf-8') as f:
|
|
||||||
json.dump(result, f, ensure_ascii=False, indent=2)
|
|
||||||
|
|
||||||
# Save Markdown
|
|
||||||
markdown_path = output_dir / f"{file_id}_output.md"
|
|
||||||
markdown_content = result.get('markdown_content', '')
|
|
||||||
with open(markdown_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(markdown_content)
|
|
||||||
|
|
||||||
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
|
|
||||||
|
|
||||||
# Generate layout-preserving PDF
|
|
||||||
pdf_path = None
|
|
||||||
try:
|
|
||||||
from app.services.pdf_generator_service import pdf_generator_service
|
|
||||||
|
|
||||||
pdf_filename = f"{file_id}_layout.pdf"
|
|
||||||
pdf_path = output_dir / pdf_filename
|
|
||||||
|
|
||||||
logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
|
|
||||||
|
|
||||||
success = pdf_generator_service.generate_layout_pdf(
|
|
||||||
json_path=json_path,
|
|
||||||
output_path=pdf_path,
|
|
||||||
source_file_path=source_file_path
|
|
||||||
)
|
|
||||||
|
|
||||||
if success:
|
|
||||||
logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
|
|
||||||
else:
|
|
||||||
logger.warning(f"✗ PDF generation failed for {file_id}")
|
|
||||||
pdf_path = None
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error generating PDF for {file_id}: {str(e)}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
pdf_path = None
|
|
||||||
|
|
||||||
return json_path, markdown_path, pdf_path
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error saving results: {str(e)}")
|
|
||||||
return None, None, None
|
|
||||||
pdf_font_manager.py
@@ -1,312 +0,0 @@
|
|||||||
"""
|
|
||||||
PDF Font Manager - Handles font loading, registration, and fallback.
|
|
||||||
|
|
||||||
This module provides unified font management for PDF generation,
|
|
||||||
including CJK font support and font fallback mechanisms.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, List, Optional, Tuple
|
|
||||||
|
|
||||||
from reportlab.pdfbase import pdfmetrics
|
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Configuration
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class FontConfig:
|
|
||||||
"""Configuration for font management."""
|
|
||||||
# Primary fonts
|
|
||||||
chinese_font_name: str = "NotoSansSC"
|
|
||||||
chinese_font_path: Optional[Path] = None
|
|
||||||
|
|
||||||
# Fallback fonts (built-in)
|
|
||||||
fallback_font_name: str = "Helvetica"
|
|
||||||
fallback_cjk_font_name: str = "HeiseiMin-W3" # Built-in ReportLab CJK
|
|
||||||
|
|
||||||
# Font sizes
|
|
||||||
default_font_size: int = 10
|
|
||||||
min_font_size: int = 6
|
|
||||||
max_font_size: int = 14
|
|
||||||
|
|
||||||
# Font registration options
|
|
||||||
auto_register: bool = True
|
|
||||||
enable_cjk_fallback: bool = True
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Font Manager
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
class FontManager:
|
|
||||||
"""
|
|
||||||
Manages font registration and selection for PDF generation.
|
|
||||||
|
|
||||||
Features:
|
|
||||||
- Lazy font registration
|
|
||||||
- CJK (Chinese/Japanese/Korean) font support
|
|
||||||
- Automatic fallback to built-in fonts
|
|
||||||
- Font caching to avoid duplicate registration
|
|
||||||
"""
|
|
||||||
|
|
||||||
_instance = None
|
|
||||||
_registered_fonts: Dict[str, Path] = {}
|
|
||||||
|
|
||||||
def __new__(cls, *args, **kwargs):
|
|
||||||
"""Singleton pattern to avoid duplicate font registration."""
|
|
||||||
if cls._instance is None:
|
|
||||||
cls._instance = super().__new__(cls)
|
|
||||||
cls._instance._initialized = False
|
|
||||||
return cls._instance
|
|
||||||
|
|
||||||
def __init__(self, config: Optional[FontConfig] = None):
|
|
||||||
"""
|
|
||||||
Initialize FontManager.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config: FontConfig instance (uses defaults if None)
|
|
||||||
"""
|
|
||||||
if self._initialized:
|
|
||||||
return
|
|
||||||
|
|
||||||
self.config = config or FontConfig()
|
|
||||||
self._primary_font_registered = False
|
|
||||||
self._cjk_fallback_available = False
|
|
||||||
|
|
||||||
# Auto-register fonts if enabled
|
|
||||||
if self.config.auto_register:
|
|
||||||
self._register_fonts()
|
|
||||||
|
|
||||||
self._initialized = True
|
|
||||||
|
|
||||||
@property
|
|
||||||
def primary_font_name(self) -> str:
|
|
||||||
"""Get the primary font name to use."""
|
|
||||||
if self._primary_font_registered:
|
|
||||||
return self.config.chinese_font_name
|
|
||||||
return self.config.fallback_font_name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_cjk_enabled(self) -> bool:
|
|
||||||
"""Check if CJK fonts are available."""
|
|
||||||
return self._primary_font_registered or self._cjk_fallback_available
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def reset(cls):
|
|
||||||
"""Reset singleton instance (for testing)."""
|
|
||||||
cls._instance = None
|
|
||||||
cls._registered_fonts = {}
|
|
||||||
|
|
||||||
def get_font_for_text(self, text: str) -> str:
|
|
||||||
"""
|
|
||||||
Get appropriate font name for given text.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to render
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Font name suitable for the text content
|
|
||||||
"""
|
|
||||||
if self._contains_cjk(text):
|
|
||||||
if self._primary_font_registered:
|
|
||||||
return self.config.chinese_font_name
|
|
||||||
elif self._cjk_fallback_available:
|
|
||||||
return self.config.fallback_cjk_font_name
|
|
||||||
return self.primary_font_name
|
|
||||||
|
|
||||||
def get_font_size(
|
|
||||||
self,
|
|
||||||
text: str,
|
|
||||||
available_width: float,
|
|
||||||
available_height: float,
|
|
||||||
pdf_canvas=None
|
|
||||||
) -> int:
|
|
||||||
"""
|
|
||||||
Calculate optimal font size for text to fit within bounds.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to render
|
|
||||||
available_width: Maximum width available
|
|
||||||
available_height: Maximum height available
|
|
||||||
pdf_canvas: Optional canvas for precise measurement
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Font size that fits within bounds
|
|
||||||
"""
|
|
||||||
font_name = self.get_font_for_text(text)
|
|
||||||
|
|
||||||
for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
|
|
||||||
if pdf_canvas:
|
|
||||||
# Precise measurement with canvas
|
|
||||||
text_width = pdf_canvas.stringWidth(text, font_name, size)
|
|
||||||
else:
|
|
||||||
# Approximate measurement
|
|
||||||
text_width = len(text) * size * 0.6 # Rough estimate
|
|
||||||
|
|
||||||
text_height = size * 1.2 # Line height
|
|
||||||
|
|
||||||
if text_width <= available_width and text_height <= available_height:
|
|
||||||
return size
|
|
||||||
|
|
||||||
return self.config.min_font_size
|
|
||||||
|
|
||||||
def register_font(
|
|
||||||
self,
|
|
||||||
font_name: str,
|
|
||||||
font_path: Path,
|
|
||||||
force: bool = False
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Register a custom font.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
font_name: Name to register font under
|
|
||||||
font_path: Path to TTF font file
|
|
||||||
force: Force re-registration if already registered
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if registration successful
|
|
||||||
"""
|
|
||||||
if font_name in self._registered_fonts and not force:
|
|
||||||
logger.debug(f"Font {font_name} already registered")
|
|
||||||
return True
|
|
||||||
|
|
||||||
try:
|
|
||||||
if not font_path.exists():
|
|
||||||
logger.error(f"Font file not found: {font_path}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
|
|
||||||
self._registered_fonts[font_name] = font_path
|
|
||||||
logger.info(f"Font registered: {font_name} from {font_path}")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to register font {font_name}: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_registered_fonts(self) -> List[str]:
|
|
||||||
"""Get list of registered custom font names."""
|
|
||||||
return list(self._registered_fonts.keys())
|
|
||||||
|
|
||||||
# =========================================================================
|
|
||||||
# Private Methods
|
|
||||||
# =========================================================================
|
|
||||||
|
|
||||||
def _register_fonts(self):
|
|
||||||
"""Register configured fonts."""
|
|
||||||
# Register primary Chinese font
|
|
||||||
if self.config.chinese_font_path:
|
|
||||||
self._register_chinese_font()
|
|
||||||
|
|
||||||
# Setup CJK fallback
|
|
||||||
if self.config.enable_cjk_fallback:
|
|
||||||
self._setup_cjk_fallback()
|
|
||||||
|
|
||||||
def _register_chinese_font(self):
|
|
||||||
"""Register the primary Chinese font."""
|
|
||||||
font_path = self.config.chinese_font_path
|
|
||||||
|
|
||||||
if font_path is None:
|
|
||||||
# Try to load from settings
|
|
||||||
try:
|
|
||||||
from app.core.config import settings
|
|
||||||
font_path = Path(settings.chinese_font_path)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Could not load font path from settings: {e}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Resolve relative path
|
|
||||||
if not font_path.is_absolute():
|
|
||||||
# Try project root
|
|
||||||
project_root = Path(__file__).resolve().parent.parent.parent.parent
|
|
||||||
font_path = project_root / font_path
|
|
||||||
|
|
||||||
if not font_path.exists():
|
|
||||||
logger.warning(f"Chinese font not found at {font_path}")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
pdfmetrics.registerFont(TTFont(self.config.chinese_font_name, str(font_path)))
|
|
||||||
self._registered_fonts[self.config.chinese_font_name] = font_path
|
|
||||||
self._primary_font_registered = True
|
|
||||||
logger.info(f"Chinese font registered: {self.config.chinese_font_name}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to register Chinese font: {e}")
|
|
||||||
|
|
||||||
def _setup_cjk_fallback(self):
|
|
||||||
"""Setup CJK fallback using built-in fonts."""
|
|
||||||
try:
|
|
||||||
# ReportLab includes CID fonts for CJK
|
|
||||||
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
|
|
||||||
|
|
||||||
# Register CJK fonts if not already registered
|
|
||||||
try:
|
|
||||||
pdfmetrics.registerFont(UnicodeCIDFont('HeiseiMin-W3'))
|
|
||||||
self._cjk_fallback_available = True
|
|
||||||
logger.debug("CJK fallback font available: HeiseiMin-W3")
|
|
||||||
except Exception:
|
|
||||||
pass # Font may already be registered
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
logger.debug("CID fonts not available for CJK fallback")
|
|
||||||
|
|
||||||
def _contains_cjk(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Check if text contains CJK characters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if text contains Chinese, Japanese, or Korean characters
|
|
||||||
"""
|
|
||||||
if not text:
|
|
||||||
return False
|
|
||||||
|
|
||||||
for char in text:
|
|
||||||
code = ord(char)
|
|
||||||
# CJK Unified Ideographs and related ranges
|
|
||||||
if any([
|
|
||||||
0x4E00 <= code <= 0x9FFF, # CJK Unified Ideographs
|
|
||||||
0x3400 <= code <= 0x4DBF, # CJK Extension A
|
|
||||||
0x20000 <= code <= 0x2A6DF, # CJK Extension B
|
|
||||||
0x3000 <= code <= 0x303F, # CJK Punctuation
|
|
||||||
0x3040 <= code <= 0x309F, # Hiragana
|
|
||||||
0x30A0 <= code <= 0x30FF, # Katakana
|
|
||||||
0xAC00 <= code <= 0xD7AF, # Korean Hangul
|
|
||||||
]):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Convenience Functions
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
_default_manager: Optional[FontManager] = None
|
|
||||||
|
|
||||||
|
|
||||||
def get_font_manager() -> FontManager:
|
|
||||||
"""Get the default FontManager instance."""
|
|
||||||
global _default_manager
|
|
||||||
if _default_manager is None:
|
|
||||||
_default_manager = FontManager()
|
|
||||||
return _default_manager
|
|
||||||
|
|
||||||
|
|
||||||
def register_font(font_name: str, font_path: Path) -> bool:
|
|
||||||
"""Register a font using the default manager."""
|
|
||||||
return get_font_manager().register_font(font_name, font_path)
|
|
||||||
|
|
||||||
|
|
||||||
def get_font_for_text(text: str) -> str:
|
|
||||||
"""Get appropriate font for text using the default manager."""
|
|
||||||
return get_font_manager().get_font_for_text(text)
|
|
||||||
preprocessor.py
@@ -1,230 +0,0 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""

import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np

from app.core.config import settings

logger = logging.getLogger(__name__)


class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization
    Validates and prepares documents for OCR processing
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    def __init__(self):
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message)
        """
        try:
            # Check file exists
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            # Check file size
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect file format using magic numbers
            mime = magic.Magic(mime=True)
            mime_type = mime.from_file(str(file_path))

            # Map MIME type to format
            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            # Check if format is in allowed extensions
            if file_format not in self.allowed_extensions:
                return False, None, f"File format '{file_format}' not allowed"

            # Validate file integrity
            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info(f"File validated successfully: {file_path.name} ({file_format})")
            return True, file_format, None

        except Exception as e:
            logger.error(f"File validation error: {str(e)}")
            return False, None, f"Validation error: {str(e)}"

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """Convert MIME type to file format"""
        mime_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/bmp': 'bmp',
            'image/tiff': 'tiff',
            'image/x-tiff': 'tiff',
            'application/pdf': 'pdf',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        }
        return mime_map.get(mime_type)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                # Try to open image
                with Image.open(file_path) as img:
                    img.verify()  # Verify image integrity
                # Reopen for actual check (verify() closes the file)
                with Image.open(file_path) as img:
                    _ = img.size  # Force load to detect corruption
                return True, None

            elif file_format == 'pdf':
                # Basic PDF validation - check file starts with PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                    if header != b'%PDF-':
                        return False, "Invalid PDF header"
                return True, None

            elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Office documents - basic validation (check file size and can be opened)
                # Modern Office formats (docx, pptx) are ZIP-based
                if file_format in ['docx', 'pptx']:
                    import zipfile
                    try:
                        with zipfile.ZipFile(file_path, 'r') as zf:
                            # Check if it has the required Office structure
                            if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
                                return False, "Invalid DOCX structure"
                            elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
                                return False, "Invalid PPTX structure"
                    except zipfile.BadZipFile:
                        return False, "Invalid Office file (corrupt ZIP)"
                # Old formats (doc, ppt) - just check file exists and has content
                return True, None

            else:
                return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement
            output_path: Optional output path (defaults to temp directory)

        Returns:
            Tuple of (success, processed_image_path, error_message)
        """
        try:
            # Read image
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding to handle varying lighting
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            # Determine output path
            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"

            # Save processed image
            cv2.imwrite(str(output_path), processed)

            logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
            return True, output_path, None

        except Exception as e:
            logger.error(f"Image preprocessing error: {str(e)}")
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information

        Args:
            file_path: Path to file

        Returns:
            Dictionary with file information
        """
        stat = file_path.stat()
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(str(file_path))

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
        }
Layout.tsx
@@ -1,7 +1,6 @@
 import { Outlet, NavLink, useNavigate } from 'react-router-dom'
 import { useTranslation } from 'react-i18next'
 import { useAuthStore } from '@/store/authStore'
-import { apiClient } from '@/services/api'
 import { apiClientV2 } from '@/services/apiV2'
 import {
   Upload,
@@ -29,12 +28,7 @@ export default function Layout() {

   const handleLogout = async () => {
     try {
-      // Use V2 API if authenticated with V2
-      if (apiClientV2.isAuthenticated()) {
-        await apiClientV2.logout()
-      } else {
-        apiClient.logout()
-      }
+      await apiClientV2.logout()
     } catch (error) {
       console.error('Logout error:', error)
     } finally {
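After this hunk, the logout handler collapses to a single V2 call. A sketch of the resulting function for reference; only the try/catch lines are confirmed by the diff, and the finally block is not shown in the hunk, so its body here is an assumption:

```typescript
// Reconstructed from the hunk above; surrounding Layout.tsx code omitted.
const handleLogout = async () => {
  try {
    await apiClientV2.logout()
  } catch (error) {
    console.error('Logout error:', error)
  } finally {
    // Assumption: clear client-side auth state and return to the login screen.
    logout()            // from useAuthStore
    navigate('/login')  // from useNavigate
  }
}
```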
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
|
|
||||||
|
|
||||||
interface MarkdownPreviewProps {
|
|
||||||
title?: string
|
|
||||||
content: string
|
|
||||||
className?: string
|
|
||||||
}
|
|
||||||
|
|
||||||
export default function MarkdownPreview({ title, content, className }: MarkdownPreviewProps) {
|
|
||||||
return (
|
|
||||||
<Card className={className}>
|
|
||||||
{title && (
|
|
||||||
<CardHeader>
|
|
||||||
<CardTitle>{title}</CardTitle>
|
|
||||||
</CardHeader>
|
|
||||||
)}
|
|
||||||
<CardContent>
|
|
||||||
<div className="prose prose-sm max-w-none dark:prose-invert">
|
|
||||||
<pre className="whitespace-pre-wrap break-words bg-muted p-4 rounded-md overflow-auto max-h-[600px]">
|
|
||||||
{content}
|
|
||||||
</pre>
|
|
||||||
</div>
|
|
||||||
</CardContent>
|
|
||||||
</Card>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
@@ -1,90 +0,0 @@
import { useTranslation } from 'react-i18next'
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from '@/components/ui/table'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import type { FileResult } from '@/types/apiV2'

interface ResultsTableProps {
  files: FileResult[]
  onViewResult?: (fileId: number) => void
  onDownloadPDF?: (fileId: number) => void
}

export default function ResultsTable({ files, onViewResult, onDownloadPDF }: ResultsTableProps) {
  const { t } = useTranslation()

  const getStatusBadge = (status: FileResult['status']) => {
    switch (status) {
      case 'completed':
        return <Badge variant="success">{t('processing.completed')}</Badge>
      case 'processing':
        return <Badge variant="default">{t('processing.processing')}</Badge>
      case 'failed':
        return <Badge variant="destructive">{t('processing.failed')}</Badge>
      default:
        return <Badge variant="secondary">{t('processing.pending')}</Badge>
    }
  }

  const formatTime = (seconds?: number) => {
    if (!seconds) return 'N/A'
    return `${seconds.toFixed(2)}s`
  }

  return (
    <div className="rounded-md border">
      <Table>
        <TableHeader>
          <TableRow>
            <TableHead>{t('results.filename')}</TableHead>
            <TableHead>{t('results.status')}</TableHead>
            <TableHead>{t('results.processingTime')}</TableHead>
            <TableHead className="text-right">{t('results.actions')}</TableHead>
          </TableRow>
        </TableHeader>
        <TableBody>
          {files.length === 0 ? (
            <TableRow>
              <TableCell colSpan={4} className="text-center text-muted-foreground">
                {t('results.noResults')}
              </TableCell>
            </TableRow>
          ) : (
            files.map((file) => (
              <TableRow key={file.id}>
                <TableCell className="font-medium">{file.filename}</TableCell>
                <TableCell>{getStatusBadge(file.status)}</TableCell>
                <TableCell>{formatTime(file.processing_time)}</TableCell>
                <TableCell className="text-right">
                  <div className="flex justify-end gap-2">
                    {file.status === 'completed' && (
                      <>
                        <Button
                          variant="outline"
                          size="sm"
                          onClick={() => onViewResult?.(file.id)}
                        >
                          {t('results.viewMarkdown')}
                        </Button>
                        <Button
                          variant="outline"
                          size="sm"
                          onClick={() => onDownloadPDF?.(file.id)}
                        >
                          {t('results.downloadPDF')}
                        </Button>
                      </>
                    )}
                    {file.status === 'failed' && file.error && (
                      <span className="text-sm text-destructive">{file.error}</span>
                    )}
                  </div>
                </TableCell>
              </TableRow>
            ))
          )}
        </TableBody>
      </Table>
    </div>
  )
}
@@ -4,7 +4,7 @@ import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
 import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
 import { Button } from '@/components/ui/button'
 import { useToast } from '@/components/ui/toast'
-import { apiClient } from '@/services/api'
+import { apiClientV2 } from '@/services/apiV2'
 import type { ExportRule } from '@/types/apiV2'
 
 export default function SettingsPage() {
@@ -25,12 +25,12 @@ export default function SettingsPage() {
   // Fetch export rules
   const { data: exportRules, isLoading } = useQuery({
     queryKey: ['exportRules'],
-    queryFn: () => apiClient.getExportRules(),
+    queryFn: () => apiClientV2.getExportRules(),
   })
 
   // Create rule mutation
   const createRuleMutation = useMutation({
-    mutationFn: (rule: any) => apiClient.createExportRule(rule),
+    mutationFn: (rule: any) => apiClientV2.createExportRule(rule),
     onSuccess: () => {
       queryClient.invalidateQueries({ queryKey: ['exportRules'] })
       setIsCreating(false)
@@ -53,7 +53,7 @@ export default function SettingsPage() {
   // Update rule mutation
   const updateRuleMutation = useMutation({
     mutationFn: ({ ruleId, rule }: { ruleId: number; rule: any }) =>
-      apiClient.updateExportRule(ruleId, rule),
+      apiClientV2.updateExportRule(ruleId, rule),
     onSuccess: () => {
       queryClient.invalidateQueries({ queryKey: ['exportRules'] })
       setEditingRule(null)
@@ -75,7 +75,7 @@ export default function SettingsPage() {
 
   // Delete rule mutation
   const deleteRuleMutation = useMutation({
-    mutationFn: (ruleId: number) => apiClient.deleteExportRule(ruleId),
+    mutationFn: (ruleId: number) => apiClientV2.deleteExportRule(ruleId),
     onSuccess: () => {
       queryClient.invalidateQueries({ queryKey: ['exportRules'] })
       toast({
@@ -1,271 +0,0 @@
import axios, { AxiosError } from 'axios'
import type { AxiosInstance } from 'axios'
import type {
  LoginRequest,
  LoginResponse,
  UploadResponse,
  ProcessRequest,
  ProcessResponse,
  BatchStatus,
  OCRResult,
  ExportRequest,
  ExportRule,
  CSSTemplate,
  TranslateRequest,
  TranslateResponse,
  ApiError,
} from '@/types/api'

/**
 * API Client Configuration
 * - In Docker: VITE_API_BASE_URL is empty string, use relative path
 * - In development: Use VITE_API_BASE_URL from .env or default to localhost:8000
 */
const envApiBaseUrl = import.meta.env.VITE_API_BASE_URL
const API_BASE_URL = envApiBaseUrl !== undefined ? envApiBaseUrl : 'http://localhost:8000'
const API_VERSION = 'v2'

class ApiClient {
  private client: AxiosInstance
  private token: string | null = null

  constructor() {
    this.client = axios.create({
      baseURL: `${API_BASE_URL}/api/${API_VERSION}`,
      timeout: 30000,
      headers: {
        'Content-Type': 'application/json',
      },
    })

    // Request interceptor to add auth token
    this.client.interceptors.request.use(
      (config) => {
        if (this.token) {
          config.headers.Authorization = `Bearer ${this.token}`
        }
        return config
      },
      (error) => Promise.reject(error)
    )

    // Response interceptor for error handling
    this.client.interceptors.response.use(
      (response) => response,
      (error: AxiosError<ApiError>) => {
        if (error.response?.status === 401) {
          // Token expired or invalid
          this.clearToken()
          window.location.href = '/login'
        }
        return Promise.reject(error)
      }
    )

    // Load token from localStorage
    this.loadToken()
  }

  /**
   * Set authentication token
   */
  setToken(token: string) {
    this.token = token
    localStorage.setItem('auth_token', token)
  }

  /**
   * Clear authentication token
   */
  clearToken() {
    this.token = null
    localStorage.removeItem('auth_token')
  }

  /**
   * Load token from localStorage
   */
  private loadToken() {
    const token = localStorage.getItem('auth_token')
    if (token) {
      this.token = token
    }
  }

  /**
   * Check if user is authenticated
   */
  isAuthenticated(): boolean {
    return this.token !== null
  }

  // ==================== Authentication ====================

  /**
   * Login
   */
  async login(data: LoginRequest): Promise<LoginResponse> {
    const response = await this.client.post<LoginResponse>('/auth/login', {
      username: data.username,
      password: data.password,
    })

    this.setToken(response.data.access_token)
    return response.data
  }

  /**
   * Logout
   */
  logout() {
    this.clearToken()
  }

  // ==================== File Upload ====================

  /**
   * Upload files
   */
  async uploadFiles(files: File[]): Promise<UploadResponse> {
    const formData = new FormData()
    files.forEach((file) => {
      formData.append('files', file)
    })

    const response = await this.client.post<UploadResponse>('/upload', formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    })

    return response.data
  }

  // ==================== OCR Processing ====================

  /**
   * Process OCR
   */
  async processOCR(data: ProcessRequest): Promise<ProcessResponse> {
    const response = await this.client.post<ProcessResponse>('/ocr/process', data)
    return response.data
  }

  /**
   * Get OCR result by file ID
   * Note: Backend uses file-level tracking, not task-level
   */
  async getOCRResult(fileId: number): Promise<OCRResult> {
    const response = await this.client.get<OCRResult>(`/ocr/result/${fileId}`)
    return response.data
  }

  /**
   * Get batch status
   */
  async getBatchStatus(batchId: number): Promise<BatchStatus> {
    const response = await this.client.get<BatchStatus>(`/batch/${batchId}/status`)
    return response.data
  }

  // ==================== Export ====================

  /**
   * Export results
   */
  async exportResults(data: ExportRequest): Promise<Blob> {
    const response = await this.client.post('/export', data, {
      responseType: 'blob',
    })
    return response.data
  }

  /**
   * Generate and download PDF
   */
  async exportPDF(fileId: number, cssTemplate?: string): Promise<Blob> {
    const params = cssTemplate ? { css_template: cssTemplate } : {}
    const response = await this.client.get(`/export/pdf/${fileId}`, {
      params,
      responseType: 'blob',
    })
    return response.data
  }

  /**
   * Get export rules
   */
  async getExportRules(): Promise<ExportRule[]> {
    const response = await this.client.get<ExportRule[]>('/export/rules')
    return response.data
  }

  /**
   * Create export rule
   */
  async createExportRule(rule: Omit<ExportRule, 'id' | 'created_at'>): Promise<ExportRule> {
    const response = await this.client.post<ExportRule>('/export/rules', rule)
    return response.data
  }

  /**
   * Update export rule
   */
  async updateExportRule(ruleId: number, rule: Partial<ExportRule>): Promise<ExportRule> {
    const response = await this.client.put<ExportRule>(`/export/rules/${ruleId}`, rule)
    return response.data
  }

  /**
   * Delete export rule
   */
  async deleteExportRule(ruleId: number): Promise<void> {
    await this.client.delete(`/export/rules/${ruleId}`)
  }

  /**
   * Get CSS templates
   */
  async getCSSTemplates(): Promise<CSSTemplate[]> {
    const response = await this.client.get<CSSTemplate[]>('/export/css-templates')
    return response.data
  }

  // ==================== Translation (FUTURE FEATURE - STUB) ====================

  /**
   * Translate document (STUB - Not yet implemented)
   * This is a placeholder for future translation functionality
   * @throws Will throw error with status 501 (Not Implemented)
   */
  async translateDocument(data: TranslateRequest): Promise<TranslateResponse> {
    // This endpoint is expected to return 501 Not Implemented until Phase 5
    const response = await this.client.post<TranslateResponse>('/translate/document', data)
    return response.data
  }

  /**
   * Get translation configs (NOT IMPLEMENTED)
   * This endpoint does not exist on backend - configs will be part of Phase 5
   * @deprecated Backend endpoint does not exist - will return 404
   */
  // async getTranslationConfigs(): Promise<TranslationConfig[]> {
  //   const response = await this.client.get<TranslationConfig[]>('/translate/configs')
  //   return response.data
  // }

  /**
   * Create translation config (NOT IMPLEMENTED)
   * This endpoint does not exist on backend - configs will be part of Phase 5
   * @deprecated Backend endpoint does not exist - will return 404
   */
  // async createTranslationConfig(
  //   config: Omit<TranslationConfig, 'id' | 'created_at'>
  // ): Promise<TranslationConfig> {
  //   const response = await this.client.post<TranslationConfig>('/translate/configs', config)
  //   return response.data
  // }
}

// Export singleton instance
export const apiClient = new ApiClient()
@@ -38,6 +38,7 @@ import type {
   TranslationStatusResponse,
   TranslationListResponse,
   TranslationResult,
+  ExportRule,
 } from '@/types/apiV2'
 
 /**
@@ -713,6 +714,39 @@ class ApiClientV2 {
     link.click()
     window.URL.revokeObjectURL(link.href)
   }
+
+  // ==================== Export Rules APIs ====================
+
+  /**
+   * Get export rules
+   */
+  async getExportRules(): Promise<ExportRule[]> {
+    const response = await this.client.get<ExportRule[]>('/export/rules')
+    return response.data
+  }
+
+  /**
+   * Create export rule
+   */
+  async createExportRule(rule: Omit<ExportRule, 'id' | 'created_at'>): Promise<ExportRule> {
+    const response = await this.client.post<ExportRule>('/export/rules', rule)
+    return response.data
+  }
+
+  /**
+   * Update export rule
+   */
+  async updateExportRule(ruleId: number, rule: Partial<ExportRule>): Promise<ExportRule> {
+    const response = await this.client.put<ExportRule>(`/export/rules/${ruleId}`, rule)
+    return response.data
+  }
+
+  /**
+   * Delete export rule
+   */
+  async deleteExportRule(ruleId: number): Promise<void> {
+    await this.client.delete(`/export/rules/${ruleId}`)
+  }
 }
 
 // Export singleton instance
@@ -1,182 +0,0 @@
/**
 * API Type Definitions
 * Based on backend OpenAPI specification
 */

// Authentication
export interface LoginRequest {
  username: string
  password: string
}

export interface LoginResponse {
  access_token: string
  token_type: string
  expires_in: number // Token expiration time in seconds
}

export interface User {
  id: number
  username: string
  email?: string
  displayName?: string | null
}

// File Upload (V2 API)
export interface UploadResponse {
  task_id: string
  filename: string
  file_size: number
  file_type: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
}

export interface FileInfo {
  id: number
  filename: string
  file_size: number
  file_format: string // Changed from 'format' to match backend
  status: 'pending' | 'processing' | 'completed' | 'failed'
}

// OCR Processing
export interface ProcessRequest {
  batch_id: number
  lang?: string
  detect_layout?: boolean // Changed from confidence_threshold to match backend
}

export interface ProcessResponse {
  message: string // Added to match backend
  batch_id: number
  total_files: number // Added to match backend
  status: string
  // Removed task_id - backend uses batch-level tracking instead
}

export interface TaskStatus {
  task_id: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
  progress_percentage: number
  current_file?: string
  files_processed: number
  total_files: number
  error?: string
}

export interface BatchStatus {
  batch: {
    id: number
    status: 'pending' | 'processing' | 'completed' | 'failed'
    progress_percentage: number
    created_at: string
    completed_at?: string
  }
  files: FileResult[]
}

export interface FileResult {
  id: number
  filename: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
  processing_time?: number
  error?: string
}

// OCR Results
export interface OCRResult {
  file_id: number
  filename: string
  status: string
  markdown_content: string
  json_data: OCRJsonData
  confidence: number
  processing_time: number
}

export interface OCRJsonData {
  total_text_regions: number
  average_confidence: number
  text_blocks: TextBlock[]
  layout_info?: LayoutInfo
}

export interface TextBlock {
  text: string
  confidence: number
  bbox: [number, number, number, number]
  position: number
}

export interface LayoutInfo {
  tables_detected: number
  images_detected: number
  structure: string
}

// Export
export interface ExportRequest {
  batch_id: number
  format: 'txt' | 'json' | 'excel' | 'markdown' | 'pdf'
  rule_id?: number
  options?: ExportOptions
}

export interface ExportOptions {
  confidence_threshold?: number
  include_metadata?: boolean
  filename_pattern?: string
  css_template?: string
}

export interface ExportRule {
  id: number
  rule_name: string
  config_json: Record<string, any>
  css_template?: string
  created_at: string
}

export interface CSSTemplate {
  name: string
  description: string
  // filename is not returned by backend - use name as identifier
}

// Translation (FUTURE FEATURE)
export interface TranslateRequest {
  file_id: number
  source_lang: string
  target_lang: string
  engine_type?: 'argos' | 'ernie' | 'google'
}

export interface TranslateResponse {
  task_id: string
  file_id: number
  status: 'pending' | 'processing' | 'completed' | 'failed'
  translated_content?: string
}

export interface TranslationConfig {
  id: number
  source_lang: string
  target_lang: string
  engine_type: 'argos' | 'ernie' | 'google'
  engine_config: Record<string, any>
  created_at: string
}

// API Response
export interface ApiResponse<T = any> {
  success: boolean
  data?: T
  error?: string
  message?: string
}

// Error Response
export interface ApiError {
  detail: string
  status_code: number
}