refactor: remove unused code and migrate legacy API
Backend cleanup:
- Remove ocr_service_original.py (legacy OCR service, replaced by ocr_service.py)
- Remove preprocessor.py (unused, functionality absorbed by layout_preprocessing_service.py)
- Remove pdf_font_manager.py (unused, never referenced by any service)

Frontend cleanup:
- Remove MarkdownPreview.tsx (unused component)
- Remove ResultsTable.tsx (unused, replaced by TaskHistoryPage)
- Remove services/api.ts (legacy API client, migrated to apiV2)
- Remove types/api.ts (legacy types, migrated to apiV2.ts)

API migration:
- Add export rules CRUD methods to apiClientV2
- Update SettingsPage.tsx to use apiClientV2
- Update Layout.tsx to use only apiClientV2 for logout

This reduces ~1,500 lines of redundant code and unifies the API client.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
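Note: the export-rules CRUD methods added to apiClientV2 are referenced by the SettingsPage.tsx hunks below, but their implementation is not part of this diff. A minimal sketch of what they plausibly look like, inferred from those call sites — the `/export/rules` endpoint paths and the axios setup are assumptions, not confirmed by this commit:

```typescript
import axios, { type AxiosInstance } from 'axios'
import type { ExportRule } from '@/types/apiV2'

// Hypothetical sketch of the export-rules CRUD methods on apiClientV2.
// Method names and signatures follow the SettingsPage.tsx call sites below;
// the '/export/rules' paths and baseURL are assumed, not shown in this diff.
class ApiClientV2 {
  private client: AxiosInstance = axios.create({ baseURL: '/api/v2' })

  async getExportRules(): Promise<ExportRule[]> {
    const response = await this.client.get<ExportRule[]>('/export/rules')
    return response.data
  }

  async createExportRule(rule: Partial<ExportRule>): Promise<ExportRule> {
    const response = await this.client.post<ExportRule>('/export/rules', rule)
    return response.data
  }

  async updateExportRule(ruleId: number, rule: Partial<ExportRule>): Promise<ExportRule> {
    const response = await this.client.put<ExportRule>(`/export/rules/${ruleId}`, rule)
    return response.data
  }

  async deleteExportRule(ruleId: number): Promise<void> {
    await this.client.delete(`/export/rules/${ruleId}`)
  }
}
```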
ocr_service_original.py
@@ -1,835 +0,0 @@
"""
Tool_OCR - Core OCR Service
PaddleOCR-VL integration for text and structure extraction
"""

import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import uuid

from paddleocr import PaddleOCR, PPStructureV3
from PIL import Image
from pdf2image import convert_from_path
import paddle

from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError

logger = logging.getLogger(__name__)


class OCRService:
    """
    Core OCR service using PaddleOCR-VL
    Handles text recognition and document structure analysis
    """

    def __init__(self):
        """Initialize PaddleOCR and PPStructure engines with GPU detection"""
        self.ocr_languages = settings.ocr_languages_list
        self.confidence_threshold = settings.ocr_confidence_threshold

        # Initialize PaddleOCR engine (will be lazy-loaded per language)
        self.ocr_engines = {}

        # Initialize PP-Structure for layout analysis
        self.structure_engine = None

        # Initialize Office document converter
        self.office_converter = OfficeConverter()

        # GPU Detection and Configuration
        self.gpu_available = False
        self.use_gpu = False
        self.gpu_info = {}

        self._detect_and_configure_gpu()

        logger.info("OCR Service initialized")

    def _detect_and_configure_gpu(self):
        """Detect GPU availability and configure usage"""
        try:
            # Check if forced CPU mode
            if settings.force_cpu_mode:
                logger.info("GPU mode forced to CPU by configuration")
                self.use_gpu = False
                self.gpu_info = {
                    'available': False,
                    'reason': 'CPU mode forced by configuration',
                }
                return

            # Check if PaddlePaddle is compiled with CUDA
            if paddle.is_compiled_with_cuda():
                # Check if GPU devices are available
                gpu_count = paddle.device.cuda.device_count()

                if gpu_count > 0:
                    self.gpu_available = True
                    self.use_gpu = True

                    # Get GPU device information
                    device_id = settings.gpu_device_id if settings.gpu_device_id < gpu_count else 0
                    gpu_props = paddle.device.cuda.get_device_properties(device_id)

                    self.gpu_info = {
                        'available': True,
                        'device_count': gpu_count,
                        'device_id': device_id,
                        'device_name': gpu_props.name,
                        'total_memory': gpu_props.total_memory,
                        'compute_capability': f"{gpu_props.major}.{gpu_props.minor}",
                    }

                    # Set GPU memory fraction
                    try:
                        paddle.device.set_device(f'gpu:{device_id}')
                        logger.info(f"GPU {device_id} selected: {gpu_props.name}")
                        logger.info(f"GPU memory: {gpu_props.total_memory / (1024**3):.2f} GB")
                        logger.info(f"Compute capability: {gpu_props.major}.{gpu_props.minor}")
                        logger.info(f"GPU memory fraction set to: {settings.gpu_memory_fraction}")
                    except Exception as e:
                        logger.warning(f"Failed to configure GPU device: {e}")
                        self.use_gpu = False
                        self.gpu_info['available'] = False
                        self.gpu_info['reason'] = f'GPU configuration failed: {str(e)}'
                else:
                    logger.warning("CUDA is available but no GPU devices found")
                    self.gpu_info = {
                        'available': False,
                        'reason': 'CUDA compiled but no GPU devices detected',
                    }
            else:
                logger.info("PaddlePaddle not compiled with CUDA support")
                self.gpu_info = {
                    'available': False,
                    'reason': 'PaddlePaddle not compiled with CUDA',
                }

        except Exception as e:
            logger.error(f"GPU detection failed: {e}")
            self.use_gpu = False
            self.gpu_info = {
                'available': False,
                'reason': f'GPU detection error: {str(e)}',
            }

        # Log final GPU status
        if self.use_gpu:
            logger.info(f"✓ GPU acceleration ENABLED - Using {self.gpu_info.get('device_name', 'Unknown GPU')}")
        else:
            reason = self.gpu_info.get('reason', 'Unknown')
            logger.info(f"ℹ GPU acceleration DISABLED - {reason} - Using CPU mode")

    def get_gpu_status(self) -> Dict:
        """
        Get current GPU status and information

        Returns:
            Dictionary with GPU status information
        """
        status = {
            'gpu_enabled': self.use_gpu,
            'gpu_available': self.gpu_available,
            **self.gpu_info,
        }

        # Add current GPU memory usage if GPU is being used
        if self.use_gpu and self.gpu_available:
            try:
                device_id = self.gpu_info.get('device_id', 0)
                # Get memory info (returns allocated, total in bytes)
                memory_allocated = paddle.device.cuda.memory_allocated(device_id)
                memory_reserved = paddle.device.cuda.memory_reserved(device_id)
                total_memory = self.gpu_info.get('total_memory', 0)

                status['memory_allocated_mb'] = memory_allocated / (1024**2)
                status['memory_reserved_mb'] = memory_reserved / (1024**2)
                status['memory_total_mb'] = total_memory / (1024**2)
                status['memory_utilization'] = (memory_allocated / total_memory * 100) if total_memory > 0 else 0
            except Exception as e:
                logger.warning(f"Failed to get GPU memory info: {e}")

        return status

    def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
        """
        Get or create OCR engine for specified language with GPU support

        Args:
            lang: Language code (ch, en, japan, korean, etc.)

        Returns:
            PaddleOCR engine instance
        """
        if lang not in self.ocr_engines:
            logger.info(f"Initializing PaddleOCR engine for language: {lang} (GPU: {self.use_gpu})")

            try:
                # PaddleOCR 3.x: Device is set globally via paddle.set_device()
                # No need to pass device/use_gpu/gpu_mem parameters
                self.ocr_engines[lang] = PaddleOCR(
                    lang=lang,
                    use_textline_orientation=True,  # Replaces deprecated use_angle_cls
                )
                logger.info(f"PaddleOCR engine ready for {lang} (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")

            except Exception as e:
                # If GPU initialization fails, fall back to CPU
                if self.use_gpu:
                    logger.warning(f"GPU initialization failed, falling back to CPU: {e}")
                    self.use_gpu = False
                    # Switch to CPU device globally
                    paddle.set_device('cpu')
                    self.ocr_engines[lang] = PaddleOCR(
                        lang=lang,
                        use_textline_orientation=True,
                    )
                    logger.info(f"PaddleOCR engine ready for {lang} (CPU mode - fallback)")
                else:
                    raise

        return self.ocr_engines[lang]

    def get_structure_engine(self) -> PPStructureV3:
        """
        Get or create PP-Structure engine for layout analysis with GPU support

        Returns:
            PPStructure engine instance
        """
        if self.structure_engine is None:
            logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")

            try:
                # PaddleOCR 3.x: Device is set globally via paddle.set_device()
                # No need to pass device/use_gpu/gpu_mem parameters
                self.structure_engine = PPStructureV3(
                    use_doc_orientation_classify=False,
                    use_doc_unwarping=False,
                    use_textline_orientation=False,
                    use_table_recognition=True,
                    use_formula_recognition=True,
                    use_chart_recognition=True,  # Enable chart recognition (requires PaddlePaddle >= 3.2.0 for fused_rms_norm_ext)
                    layout_threshold=0.5,
                )
                logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")

            except Exception as e:
                # If GPU initialization fails, fall back to CPU
                if self.use_gpu:
                    logger.warning(f"GPU initialization failed for PP-Structure, falling back to CPU: {e}")
                    self.use_gpu = False
                    # Switch to CPU device globally
                    paddle.set_device('cpu')
                    self.structure_engine = PPStructureV3(
                        use_doc_orientation_classify=False,
                        use_doc_unwarping=False,
                        use_textline_orientation=False,
                        use_table_recognition=True,
                        use_formula_recognition=True,
                        use_chart_recognition=True,  # Enable chart recognition (CPU fallback mode)
                        layout_threshold=0.5,
                    )
                    logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")
                else:
                    raise

        return self.structure_engine

    def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
        """
        Convert PDF to images (one per page)

        Args:
            pdf_path: Path to PDF file
            output_dir: Directory to save converted images

        Returns:
            List of paths to converted images
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

            logger.info(f"Converting PDF {pdf_path.name} to images")

            # Convert PDF to images (300 DPI for good quality)
            images = convert_from_path(
                str(pdf_path),
                dpi=300,
                fmt='png'
            )

            image_paths = []
            for i, image in enumerate(images):
                # Save each page as PNG
                image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
                image.save(str(image_path), 'PNG')
                image_paths.append(image_path)
                logger.info(f"Saved page {i+1} to {image_path.name}")

            logger.info(f"Converted {len(image_paths)} pages from PDF")
            return image_paths

        except Exception as e:
            logger.error(f"PDF conversion error: {str(e)}")
            raise

    def process_image(
        self,
        image_path: Path,
        lang: str = 'ch',
        detect_layout: bool = True,
        confidence_threshold: Optional[float] = None,
        output_dir: Optional[Path] = None,
        current_page: int = 0
    ) -> Dict:
        """
        Process single image with OCR and layout analysis

        Args:
            image_path: Path to image file
            lang: Language for OCR
            detect_layout: Whether to perform layout analysis
            confidence_threshold: Minimum confidence threshold (uses default if None)
            output_dir: Optional output directory for saving extracted images
            current_page: Current page number (0-based) for multi-page documents

        Returns:
            Dictionary with OCR results and metadata
        """
        start_time = datetime.now()
        threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold

        try:
            # Check if file is Office document
            if self.office_converter.is_office_document(image_path):
                logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
                try:
                    # Convert Office document to PDF
                    pdf_path = self.office_converter.convert_to_pdf(image_path)
                    logger.info(f"Office document converted to PDF: {pdf_path.name}")

                    # Process the PDF (will be handled by PDF processing logic below)
                    image_path = pdf_path
                except OfficeConverterError as e:
                    logger.error(f"Office conversion failed: {str(e)}")
                    raise

            # Check if file is PDF
            is_pdf = image_path.suffix.lower() == '.pdf'

            if is_pdf:
                # Convert PDF to images
                logger.info(f"Detected PDF file: {image_path.name}, converting to images")
                pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
                image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)

                # Process all pages
                all_text_regions = []
                total_confidence_sum = 0.0
                total_valid_regions = 0
                all_layout_data = []
                all_images_metadata = []
                all_ocr_dimensions = []

                for page_num, page_image_path in enumerate(image_paths, 1):
                    logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")

                    # Process each page with correct page number (0-based for layout data)
                    page_result = self.process_image(
                        page_image_path,
                        lang=lang,
                        detect_layout=detect_layout,
                        confidence_threshold=confidence_threshold,
                        output_dir=output_dir,
                        current_page=page_num - 1  # Convert to 0-based page number for layout data
                    )

                    # Accumulate results
                    if page_result['status'] == 'success':
                        # Add page number to each text region
                        for region in page_result['text_regions']:
                            region['page'] = page_num
                            all_text_regions.append(region)

                        total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
                        total_valid_regions += page_result['total_text_regions']

                        # Accumulate layout data (page numbers already set correctly in analyze_layout)
                        if page_result.get('layout_data'):
                            layout_data = page_result['layout_data']
                            all_layout_data.append(layout_data)

                        # Accumulate images metadata (page numbers already set correctly in analyze_layout)
                        if page_result.get('images_metadata'):
                            all_images_metadata.extend(page_result['images_metadata'])

                        # Store OCR dimensions for each page
                        if page_result.get('ocr_dimensions'):
                            all_ocr_dimensions.append({
                                'page': page_num,
                                'width': page_result['ocr_dimensions']['width'],
                                'height': page_result['ocr_dimensions']['height']
                            })

                # Calculate overall average confidence
                avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0

                # Combine layout data from all pages
                combined_layout = None
                if all_layout_data:
                    combined_elements = []
                    for layout in all_layout_data:
                        if layout.get('elements'):
                            combined_elements.extend(layout['elements'])
                    if combined_elements:
                        combined_layout = {
                            'elements': combined_elements,
                            'total_elements': len(combined_elements),
                            'reading_order': list(range(len(combined_elements))),
                        }

                # Generate combined markdown
                markdown_content = self.generate_markdown(all_text_regions, combined_layout)

                # Calculate processing time
                processing_time = (datetime.now() - start_time).total_seconds()

                logger.info(
                    f"PDF processing completed: {image_path.name} - "
                    f"{len(image_paths)} pages, "
                    f"{len(all_text_regions)} regions, "
                    f"{avg_confidence:.2f} avg confidence, "
                    f"{processing_time:.2f}s"
                )

                return {
                    'status': 'success',
                    'file_name': image_path.name,
                    'language': lang,
                    'text_regions': all_text_regions,
                    'total_text_regions': len(all_text_regions),
                    'average_confidence': avg_confidence,
                    'layout_data': combined_layout,
                    'images_metadata': all_images_metadata,
                    'markdown_content': markdown_content,
                    'processing_time': processing_time,
                    'timestamp': datetime.utcnow().isoformat(),
                    'total_pages': len(image_paths),
                    'ocr_dimensions': all_ocr_dimensions if all_ocr_dimensions else None,
                }

            # Get OCR engine (for non-PDF images)
            ocr_engine = self.get_ocr_engine(lang)

            # Get the actual image dimensions that OCR will use
            from PIL import Image
            with Image.open(image_path) as img:
                ocr_width, ocr_height = img.size
            logger.info(f"OCR processing image dimensions: {ocr_width}x{ocr_height}")

            # Perform OCR
            logger.info(f"Processing image: {image_path.name}")
            # Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
            ocr_results = ocr_engine.ocr(str(image_path))

            # Parse OCR results (PaddleOCR 3.x format)
            text_regions = []
            total_confidence = 0.0
            valid_regions = 0

            if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
                # PaddleOCR 3.x returns a list of dictionaries (one per page)
                for page_result in ocr_results:
                    if isinstance(page_result, dict):
                        # New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
                        texts = page_result.get('rec_texts', [])
                        scores = page_result.get('rec_scores', [])
                        polys = page_result.get('rec_polys', [])

                        # Process each recognized text
                        for idx, text in enumerate(texts):
                            # Get corresponding score and bbox
                            confidence = scores[idx] if idx < len(scores) else 1.0
                            bbox = polys[idx] if idx < len(polys) else []

                            # Convert numpy array bbox to list for JSON serialization
                            if hasattr(bbox, 'tolist'):
                                bbox = bbox.tolist()

                            # Filter by confidence threshold
                            if confidence >= threshold:
                                text_regions.append({
                                    'text': text,
                                    'bbox': bbox,
                                    'confidence': float(confidence),
                                })
                                total_confidence += confidence
                                valid_regions += 1

            avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0

            logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")

            # Layout analysis (if requested)
            layout_data = None
            images_metadata = []

            if detect_layout:
                # Pass current_page to analyze_layout for correct page numbering
                layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir, current_page=current_page)

            # Generate Markdown
            markdown_content = self.generate_markdown(text_regions, layout_data)

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds()

            result = {
                'status': 'success',
                'file_name': image_path.name,
                'language': lang,
                'text_regions': text_regions,
                'total_text_regions': len(text_regions),
                'average_confidence': avg_confidence,
                'layout_data': layout_data,
                'images_metadata': images_metadata,
                'markdown_content': markdown_content,
                'processing_time': processing_time,
                'timestamp': datetime.utcnow().isoformat(),
                'ocr_dimensions': {
                    'width': ocr_width,
                    'height': ocr_height
                }
            }

            logger.info(
                f"OCR completed: {image_path.name} - "
                f"{len(text_regions)} regions, "
                f"{avg_confidence:.2f} avg confidence, "
                f"{processing_time:.2f}s"
            )

            return result

        except Exception as e:
            import traceback
            error_trace = traceback.format_exc()
            logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
            return {
                'status': 'error',
                'file_name': image_path.name,
                'error_message': str(e),
                'processing_time': (datetime.now() - start_time).total_seconds(),
            }

    def _extract_table_text(self, html_content: str) -> str:
        """
        Extract text from HTML table content for translation purposes

        Args:
            html_content: HTML content containing table

        Returns:
            Extracted text from table cells
        """
        try:
            from html.parser import HTMLParser

            class TableTextExtractor(HTMLParser):
                def __init__(self):
                    super().__init__()
                    self.text_parts = []
                    self.in_table = False

                def handle_starttag(self, tag, attrs):
                    if tag == 'table':
                        self.in_table = True

                def handle_endtag(self, tag):
                    if tag == 'table':
                        self.in_table = False
                    elif tag in ('td', 'th') and self.in_table:
                        self.text_parts.append(' | ')  # Cell separator
                    elif tag == 'tr' and self.in_table:
                        self.text_parts.append('\n')  # Row separator

                def handle_data(self, data):
                    if self.in_table:
                        stripped = data.strip()
                        if stripped:
                            self.text_parts.append(stripped)

            parser = TableTextExtractor()
            parser.feed(html_content)

            # Clean up the extracted text
            extracted = ''.join(parser.text_parts)
            # Remove multiple separators
            import re
            extracted = re.sub(r'\s*\|\s*\|+\s*', ' | ', extracted)
            extracted = re.sub(r'\n+', '\n', extracted)
            extracted = extracted.strip()

            return extracted

        except Exception as e:
            logger.warning(f"Failed to extract table text: {e}")
            # Fallback: just remove HTML tags
            import re
            text = re.sub(r'<[^>]+>', ' ', html_content)
            text = re.sub(r'\s+', ' ', text)
            return text.strip()

    def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3

        Args:
            image_path: Path to image file
            output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
            current_page: Current page number (0-based) for multi-page documents

        Returns:
            Tuple of (layout_data, images_metadata)
        """
        try:
            structure_engine = self.get_structure_engine()

            # Perform structure analysis using predict() method (PaddleOCR 3.x API)
            logger.info(f"Running layout analysis on {image_path.name}")
            results = structure_engine.predict(str(image_path))

            layout_elements = []
            images_metadata = []

            # Process each page result (for images, usually just one page)
            for page_idx, page_result in enumerate(results):
                # Get markdown dictionary from result object
                if hasattr(page_result, 'markdown'):
                    markdown_dict = page_result.markdown
                    logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")

                    # Extract layout information from markdown structure
                    if isinstance(markdown_dict, dict):
                        # Get markdown texts (HTML format with tables and structure)
                        markdown_texts = markdown_dict.get('markdown_texts', '')
                        markdown_images = markdown_dict.get('markdown_images', {})

                        # Create a layout element for the structured content
                        if markdown_texts:
                            # Parse HTML content to identify tables and text
                            import re

                            # Check if content contains tables
                            has_table = '<table' in markdown_texts.lower()

                            element = {
                                'element_id': len(layout_elements),
                                'type': 'table' if has_table else 'text',
                                'content': markdown_texts,
                                'page': current_page,  # Use current_page parameter instead of page_idx
                                'bbox': [],  # PP-StructureV3 doesn't provide individual bbox in this format
                            }

                            # Extract text from table for translation purposes
                            if has_table:
                                table_text = self._extract_table_text(markdown_texts)
                                element['extracted_text'] = table_text
                                logger.info(f"Extracted {len(table_text)} characters from table")

                            layout_elements.append(element)

                        # Add image metadata and SAVE images to disk
                        for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
                            # Save image to disk
                            try:
                                # Determine base directory for saving images
                                base_dir = output_dir if output_dir else image_path.parent

                                # Create full path for image file
                                full_img_path = base_dir / img_path

                                # Create imgs/ subdirectory if it doesn't exist
                                full_img_path.parent.mkdir(parents=True, exist_ok=True)

                                # Save image object to disk
                                if hasattr(img_obj, 'save'):
                                    # img_obj is PIL Image
                                    img_obj.save(str(full_img_path))
                                    logger.info(f"Saved extracted image to {full_img_path}")
                                else:
                                    logger.warning(f"Image object for {img_path} does not have save() method, skipping")

                            except Exception as e:
                                logger.warning(f"Failed to save image {img_path}: {str(e)}")
                                # Continue processing even if image save fails

                            # Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
                            bbox = []
                            try:
                                import re
                                match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
                                if match:
                                    x1, y1, x2, y2 = map(int, match.groups())
                                    # Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
                                    bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
                                    logger.info(f"Extracted bbox from filename: {bbox}")
                            except Exception as e:
                                logger.warning(f"Failed to extract bbox from {img_path}: {e}")

                            images_metadata.append({
                                'element_id': len(layout_elements) + img_idx,
                                'image_path': img_path,
                                'type': 'image',
                                'page': current_page,  # Use current_page parameter instead of page_idx
                                'bbox': bbox,
                            })

            if layout_elements:
                layout_data = {
                    'elements': layout_elements,
                    'total_elements': len(layout_elements),
                    'reading_order': list(range(len(layout_elements))),
                }
                logger.info(f"Detected {len(layout_elements)} layout elements")
                return layout_data, images_metadata
            else:
                logger.warning("No layout elements detected")
                return None, []

        except Exception as e:
            import traceback
            error_trace = traceback.format_exc()
            logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
            return None, []

    def generate_markdown(
        self,
        text_regions: List[Dict],
        layout_data: Optional[Dict] = None
    ) -> str:
        """
        Generate Markdown from OCR results

        Args:
            text_regions: List of text regions with bbox and text
            layout_data: Optional layout structure information

        Returns:
            Markdown formatted string
        """
        markdown_lines = []

        if layout_data and layout_data.get('elements'):
            # Generate structured Markdown based on layout
            for element in layout_data['elements']:
                element_type = element.get('type', 'text')
                content = element.get('content', '')

                if element_type == 'title':
                    markdown_lines.append(f"# {content}\n")
                elif element_type == 'table':
                    # Table in HTML format
                    markdown_lines.append(content)
                    markdown_lines.append("")
                elif element_type == 'figure':
                    element_id = element.get('element_id')
                    markdown_lines.append(f"\n")
                else:
                    markdown_lines.append(f"{content}\n")

        else:
            # Simple Markdown from text regions only
            # Sort by vertical position (top to bottom)
            def get_y_coord(region):
                """Safely extract Y coordinate from bbox"""
                bbox = region.get('bbox', [])
                if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
                    if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
                        return bbox[0][1]  # [[x1,y1], [x2,y2], ...] format
                    elif len(bbox) > 1:
                        return bbox[1]  # [x1, y1, x2, y2, ...] format
                return 0  # Default to 0 if can't extract

            sorted_regions = sorted(text_regions, key=get_y_coord)

            for region in sorted_regions:
                text = region['text']
                markdown_lines.append(text)

        return "\n".join(markdown_lines)

    def save_results(
        self,
        result: Dict,
        output_dir: Path,
        file_id: str,
        source_file_path: Optional[Path] = None
    ) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
        """
        Save OCR results to JSON, Markdown, and layout-preserving PDF files

        Args:
            result: OCR result dictionary
            output_dir: Output directory
            file_id: Unique file identifier
            source_file_path: Optional path to original source file for PDF generation

        Returns:
            Tuple of (json_path, markdown_path, pdf_path)
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save JSON
            json_path = output_dir / f"{file_id}_result.json"
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            # Save Markdown
            markdown_path = output_dir / f"{file_id}_output.md"
            markdown_content = result.get('markdown_content', '')
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")

            # Generate layout-preserving PDF
            pdf_path = None
            try:
                from app.services.pdf_generator_service import pdf_generator_service

                pdf_filename = f"{file_id}_layout.pdf"
                pdf_path = output_dir / pdf_filename

                logger.info(f"Generating layout-preserving PDF: {pdf_filename}")

                success = pdf_generator_service.generate_layout_pdf(
                    json_path=json_path,
                    output_path=pdf_path,
                    source_file_path=source_file_path
                )

                if success:
                    logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
                else:
                    logger.warning(f"✗ PDF generation failed for {file_id}")
                    pdf_path = None

            except Exception as e:
                logger.error(f"Error generating PDF for {file_id}: {str(e)}")
                import traceback
                traceback.print_exc()
                pdf_path = None

            return json_path, markdown_path, pdf_path

        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")
            return None, None, None
pdf_font_manager.py
@@ -1,312 +0,0 @@
"""
PDF Font Manager - Handles font loading, registration, and fallback.

This module provides unified font management for PDF generation,
including CJK font support and font fallback mechanisms.
"""

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

logger = logging.getLogger(__name__)


# ============================================================================
# Configuration
# ============================================================================

@dataclass
class FontConfig:
    """Configuration for font management."""
    # Primary fonts
    chinese_font_name: str = "NotoSansSC"
    chinese_font_path: Optional[Path] = None

    # Fallback fonts (built-in)
    fallback_font_name: str = "Helvetica"
    fallback_cjk_font_name: str = "HeiseiMin-W3"  # Built-in ReportLab CJK

    # Font sizes
    default_font_size: int = 10
    min_font_size: int = 6
    max_font_size: int = 14

    # Font registration options
    auto_register: bool = True
    enable_cjk_fallback: bool = True


# ============================================================================
# Font Manager
# ============================================================================

class FontManager:
    """
    Manages font registration and selection for PDF generation.

    Features:
    - Lazy font registration
    - CJK (Chinese/Japanese/Korean) font support
    - Automatic fallback to built-in fonts
    - Font caching to avoid duplicate registration
    """

    _instance = None
    _registered_fonts: Dict[str, Path] = {}

    def __new__(cls, *args, **kwargs):
        """Singleton pattern to avoid duplicate font registration."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self, config: Optional[FontConfig] = None):
        """
        Initialize FontManager.

        Args:
            config: FontConfig instance (uses defaults if None)
        """
        if self._initialized:
            return

        self.config = config or FontConfig()
        self._primary_font_registered = False
        self._cjk_fallback_available = False

        # Auto-register fonts if enabled
        if self.config.auto_register:
            self._register_fonts()

        self._initialized = True

    @property
    def primary_font_name(self) -> str:
        """Get the primary font name to use."""
        if self._primary_font_registered:
            return self.config.chinese_font_name
        return self.config.fallback_font_name

    @property
    def is_cjk_enabled(self) -> bool:
        """Check if CJK fonts are available."""
        return self._primary_font_registered or self._cjk_fallback_available

    @classmethod
    def reset(cls):
        """Reset singleton instance (for testing)."""
        cls._instance = None
        cls._registered_fonts = {}

    def get_font_for_text(self, text: str) -> str:
        """
        Get appropriate font name for given text.

        Args:
            text: Text to render

        Returns:
            Font name suitable for the text content
        """
        if self._contains_cjk(text):
            if self._primary_font_registered:
                return self.config.chinese_font_name
            elif self._cjk_fallback_available:
                return self.config.fallback_cjk_font_name
        return self.primary_font_name

    def get_font_size(
        self,
        text: str,
        available_width: float,
        available_height: float,
        pdf_canvas=None
    ) -> int:
        """
        Calculate optimal font size for text to fit within bounds.

        Args:
            text: Text to render
            available_width: Maximum width available
            available_height: Maximum height available
            pdf_canvas: Optional canvas for precise measurement

        Returns:
            Font size that fits within bounds
        """
        font_name = self.get_font_for_text(text)

        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            if pdf_canvas:
                # Precise measurement with canvas
                text_width = pdf_canvas.stringWidth(text, font_name, size)
            else:
                # Approximate measurement
                text_width = len(text) * size * 0.6  # Rough estimate

            text_height = size * 1.2  # Line height

            if text_width <= available_width and text_height <= available_height:
                return size

        return self.config.min_font_size

    def register_font(
        self,
        font_name: str,
        font_path: Path,
        force: bool = False
    ) -> bool:
        """
        Register a custom font.

        Args:
            font_name: Name to register font under
            font_path: Path to TTF font file
            force: Force re-registration if already registered

        Returns:
            True if registration successful
        """
        if font_name in self._registered_fonts and not force:
            logger.debug(f"Font {font_name} already registered")
            return True

        try:
            if not font_path.exists():
                logger.error(f"Font file not found: {font_path}")
                return False

            pdfmetrics.registerFont(TTFont(font_name, str(font_path)))
            self._registered_fonts[font_name] = font_path
            logger.info(f"Font registered: {font_name} from {font_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to register font {font_name}: {e}")
            return False

    def get_registered_fonts(self) -> List[str]:
        """Get list of registered custom font names."""
        return list(self._registered_fonts.keys())

    # =========================================================================
    # Private Methods
    # =========================================================================

    def _register_fonts(self):
        """Register configured fonts."""
        # Register primary Chinese font
        if self.config.chinese_font_path:
            self._register_chinese_font()

        # Setup CJK fallback
        if self.config.enable_cjk_fallback:
            self._setup_cjk_fallback()

    def _register_chinese_font(self):
        """Register the primary Chinese font."""
        font_path = self.config.chinese_font_path

        if font_path is None:
            # Try to load from settings
            try:
                from app.core.config import settings
                font_path = Path(settings.chinese_font_path)
            except Exception as e:
                logger.debug(f"Could not load font path from settings: {e}")
                return

        # Resolve relative path
        if not font_path.is_absolute():
            # Try project root
            project_root = Path(__file__).resolve().parent.parent.parent.parent
            font_path = project_root / font_path

        if not font_path.exists():
            logger.warning(f"Chinese font not found at {font_path}")
            return

        try:
            pdfmetrics.registerFont(TTFont(self.config.chinese_font_name, str(font_path)))
            self._registered_fonts[self.config.chinese_font_name] = font_path
            self._primary_font_registered = True
            logger.info(f"Chinese font registered: {self.config.chinese_font_name}")
        except Exception as e:
            logger.error(f"Failed to register Chinese font: {e}")

    def _setup_cjk_fallback(self):
        """Setup CJK fallback using built-in fonts."""
        try:
            # ReportLab includes CID fonts for CJK
            from reportlab.pdfbase.cidfonts import UnicodeCIDFont

            # Register CJK fonts if not already registered
            try:
                pdfmetrics.registerFont(UnicodeCIDFont('HeiseiMin-W3'))
                self._cjk_fallback_available = True
                logger.debug("CJK fallback font available: HeiseiMin-W3")
            except Exception:
                pass  # Font may already be registered

        except ImportError:
            logger.debug("CID fonts not available for CJK fallback")

    def _contains_cjk(self, text: str) -> bool:
        """
        Check if text contains CJK characters.

        Args:
            text: Text to check

        Returns:
            True if text contains Chinese, Japanese, or Korean characters
        """
        if not text:
            return False

        for char in text:
            code = ord(char)
            # CJK Unified Ideographs and related ranges
            if any([
                0x4E00 <= code <= 0x9FFF,    # CJK Unified Ideographs
                0x3400 <= code <= 0x4DBF,    # CJK Extension A
                0x20000 <= code <= 0x2A6DF,  # CJK Extension B
                0x3000 <= code <= 0x303F,    # CJK Punctuation
                0x3040 <= code <= 0x309F,    # Hiragana
                0x30A0 <= code <= 0x30FF,    # Katakana
                0xAC00 <= code <= 0xD7AF,    # Korean Hangul
            ]):
                return True
        return False


# ============================================================================
# Convenience Functions
# ============================================================================

_default_manager: Optional[FontManager] = None


def get_font_manager() -> FontManager:
    """Get the default FontManager instance."""
    global _default_manager
    if _default_manager is None:
        _default_manager = FontManager()
    return _default_manager


def register_font(font_name: str, font_path: Path) -> bool:
    """Register a font using the default manager."""
    return get_font_manager().register_font(font_name, font_path)


def get_font_for_text(text: str) -> str:
    """Get appropriate font for text using the default manager."""
    return get_font_manager().get_font_for_text(text)
preprocessor.py
@@ -1,230 +0,0 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""

import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np

from app.core.config import settings

logger = logging.getLogger(__name__)


class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization
    Validates and prepares documents for OCR processing
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    def __init__(self):
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message)
        """
        try:
            # Check file exists
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            # Check file size
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect file format using magic numbers
            mime = magic.Magic(mime=True)
            mime_type = mime.from_file(str(file_path))

            # Map MIME type to format
            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            # Check if format is in allowed extensions
            if file_format not in self.allowed_extensions:
                return False, None, f"File format '{file_format}' not allowed"

            # Validate file integrity
            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info(f"File validated successfully: {file_path.name} ({file_format})")
            return True, file_format, None

        except Exception as e:
            logger.error(f"File validation error: {str(e)}")
            return False, None, f"Validation error: {str(e)}"

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """Convert MIME type to file format"""
        mime_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/bmp': 'bmp',
            'image/tiff': 'tiff',
            'image/x-tiff': 'tiff',
            'application/pdf': 'pdf',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        }
        return mime_map.get(mime_type)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                # Try to open image
                with Image.open(file_path) as img:
                    img.verify()  # Verify image integrity
                # Reopen for actual check (verify() closes the file)
                with Image.open(file_path) as img:
                    _ = img.size  # Force load to detect corruption
                return True, None

            elif file_format == 'pdf':
                # Basic PDF validation - check file starts with PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                    if header != b'%PDF-':
                        return False, "Invalid PDF header"
                return True, None

            elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Office documents - basic validation (check file size and can be opened)
                # Modern Office formats (docx, pptx) are ZIP-based
                if file_format in ['docx', 'pptx']:
                    import zipfile
                    try:
                        with zipfile.ZipFile(file_path, 'r') as zf:
                            # Check if it has the required Office structure
                            if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
                                return False, "Invalid DOCX structure"
                            elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
                                return False, "Invalid PPTX structure"
                    except zipfile.BadZipFile:
                        return False, "Invalid Office file (corrupt ZIP)"
                # Old formats (doc, ppt) - just check file exists and has content
                return True, None

            else:
                return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement
            output_path: Optional output path (defaults to temp directory)

        Returns:
            Tuple of (success, processed_image_path, error_message)
        """
        try:
            # Read image
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding to handle varying lighting
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            # Determine output path
            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"

            # Save processed image
            cv2.imwrite(str(output_path), processed)

            logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
            return True, output_path, None

        except Exception as e:
            logger.error(f"Image preprocessing error: {str(e)}")
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information

        Args:
            file_path: Path to file

        Returns:
            Dictionary with file information
        """
        stat = file_path.stat()
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(str(file_path))

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
        }
@@ -1,7 +1,6 @@
|
||||
import { Outlet, NavLink, useNavigate } from 'react-router-dom'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useAuthStore } from '@/store/authStore'
|
||||
import { apiClient } from '@/services/api'
|
||||
import { apiClientV2 } from '@/services/apiV2'
|
||||
import {
|
||||
Upload,
|
||||
@@ -29,12 +28,7 @@ export default function Layout() {
|
||||
|
||||
const handleLogout = async () => {
|
||||
try {
|
||||
// Use V2 API if authenticated with V2
|
||||
if (apiClientV2.isAuthenticated()) {
|
||||
await apiClientV2.logout()
|
||||
} else {
|
||||
apiClient.logout()
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Logout error:', error)
|
||||
} finally {
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
|
||||
|
||||
interface MarkdownPreviewProps {
|
||||
title?: string
|
||||
content: string
|
||||
className?: string
|
||||
}
|
||||
|
||||
export default function MarkdownPreview({ title, content, className }: MarkdownPreviewProps) {
|
||||
return (
|
||||
<Card className={className}>
|
||||
{title && (
|
||||
<CardHeader>
|
||||
<CardTitle>{title}</CardTitle>
|
||||
</CardHeader>
|
||||
)}
|
||||
<CardContent>
|
||||
<div className="prose prose-sm max-w-none dark:prose-invert">
|
||||
<pre className="whitespace-pre-wrap break-words bg-muted p-4 rounded-md overflow-auto max-h-[600px]">
|
||||
{content}
|
||||
</pre>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
)
|
||||
}
|
||||
@@ -1,90 +0,0 @@
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from '@/components/ui/table'
|
||||
import { Badge } from '@/components/ui/badge'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import type { FileResult } from '@/types/apiV2'
|
||||
|
||||
interface ResultsTableProps {
|
||||
files: FileResult[]
|
||||
onViewResult?: (fileId: number) => void
|
||||
onDownloadPDF?: (fileId: number) => void
|
||||
}
|
||||
|
||||
export default function ResultsTable({ files, onViewResult, onDownloadPDF }: ResultsTableProps) {
|
||||
const { t } = useTranslation()
|
||||
|
||||
const getStatusBadge = (status: FileResult['status']) => {
|
||||
switch (status) {
|
||||
case 'completed':
|
||||
return <Badge variant="success">{t('processing.completed')}</Badge>
|
||||
case 'processing':
|
||||
return <Badge variant="default">{t('processing.processing')}</Badge>
|
||||
case 'failed':
|
||||
return <Badge variant="destructive">{t('processing.failed')}</Badge>
|
||||
default:
|
||||
return <Badge variant="secondary">{t('processing.pending')}</Badge>
|
||||
}
|
||||
}
|
||||
|
||||
const formatTime = (seconds?: number) => {
|
||||
if (!seconds) return 'N/A'
|
||||
return `${seconds.toFixed(2)}s`
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="rounded-md border">
|
||||
<Table>
|
||||
<TableHeader>
|
||||
<TableRow>
|
||||
<TableHead>{t('results.filename')}</TableHead>
|
||||
<TableHead>{t('results.status')}</TableHead>
|
||||
<TableHead>{t('results.processingTime')}</TableHead>
|
||||
<TableHead className="text-right">{t('results.actions')}</TableHead>
|
||||
</TableRow>
|
||||
</TableHeader>
|
||||
<TableBody>
|
||||
{files.length === 0 ? (
|
||||
<TableRow>
|
||||
<TableCell colSpan={4} className="text-center text-muted-foreground">
|
||||
{t('results.noResults')}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
) : (
|
||||
files.map((file) => (
|
||||
<TableRow key={file.id}>
|
||||
<TableCell className="font-medium">{file.filename}</TableCell>
|
||||
<TableCell>{getStatusBadge(file.status)}</TableCell>
|
||||
<TableCell>{formatTime(file.processing_time)}</TableCell>
|
||||
<TableCell className="text-right">
|
||||
<div className="flex justify-end gap-2">
|
||||
{file.status === 'completed' && (
|
||||
<>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={() => onViewResult?.(file.id)}
|
||||
>
|
||||
{t('results.viewMarkdown')}
|
||||
</Button>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={() => onDownloadPDF?.(file.id)}
|
||||
>
|
||||
{t('results.downloadPDF')}
|
||||
</Button>
|
||||
</>
|
||||
)}
|
||||
{file.status === 'failed' && file.error && (
|
||||
<span className="text-sm text-destructive">{file.error}</span>
|
||||
)}
|
||||
</div>
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))
|
||||
)}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -4,7 +4,7 @@ import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { useToast } from '@/components/ui/toast'
|
||||
import { apiClient } from '@/services/api'
|
||||
import { apiClientV2 } from '@/services/apiV2'
|
||||
import type { ExportRule } from '@/types/apiV2'
|
||||
|
||||
export default function SettingsPage() {
|
||||
@@ -25,12 +25,12 @@ export default function SettingsPage() {
|
||||
// Fetch export rules
|
||||
const { data: exportRules, isLoading } = useQuery({
|
||||
queryKey: ['exportRules'],
|
||||
queryFn: () => apiClient.getExportRules(),
|
||||
queryFn: () => apiClientV2.getExportRules(),
|
||||
})
|
||||
|
||||
// Create rule mutation
|
||||
const createRuleMutation = useMutation({
|
||||
mutationFn: (rule: any) => apiClient.createExportRule(rule),
|
||||
mutationFn: (rule: any) => apiClientV2.createExportRule(rule),
|
||||
onSuccess: () => {
|
||||
queryClient.invalidateQueries({ queryKey: ['exportRules'] })
|
||||
setIsCreating(false)
|
||||
@@ -53,7 +53,7 @@ export default function SettingsPage() {
|
||||
// Update rule mutation
|
||||
const updateRuleMutation = useMutation({
|
||||
mutationFn: ({ ruleId, rule }: { ruleId: number; rule: any }) =>
|
||||
apiClient.updateExportRule(ruleId, rule),
|
||||
apiClientV2.updateExportRule(ruleId, rule),
|
||||
onSuccess: () => {
|
||||
queryClient.invalidateQueries({ queryKey: ['exportRules'] })
|
||||
setEditingRule(null)
|
||||
@@ -75,7 +75,7 @@ export default function SettingsPage() {
|
||||
|
||||
// Delete rule mutation
|
||||
const deleteRuleMutation = useMutation({
|
||||
mutationFn: (ruleId: number) => apiClient.deleteExportRule(ruleId),
|
||||
mutationFn: (ruleId: number) => apiClientV2.deleteExportRule(ruleId),
|
||||
onSuccess: () => {
|
||||
queryClient.invalidateQueries({ queryKey: ['exportRules'] })
|
||||
toast({
|
||||
|
||||
services/api.ts (removed)
@@ -1,271 +0,0 @@
import axios, { AxiosError } from 'axios'
import type { AxiosInstance } from 'axios'
import type {
  LoginRequest,
  LoginResponse,
  UploadResponse,
  ProcessRequest,
  ProcessResponse,
  BatchStatus,
  OCRResult,
  ExportRequest,
  ExportRule,
  CSSTemplate,
  TranslateRequest,
  TranslateResponse,
  ApiError,
} from '@/types/api'

/**
 * API Client Configuration
 * - In Docker: VITE_API_BASE_URL is empty string, use relative path
 * - In development: Use VITE_API_BASE_URL from .env or default to localhost:8000
 */
const envApiBaseUrl = import.meta.env.VITE_API_BASE_URL
const API_BASE_URL = envApiBaseUrl !== undefined ? envApiBaseUrl : 'http://localhost:8000'
const API_VERSION = 'v2'

class ApiClient {
  private client: AxiosInstance
  private token: string | null = null

  constructor() {
    this.client = axios.create({
      baseURL: `${API_BASE_URL}/api/${API_VERSION}`,
      timeout: 30000,
      headers: {
        'Content-Type': 'application/json',
      },
    })

    // Request interceptor to add auth token
    this.client.interceptors.request.use(
      (config) => {
        if (this.token) {
          config.headers.Authorization = `Bearer ${this.token}`
        }
        return config
      },
      (error) => Promise.reject(error)
    )

    // Response interceptor for error handling
    this.client.interceptors.response.use(
      (response) => response,
      (error: AxiosError<ApiError>) => {
        if (error.response?.status === 401) {
          // Token expired or invalid
          this.clearToken()
          window.location.href = '/login'
        }
        return Promise.reject(error)
      }
    )

    // Load token from localStorage
    this.loadToken()
  }

  /**
   * Set authentication token
   */
  setToken(token: string) {
    this.token = token
    localStorage.setItem('auth_token', token)
  }

  /**
   * Clear authentication token
   */
  clearToken() {
    this.token = null
    localStorage.removeItem('auth_token')
  }

  /**
   * Load token from localStorage
   */
  private loadToken() {
    const token = localStorage.getItem('auth_token')
    if (token) {
      this.token = token
    }
  }

  /**
   * Check if user is authenticated
   */
  isAuthenticated(): boolean {
    return this.token !== null
  }

  // ==================== Authentication ====================

  /**
   * Login
   */
  async login(data: LoginRequest): Promise<LoginResponse> {
    const response = await this.client.post<LoginResponse>('/auth/login', {
      username: data.username,
      password: data.password,
    })

    this.setToken(response.data.access_token)
    return response.data
  }

  /**
   * Logout
   */
  logout() {
    this.clearToken()
  }

  // ==================== File Upload ====================

  /**
   * Upload files
   */
  async uploadFiles(files: File[]): Promise<UploadResponse> {
    const formData = new FormData()
    files.forEach((file) => {
      formData.append('files', file)
    })

    const response = await this.client.post<UploadResponse>('/upload', formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    })

    return response.data
  }

  // ==================== OCR Processing ====================

  /**
   * Process OCR
   */
  async processOCR(data: ProcessRequest): Promise<ProcessResponse> {
    const response = await this.client.post<ProcessResponse>('/ocr/process', data)
    return response.data
  }

  /**
   * Get OCR result by file ID
   * Note: Backend uses file-level tracking, not task-level
   */
  async getOCRResult(fileId: number): Promise<OCRResult> {
    const response = await this.client.get<OCRResult>(`/ocr/result/${fileId}`)
    return response.data
  }

  /**
   * Get batch status
   */
  async getBatchStatus(batchId: number): Promise<BatchStatus> {
    const response = await this.client.get<BatchStatus>(`/batch/${batchId}/status`)
    return response.data
  }

  // ==================== Export ====================

  /**
   * Export results
   */
  async exportResults(data: ExportRequest): Promise<Blob> {
    const response = await this.client.post('/export', data, {
      responseType: 'blob',
    })
    return response.data
  }

  /**
   * Generate and download PDF
   */
  async exportPDF(fileId: number, cssTemplate?: string): Promise<Blob> {
    const params = cssTemplate ? { css_template: cssTemplate } : {}
    const response = await this.client.get(`/export/pdf/${fileId}`, {
      params,
      responseType: 'blob',
    })
    return response.data
  }

  /**
   * Get export rules
   */
  async getExportRules(): Promise<ExportRule[]> {
    const response = await this.client.get<ExportRule[]>('/export/rules')
    return response.data
  }

  /**
   * Create export rule
   */
  async createExportRule(rule: Omit<ExportRule, 'id' | 'created_at'>): Promise<ExportRule> {
    const response = await this.client.post<ExportRule>('/export/rules', rule)
    return response.data
  }

  /**
   * Update export rule
   */
  async updateExportRule(ruleId: number, rule: Partial<ExportRule>): Promise<ExportRule> {
    const response = await this.client.put<ExportRule>(`/export/rules/${ruleId}`, rule)
    return response.data
  }

  /**
   * Delete export rule
   */
  async deleteExportRule(ruleId: number): Promise<void> {
    await this.client.delete(`/export/rules/${ruleId}`)
  }

  /**
   * Get CSS templates
   */
  async getCSSTemplates(): Promise<CSSTemplate[]> {
    const response = await this.client.get<CSSTemplate[]>('/export/css-templates')
    return response.data
  }

  // ==================== Translation (FUTURE FEATURE - STUB) ====================

  /**
   * Translate document (STUB - Not yet implemented)
   * This is a placeholder for future translation functionality
   * @throws Will throw error with status 501 (Not Implemented)
   */
  async translateDocument(data: TranslateRequest): Promise<TranslateResponse> {
    // This endpoint is expected to return 501 Not Implemented until Phase 5
    const response = await this.client.post<TranslateResponse>('/translate/document', data)
    return response.data
  }

  /**
   * Get translation configs (NOT IMPLEMENTED)
   * This endpoint does not exist on backend - configs will be part of Phase 5
   * @deprecated Backend endpoint does not exist - will return 404
   */
  // async getTranslationConfigs(): Promise<TranslationConfig[]> {
  //   const response = await this.client.get<TranslationConfig[]>('/translate/configs')
  //   return response.data
  // }

  /**
   * Create translation config (NOT IMPLEMENTED)
   * This endpoint does not exist on backend - configs will be part of Phase 5
   * @deprecated Backend endpoint does not exist - will return 404
   */
  // async createTranslationConfig(
  //   config: Omit<TranslationConfig, 'id' | 'created_at'>
  // ): Promise<TranslationConfig> {
  //   const response = await this.client.post<TranslationConfig>('/translate/configs', config)
  //   return response.data
  // }
}

// Export singleton instance
export const apiClient = new ApiClient()
services/apiV2.ts
@@ -38,6 +38,7 @@ import type {
   TranslationStatusResponse,
   TranslationListResponse,
   TranslationResult,
+  ExportRule,
 } from '@/types/apiV2'

 /**
@@ -713,6 +714,39 @@ class ApiClientV2 {
     link.click()
     window.URL.revokeObjectURL(link.href)
   }
+
+  // ==================== Export Rules APIs ====================
+
+  /**
+   * Get export rules
+   */
+  async getExportRules(): Promise<ExportRule[]> {
+    const response = await this.client.get<ExportRule[]>('/export/rules')
+    return response.data
+  }
+
+  /**
+   * Create export rule
+   */
+  async createExportRule(rule: Omit<ExportRule, 'id' | 'created_at'>): Promise<ExportRule> {
+    const response = await this.client.post<ExportRule>('/export/rules', rule)
+    return response.data
+  }
+
+  /**
+   * Update export rule
+   */
+  async updateExportRule(ruleId: number, rule: Partial<ExportRule>): Promise<ExportRule> {
+    const response = await this.client.put<ExportRule>(`/export/rules/${ruleId}`, rule)
+    return response.data
+  }
+
+  /**
+   * Delete export rule
+   */
+  async deleteExportRule(ruleId: number): Promise<void> {
+    await this.client.delete(`/export/rules/${ruleId}`)
+  }
 }

 // Export singleton instance
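A short usage sketch of the new CRUD surface; the method signatures come from the hunk above, while the rule values and variable names are illustrative:

// Illustrative walkthrough of the export-rules CRUD methods (values hypothetical)
import { apiClientV2 } from '@/services/apiV2'
import type { ExportRule } from '@/types/apiV2'

const draft: Omit<ExportRule, 'id' | 'created_at'> = {
  rule_name: 'High-confidence text only',
  config_json: { confidence_threshold: 0.9 },
}

const created = await apiClientV2.createExportRule(draft)
await apiClientV2.updateExportRule(created.id, { rule_name: 'High-confidence text' })
const rules = await apiClientV2.getExportRules() // ExportRule[]
await apiClientV2.deleteExportRule(created.id)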
types/api.ts (removed)
@@ -1,182 +0,0 @@
/**
 * API Type Definitions
 * Based on backend OpenAPI specification
 */

// Authentication
export interface LoginRequest {
  username: string
  password: string
}

export interface LoginResponse {
  access_token: string
  token_type: string
  expires_in: number // Token expiration time in seconds
}

export interface User {
  id: number
  username: string
  email?: string
  displayName?: string | null
}

// File Upload (V2 API)
export interface UploadResponse {
  task_id: string
  filename: string
  file_size: number
  file_type: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
}

export interface FileInfo {
  id: number
  filename: string
  file_size: number
  file_format: string // Changed from 'format' to match backend
  status: 'pending' | 'processing' | 'completed' | 'failed'
}

// OCR Processing
export interface ProcessRequest {
  batch_id: number
  lang?: string
  detect_layout?: boolean // Changed from confidence_threshold to match backend
}

export interface ProcessResponse {
  message: string // Added to match backend
  batch_id: number
  total_files: number // Added to match backend
  status: string
  // Removed task_id - backend uses batch-level tracking instead
}

export interface TaskStatus {
  task_id: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
  progress_percentage: number
  current_file?: string
  files_processed: number
  total_files: number
  error?: string
}

export interface BatchStatus {
  batch: {
    id: number
    status: 'pending' | 'processing' | 'completed' | 'failed'
    progress_percentage: number
    created_at: string
    completed_at?: string
  }
  files: FileResult[]
}

export interface FileResult {
  id: number
  filename: string
  status: 'pending' | 'processing' | 'completed' | 'failed'
  processing_time?: number
  error?: string
}

// OCR Results
export interface OCRResult {
  file_id: number
  filename: string
  status: string
  markdown_content: string
  json_data: OCRJsonData
  confidence: number
  processing_time: number
}

export interface OCRJsonData {
  total_text_regions: number
  average_confidence: number
  text_blocks: TextBlock[]
  layout_info?: LayoutInfo
}

export interface TextBlock {
  text: string
  confidence: number
  bbox: [number, number, number, number]
  position: number
}

export interface LayoutInfo {
  tables_detected: number
  images_detected: number
  structure: string
}

// Export
export interface ExportRequest {
  batch_id: number
  format: 'txt' | 'json' | 'excel' | 'markdown' | 'pdf'
  rule_id?: number
  options?: ExportOptions
}

export interface ExportOptions {
  confidence_threshold?: number
  include_metadata?: boolean
  filename_pattern?: string
  css_template?: string
}

export interface ExportRule {
  id: number
  rule_name: string
  config_json: Record<string, any>
  css_template?: string
  created_at: string
}

export interface CSSTemplate {
  name: string
  description: string
  // filename is not returned by backend - use name as identifier
}

// Translation (FUTURE FEATURE)
export interface TranslateRequest {
  file_id: number
  source_lang: string
  target_lang: string
  engine_type?: 'argos' | 'ernie' | 'google'
}

export interface TranslateResponse {
  task_id: string
  file_id: number
  status: 'pending' | 'processing' | 'completed' | 'failed'
  translated_content?: string
}

export interface TranslationConfig {
  id: number
  source_lang: string
  target_lang: string
  engine_type: 'argos' | 'ernie' | 'google'
  engine_config: Record<string, any>
  created_at: string
}

// API Response
export interface ApiResponse<T = any> {
  success: boolean
  data?: T
  error?: string
  message?: string
}

// Error Response
export interface ApiError {
  detail: string
  status_code: number
}
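Per the commit message, these legacy types were migrated to types/apiV2.ts rather than dropped. As a closing illustration, a hedged sketch of narrowing an Axios failure to the documented ApiError shape; the helper name is invented, and the '@/types/apiV2' import assumes ApiError was carried over in the migration:

// Hypothetical helper: recover the documented ApiError shape from an unknown failure
import type { AxiosError } from 'axios'
import type { ApiError } from '@/types/apiV2'

function toApiError(err: unknown): ApiError {
  const axiosErr = err as AxiosError<ApiError>
  const data = axiosErr.response?.data
  if (data && typeof data.detail === 'string') {
    return data
  }
  // Fallback when the backend did not return a structured error body
  return { detail: 'Unknown error', status_code: axiosErr.response?.status ?? 0 }
}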