改用API驗證

This commit is contained in:
beabigegg
2025-10-02 17:13:24 +08:00
parent 0a89c19fc9
commit adecdf0cce
48 changed files with 6136 additions and 1239 deletions

View File

@@ -23,29 +23,51 @@ class DifyClient:
"""Dify API 客戶端"""
def __init__(self):
self.base_url = current_app.config.get('DIFY_API_BASE_URL', '')
self.api_key = current_app.config.get('DIFY_API_KEY', '')
# 翻译API配置
self.translation_base_url = current_app.config.get('DIFY_TRANSLATION_BASE_URL', '')
self.translation_api_key = current_app.config.get('DIFY_TRANSLATION_API_KEY', '')
# OCR API配置
self.ocr_base_url = current_app.config.get('DIFY_OCR_BASE_URL', '')
self.ocr_api_key = current_app.config.get('DIFY_OCR_API_KEY', '')
self.timeout = (10, 60) # (連接超時, 讀取超時)
self.max_retries = 3
self.retry_delay = 1.6 # 指數退避基數
if not self.base_url or not self.api_key:
logger.warning("Dify API configuration is incomplete")
if not self.translation_base_url or not self.translation_api_key:
logger.warning("Dify Translation API configuration is incomplete")
if not self.ocr_base_url or not self.ocr_api_key:
logger.warning("Dify OCR API configuration is incomplete")
def _make_request(self, method: str, endpoint: str, data: Dict[str, Any] = None,
user_id: int = None, job_id: int = None) -> Dict[str, Any]:
def _make_request(self, method: str, endpoint: str, data: Dict[str, Any] = None,
user_id: int = None, job_id: int = None, files_data: Dict = None,
api_type: str = 'translation') -> Dict[str, Any]:
"""發送 HTTP 請求到 Dify API"""
if not self.base_url or not self.api_key:
raise APIError("Dify API 未配置完整")
url = f"{self.base_url.rstrip('/')}/{endpoint.lstrip('/')}"
# 根据API类型选择配置
if api_type == 'ocr':
base_url = self.ocr_base_url
api_key = self.ocr_api_key
if not base_url or not api_key:
raise APIError("Dify OCR API 未配置完整")
else: # translation
base_url = self.translation_base_url
api_key = self.translation_api_key
if not base_url or not api_key:
raise APIError("Dify Translation API 未配置完整")
url = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}"
headers = {
'Authorization': f'Bearer {self.api_key}',
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}',
'User-Agent': 'PANJIT-Document-Translator/1.0'
}
# 只有在非文件上传时才设置JSON Content-Type
if not files_data:
headers['Content-Type'] = 'application/json'
# 重試邏輯
last_exception = None
@@ -53,11 +75,15 @@ class DifyClient:
for attempt in range(self.max_retries):
try:
logger.debug(f"Making Dify API request: {method} {url} (attempt {attempt + 1})")
# logger.debug(f"Making Dify API request: {method} {url} (attempt {attempt + 1})")
if method.upper() == 'GET':
response = requests.get(url, headers=headers, timeout=self.timeout, params=data)
elif files_data:
# 文件上传请求使用multipart/form-data
response = requests.post(url, headers=headers, timeout=self.timeout, files=files_data, data=data)
else:
# 普通JSON请求
response = requests.post(url, headers=headers, timeout=self.timeout, json=data)
# 計算響應時間
@@ -80,7 +106,7 @@ class DifyClient:
success=True
)
logger.debug(f"Dify API request successful: {response_time_ms}ms")
# logger.debug(f"Dify API request successful: {response_time_ms}ms")
return result
except requests.exceptions.RequestException as e:
@@ -107,7 +133,7 @@ class DifyClient:
# 指數退避
delay = self.retry_delay ** attempt
logger.debug(f"Retrying in {delay} seconds...")
# logger.debug(f"Retrying in {delay} seconds...")
time.sleep(delay)
# 所有重試都失敗了
@@ -137,7 +163,7 @@ class DifyClient:
logger.warning(f"Failed to record API usage: {str(e)}")
def translate_text(self, text: str, source_language: str, target_language: str,
user_id: int = None, job_id: int = None) -> Dict[str, Any]:
user_id: int = None, job_id: int = None, conversation_id: str = None) -> Dict[str, Any]:
"""翻譯文字"""
if not text.strip():
@@ -181,7 +207,15 @@ Rules:
'user': f"user_{user_id}" if user_id else "doc-translator-user",
'query': query
}
# 如果有 conversation_id加入請求中以維持對話連續性
if conversation_id:
request_data['conversation_id'] = conversation_id
logger.info(f"[TRANSLATION] Sending translation request...")
logger.info(f"[TRANSLATION] Request data: {request_data}")
logger.info(f"[TRANSLATION] Text length: {len(text)} characters")
try:
response = self._make_request(
method='POST',
@@ -203,6 +237,7 @@ Rules:
'source_text': text,
'source_language': source_language,
'target_language': target_language,
'conversation_id': response.get('conversation_id'),
'metadata': response.get('metadata', {})
}
@@ -271,18 +306,165 @@ Rules:
with open(config_file, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line.startswith('base_url:'):
if line.startswith('#') or not line:
continue # 跳过注释和空行
# 翻译API配置兼容旧格式
if line.startswith('base_url:') or line.startswith('translation_base_url:'):
base_url = line.split(':', 1)[1].strip()
current_app.config['DIFY_TRANSLATION_BASE_URL'] = base_url
# 兼容旧配置
current_app.config['DIFY_API_BASE_URL'] = base_url
elif line.startswith('api:'):
elif line.startswith('api:') or line.startswith('translation_api:'):
api_key = line.split(':', 1)[1].strip()
current_app.config['DIFY_TRANSLATION_API_KEY'] = api_key
# 兼容旧配置
current_app.config['DIFY_API_KEY'] = api_key
# OCR API配置
elif line.startswith('ocr_base_url:'):
ocr_base_url = line.split(':', 1)[1].strip()
current_app.config['DIFY_OCR_BASE_URL'] = ocr_base_url
elif line.startswith('ocr_api:'):
ocr_api_key = line.split(':', 1)[1].strip()
current_app.config['DIFY_OCR_API_KEY'] = ocr_api_key
logger.info("Dify API config loaded from file")
except Exception as e:
logger.error(f"Failed to load Dify config from file: {str(e)}")
def upload_file(self, image_data: bytes, filename: str, user_id: int = None) -> str:
    """Upload an image to the Dify OCR API and return the file id from the response.

    Args:
        image_data: Raw image bytes; must be non-empty.
        filename: Name sent in the multipart form. Its extension is also used
            to choose the declared MIME type.
        user_id: Optional user id; used to build the ``user`` form field and
            forwarded to ``_make_request``.

    Returns:
        The value of the ``id`` field in the upload response.

    Raises:
        APIError: If ``image_data`` is empty, the response has no ``id``, or
            the upload fails for any other reason.
    """
    import mimetypes

    if not image_data:
        raise APIError("图片数据不能为空")

    logger.info(f"[OCR-UPLOAD] Starting file upload to Dify OCR API")
    logger.info(f"[OCR-UPLOAD] File: {filename}, Size: {len(image_data)} bytes, User: {user_id}")

    # Declare the MIME type that matches the file name instead of always
    # claiming PNG; unknown or non-image names keep the previous PNG default.
    guessed_type, _ = mimetypes.guess_type(filename or "")
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = 'image/png'

    # Multipart payload: the file part plus the plain form fields.
    files_data = {
        'file': (filename, image_data, mime_type)
    }
    form_data = {
        'user': f"user_{user_id}" if user_id else "doc-translator-user"
    }

    try:
        response = self._make_request(
            method='POST',
            endpoint='/files/upload',
            data=form_data,
            files_data=files_data,
            user_id=user_id,
            api_type='ocr'  # route the call to the OCR endpoint and key
        )
        logger.info(f"[OCR-UPLOAD] Raw Dify upload response: {response}")

        file_id = response.get('id')
        if not file_id:
            logger.error(f"[OCR-UPLOAD] No file ID in response: {response}")
            raise APIError("Dify 文件上传失败未返回文件ID")

        logger.info(f"[OCR-UPLOAD] ✓ File uploaded successfully: {file_id}")
        return file_id
    except APIError:
        # Already a domain error; pass it through unchanged.
        raise
    except Exception as e:
        error_msg = f"文件上传到Dify失败: {str(e)}"
        logger.error(f"[OCR-UPLOAD] ✗ Upload failed: {error_msg}")
        raise APIError(error_msg) from e
def ocr_image_with_dify(self, image_data: bytes, filename: str = "image.png",
                        user_id: int = None, job_id: int = None) -> str:
    """Run OCR on an image through the Dify OCR chat flow.

    The image is first uploaded with ``upload_file``; the returned file id is
    then attached to a blocking ``/chat-messages`` request.

    Args:
        image_data: Raw image bytes; must be non-empty.
        filename: Name used for the upload.
        user_id: Optional user id, used for the Dify ``user`` field and
            forwarded to ``_make_request``.
        job_id: Optional job id forwarded to ``_make_request``.

    Returns:
        The ``answer`` field of the response with surrounding whitespace removed.

    Raises:
        APIError: If ``image_data`` is empty, the answer is missing/blank/not
            a string, or any step of the process fails.
    """
    # Validate before touching len(): a None payload used to surface as a
    # TypeError from the log line below instead of an APIError.
    if not image_data:
        raise APIError("图片数据不能为空")

    logger.info(f"[OCR-RECOGNITION] Starting OCR process for {filename}")
    logger.info(f"[OCR-RECOGNITION] Image size: {len(image_data)} bytes, User: {user_id}, Job: {job_id}")

    try:
        # Step 1: upload the image to obtain a file id.
        logger.info(f"[OCR-RECOGNITION] Step 1: Uploading image to Dify...")
        file_id = self.upload_file(image_data, filename, user_id)
        logger.info(f"[OCR-RECOGNITION] Step 1 ✓ File uploaded with ID: {file_id}")

        # Step 2: build the user query sent with the image.
        # NOTE(review): the original comment says the system prompt is configured
        # in the Dify chat flow itself — not verifiable from this file.
        query = "將圖片中的文字完整的提取出來"
        logger.info(f"[OCR-RECOGNITION] Step 2: Preparing OCR request...")

        # Step 3: chat-flow request; the uploaded image is referenced from
        # the ``files`` array.
        request_data = {
            'inputs': {},
            'response_mode': 'blocking',
            'user': f"user_{user_id}" if user_id else "doc-translator-user",
            'query': query,
            'files': [
                {
                    'type': 'image',
                    'transfer_method': 'local_file',
                    'upload_file_id': file_id
                }
            ]
        }

        logger.info(f"[OCR-RECOGNITION] Step 3: Sending OCR request to Dify...")
        logger.info(f"[OCR-RECOGNITION] Request data: {request_data}")
        logger.info(f"[OCR-RECOGNITION] Using OCR API: {self.ocr_base_url}")

        response = self._make_request(
            method='POST',
            endpoint='/chat-messages',
            data=request_data,
            user_id=user_id,
            job_id=job_id,
            api_type='ocr'  # route the call to the OCR endpoint and key
        )

        logger.info(f"[OCR-RECOGNITION] Step 3 ✓ Received response from Dify")
        logger.info(f"[OCR-RECOGNITION] Raw Dify OCR response: {response}")

        # Pull the recognised text and bookkeeping fields out of the response.
        answer = response.get('answer', '')
        metadata = response.get('metadata', {})
        conversation_id = response.get('conversation_id', '')

        logger.info(f"[OCR-RECOGNITION] Response details:")
        logger.info(f"[OCR-RECOGNITION] - Answer length: {len(answer) if answer else 0} characters")
        logger.info(f"[OCR-RECOGNITION] - Conversation ID: {conversation_id}")
        logger.info(f"[OCR-RECOGNITION] - Metadata: {metadata}")

        if not isinstance(answer, str) or not answer.strip():
            logger.error(f"[OCR-RECOGNITION] ✗ Empty or invalid answer from Dify")
            logger.error(f"[OCR-RECOGNITION] Answer type: {type(answer)}, Content: '{answer}'")
            raise APIError("Dify OCR 返回空的识别结果")

        logger.info(f"[OCR-RECOGNITION] ✓ OCR completed successfully")
        logger.info(f"[OCR-RECOGNITION] Extracted {len(answer)} characters")

        return answer.strip()
    except APIError:
        # Already a domain error; pass it through unchanged.
        raise
    except Exception as e:
        error_msg = f"Dify OCR识别失败: {str(e)}"
        logger.error(f"[OCR-RECOGNITION] ✗ OCR process failed: {error_msg}")
        logger.error(f"[OCR-RECOGNITION] Exception details: {type(e).__name__}: {str(e)}")
        raise APIError(error_msg) from e
def init_dify_config(app):
"""初始化 Dify 配置"""
@@ -291,12 +473,22 @@ def init_dify_config(app):
DifyClient.load_config_from_file()
# 檢查配置完整性
base_url = app.config.get('DIFY_API_BASE_URL')
api_key = app.config.get('DIFY_API_KEY')
if base_url and api_key:
logger.info("Dify API configuration loaded successfully")
translation_base_url = app.config.get('DIFY_TRANSLATION_BASE_URL')
translation_api_key = app.config.get('DIFY_TRANSLATION_API_KEY')
ocr_base_url = app.config.get('DIFY_OCR_BASE_URL')
ocr_api_key = app.config.get('DIFY_OCR_API_KEY')
logger.info("Dify API Configuration Status:")
if translation_base_url and translation_api_key:
logger.info("✓ Translation API configured successfully")
else:
logger.warning("Dify API configuration is incomplete")
logger.warning(f"Base URL: {'' if base_url else ''}")
logger.warning(f"API Key: {'' if api_key else ''}")
logger.warning("✗ Translation API configuration is incomplete")
logger.warning(f" - Translation Base URL: {'' if translation_base_url else ''}")
logger.warning(f" - Translation API Key: {'' if translation_api_key else ''}")
if ocr_base_url and ocr_api_key:
logger.info("✓ OCR API configured successfully")
else:
logger.warning("✗ OCR API configuration is incomplete (扫描PDF功能将不可用)")
logger.warning(f" - OCR Base URL: {'' if ocr_base_url else ''}")
logger.warning(f" - OCR API Key: {'' if ocr_api_key else ''}")

View File

@@ -0,0 +1,700 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
增强的PDF解析器 - 支持扫描PDF的OCR处理
Author: PANJIT IT Team
Created: 2024-09-23
Modified: 2024-09-23
"""
import io
from pathlib import Path
from typing import List, Optional
from PyPDF2 import PdfReader
from app.utils.logger import get_logger
from app.utils.exceptions import FileProcessingError
from app.services.dify_client import DifyClient
from app.services.ocr_cache import OCRCache
from app.utils.image_preprocessor import ImagePreprocessor
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Probe for the optional PyMuPDF dependency; _HAS_PYMUPDF records whether
# `import fitz` succeeded so scanned-PDF handling can be gated on it.
try:
    import fitz  # PyMuPDF
    _HAS_PYMUPDF = True
except ImportError:
    _HAS_PYMUPDF = False
    logger.warning("PyMuPDF not available. Scanned PDF processing will be disabled.")
class EnhancedPdfParser:
"""支持扫描PDF的增强解析器"""
def __init__(self, file_path: str):
    """Create a parser for the PDF at ``file_path``.

    Args:
        file_path: Path of the PDF to parse.

    Raises:
        FileProcessingError: If ``file_path`` does not exist.
    """
    self.file_path = Path(file_path)
    # Fail fast: check the path before building the Dify client, the OCR
    # cache and the image preprocessor, so a bad path has no side effects.
    if not self.file_path.exists():
        raise FileProcessingError(f"PDF文件不存在: {file_path}")
    self.dify_client = DifyClient()
    self.ocr_cache = OCRCache()
    self.image_preprocessor = ImagePreprocessor(use_opencv=True)
def is_scanned_pdf(self) -> bool:
    """Guess whether the PDF is a scanned document.

    Text is extracted from at most the first three pages; fewer than 100
    characters (after stripping) counts as scanned.

    Returns:
        True when the PDF looks scanned, or when the analysis itself fails;
        False when enough extractable text was found.
    """
    try:
        pdf = PdfReader(str(self.file_path))
        sample_size = min(3, len(pdf.pages))
        # Concatenate the extracted text of the sampled pages.
        sampled_text = "".join(
            pdf.pages[idx].extract_text() for idx in range(sample_size)
        )
        char_count = len(sampled_text.strip())
        logger.info(f"PDF text extraction found {char_count} characters in first {sample_size} pages")

        # Threshold: under 100 characters is treated as a scan.
        if char_count < 100:
            logger.info("PDF detected as scanned document, will use OCR processing")
            return True

        logger.info("PDF detected as text-based document, will use direct text extraction")
        return False
    except Exception as e:
        logger.warning(f"Failed to analyze PDF type: {e}, treating as scanned document")
        # Default to the OCR path when the type cannot be determined.
        return True
def extract_text_segments(self, user_id: int = None, job_id: int = None) -> List[str]:
    """Extract text segments, choosing direct extraction or OCR automatically.

    Args:
        user_id: Optional user id forwarded to the OCR path.
        job_id: Optional job id forwarded to the OCR path.

    Returns:
        The list of text segments produced by the chosen extraction path.

    Raises:
        FileProcessingError: If extraction fails, or if the PDF looks scanned
            and PyMuPDF is not installed.
    """
    try:
        # Text-based PDFs are read directly.
        if not self.is_scanned_pdf():
            return self._extract_from_text_pdf()

        # Scanned PDFs are rendered to images and sent to Dify OCR, which
        # requires PyMuPDF for the rendering step.
        if not _HAS_PYMUPDF:
            raise FileProcessingError("处理扫描PDF需要PyMuPDF库请安装: pip install PyMuPDF")
        return self._extract_from_scanned_pdf(user_id, job_id)
    except FileProcessingError as e:
        # Already a domain error with a specific message: log it and pass
        # it through instead of nesting it in another wrapper message.
        logger.error(f"PDF文字提取失败: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"PDF文字提取失败: {str(e)}")
        raise FileProcessingError(f"PDF文件解析失败: {str(e)}") from e
def _extract_from_text_pdf(self) -> List[str]:
    """Extract text segments from a text-based PDF.

    Each page's text is split into sentences; sentences of 10 characters or
    fewer (after stripping) are dropped, and the remainder is passed through
    ``_merge_short_segments``.

    Returns:
        The merged list of text segments.

    Raises:
        FileProcessingError: If reading or splitting fails.
    """
    try:
        pdf = PdfReader(str(self.file_path))
        collected = []
        page_no = 0
        for page in pdf.pages:
            page_no += 1
            raw_text = page.extract_text()
            if not raw_text.strip():
                continue
            # Split into sentences, then keep only the ones long enough.
            kept = [
                piece
                for piece in self._split_text_into_sentences(raw_text)
                if len(piece.strip()) > 10
            ]
            collected += kept
            logger.debug(f"Page {page_no}: extracted {len(kept)} sentences")

        logger.info(f"Text PDF extraction completed: {len(collected)} segments")
        # Merge short segments to reduce the number of translation calls.
        return self._merge_short_segments(collected)
    except Exception as e:
        logger.error(f"Text PDF extraction failed: {str(e)}")
        raise FileProcessingError(f"文字PDF提取失败: {str(e)}")
def _extract_from_scanned_pdf(self, user_id: int = None, job_id: int = None) -> List[str]:
    """Extract text segments from a scanned PDF by OCR-ing each page via Dify.

    Every page is rendered to PNG at 2x zoom, passed through the image
    preprocessor and looked up in the OCR cache. On a cache miss the image is
    sent to ``DifyClient.ocr_image_with_dify`` and a non-blank result is saved
    back to the cache. A page that fails is logged and skipped so the
    remaining pages are still processed.

    Args:
        user_id: Optional user id forwarded to the OCR call and cache metadata.
        job_id: Optional job id forwarded to the OCR call and cache metadata.

    Returns:
        The segments from all pages after ``_merge_short_segments``.

    Raises:
        FileProcessingError: If no page yields any text, or on any failure
            outside the per-page handling.
    """
    try:
        doc = fitz.open(str(self.file_path))
        text_segments = []
        try:
            total_pages = doc.page_count
            logger.info(f"Processing scanned PDF with {total_pages} pages using Dify OCR")

            for page_num in range(total_pages):
                try:
                    logger.info(f"[PDF-OCR] Processing page {page_num + 1}/{total_pages}")
                    page = doc[page_num]

                    # Render at 2x zoom to improve OCR accuracy.
                    zoom = 2.0
                    mat = fitz.Matrix(zoom, zoom)
                    pix = page.get_pixmap(matrix=mat, alpha=False)

                    # Convert to PNG bytes, then preprocess the image.
                    img_data_raw = pix.tobytes("png")
                    img_data = self.image_preprocessor.preprocess_smart(img_data_raw)
                    logger.debug(f"[PDF-OCR] Page {page_num + 1}: Image preprocessed ({len(img_data_raw)} -> {len(img_data)} bytes)")
                    filename = f"page_{page_num + 1}.png"
                    logger.info(f"[PDF-OCR] Page {page_num + 1}: Converted to image ({len(img_data)} bytes)")
                    logger.debug(f"[PDF-OCR] Page {page_num + 1}: Image zoom={zoom}, format=PNG")

                    # Look the page up in the OCR cache first.
                    cache_key_info = f"{self.file_path.name}_page_{page_num + 1}_zoom_{zoom}"
                    cached_text = self.ocr_cache.get_cached_text(
                        file_data=img_data,
                        filename=filename,
                        additional_info=cache_key_info
                    )

                    if cached_text:
                        logger.info(f"[PDF-OCR] Page {page_num + 1}: ✓ 使用快取的OCR結果 (節省AI流量)")
                        ocr_text = cached_text
                    else:
                        # Cache miss: ask Dify to recognise the text.
                        logger.info(f"[PDF-OCR] Page {page_num + 1}: Starting OCR recognition...")
                        ocr_text = self.dify_client.ocr_image_with_dify(
                            image_data=img_data,
                            filename=filename,
                            user_id=user_id,
                            job_id=job_id
                        )
                        # Store non-blank results for later runs.
                        if ocr_text.strip():
                            self.ocr_cache.save_cached_text(
                                file_data=img_data,
                                extracted_text=ocr_text,
                                filename=filename,
                                additional_info=cache_key_info,
                                metadata={
                                    'source_file': str(self.file_path),
                                    'page_number': page_num + 1,
                                    'total_pages': total_pages,
                                    'zoom_level': zoom,
                                    'image_size_bytes': len(img_data),
                                    'user_id': user_id,
                                    'job_id': job_id
                                }
                            )
                            logger.info(f"[PDF-OCR] Page {page_num + 1}: ✓ OCR結果已保存到快取")

                    logger.info(f"[PDF-OCR] Page {page_num + 1}: OCR completed")
                    logger.debug(f"[PDF-OCR] Page {page_num + 1}: Raw OCR result length: {len(ocr_text)}")

                    if ocr_text.strip():
                        # Split the OCR output and keep segments longer than 5 chars.
                        logger.debug(f"[PDF-OCR] Page {page_num + 1}: Splitting OCR text into sentences...")
                        sentences = self._split_ocr_text(ocr_text)
                        valid_sentences = [s for s in sentences if len(s.strip()) > 5]
                        text_segments.extend(valid_sentences)

                        logger.info(f"[PDF-OCR] Page {page_num + 1}: ✓ Extracted {len(valid_sentences)} valid sentences")
                        logger.debug(f"[PDF-OCR] Page {page_num + 1}: Total sentences before filter: {len(sentences)}")

                        # Log a 50-character preview of the first sentence.
                        if valid_sentences:
                            preview = valid_sentences[0][:50] + "..." if len(valid_sentences[0]) > 50 else valid_sentences[0]
                            logger.debug(f"[PDF-OCR] Page {page_num + 1}: First sentence preview: {preview}")
                    else:
                        logger.warning(f"[PDF-OCR] Page {page_num + 1}: ⚠ OCR returned empty result")
                except Exception as e:
                    logger.error(f"[PDF-OCR] Page {page_num + 1}: ✗ Processing failed: {str(e)}")
                    logger.error(f"[PDF-OCR] Page {page_num + 1}: Exception type: {type(e).__name__}")
                    # Keep going: one bad page must not abort the whole document.
                    continue
        finally:
            # Release the PyMuPDF handle even if something above raised.
            doc.close()

        logger.info(f"[PDF-OCR] OCR processing completed for all {total_pages} pages")
        logger.info(f"[PDF-OCR] Total text segments extracted: {len(text_segments)}")

        if not text_segments:
            logger.error(f"[PDF-OCR] ✗ No text content extracted from any page")
            raise FileProcessingError("OCR处理完成但未提取到任何文字内容")

        logger.info(f"[PDF-OCR] ✓ Scanned PDF processing completed successfully")
        logger.info(f"[PDF-OCR] Final result: {len(text_segments)} text segments extracted")

        # Merge short segments to reduce the number of translation calls.
        merged_segments = self._merge_short_segments(text_segments)
        logger.info(f"[PDF-OCR] After merging: {len(merged_segments)} segments ready for translation")
        return merged_segments
    except FileProcessingError:
        # Already a domain error with its own message; do not wrap it again.
        raise
    except Exception as e:
        logger.error(f"Scanned PDF processing failed: {str(e)}")
        raise FileProcessingError(f"扫描PDF处理失败: {str(e)}") from e
def _split_text_into_sentences(self, text: str) -> List[str]:
    """Split ``text`` into sentence-like fragments.

    The text is split successively on each separator. Every fragment that
    was followed by a separator keeps that separator (right-stripped) as its
    suffix. Fragments of 3 characters or fewer (after stripping) are dropped.

    Args:
        text: Text to split.

    Returns:
        The fragments in document order; empty when ``text`` is blank.
    """
    if not text.strip():
        return []

    # NOTE(review): three entries of this list were empty strings in the
    # reviewed source, which makes str.split raise ValueError ("empty
    # separator"). They are assumed to have been the full-width CJK
    # sentence terminators below — confirm against the repository.
    separators = ['. ', '。', '!', '?', '!', '?', '\n\n']

    fragments = [text]
    for sep in separators:
        suffix = sep.rstrip()
        next_fragments = []
        for fragment in fragments:
            if sep not in fragment:
                # Nothing to split: carry the fragment over untouched.
                next_fragments.append(fragment)
                continue
            *heads, tail = fragment.split(sep)
            # Re-attach the separator to every piece that preceded one.
            next_fragments.extend(
                head.strip() + suffix for head in heads if head.strip()
            )
            if tail.strip():
                next_fragments.append(tail.strip())
        fragments = next_fragments

    # Drop fragments that are too short to be useful.
    return [fragment for fragment in fragments if len(fragment.strip()) > 3]
def _split_ocr_text(self, ocr_text: str) -> List[str]:
    """Split OCR output into paragraphs and table rows.

    Consecutive plain lines are joined with a space into one paragraph. A
    blank line or a table line (one containing ``|`` or a tab) ends the
    current paragraph. Table lines are emitted on their own. Paragraphs and
    table lines of 10 characters or fewer are discarded.

    Args:
        ocr_text: Raw text returned by OCR.

    Returns:
        The paragraphs and table lines in document order.
    """
    if not ocr_text.strip():
        return []

    blocks = []
    pending = []  # plain lines of the paragraph currently being built

    def flush_paragraph():
        # Emit the accumulated paragraph, if it is long enough, and reset.
        if pending:
            joined = ' '.join(pending)
            if len(joined) > 10:
                blocks.append(joined)
            pending.clear()

    for raw_line in ocr_text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: paragraph boundary.
            flush_paragraph()
        elif '|' in stripped or '\t' in stripped:
            # Table line: close the paragraph, then keep the row by itself.
            flush_paragraph()
            if len(stripped) > 10:
                blocks.append(stripped)
        else:
            pending.append(stripped)

    # Whatever is still pending forms the last paragraph.
    flush_paragraph()
    return blocks
def generate_translated_document(self, translations: dict, target_language: str,
                                 output_dir: Path) -> str:
    """Write the translations for one language to a Word document.

    The document gets a title, an info paragraph (source file name,
    processing mode, language, segment count) and then one entry per
    translated segment, rendered according to ``_detect_content_type``.

    Args:
        translations: Mapping of language to list of translated segments.
        target_language: Language whose segments are written.
        output_dir: Directory for the output file (``Path`` or ``str``).

    Returns:
        The path of the saved ``.docx`` file as a string.

    Raises:
        FileProcessingError: If building or saving the document fails.
    """
    try:
        from docx import Document
        from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

        translated_texts = translations.get(target_language, [])

        # Output is a Word document, not a plain text file.
        output_filename = f"{self.file_path.stem}_{target_language}_translated.docx"
        # Path() makes a plain string directory work with the / operator.
        output_path = Path(output_dir) / output_filename

        doc = Document()

        # Title.
        title = doc.add_heading(f"PDF翻译结果 - {target_language}", 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        # Document info block.
        info_para = doc.add_paragraph()
        info_para.add_run("原始文件: ").bold = True
        info_para.add_run(self.file_path.name)
        info_para.add_run("\n处理方式: ").bold = True
        info_para.add_run("OCR识别" if self.is_scanned_pdf() else "直接文字提取")
        info_para.add_run(f"\n翻译语言: ").bold = True
        info_para.add_run(target_language)
        info_para.add_run(f"\n总段落数: ").bold = True
        info_para.add_run(str(len(translated_texts)))
        doc.add_paragraph()  # blank line

        # One writer per detected content type; anything else is a paragraph.
        writers = {
            'table': self._add_table_content,
            'heading': self._add_heading_content,
            'list': self._add_list_content,
        }
        for i, text in enumerate(translated_texts, 1):
            writer = writers.get(self._detect_content_type(text), self._add_paragraph_content)
            writer(doc, text, i)

        doc.save(output_path)
        logger.info(f"Generated translated PDF Word document: {output_path}")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to generate translated Word document: {str(e)}")
        raise FileProcessingError(f"生成翻译Word文档失败: {str(e)}") from e
def generate_combined_translated_document(self, all_translations: dict, target_languages: list,
                                          output_dir: Path) -> str:
    """Write one Word document holding every target language side by side.

    Each numbered paragraph contains the translation of one segment in every
    language that has it, each prefixed with ``[language]``.

    Args:
        all_translations: Mapping of language to list of translated segments.
        target_languages: Languages to include, in output order; must not
            be empty.
        output_dir: Directory for the output file (``Path`` or ``str``).

    Returns:
        The path of the saved ``.docx`` file as a string.

    Raises:
        FileProcessingError: If ``target_languages`` is empty or building or
            saving the document fails.
    """
    try:
        from docx import Document
        from docx.shared import Pt
        from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

        if not target_languages:
            # Explicit message instead of an opaque "list index out of range".
            raise ValueError("target_languages must not be empty")

        # Combined file name carries every language code.
        languages_suffix = '_'.join(target_languages)
        output_filename = f"{self.file_path.stem}_{languages_suffix}_combined.docx"
        output_path = Path(output_dir) / output_filename

        doc = Document()

        # Title.
        title = doc.add_heading(f"PDF翻译結果 - 多語言組合文檔", 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        # Document info block.
        info_para = doc.add_paragraph()
        info_para.add_run("原始文件: ").bold = True
        info_para.add_run(self.file_path.name)
        info_para.add_run("\n处理方式: ").bold = True
        info_para.add_run("OCR识别" if self.is_scanned_pdf() else "直接文字提取")
        info_para.add_run(f"\n翻译语言: ").bold = True
        info_para.add_run(' / '.join(target_languages))

        # Use the longest list so no language's trailing segments are
        # dropped; identical to the first-language count when all match.
        per_language = [all_translations.get(lang, []) for lang in target_languages]
        segment_count = max(len(texts) for texts in per_language)
        info_para.add_run(f"\n总段落数: ").bold = True
        info_para.add_run(str(segment_count))
        doc.add_paragraph()  # blank line

        for i in range(segment_count):
            content_para = doc.add_paragraph()

            # Segment number.
            num_run = content_para.add_run(f"{i+1:03d}. ")
            num_run.bold = True
            num_run.font.size = Pt(12)

            wrote_translation = False
            for target_language, texts in zip(target_languages, per_language):
                if i >= len(texts):
                    continue
                # Separate translations, but only after one was written.
                if wrote_translation:
                    content_para.add_run("\n\n")

                lang_run = content_para.add_run(f"[{target_language}] ")
                lang_run.bold = True
                lang_run.font.size = Pt(11)

                trans_run = content_para.add_run(texts[i])
                trans_run.font.size = Pt(11)
                wrote_translation = True

            # Spacing after each segment.
            content_para.paragraph_format.space_after = Pt(12)

        doc.save(output_path)
        logger.info(f"Generated combined translated PDF Word document: {output_path}")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to generate combined translated Word document: {str(e)}")
        raise FileProcessingError(f"生成組合翻译Word文档失败: {str(e)}") from e
def _is_table_component(self, segment: str) -> bool:
    """Tell whether ``segment`` is table scaffolding rather than data.

    A segment qualifies when it is a Markdown-style separator row (contains
    ``|`` plus ``-`` or ``=`` and at most two characters other than
    ``| - = :`` and spaces), or when it consists only of ``=``, ``-`` and
    spaces.

    Args:
        segment: Text segment to inspect.

    Returns:
        True for separator rows and pure rule lines, False otherwise.
    """
    stripped = segment.strip()

    # Markdown separator such as |---|---| or |===|===|.
    has_rule_char = any(ch in stripped for ch in '-=')
    if '|' in stripped and has_rule_char:
        leftover = stripped.translate(str.maketrans('', '', '|-= :'))
        # Tolerate a couple of stray characters.
        if len(leftover) <= 2:
            return True

    # Pure rule line: nothing remains once =, - and spaces are removed.
    return not stripped.translate(str.maketrans('', '', '=- '))
def _is_table_row(self, segment: str) -> bool:
    """Tell whether ``segment`` is a table row that carries real content.

    The segment must contain at least two ``|`` characters, and at least one
    cell must have something other than ``-``, ``=``, ``:`` and spaces.

    Args:
        segment: Text segment to inspect.

    Returns:
        True for a data row, False otherwise.
    """
    row = segment.strip()
    if row.count('|') < 2:
        return False

    filler = str.maketrans('', '', '-= :')
    # Drop the outer pipes, then look for a cell with actual text.
    for cell in row.strip('|').split('|'):
        if cell.strip() and cell.translate(filler):
            return True
    return False
def _merge_table_segments(self, segments: List[str], start_idx: int) -> tuple[str, int]:
    """Collect the run of consecutive table segments starting at ``start_idx``.

    Args:
        segments: All text segments.
        start_idx: Index at which to start collecting.

    Returns:
        ``(merged_table_content, next_index)``: the stripped table segments
        joined with newlines, and the index of the first segment that is not
        part of the table.
    """
    total = len(segments)
    cursor = start_idx
    rows = []

    # Walk forward until a segment is neither scaffolding nor a data row.
    while cursor < total:
        candidate = segments[cursor].strip()
        if not (self._is_table_component(candidate) or self._is_table_row(candidate)):
            break
        rows.append(candidate)
        cursor += 1

    # The whole table travels as a single newline-joined segment.
    return '\n'.join(rows), cursor
def _merge_short_segments(self, text_segments: List[str], min_length: int = 10) -> List[str]:
    """Merge runs of short segments to cut translation calls; keep tables whole.

    Blank segments are skipped. A run of table segments becomes one entry.
    Segments shorter than ``min_length`` are accumulated and emitted as one
    space-joined entry when the next long segment or table arrives, or at
    the end. Short segments made only of ``* - _ #`` are discarded.

    Args:
        text_segments: Original list of text segments.
        min_length: Segments shorter than this are merged.

    Returns:
        The merged list of segments (the input itself when it is empty).
    """
    if not text_segments:
        return text_segments

    result = []
    pending = []  # short segments waiting to be emitted as one entry
    symbol_filler = str.maketrans('', '', '*-_#')
    count = len(text_segments)
    pos = 0

    while pos < count:
        piece = text_segments[pos].strip()

        # Blank segments contribute nothing.
        if not piece:
            pos += 1
            continue

        # Table content: flush pending text, then take the whole table run.
        if self._is_table_component(piece) or self._is_table_row(piece):
            if pending:
                joined = " ".join(pending)
                result.append(joined.strip())
                logger.debug(f"Merged short segments before table: '{joined[:50]}...'")
                pending = []
            table_block, after_table = self._merge_table_segments(text_segments, pos)
            result.append(table_block)
            logger.debug(f"Merged table content: {after_table - pos} segments -> 1 table block")
            pos = after_table
            continue

        pos += 1

        if len(piece) >= min_length:
            # Long segment: emit whatever was pending, then the segment.
            if pending:
                joined = " ".join(pending)
                result.append(joined.strip())
                logger.debug(f"Merged short segments: '{joined[:50]}...' (total length: {len(joined)})")
                pending = []
            result.append(piece)
            logger.debug(f"Added long segment: '{piece[:50]}...' (length: {len(piece)})")
            continue

        # Short segment made only of decoration symbols: drop it.
        if not piece.translate(symbol_filler).strip():
            logger.debug(f"Skipping pure symbol segment: '{piece}'")
            continue

        # Short segment with content: hold it for merging.
        pending.append(piece)
        logger.debug(f"Adding short segment to merge: '{piece}' (length: {len(piece)})")

    # Emit short segments left over at the end.
    if pending:
        joined = " ".join(pending)
        result.append(joined.strip())
        logger.debug(f"Final merged short segments: '{joined[:50]}...' (total length: {len(joined)})")

    logger.info(f"Segment merging: {len(text_segments)} -> {len(result)} segments")
    return result
def _detect_content_type(self, text: str) -> str:
    """Classify a text segment as table, heading, list or paragraph.

    Args:
        text: Segment to classify.

    Returns:
        ``'table'`` when the text has at least two ``|`` or any tab;
        ``'heading'`` when it starts with a heading prefix, or is shorter
        than 100 characters and contains a heading keyword;
        ``'list'`` when it starts with a bullet or a number from 1 to 19
        followed by a dot; otherwise ``'paragraph'``.
    """
    text_lower = text.lower().strip()

    # NOTE(review): several markers in the reviewed source were empty
    # strings. An empty prefix/keyword matches every string, which classified
    # all non-table text as 'heading' and made the branches below
    # unreachable. The lost markers (presumably CJK heading words and a
    # bullet glyph) are not guessed here — restore them from the repository.
    heading_prefixes = ('chapter', 'section', '#')
    heading_keywords = ()
    list_prefixes = ('-', '*') + tuple(f"{i}." for i in range(1, 20))

    # Table: several pipes or any tab.
    if text.count('|') >= 2 or '\t' in text:
        return 'table'

    # Heading: known prefix, or a short text containing a heading keyword.
    if text_lower.startswith(heading_prefixes):
        return 'heading'
    if len(text) < 100 and any(keyword in text_lower for keyword in heading_keywords):
        return 'heading'

    # List: bullet or numbered item.
    if text_lower.startswith(list_prefixes):
        return 'list'

    return 'paragraph'
def _add_table_content(self, doc, text: str, index: int):
    """Append a table segment to ``doc``.

    A bold caption is written first. Pipe-delimited text is parsed into rows
    and rendered as a ``Table Grid`` table. Text that yields no rows (for
    example tab-separated text) is written as one monospace paragraph so
    that no content is lost.

    Args:
        doc: python-docx document to append to.
        text: Table segment, one row per line.
        index: 1-based segment number shown in the caption.
    """
    from docx.shared import Pt

    # Caption.
    title_para = doc.add_paragraph()
    title_run = title_para.add_run(f"表格 {index}: ")
    title_run.bold = True
    title_run.font.size = Pt(12)

    rule_chars = str.maketrans('', '', '-=: ')
    rows = []
    if '|' in text:
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line.startswith('|') and line.endswith('|'):
                # Bordered Markdown row: drop the empty outer pieces.
                cells = [cell.strip() for cell in line.split('|')[1:-1]]
            else:
                # Row without outer pipes, or a pipe-free line inside the
                # table block (kept as a single cell so it is not lost).
                cells = [cell.strip() for cell in line.split('|')]
            # Skip separator rows such as |---|:--:| and all-empty rows.
            if cells and any(cell.translate(rule_chars) for cell in cells):
                rows.append(cells)

    if rows:
        # Size the table for the widest row so ragged rows are not truncated.
        table = doc.add_table(rows=len(rows), cols=max(len(row) for row in rows))
        table.style = 'Table Grid'
        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                cell = row_cells[j]
                cell.text = cell_data
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.font.size = Pt(10)
    else:
        # Tab-separated or unparsable text: keep it verbatim, monospace.
        para = doc.add_paragraph()
        content_run = para.add_run(text)
        content_run.font.name = 'Courier New'
        content_run.font.size = Pt(10)
def _add_heading_content(self, doc, text: str, index: int):
    """Append a heading segment to ``doc``.

    Text under 100 characters becomes a level-2 heading; longer text becomes
    a bold 14pt paragraph.

    Args:
        doc: python-docx document to append to.
        text: Heading text; surrounding whitespace is removed.
        index: Segment number (not used here).
    """
    from docx.shared import Pt

    heading_text = text.strip()

    # Short enough for a real heading: done.
    if len(heading_text) < 100:
        doc.add_heading(heading_text, level=2)
        return

    # Long text: an ordinary paragraph styled to look like a heading.
    emphasized = doc.add_paragraph().add_run(heading_text)
    emphasized.bold = True
    emphasized.font.size = Pt(14)
def _add_list_content(self, doc, text: str, index: int):
    """Append a list item to ``doc``.

    Text starting with a number from 1 to 19 followed by a dot uses the
    ``List Number`` style; everything else uses ``List Bullet``.

    Args:
        doc: python-docx document to append to.
        text: List item text; surrounding whitespace is removed.
        index: Segment number (not used here).
    """
    from docx.shared import Pt

    item_text = text.strip()
    # Already numbered?
    is_numbered = any(item_text.startswith(f"{n}.") for n in range(1, 20))
    style_name = 'List Number' if is_numbered else 'List Bullet'
    item_para = doc.add_paragraph(item_text, style=style_name)

    # Uniform font size for the item.
    for item_run in item_para.runs:
        item_run.font.size = Pt(11)
def _add_paragraph_content(self, doc, text: str, index: int):
    """Append an ordinary paragraph, prefixed with its segment number.

    Args:
        doc: python-docx document to append to.
        text: Paragraph text, written as-is.
        index: Segment number, rendered as a zero-padded 3-digit prefix.
    """
    from docx.shared import Pt

    body_para = doc.add_paragraph()
    # Space after the paragraph.
    body_para.paragraph_format.space_after = Pt(6)

    # Bold segment number first.
    number_run = body_para.add_run(f"{index:03d}. ")
    number_run.bold = True
    number_run.font.size = Pt(12)

    # Then the text itself.
    body_run = body_para.add_run(text)
    body_run.font.size = Pt(11)

View File

@@ -56,41 +56,45 @@ class NotificationService:
return None
def _send_email(self, to_email: str, subject: str, html_content: str, text_content: str = None) -> bool:
"""發送郵件的基礎方法"""
try:
if not self.smtp_server or not self.sender_email:
logger.error("SMTP configuration incomplete")
return False
# 建立郵件
msg = MIMEMultipart('alternative')
msg['From'] = f"{self.app_name} <{self.sender_email}>"
msg['To'] = to_email
msg['Subject'] = subject
# 添加文本內容
if text_content:
text_part = MIMEText(text_content, 'plain', 'utf-8')
msg.attach(text_part)
# 添加 HTML 內容
html_part = MIMEText(html_content, 'html', 'utf-8')
msg.attach(html_part)
# 發送郵件
server = self._create_smtp_connection()
if not server:
return False
server.send_message(msg)
server.quit()
logger.info(f"Email sent successfully to {to_email}")
return True
except Exception as e:
logger.error(f"Failed to send email to {to_email}: {str(e)}")
return False
"""發送郵件的基礎方法 - 已停用 (資安限制,無法連接內網)"""
logger.info(f"SMTP service disabled - Email notification skipped for {to_email}: {subject}")
return True # 回傳 True 避免影響其他流程
# 以下 SMTP 功能已註解,因應資安限制無法連接內網
# try:
# if not self.smtp_server or not self.sender_email:
# logger.error("SMTP configuration incomplete")
# return False
#
# # 建立郵件
# msg = MIMEMultipart('alternative')
# msg['From'] = f"{self.app_name} <{self.sender_email}>"
# msg['To'] = to_email
# msg['Subject'] = subject
#
# # 添加文本內容
# if text_content:
# text_part = MIMEText(text_content, 'plain', 'utf-8')
# msg.attach(text_part)
#
# # 添加 HTML 內容
# html_part = MIMEText(html_content, 'html', 'utf-8')
# msg.attach(html_part)
#
# # 發送郵件
# server = self._create_smtp_connection()
# if not server:
# return False
#
# server.send_message(msg)
# server.quit()
#
# logger.info(f"Email sent successfully to {to_email}")
# return True
#
# except Exception as e:
# logger.error(f"Failed to send email to {to_email}: {str(e)}")
# return False
def send_job_completion_notification(self, job: TranslationJob) -> bool:
"""發送任務完成通知"""

282
app/services/ocr_cache.py Normal file
View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
OCR 快取管理模組
Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""
import hashlib
import json
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any
import logging
logger = logging.getLogger(__name__)


class OCRCache:
    """SQLite-backed cache of OCR results keyed by a SHA-256 content hash.

    Each row stores the extracted text for one (file bytes, additional_info)
    combination together with bookkeeping columns (file name, size, creation
    time, hit counter, last access time and a JSON metadata blob). Entries
    older than ``cache_expire_days`` are ignored on lookup and can be purged
    with :meth:`clean_expired_cache`.
    """

    def __init__(self, cache_db_path: str = "ocr_cache.db", cache_expire_days: int = 30):
        """Create the cache manager and make sure the backing table exists.

        Args:
            cache_db_path: Path of the SQLite database file.
            cache_expire_days: Age in days after which an entry counts as expired.

        Raises:
            Exception: Propagated from :meth:`init_database` when the schema
                cannot be created.
        """
        self.cache_db_path = Path(cache_db_path)
        self.cache_expire_days = cache_expire_days
        self.init_database()

    def _connect(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``sqlite3.Connection`` used directly as a context manager only commits
        or rolls back; it does not close the handle. Wrapping it in
        ``closing`` guarantees the connection is released.
        """
        return closing(sqlite3.connect(self.cache_db_path))

    def _expiry_modifier(self) -> str:
        """Return the SQLite date modifier for the expiry window, e.g. ``'-30 days'``."""
        return f"-{self.cache_expire_days} days"

    def init_database(self):
        """Create the ``ocr_cache`` table and its indexes if they do not exist.

        Raises:
            Exception: Any error raised while opening the database or creating
                the schema is logged and re-raised.
        """
        try:
            # Outer manager closes the connection, inner one commits/rolls back.
            with self._connect() as conn, conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS ocr_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_hash TEXT UNIQUE NOT NULL,
                        filename TEXT,
                        file_size INTEGER,
                        extracted_text TEXT NOT NULL,
                        extraction_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        access_count INTEGER DEFAULT 1,
                        last_access_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')
                # Indexes that speed up hash lookups and expiry scans.
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_file_hash
                    ON ocr_cache(file_hash)
                ''')
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_extraction_time
                    ON ocr_cache(extraction_time)
                ''')
            logger.info("OCR 快取資料庫初始化完成")
        except Exception as e:
            logger.error(f"初始化 OCR 快取資料庫失敗: {e}")
            raise

    def _calculate_file_hash(self, file_data: bytes, additional_info: str = "") -> str:
        """Return the SHA-256 hex digest of ``file_data`` followed by ``additional_info``.

        Args:
            file_data: Raw file bytes.
            additional_info: Extra discriminator (page number, processing
                options, ...) appended to the hashed input as UTF-8.

        Returns:
            The 64-character hexadecimal digest.
        """
        # Feed the hash incrementally so the file buffer is not copied just to
        # append the suffix; the digest equals sha256(file_data + suffix).
        digest = hashlib.sha256(file_data)
        digest.update(additional_info.encode('utf-8'))
        return digest.hexdigest()

    def get_cached_text(self, file_data: bytes, filename: str = "",
                        additional_info: str = "") -> Optional[str]:
        """Look up the cached OCR text for a file.

        A hit increments the row's ``access_count`` and refreshes
        ``last_access_time``.

        Args:
            file_data: Raw file bytes used to derive the cache key.
            filename: File name, used only in log messages.
            additional_info: Extra discriminator included in the cache key.

        Returns:
            The cached text, or ``None`` when there is no unexpired entry or
            when any error occurs (errors are logged, never raised).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)
            with self._connect() as conn, conn:
                row = conn.execute(
                    '''
                    SELECT extracted_text, access_count
                    FROM ocr_cache
                    WHERE file_hash = ? AND
                          extraction_time > datetime('now', ?)
                    ''',
                    (file_hash, self._expiry_modifier()),
                ).fetchone()
                if row is None:
                    logger.debug(f"[OCR-CACHE] 快取未命中: {filename}")
                    return None
                extracted_text, access_count = row
                # Increment inside SQL so concurrent readers cannot overwrite
                # each other's counter updates.
                conn.execute(
                    '''
                    UPDATE ocr_cache
                    SET access_count = access_count + 1,
                        last_access_time = CURRENT_TIMESTAMP
                    WHERE file_hash = ?
                    ''',
                    (file_hash,),
                )
            logger.info(f"[OCR-CACHE] 快取命中: {filename} (訪問次數: {access_count + 1})")
            return extracted_text
        except Exception as e:
            logger.error(f"獲取 OCR 快取失敗: {e}")
            return None

    def save_cached_text(self, file_data: bytes, extracted_text: str,
                         filename: str = "", additional_info: str = "",
                         metadata: Optional[Dict[str, Any]] = None) -> bool:
        """Store OCR text in the cache.

        An existing row with the same hash is replaced; because only five
        columns are supplied, the replacement row gets the column defaults for
        ``extraction_time``, ``access_count`` and ``last_access_time``.

        Args:
            file_data: Raw file bytes used to derive the cache key.
            extracted_text: Text produced by OCR.
            filename: File name stored alongside the entry.
            additional_info: Extra discriminator included in the cache key.
            metadata: Optional JSON-serialisable mapping stored with the entry.

        Returns:
            ``True`` on success, ``False`` when any error occurs (logged).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)
            file_size = len(file_data)
            metadata_json = json.dumps(metadata or {}, ensure_ascii=False)
            with self._connect() as conn, conn:
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO ocr_cache
                    (file_hash, filename, file_size, extracted_text, metadata)
                    VALUES (?, ?, ?, ?, ?)
                    ''',
                    (file_hash, filename, file_size, extracted_text, metadata_json),
                )
            logger.info(f"[OCR-CACHE] 儲存快取成功: {filename} ({len(extracted_text)} 字元)")
            return True
        except Exception as e:
            logger.error(f"儲存 OCR 快取失敗: {e}")
            return False

    def get_cache_stats(self) -> Dict[str, Any]:
        """Collect summary statistics about the cache.

        Returns:
            A dict with the keys ``total_records``, ``total_accesses``,
            ``cache_size_chars``, ``cache_size_mb``, ``recent_records_7days``,
            ``top_accessed_files`` (up to five entries) and
            ``cache_hit_potential`` (a percentage string). An empty dict is
            returned when any error occurs (logged).
        """
        try:
            with self._connect() as conn:
                # One table scan for all aggregates. The BLOB cast makes
                # LENGTH() count bytes, so the MB figure reflects stored
                # UTF-8 size rather than the number of characters.
                total_records, total_accesses, size_chars, size_bytes = conn.execute(
                    '''
                    SELECT COUNT(*),
                           SUM(access_count),
                           SUM(LENGTH(extracted_text)),
                           SUM(LENGTH(CAST(extracted_text AS BLOB)))
                    FROM ocr_cache
                    '''
                ).fetchone()
                # SUM() yields NULL on an empty table.
                total_accesses = total_accesses or 0
                cache_size_chars = size_chars or 0
                cache_size_bytes = size_bytes or 0

                recent_records = conn.execute(
                    '''
                    SELECT COUNT(*) FROM ocr_cache
                    WHERE extraction_time > datetime('now', '-7 days')
                    '''
                ).fetchone()[0]

                top_accessed = conn.execute(
                    '''
                    SELECT filename, access_count, last_access_time
                    FROM ocr_cache
                    ORDER BY access_count DESC
                    LIMIT 5
                    '''
                ).fetchall()

            # Every row starts with access_count = 1, so accesses beyond the
            # record count are cache hits.
            hit_ratio = (total_accesses - total_records) / max(total_accesses, 1) * 100
            return {
                'total_records': total_records,
                'total_accesses': total_accesses,
                'cache_size_chars': cache_size_chars,
                'cache_size_mb': cache_size_bytes / (1024 * 1024),
                'recent_records_7days': recent_records,
                'top_accessed_files': [
                    {
                        'filename': name,
                        'access_count': count,
                        'last_access': last_access,
                    }
                    for name, count, last_access in top_accessed
                ],
                'cache_hit_potential': f"{hit_ratio:.1f}%",
            }
        except Exception as e:
            logger.error(f"獲取快取統計失敗: {e}")
            return {}

    def clean_expired_cache(self) -> int:
        """Delete entries older than ``cache_expire_days``.

        Returns:
            The number of rows deleted, or ``0`` when any error occurs (logged).
        """
        try:
            with self._connect() as conn, conn:
                deleted_count = conn.execute(
                    '''
                    DELETE FROM ocr_cache
                    WHERE extraction_time < datetime('now', ?)
                    ''',
                    (self._expiry_modifier(),),
                ).rowcount
            logger.info(f"[OCR-CACHE] 清理過期快取: {deleted_count} 筆記錄")
            return deleted_count
        except Exception as e:
            logger.error(f"清理過期快取失敗: {e}")
            return 0

    def clear_all_cache(self) -> bool:
        """Delete every cache entry.

        Returns:
            ``True`` on success, ``False`` when any error occurs (logged).
        """
        try:
            with self._connect() as conn, conn:
                conn.execute('DELETE FROM ocr_cache')
            logger.info("[OCR-CACHE] 已清空所有快取")
            return True
        except Exception as e:
            logger.error(f"清空快取失敗: {e}")
            return False

File diff suppressed because it is too large Load Diff