改用API驗證
This commit is contained in:
282
app/services/ocr_cache.py
Normal file
282
app/services/ocr_cache.py
Normal file
@@ -0,0 +1,282 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
OCR 快取管理模組
|
||||
|
||||
Author: PANJIT IT Team
|
||||
Created: 2024-01-28
|
||||
Modified: 2024-01-28
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)


class OCRCache:
    """SQLite-backed cache for OCR extraction results.

    Each entry is keyed by the SHA-256 digest of the file bytes followed by an
    optional ``additional_info`` string. An entry counts as expired once its
    ``extraction_time`` is older than ``cache_expire_days`` days.
    """

    def __init__(self, cache_db_path: str = "ocr_cache.db", cache_expire_days: int = 30):
        """Store the configuration and make sure the cache schema exists.

        Args:
            cache_db_path: Path of the SQLite database file.
            cache_expire_days: Age in days after which an entry is expired.

        Raises:
            Exception: Whatever ``init_database`` re-raises when the database
                cannot be created or opened.
        """
        self.cache_db_path = Path(cache_db_path)
        self.cache_expire_days = cache_expire_days
        self.init_database()

    def _connect(self):
        """Return a context manager that yields a connection and closes it on exit.

        ``with sqlite3.connect(...)`` only commits or rolls back; it does not
        close the connection. Wrapping the connection in ``closing`` releases
        the handle deterministically. Callers must call ``commit()`` themselves;
        closing a connection discards any uncommitted changes.
        """
        # Function-scope import so the block does not depend on a new
        # top-of-file import.
        from contextlib import closing

        return closing(sqlite3.connect(self.cache_db_path))

    def _expiry_modifier(self) -> str:
        """Return the SQLite date modifier for the expiry window, e.g. ``-30 days``."""
        return f"-{self.cache_expire_days} days"

    def init_database(self):
        """Create the cache table and its indexes if they do not exist yet.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # sqlite3 cannot create missing parent directories on its own.
            self.cache_db_path.parent.mkdir(parents=True, exist_ok=True)

            with self._connect() as conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS ocr_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_hash TEXT UNIQUE NOT NULL,
                        filename TEXT,
                        file_size INTEGER,
                        extracted_text TEXT NOT NULL,
                        extraction_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        access_count INTEGER DEFAULT 1,
                        last_access_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')

                # Indexes for the lookup key and for expiry scans.
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_file_hash
                    ON ocr_cache(file_hash)
                ''')
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_extraction_time
                    ON ocr_cache(extraction_time)
                ''')

                conn.commit()

            logger.info("OCR 快取資料庫初始化完成")

        except Exception as e:
            logger.error(f"初始化 OCR 快取資料庫失敗: {e}")
            raise

    def _calculate_file_hash(self, file_data: bytes, additional_info: str = "") -> str:
        """Return the SHA-256 hex digest of the file bytes plus ``additional_info``.

        Args:
            file_data: Raw bytes of the file.
            additional_info: Extra text mixed into the key (for example page
                numbers or processing parameters), encoded as UTF-8.

        Returns:
            The hexadecimal SHA-256 digest.
        """
        # Feed the hasher incrementally so the file bytes are not copied just
        # to append the suffix; the digest equals hashing the concatenation.
        hasher = hashlib.sha256(file_data)
        hasher.update(additional_info.encode('utf-8'))
        return hasher.hexdigest()

    def get_cached_text(self, file_data: bytes, filename: str = "",
                        additional_info: str = "") -> Optional[str]:
        """Look up the cached OCR text for a file.

        A hit increments the entry's access counter and refreshes its
        ``last_access_time``.

        Args:
            file_data: Raw bytes of the file.
            filename: File name, used only in log messages.
            additional_info: Extra text that is part of the cache key.

        Returns:
            The cached text, or ``None`` when there is no unexpired entry or
            when the lookup fails (the failure is logged).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)

            with self._connect() as conn:
                # The expiry window is a bound parameter, not spliced into SQL.
                row = conn.execute(
                    '''
                    SELECT extracted_text, access_count
                    FROM ocr_cache
                    WHERE file_hash = ? AND
                          extraction_time > datetime('now', ?)
                    ''',
                    (file_hash, self._expiry_modifier()),
                ).fetchone()

                if row is None:
                    logger.debug(f"[OCR-CACHE] 快取未命中: {filename}")
                    return None

                extracted_text, access_count = row

                # Increment inside SQL so concurrent readers cannot overwrite
                # each other's count.
                conn.execute(
                    '''
                    UPDATE ocr_cache
                    SET access_count = access_count + 1,
                        last_access_time = CURRENT_TIMESTAMP
                    WHERE file_hash = ?
                    ''',
                    (file_hash,),
                )
                conn.commit()

            logger.info(f"[OCR-CACHE] 快取命中: {filename} (訪問次數: {access_count + 1})")
            return extracted_text

        except Exception as e:
            # Best effort: a broken cache must not break the OCR pipeline.
            logger.error(f"獲取 OCR 快取失敗: {e}")
            return None

    def save_cached_text(self, file_data: bytes, extracted_text: str,
                         filename: str = "", additional_info: str = "",
                         metadata: Optional[Dict[str, Any]] = None) -> bool:
        """Store OCR text in the cache, replacing any entry with the same key.

        Replacing an entry resets its timestamps and access counter to the
        column defaults.

        Args:
            file_data: Raw bytes of the file.
            extracted_text: Text produced by OCR.
            filename: File name stored alongside the entry.
            additional_info: Extra text that is part of the cache key.
            metadata: Optional JSON-serialisable mapping stored with the entry.

        Returns:
            ``True`` on success, ``False`` when saving fails (the failure is
            logged).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)
            file_size = len(file_data)
            metadata_json = json.dumps(metadata or {}, ensure_ascii=False)

            with self._connect() as conn:
                # INSERT OR REPLACE resolves a duplicate hash by overwriting.
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO ocr_cache
                    (file_hash, filename, file_size, extracted_text, metadata)
                    VALUES (?, ?, ?, ?, ?)
                    ''',
                    (file_hash, filename, file_size, extracted_text, metadata_json),
                )
                conn.commit()

            logger.info(f"[OCR-CACHE] 儲存快取成功: {filename} ({len(extracted_text)} 字元)")
            return True

        except Exception as e:
            logger.error(f"儲存 OCR 快取失敗: {e}")
            return False

    def get_cache_stats(self) -> Dict[str, Any]:
        """Collect summary statistics about the cache.

        Returns:
            A dict with the record count, total access count, text size in
            characters, bytes and MiB, the number of records from the last
            7 days, the five most accessed files, and a hit-ratio string.
            An empty dict is returned when the query fails (the failure is
            logged).
        """
        try:
            with self._connect() as conn:
                # One pass over the table for every aggregate.
                totals = conn.execute('''
                    SELECT COUNT(*),
                           COALESCE(SUM(access_count), 0),
                           COALESCE(SUM(LENGTH(extracted_text)), 0),
                           COALESCE(SUM(LENGTH(CAST(extracted_text AS BLOB))), 0),
                           COALESCE(SUM(extraction_time > datetime('now', '-7 days')), 0)
                    FROM ocr_cache
                ''').fetchone()

                top_accessed = conn.execute('''
                    SELECT filename, access_count, last_access_time
                    FROM ocr_cache
                    ORDER BY access_count DESC
                    LIMIT 5
                ''').fetchall()

            (total_records, total_accesses, cache_size_chars,
             cache_size_bytes, recent_records) = totals

            # Every record starts with access_count = 1, so accesses beyond
            # the record count are cache hits.
            hit_ratio = (total_accesses - total_records) / max(total_accesses, 1) * 100

            return {
                'total_records': total_records,
                'total_accesses': total_accesses,
                'cache_size_chars': cache_size_chars,
                'cache_size_bytes': cache_size_bytes,
                # Megabytes are derived from bytes, not characters: multi-byte
                # text (e.g. CJK) would otherwise be under-reported.
                'cache_size_mb': cache_size_bytes / (1024 * 1024),
                'recent_records_7days': recent_records,
                'top_accessed_files': [
                    {
                        'filename': name,
                        'access_count': count,
                        'last_access': last_access,
                    }
                    for name, count, last_access in top_accessed
                ],
                'cache_hit_potential': f"{hit_ratio:.1f}%"
            }

        except Exception as e:
            logger.error(f"獲取快取統計失敗: {e}")
            return {}

    def clean_expired_cache(self) -> int:
        """Delete entries older than ``cache_expire_days`` days.

        Returns:
            The number of deleted rows, or ``0`` when the cleanup fails (the
            failure is logged).
        """
        try:
            with self._connect() as conn:
                cursor = conn.execute(
                    '''
                    DELETE FROM ocr_cache
                    WHERE extraction_time < datetime('now', ?)
                    ''',
                    (self._expiry_modifier(),),
                )
                deleted_count = cursor.rowcount
                conn.commit()

            logger.info(f"[OCR-CACHE] 清理過期快取: {deleted_count} 筆記錄")
            return deleted_count

        except Exception as e:
            logger.error(f"清理過期快取失敗: {e}")
            return 0

    def clear_all_cache(self) -> bool:
        """Delete every entry from the cache.

        Returns:
            ``True`` on success, ``False`` when the deletion fails (the
            failure is logged).
        """
        try:
            with self._connect() as conn:
                conn.execute('DELETE FROM ocr_cache')
                conn.commit()

            logger.info("[OCR-CACHE] 已清空所有快取")
            return True

        except Exception as e:
            logger.error(f"清空快取失敗: {e}")
            return False
|
Reference in New Issue
Block a user