282 lines
9.4 KiB
Python
282 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
OCR 快取管理模組
|
|
|
|
Author: PANJIT IT Team
|
|
Created: 2024-01-28
|
|
Modified: 2024-01-28
|
|
"""
|
|
|
|
import hashlib
import json
import logging
import sqlite3
from contextlib import closing
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any
|
|
|
|
# Module-level logger shared by every OCRCache method.
logger = logging.getLogger(__name__)


class OCRCache:
    """SQLite-backed cache for OCR results.

    Entries are keyed by the SHA-256 digest of the file bytes followed by an
    optional ``additional_info`` string, and are considered valid for
    ``cache_expire_days`` days after extraction.

    Apart from :meth:`init_database` (which logs and re-raises), the public
    methods are best-effort: failures are logged and a neutral value
    (``None``, ``False``, ``{}`` or ``0``) is returned instead of raising.
    """

    def __init__(self, cache_db_path: str = "ocr_cache.db", cache_expire_days: int = 30):
        """
        Create the cache manager and make sure the database schema exists.

        Args:
            cache_db_path: Path of the SQLite database file.
            cache_expire_days: Number of days after which an entry expires.

        Raises:
            Exception: Whatever :meth:`init_database` raises if the schema
                cannot be created.
        """
        self.cache_db_path = Path(cache_db_path)
        self.cache_expire_days = cache_expire_days
        self.init_database()

    def _connection(self):
        """Return a context manager that yields a connection and closes it on exit.

        ``sqlite3.Connection`` used directly in a ``with`` statement only
        commits or rolls back; it never closes the handle. Wrapping it in
        ``closing`` guarantees the connection is released after every call.
        """
        return closing(sqlite3.connect(self.cache_db_path))

    def _expiry_modifier(self) -> str:
        """Return the SQLite date modifier (e.g. ``'-30 days'``) for the expiry window.

        The modifier is always passed as a bound parameter, never spliced into
        the SQL text, so an unexpected ``cache_expire_days`` value cannot alter
        the statement.
        """
        return f"-{self.cache_expire_days} days"

    def init_database(self):
        """Create the ``ocr_cache`` table and its indexes if they do not exist.

        Raises:
            Exception: Any error raised while creating the schema is logged
                and re-raised.
        """
        try:
            # Outer context closes the connection; inner one commits/rolls back.
            with self._connection() as conn, conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS ocr_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_hash TEXT UNIQUE NOT NULL,
                        filename TEXT,
                        file_size INTEGER,
                        extracted_text TEXT NOT NULL,
                        extraction_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        access_count INTEGER DEFAULT 1,
                        last_access_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')

                # Indexes to speed up lookups.
                # NOTE: the UNIQUE constraint on file_hash already implies an
                # index; idx_file_hash is kept so that new databases match the
                # schema of databases created by earlier versions.
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_file_hash
                    ON ocr_cache(file_hash)
                ''')
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_extraction_time
                    ON ocr_cache(extraction_time)
                ''')

            logger.info("OCR 快取資料庫初始化完成")

        except Exception as e:
            logger.error(f"初始化 OCR 快取資料庫失敗: {e}")
            raise

    def _calculate_file_hash(self, file_data: bytes, additional_info: str = "") -> str:
        """
        Compute the SHA-256 cache key for a file.

        Args:
            file_data: Raw bytes of the file.
            additional_info: Extra text mixed into the key (e.g. page number
                or processing parameters).

        Returns:
            Hex digest of ``file_data`` followed by the UTF-8 encoding of
            ``additional_info``.
        """
        # Feed the two parts incrementally: same digest as hashing the
        # concatenation, without copying the whole file into a new buffer.
        digest = hashlib.sha256()
        digest.update(file_data)
        digest.update(additional_info.encode('utf-8'))
        return digest.hexdigest()

    def get_cached_text(self, file_data: bytes, filename: str = "",
                        additional_info: str = "") -> Optional[str]:
        """
        Look up the cached OCR text for a file.

        A hit increments the entry's access counter and refreshes its
        last-access timestamp.

        Args:
            file_data: Raw bytes of the file.
            filename: File name, used only in log messages.
            additional_info: Extra text that was mixed into the cache key.

        Returns:
            The cached text, or ``None`` when there is no unexpired entry or
            the lookup failed.
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)

            with self._connection() as conn, conn:
                # Only entries newer than the expiry window count as hits.
                row = conn.execute(
                    '''
                    SELECT extracted_text, access_count
                    FROM ocr_cache
                    WHERE file_hash = ?
                      AND extraction_time > datetime('now', ?)
                    ''',
                    (file_hash, self._expiry_modifier()),
                ).fetchone()

                if row is None:
                    logger.debug(f"[OCR-CACHE] 快取未命中: {filename}")
                    return None

                extracted_text, access_count = row

                # Increment inside SQL so concurrent readers cannot overwrite
                # each other's counts with a stale value.
                conn.execute(
                    '''
                    UPDATE ocr_cache
                    SET access_count = access_count + 1,
                        last_access_time = CURRENT_TIMESTAMP
                    WHERE file_hash = ?
                    ''',
                    (file_hash,),
                )

            logger.info(f"[OCR-CACHE] 快取命中: {filename} (訪問次數: {access_count + 1})")
            return extracted_text

        except Exception as e:
            logger.error(f"獲取 OCR 快取失敗: {e}")
            return None

    def save_cached_text(self, file_data: bytes, extracted_text: str,
                         filename: str = "", additional_info: str = "",
                         metadata: Optional[Dict[str, Any]] = None) -> bool:
        """
        Store OCR text in the cache.

        An existing entry with the same key is replaced, which also resets its
        extraction time, access counter and last-access time to the column
        defaults.

        Args:
            file_data: Raw bytes of the file.
            extracted_text: Text produced by OCR.
            filename: File name stored with the entry and used in logs.
            additional_info: Extra text mixed into the cache key.
            metadata: Optional JSON-serialisable dictionary stored with the
                entry; ``None`` is stored as an empty object.

        Returns:
            ``True`` when the entry was stored, ``False`` on any failure.
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)
            file_size = len(file_data)
            metadata_json = json.dumps(metadata or {}, ensure_ascii=False)

            with self._connection() as conn, conn:
                # INSERT OR REPLACE handles a duplicate hash.
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO ocr_cache
                    (file_hash, filename, file_size, extracted_text, metadata)
                    VALUES (?, ?, ?, ?, ?)
                    ''',
                    (file_hash, filename, file_size, extracted_text, metadata_json),
                )

            logger.info(f"[OCR-CACHE] 儲存快取成功: {filename} ({len(extracted_text)} 字元)")
            return True

        except Exception as e:
            logger.error(f"儲存 OCR 快取失敗: {e}")
            return False

    def get_cache_stats(self) -> Dict[str, Any]:
        """
        Collect summary statistics about the cache.

        Returns:
            A dictionary with the keys ``total_records``, ``total_accesses``,
            ``cache_size_chars``, ``cache_size_mb``, ``recent_records_7days``,
            ``top_accessed_files`` (up to five entries) and
            ``cache_hit_potential``; an empty dictionary on failure.
        """
        try:
            with self._connection() as conn, conn:
                # One pass over the table for all aggregate figures.
                # LENGTH() on TEXT counts characters; on a BLOB it counts
                # bytes, which is what a size in MB has to be based on.
                total_records, total_accesses, cache_size_chars, cache_size_bytes = conn.execute(
                    '''
                    SELECT COUNT(*),
                           COALESCE(SUM(access_count), 0),
                           COALESCE(SUM(LENGTH(extracted_text)), 0),
                           COALESCE(SUM(LENGTH(CAST(extracted_text AS BLOB))), 0)
                    FROM ocr_cache
                    '''
                ).fetchone()

                # Entries extracted within the last 7 days.
                recent_records = conn.execute(
                    '''
                    SELECT COUNT(*) FROM ocr_cache
                    WHERE extraction_time > datetime('now', '-7 days')
                    '''
                ).fetchone()[0]

                # Most frequently accessed entries.
                top_accessed = conn.execute(
                    '''
                    SELECT filename, access_count, last_access_time
                    FROM ocr_cache
                    ORDER BY access_count DESC
                    LIMIT 5
                    '''
                ).fetchall()

            # Every entry starts with access_count = 1, so accesses beyond the
            # record count are repeat accesses.
            hit_ratio = (total_accesses - total_records) / max(total_accesses, 1) * 100

            return {
                'total_records': total_records,
                'total_accesses': total_accesses,
                'cache_size_chars': cache_size_chars,
                'cache_size_mb': cache_size_bytes / (1024 * 1024),
                'recent_records_7days': recent_records,
                'top_accessed_files': [
                    {
                        'filename': name,
                        'access_count': count,
                        'last_access': last_access,
                    }
                    for name, count, last_access in top_accessed
                ],
                'cache_hit_potential': f"{hit_ratio:.1f}%",
            }

        except Exception as e:
            logger.error(f"獲取快取統計失敗: {e}")
            return {}

    def clean_expired_cache(self) -> int:
        """
        Delete entries older than the expiry window.

        Returns:
            Number of deleted entries, or ``0`` on failure.
        """
        try:
            with self._connection() as conn, conn:
                # "<=" is the exact complement of the ">" validity test used by
                # get_cached_text, so no entry is both unusable and unpurgeable.
                deleted_count = conn.execute(
                    '''
                    DELETE FROM ocr_cache
                    WHERE extraction_time <= datetime('now', ?)
                    ''',
                    (self._expiry_modifier(),),
                ).rowcount

            logger.info(f"[OCR-CACHE] 清理過期快取: {deleted_count} 筆記錄")
            return deleted_count

        except Exception as e:
            logger.error(f"清理過期快取失敗: {e}")
            return 0

    def clear_all_cache(self) -> bool:
        """
        Remove every entry from the cache.

        Returns:
            ``True`` on success, ``False`` on failure.
        """
        try:
            with self._connection() as conn, conn:
                conn.execute('DELETE FROM ocr_cache')

            logger.info("[OCR-CACHE] 已清空所有快取")
            return True

        except Exception as e:
            logger.error(f"清空快取失敗: {e}")
            return False