#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
OCR cache management module.

Stores OCR text extraction results in a local SQLite database, keyed by a
SHA-256 digest of the file content, so identical files are not re-processed.

Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""

import hashlib
import json
import logging
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, Iterator, Optional

logger = logging.getLogger(__name__)


class OCRCache:
    """SQLite-backed cache of OCR extraction results."""

    def __init__(self, cache_db_path: str = "ocr_cache.db", cache_expire_days: int = 30):
        """
        Initialise the cache manager and make sure the database schema exists.

        Args:
            cache_db_path: Path of the SQLite cache database file.
            cache_expire_days: Number of days after which an entry is
                considered expired.

        Raises:
            Exception: Whatever ``init_database`` raises when the database
                cannot be created or opened.
        """
        self.cache_db_path = Path(cache_db_path)
        self.cache_expire_days = cache_expire_days
        self.init_database()

    @contextmanager
    def _connection(self) -> Iterator[sqlite3.Connection]:
        """
        Yield a connection that is committed (or rolled back) and then closed.

        ``sqlite3.Connection`` used directly as a context manager only manages
        the transaction; it does not close the connection. The explicit
        ``close()`` here releases the file handle after every operation.
        """
        conn = sqlite3.connect(self.cache_db_path)
        try:
            with conn:  # commit on success, roll back on exception
                yield conn
        finally:
            conn.close()

    def _expiry_modifier(self) -> str:
        """Return the SQLite ``datetime()`` modifier for the expiry window.

        The value is passed to SQLite as a bound parameter, never spliced into
        the SQL text.
        """
        return f"-{self.cache_expire_days} days"

    def init_database(self):
        """
        Create the cache table and its indexes if they do not exist yet.

        The parent directory of the database file is created when missing.

        Raises:
            Exception: Re-raised after logging when initialisation fails.
        """
        try:
            self.cache_db_path.parent.mkdir(parents=True, exist_ok=True)
            with self._connection() as conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS ocr_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_hash TEXT UNIQUE NOT NULL,
                        filename TEXT,
                        file_size INTEGER,
                        extracted_text TEXT NOT NULL,
                        extraction_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        access_count INTEGER DEFAULT 1,
                        last_access_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')
                # Indexes to speed up hash lookups and expiry scans.
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_file_hash
                    ON ocr_cache(file_hash)
                ''')
                conn.execute('''
                    CREATE INDEX IF NOT EXISTS idx_extraction_time
                    ON ocr_cache(extraction_time)
                ''')
            logger.info("OCR 快取資料庫初始化完成")
        except Exception as e:
            logger.error(f"初始化 OCR 快取資料庫失敗: {e}")
            raise

    def _calculate_file_hash(self, file_data: bytes, additional_info: str = "") -> str:
        """
        Compute the SHA-256 digest used as the cache key.

        Args:
            file_data: Raw bytes of the file.
            additional_info: Extra discriminator (e.g. page count or
                processing parameters) appended to the hashed input.

        Returns:
            Hex digest of ``file_data`` followed by the UTF-8 encoded
            ``additional_info``.
        """
        # Feed the hasher incrementally: same digest as hashing the
        # concatenation, without copying the whole file buffer.
        digest = hashlib.sha256(file_data)
        digest.update(additional_info.encode('utf-8'))
        return digest.hexdigest()

    def get_cached_text(self, file_data: bytes, filename: str = "",
                        additional_info: str = "") -> Optional[str]:
        """
        Look up the cached OCR text for a file.

        A hit increments the entry's access counter and refreshes its
        last-access timestamp.

        Args:
            file_data: Raw bytes of the file.
            filename: File name, used only for log messages.
            additional_info: Extra discriminator mixed into the cache key.

        Returns:
            The cached text, or ``None`` when there is no unexpired entry or
            the lookup fails (failures are logged, not raised).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)

            with self._connection() as conn:
                row = conn.execute('''
                    SELECT extracted_text, access_count
                    FROM ocr_cache
                    WHERE file_hash = ?
                    AND extraction_time > datetime('now', ?)
                ''', (file_hash, self._expiry_modifier())).fetchone()

                if row is None:
                    logger.debug(f"[OCR-CACHE] 快取未命中: {filename}")
                    return None

                extracted_text, access_count = row
                # Increment inside SQL so concurrent readers cannot overwrite
                # each other's counts with a stale value read in Python.
                conn.execute('''
                    UPDATE ocr_cache
                    SET access_count = COALESCE(access_count, 0) + 1,
                        last_access_time = CURRENT_TIMESTAMP
                    WHERE file_hash = ?
                ''', (file_hash,))

            new_count = (access_count or 0) + 1
            logger.info(f"[OCR-CACHE] 快取命中: {filename} (訪問次數: {new_count})")
            return extracted_text

        except Exception as e:
            logger.error(f"獲取 OCR 快取失敗: {e}")
            return None

    def save_cached_text(self, file_data: bytes, extracted_text: str,
                         filename: str = "", additional_info: str = "",
                         metadata: Optional[Dict[str, Any]] = None) -> bool:
        """
        Store OCR text in the cache.

        An existing entry with the same key is replaced, which also resets its
        extraction time and access counter to the column defaults.

        Args:
            file_data: Raw bytes of the file.
            extracted_text: Text extracted from the file.
            filename: File name stored alongside the entry.
            additional_info: Extra discriminator mixed into the cache key.
            metadata: Optional JSON-serialisable metadata; ``None`` is stored
                as an empty object.

        Returns:
            ``True`` on success, ``False`` when saving fails (failures are
            logged, not raised).
        """
        try:
            file_hash = self._calculate_file_hash(file_data, additional_info)
            file_size = len(file_data)
            metadata_json = json.dumps(metadata or {}, ensure_ascii=False)

            with self._connection() as conn:
                # INSERT OR REPLACE handles an already-present hash.
                conn.execute('''
                    INSERT OR REPLACE INTO ocr_cache
                    (file_hash, filename, file_size, extracted_text, metadata)
                    VALUES (?, ?, ?, ?, ?)
                ''', (file_hash, filename, file_size, extracted_text, metadata_json))

            logger.info(f"[OCR-CACHE] 儲存快取成功: {filename} ({len(extracted_text)} 字元)")
            return True

        except Exception as e:
            logger.error(f"儲存 OCR 快取失敗: {e}")
            return False

    def get_cache_stats(self) -> Dict[str, Any]:
        """
        Collect summary statistics about the cache.

        Returns:
            A dict with the keys ``total_records``, ``total_accesses``,
            ``cache_size_chars``, ``cache_size_mb`` (character count divided
            by 1024 * 1024), ``recent_records_7days``, ``top_accessed_files``
            (up to five entries) and ``cache_hit_potential``; an empty dict
            when the query fails (failures are logged, not raised).
        """
        try:
            with self._connection() as conn:
                # Record count, total accesses and total text length.
                total_records, total_accesses, cache_size_chars = conn.execute('''
                    SELECT COUNT(*),
                           SUM(access_count),
                           SUM(LENGTH(extracted_text))
                    FROM ocr_cache
                ''').fetchone()
                # SUM() yields NULL on an empty table.
                total_accesses = total_accesses or 0
                cache_size_chars = cache_size_chars or 0

                # Entries extracted within the last 7 days.
                recent_records = conn.execute('''
                    SELECT COUNT(*) FROM ocr_cache
                    WHERE extraction_time > datetime('now', '-7 days')
                ''').fetchone()[0]

                # Most frequently accessed entries.
                top_accessed = conn.execute('''
                    SELECT filename, access_count, last_access_time
                    FROM ocr_cache
                    ORDER BY access_count DESC
                    LIMIT 5
                ''').fetchall()

            # Every access beyond the first one per record was a cache hit.
            hit_ratio = (total_accesses - total_records) / max(total_accesses, 1) * 100

            return {
                'total_records': total_records,
                'total_accesses': total_accesses,
                'cache_size_chars': cache_size_chars,
                'cache_size_mb': cache_size_chars / (1024 * 1024),
                'recent_records_7days': recent_records,
                'top_accessed_files': [
                    {
                        'filename': name,
                        'access_count': count,
                        'last_access': last_access,
                    }
                    for name, count, last_access in top_accessed
                ],
                'cache_hit_potential': f"{hit_ratio:.1f}%",
            }

        except Exception as e:
            logger.error(f"獲取快取統計失敗: {e}")
            return {}

    def clean_expired_cache(self) -> int:
        """
        Delete entries older than ``cache_expire_days``.

        Returns:
            Number of deleted rows; ``0`` when cleaning fails (failures are
            logged, not raised).
        """
        try:
            with self._connection() as conn:
                deleted_count = conn.execute('''
                    DELETE FROM ocr_cache
                    WHERE extraction_time < datetime('now', ?)
                ''', (self._expiry_modifier(),)).rowcount

            logger.info(f"[OCR-CACHE] 清理過期快取: {deleted_count} 筆記錄")
            return deleted_count

        except Exception as e:
            logger.error(f"清理過期快取失敗: {e}")
            return 0

    def clear_all_cache(self) -> bool:
        """
        Delete every entry in the cache.

        Returns:
            ``True`` on success, ``False`` when clearing fails (failures are
            logged, not raised).
        """
        try:
            with self._connection() as conn:
                conn.execute('DELETE FROM ocr_cache')

            logger.info("[OCR-CACHE] 已清空所有快取")
            return True

        except Exception as e:
            logger.error(f"清空快取失敗: {e}")
            return False