改用API驗證

This commit is contained in:
beabigegg
2025-10-02 17:13:24 +08:00
parent 0a89c19fc9
commit adecdf0cce
48 changed files with 6136 additions and 1239 deletions

282
app/services/ocr_cache.py Normal file
View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
OCR 快取管理模組
Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""
import hashlib
import json
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, Any
import logging
logger = logging.getLogger(__name__)
class OCRCache:
"""OCR 結果快取管理器"""
def __init__(self, cache_db_path: str = "ocr_cache.db", cache_expire_days: int = 30):
"""
初始化 OCR 快取管理器
Args:
cache_db_path: 快取資料庫路徑
cache_expire_days: 快取過期天數
"""
self.cache_db_path = Path(cache_db_path)
self.cache_expire_days = cache_expire_days
self.init_database()
def init_database(self):
"""初始化快取資料庫"""
try:
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS ocr_cache (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_hash TEXT UNIQUE NOT NULL,
filename TEXT,
file_size INTEGER,
extracted_text TEXT NOT NULL,
extraction_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
access_count INTEGER DEFAULT 1,
last_access_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
metadata TEXT
)
''')
# 創建索引以提高查詢效能
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_file_hash
ON ocr_cache(file_hash)
''')
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_extraction_time
ON ocr_cache(extraction_time)
''')
conn.commit()
logger.info("OCR 快取資料庫初始化完成")
except Exception as e:
logger.error(f"初始化 OCR 快取資料庫失敗: {e}")
raise
def _calculate_file_hash(self, file_data: bytes, additional_info: str = "") -> str:
"""
計算檔案內容的 SHA256 雜湊值
Args:
file_data: 檔案二進位資料
additional_info: 額外資訊(如頁數、處理參數等)
Returns:
檔案的 SHA256 雜湊值
"""
hash_input = file_data + additional_info.encode('utf-8')
return hashlib.sha256(hash_input).hexdigest()
def get_cached_text(self, file_data: bytes, filename: str = "",
additional_info: str = "") -> Optional[str]:
"""
獲取快取的 OCR 文字
Args:
file_data: 檔案二進位資料
filename: 檔案名稱
additional_info: 額外資訊
Returns:
快取的文字內容,如果不存在則返回 None
"""
try:
file_hash = self._calculate_file_hash(file_data, additional_info)
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
# 查詢快取
cursor.execute('''
SELECT extracted_text, access_count
FROM ocr_cache
WHERE file_hash = ? AND
extraction_time > datetime('now', '-{} days')
'''.format(self.cache_expire_days), (file_hash,))
result = cursor.fetchone()
if result:
extracted_text, access_count = result
# 更新訪問計數和時間
cursor.execute('''
UPDATE ocr_cache
SET access_count = ?, last_access_time = CURRENT_TIMESTAMP
WHERE file_hash = ?
''', (access_count + 1, file_hash))
conn.commit()
logger.info(f"[OCR-CACHE] 快取命中: {filename} (訪問次數: {access_count + 1})")
return extracted_text
logger.debug(f"[OCR-CACHE] 快取未命中: {filename}")
return None
except Exception as e:
logger.error(f"獲取 OCR 快取失敗: {e}")
return None
def save_cached_text(self, file_data: bytes, extracted_text: str,
filename: str = "", additional_info: str = "",
metadata: Dict[str, Any] = None) -> bool:
"""
儲存 OCR 文字到快取
Args:
file_data: 檔案二進位資料
extracted_text: 提取的文字
filename: 檔案名稱
additional_info: 額外資訊
metadata: 中繼資料
Returns:
是否儲存成功
"""
try:
file_hash = self._calculate_file_hash(file_data, additional_info)
file_size = len(file_data)
metadata_json = json.dumps(metadata or {}, ensure_ascii=False)
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
# 使用 INSERT OR REPLACE 來處理重複的雜湊值
cursor.execute('''
INSERT OR REPLACE INTO ocr_cache
(file_hash, filename, file_size, extracted_text, metadata)
VALUES (?, ?, ?, ?, ?)
''', (file_hash, filename, file_size, extracted_text, metadata_json))
conn.commit()
logger.info(f"[OCR-CACHE] 儲存快取成功: {filename} ({len(extracted_text)} 字元)")
return True
except Exception as e:
logger.error(f"儲存 OCR 快取失敗: {e}")
return False
def get_cache_stats(self) -> Dict[str, Any]:
"""
獲取快取統計資訊
Returns:
快取統計資料
"""
try:
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
# 總記錄數
cursor.execute('SELECT COUNT(*) FROM ocr_cache')
total_records = cursor.fetchone()[0]
# 總訪問次數
cursor.execute('SELECT SUM(access_count) FROM ocr_cache')
total_accesses = cursor.fetchone()[0] or 0
# 快取大小
cursor.execute('SELECT SUM(LENGTH(extracted_text)) FROM ocr_cache')
cache_size_chars = cursor.fetchone()[0] or 0
# 最近 7 天的記錄數
cursor.execute('''
SELECT COUNT(*) FROM ocr_cache
WHERE extraction_time > datetime('now', '-7 days')
''')
recent_records = cursor.fetchone()[0]
# 最常訪問的記錄
cursor.execute('''
SELECT filename, access_count, last_access_time
FROM ocr_cache
ORDER BY access_count DESC
LIMIT 5
''')
top_accessed = cursor.fetchall()
return {
'total_records': total_records,
'total_accesses': total_accesses,
'cache_size_chars': cache_size_chars,
'cache_size_mb': cache_size_chars / (1024 * 1024),
'recent_records_7days': recent_records,
'top_accessed_files': [
{
'filename': row[0],
'access_count': row[1],
'last_access': row[2]
}
for row in top_accessed
],
'cache_hit_potential': f"{(total_accesses - total_records) / max(total_accesses, 1) * 100:.1f}%"
}
except Exception as e:
logger.error(f"獲取快取統計失敗: {e}")
return {}
def clean_expired_cache(self) -> int:
"""
清理過期的快取記錄
Returns:
清理的記錄數量
"""
try:
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
# 刪除過期記錄
cursor.execute('''
DELETE FROM ocr_cache
WHERE extraction_time < datetime('now', '-{} days')
'''.format(self.cache_expire_days))
deleted_count = cursor.rowcount
conn.commit()
logger.info(f"[OCR-CACHE] 清理過期快取: {deleted_count} 筆記錄")
return deleted_count
except Exception as e:
logger.error(f"清理過期快取失敗: {e}")
return 0
def clear_all_cache(self) -> bool:
"""
清空所有快取
Returns:
是否成功
"""
try:
with sqlite3.connect(self.cache_db_path) as conn:
cursor = conn.cursor()
cursor.execute('DELETE FROM ocr_cache')
conn.commit()
logger.info("[OCR-CACHE] 已清空所有快取")
return True
except Exception as e:
logger.error(f"清空快取失敗: {e}")
return False