"""Fuzzy matching of DIT records against sample and order records."""
import re
|
||
from typing import List, Tuple, Optional
|
||
from rapidfuzz import fuzz, process
|
||
from sqlalchemy.orm import Session
|
||
from app.config import MATCH_THRESHOLD_AUTO, MATCH_THRESHOLD_REVIEW
|
||
from app.models.dit import DitRecord
|
||
from app.models.sample import SampleRecord
|
||
from app.models.order import OrderRecord
|
||
from app.models.match import MatchResult, MatchStatus, TargetType, ReviewLog
|
||
import pandas as pd
|
||
from datetime import timedelta
|
||
|
||
# Corporate suffixes removed when normalizing customer names.
# NOTE: calculate_similarity() looks only at the first three entries when it
# labels a "Corporate Suffix Mismatch", so the order of this list matters.
COMPANY_SUFFIXES = [
    # Chinese
    '股份有限公司',
    '有限公司',
    '公司',
    # Kabushiki-gaisha (both character forms)
    '株式会社',
    '株式會社',
    # "Co., Ltd." spelling variants
    'Co., Ltd.',
    'Co.,Ltd.',
    'Co. Ltd.',
    'Co.Ltd.',
    'Co., Ltd',
    'Co.,Ltd',
    # Corporation
    'Corporation',
    'Corp.',
    'Corp',
    # Incorporated
    'Inc.',
    'Inc',
    # Limited
    'Limited',
    'Ltd.',
    'Ltd',
    'L.T.D.',
    # Limited liability company
    'LLC',
    'L.L.C.',
]
|
||
|
||
def sanitize_pn(pn: str) -> str:
    """Return *pn* upper-cased with everything except ASCII letters, digits,
    '-' and '_' removed.

    Falsy input (None, "", 0) yields "". Other values are converted with
    str() before cleaning.
    """
    if pn:
        # Hyphen and underscore are kept; every other symbol is dropped.
        cleaned = re.sub(r'[^a-zA-Z0-9\-_]', '', str(pn))
        return cleaned.upper()
    return ""
|
||
|
||
def normalize_pn_for_matching(pn: str) -> str:
    """Return the comparison key for a part number: ASCII letters and digits
    only, upper-cased.

    Unlike sanitize_pn(), '-' and '_' are removed as well. Falsy input yields
    "". Other values are converted with str() first.
    """
    if pn:
        alnum_only = re.sub(r'[^a-zA-Z0-9]', '', str(pn))
        return alnum_only.upper()
    return ""
|
||
|
||
def normalize_customer_name(name: str) -> str:
    """Return a canonical, upper-cased customer name for comparison.

    Steps, in order:
      1. Drop parenthesised notes (half- or full-width brackets) and turn
         ideographic spaces (U+3000) into plain spaces.
      2. Strip trailing corporate suffixes listed in COMPANY_SUFFIXES,
         longest first, together with any separator in front of them.
      3. Strip a leftover trailing "Co" / "Co." / "Co.,".
      4. Collapse whitespace, trim trailing punctuation, upper-case.

    Args:
        name: Raw customer name. Falsy values yield "". Non-string values
            are converted with str(), as the part-number helpers do.

    Returns:
        The normalized name, or "" when nothing is left after cleaning.
    """
    if not name:
        return ""

    normalized = str(name).strip()

    # Full-width characters are written as escapes so they cannot be silently
    # folded to their ASCII look-alikes by an editor or conversion tool.
    # Brackets go first so that a suffix followed by a note, such as
    # "Acme Ltd. (HK)", still ends with the suffix when step 2 runs.
    normalized = re.sub(r'[(\uff08][^)\uff09]*[)\uff09]', '', normalized)
    normalized = normalized.replace('\u3000', ' ').strip()

    # Longest suffix first, so 'Co., Ltd.' is removed before plain 'Ltd.'.
    for suffix in sorted(COMPANY_SUFFIXES, key=len, reverse=True):
        # A Latin suffix must start at a word boundary, otherwise 'Inc' would
        # eat the tail of 'Zinc'. CJK suffixes attach directly to the name
        # and therefore get no boundary check.
        boundary = r'(?<![A-Za-z0-9])' if re.match(r'[A-Za-z0-9]', suffix) else ''
        pattern = re.compile(
            r'[,.\s]*' + boundary + re.escape(suffix) + r'$',
            re.IGNORECASE,
        )
        normalized = pattern.sub('', normalized).strip()

    # A bare trailing "Co." / "Co.," can remain once 'Ltd' has been removed.
    normalized = re.sub(r'[,.\s]+Co[.,]*$', '', normalized, flags=re.IGNORECASE)

    # Collapse whitespace runs, then drop trailing/leading separators.
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    normalized = normalized.strip("., ")

    return normalized.upper()
|
||
|
||
def calculate_similarity(name1: str, name2: str) -> Tuple[float, str]:
    """Score how alike two customer names are.

    Both names are passed through normalize_customer_name() and compared
    with four rapidfuzz scorers; the highest score wins.

    Args:
        name1: First raw customer name.
        name2: Second raw customer name.

    Returns:
        A (score, reason) tuple. The score is 0.0 with "Empty name" when
        either normalized name is empty, 100.0 with "Exact Match" when the
        normalized names are equal, otherwise the best scorer result. The
        reason names the winning scorer, or "Corporate Suffix Mismatch" when
        the score is at least 80 and exactly one of the raw names contains
        one of the first three COMPANY_SUFFIXES entries.
    """
    left = normalize_customer_name(name1)
    right = normalize_customer_name(name2)

    if not (left and right):
        return 0.0, "Empty name"
    if left == right:
        return 100.0, "Exact Match"

    # Order matters: on a tie the earliest scorer supplies the reason.
    scorers = (
        (fuzz.ratio, "Character Similarity"),
        (fuzz.partial_ratio, "Partial Match"),
        (fuzz.token_sort_ratio, "Token Order Match"),
        (fuzz.token_set_ratio, "Token Set Match"),
    )
    candidates = [(scorer(left, right), label) for scorer, label in scorers]
    # max() returns the first maximal item, which preserves the tie-break.
    best_score, reason = max(candidates, key=lambda pair: pair[0])

    # A high score where only one raw name carries a common suffix is
    # labelled as a suffix difference rather than by the winning scorer.
    if best_score >= 80:
        suffix_differs = any(
            (suffix in name1) != (suffix in name2)
            for suffix in COMPANY_SUFFIXES[:3]
        )
        if suffix_differs:
            reason = "Corporate Suffix Mismatch"

    return best_score, reason
|
||
|
||
class FuzzyMatcher:
    """Waterfall matcher that links DIT records to sample and order records.

    Candidates are scored by three priorities:
      1. Opportunity ID equality (samples only), score 100.0.
      2. Same normalized part number and equal ERP account / customer id,
         score 99.0.
      3. Same normalized part number and a customer-name similarity of at
         least MATCH_THRESHOLD_REVIEW.
    """

    # Samples dated more than this many days before the DIT date are skipped.
    SAMPLE_LOOKBACK_DAYS = 30

    def __init__(self, db: Session):
        """Keep the session used for every query and write of this matcher."""
        self.db = db

    @staticmethod
    def _group_by_pn(records) -> dict:
        """Group records that have a truthy ``pn`` by their normalized PN."""
        grouped = {}
        for record in records:
            if record.pn:
                key = normalize_pn_for_matching(record.pn)
                grouped.setdefault(key, []).append(record)
        return grouped

    @staticmethod
    def _score_by_customer(dit, target) -> Tuple[int, str, float, str]:
        """Score a same-PN candidate by ERP account, then by customer name.

        Returns (priority, source, score, reason). A priority of 0 means the
        candidate did not qualify as a match.
        """
        # Priority 2: customer code equality (Silver Key).
        if dit.erp_account and target.cust_id and dit.erp_account == target.cust_id:
            return (
                2,
                f"Matched via ERP Account: {dit.erp_account}",
                99.0,
                "Silver Key Match",
            )

        # Priority 3: fuzzy customer-name comparison (fallback).
        score, reason = calculate_similarity(dit.customer, target.customer)
        if score >= MATCH_THRESHOLD_REVIEW:
            return 3, f"Matched via Name Similarity ({reason})", score, reason
        return 0, "", score, reason

    def _store_match(self, dit, target_type, target_id, outcome, stats) -> None:
        """Add a MatchResult for a qualifying outcome and update the counters."""
        priority, source, score, reason = outcome
        if priority <= 0:
            return

        if score >= MATCH_THRESHOLD_AUTO:
            status = MatchStatus.auto_matched
        else:
            status = MatchStatus.pending

        self.db.add(MatchResult(
            dit_id=dit.id,
            target_type=target_type,
            target_id=target_id,
            score=score,
            match_priority=priority,
            match_source=source,
            reason=reason,
            status=status,
        ))

        stats['match_count'] += 1
        if status == MatchStatus.auto_matched:
            stats['auto_matched'] += 1
        else:
            stats['pending_review'] += 1

    def run_matching(self) -> dict:
        """Rebuild all match results using waterfall matching.

        Every existing ReviewLog and MatchResult row is deleted and the
        matches are recomputed for all DIT records. The work is committed
        once at the end; if anything raises, the session is rolled back so
        the pending deletes are not left in the session, and the exception
        is re-raised.

        Returns:
            A dict with the keys 'match_count', 'auto_matched' and
            'pending_review'.
        """
        # 1. Load every DIT record.
        dit_records = self.db.query(DitRecord).all()

        # 2. Load samples and orders and index them for lookup.
        sample_records = self.db.query(SampleRecord).all()
        order_records = self.db.query(OrderRecord).all()

        samples_by_pn = self._group_by_pn(sample_records)
        orders_by_pn = self._group_by_pn(order_records)
        samples_by_oppy = {}
        for sample in sample_records:
            if sample.oppy_no:
                samples_by_oppy.setdefault(sample.oppy_no, []).append(sample)

        stats = {'match_count': 0, 'auto_matched': 0, 'pending_review': 0}
        lookback = timedelta(days=self.SAMPLE_LOOKBACK_DAYS)

        try:
            # 3. Discard the results of any previous run.
            self.db.query(ReviewLog).delete()
            self.db.query(MatchResult).delete()

            for dit in dit_records:
                dit_date = pd.to_datetime(dit.date, errors='coerce')
                dit_norm_pn = normalize_pn_for_matching(dit.pn)

                # --- DIT -> Sample ---
                # Candidates come from the opportunity ID first, then the PN.
                candidates = []
                if dit.op_id:
                    candidates.extend(samples_by_oppy.get(dit.op_id, []))
                if dit_norm_pn:
                    candidates.extend(samples_by_pn.get(dit_norm_pn, []))

                seen_sample_ids = set()
                for sample in candidates:
                    # A sample reachable through both keys is scored once.
                    if sample.id in seen_sample_ids:
                        continue
                    seen_sample_ids.add(sample.id)

                    # Time window: only a lower bound is enforced, and only
                    # when both dates could be parsed.
                    sample_date = pd.to_datetime(sample.date, errors='coerce')
                    if (
                        pd.notna(dit_date)
                        and pd.notna(sample_date)
                        and sample_date < (dit_date - lookback)
                    ):
                        continue

                    if dit.op_id and sample.oppy_no and dit.op_id == sample.oppy_no:
                        # Priority 1: opportunity ID equality (Golden Key).
                        outcome = (
                            1,
                            f"Matched via Opportunity ID: {dit.op_id}",
                            100.0,
                            "Golden Key Match",
                        )
                    elif dit_norm_pn == normalize_pn_for_matching(sample.pn):
                        # Priorities 2 and 3 require the same normalized PN.
                        outcome = self._score_by_customer(dit, sample)
                    else:
                        outcome = (0, "", 0.0, "")

                    self._store_match(
                        dit, TargetType.SAMPLE, sample.id, outcome, stats
                    )

                # --- DIT -> Order ---
                # Orders are looked up by normalized PN only.
                if dit_norm_pn:
                    for order in orders_by_pn.get(dit_norm_pn, []):
                        outcome = self._score_by_customer(dit, order)
                        self._store_match(
                            dit, TargetType.ORDER, order.id, outcome, stats
                        )

            self.db.commit()
        except Exception:
            # Undo the pending deletes and partial inserts before propagating.
            self.db.rollback()
            raise

        return stats

    def get_pending_reviews(self) -> List[MatchResult]:
        """Return every MatchResult whose status is MatchStatus.pending."""
        return self.db.query(MatchResult).filter(
            MatchResult.status == MatchStatus.pending
        ).all()

    def review_match(self, match_id: int, action: str) -> Optional[MatchResult]:
        """Apply a review decision to one match result.

        Args:
            match_id: Primary key of the MatchResult to review.
            action: 'accept' sets MatchStatus.accepted and 'reject' sets
                MatchStatus.rejected. Any other value leaves the status
                unchanged.

        Returns:
            The MatchResult after the commit, or None when no row has the
            given id.
        """
        match = self.db.query(MatchResult).filter(MatchResult.id == match_id).first()
        if not match:
            return None

        if action == 'accept':
            match.status = MatchStatus.accepted
        elif action == 'reject':
            match.status = MatchStatus.rejected

        self.db.commit()
        return match
|