"""Customer-name normalisation and waterfall fuzzy matching of DIT records.

The module exposes small string helpers (part-number and customer-name
normalisation, name similarity scoring) and ``FuzzyMatcher``, which links DIT
records to sample and order records and stores the result as ``MatchResult``
rows.
"""
import re
from collections import defaultdict
from datetime import timedelta
from typing import List, Optional, Tuple

import pandas as pd
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session

from app.config import MATCH_THRESHOLD_AUTO, MATCH_THRESHOLD_REVIEW
from app.models.dit import DitRecord
from app.models.match import MatchResult, MatchStatus, TargetType, ReviewLog
from app.models.order import OrderRecord
from app.models.sample import SampleRecord

# Company suffixes removed from customer names during normalisation.
COMPANY_SUFFIXES = [
    '股份有限公司', '有限公司', '公司',
    '株式会社', '株式會社',
    'Co., Ltd.', 'Co.,Ltd.', 'Co. Ltd.', 'Co.Ltd.',
    'Co., Ltd', 'Co.,Ltd',
    'Corporation', 'Corp.', 'Corp',
    'Inc.', 'Inc',
    'Limited', 'Ltd.', 'Ltd',
    'L.T.D.', 'LLC', 'L.L.C.',
]

# Number of days before the DIT date that a sample may be dated and still match.
_SAMPLE_WINDOW_DAYS = 30

_PN_KEEP_DASHES_RE = re.compile(r'[^a-zA-Z0-9\-_]')
_PN_ALNUM_ONLY_RE = re.compile(r'[^a-zA-Z0-9]')
_WHITESPACE_RE = re.compile(r'\s+')
# Bracketed remarks in ASCII "(...)" or full-width U+FF08 ... U+FF09 form.
# The full-width characters are written as escapes so they cannot be silently
# folded into ASCII parentheses by an editor or a Unicode normalisation pass.
_BRACKETED_RE = re.compile('[(\uff08][^)\uff09]*[)\uff09]')
# A dangling "Co." / "Co.," left at the end of a name.
_TRAILING_CO_RE = re.compile(r'[,.\s]+Co[.,]*$', re.IGNORECASE)


def _build_suffix_pattern(suffixes: List[str]) -> 're.Pattern':
    """Compile one regex matching any company suffix at the end of a name.

    Longer suffixes come first in the alternation so that 'Co., Ltd.' wins
    over 'Ltd.'.  Suffixes that start with an ASCII letter or digit must not
    be preceded by another ASCII letter or digit, so 'Inc' is not stripped out
    of 'Zinc'.  CJK suffixes are normally written directly after the name and
    therefore get no such boundary.
    """
    alternatives = []
    for suffix in sorted(suffixes, key=len, reverse=True):
        escaped = re.escape(suffix)
        if re.match(r'[A-Za-z0-9]', suffix):
            escaped = r'(?<![A-Za-z0-9])' + escaped
        alternatives.append(escaped)
    return re.compile(
        r'[,.\s]*(?:' + '|'.join(alternatives) + r')$', re.IGNORECASE
    )


_SUFFIX_RE = _build_suffix_pattern(COMPANY_SUFFIXES)


def sanitize_pn(pn: str) -> str:
    """Return *pn* upper-cased with everything except letters, digits, '-' and '_' removed.

    A falsy *pn* yields an empty string.
    """
    if not pn:
        return ""
    return _PN_KEEP_DASHES_RE.sub('', str(pn)).upper()


def normalize_pn_for_matching(pn: str) -> str:
    """Return the matching key for *pn*: upper-cased letters and digits only.

    A falsy *pn* yields an empty string.
    """
    if not pn:
        return ""
    return _PN_ALNUM_ONLY_RE.sub('', str(pn)).upper()


def normalize_customer_name(name: str) -> str:
    """Return an upper-cased customer name without brackets or company suffixes.

    Steps, in order:

    1. Full-width spaces (U+3000) become ASCII spaces.
    2. Bracketed remarks, ASCII or full-width, are removed.  This happens
       before suffix stripping so that 'ABC Co., Ltd. (Taiwan)' still loses
       its suffix.
    3. Trailing company suffixes (``COMPANY_SUFFIXES``), a trailing 'Co.' and
       trailing punctuation are removed repeatedly until the name stops
       changing, so the result does not depend on the order of stacked
       suffixes.
    4. Whitespace is collapsed, leading/trailing '.', ',' and spaces are
       stripped, and the result is upper-cased.

    A falsy *name* yields an empty string.  Non-string input is converted with
    ``str()``, matching the part-number helpers.
    """
    if not name:
        return ""

    normalized = str(name).replace('\u3000', ' ')
    normalized = _BRACKETED_RE.sub('', normalized)
    normalized = _WHITESPACE_RE.sub(' ', normalized).strip()

    previous = None
    while normalized != previous:
        previous = normalized
        normalized = _SUFFIX_RE.sub('', normalized)
        normalized = _TRAILING_CO_RE.sub('', normalized)
        # Trailing separators would hide a further suffix from the '$' anchor.
        normalized = normalized.rstrip("., ")

    normalized = _WHITESPACE_RE.sub(' ', normalized).strip()
    return normalized.strip("., ").upper()


def calculate_similarity(name1: str, name2: str) -> Tuple[float, str]:
    """Score how similar two customer names are.

    Both names are normalised with ``normalize_customer_name`` first.

    Returns:
        ``(score, reason)``.  ``(0.0, "Empty name")`` when either normalised
        name is empty and ``(100.0, "Exact Match")`` when they are equal.
        Otherwise the score is the highest of four rapidfuzz ratios and the
        reason names the ratio that produced it (on ties the first of
        character, partial, token-sort, token-set wins).  When the score is at
        least 80 and exactly one of the raw names contains one of the first
        three entries of ``COMPANY_SUFFIXES``, the reason becomes
        "Corporate Suffix Mismatch".
    """
    norm1 = normalize_customer_name(name1)
    norm2 = normalize_customer_name(name2)

    if not norm1 or not norm2:
        return 0.0, "Empty name"

    if norm1 == norm2:
        return 100.0, "Exact Match"

    # Listed in tie-break order: max() returns the first maximal entry.
    candidates = [
        (fuzz.ratio(norm1, norm2), "Character Similarity"),
        (fuzz.partial_ratio(norm1, norm2), "Partial Match"),
        (fuzz.token_sort_ratio(norm1, norm2), "Token Order Match"),
        (fuzz.token_set_ratio(norm1, norm2), "Token Set Match"),
    ]
    best_score, reason = max(candidates, key=lambda item: item[0])

    if best_score >= 80:
        raw1, raw2 = str(name1), str(name2)
        # Only the first three (most common) suffixes are checked.
        for suffix in COMPANY_SUFFIXES[:3]:
            if (suffix in raw1) != (suffix in raw2):
                reason = "Corporate Suffix Mismatch"
                break

    return best_score, reason


def _within_sample_window(dit_date, sample_date) -> bool:
    """Return False only when both dates parsed and the sample is too old.

    A sample dated more than ``_SAMPLE_WINDOW_DAYS`` days before the DIT date
    is rejected.  No upper bound is applied, and a missing or unparseable date
    on either side lets the sample through.
    """
    if pd.notna(dit_date) and pd.notna(sample_date):
        return not sample_date < (dit_date - timedelta(days=_SAMPLE_WINDOW_DAYS))
    return True


def _score_by_customer(dit, target) -> Optional[Tuple[int, float, str, str]]:
    """Score a DIT against a sample/order that already shares its part number.

    Returns ``(priority, score, match_source, reason)`` or None when the name
    similarity is below ``MATCH_THRESHOLD_REVIEW``.
    """
    # Priority 2: identical ERP account / customer id (silver key).
    if dit.erp_account and target.cust_id and dit.erp_account == target.cust_id:
        return (
            2,
            99.0,
            f"Matched via ERP Account: {dit.erp_account}",
            "Silver Key Match",
        )

    # Priority 3: fall back to fuzzy customer-name comparison.
    score, reason = calculate_similarity(dit.customer, target.customer)
    if score >= MATCH_THRESHOLD_REVIEW:
        return 3, score, f"Matched via Name Similarity ({reason})", reason
    return None


def _score_sample(dit, dit_norm_pn, sample) -> Optional[Tuple[int, float, str, str]]:
    """Score a DIT against a candidate sample, or return None for no match."""
    # Priority 1: identical opportunity number (golden key).
    if dit.op_id and sample.oppy_no and dit.op_id == sample.oppy_no:
        return (
            1,
            100.0,
            f"Matched via Opportunity ID: {dit.op_id}",
            "Golden Key Match",
        )

    # Priorities 2 and 3 apply only when the normalised part numbers agree.
    if dit_norm_pn == normalize_pn_for_matching(sample.pn):
        return _score_by_customer(dit, sample)
    return None


class FuzzyMatcher:
    """Links DIT records to sample and order records through a DB session."""

    def __init__(self, db: Session):
        self.db = db

    def run_matching(self) -> dict:
        """Rebuild all match results with waterfall matching.

        Every existing ``ReviewLog`` and ``MatchResult`` row is deleted, then
        each DIT record is compared with:

        * samples that share its opportunity id or normalised part number
          (subject to the sample date window), and
        * orders that share its normalised part number.

        A match scoring at least ``MATCH_THRESHOLD_AUTO`` is stored as
        ``auto_matched``, any other stored match as ``pending``.  The work is
        committed once at the end; on any error the session is rolled back and
        the exception re-raised, so a failed run does not leave the session
        holding a half-applied delete.

        Returns:
            A dict with the keys 'match_count', 'auto_matched' and
            'pending_review'.
        """
        dit_records = self.db.query(DitRecord).all()
        sample_records = self.db.query(SampleRecord).all()
        order_records = self.db.query(OrderRecord).all()

        # Index samples by normalised part number and by opportunity number.
        samples_by_pn = defaultdict(list)
        samples_by_oppy = defaultdict(list)
        for sample in sample_records:
            if sample.pn:
                samples_by_pn[normalize_pn_for_matching(sample.pn)].append(sample)
            if sample.oppy_no:
                samples_by_oppy[sample.oppy_no].append(sample)

        # Index orders by normalised part number.
        orders_by_pn = defaultdict(list)
        for order in order_records:
            if order.pn:
                orders_by_pn[normalize_pn_for_matching(order.pn)].append(order)

        stats = {'match_count': 0, 'auto_matched': 0, 'pending_review': 0}

        try:
            # Discard the results of any previous run.
            self.db.query(ReviewLog).delete()
            self.db.query(MatchResult).delete()

            for dit in dit_records:
                dit_norm_pn = normalize_pn_for_matching(dit.pn)
                self._match_samples(
                    dit, dit_norm_pn, samples_by_oppy, samples_by_pn, stats
                )
                self._match_orders(dit, dit_norm_pn, orders_by_pn, stats)

            self.db.commit()
        except Exception:
            self.db.rollback()
            raise

        return stats

    def _match_samples(self, dit, dit_norm_pn, samples_by_oppy, samples_by_pn, stats) -> None:
        """Store a match for every in-window candidate sample that scores."""
        dit_date = pd.to_datetime(dit.date, errors='coerce')

        # Candidates: same opportunity number first, then same part number.
        candidates = []
        if dit.op_id:
            candidates.extend(samples_by_oppy.get(dit.op_id, []))
        if dit_norm_pn:
            candidates.extend(samples_by_pn.get(dit_norm_pn, []))

        seen_ids = set()
        for sample in candidates:
            # A sample found through both indexes is evaluated only once.
            if sample.id in seen_ids:
                continue
            seen_ids.add(sample.id)

            sample_date = pd.to_datetime(sample.date, errors='coerce')
            if not _within_sample_window(dit_date, sample_date):
                continue

            verdict = _score_sample(dit, dit_norm_pn, sample)
            if verdict is not None:
                self._add_match(dit.id, TargetType.SAMPLE, sample.id, verdict, stats)

    def _match_orders(self, dit, dit_norm_pn, orders_by_pn, stats) -> None:
        """Store a match for every order with the same part number that scores."""
        if not dit_norm_pn:
            return
        for order in orders_by_pn.get(dit_norm_pn, []):
            verdict = _score_by_customer(dit, order)
            if verdict is not None:
                self._add_match(dit.id, TargetType.ORDER, order.id, verdict, stats)

    def _add_match(self, dit_id, target_type, target_id, verdict, stats) -> None:
        """Add one ``MatchResult`` to the session and update the counters."""
        priority, score, source, reason = verdict
        if score >= MATCH_THRESHOLD_AUTO:
            status = MatchStatus.auto_matched
        else:
            status = MatchStatus.pending

        self.db.add(MatchResult(
            dit_id=dit_id,
            target_type=target_type,
            target_id=target_id,
            score=score,
            match_priority=priority,
            match_source=source,
            reason=reason,
            status=status,
        ))

        stats['match_count'] += 1
        if status == MatchStatus.auto_matched:
            stats['auto_matched'] += 1
        else:
            stats['pending_review'] += 1

    def get_pending_reviews(self) -> List[MatchResult]:
        """Return every match result whose status is ``pending``."""
        return self.db.query(MatchResult).filter(
            MatchResult.status == MatchStatus.pending
        ).all()

    def review_match(self, match_id: int, action: str) -> Optional[MatchResult]:
        """Apply a review decision to one match result.

        Args:
            match_id: Primary key of the ``MatchResult`` to review.
            action: 'accept' sets the status to ``accepted``, 'reject' sets it
                to ``rejected``; any other value leaves the status unchanged.

        Returns:
            The match result after committing, or None when *match_id* does
            not exist.
        """
        match = self.db.query(MatchResult).filter(MatchResult.id == match_id).first()
        if not match:
            return None

        if action == 'accept':
            match.status = MatchStatus.accepted
        elif action == 'reject':
            match.status = MatchStatus.rejected

        self.db.commit()
        return match