Files
SalesPipeline/backend/app/services/fuzzy_matcher.py
2026-01-16 18:16:33 +08:00

316 lines
12 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from typing import List, Tuple, Optional
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from app.config import MATCH_THRESHOLD_AUTO, MATCH_THRESHOLD_REVIEW
from app.models.dit import DitRecord
from app.models.sample import SampleRecord
from app.models.order import OrderRecord
from app.models.match import MatchResult, MatchStatus, TargetType, ReviewLog
import pandas as pd
from datetime import timedelta
# Company-name suffixes (used when normalizing customer names).
# normalize_customer_name() sorts this list by length itself, so the order
# here does not affect suffix stripping. calculate_similarity(), however,
# inspects only the first three entries (COMPANY_SUFFIXES[:3]) when it
# labels a "Corporate Suffix Mismatch", so keep those three at the front.
COMPANY_SUFFIXES = [
    '股份有限公司', '有限公司', '公司',
    '株式会社', '株式會社',
    'Co., Ltd.', 'Co.,Ltd.', 'Co. Ltd.', 'Co.Ltd.', 'Co., Ltd', 'Co.,Ltd',
    'Corporation', 'Corp.', 'Corp',
    'Inc.', 'Inc',
    'Limited', 'Ltd.', 'Ltd', 'L.T.D.',
    'LLC', 'L.L.C.',
]
def sanitize_pn(pn: str) -> str:
    """Return *pn* upper-cased, keeping only letters, digits, ``-`` and ``_``.

    A falsy value (``None``, ``""``, ``0``) yields an empty string. Any other
    value is converted with ``str()`` before it is cleaned.
    """
    if pn:
        # Hyphen and underscore survive; every other symbol is dropped.
        kept = re.sub(r'[^a-zA-Z0-9\-_]', '', str(pn))
        return kept.upper()
    return ""
def normalize_pn_for_matching(pn: str) -> str:
    """Return the comparison key for a part number: letters and digits only, upper-cased.

    Unlike ``sanitize_pn`` this also removes ``-`` and ``_``. A falsy value
    (``None``, ``""``, ``0``) yields an empty string; any other value is
    converted with ``str()`` first.
    """
    if pn:
        alnum_only = re.sub(r'[^a-zA-Z0-9]', '', str(pn))
        return alnum_only.upper()
    return ""
def normalize_customer_name(name: str) -> str:
    """Normalize a customer name into an upper-case comparison key.

    Steps, in order:

    1. Trim surrounding whitespace.
    2. Strip a trailing company suffix from ``COMPANY_SUFFIXES`` (longest
       first, case-insensitive), optionally preceded by commas, dots or
       whitespace.
    3. Remove parenthesised text, for both half-width ``()`` and full-width
       parentheses (U+FF08 / U+FF09).
    4. Replace the full-width (ideographic) space U+3000 with an ASCII space.
    5. Remove a dangling trailing ``Co.`` / ``Co.,`` that step 2 left behind.
    6. Collapse whitespace runs, trim leading/trailing ``.``, ``,`` and
       spaces, and upper-case the result.

    Args:
        name: Raw customer name. A falsy value yields ``""``.

    Returns:
        The normalized, upper-cased name.
    """
    if not name:
        return ""

    # Only trim here; upper-casing happens once, at the very end.
    normalized = name.strip()

    # Longest suffix first so that e.g. "Co., Ltd." wins over the bare "Ltd.".
    for suffix in sorted(COMPANY_SUFFIXES, key=len, reverse=True):
        escaped = re.escape(suffix)
        # A purely alphabetic suffix ("Inc", "Corp", "Limited", ...) must not
        # be cut out of a longer word: without this guard "Zinc" became "Z"
        # and "Unlimited" became "Un". Suffixes that contain punctuation or
        # CJK text are left unguarded because they may follow the name
        # directly, with no separator.
        guard = r'(?<![A-Za-z0-9])' if re.fullmatch(r'[A-Za-z]+', suffix) else ''
        normalized = re.sub(
            guard + escaped + r'$', '', normalized, flags=re.IGNORECASE
        ).strip()
        # Same suffix, but together with the separators in front of it.
        normalized = re.sub(
            r'[,.\s]+' + escaped + r'$', '', normalized, flags=re.IGNORECASE
        ).strip()

    # Remove brackets and their contents: half-width first, then full-width.
    # The full-width parentheses are written as escapes so they cannot be
    # lost or confused with their ASCII look-alikes.
    normalized = re.sub(r'\([^)]*\)', '', normalized)
    normalized = re.sub('\uff08[^\uff09]*\uff09', '', normalized)

    # Full-width (ideographic) space -> ASCII space.
    normalized = normalized.replace('\u3000', ' ')

    # A trailing "Co." / "Co.," can remain when the suffix list did not match.
    normalized = re.sub(r'[,.\s]+Co[.,]*$', '', normalized, flags=re.IGNORECASE)

    # Collapse whitespace runs, then drop leftover punctuation at either end.
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    normalized = normalized.strip("., ")
    return normalized.upper()
def calculate_similarity(name1: str, name2: str) -> Tuple[float, str]:
    """Score how alike two customer names are and say which measure decided it.

    Both names are first passed through ``normalize_customer_name``.

    Args:
        name1: First raw customer name.
        name2: Second raw customer name.

    Returns:
        ``(score, reason)``. The score is ``0.0`` with reason ``"Empty name"``
        when either normalized name is empty, and ``100.0`` with reason
        ``"Exact Match"`` when they are equal. Otherwise it is the highest of
        four rapidfuzz ratios, with the reason naming the first measure that
        reached that score. When the score is at least 80 and exactly one of
        the raw names contains one of the first three ``COMPANY_SUFFIXES``
        entries, the reason becomes ``"Corporate Suffix Mismatch"``.
    """
    left = normalize_customer_name(name1)
    right = normalize_customer_name(name2)

    if not (left and right):
        return 0.0, "Empty name"
    if left == right:
        return 100.0, "Exact Match"

    # Listed in tie-break order: max() keeps the first entry with the top score.
    measures = (
        (fuzz.ratio(left, right), "Character Similarity"),
        (fuzz.partial_ratio(left, right), "Partial Match"),
        (fuzz.token_sort_ratio(left, right), "Token Order Match"),
        (fuzz.token_set_ratio(left, right), "Token Set Match"),
    )
    best_score, reason = max(measures, key=lambda entry: entry[0])

    # A high score where only one raw name carries a common suffix is
    # reported as a suffix difference instead.
    if best_score >= 80:
        common_suffixes = COMPANY_SUFFIXES[:3]
        if any((suffix in name1) != (suffix in name2) for suffix in common_suffixes):
            reason = "Corporate Suffix Mismatch"

    return best_score, reason
class FuzzyMatcher:
    """Match DIT records against sample and order records, and review the results.

    All database access goes through the SQLAlchemy session passed to the
    constructor.
    """

    def __init__(self, db: Session):
        """Store the session used for every query, delete and commit."""
        self.db = db

    def run_matching(self) -> dict:
        """Rebuild all match results with a waterfall of matching rules.

        Every existing ``ReviewLog`` and ``MatchResult`` row is deleted first.
        Each DIT record is then compared with:

        * samples that share its opportunity id or its normalized part number.
          A sample dated more than 30 days before the DIT date is skipped when
          both dates parse. Rules are tried in order: opportunity id
          (priority 1, score 100), ERP account (priority 2, score 99), then
          customer-name similarity (priority 3);
        * orders that share its normalized part number, using the ERP-account
          rule and then the name-similarity rule.

        A match scoring at least ``MATCH_THRESHOLD_AUTO`` is stored as
        auto-matched, any other match as pending. Everything is committed once
        at the end.

        Returns:
            A dict with the keys ``match_count``, ``auto_matched`` and
            ``pending_review``.
        """
        dits = self.db.query(DitRecord).all()
        samples = self.db.query(SampleRecord).all()
        orders = self.db.query(OrderRecord).all()

        # Index samples by normalized PN and by opportunity number.
        samples_by_pn = {}
        samples_by_oppy = {}
        for sample in samples:
            if sample.pn:
                samples_by_pn.setdefault(
                    normalize_pn_for_matching(sample.pn), []
                ).append(sample)
            if sample.oppy_no:
                samples_by_oppy.setdefault(sample.oppy_no, []).append(sample)

        # Index orders by normalized PN.
        orders_by_pn = {}
        for order in orders:
            if order.pn:
                orders_by_pn.setdefault(
                    normalize_pn_for_matching(order.pn), []
                ).append(order)

        # Discard the previous run's results before writing new ones.
        self.db.query(ReviewLog).delete()
        self.db.query(MatchResult).delete()

        tally = {'match_count': 0, 'auto_matched': 0, 'pending_review': 0}

        for dit in dits:
            dit_date = pd.to_datetime(dit.date, errors='coerce')
            dit_key = normalize_pn_for_matching(dit.pn)

            # --- DIT -> Sample ---
            # Opportunity-id candidates come first, then PN candidates.
            pool = []
            if dit.op_id:
                pool += samples_by_oppy.get(dit.op_id, [])
            if dit_key:
                pool += samples_by_pn.get(dit_key, [])

            # De-duplicate by id, keeping the first occurrence and its position.
            candidates = {}
            for sample in pool:
                candidates.setdefault(sample.id, sample)

            for sample in candidates.values():
                sample_date = pd.to_datetime(sample.date, errors='coerce')
                # Time window: skip samples dated more than 30 days before
                # the DIT. Unparseable dates bypass the check.
                if (
                    pd.notna(dit_date)
                    and pd.notna(sample_date)
                    and sample_date < (dit_date - timedelta(days=30))
                ):
                    continue

                if dit.op_id and sample.oppy_no and dit.op_id == sample.oppy_no:
                    # Priority 1: exact opportunity-id match (golden key).
                    verdict = (
                        1,
                        f"Matched via Opportunity ID: {dit.op_id}",
                        100.0,
                        "Golden Key Match",
                    )
                elif dit_key == normalize_pn_for_matching(sample.pn):
                    # Priorities 2 and 3 apply only when the PN agrees.
                    verdict = self._match_by_customer(dit, sample)
                else:
                    verdict = None

                if verdict is not None:
                    self._save_match(dit, TargetType.SAMPLE, sample, verdict, tally)

            # --- DIT -> Order ---
            # Orders are looked up by PN only.
            if dit_key:
                for order in orders_by_pn.get(dit_key, []):
                    verdict = self._match_by_customer(dit, order)
                    if verdict is not None:
                        self._save_match(dit, TargetType.ORDER, order, verdict, tally)

        self.db.commit()
        return tally

    @staticmethod
    def _match_by_customer(dit, target):
        """Apply the ERP-account rule, then the name-similarity rule.

        Returns ``(priority, source, score, reason)`` for a match, or ``None``
        when the name similarity is below ``MATCH_THRESHOLD_REVIEW``.
        """
        # Priority 2: customer-code match (silver key).
        if dit.erp_account and target.cust_id and dit.erp_account == target.cust_id:
            return (
                2,
                f"Matched via ERP Account: {dit.erp_account}",
                99.0,
                "Silver Key Match",
            )

        # Priority 3: fuzzy customer-name comparison (fallback).
        score, reason = calculate_similarity(dit.customer, target.customer)
        if score >= MATCH_THRESHOLD_REVIEW:
            return 3, f"Matched via Name Similarity ({reason})", score, reason
        return None

    def _save_match(self, dit, target_type, target, verdict, tally):
        """Add one ``MatchResult`` to the session and update the run counters."""
        priority, source, score, reason = verdict
        if score >= MATCH_THRESHOLD_AUTO:
            status = MatchStatus.auto_matched
        else:
            status = MatchStatus.pending

        self.db.add(
            MatchResult(
                dit_id=dit.id,
                target_type=target_type,
                target_id=target.id,
                score=score,
                match_priority=priority,
                match_source=source,
                reason=reason,
                status=status,
            )
        )

        tally['match_count'] += 1
        if status == MatchStatus.auto_matched:
            tally['auto_matched'] += 1
        else:
            tally['pending_review'] += 1

    def get_pending_reviews(self) -> List[MatchResult]:
        """Return every match result whose status is still pending."""
        awaiting_review = MatchResult.status == MatchStatus.pending
        return self.db.query(MatchResult).filter(awaiting_review).all()

    def review_match(self, match_id: int, action: str) -> Optional[MatchResult]:
        """Apply a reviewer decision to one match result.

        Args:
            match_id: Primary key of the ``MatchResult`` to review.
            action: ``'accept'`` or ``'reject'``. Any other value leaves the
                status unchanged.

        Returns:
            The match result after the commit, or ``None`` when no row has
            that id.
        """
        by_id = MatchResult.id == match_id
        match = self.db.query(MatchResult).filter(by_id).first()
        if not match:
            return None

        if action == 'accept':
            match.status = MatchStatus.accepted
        elif action == 'reject':
            match.status = MatchStatus.rejected

        self.db.commit()
        return match