# SalesPipeline/backend/app/services/fuzzy_matcher.py

import re
from typing import List, Tuple, Optional
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from app.config import MATCH_THRESHOLD_AUTO, MATCH_THRESHOLD_REVIEW
from app.models.dit import DitRecord
from app.models.sample import SampleRecord
from app.models.order import OrderRecord
from app.models.match import MatchResult, MatchStatus, TargetType, ReviewLog
import pandas as pd
from datetime import timedelta

# Corporate suffixes removed when normalizing customer names.
# Keep the order: calculate_similarity() inspects only the first three entries
# when it labels a "Corporate Suffix Mismatch".
COMPANY_SUFFIXES = [
    # Chinese
    '股份有限公司',
    '有限公司',
    '公司',
    # Japanese
    '株式会社',
    '株式會社',
    # "Co., Ltd." spelling variants
    'Co., Ltd.',
    'Co.,Ltd.',
    'Co. Ltd.',
    'Co.Ltd.',
    'Co., Ltd',
    'Co.,Ltd',
    # Corporation
    'Corporation',
    'Corp.',
    'Corp',
    # Incorporated
    'Inc.',
    'Inc',
    # Limited
    'Limited',
    'Ltd.',
    'Ltd',
    'L.T.D.',
    # Limited liability company
    'LLC',
    'L.L.C.',
]

def sanitize_pn(pn: str) -> str:
    """Return the part number upper-cased, keeping only ASCII letters, digits, ``-`` and ``_``.

    Falsy input (``None``, empty string) yields ``""``. Any other value is
    converted with ``str()`` before cleaning.
    """
    if pn:
        # Hyphen and underscore are kept; every other symbol is dropped.
        cleaned = re.sub(r'[^a-zA-Z0-9\-_]', '', str(pn))
        return cleaned.upper()
    return ""

def normalize_pn_for_matching(pn: str) -> str:
    """Return the comparison key for a part number: ASCII letters and digits only, upper-cased.

    Unlike ``sanitize_pn`` this also removes ``-`` and ``_``. Falsy input
    yields ``""``; any other value is converted with ``str()`` first.
    """
    if pn:
        alnum_only = re.sub(r'[^a-zA-Z0-9]', '', str(pn))
        return alnum_only.upper()
    return ""

def normalize_customer_name(name: str) -> str:
    """Reduce a customer name to an upper-case key with corporate suffixes removed.

    Processing order:
      1. Trim, drop parenthesised notes (half- and full-width brackets) and
         convert full-width spaces. This happens first so that a suffix
         followed by a note such as "(Taiwan)" is at the end of the string
         by the time suffixes are stripped.
      2. Repeatedly strip any trailing entry of ``COMPANY_SUFFIXES``
         (case-insensitive, longest first) together with the separators in
         front of it, until nothing more comes off. Suffixes that start with
         an ASCII letter or digit must begin a new word, so "Zinc" or
         "Unlimited" are left intact; CJK suffixes may attach directly.
      3. Strip a leftover trailing "Co." / "Co.,", collapse whitespace, trim
         trailing dots, commas and spaces, and upper-case the result.

    Args:
        name: Raw customer name.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy.
    """
    if not name:
        return ""

    normalized = name.strip()

    # Remove bracketed notes before suffix handling (see step 1 above).
    normalized = re.sub(r'\([^)]*\)', '', normalized)
    normalized = re.sub(r'（[^）]*）', '', normalized)

    # Full-width space -> regular space.
    normalized = normalized.replace('　', ' ').strip()

    # One end-anchored pattern per suffix, longest suffix first.
    suffix_patterns = []
    for suffix in sorted(COMPANY_SUFFIXES, key=len, reverse=True):
        # Latin suffixes need a word boundary; an ASCII-only class is used
        # instead of \b/\w because \w also matches CJK characters, which would
        # block names like "<CJK name>Ltd".
        boundary = r'(?<![A-Za-z0-9])' if re.match(r'[A-Za-z0-9]', suffix) else ''
        suffix_patterns.append(
            re.compile(r'[,.\s]*' + boundary + re.escape(suffix) + r'$', re.IGNORECASE)
        )

    # Keep stripping until stable so stacked suffixes ("Corp. Ltd.") are all
    # removed regardless of their relative length. Each successful pass
    # shortens the string, so the loop terminates.
    stripped_any = True
    while stripped_any and normalized:
        stripped_any = False
        for pattern in suffix_patterns:
            candidate = pattern.sub('', normalized).strip()
            if candidate != normalized:
                normalized = candidate
                stripped_any = True

    # A trailing "Co." / "Co.," is not in the suffix list; remove it here.
    # The leading separator requirement protects names that merely end in "co".
    normalized = re.sub(r'[,.\s]+Co[.,]*$', '', normalized, flags=re.IGNORECASE)

    # Collapse runs of whitespace.
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    # Final trim of trailing/leading punctuation left behind by the steps above.
    normalized = normalized.strip("., ")

    return normalized.upper()

def calculate_similarity(name1: str, name2: str) -> Tuple[float, str]:
    """Score how alike two customer names are and say which comparison won.

    Both names are normalized with ``normalize_customer_name`` first.

    Returns:
        ``(score, reason)`` where:
          * ``(0.0, "Empty name")`` if either normalized name is empty;
          * ``(100.0, "Exact Match")`` if the normalized names are equal;
          * otherwise the highest of four rapidfuzz scores and the label of
            the first scorer that reached it. When that score is at least 80
            and exactly one of the raw names contains one of the first three
            ``COMPANY_SUFFIXES`` entries, the reason is replaced with
            ``"Corporate Suffix Mismatch"``.
    """
    left = normalize_customer_name(name1)
    right = normalize_customer_name(name2)

    if not (left and right):
        return 0.0, "Empty name"
    if left == right:
        return 100.0, "Exact Match"

    # Listed in tie-break order: max() keeps the first entry with the top score.
    scored = [
        (fuzz.ratio(left, right), "Character Similarity"),
        (fuzz.partial_ratio(left, right), "Partial Match"),
        (fuzz.token_sort_ratio(left, right), "Token Order Match"),
        (fuzz.token_set_ratio(left, right), "Token Set Match"),
    ]
    best_score, reason = max(scored, key=lambda pair: pair[0])

    # For close matches, flag the case where only one raw name carries a
    # common suffix (only the first three list entries are inspected).
    if best_score >= 80:
        suffix_differs = any(
            (suffix in name1) != (suffix in name2)
            for suffix in COMPANY_SUFFIXES[:3]
        )
        if suffix_differs:
            reason = "Corporate Suffix Mismatch"

    return best_score, reason

class FuzzyMatcher:
    """Waterfall matcher that links DIT records to sample and order records.

    Match results are written as ``MatchResult`` rows through the SQLAlchemy
    session supplied at construction time.
    """

    def __init__(self, db: Session):
        self.db = db

    def run_matching(self) -> dict:
        """Rebuild all match results from scratch (waterfall matching).

        Loads every DIT, sample and order record, deletes all existing
        ``ReviewLog`` and ``MatchResult`` rows, then for each DIT record:

        * Samples are candidates when they share the DIT's opportunity ID or
          its normalized part number. Samples dated more than 30 days before
          the DIT date are skipped. Candidates are matched by, in order:
          opportunity ID (priority 1, score 100), ERP account / customer ID
          (priority 2, score 99), fuzzy customer name (priority 3).
        * Orders are candidates when they share the normalized part number and
          are matched by ERP account (priority 2) or fuzzy name (priority 3).

        A result is stored as ``auto_matched`` when its score is at least
        ``MATCH_THRESHOLD_AUTO``, otherwise as ``pending``. Name matches below
        ``MATCH_THRESHOLD_REVIEW`` are not stored.

        Returns:
            dict with the keys ``'match_count'``, ``'auto_matched'`` and
            ``'pending_review'``.

        Raises:
            Any exception raised while deleting, matching or committing is
            re-raised after the session has been rolled back, so the bulk
            deletions are not left pending on the session.
        """
        # 1. Load all DIT records.
        dit_records = self.db.query(DitRecord).all()

        # 2. Load samples and orders and index them for lookup.
        sample_records = self.db.query(SampleRecord).all()
        order_records = self.db.query(OrderRecord).all()

        samples_by_pn = {}
        samples_by_oppy = {}
        for s in sample_records:
            if s.pn:
                samples_by_pn.setdefault(normalize_pn_for_matching(s.pn), []).append(s)
            if s.oppy_no:
                samples_by_oppy.setdefault(s.oppy_no, []).append(s)

        orders_by_pn = {}
        for o in order_records:
            if o.pn:
                orders_by_pn.setdefault(normalize_pn_for_matching(o.pn), []).append(o)

        stats = {
            'match_count': 0,
            'auto_matched': 0,
            'pending_review': 0
        }

        try:
            # 3. Clear previous results (review logs first, then the results).
            self.db.query(ReviewLog).delete()
            self.db.query(MatchResult).delete()

            for dit in dit_records:
                dit_date = pd.to_datetime(dit.date, errors='coerce')
                dit_norm_pn = normalize_pn_for_matching(dit.pn)

                # --- DIT -> Sample ---
                potential_samples = []
                if dit.op_id:
                    potential_samples.extend(samples_by_oppy.get(dit.op_id, []))
                if dit_norm_pn:
                    potential_samples.extend(samples_by_pn.get(dit_norm_pn, []))

                for sample in self._unique_by_id(potential_samples):
                    sample_date = pd.to_datetime(sample.date, errors='coerce')

                    # Time window: skip samples dated more than 30 days before
                    # the DIT date. Only this lower bound is enforced, and only
                    # when both dates parse.
                    if pd.notna(dit_date) and pd.notna(sample_date):
                        if sample_date < (dit_date - timedelta(days=30)):
                            continue

                    # Priority 1: exact opportunity ID (Golden Key).
                    if dit.op_id and sample.oppy_no and dit.op_id == sample.oppy_no:
                        verdict = (
                            1,
                            f"Matched via Opportunity ID: {dit.op_id}",
                            100.0,
                            "Golden Key Match",
                        )
                    # Priority 2 & 3 require the same part number (symbols ignored).
                    elif dit_norm_pn == normalize_pn_for_matching(sample.pn):
                        verdict = self._score_by_account_or_name(dit, sample)
                    else:
                        continue

                    self._record_match(dit, TargetType.SAMPLE, sample, verdict, stats)

                # --- DIT -> Order --- (part-number based only)
                if dit_norm_pn:
                    for order in orders_by_pn.get(dit_norm_pn, []):
                        verdict = self._score_by_account_or_name(dit, order)
                        self._record_match(dit, TargetType.ORDER, order, verdict, stats)

            self.db.commit()
        except Exception:
            # Undo the bulk deletes and any partially added results.
            self.db.rollback()
            raise

        return stats

    @staticmethod
    def _unique_by_id(records) -> list:
        """Return ``records`` without repeated ``id`` values, keeping first-seen order."""
        seen_ids = set()
        unique = []
        for record in records:
            if record.id not in seen_ids:
                seen_ids.add(record.id)
                unique.append(record)
        return unique

    @staticmethod
    def _score_by_account_or_name(dit, target) -> Tuple[int, str, float, str]:
        """Score a same-part-number candidate by ERP account, else by customer name.

        Returns ``(priority, source, score, reason)``; ``priority`` is 0 when
        the name similarity is below ``MATCH_THRESHOLD_REVIEW``.
        """
        # Priority 2: ERP account equals the target's customer ID (Silver Key).
        if dit.erp_account and target.cust_id and dit.erp_account == target.cust_id:
            return (
                2,
                f"Matched via ERP Account: {dit.erp_account}",
                99.0,
                "Silver Key Match",
            )

        # Priority 3: fuzzy customer-name comparison (fallback).
        score, reason = calculate_similarity(dit.customer, target.customer)
        if score >= MATCH_THRESHOLD_REVIEW:
            return 3, f"Matched via Name Similarity ({reason})", score, reason
        return 0, "", score, reason

    def _record_match(self, dit, target_type, target, verdict, stats: dict) -> None:
        """Add a ``MatchResult`` for a positive verdict and update the counters in ``stats``."""
        priority, source, score, reason = verdict
        if priority <= 0:
            return

        status = MatchStatus.auto_matched if score >= MATCH_THRESHOLD_AUTO else MatchStatus.pending
        self.db.add(MatchResult(
            dit_id=dit.id,
            target_type=target_type,
            target_id=target.id,
            score=score,
            match_priority=priority,
            match_source=source,
            reason=reason,
            status=status
        ))

        stats['match_count'] += 1
        if status == MatchStatus.auto_matched:
            stats['auto_matched'] += 1
        else:
            stats['pending_review'] += 1

    def get_pending_reviews(self) -> List[MatchResult]:
        """Return all match results whose status is ``MatchStatus.pending``."""
        return self.db.query(MatchResult).filter(
            MatchResult.status == MatchStatus.pending
        ).all()

    def review_match(self, match_id: int, action: str) -> Optional[MatchResult]:
        """Apply a review decision to one match result.

        Args:
            match_id: Primary key of the ``MatchResult`` to review.
            action: ``'accept'`` sets the status to accepted, ``'reject'`` to
                rejected. Any other value leaves the status unchanged.

        Returns:
            The match result after committing, or ``None`` when no row has
            the given id.
        """
        match = self.db.query(MatchResult).filter(MatchResult.id == match_id).first()
        if not match:
            return None

        if action == 'accept':
            match.status = MatchStatus.accepted
        elif action == 'reject':
            match.status = MatchStatus.rejected

        self.db.commit()
        return match