first commit
This commit is contained in:
246
backend/app/routers/etl.py
Normal file
246
backend/app/routers/etl.py
Normal file
@@ -0,0 +1,246 @@
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
import pandas as pd
|
||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
|
||||
from sqlalchemy.orm import Session
|
||||
from pydantic import BaseModel
|
||||
from app.models import get_db
|
||||
from app.models.dit import DitRecord
|
||||
from app.models.sample import SampleRecord
|
||||
from app.models.order import OrderRecord
|
||||
from app.models.match import MatchResult, TargetType, ReviewLog
|
||||
from app.config import UPLOAD_DIR
|
||||
from app.services.excel_parser import excel_parser
|
||||
from app.services.fuzzy_matcher import normalize_customer_name, sanitize_pn
|
||||
|
||||
# All ETL endpoints (upload, preview, import, data readback) are mounted under /etl.
router = APIRouter(prefix="/etl", tags=["ETL"])
|
||||
|
||||
class ParsedFileResponse(BaseModel):
    """Parse result for an uploaded Excel file, including a short row preview."""
    file_id: str        # server-assigned id for the cached parse (used by /preview and /import)
    file_type: str      # one of 'dit', 'sample', 'order'
    filename: str       # original upload filename
    header_row: int     # index of the detected header row — assumes excel_parser's convention; TODO confirm 0- vs 1-based
    row_count: int      # number of data rows parsed
    preview: List[dict] # first rows as column-name -> value dicts
|
||||
|
||||
class ImportRequest(BaseModel):
    """Request body for /etl/import: references a previously uploaded file."""
    file_id: str  # id returned by /etl/upload
|
||||
|
||||
class ImportResponse(BaseModel):
    """Response body for /etl/import."""
    imported_count: int  # number of rows actually inserted (duplicates/blank rows skipped)
|
||||
|
||||
@router.post("/upload", response_model=ParsedFileResponse)
async def upload_file(
    file: UploadFile = File(...),
    file_type: str = Form(...),
    db: Session = Depends(get_db)
):
    """Upload an Excel file, save it under UPLOAD_DIR, parse it and return a preview.

    Raises:
        HTTPException 400: unknown ``file_type`` or the parser rejected the file.
    """
    if file_type not in ['dit', 'sample', 'order']:
        raise HTTPException(status_code=400, detail="Invalid file type")

    # Save the upload. The client-supplied filename is untrusted: strip any
    # directory components (e.g. "../../etc/passwd") so the file can only
    # land directly inside UPLOAD_DIR, and fall back to a default name when
    # the client sent none.
    safe_name = Path(file.filename or "upload.xlsx").name
    file_path = UPLOAD_DIR / safe_name
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        # Parse and cache the result; file_id keys the cache for later calls.
        file_id, file_info = excel_parser.parse_file(file_path, file_type)

        return ParsedFileResponse(
            file_id=file_id,
            file_type=file_info['file_type'],
            filename=file_info['filename'],
            header_row=file_info['header_row'],
            row_count=file_info['row_count'],
            preview=file_info['preview']
        )
    except Exception as e:
        # Surface parser failures as a client error rather than a 500.
        raise HTTPException(status_code=400, detail=f"Failed to parse file: {str(e)}")
|
||||
|
||||
@router.get("/preview/{file_id}", response_model=ParsedFileResponse)
def get_preview(file_id: str):
    """Return the cached parse preview for a previously uploaded file.

    Raises:
        HTTPException 404: no cached parse exists for ``file_id``.
    """
    info = excel_parser.get_file_info(file_id)
    if not info:
        raise HTTPException(status_code=404, detail="File not found")

    # Every response field maps 1:1 onto a key of the cached info dict.
    field_names = ('file_id', 'file_type', 'filename',
                   'header_row', 'row_count', 'preview')
    return ParsedFileResponse(**{name: info[name] for name in field_names})
|
||||
|
||||
def clean_value(val, default=''):
    """Normalize a raw spreadsheet cell to a stripped string.

    Returns ``default`` for missing values: ``None``, float NaN, and the
    textual placeholders ``'nan'``/``'nat'``/``'none'``/``'null'``/``''``
    (case-insensitive). Everything else is returned as ``str(val).strip()``.

    Fix over the original: pandas ``NaT`` (missing timestamps) previously
    slipped through as the literal string ``'NaT'`` because only ``float``
    NaN was caught; ``'nat'`` is now in the sentinel set as well.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return default
    str_val = str(val).strip()
    # str() of pd.NaT is 'NaT', of numpy nan is 'nan' — treat both as missing.
    if str_val.lower() in ('nan', 'nat', 'none', 'null', ''):
        return default
    return str_val
|
||||
|
||||
|
||||
@router.post("/import", response_model=ImportResponse)
def import_data(request: ImportRequest, db: Session = Depends(get_db)):
    """Import the parsed rows of a previously uploaded file into the database.

    Within one transaction: first clears existing records of the same type
    (plus dependent MatchResult/ReviewLog rows), then inserts one ORM record
    per valid row, skipping blank rows and in-file duplicates. Bad rows are
    logged and skipped so a single malformed row cannot abort the import.

    Raises:
        HTTPException 404: unknown file_id, or its parsed data is gone.
        HTTPException 500: clearing old data or the final commit failed.
    """
    import traceback

    try:
        file_info = excel_parser.get_file_info(request.file_id)
        if not file_info:
            print(f"[ETL Import] Error: File not found for file_id={request.file_id}")
            raise HTTPException(status_code=404, detail="File not found")

        df = excel_parser.get_parsed_data(request.file_id)
        if df is None:
            print(f"[ETL Import] Error: Parsed data not found for file_id={request.file_id}")
            raise HTTPException(status_code=404, detail="Parsed data not found")

        print(f"[ETL Import] Starting import: file_type={file_info['file_type']}, rows={len(df)}")

        file_type = file_info['file_type']
        imported_count = 0
        seen_ids = set()  # IDs already processed; used to skip duplicates within the file

        # Clear old data of this type first to avoid duplicate-key conflicts.
        try:
            if file_type == 'dit':
                print("[ETL Import] Clearing old DIT records and dependent matches/logs...")
                # Delete review logs and match results referencing DIT rows first.
                db.query(ReviewLog).delete()
                db.query(MatchResult).delete()
                db.query(DitRecord).delete()
            elif file_type == 'sample':
                print("[ETL Import] Clearing old Sample records and dependent matches/logs...")
                # ReviewLog rows are linked through MatchResult. Since the whole
                # category is being wiped anyway, deleting every ReviewLog plus
                # the SAMPLE-typed match results is the safest order.
                db.query(ReviewLog).delete()
                db.query(MatchResult).filter(MatchResult.target_type == TargetType.SAMPLE).delete()
                db.query(SampleRecord).delete()
            elif file_type == 'order':
                print("[ETL Import] Clearing old Order records and dependent matches/logs...")
                db.query(ReviewLog).delete()
                db.query(MatchResult).filter(MatchResult.target_type == TargetType.ORDER).delete()
                db.query(OrderRecord).delete()
            db.flush()  # flush (not commit) so the clear stays in the same transaction as the inserts
            print("[ETL Import] Old data cleared successfully.")
        except Exception as e:
            db.rollback()
            print(f"[ETL Import] Error clearing old data: {traceback.format_exc()}")
            raise HTTPException(status_code=500, detail=f"Failed to clear old data: {str(e)}")

        for idx, row in df.iterrows():
            try:
                if file_type == 'dit':
                    op_id = clean_value(row.get('op_id'), '')
                    erp_account = clean_value(row.get('erp_account'), '')
                    customer = clean_value(row.get('customer'))
                    pn = clean_value(row.get('pn'))
                    # Skip blank rows and duplicate op_id + pn combinations.
                    unique_key = f"{op_id}|{pn}"
                    if not op_id or unique_key in seen_ids:
                        continue
                    seen_ids.add(unique_key)
                    record = DitRecord(
                        op_id=op_id,
                        erp_account=erp_account,
                        customer=customer,
                        customer_normalized=normalize_customer_name(customer),
                        pn=sanitize_pn(pn),
                        # Truthiness + isna guard: blank/NaN cells fall back to 0.
                        eau=int(row.get('eau', 0)) if row.get('eau') and not pd.isna(row.get('eau')) else 0,
                        stage=clean_value(row.get('stage')),
                        date=clean_value(row.get('date'))
                    )
                elif file_type == 'sample':
                    # Missing sample_id gets a synthetic row-index-based id.
                    sample_id = clean_value(row.get('sample_id'), f'S{idx}')
                    oppy_no = clean_value(row.get('oppy_no'), '')
                    cust_id = clean_value(row.get('cust_id'), '')
                    customer = clean_value(row.get('customer'))
                    pn = clean_value(row.get('pn'))
                    # Skip duplicate sample_id values.
                    if sample_id in seen_ids:
                        continue
                    seen_ids.add(sample_id)
                    record = SampleRecord(
                        sample_id=sample_id,
                        order_no=clean_value(row.get('order_no')),
                        oppy_no=oppy_no,
                        cust_id=cust_id,
                        customer=customer,
                        customer_normalized=normalize_customer_name(customer),
                        pn=sanitize_pn(pn),
                        qty=int(row.get('qty', 0)) if row.get('qty') and not pd.isna(row.get('qty')) else 0,
                        date=clean_value(row.get('date'))
                    )
                elif file_type == 'order':
                    # Missing order_id gets a synthetic row-index-based id.
                    order_id = clean_value(row.get('order_id'), f'O{idx}')
                    cust_id = clean_value(row.get('cust_id'), '')
                    customer = clean_value(row.get('customer'))
                    pn = clean_value(row.get('pn'))
                    # Skip duplicate order_id values.
                    if order_id in seen_ids:
                        continue
                    seen_ids.add(order_id)
                    record = OrderRecord(
                        order_id=order_id,
                        order_no=clean_value(row.get('order_no')),
                        cust_id=cust_id,
                        customer=customer,
                        customer_normalized=normalize_customer_name(customer),
                        pn=sanitize_pn(pn),
                        qty=int(row.get('qty', 0)) if row.get('qty') and not pd.isna(row.get('qty')) else 0,
                        status=clean_value(row.get('status'), 'Backlog'),
                        amount=float(row.get('amount', 0)) if row.get('amount') and not pd.isna(row.get('amount')) else 0
                    )
                else:
                    continue

                db.add(record)
                imported_count += 1
                if imported_count % 500 == 0:
                    print(f"[ETL Import] Processed {imported_count} rows...")
            except Exception as e:
                # Bad rows are logged and skipped; the import continues.
                print(f"[ETL Import] Error importing row {idx}: {e}")
                continue

        try:
            print(f"[ETL Import] Committing {imported_count} records...")
            db.commit()
            print(f"[ETL Import] Import successful: {imported_count} records.")
        except Exception as e:
            db.rollback()
            print(f"[ETL Import] Commit Error: {traceback.format_exc()}")
            raise HTTPException(status_code=500, detail=f"Failed to commit data: {str(e)}")

        return ImportResponse(imported_count=imported_count)
    except HTTPException:
        # Re-raise intended HTTP errors untouched.
        raise
    except Exception as e:
        print(f"[ETL Import] Unhandled Exception: {traceback.format_exc()}")
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
||||
|
||||
@router.get("/data/{data_type}")
def get_data(data_type: str, db: Session = Depends(get_db)):
    """Return every imported record of the requested type as plain dicts.

    Raises:
        HTTPException 400: ``data_type`` is not one of dit/sample/order.
    """
    model_by_type = {
        'dit': DitRecord,
        'sample': SampleRecord,
        'order': OrderRecord,
    }
    model = model_by_type.get(data_type)
    if model is None:
        raise HTTPException(status_code=400, detail="Invalid data type")

    rows = db.query(model).all()
    # Serialize each ORM row by walking its mapped table columns.
    return [
        {column.name: getattr(row, column.name) for column in row.__table__.columns}
        for row in rows
    ]
|
||||
Reference in New Issue
Block a user