import re
import unicodedata

import numpy as np
import pandas as pd
import torch
from pyvi import ViTokenizer
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the HF model once and share it across all calls
hf_tokenizer = AutoTokenizer.from_pretrained("dangvantuan/vietnamese-embedding")
hf_model = AutoModel.from_pretrained("dangvantuan/vietnamese-embedding").to(device)


# ============================================================
# 1. EMBEDDING UTILITIES
# ============================================================
def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):
    """
    Embed a single text into a vector with dangvantuan/vietnamese-embedding.
    Uses mean pooling weighted by the attention_mask.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        output = model(**encoded)
    token_embeddings = output.last_hidden_state       # (1, L, H)
    attention_mask = encoded["attention_mask"]        # (1, L)
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeds = (token_embeddings * mask).sum(dim=1)  # (1, H)
    lengths = mask.sum(dim=1)                          # (1, H), same count per dim
    mean_pooled = sum_embeds / lengths
    return mean_pooled.cpu()                           # (1, H)


def cosine_sim(a, b):
    """
    Cosine similarity:
      - a: tensor of shape (1, H)
      - b: tensor of shape (N, H)
    Returns: (1, N)
    """
    a = a / a.norm(dim=-1, keepdim=True)
    b = b / b.norm(dim=-1, keepdim=True)
    return torch.mm(a, b.t())


# ============================================================
# 2. RULES FOR LOCATION AND NUMERIC ATTRIBUTES
# ============================================================
def location_similarity(q_row, cand_row):
    """
    Location similarity based on location_clean (Jaccard over tokens).
    """
    q_tokens = set(str(q_row["location_clean"]).split())
    c_tokens = set(str(cand_row["location_clean"]).split())
    if not q_tokens or not c_tokens:
        return 0.0
    inter = len(q_tokens & c_tokens)
    union = len(q_tokens | c_tokens)
    return inter / union


def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):
    """
    Closeness in terms of:
      - luuluong (traffic level)
      - outdoor / indoor area (log1p + exp(-alpha * dist))
    Returns a value in (0, 1]; larger means closer.
    """
    # closeness on luuluong
    if "luuluong" in q_row and "luuluong" in cand_row:
        if q_row["luuluong"] == cand_row["luuluong"]:
            c_luu = 1.0
        elif abs(q_row["luuluong"] - cand_row["luuluong"]) == 1:
            c_luu = 0.6
        else:
            c_luu = 0.3
    else:
        c_luu = 0.5

    # closeness on area, with safe handling of missing values
    def safe_val(row, col):
        return float(row[col]) if col in row and not pd.isna(row[col]) else 0.0

    q_out = safe_val(q_row, "Dientichngoai")
    q_in = safe_val(q_row, "Dientichtrong")
    c_out = safe_val(cand_row, "Dientichngoai")
    c_in = safe_val(cand_row, "Dientichtrong")

    d_out = abs(np.log1p(q_out) - np.log1p(c_out))
    d_in = abs(np.log1p(q_in) - np.log1p(c_in))
    close_out = np.exp(-alpha_out * d_out)
    close_in = np.exp(-alpha_in * d_in)

    return 0.5 * c_luu + 0.25 * close_out + 0.25 * close_in


# ============================================================
# 3. BUILD TRAIN EMBEDDINGS FROM train_df
# ============================================================
def build_train_embeddings(train_df):
    """
    Expects train_df with a 'job_segmented' column.
    Returns an embeddings tensor of shape (N_train, H).
    """
    train_texts = train_df["job_segmented"].tolist()
    embs = []
    for txt in train_texts:
        vec = embed_text(txt)
        embs.append(vec.squeeze(0))
    return torch.stack(embs)
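
# ------------------------------------------------------------
# Optional sketch: a batched variant of build_train_embeddings.
# Not part of the original pipeline; it assumes the same
# hf_tokenizer / hf_model loaded above, and `batch_size` is an
# illustrative parameter. Mean pooling mirrors embed_text().
# ------------------------------------------------------------
def build_train_embeddings_batched(train_df, batch_size=32):
    texts = train_df["job_segmented"].tolist()
    all_embs = []
    for i in range(0, len(texts), batch_size):
        encoded = hf_tokenizer(
            texts[i:i + batch_size],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            output = hf_model(**encoded)
        token_embeddings = output.last_hidden_state             # (B, L, H)
        mask = encoded["attention_mask"].unsqueeze(-1).float()  # (B, L, 1)
        sum_embeds = (token_embeddings * mask).sum(dim=1)       # (B, H)
        lengths = mask.sum(dim=1)                               # (B, 1)
        all_embs.append((sum_embeds / lengths).cpu())
    return torch.cat(all_embs, dim=0)                           # (N_train, H)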
# ============================================================
# 4. RULE-BASED PREDICTION FOR ONE ROW
# ============================================================
def predict_label_for_row(q_row, train_df, train_embeddings,
                          tokenizer=hf_tokenizer, model=hf_model,
                          top_k=10, w_numeric=0.4, w_loc=0.6):
    """
    Step 1: use embedding(job_segmented) to select the top_k semantically
            closest candidates.
    Step 2: within that top_k, the text score is NOT used any further;
            only the rules are:
      - loc_sim : similarity on location_clean in [0, 1] (most important)
      - num_c   : numeric_closeness (luuluong + areas), in (0, 1]
    final_score = w_loc * loc_sim + w_numeric * num_c
    """
    # 1) embed the query
    q_vec = embed_text(q_row["job_segmented"], tokenizer, model)  # (1, H)

    # 2) cosine similarity against the whole train set, ONLY to pick candidates
    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)

    # 3) take the top-k closest jobs by embedding
    top_k = min(top_k, len(train_df))
    _, top_idx = torch.topk(sims, k=top_k)

    label_scores = {}
    for idx in top_idx:
        cand_row = train_df.iloc[int(idx)]
        loc_sim = location_similarity(q_row, cand_row)
        num_c = numeric_closeness(q_row, cand_row)
        final_score = w_loc * loc_sim + w_numeric * num_c
        lbl = str(cand_row["label"])
        label_scores[lbl] = label_scores.get(lbl, 0.0) + final_score

    # fallback if there are no candidates
    if not label_scores:
        majority_label = str(train_df["label"].value_counts().idxmax())
        return majority_label, {}

    best_label = max(label_scores, key=label_scores.get)
    return best_label, label_scores


# ============================================================
# 5. PREDICT FOR A WHOLE DATAFRAME (FOR TEST/HOLDOUT)
# ============================================================
def predict_on_df(df_in: pd.DataFrame,
                  train_df: pd.DataFrame,
                  train_embeddings,
                  name: str,
                  top_k=5, w_numeric=0.4, w_loc=0.6):
    df = df_in.copy()
    preds = []
    scores = []
    for _, row in df.iterrows():
        pred, sc = predict_label_for_row(
            row,
            train_df=train_df,
            train_embeddings=train_embeddings,
            tokenizer=hf_tokenizer,
            model=hf_model,
            top_k=top_k,
            w_numeric=w_numeric,
            w_loc=w_loc
        )
        preds.append(pred)
        scores.append(sc)

    df["pred_label"] = preds
    df["score_details"] = scores

    print(f"\n========== RESULTS ON {name} ==========")
    print(classification_report(
        df["label"].astype(str),
        df["pred_label"].astype(str),
        digits=3
    ))
    return df


def normalize_text_keep_words(s: str) -> str:
    s = str(s)
    s = unicodedata.normalize("NFC", s).lower()
    s = re.sub(r"[^0-9a-zà-ỹ\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def segment_and_remove_stopwords(text):
    if not isinstance(text, str):
        return ""
    segmented = ViTokenizer.tokenize(text)
    tokens = segmented.split()
    filtered = [tok for tok in tokens if tok not in vietnamese_stopwords]
    return " ".join(filtered)


def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    - Do NOT concatenate Location + Job
    - Clean the text
    - Word-segment Job + remove stopwords
    """
    df = df_raw.dropna(subset=["label"]).copy()
    df["label"] = df["label"].astype(str).str.strip()
    df = df[df["label"].isin(["1", "2", "3", "4"])].copy()

    df["Location"] = df["Location"].fillna("Unknown").astype(str)
    df["Job"] = df["Job"].astype(str)

    # Clean text
    df["location_clean"] = df["Location"].apply(normalize_text_keep_words)
    df["job_clean"] = df["Job"].apply(normalize_text_keep_words)

    # Word segmentation + stopword removal
    df["job_segmented"] = df["job_clean"].apply(segment_and_remove_stopwords)

    # Keep the numeric columns when present
    keep_cols = [
        "Location", "Job", "label",
        "location_clean", "job_clean", "job_segmented"
    ]
    for c in ["Name", "luuluong", "Dientichngoai", "Dientichtrong"]:
        if c in df.columns:
            keep_cols.append(c)

    return df[keep_cols].reset_index(drop=True)
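
# ------------------------------------------------------------
# Illustrative smoke test for the cleaning step. The example
# strings are made up, not taken from the dataset; call this
# manually after import (segment_and_remove_stopwords needs
# vietnamese_stopwords, which is loaded further below).
# ------------------------------------------------------------
def _demo_cleaning():
    # punctuation is stripped; case and whitespace are normalized
    assert normalize_text_keep_words("Hành lang - Tầng 3!") == "hành lang tầng 3"
    # pyvi joins multi-word tokens with "_" before stopword removal
    print(segment_and_remove_stopwords("quét dọn hành lang"))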
def predict_single_sample(job_segmented: str,
                          location_clean: str,
                          name: str,
                          luuluong: int,
                          dientichngoai: float,
                          dientichtrong: float,
                          train_df: pd.DataFrame,
                          train_embeddings,
                          top_k=5, w_numeric=0.4, w_loc=0.6):
    """
    Demo prediction for a single new data point.
    - job_segmented: string already word-segmented + stopwords removed
      (same format as train_df['job_segmented'])
    - location_clean: normalized location string
      (same format as train_df['location_clean'])
    - name, luuluong, dientichngoai, dientichtrong: building/job attributes
    """
    # Build a Series that mimics one row of df_out
    q_dict = {
        "job_segmented": job_segmented,
        "location_clean": location_clean,
        "Name": name,
        "luuluong": luuluong,
        "Dientichngoai": dientichngoai,
        "Dientichtrong": dientichtrong,
    }
    q_row = pd.Series(q_dict)

    pred_label, label_scores = predict_label_for_row(
        q_row,
        train_df=train_df,
        train_embeddings=train_embeddings,
        tokenizer=hf_tokenizer,
        model=hf_model,
        top_k=top_k,
        w_numeric=w_numeric,
        w_loc=w_loc
    )
    return pred_label, label_scores


# ==== Load stopwords ====
def load_stopwords(path):
    with open(path, "r", encoding="utf-8") as f:
        sw = [line.strip() for line in f if line.strip()]
    return set(sw)


stopwords_path = "vietnamese-stopwords-dash.txt"  # change if your file is named differently
vietnamese_stopwords = load_stopwords(stopwords_path)


def chia_train_test(df_out: pd.DataFrame):
    # This experiment uses labels 1-3 only (label 4 is dropped here,
    # even though preprocess_pipeline keeps it)
    df_out = df_out[df_out["label"].astype(str).isin(["1", "2", "3"])].reset_index(drop=True)

    test_buildings = ["CMC"]  # main test set

    test_df = df_out[df_out["Name"].isin(test_buildings)].reset_index(drop=True)
    train_df = df_out[~df_out["Name"].isin(test_buildings)].reset_index(drop=True)
    return train_df, test_df


if __name__ == "__main__":
    df_raw = pd.read_excel("hoanmy_detect_task.xlsx")
    df = preprocess_pipeline(df_raw)
    train_df, test_df = chia_train_test(df)
    train_embeddings = build_train_embeddings(train_df)

    # 4) Demo: predict one row of test_df (e.g. the first row)
    example_row = test_df.iloc[0]
    pred, scores = predict_label_for_row(
        example_row,
        train_df=train_df,
        train_embeddings=train_embeddings,
        top_k=5,
        w_numeric=0.4,
        w_loc=0.6
    )
    print("\n=== DEMO: PREDICT ONE ROW ===")
    print("Job       :", example_row["Job"])
    print("Segm job  :", example_row["job_segmented"])
    print("Location  :", example_row["Location"])
    print("True label:", example_row["label"])
    print("Pred label:", pred)
    print("Label scores:", scores)

    # 5) Or demo with a hand-crafted input
    # job_seg_demo = "quét_dọn hành_lang lau_chùi vệ_sinh"
    # loc_clean_demo = "hành lang tầng 3"
    # pred2, scores2 = predict_single_sample(
    #     job_segmented=job_seg_demo,
    #     location_clean=loc_clean_demo,
    #     name="CMC",
    #     luuluong=2,
    #     dientichngoai=0,
    #     dientichtrong=1500,
    #     train_df=train_df,
    #     train_embeddings=train_embeddings,
    #     top_k=5,
    #     w_numeric=0.4,
    #     w_loc=0.6
    # )
    # print("\n=== DEMO: PREDICT A HAND-CRAFTED INPUT ===")
    # print("Pred label:", pred2)
    # print("Label scores:", scores2)
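
    # 6) Evaluate on the whole held-out set. Suggested addition:
    #    predict_on_df is defined above but never called in the
    #    original demo; "TEST (CMC)" is just a display name.
    predict_on_df(
        test_df,
        train_df=train_df,
        train_embeddings=train_embeddings,
        name="TEST (CMC)",
        top_k=5,
        w_numeric=0.4,
        w_loc=0.6
    )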