first

2025-11-16 15:09:40 +07:00 · 2025-11-16 15:09:40 +07:00 · 6f8847e36d
commit 6f8847e36d
40 changed files with 132469 additions and 0 deletions
--- a/cd.txt
+++ b/cd.txt
--- a/2025_01.01.2025-31.12.2025.doc
+++ b/2025_01.01.2025-31.12.2025.doc
--- a/Al/BIDV/bản
+++ b/Al/BIDV/bản
--- a/Al/CMC/Báo
+++ b/Al/CMC/Báo
--- a/Al/CMC/Hợp
+++ b/Al/CMC/Hợp
--- a/point/2024
+++ b/point/2024
--- a/(Final).doc
+++ b/(Final).doc
--- a/(Final).doc
+++ b/(Final).doc
--- a/Al/HCO(SAS)/BÁO
+++ b/Al/HCO(SAS)/BÁO
--- a/Al/HCO(SAS)/HĐ
+++ b/Al/HCO(SAS)/HĐ
--- a/Al/HH4/2019
+++ b/Al/HH4/2019
--- a/Al/HH4/Hợp
+++ b/Al/HH4/Hợp
--- a/Al/HH4/~$19
+++ b/Al/HH4/~$19
--- a/Kong.docx
+++ b/Kong.docx
--- a/HONGKONG.docx
+++ b/HONGKONG.docx
--- a/Al/Keangnam/HĐ
+++ b/Al/Keangnam/HĐ
--- a/25.5.2016.docx
+++ b/25.5.2016.docx
--- a/Al/Sunred/Báo
+++ b/Al/Sunred/Báo
--- a/Al/Sunred/HD
+++ b/Al/Sunred/HD
--- a/Võ/2025-HD
+++ b/Võ/2025-HD
--- a/Võ/20250517-HD
+++ b/Võ/20250517-HD
--- a/vấn.docx
+++ b/vấn.docx
--- a/vấn.docx
+++ b/vấn.docx
--- a/Al/VIGALCERA/BÁO
+++ b/Al/VIGALCERA/BÁO
--- a/Al/VIGALCERA/HĐ
+++ b/Al/VIGALCERA/HĐ
--- a/demo.py
+++ b/demo.py
@ -0,0 +1,375 @@
+import pandas as pd
+import re
+import unicodedata
+from pyvi import ViTokenizer
+import pandas as pd
+import numpy as np
+import re
+import unicodedata
+from pyvi import ViTokenizer
+from transformers import AutoTokenizer, AutoModel
+import torch
+from sklearn.metrics import classification_report
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+
+# Load the Hugging Face tokenizer and model once at import time; every
+# function below shares these two objects through its default arguments.
+_HF_MODEL_NAME = "dangvantuan/vietnamese-embedding"
+hf_tokenizer = AutoTokenizer.from_pretrained(_HF_MODEL_NAME)
+hf_model = AutoModel.from_pretrained(_HF_MODEL_NAME).to(device)
+
+
+
+def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):
+    """
+    Embed a sentence into a vector using mean pooling over the attention mask.
+
+    The text is tokenized (padded, truncated to at most 128 tokens), moved to
+    `device`, and passed through `model` with gradient tracking disabled.
+    The last hidden states are then averaged over the token positions that
+    the attention mask marks as real.
+
+    Returns the pooled tensor on the CPU; for one sentence the shape is (1, H).
+    """
+    batch = tokenizer(
+        text,
+        padding=True,
+        truncation=True,
+        max_length=128,
+        return_tensors="pt",
+    ).to(device)
+
+    with torch.no_grad():
+        hidden = model(**batch).last_hidden_state  # (1, L, H)
+
+    # Expand the (1, L) attention mask across the hidden dimension so it can
+    # be used both to zero out masked positions and to count the real ones.
+    weights = batch["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
+    pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1)
+
+    return pooled.cpu()
+
+
+def cosine_sim(a, b):
+    """
+    Compute the cosine similarity between one query vector and many candidates.
+
+    Args:
+        a: float tensor of shape (1, H).
+        b: float tensor of shape (N, H).
+
+    Returns:
+        Tensor of shape (1, N) holding the cosine similarity of `a` with each
+        row of `b`.
+
+    Rows are L2-normalised with `torch.nn.functional.normalize`, which clamps
+    the norm to a small epsilon. An all-zero row therefore yields a similarity
+    of 0 instead of NaN (a plain division by the norm would produce NaN, and
+    NaN values would then distort any ranking built on the result).
+    """
+    a_unit = torch.nn.functional.normalize(a, p=2, dim=-1)
+    b_unit = torch.nn.functional.normalize(b, p=2, dim=-1)
+    return torch.mm(a_unit, b_unit.t())
+
+
+# ============================================================
+# 2. RULE VỀ LOCATION VÀ THÔNG SỐ
+# ============================================================
+
+def location_similarity(q_row, cand_row):
+    """
+    Jaccard similarity between the whitespace tokens of two locations.
+
+    Both rows are read through their "location_clean" entry, which is
+    converted to a string and split on whitespace. Returns 0.0 when either
+    side has no tokens, otherwise |intersection| / |union|.
+    """
+    query_tokens = set(str(q_row["location_clean"]).split())
+    cand_tokens = set(str(cand_row["location_clean"]).split())
+    if not (query_tokens and cand_tokens):
+        return 0.0
+    shared = query_tokens.intersection(cand_tokens)
+    combined = query_tokens.union(cand_tokens)
+    return len(shared) / len(combined)
+
+
+def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):
+    """
+    Score how close two rows are on their numeric attributes.
+
+    Three components are combined:
+      - "luuluong": 1.0 when equal, 0.6 when the values differ by exactly 1,
+        0.3 otherwise, and 0.5 when the value is unavailable on either side
+        (column absent or NaN).
+      - "Dientichngoai" and "Dientichtrong": exp(-alpha * |log1p(q) - log1p(c)|),
+        where an absent or NaN value is treated as 0.0.
+
+    Args:
+        q_row: query row (any mapping supporting `in` and `[]`, e.g. a
+            pandas Series or a dict).
+        cand_row: candidate row, same kind of object as `q_row`.
+        alpha_out: decay rate applied to the "Dientichngoai" log-distance.
+        alpha_in: decay rate applied to the "Dientichtrong" log-distance.
+
+    Returns:
+        0.5 * luuluong score + 0.25 * outer-area score + 0.25 * inner-area
+        score. With the weights above the result is at most 1.0; larger means
+        closer.
+    """
+
+    def present(row, col):
+        """Return row[col], or None when the column is absent or NaN."""
+        if col not in row:
+            return None
+        value = row[col]
+        return None if pd.isna(value) else value
+
+    def area(row, col):
+        """Return row[col] as a float, using 0.0 for an unavailable value."""
+        value = present(row, col)
+        return 0.0 if value is None else float(value)
+
+    # Closeness on luuluong. A NaN is handled like a missing column so that
+    # "unknown" always maps to the neutral 0.5 instead of the "far" 0.3.
+    q_luu = present(q_row, "luuluong")
+    cand_luu = present(cand_row, "luuluong")
+    if q_luu is None or cand_luu is None:
+        c_luu = 0.5
+    elif q_luu == cand_luu:
+        c_luu = 1.0
+    elif abs(q_luu - cand_luu) == 1:
+        c_luu = 0.6
+    else:
+        c_luu = 0.3
+
+    # Closeness on the two areas, compared on a log1p scale.
+    dist_out = abs(np.log1p(area(q_row, "Dientichngoai"))
+                   - np.log1p(area(cand_row, "Dientichngoai")))
+    dist_in = abs(np.log1p(area(q_row, "Dientichtrong"))
+                  - np.log1p(area(cand_row, "Dientichtrong")))
+
+    close_out = np.exp(-alpha_out * dist_out)
+    close_in = np.exp(-alpha_in * dist_in)
+
+    return 0.5 * c_luu + 0.25 * close_out + 0.25 * close_in
+
+
+# ============================================================
+# 3. TRAIN EMBEDDINGS TỪ train_df
+# ============================================================
+
+def build_train_embeddings(train_df):
+    """
+    Embed every training sentence and stack the results into one tensor.
+
+    Reads the 'job_segmented' column of `train_df`, embeds each entry with
+    `embed_text`, drops the leading batch dimension of each result and stacks
+    them, giving a tensor of shape (N_train, H).
+    """
+    vectors = [
+        embed_text(sentence).squeeze(0)
+        for sentence in train_df["job_segmented"].tolist()
+    ]
+    return torch.stack(vectors)
+
+
+# ============================================================
+# 4. RULE DỰ ĐOÁN CHO 1 ROW
+# ============================================================
+
+def predict_label_for_row(q_row,
+                          train_df,
+                          train_embeddings,
+                          tokenizer=hf_tokenizer,
+                          model=hf_model,
+                          top_k=10,
+                          w_numeric=0.4,
+                          w_loc=0.6):
+    """
+    Predict a label for one query row with a two-stage nearest-neighbour rule.
+
+    Stage 1: embed q_row["job_segmented"] and keep the `top_k` training rows
+    with the highest cosine similarity. The text similarity is used only to
+    select these candidates.
+    Stage 2: score each candidate without the text similarity, using
+        final_score = w_loc * location_similarity + w_numeric * numeric_closeness
+    and sum the scores per candidate label.
+
+    Returns:
+        (best_label, label_scores), where label_scores maps each candidate
+        label (as a string) to its summed score. When no candidate was scored,
+        the most frequent training label is returned with an empty dict.
+    """
+    # Stage 1: candidate selection by embedding similarity.
+    query_vec = embed_text(q_row["job_segmented"], tokenizer, model)  # (1, H)
+    similarities = cosine_sim(query_vec, train_embeddings)[0]         # (N,)
+    k = min(top_k, len(train_df))
+    _, nearest = torch.topk(similarities, k=k)
+
+    # Stage 2: accumulate the location + numeric score per label.
+    totals = {}
+    for position in nearest.tolist():
+        candidate = train_df.iloc[position]
+        loc_part = location_similarity(q_row, candidate)
+        num_part = numeric_closeness(q_row, candidate)
+        key = str(candidate["label"])
+        totals[key] = totals.get(key, 0.0) + (w_loc * loc_part + w_numeric * num_part)
+
+    if totals:
+        return max(totals, key=totals.get), totals
+
+    # Fallback when there were no candidates: majority label of the train set.
+    return str(train_df["label"].value_counts().idxmax()), {}
+
+
+# ============================================================
+# 5. PREDICT CHO CẢ 1 DATAFRAME (DÙNG CHO TEST/HOLDOUT)
+# ============================================================
+
+def predict_on_df(df_in: pd.DataFrame,
+                  train_df: pd.DataFrame,
+                  train_embeddings,
+                  name: str,
+                  top_k=5,
+                  w_numeric=0.4,
+                  w_loc=0.6):
+    """
+    Predict a label for every row of `df_in` and print a classification report.
+
+    Works on a copy of `df_in`. Each row is passed to `predict_label_for_row`;
+    the predicted label is stored in a new "pred_label" column and the
+    per-label score dict in "score_details". The report compares the "label"
+    column with "pred_label" and is printed under a banner containing `name`.
+
+    Returns the copied DataFrame with the two extra columns.
+    """
+    df = df_in.copy()
+
+    results = [
+        predict_label_for_row(
+            row,
+            train_df=train_df,
+            train_embeddings=train_embeddings,
+            tokenizer=hf_tokenizer,
+            model=hf_model,
+            top_k=top_k,
+            w_numeric=w_numeric,
+            w_loc=w_loc,
+        )
+        for _, row in df.iterrows()
+    ]
+
+    df["pred_label"] = [label for label, _ in results]
+    df["score_details"] = [details for _, details in results]
+
+    print(f"\n========== KẾT QUẢ TRÊN {name} ==========")
+    report = classification_report(
+        df["label"].astype(str),
+        df["pred_label"].astype(str),
+        digits=3,
+    )
+    print(report)
+
+    return df
+
+def normalize_text_keep_words(s: str) -> str:
+    """
+    Normalise text to lowercase words separated by single spaces.
+
+    The input is converted to `str`, NFC-normalised and lowercased. Every
+    character that is not a digit, a-z, in the à-ỹ range, or whitespace is
+    replaced by a space; whitespace runs are then collapsed and the ends
+    trimmed.
+    """
+    lowered = unicodedata.normalize('NFC', str(s)).lower()
+    words_only = re.sub(r"[^0-9a-zà-ỹ\s]", " ", lowered)
+    return re.sub(r"\s+", " ", words_only).strip()
+
+def segment_and_remove_stopwords(text):
+    """
+    Word-segment `text` with ViTokenizer and drop stopword tokens.
+
+    Returns "" for any non-string input. Otherwise the segmented text is
+    split on whitespace, tokens found in `vietnamese_stopwords` are removed,
+    and the remaining tokens are joined with single spaces.
+    """
+    if isinstance(text, str):
+        kept = (
+            token
+            for token in ViTokenizer.tokenize(text).split()
+            if token not in vietnamese_stopwords
+        )
+        return " ".join(kept)
+    return ""
+
+def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean the raw table and derive the text columns used by the model.
+
+    Steps:
+      - drop rows without a label, strip the label text and keep only the
+        labels "1".."4";
+      - fill missing Location with "Unknown"; Location and Job are kept as
+        separate string columns (they are not concatenated);
+      - add "location_clean" and "job_clean" (normalised text);
+      - add "job_segmented" (word-segmented job text without stopwords).
+
+    Returns a re-indexed DataFrame with the text columns plus whichever of
+    "Name", "luuluong", "Dientichngoai", "Dientichtrong" exist in the input.
+    """
+    labelled = df_raw.dropna(subset=["label"]).copy()
+    labelled["label"] = labelled["label"].astype(str).str.strip()
+    valid_label = labelled["label"].isin(["1","2","3","4"])
+    df = labelled[valid_label].copy()
+
+    df["Location"] = df["Location"].fillna("Unknown").astype(str)
+    df["Job"] = df["Job"].astype(str)
+
+    # Normalised text for the location and for the job description.
+    df["location_clean"] = df["Location"].apply(normalize_text_keep_words)
+    df["job_clean"] = df["Job"].apply(normalize_text_keep_words)
+
+    # Word segmentation + stopword removal on the cleaned job text.
+    df["job_segmented"] = df["job_clean"].apply(segment_and_remove_stopwords)
+
+    # Always keep the text columns; keep the metadata columns only if present.
+    text_cols = [
+        "Location","Job","label",
+        "location_clean","job_clean","job_segmented"
+    ]
+    extra_cols = [
+        col
+        for col in ["Name","luuluong","Dientichngoai","Dientichtrong"]
+        if col in df.columns
+    ]
+
+    return df[text_cols + extra_cols].reset_index(drop=True)
+# ==== Single-sample prediction helper ====
+def predict_single_sample(job_segmented: str,
+                          location_clean: str,
+                          name: str,
+                          luuluong: int,
+                          dientichngoai: float,
+                          dientichtrong: float,
+                          train_df: pd.DataFrame,
+                          train_embeddings,
+                          top_k=5,
+                          w_numeric=0.4,
+                          w_loc=0.6):
+    """
+    Predict the label of one hand-built data point.
+
+    - job_segmented: job text already word-segmented with stopwords removed
+      (same form as train_df['job_segmented'])
+    - location_clean: location text already normalised
+      (same form as train_df['location_clean'])
+    - name, luuluong, dientichngoai, dientichtrong: building/job attributes
+
+    The arguments are packed into a pandas Series that mimics one processed
+    row, which is then passed to `predict_label_for_row`. Returns its
+    (pred_label, label_scores) result unchanged.
+    """
+    # Fake row with the same keys as a row of the processed DataFrame.
+    q_row = pd.Series({
+        "job_segmented": job_segmented,
+        "location_clean": location_clean,
+        "Name": name,
+        "luuluong": luuluong,
+        "Dientichngoai": dientichngoai,
+        "Dientichtrong": dientichtrong,
+    })
+
+    return predict_label_for_row(
+        q_row,
+        train_df=train_df,
+        train_embeddings=train_embeddings,
+        tokenizer=hf_tokenizer,
+        model=hf_model,
+        top_k=top_k,
+        w_numeric=w_numeric,
+        w_loc=w_loc,
+    )
+
+
+def load_stopwords(path):
+    """
+    Read a UTF-8 stopword file (one entry per line) into a set.
+
+    Each line is stripped of surrounding whitespace; lines that are empty
+    after stripping are skipped.
+    """
+    with open(path, "r", encoding="utf-8") as handle:
+        return {entry for entry in map(str.strip, handle) if entry}
+
+# Stopword set used by segment_and_remove_stopwords; loaded once at import time.
+stopwords_path = "vietnamese-stopwords-dash.txt"   # change this if your stopword file has a different name
+vietnamese_stopwords = load_stopwords(stopwords_path)
+
+def chia_train_test(df_out: pd.DataFrame, test_buildings=("CMC",)):
+    """
+    Split the processed table into train and test sets by building name.
+
+    Only rows whose label is "1", "2" or "3" are kept. Rows whose "Name" is
+    in `test_buildings` form the test set; all remaining rows form the
+    training set. Both results are re-indexed from 0.
+
+    Args:
+        df_out: processed DataFrame with "label" and "Name" columns.
+        test_buildings: building name, or iterable of names, to hold out for
+            testing. Defaults to ("CMC",), the split this script originally
+            hard-coded.
+
+    Returns:
+        (train_df, test_df)
+    """
+    # Accept a single name as well as a collection of names.
+    if isinstance(test_buildings, str):
+        buildings = [test_buildings]
+    else:
+        buildings = list(test_buildings)
+
+    labelled = df_out[df_out["label"].astype(str).isin(["1", "2", "3"])].reset_index(drop=True)
+
+    is_test = labelled["Name"].isin(buildings)
+    test_df = labelled[is_test].reset_index(drop=True)
+    train_df = labelled[~is_test].reset_index(drop=True)
+
+    return train_df, test_df
+
+if __name__ == "__main__":
+    # End-to-end demo: load the spreadsheet, preprocess it, split it by
+    # building, embed the training rows and predict one held-out row.
+    df_raw = pd.read_excel("hoanmy_detect_task.xlsx")
+    df = preprocess_pipeline(df_raw)
+
+    train_df, test_df = chia_train_test(df)
+
+    # Stop with a clear message instead of an IndexError from iloc[0] (or a
+    # failure while stacking zero embeddings) when a split has no rows.
+    if train_df.empty or test_df.empty:
+        raise SystemExit(
+            "Train/test split produced an empty set "
+            f"(train={len(train_df)}, test={len(test_df)}); "
+            "check the 'Name' and 'label' columns of the input file."
+        )
+
+    train_embeddings = build_train_embeddings(train_df)
+
+    # Demo: predict the label of one row of test_df (here the first row).
+    example_row = test_df.iloc[0]
+    pred, scores = predict_label_for_row(
+        example_row,
+        train_df=train_df,
+        train_embeddings=train_embeddings,
+        top_k=5,
+        w_numeric=0.4,
+        w_loc=0.6
+    )
+
+    print("\n=== DEMO PREDICT 1 DÒNG ===")
+    print("Job       :", example_row["Job"])
+    print("Segm job  :", example_row["job_segmented"])
+    print("Location  :", example_row["Location"])
+    print("True label:", example_row["label"])
+    print("Pred label:", pred)
+    print("Label scores:", scores)
+
+    # Alternative demo with a hand-built input (kept disabled):
+    # job_seg_demo = "quét_dọn hành_lang lau_chùi vệ_sinh"
+    # loc_clean_demo = "hành lang tầng 3"
+    # pred2, scores2 = predict_single_sample(
+    #     job_segmented=job_seg_demo,
+    #     location_clean=loc_clean_demo,
+    #     name="CMC",
+    #     luuluong=2,
+    #     dientichngoai=0,
+    #     dientichtrong=1500,
+    #     train_df=train_df,
+    #     train_embeddings=train_embeddings,
+    #     top_k=5,
+    #     w_numeric=0.4,
+    #     w_loc=0.6
+    # )
+    # print("\n=== DEMO PREDICT 1 INPUT TỰ TẠO ===")
+    # print("Pred label:", pred2)
+    # print("Label scores:", scores2)
+
--- a/felix.ipynb
+++ b/felix.ipynb
--- a/felix2.ipynb
+++ b/felix2.ipynb
--- a/felix3.ipynb
+++ b/felix3.ipynb
--- a/hoanmy_detect_task.xlsx
+++ b/hoanmy_detect_task.xlsx
--- a/phobert_cls_best/added_tokens.json
+++ b/phobert_cls_best/added_tokens.json
@ -0,0 +1,3 @@
+{
+  "<mask>": 64000
+}
--- a/phobert_cls_best/bpe.codes
+++ b/phobert_cls_best/bpe.codes
--- a/phobert_cls_best/config.json
+++ b/phobert_cls_best/config.json
@ -0,0 +1,39 @@
+{
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 258,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "tokenizer_class": "PhobertTokenizer",
+  "transformers_version": "4.56.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 64001
+}
--- a/phobert_cls_best/model.safetensors
+++ b/phobert_cls_best/model.safetensors
--- a/phobert_cls_best/special_tokens_map.json
+++ b/phobert_cls_best/special_tokens_map.json
@ -0,0 +1,9 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
--- a/phobert_cls_best/tokenizer_config.json
+++ b/phobert_cls_best/tokenizer_config.json
@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "64000": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "PhobertTokenizer",
+  "unk_token": "<unk>"
+}
--- a/phobert_cls_best/vocab.txt
+++ b/phobert_cls_best/vocab.txt
--- a/table_doc/BIDV.docx
+++ b/table_doc/BIDV.docx
--- a/train.ipynb
+++ b/train.ipynb
@ -0,0 +1,882 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7eaf4b18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "187c8d47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_excel(\"hoanmy_detect_task.xlsx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "eaddf252",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Location</th>\n",
+       "      <th>Job</th>\n",
+       "      <th>label</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>luuluong</th>\n",
+       "      <th>Dientichngoai</th>\n",
+       "      <th>Dientichtrong</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
+       "      <td>1</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
+       "      <td>2</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Location                                                Job  \\\n",
+       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
+       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
+       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
+       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
+       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
+       "\n",
+       "  label  Name  luuluong  Dientichngoai  Dientichtrong  \n",
+       "0     1  BIDV         3           1144          11200  \n",
+       "1     1  BIDV         3           1144          11200  \n",
+       "2     1  BIDV         3           1144          11200  \n",
+       "3     1  BIDV         3           1144          11200  \n",
+       "4     2  BIDV         3           1144          11200  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6fd4be0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pyvi import ViTokenizer\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "878456df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==== 1. Load stopword ====\n",
+    "\n",
+    "def load_stopwords(path):\n",
+    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
+    "        sw = [line.strip() for line in f if line.strip()]\n",
+    "    return set(sw)\n",
+    "\n",
+    "stopwords_path = \"vietnamese-stopwords-dash.txt\"   # đổi tên nếu khác\n",
+    "vietnamese_stopwords = load_stopwords(stopwords_path)\n",
+    "\n",
+    "\n",
+    "# ==== 2. Hàm tokenize + bỏ stopword ====\n",
+    "\n",
+    "def segment_and_remove_stopwords(text):\n",
+    "    if not isinstance(text, str):\n",
+    "        return \"\"\n",
+    "    segmented = ViTokenizer.tokenize(text)\n",
+    "    tokens = segmented.split()\n",
+    "    filtered = [tok for tok in tokens if tok not in vietnamese_stopwords]\n",
+    "    return \" \".join(filtered)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "bf15213a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "import unicodedata\n",
+    "from pyvi import ViTokenizer\n",
+    "\n",
+    "def normalize_text_keep_words(s: str) -> str:\n",
+    "    s = str(s)\n",
+    "    s = unicodedata.normalize('NFC', s).lower()\n",
+    "    s = re.sub(r\"[^0-9a-zà-ỹ\\s]\", \" \", s)\n",
+    "    s = re.sub(r\"\\s+\", \" \", s).strip()\n",
+    "    return s\n",
+    "\n",
+    "\n",
+    "def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    - Không ghép Location + Job\n",
+    "    - Clean text\n",
+    "    - Word segment Job + remove stopword\n",
+    "    \"\"\"\n",
+    "    df = df_raw.dropna(subset=[\"label\"]).copy()\n",
+    "    df[\"label\"] = df[\"label\"].astype(str).str.strip()\n",
+    "    df = df[df[\"label\"].isin([\"1\",\"2\",\"3\",\"4\"])].copy()\n",
+    "\n",
+    "    df[\"Location\"] = df[\"Location\"].fillna(\"Unknown\").astype(str)\n",
+    "    df[\"Job\"]      = df[\"Job\"].astype(str)\n",
+    "\n",
+    "    # Clean text\n",
+    "    df[\"location_clean\"] = df[\"Location\"].apply(normalize_text_keep_words)\n",
+    "    df[\"job_clean\"]      = df[\"Job\"].apply(normalize_text_keep_words)\n",
+    "\n",
+    "    # Word segment + remove stopword\n",
+    "    df[\"job_segmented\"] = df[\"job_clean\"].apply(segment_and_remove_stopwords)\n",
+    "\n",
+    "    # Giữ lại các cột numeric nếu có\n",
+    "    keep_cols = [\n",
+    "        \"Location\",\"Job\",\"label\",\n",
+    "        \"location_clean\",\"job_clean\",\"job_segmented\"\n",
+    "    ]\n",
+    "    for c in [\"Name\",\"luuluong\",\"Dientichngoai\",\"Dientichtrong\"]:\n",
+    "        if c in df.columns:\n",
+    "            keep_cols.append(c)\n",
+    "\n",
+    "    return df[keep_cols].reset_index(drop=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "90599a6c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Location</th>\n",
+       "      <th>Job</th>\n",
+       "      <th>label</th>\n",
+       "      <th>location_clean</th>\n",
+       "      <th>job_clean</th>\n",
+       "      <th>job_segmented</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>luuluong</th>\n",
+       "      <th>Dientichngoai</th>\n",
+       "      <th>Dientichtrong</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>khu vực ngoại cảnh</td>\n",
+       "      <td>quét lá rụng thu gom rác lối đi lại lối xe chạ...</td>\n",
+       "      <td>quét lá rụng thu_gom rác lối đi_lại lối xe chạ...</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>khu vực ngoại cảnh</td>\n",
+       "      <td>nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...</td>\n",
+       "      <td>nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
+       "      <td>1</td>\n",
+       "      <td>khu vực ngoại cảnh</td>\n",
+       "      <td>vệ sinh gạt tàn thùng rác</td>\n",
+       "      <td>vệ_sinh gạt_tàn thùng rác</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>khu vực ngoại cảnh</td>\n",
+       "      <td>lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...</td>\n",
+       "      <td>lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Khu vực Ngoại cảnh</td>\n",
+       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
+       "      <td>2</td>\n",
+       "      <td>khu vực ngoại cảnh</td>\n",
+       "      <td>lau tường đá và kính bên ngoài tòa nhà dưới 4m</td>\n",
+       "      <td>lau tường đá kính tòa 4m</td>\n",
+       "      <td>BIDV</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1144</td>\n",
+       "      <td>11200</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Location                                                Job  \\\n",
+       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
+       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
+       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
+       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
+       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
+       "\n",
+       "  label      location_clean  \\\n",
+       "0     1  khu vực ngoại cảnh   \n",
+       "1     1  khu vực ngoại cảnh   \n",
+       "2     1  khu vực ngoại cảnh   \n",
+       "3     1  khu vực ngoại cảnh   \n",
+       "4     2  khu vực ngoại cảnh   \n",
+       "\n",
+       "                                           job_clean  \\\n",
+       "0  quét lá rụng thu gom rác lối đi lại lối xe chạ...   \n",
+       "1  nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...   \n",
+       "2                          vệ sinh gạt tàn thùng rác   \n",
+       "3  lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...   \n",
+       "4     lau tường đá và kính bên ngoài tòa nhà dưới 4m   \n",
+       "\n",
+       "                                       job_segmented  Name  luuluong  \\\n",
+       "0  quét lá rụng thu_gom rác lối đi_lại lối xe chạ...  BIDV         3   \n",
+       "1  nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...  BIDV         3   \n",
+       "2                          vệ_sinh gạt_tàn thùng rác  BIDV         3   \n",
+       "3   lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện  BIDV         3   \n",
+       "4                           lau tường đá kính tòa 4m  BIDV         3   \n",
+       "\n",
+       "   Dientichngoai  Dientichtrong  \n",
+       "0           1144          11200  \n",
+       "1           1144          11200  \n",
+       "2           1144          11200  \n",
+       "3           1144          11200  \n",
+       "4           1144          11200  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_out = preprocess_pipeline(df)\n",
+    "df_out.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "1c472539",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: (514, 10)\n",
+      "Test : (172, 10)\n",
+      "Holdout: (83, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n",
+    "\n",
+    "\n",
+    "# ============================================================\n",
+    "# 3. CHIA 3 TẬP: TRAIN / TEST / HOLDOUT\n",
+    "# ============================================================\n",
+    "\n",
+    "test_buildings    = [\"Keangnam\", \"CMC\"]   # tập test chính\n",
+    "holdout_buildings = [\"VIGALCERA\"]             # tòa riêng để holdout – sửa theo dữ liệu thực tế\n",
+    "\n",
+    "test_df    = df_out[df_out[\"Name\"].isin(test_buildings)].reset_index(drop=True)\n",
+    "holdout_df = df_out[df_out[\"Name\"].isin(holdout_buildings)].reset_index(drop=True)\n",
+    "train_df   = df_out[\n",
+    "    ~df_out[\"Name\"].isin(test_buildings + holdout_buildings)\n",
+    "].reset_index(drop=True)\n",
+    "\n",
+    "print(\"Train:\", train_df.shape)\n",
+    "print(\"Test :\", test_df.shape)\n",
+    "print(\"Holdout:\", holdout_df.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "bff38170",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "8fc94180",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import unicodedata\n",
+    "from pyvi import ViTokenizer\n",
+    "from transformers import AutoTokenizer, AutoModel\n",
+    "import torch\n",
+    "from sklearn.metrics import classification_report\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "c41abe00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "hf_tokenizer = AutoTokenizer.from_pretrained(\"dangvantuan/vietnamese-embedding\")\n",
+    "hf_model = AutoModel.from_pretrained(\"dangvantuan/vietnamese-embedding\").to(device)\n",
+    "\n",
+    "def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):\n",
+    "    encoded = tokenizer(\n",
+    "        text,\n",
+    "        padding=True,\n",
+    "        truncation=True,\n",
+    "        max_length=128,\n",
+    "        return_tensors=\"pt\"\n",
+    "    ).to(device)\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        output = model(**encoded)\n",
+    "\n",
+    "    token_embeddings = output.last_hidden_state   # (1, L, H)\n",
+    "    attention_mask   = encoded[\"attention_mask\"]  # (1, L)\n",
+    "\n",
+    "    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
+    "    sum_embeds = (token_embeddings * mask).sum(dim=1)  # (1, H)\n",
+    "    lengths   = mask.sum(dim=1)                       # (1, H)\n",
+    "    mean_pooled = sum_embeds / lengths\n",
+    "\n",
+    "    return mean_pooled.cpu()  # (1, H)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "04ad0db7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cosine_sim(a, b):\n",
+    "    \"\"\"\n",
+    "    a: (1, H)\n",
+    "    b: (N, H)\n",
+    "    \"\"\"\n",
+    "    a = a / a.norm(dim=-1, keepdim=True)\n",
+    "    b = b / b.norm(dim=-1, keepdim=True)\n",
+    "    return torch.mm(a, b.t())  # (1, N)\n",
+    "\n",
+    "\n",
+    "# Embed toàn bộ train job_segmented\n",
+    "train_texts = train_df[\"job_segmented\"].tolist()\n",
+    "\n",
+    "train_embeddings = []\n",
+    "for txt in train_texts:\n",
+    "    vec = embed_text(txt)\n",
+    "    train_embeddings.append(vec.squeeze(0))\n",
+    "\n",
+    "train_embeddings = torch.stack(train_embeddings)   # (N_train, H)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "2f1a3319",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 5. HÀM LOCATION SIMILARITY + NUMERIC CLOSENESS\n",
+    "# ============================================================\n",
+    "\n",
+    "def location_similarity(q_row, cand_row):\n",
+    "    q_tokens = set(str(q_row[\"location_clean\"]).split())\n",
+    "    c_tokens = set(str(cand_row[\"location_clean\"]).split())\n",
+    "    if not q_tokens or not c_tokens:\n",
+    "        return 0.0\n",
+    "    inter = len(q_tokens & c_tokens)\n",
+    "    union = len(q_tokens | c_tokens)\n",
+    "    return inter / union\n",
+    "\n",
+    "\n",
+    "def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):\n",
+    "    # closeness theo luuluong\n",
+    "    if \"luuluong\" in q_row and \"luuluong\" in cand_row:\n",
+    "        if q_row[\"luuluong\"] == cand_row[\"luuluong\"]:\n",
+    "            c_luu = 1.0\n",
+    "        elif abs(q_row[\"luuluong\"] - cand_row[\"luuluong\"]) == 1:\n",
+    "            c_luu = 0.6\n",
+    "        else:\n",
+    "            c_luu = 0.3\n",
+    "    else:\n",
+    "        c_luu = 0.5\n",
+    "\n",
+    "    # closeness theo diện tích\n",
+    "    def safe_val(row, col):\n",
+    "        return float(row[col]) if col in row and not pd.isna(row[col]) else 0.0\n",
+    "\n",
+    "    q_out = safe_val(q_row, \"Dientichngoai\")\n",
+    "    q_in  = safe_val(q_row, \"Dientichtrong\")\n",
+    "    c_out = safe_val(cand_row, \"Dientichngoai\")\n",
+    "    c_in  = safe_val(cand_row, \"Dientichtrong\")\n",
+    "\n",
+    "    d_out = abs(np.log1p(q_out) - np.log1p(c_out))\n",
+    "    d_in  = abs(np.log1p(q_in)  - np.log1p(c_in))\n",
+    "\n",
+    "    c_out = np.exp(-alpha_out * d_out)\n",
+    "    c_in  = np.exp(-alpha_in  * d_in)\n",
+    "\n",
+    "    return 0.5 * c_luu + 0.25 * c_out + 0.25 * c_in\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "9931151e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_label_for_row(q_row,\n",
+    "                          train_df=train_df,\n",
+    "                          train_embeddings=train_embeddings,\n",
+    "                          tokenizer=hf_tokenizer,\n",
+    "                          model=hf_model,\n",
+    "                          top_k=10,\n",
+    "                          w_numeric=0.4,\n",
+    "                          w_loc=0.6):\n",
+    "    \"\"\"\n",
+    "    Bước 1: dùng embedding(job_segmented) để chọn top_k ứng viên gần nghĩa nhất.\n",
+    "    Bước 2: trong top_k đó, KHÔNG dùng score text nữa, chỉ dùng:\n",
+    "        - loc_sim   : similarity theo location_clean  [0,1]  (quan trọng nhất)\n",
+    "        - num_c     : numeric_closeness (luuluong + diện tích)  (~0.15..1.0)\n",
+    "    final_score = w_loc * loc_sim + w_numeric * num_c\n",
+    "    \"\"\"\n",
+    "    # 1) embed query\n",
+    "    q_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1,H)\n",
+    "\n",
+    "    # 2) cosine similarity với toàn bộ train → chỉ để CHỌN ỨNG VIÊN\n",
+    "    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)\n",
+    "\n",
+    "    # 3) lấy top-k job gần nhất theo embedding\n",
+    "    top_k = min(top_k, len(train_df))\n",
+    "    top_scores, top_idx = torch.topk(sims, k=top_k)\n",
+    "\n",
+    "    label_scores = {}\n",
+    "\n",
+    "    for score, idx in zip(top_scores, top_idx):\n",
+    "        cand_row = train_df.iloc[int(idx)]\n",
+    "\n",
+    "        loc_sim = location_similarity(q_row, cand_row)\n",
+    "        num_c   = numeric_closeness(q_row, cand_row)\n",
+    "\n",
+    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
+    "\n",
+    "        lbl = str(cand_row[\"label\"])\n",
+    "        label_scores[lbl] = label_scores.get(lbl, 0.0) + final_score\n",
+    "\n",
+    "    if not label_scores:\n",
+    "        majority_label = str(train_df[\"label\"].value_counts().idxmax())\n",
+    "        return majority_label, {}\n",
+    "\n",
+    "    best_label = max(label_scores, key=label_scores.get)\n",
+    "    return best_label, label_scores\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "16b51693",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_on_df(df_in: pd.DataFrame,\n",
+    "                  name: str,\n",
+    "                  top_k=5,\n",
+    "                  w_numeric=0.4,\n",
+    "                  w_loc=0.6):\n",
+    "    df = df_in.copy()\n",
+    "    preds = []\n",
+    "    scores = []\n",
+    "\n",
+    "    for _, row in df.iterrows():\n",
+    "        pred, sc = predict_label_for_row(\n",
+    "            row,\n",
+    "            train_df=train_df,\n",
+    "            train_embeddings=train_embeddings,\n",
+    "            tokenizer=hf_tokenizer,\n",
+    "            model=hf_model,\n",
+    "            top_k=top_k,\n",
+    "            w_numeric=w_numeric,\n",
+    "            w_loc=w_loc\n",
+    "        )\n",
+    "        preds.append(pred)\n",
+    "        scores.append(sc)\n",
+    "\n",
+    "    df[\"pred_label\"]    = preds\n",
+    "    df[\"score_details\"] = scores\n",
+    "\n",
+    "    print(f\"\\n========== KẾT QUẢ TRÊN {name} ==========\")\n",
+    "    print(classification_report(\n",
+    "        df[\"label\"].astype(str),\n",
+    "        df[\"pred_label\"].astype(str),\n",
+    "        digits=3\n",
+    "    ))\n",
+    "\n",
+    "    return df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "dcc54593",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========== KẾT QUẢ TRÊN TEST (Keangnam + CMC) ==========\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           1      0.788     0.848     0.817       105\n",
+      "           2      0.545     0.511     0.527        47\n",
+      "           3      0.667     0.500     0.571        20\n",
+      "\n",
+      "    accuracy                          0.715       172\n",
+      "   macro avg      0.667     0.619     0.638       172\n",
+      "weighted avg      0.707     0.715     0.709       172\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_df_pred    = predict_on_df(test_df, \"TEST (Keangnam + CMC)\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "6f0cc888",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "========== KẾT QUẢ TRÊN HOLDOUT ==========\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           1      0.684     0.907     0.780        43\n",
+      "           2      0.429     0.261     0.324        23\n",
+      "           3      0.667     0.471     0.552        17\n",
+      "\n",
+      "    accuracy                          0.639        83\n",
+      "   macro avg      0.593     0.546     0.552        83\n",
+      "weighted avg      0.610     0.639     0.607        83\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "holdout_df_pred = predict_on_df(holdout_df, \"HOLDOUT\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "5f804672",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_top_neighbors_for_row(q_row,\n",
+    "                              train_df,\n",
+    "                              train_embeddings,\n",
+    "                              tokenizer,\n",
+    "                              model,\n",
+    "                              top_k=5,\n",
+    "                              w_numeric=0.4,\n",
+    "                              w_loc=0.6):\n",
+    "    \"\"\"\n",
+    "    Bước 1: dùng embedding(job_segmented) để lấy top_k ứng viên gần nghĩa nhất.\n",
+    "    Bước 2: với mỗi ứng viên, tính:\n",
+    "        - cos_sim       : similarity embedding (chỉ để tham khảo)\n",
+    "        - loc_sim       : similarity theo location_clean\n",
+    "        - num_closeness : theo luuluong + diện tích\n",
+    "        - final_score   : w_loc * loc_sim + w_numeric * num_closeness\n",
+    "    Trả về DataFrame các neighbor, sort theo final_score giảm dần.\n",
+    "    \"\"\"\n",
+    "    # 1) embed query\n",
+    "    q_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1, H)\n",
+    "\n",
+    "    # 2) cosine similarity với toàn bộ train\n",
+    "    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)\n",
+    "\n",
+    "    # 3) lấy top_k index theo sims\n",
+    "    top_k = min(top_k, len(train_df))\n",
+    "    top_scores, top_idx = torch.topk(sims, k=top_k)\n",
+    "\n",
+    "    rows = []\n",
+    "    for score, idx in zip(top_scores, top_idx):\n",
+    "        score_val = float(score.item())\n",
+    "        cand = train_df.iloc[int(idx)].copy()\n",
+    "\n",
+    "        loc_sim = location_similarity(q_row, cand)\n",
+    "        num_c   = numeric_closeness(q_row, cand)\n",
+    "\n",
+    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
+    "\n",
+    "        cand[\"cos_sim\"]       = score_val\n",
+    "        cand[\"loc_sim\"]       = loc_sim\n",
+    "        cand[\"num_closeness\"] = num_c\n",
+    "        cand[\"final_score\"]   = final_score\n",
+    "        rows.append(cand)\n",
+    "\n",
+    "    neighbors_df = pd.DataFrame(rows).sort_values(\"final_score\", ascending=False)\n",
+    "    return neighbors_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f224a244",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Số mẫu sai trên HOLDOUT: 30\n",
+      "\n",
+      "====================================================================================================\n",
+      "❌ CASE SAI #3\n",
+      "  True label    : 3\n",
+      "  Pred label    : 1\n",
+      "  Tòa (Name)    : VIGALCERA\n",
+      "  Location      : Khu vực ngoại cảnh\n",
+      "  Lưu lượng     : 3\n",
+      "  DT ngoài      : 6000\n",
+      "  DT trong      : 4520\n",
+      "  Job raw       : - Làm vệ sinh khu vực đài phun nước\n",
+      "  job_segmented : vệ_sinh khu_vực đài phun\n",
+      "  location_clean: khu vực ngoại cảnh\n",
+      "\n",
+      "  → Top 10 hàng xóm trong train (sorted theo final_score):\n",
+      "label         Name                           Location  luuluong  Dientichngoai  Dientichtrong                                                                                                                                Job                                                                         job_segmented  cos_sim  loc_sim  num_closeness  final_score\n",
+      "    1         BIDV                 Khu vực Ngoại cảnh         3           1144          11200                                                                                                          Vệ sinh họng rác (nếu có)                                                                      vệ_sinh họng rác 0.694930 1.000000       0.710878     0.884351\n",
+      "    1         BIDV                Khu vực Nhà vệ sinh         3           1144          11200                  Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước.               sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van 0.690451 0.285714       0.710878     0.455780\n",
+      "    1 CenterPoint       KHU VỰC NHÀ VỆ SINH CÔNG CỘNG         3            400           5379   Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.222222       0.758955     0.436915\n",
+      "    1    Hong Kong KHU VỰC NHÀ VỆ SINH (Ban quản lý )         2           9950          25630 · Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.200000       0.549676     0.339870\n",
+      "    1          HCO                        Nhà vệ sinh         3            800          13000                                                                                       Lau thùng đựng rác, thiết bị thoát nước thải                                           lau thùng đựng rác thiết_bị thoát nước_thải 0.706144 0.000000       0.680405     0.272162\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Lọc các case dự đoán sai trên holdout\n",
+    "mis_holdout = holdout_df_pred[\n",
+    "    holdout_df_pred[\"label\"].astype(str) != holdout_df_pred[\"pred_label\"].astype(str)\n",
+    "].copy()\n",
+    "\n",
+    "print(\"Số mẫu sai trên HOLDOUT:\", len(mis_holdout))\n",
+    "\n",
+    "max_cases = 5   # in tối đa 5 case cho đỡ dài, bạn có thể tăng số này\n",
+    "\n",
+    "for i, (_, row) in enumerate(mis_holdout.iterrows(), start=1):\n",
+    "    if i > max_cases:\n",
+    "        break\n",
+    "    if i!=3:\n",
+    "        continue\n",
+    "    print(\"\\n\" + \"=\"*100)\n",
+    "    print(f\"❌ CASE SAI #{i}\")\n",
+    "    print(f\"  True label    : {row['label']}\")\n",
+    "    print(f\"  Pred label    : {row['pred_label']}\")\n",
+    "    print(f\"  Tòa (Name)    : {row.get('Name', 'N/A')}\")\n",
+    "    print(f\"  Location      : {row['Location']}\")\n",
+    "    print(f\"  Lưu lượng     : {row.get('luuluong', 'N/A')}\")\n",
+    "    print(f\"  DT ngoài      : {row.get('Dientichngoai', 'N/A')}\")\n",
+    "    print(f\"  DT trong      : {row.get('Dientichtrong', 'N/A')}\")\n",
+    "    print(\"  Job raw       :\", row[\"Job\"])\n",
+    "    print(\"  job_segmented :\", row[\"job_segmented\"])\n",
+    "    print(\"  location_clean:\", row[\"location_clean\"])\n",
+    "\n",
+    "    # Lấy top neighbors cho case này\n",
+    "    neighbors = get_top_neighbors_for_row(\n",
+    "        row,\n",
+    "        train_df=train_df,\n",
+    "        train_embeddings=train_embeddings,\n",
+    "        tokenizer=hf_tokenizer,\n",
+    "        model=hf_model,\n",
+    "        top_k=5,       # số ứng viên lấy theo embedding\n",
+    "        w_numeric=0.4,\n",
+    "        w_loc=0.6\n",
+    "    )\n",
+    "\n",
+    "    print(\"\\n  → Top 10 hàng xóm trong train (sorted theo final_score):\")\n",
+    "    cols_show = [\n",
+    "        \"label\", \"Name\", \"Location\",\n",
+    "        \"luuluong\", \"Dientichngoai\", \"Dientichtrong\",\n",
+    "        \"Job\", \"job_segmented\",\n",
+    "        \"cos_sim\", \"loc_sim\", \"num_closeness\", \"final_score\"\n",
+    "    ]\n",
+    "    # Chỉ in cột nào thực sự tồn tại (phòng trường hợp thiếu)\n",
+    "    cols_show = [c for c in cols_show if c in neighbors.columns]\n",
+    "\n",
+    "    print(neighbors[cols_show].head(10).to_string(index=False))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tainl",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/vietnamese-stopwords-dash.txt
+++ b/vietnamese-stopwords-dash.txt