first

2025-11-16 15:09:40 +07:00 · 2025-11-16 15:09:40 +07:00 · 6f8847e36d
commit 6f8847e36d
40 changed files with 132469 additions and 0 deletions
--- a/cd.txt
+++ b/cd.txt
--- a/2025_01.01.2025-31.12.2025.doc
+++ b/2025_01.01.2025-31.12.2025.doc
--- a/Al/BIDV/bản
+++ b/Al/BIDV/bản
--- a/Al/CMC/Báo
+++ b/Al/CMC/Báo
--- a/Al/CMC/Hợp
+++ b/Al/CMC/Hợp
--- a/point/2024
+++ b/point/2024
--- a/(Final).doc
+++ b/(Final).doc
--- a/(Final).doc
+++ b/(Final).doc
--- a/Al/HCO(SAS)/BÁO
+++ b/Al/HCO(SAS)/BÁO
--- a/Al/HCO(SAS)/HĐ
+++ b/Al/HCO(SAS)/HĐ
--- a/Al/HH4/2019
+++ b/Al/HH4/2019
--- a/Al/HH4/Hợp
+++ b/Al/HH4/Hợp
--- a/Al/HH4/~$19
+++ b/Al/HH4/~$19
--- a/Kong.docx
+++ b/Kong.docx
--- a/HONGKONG.docx
+++ b/HONGKONG.docx
--- a/Al/Keangnam/HĐ
+++ b/Al/Keangnam/HĐ
--- a/25.5.2016.docx
+++ b/25.5.2016.docx
--- a/Al/Sunred/Báo
+++ b/Al/Sunred/Báo
--- a/Al/Sunred/HD
+++ b/Al/Sunred/HD
--- a/Võ/2025-HD
+++ b/Võ/2025-HD
--- a/Võ/20250517-HD
+++ b/Võ/20250517-HD
--- a/vấn.docx
+++ b/vấn.docx
--- a/vấn.docx
+++ b/vấn.docx
--- a/Al/VIGALCERA/BÁO
+++ b/Al/VIGALCERA/BÁO
--- a/Al/VIGALCERA/HĐ
+++ b/Al/VIGALCERA/HĐ
--- a/demo.py
+++ b/demo.py
@ -0,0 +1,375 @@
 import pandas as pd
 import re
 import unicodedata
 from pyvi import ViTokenizer
 import pandas as pd
 import numpy as np
 import re
 import unicodedata
 from pyvi import ViTokenizer
 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics import classification_report
 # Run on the GPU when torch reports CUDA is available, otherwise on the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load the Hugging Face tokenizer and model once at import time; the
 # functions below use these module-level objects as their default arguments.
 hf_tokenizer = AutoTokenizer.from_pretrained("dangvantuan/vietnamese-embedding")
 hf_model = AutoModel.from_pretrained("dangvantuan/vietnamese-embedding").to(device)
 def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):
    """
    Embed text into a vector using attention-masked mean pooling.

    Args:
        text: input handed to ``tokenizer`` (tokenized with padding,
            truncation and ``max_length=128``).
        tokenizer: callable producing the encoded batch; must return an
            object with an ``"attention_mask"`` entry and a ``.to()`` method.
        model: model whose output exposes ``last_hidden_state``.
        device: device the encoded batch is moved to. The model itself is
            not moved here, so it must already live on the same device.

    Returns:
        The pooled embedding on the CPU; shape (1, H) for a single string.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        output = model(**encoded)
    token_embeddings = output.last_hidden_state   # (B, L, H)
    attention_mask   = encoded["attention_mask"]  # (B, L)
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeds = (token_embeddings * mask).sum(dim=1)  # (B, H)
    # Number of attended tokens per sequence, repeated along H: (B, H).
    # Clamped so a sequence with no attended tokens yields zeros, not NaN.
    lengths = mask.sum(dim=1).clamp(min=1e-9)
    mean_pooled = sum_embeds / lengths
    return mean_pooled.cpu()
 def cosine_sim(a, b, eps=1e-12):
    """
    Compute cosine similarity between the rows of two 2-D tensors.

    Args:
        a: tensor of shape (1, H) (any (M, H) works with ``torch.mm``).
        b: tensor of shape (N, H).
        eps: lower bound applied to each row norm so that an all-zero row
            produces similarity 0 instead of NaN.

    Returns:
        Tensor of shape (1, N) (or (M, N)) holding the similarities.
    """
    # Clamp the norms before dividing: 0/0 would give NaN, and NaN values
    # would then corrupt any ranking done on the result.
    a_unit = a / a.norm(dim=-1, keepdim=True).clamp(min=eps)
    b_unit = b / b.norm(dim=-1, keepdim=True).clamp(min=eps)
    return torch.mm(a_unit, b_unit.t())
 # ============================================================
 # 2. RULE VỀ LOCATION VÀ THÔNG SỐ
 # ============================================================
 def location_similarity(q_row, cand_row):
    """
    Jaccard similarity between the whitespace-separated tokens of the two
    rows' ``location_clean`` values.

    Returns 0.0 when either side has no tokens, otherwise
    ``|intersection| / |union|`` of the two token sets.
    """
    query_words = set(str(q_row["location_clean"]).split())
    cand_words = set(str(cand_row["location_clean"]).split())
    # An empty token set on either side means there is nothing to compare.
    if not (query_words and cand_words):
        return 0.0
    shared = query_words.intersection(cand_words)
    combined = query_words.union(cand_words)
    return len(shared) / len(combined)
 def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):
    """
    Score how close two rows are on their numeric attributes.

    Components:
      - ``luuluong``: 1.0 when equal, 0.6 when the values differ by exactly
        1, 0.3 otherwise; 0.5 (neutral) when the value is absent or NaN on
        either side.
      - ``Dientichngoai`` / ``Dientichtrong`` (outer / inner area):
        ``exp(-alpha * |log1p(q) - log1p(c)|)``, with a missing or NaN
        area treated as 0.0.

    Args:
        q_row: query row (mapping or pandas Series).
        cand_row: candidate row (mapping or pandas Series).
        alpha_out: decay rate for the outer-area distance.
        alpha_in: decay rate for the inner-area distance.

    Returns:
        ``0.5 * luuluong_score + 0.25 * outer_score + 0.25 * inner_score``.
        With the weights above this never exceeds 1.0; larger means closer.
    """
    def has_value(row, col):
        # True only when the column exists and holds a non-missing value.
        return col in row and not pd.isna(row[col])

    def area(row, col):
        # Missing areas fall back to 0.0 so log1p stays well defined.
        return float(row[col]) if has_value(row, col) else 0.0

    # Closeness on luuluong. A NaN is treated like an absent column so it
    # gets the neutral score instead of being scored as "far apart".
    if has_value(q_row, "luuluong") and has_value(cand_row, "luuluong"):
        gap = abs(q_row["luuluong"] - cand_row["luuluong"])
        if gap == 0:
            c_luu = 1.0
        elif gap == 1:
            c_luu = 0.6
        else:
            c_luu = 0.3
    else:
        c_luu = 0.5

    # Closeness on areas, compared on a log scale so that relative rather
    # than absolute differences drive the score.
    d_out = abs(np.log1p(area(q_row, "Dientichngoai"))
                - np.log1p(area(cand_row, "Dientichngoai")))
    d_in = abs(np.log1p(area(q_row, "Dientichtrong"))
               - np.log1p(area(cand_row, "Dientichtrong")))
    score_out = np.exp(-alpha_out * d_out)
    score_in = np.exp(-alpha_in * d_in)
    return 0.5 * c_luu + 0.25 * score_out + 0.25 * score_in
 # ============================================================
 # 3. TRAIN EMBEDDINGS TỪ train_df
 # ============================================================
 def build_train_embeddings(train_df):
    """
    Embed every ``job_segmented`` text of ``train_df`` with ``embed_text``.

    Each text is embedded on its own, its leading batch dimension is
    squeezed away, and the resulting vectors are stacked in row order.

    Returns:
        Tensor of shape (N_train, H).
    """
    vectors = [
        embed_text(sentence).squeeze(0)
        for sentence in train_df["job_segmented"].tolist()
    ]
    return torch.stack(vectors)
 # ============================================================
 # 4. RULE DỰ ĐOÁN CHO 1 ROW
 # ============================================================
 def predict_label_for_row(q_row,
                           train_df,
                           train_embeddings,
                           tokenizer=hf_tokenizer,
                           model=hf_model,
                           top_k=10,
                           w_numeric=0.4,
                           w_loc=0.6):
    """
    Predict a label for one query row with a two-stage rule.

    Stage 1: embed ``q_row["job_segmented"]`` and pick the ``top_k``
    training rows with the highest cosine similarity. The text score is
    used only to select candidates.
    Stage 2: score each candidate without the text score:
        final_score = w_loc * location_similarity
                      + w_numeric * numeric_closeness
    and add that score to the running total of the candidate's label.

    Args:
        q_row: query row; must provide ``job_segmented`` plus whatever
            ``location_similarity`` and ``numeric_closeness`` read.
        train_df: training rows, aligned by position with
            ``train_embeddings``; must have a ``label`` column.
        train_embeddings: tensor of shape (N_train, H).
        tokenizer, model: forwarded to ``embed_text``.
        top_k: number of nearest candidates to consider.
        w_numeric: weight of the numeric closeness term.
        w_loc: weight of the location similarity term.

    Returns:
        ``(best_label, label_scores)`` where ``label_scores`` maps each
        candidate label (as ``str``) to its accumulated score. When there
        are no candidates, returns the most frequent training label and
        an empty dict.
    """
    # 1) Embed the query.
    q_vec = embed_text(q_row["job_segmented"], tokenizer, model)  # (1, H)
    # 2) Cosine similarity against all training rows (candidate selection only).
    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)
    # 3) Keep k within what both the frame and the embedding matrix can
    #    supply, and never negative, so torch.topk cannot be asked for
    #    more (or fewer than zero) entries than exist.
    k = max(0, min(top_k, len(train_df), sims.numel()))
    top_idx = torch.topk(sims, k=k).indices
    label_scores = {}
    for idx in top_idx.tolist():
        cand_row = train_df.iloc[idx]
        loc_sim = location_similarity(q_row, cand_row)
        num_c = numeric_closeness(q_row, cand_row)
        final_score = w_loc * loc_sim + w_numeric * num_c
        lbl = str(cand_row["label"])
        label_scores[lbl] = label_scores.get(lbl, 0.0) + final_score
    # Fallback when there are no candidates: majority label of the train set.
    if not label_scores:
        majority_label = str(train_df["label"].value_counts().idxmax())
        return majority_label, {}
    best_label = max(label_scores, key=label_scores.get)
    return best_label, label_scores
 # ============================================================
 # 5. PREDICT CHO CẢ 1 DATAFRAME (DÙNG CHO TEST/HOLDOUT)
 # ============================================================
 def predict_on_df(df_in: pd.DataFrame,
                   train_df: pd.DataFrame,
                   train_embeddings,
                   name: str,
                   top_k=5,
                   w_numeric=0.4,
                   w_loc=0.6):
    """
    Run ``predict_label_for_row`` on every row of ``df_in`` and report.

    Works on a copy of ``df_in``, adds the columns ``pred_label`` and
    ``score_details``, prints a classification report comparing ``label``
    with ``pred_label`` under a header containing ``name``, and returns
    the augmented copy.
    """
    result_df = df_in.copy()
    # One (predicted_label, label_scores) pair per row, in row order.
    outcomes = [
        predict_label_for_row(
            record,
            train_df=train_df,
            train_embeddings=train_embeddings,
            tokenizer=hf_tokenizer,
            model=hf_model,
            top_k=top_k,
            w_numeric=w_numeric,
            w_loc=w_loc
        )
        for _, record in result_df.iterrows()
    ]
    result_df["pred_label"] = [label for label, _ in outcomes]
    result_df["score_details"] = [detail for _, detail in outcomes]
    print(f"\n========== KẾT QUẢ TRÊN {name} ==========")
    report = classification_report(
        result_df["label"].astype(str),
        result_df["pred_label"].astype(str),
        digits=3
    )
    print(report)
    return result_df
 def normalize_text_keep_words(s: str) -> str:
    """
    Normalize text while keeping its words.

    The value is converted to ``str``, NFC-normalized and lower-cased;
    every character outside digits, ``a-z``, the ``à-ỹ`` range and
    whitespace becomes a space; runs of whitespace are then collapsed to
    one space and the ends are trimmed.
    """
    lowered = unicodedata.normalize('NFC', str(s)).lower()
    words_only = re.sub(r"[^0-9a-zà-ỹ\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", words_only)
    return collapsed.strip()
 def segment_and_remove_stopwords(text):
    """
    Word-segment ``text`` with ``ViTokenizer`` and drop stopwords.

    Non-string input yields an empty string. Otherwise the segmented text
    is split on whitespace, tokens found in ``vietnamese_stopwords`` are
    removed, and the remaining tokens are joined with single spaces.
    """
    if isinstance(text, str):
        kept = []
        for token in ViTokenizer.tokenize(text).split():
            if token in vietnamese_stopwords:
                continue
            kept.append(token)
        return " ".join(kept)
    return ""
 def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare the raw sheet for the matching rules.

    - Location and Job are kept as separate fields (not concatenated).
    - Rows without a label are dropped; labels are stripped strings and
      only "1".."4" are kept.
    - ``location_clean`` / ``job_clean`` hold the normalized text.
    - ``job_segmented`` holds the word-segmented job text without stopwords.

    Returns a frame with a fresh index containing the text columns plus
    whichever of Name / luuluong / Dientichngoai / Dientichtrong exist.
    """
    labelled = df_raw.dropna(subset=["label"]).copy()
    labelled["label"] = labelled["label"].astype(str).str.strip()
    accepted_labels = ["1","2","3","4"]
    df = labelled[labelled["label"].isin(accepted_labels)].copy()

    df["Location"] = df["Location"].fillna("Unknown").astype(str)
    df["Job"] = df["Job"].astype(str)

    # Text cleaning.
    df["location_clean"] = df["Location"].apply(normalize_text_keep_words)
    df["job_clean"] = df["Job"].apply(normalize_text_keep_words)

    # Word segmentation and stopword removal on the cleaned job text.
    df["job_segmented"] = df["job_clean"].apply(segment_and_remove_stopwords)

    # Carry the optional name/numeric columns along when they are present.
    optional_cols = [
        c for c in ["Name","luuluong","Dientichngoai","Dientichtrong"]
        if c in df.columns
    ]
    keep_cols = [
        "Location","Job","label",
        "location_clean","job_clean","job_segmented"
    ] + optional_cols
    return df[keep_cols].reset_index(drop=True)
 # ==== Demo helper: predict one hand-built sample (stopword loading is defined further below) ====
 def predict_single_sample(job_segmented: str,
                           location_clean: str,
                           name: str,
                           luuluong: int,
                           dientichngoai: float,
                           dientichtrong: float,
                           train_df: pd.DataFrame,
                           train_embeddings,
                           top_k=5,
                           w_numeric=0.4,
                           w_loc=0.6):
    """
    Demo prediction for a single new data point.

    - job_segmented: text already word-segmented with stopwords removed
      (same form as train_df['job_segmented'])
    - location_clean: location text already normalized
      (same form as train_df['location_clean'])
    - name, luuluong, dientichngoai, dientichtrong: building/job attributes

    Returns the ``(pred_label, label_scores)`` pair produced by
    ``predict_label_for_row``.
    """
    # Build a one-off Series that looks like a single preprocessed row.
    q_row = pd.Series({
        "job_segmented": job_segmented,
        "location_clean": location_clean,
        "Name": name,
        "luuluong": luuluong,
        "Dientichngoai": dientichngoai,
        "Dientichtrong": dientichtrong,
    })
    return predict_label_for_row(
        q_row,
        train_df=train_df,
        train_embeddings=train_embeddings,
        tokenizer=hf_tokenizer,
        model=hf_model,
        top_k=top_k,
        w_numeric=w_numeric,
        w_loc=w_loc
    )
 def load_stopwords(path):
    """
    Read a UTF-8 stopword file with one entry per line.

    Lines are stripped of surrounding whitespace and blank lines are
    skipped. Returns the entries as a set.
    """
    stopwords = set()
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            word = raw_line.strip()
            if word:
                stopwords.add(word)
    return stopwords
 # Stopword file read at import time; segment_and_remove_stopwords() looks up
 # the resulting module-level set when it is called.
 stopwords_path = "vietnamese-stopwords-dash.txt"   # change this if your file has a different name
 vietnamese_stopwords = load_stopwords(stopwords_path)
 def chia_train_test(df_out: pd.DataFrame, test_buildings=("CMC",)):
    """
    Split the preprocessed frame into train and test sets by building name.

    Only rows whose ``label`` (compared as a string) is "1", "2" or "3"
    are kept. Rows whose ``Name`` is in ``test_buildings`` form the test
    set; all remaining rows form the train set.

    Args:
        df_out: frame with at least the ``label`` and ``Name`` columns.
        test_buildings: building name, or iterable of building names, to
            hold out as the test set. Defaults to ``("CMC",)``.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0-based index.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(test_buildings, str):
        test_buildings = [test_buildings]
    held_out = list(test_buildings)

    df_out = df_out[df_out["label"].astype(str).isin(["1", "2", "3"])].reset_index(drop=True)
    is_test = df_out["Name"].isin(held_out)
    test_df = df_out[is_test].reset_index(drop=True)
    train_df = df_out[~is_test].reset_index(drop=True)
    return train_df, test_df
 if __name__ == "__main__":
    # 1) Load the raw sheet and preprocess the text columns.
    df_raw = pd.read_excel("hoanmy_detect_task.xlsx")
    df = preprocess_pipeline(df_raw)
    # 2) Split by building into train / test.
    train_df, test_df = chia_train_test(df)
    # Fail with a clear message instead of an IndexError on iloc[0] or a
    # stacking error on an empty list of embeddings.
    if train_df.empty or test_df.empty:
        raise SystemExit(
            "Train or test split is empty: check the 'Name' and 'label' "
            "columns and the test buildings used by chia_train_test()."
        )
    # 3) Embed every training job once up front.
    train_embeddings = build_train_embeddings(train_df)
    # 4) Demo: predict one row of test_df (here, the first row).
    example_row = test_df.iloc[0]
    pred, scores = predict_label_for_row(
        example_row,
        train_df=train_df,
        train_embeddings=train_embeddings,
        top_k=5,
        w_numeric=0.4,
        w_loc=0.6
    )
    print("\n=== DEMO PREDICT 1 DÒNG ===")
    print("Job       :", example_row["Job"])
    print("Segm job  :", example_row["job_segmented"])
    print("Location  :", example_row["Location"])
    print("True label:", example_row["label"])
    print("Pred label:", pred)
    print("Label scores:", scores)
    # 5) Alternative demo with a hand-built input (uncomment to run):
    # job_seg_demo = "quét_dọn hành_lang lau_chùi vệ_sinh"
    # loc_clean_demo = "hành lang tầng 3"
    # pred2, scores2 = predict_single_sample(
    #     job_segmented=job_seg_demo,
    #     location_clean=loc_clean_demo,
    #     name="CMC",
    #     luuluong=2,
    #     dientichngoai=0,
    #     dientichtrong=1500,
    #     train_df=train_df,
    #     train_embeddings=train_embeddings,
    #     top_k=5,
    #     w_numeric=0.4,
    #     w_loc=0.6
    # )
    # print("\n=== DEMO PREDICT 1 INPUT TỰ TẠO ===")
    # print("Pred label:", pred2)
    # print("Label scores:", scores2)
--- a/felix.ipynb
+++ b/felix.ipynb
--- a/felix2.ipynb
+++ b/felix2.ipynb
--- a/felix3.ipynb
+++ b/felix3.ipynb
--- a/hoanmy_detect_task.xlsx
+++ b/hoanmy_detect_task.xlsx
--- a/phobert_cls_best/added_tokens.json
+++ b/phobert_cls_best/added_tokens.json
@ -0,0 +1,3 @@
 {
  "<mask>": 64000
 }
--- a/phobert_cls_best/bpe.codes
+++ b/phobert_cls_best/bpe.codes
--- a/phobert_cls_best/config.json
+++ b/phobert_cls_best/config.json
@ -0,0 +1,39 @@
 {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "PhobertTokenizer",
  "transformers_version": "4.56.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
 }
--- a/phobert_cls_best/model.safetensors
+++ b/phobert_cls_best/model.safetensors
--- a/phobert_cls_best/special_tokens_map.json
+++ b/phobert_cls_best/special_tokens_map.json
@ -0,0 +1,9 @@
 {
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
 }
--- a/phobert_cls_best/tokenizer_config.json
+++ b/phobert_cls_best/tokenizer_config.json
@ -0,0 +1,55 @@
 {
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "64000": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "PhobertTokenizer",
  "unk_token": "<unk>"
 }
--- a/phobert_cls_best/vocab.txt
+++ b/phobert_cls_best/vocab.txt
--- a/table_doc/BIDV.docx
+++ b/table_doc/BIDV.docx
--- a/train.ipynb
+++ b/train.ipynb
@ -0,0 +1,882 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7eaf4b18",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "187c8d47",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(\"hoanmy_detect_task.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eaddf252",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Location</th>\n",
       "      <th>Job</th>\n",
       "      <th>label</th>\n",
       "      <th>Name</th>\n",
       "      <th>luuluong</th>\n",
       "      <th>Dientichngoai</th>\n",
       "      <th>Dientichtrong</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
       "      <td>2</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Location                                                Job  \\\n",
       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
       "\n",
       "  label  Name  luuluong  Dientichngoai  Dientichtrong  \n",
       "0     1  BIDV         3           1144          11200  \n",
       "1     1  BIDV         3           1144          11200  \n",
       "2     1  BIDV         3           1144          11200  \n",
       "3     1  BIDV         3           1144          11200  \n",
       "4     2  BIDV         3           1144          11200  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6fd4be0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pyvi import ViTokenizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "878456df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==== 1. Load stopword ====\n",
    "\n",
    "def load_stopwords(path):\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "        sw = [line.strip() for line in f if line.strip()]\n",
    "    return set(sw)\n",
    "\n",
    "stopwords_path = \"vietnamese-stopwords-dash.txt\"   # đổi tên nếu khác\n",
    "vietnamese_stopwords = load_stopwords(stopwords_path)\n",
    "\n",
    "\n",
    "# ==== 2. Hàm tokenize + bỏ stopword ====\n",
    "\n",
    "def segment_and_remove_stopwords(text):\n",
    "    if not isinstance(text, str):\n",
    "        return \"\"\n",
    "    segmented = ViTokenizer.tokenize(text)\n",
    "    tokens = segmented.split()\n",
    "    filtered = [tok for tok in tokens if tok not in vietnamese_stopwords]\n",
    "    return \" \".join(filtered)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bf15213a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import unicodedata\n",
    "from pyvi import ViTokenizer\n",
    "\n",
    "def normalize_text_keep_words(s: str) -> str:\n",
    "    s = str(s)\n",
    "    s = unicodedata.normalize('NFC', s).lower()\n",
    "    s = re.sub(r\"[^0-9a-zà-ỹ\\s]\", \" \", s)\n",
    "    s = re.sub(r\"\\s+\", \" \", s).strip()\n",
    "    return s\n",
    "\n",
    "\n",
    "def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    - Không ghép Location + Job\n",
    "    - Clean text\n",
    "    - Word segment Job + remove stopword\n",
    "    \"\"\"\n",
    "    df = df_raw.dropna(subset=[\"label\"]).copy()\n",
    "    df[\"label\"] = df[\"label\"].astype(str).str.strip()\n",
    "    df = df[df[\"label\"].isin([\"1\",\"2\",\"3\",\"4\"])].copy()\n",
    "\n",
    "    df[\"Location\"] = df[\"Location\"].fillna(\"Unknown\").astype(str)\n",
    "    df[\"Job\"]      = df[\"Job\"].astype(str)\n",
    "\n",
    "    # Clean text\n",
    "    df[\"location_clean\"] = df[\"Location\"].apply(normalize_text_keep_words)\n",
    "    df[\"job_clean\"]      = df[\"Job\"].apply(normalize_text_keep_words)\n",
    "\n",
    "    # Word segment + remove stopword\n",
    "    df[\"job_segmented\"] = df[\"job_clean\"].apply(segment_and_remove_stopwords)\n",
    "\n",
    "    # Giữ lại các cột numeric nếu có\n",
    "    keep_cols = [\n",
    "        \"Location\",\"Job\",\"label\",\n",
    "        \"location_clean\",\"job_clean\",\"job_segmented\"\n",
    "    ]\n",
    "    for c in [\"Name\",\"luuluong\",\"Dientichngoai\",\"Dientichtrong\"]:\n",
    "        if c in df.columns:\n",
    "            keep_cols.append(c)\n",
    "\n",
    "    return df[keep_cols].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "90599a6c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Location</th>\n",
       "      <th>Job</th>\n",
       "      <th>label</th>\n",
       "      <th>location_clean</th>\n",
       "      <th>job_clean</th>\n",
       "      <th>job_segmented</th>\n",
       "      <th>Name</th>\n",
       "      <th>luuluong</th>\n",
       "      <th>Dientichngoai</th>\n",
       "      <th>Dientichtrong</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>quét lá rụng thu gom rác lối đi lại lối xe chạ...</td>\n",
       "      <td>quét lá rụng thu_gom rác lối đi_lại lối xe chạ...</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...</td>\n",
       "      <td>nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>vệ sinh gạt tàn thùng rác</td>\n",
       "      <td>vệ_sinh gạt_tàn thùng rác</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...</td>\n",
       "      <td>lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
       "      <td>2</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>lau tường đá và kính bên ngoài tòa nhà dưới 4m</td>\n",
       "      <td>lau tường đá kính tòa 4m</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Location                                                Job  \\\n",
       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
       "\n",
       "  label      location_clean  \\\n",
       "0     1  khu vực ngoại cảnh   \n",
       "1     1  khu vực ngoại cảnh   \n",
       "2     1  khu vực ngoại cảnh   \n",
       "3     1  khu vực ngoại cảnh   \n",
       "4     2  khu vực ngoại cảnh   \n",
       "\n",
       "                                           job_clean  \\\n",
       "0  quét lá rụng thu gom rác lối đi lại lối xe chạ...   \n",
       "1  nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...   \n",
       "2                          vệ sinh gạt tàn thùng rác   \n",
       "3  lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...   \n",
       "4     lau tường đá và kính bên ngoài tòa nhà dưới 4m   \n",
       "\n",
       "                                       job_segmented  Name  luuluong  \\\n",
       "0  quét lá rụng thu_gom rác lối đi_lại lối xe chạ...  BIDV         3   \n",
       "1  nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...  BIDV         3   \n",
       "2                          vệ_sinh gạt_tàn thùng rác  BIDV         3   \n",
       "3   lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện  BIDV         3   \n",
       "4                           lau tường đá kính tòa 4m  BIDV         3   \n",
       "\n",
       "   Dientichngoai  Dientichtrong  \n",
       "0           1144          11200  \n",
       "1           1144          11200  \n",
       "2           1144          11200  \n",
       "3           1144          11200  \n",
       "4           1144          11200  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = preprocess_pipeline(df)\n",
    "df_out.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1c472539",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: (514, 10)\n",
      "Test : (172, 10)\n",
      "Holdout: (83, 10)\n"
     ]
    }
   ],
   "source": [
    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 3. CHIA 3 TẬP: TRAIN / TEST / HOLDOUT\n",
    "# ============================================================\n",
    "\n",
    "test_buildings    = [\"Keangnam\", \"CMC\"]   # tập test chính\n",
    "holdout_buildings = [\"VIGALCERA\"]             # tòa riêng để holdout – sửa theo dữ liệu thực tế\n",
    "\n",
    "test_df    = df_out[df_out[\"Name\"].isin(test_buildings)].reset_index(drop=True)\n",
    "holdout_df = df_out[df_out[\"Name\"].isin(holdout_buildings)].reset_index(drop=True)\n",
    "train_df   = df_out[\n",
    "    ~df_out[\"Name\"].isin(test_buildings + holdout_buildings)\n",
    "].reset_index(drop=True)\n",
    "\n",
    "print(\"Train:\", train_df.shape)\n",
    "print(\"Test :\", test_df.shape)\n",
    "print(\"Holdout:\", holdout_df.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bff38170",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8fc94180",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import unicodedata\n",
    "from pyvi import ViTokenizer\n",
    "from transformers import AutoTokenizer, AutoModel\n",
    "import torch\n",
    "from sklearn.metrics import classification_report\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c41abe00",
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "hf_tokenizer = AutoTokenizer.from_pretrained(\"dangvantuan/vietnamese-embedding\")\n",
    "hf_model = AutoModel.from_pretrained(\"dangvantuan/vietnamese-embedding\").to(device)\n",
    "\n",
    "def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):\n",
    "    encoded = tokenizer(\n",
    "        text,\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        max_length=128,\n",
    "        return_tensors=\"pt\"\n",
    "    ).to(device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        output = model(**encoded)\n",
    "\n",
    "    token_embeddings = output.last_hidden_state   # (1, L, H)\n",
    "    attention_mask   = encoded[\"attention_mask\"]  # (1, L)\n",
    "\n",
    "    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
    "    sum_embeds = (token_embeddings * mask).sum(dim=1)  # (1, H)\n",
    "    lengths   = mask.sum(dim=1)                       # (1,1)\n",
    "    mean_pooled = sum_embeds / lengths\n",
    "\n",
    "    return mean_pooled.cpu()  # (1, H)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "04ad0db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cosine_sim(a, b):\n",
    "    \"\"\"\n",
    "    a: (1, H)\n",
    "    b: (N, H)\n",
    "    \"\"\"\n",
    "    a = a / a.norm(dim=-1, keepdim=True)\n",
    "    b = b / b.norm(dim=-1, keepdim=True)\n",
    "    return torch.mm(a, b.t())  # (1, N)\n",
    "\n",
    "\n",
    "# Embed toàn bộ train job_segmented\n",
    "train_texts = train_df[\"job_segmented\"].tolist()\n",
    "\n",
    "train_embeddings = []\n",
    "for txt in train_texts:\n",
    "    vec = embed_text(txt)\n",
    "    train_embeddings.append(vec.squeeze(0))\n",
    "\n",
    "train_embeddings = torch.stack(train_embeddings)   # (N_train, H)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2f1a3319",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. HÀM LOCATION SIMILARITY + NUMERIC CLOSENESS\n",
    "# ============================================================\n",
    "\n",
    "def location_similarity(q_row, cand_row):\n",
    "    q_tokens = set(str(q_row[\"location_clean\"]).split())\n",
    "    c_tokens = set(str(cand_row[\"location_clean\"]).split())\n",
    "    if not q_tokens or not c_tokens:\n",
    "        return 0.0\n",
    "    inter = len(q_tokens & c_tokens)\n",
    "    union = len(q_tokens | c_tokens)\n",
    "    return inter / union\n",
    "\n",
    "\n",
    "def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):\n",
    "    # closeness theo luuluong\n",
    "    if \"luuluong\" in q_row and \"luuluong\" in cand_row:\n",
    "        if q_row[\"luuluong\"] == cand_row[\"luuluong\"]:\n",
    "            c_luu = 1.0\n",
    "        elif abs(q_row[\"luuluong\"] - cand_row[\"luuluong\"]) == 1:\n",
    "            c_luu = 0.6\n",
    "        else:\n",
    "            c_luu = 0.3\n",
    "    else:\n",
    "        c_luu = 0.5\n",
    "\n",
    "    # closeness theo diện tích\n",
    "    def safe_val(row, col):\n",
    "        return float(row[col]) if col in row and not pd.isna(row[col]) else 0.0\n",
    "\n",
    "    q_out = safe_val(q_row, \"Dientichngoai\")\n",
    "    q_in  = safe_val(q_row, \"Dientichtrong\")\n",
    "    c_out = safe_val(cand_row, \"Dientichngoai\")\n",
    "    c_in  = safe_val(cand_row, \"Dientichtrong\")\n",
    "\n",
    "    d_out = abs(np.log1p(q_out) - np.log1p(c_out))\n",
    "    d_in  = abs(np.log1p(q_in)  - np.log1p(c_in))\n",
    "\n",
    "    c_out = np.exp(-alpha_out * d_out)\n",
    "    c_in  = np.exp(-alpha_in  * d_in)\n",
    "\n",
    "    return 0.5 * c_luu + 0.25 * c_out + 0.25 * c_in\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9931151e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_label_for_row(q_row,\n",
    "                          train_df=train_df,\n",
    "                          train_embeddings=train_embeddings,\n",
    "                          tokenizer=hf_tokenizer,\n",
    "                          model=hf_model,\n",
    "                          top_k=10,\n",
    "                          w_numeric=0.4,\n",
    "                          w_loc=0.6):\n",
    "    \"\"\"\n",
    "    Bước 1: dùng embedding(job_segmented) để chọn top_k ứng viên gần nghĩa nhất.\n",
    "    Bước 2: trong top_k đó, KHÔNG dùng score text nữa, chỉ dùng:\n",
    "        - loc_sim   : similarity theo location_clean  [0,1]  (quan trọng nhất)\n",
    "        - num_c     : numeric_closeness (luuluong + diện tích)  (~0..1.5)\n",
    "    final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "    \"\"\"\n",
    "    # 1) embed query\n",
    "    q_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1,H)\n",
    "\n",
    "    # 2) cosine similarity với toàn bộ train → chỉ để CHỌN ỨNG VIÊN\n",
    "    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)\n",
    "\n",
    "    # 3) lấy top-k job gần nhất theo embedding\n",
    "    top_k = min(top_k, len(train_df))\n",
    "    top_scores, top_idx = torch.topk(sims, k=top_k)\n",
    "\n",
    "    label_scores = {}\n",
    "\n",
    "    for score, idx in zip(top_scores, top_idx):\n",
    "        cand_row = train_df.iloc[int(idx)]\n",
    "\n",
    "        loc_sim = location_similarity(q_row, cand_row)\n",
    "        num_c   = numeric_closeness(q_row, cand_row)\n",
    "\n",
    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "\n",
    "        lbl = str(cand_row[\"label\"])\n",
    "        label_scores[lbl] = label_scores.get(lbl, 0.0) + final_score\n",
    "\n",
    "    if not label_scores:\n",
    "        majority_label = str(train_df[\"label\"].value_counts().idxmax())\n",
    "        return majority_label, {}\n",
    "\n",
    "    best_label = max(label_scores, key=label_scores.get)\n",
    "    return best_label, label_scores\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "16b51693",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_on_df(df_in: pd.DataFrame,\n",
    "                  name: str,\n",
    "                  top_k=5,\n",
    "                  w_numeric=0.4,\n",
    "                  w_loc=0.6):\n",
    "    df = df_in.copy()\n",
    "    preds = []\n",
    "    scores = []\n",
    "\n",
    "    for _, row in df.iterrows():\n",
    "        pred, sc = predict_label_for_row(\n",
    "            row,\n",
    "            train_df=train_df,\n",
    "            train_embeddings=train_embeddings,\n",
    "            tokenizer=hf_tokenizer,\n",
    "            model=hf_model,\n",
    "            top_k=top_k,\n",
    "            w_numeric=w_numeric,\n",
    "            w_loc=w_loc\n",
    "        )\n",
    "        preds.append(pred)\n",
    "        scores.append(sc)\n",
    "\n",
    "    df[\"pred_label\"]    = preds\n",
    "    df[\"score_details\"] = scores\n",
    "\n",
    "    print(f\"\\n========== KẾT QUẢ TRÊN {name} ==========\")\n",
    "    print(classification_report(\n",
    "        df[\"label\"].astype(str),\n",
    "        df[\"pred_label\"].astype(str),\n",
    "        digits=3\n",
    "    ))\n",
    "\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "dcc54593",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========== KẾT QUẢ TRÊN TEST (Keangnam + CMC) ==========\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1      0.788     0.848     0.817       105\n",
      "           2      0.545     0.511     0.527        47\n",
      "           3      0.667     0.500     0.571        20\n",
      "\n",
      "    accuracy                          0.715       172\n",
      "   macro avg      0.667     0.619     0.638       172\n",
      "weighted avg      0.707     0.715     0.709       172\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_df_pred    = predict_on_df(test_df, \"TEST (Keangnam + CMC)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6f0cc888",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========== KẾT QUẢ TRÊN HOLDOUT ==========\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1      0.684     0.907     0.780        43\n",
      "           2      0.429     0.261     0.324        23\n",
      "           3      0.667     0.471     0.552        17\n",
      "\n",
      "    accuracy                          0.639        83\n",
      "   macro avg      0.593     0.546     0.552        83\n",
      "weighted avg      0.610     0.639     0.607        83\n",
      "\n"
     ]
    }
   ],
   "source": [
    "holdout_df_pred = predict_on_df(holdout_df, \"HOLDOUT\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "5f804672",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_neighbors_for_row(q_row,\n",
    "                              train_df,\n",
    "                              train_embeddings,\n",
    "                              tokenizer,\n",
    "                              model,\n",
    "                              top_k=5,\n",
    "                              w_numeric=0.4,\n",
    "                              w_loc=0.6):\n",
    "    \"\"\"\n",
    "    Bước 1: dùng embedding(job_segmented) để lấy top_k ứng viên gần nghĩa nhất.\n",
    "    Bước 2: với mỗi ứng viên, tính:\n",
    "        - cos_sim       : similarity embedding (chỉ để tham khảo)\n",
    "        - loc_sim       : similarity theo location_clean\n",
    "        - num_closeness : theo luuluong + diện tích\n",
    "        - final_score   : w_loc * loc_sim + w_numeric * num_closeness\n",
    "    Trả về DataFrame các neighbor, sort theo final_score giảm dần.\n",
    "    \"\"\"\n",
    "    # 1) embed query\n",
    "    q_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1, H)\n",
    "\n",
    "    # 2) cosine similarity với toàn bộ train\n",
    "    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)\n",
    "\n",
    "    # 3) lấy top_k index theo sims\n",
    "    top_k = min(top_k, len(train_df))\n",
    "    top_scores, top_idx = torch.topk(sims, k=top_k)\n",
    "\n",
    "    rows = []\n",
    "    for score, idx in zip(top_scores, top_idx):\n",
    "        score_val = float(score.item())\n",
    "        cand = train_df.iloc[int(idx)].copy()\n",
    "\n",
    "        loc_sim = location_similarity(q_row, cand)\n",
    "        num_c   = numeric_closeness(q_row, cand)\n",
    "\n",
    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "\n",
    "        cand[\"cos_sim\"]       = score_val\n",
    "        cand[\"loc_sim\"]       = loc_sim\n",
    "        cand[\"num_closeness\"] = num_c\n",
    "        cand[\"final_score\"]   = final_score\n",
    "        rows.append(cand)\n",
    "\n",
    "    neighbors_df = pd.DataFrame(rows).sort_values(\"final_score\", ascending=False)\n",
    "    return neighbors_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "f224a244",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Số mẫu sai trên HOLDOUT: 30\n",
      "\n",
      "====================================================================================================\n",
      "❌ CASE SAI #3\n",
      "  True label    : 3\n",
      "  Pred label    : 1\n",
      "  Tòa (Name)    : VIGALCERA\n",
      "  Location      : Khu vực ngoại cảnh\n",
      "  Lưu lượng     : 3\n",
      "  DT ngoài      : 6000\n",
      "  DT trong      : 4520\n",
      "  Job raw       : - Làm vệ sinh khu vực đài phun nước\n",
      "  job_segmented : vệ_sinh khu_vực đài phun\n",
      "  location_clean: khu vực ngoại cảnh\n",
      "\n",
      "  → Top 10 hàng xóm trong train (sorted theo final_score):\n",
      "label         Name                           Location  luuluong  Dientichngoai  Dientichtrong                                                                                                                                Job                                                                         job_segmented  cos_sim  loc_sim  num_closeness  final_score\n",
      "    1         BIDV                 Khu vực Ngoại cảnh         3           1144          11200                                                                                                          Vệ sinh họng rác (nếu có)                                                                      vệ_sinh họng rác 0.694930 1.000000       0.710878     0.884351\n",
      "    1         BIDV                Khu vực Nhà vệ sinh         3           1144          11200                  Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước.               sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van 0.690451 0.285714       0.710878     0.455780\n",
      "    1 CenterPoint       KHU VỰC NHÀ VỆ SINH CÔNG CỘNG         3            400           5379   Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.222222       0.758955     0.436915\n",
      "    1    Hong Kong KHU VỰC NHÀ VỆ SINH (Ban quản lý )         2           9950          25630 · Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.200000       0.549676     0.339870\n",
      "    1          HCO                        Nhà vệ sinh         3            800          13000                                                                                       Lau thùng đựng rác, thiết bị thoát nước thải                                           lau thùng đựng rác thiết_bị thoát nước_thải 0.706144 0.000000       0.680405     0.272162\n"
     ]
    }
   ],
   "source": [
    "# Lọc các case dự đoán sai trên holdout\n",
    "mis_holdout = holdout_df_pred[\n",
    "    holdout_df_pred[\"label\"].astype(str) != holdout_df_pred[\"pred_label\"].astype(str)\n",
    "].copy()\n",
    "\n",
    "print(\"Số mẫu sai trên HOLDOUT:\", len(mis_holdout))\n",
    "\n",
    "max_cases = 5   # in tối đa 5 case cho đỡ dài, bạn có thể tăng số này\n",
    "\n",
    "for i, (_, row) in enumerate(mis_holdout.iterrows(), start=1):\n",
    "    if i > max_cases:\n",
    "        break\n",
    "    if i!=3:\n",
    "        continue\n",
    "    print(\"\\n\" + \"=\"*100)\n",
    "    print(f\"❌ CASE SAI #{i}\")\n",
    "    print(f\"  True label    : {row['label']}\")\n",
    "    print(f\"  Pred label    : {row['pred_label']}\")\n",
    "    print(f\"  Tòa (Name)    : {row.get('Name', 'N/A')}\")\n",
    "    print(f\"  Location      : {row['Location']}\")\n",
    "    print(f\"  Lưu lượng     : {row.get('luuluong', 'N/A')}\")\n",
    "    print(f\"  DT ngoài      : {row.get('Dientichngoai', 'N/A')}\")\n",
    "    print(f\"  DT trong      : {row.get('Dientichtrong', 'N/A')}\")\n",
    "    print(\"  Job raw       :\", row[\"Job\"])\n",
    "    print(\"  job_segmented :\", row[\"job_segmented\"])\n",
    "    print(\"  location_clean:\", row[\"location_clean\"])\n",
    "\n",
    "    # Lấy top neighbors cho case này\n",
    "    neighbors = get_top_neighbors_for_row(\n",
    "        row,\n",
    "        train_df=train_df,\n",
    "        train_embeddings=train_embeddings,\n",
    "        tokenizer=hf_tokenizer,\n",
    "        model=hf_model,\n",
    "        top_k=5,       # số ứng viên lấy theo embedding\n",
    "        w_numeric=0.4,\n",
    "        w_loc=0.6\n",
    "    )\n",
    "\n",
    "    print(\"\\n  → Top 10 hàng xóm trong train (sorted theo final_score):\")\n",
    "    cols_show = [\n",
    "        \"label\", \"Name\", \"Location\",\n",
    "        \"luuluong\", \"Dientichngoai\", \"Dientichtrong\",\n",
    "        \"Job\", \"job_segmented\",\n",
    "        \"cos_sim\", \"loc_sim\", \"num_closeness\", \"final_score\"\n",
    "    ]\n",
    "    # Chỉ in cột nào thực sự tồn tại (phòng trường hợp thiếu)\n",
    "    cols_show = [c for c in cols_show if c in neighbors.columns]\n",
    "\n",
    "    print(neighbors[cols_show].head(10).to_string(index=False))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tainl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/vietnamese-stopwords-dash.txt
+++ b/vietnamese-stopwords-dash.txt