HoanMy_DetectTansuat/felix3.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "17db49aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fc17f632",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw task sheet; the path is relative to the notebook's working directory.\n",
    "df = pd.read_excel(\"hoanmy_detect_task.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6ee83cd9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Location</th>\n",
       "      <th>Job</th>\n",
       "      <th>label</th>\n",
       "      <th>Name</th>\n",
       "      <th>luuluong</th>\n",
       "      <th>Dientichngoai</th>\n",
       "      <th>Dientichtrong</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
       "      <td>2</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Location                                                Job  \\\n",
       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
       "\n",
       "  label  Name  luuluong  Dientichngoai  Dientichtrong  \n",
       "0     1  BIDV         3           1144          11200  \n",
       "1     1  BIDV         3           1144          11200  \n",
       "2     1  BIDV         3           1144          11200  \n",
       "3     1  BIDV         3           1144          11200  \n",
       "4     2  BIDV         3           1144          11200  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the raw frame.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c4616ce2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pyvi import ViTokenizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a479ccda",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import unicodedata\n",
    "\n",
    "def normalize_text_keep_words(s: str) -> str:\n",
    "    \"\"\"Return a lowercase, NFC-normalised copy of ``s`` with punctuation removed.\n",
    "\n",
    "    The input is coerced with ``str()``. Every character other than a digit,\n",
    "    ``a``-``z``, a code point in the range ``à``..``ỹ`` or whitespace is replaced\n",
    "    by a space; whitespace runs are then collapsed and the ends trimmed.\n",
    "    No stopword filtering happens here.\n",
    "    \"\"\"\n",
    "    lowered = unicodedata.normalize(\"NFC\", str(s)).lower()\n",
    "    # Swap disallowed characters for spaces so neighbouring words stay separated.\n",
    "    spaced = re.sub(r\"[^0-9a-zà-ỹ\\s]\", \" \", lowered)\n",
    "    return re.sub(r\"\\s+\", \" \", spaced).strip()\n",
    "\n",
    "def preprocess_step1_no_stopwords(df_raw: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Filter rows to labels 1-4 and add cleaned text columns.\n",
    "\n",
    "    Args:\n",
    "        df_raw: Raw DataFrame with at least the columns 'Location', 'Job', 'label'.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame (``df_raw`` is not modified) keeping every input column, with\n",
    "        'label' as a stripped string in {\"1\", \"2\", \"3\", \"4\"}, plus:\n",
    "          - location_clean, job_clean: normalize_text_keep_words output\n",
    "          - combined_text: f\"{location_clean} {job_clean}\", stripped\n",
    "    \"\"\"\n",
    "    # 1) Drop missing labels and keep only {1,2,3,4}.\n",
    "    df = df_raw.dropna(subset=[\"label\"]).copy()\n",
    "    # If the label column was parsed as float, 1 is rendered as \"1.0\"; remove that\n",
    "    # suffix, otherwise the membership test below would discard valid rows.\n",
    "    df[\"label\"] = (\n",
    "        df[\"label\"].astype(str).str.strip().str.replace(r\"\\.0+$\", \"\", regex=True)\n",
    "    )\n",
    "    df = df[df[\"label\"].isin([\"1\", \"2\", \"3\", \"4\"])].copy()\n",
    "\n",
    "    # 2) Fill gaps BEFORE casting: astype(str) alone turns NaN into the literal\n",
    "    #    text \"nan\", which would then leak into combined_text as a token.\n",
    "    df[\"Location\"] = df[\"Location\"].fillna(\"Unknown\").astype(str)\n",
    "    df[\"Job\"] = df[\"Job\"].fillna(\"\").astype(str)\n",
    "\n",
    "    # 3) Clean the text (stopwords are NOT removed yet).\n",
    "    df[\"location_clean\"] = df[\"Location\"].apply(normalize_text_keep_words)\n",
    "    df[\"job_clean\"]      = df[\"Job\"].apply(normalize_text_keep_words)\n",
    "\n",
    "    # 4) Concatenate location first, job second; strip covers an empty side.\n",
    "    df[\"combined_text\"] = (df[\"location_clean\"] + \" \" + df[\"job_clean\"]).str.strip()\n",
    "\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1f2e1ad3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run step 1 on the raw frame: label filtering plus cleaned text columns.\n",
    "df_out = preprocess_step1_no_stopwords(df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b742ec0b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>combined_text</th>\n",
       "      <th>tokenized_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>khu vực ngoại cảnh quét lá rụng thu gom rác lố...</td>\n",
       "      <td>khu_vực ngoại_cảnh quét lá rụng thu_gom rác lố...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>khu vực ngoại cảnh nhặt rác bồn hoa cây cảnh l...</td>\n",
       "      <td>khu_vực ngoại_cảnh nhặt rác bồn hoa cây_cảnh s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>khu vực ngoại cảnh vệ sinh gạt tàn thùng rác</td>\n",
       "      <td>khu_vực ngoại_cảnh_vệ_sinh gạt_tàn thùng rác</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>khu vực ngoại cảnh lau các biển quảng cáo biển...</td>\n",
       "      <td>khu_vực ngoại_cảnh lau biển quảng_cáo biển chỉ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>khu vực ngoại cảnh lau tường đá và kính bên ng...</td>\n",
       "      <td>khu_vực ngoại_cảnh lau tường đá kính tòa 4m</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>khu vực ngoại cảnh vệ sinh họng rác nếu có</td>\n",
       "      <td>khu_vực ngoại_cảnh_vệ_sinh họng rác</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>khu vực ngoại cảnh phun rửa sân bằng máy phun ...</td>\n",
       "      <td>khu_vực ngoại_cảnh phun rửa sân máy phun áp_lực</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>khu vực ngoại cảnh khu vực tập kết rác thải củ...</td>\n",
       "      <td>khu_vực ngoại_cảnh khu_vực tập_kết rác_thải tòa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>khu vực ngoại cảnh làm sạch các chòi bảo vệ và...</td>\n",
       "      <td>khu_vực ngoại_cảnh sạch chòi bảo_vệ thùng rác ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>khu vực ngoại cảnh lau kính mặt dưới mái kính ...</td>\n",
       "      <td>khu_vực ngoại_cảnh lau kính mặt mái kính sảnh ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       combined_text  \\\n",
       "0  khu vực ngoại cảnh quét lá rụng thu gom rác lố...   \n",
       "1  khu vực ngoại cảnh nhặt rác bồn hoa cây cảnh l...   \n",
       "2       khu vực ngoại cảnh vệ sinh gạt tàn thùng rác   \n",
       "3  khu vực ngoại cảnh lau các biển quảng cáo biển...   \n",
       "4  khu vực ngoại cảnh lau tường đá và kính bên ng...   \n",
       "5         khu vực ngoại cảnh vệ sinh họng rác nếu có   \n",
       "6  khu vực ngoại cảnh phun rửa sân bằng máy phun ...   \n",
       "7  khu vực ngoại cảnh khu vực tập kết rác thải củ...   \n",
       "8  khu vực ngoại cảnh làm sạch các chòi bảo vệ và...   \n",
       "9  khu vực ngoại cảnh lau kính mặt dưới mái kính ...   \n",
       "\n",
       "                                      tokenized_text  \n",
       "0  khu_vực ngoại_cảnh quét lá rụng thu_gom rác lố...  \n",
       "1  khu_vực ngoại_cảnh nhặt rác bồn hoa cây_cảnh s...  \n",
       "2       khu_vực ngoại_cảnh_vệ_sinh gạt_tàn thùng rác  \n",
       "3  khu_vực ngoại_cảnh lau biển quảng_cáo biển chỉ...  \n",
       "4        khu_vực ngoại_cảnh lau tường đá kính tòa 4m  \n",
       "5                khu_vực ngoại_cảnh_vệ_sinh họng rác  \n",
       "6    khu_vực ngoại_cảnh phun rửa sân máy phun áp_lực  \n",
       "7    khu_vực ngoại_cảnh khu_vực tập_kết rác_thải tòa  \n",
       "8  khu_vực ngoại_cảnh sạch chòi bảo_vệ thùng rác ...  \n",
       "9  khu_vực ngoại_cảnh lau kính mặt mái kính sảnh ...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from pyvi import ViTokenizer\n",
    "\n",
    "# === 1. Read stopwords from a txt file ===\n",
    "def load_stopwords(path):\n",
    "    \"\"\"Load a stopword list from a text file into a set.\n",
    "\n",
    "    Args:\n",
    "        path: Path of a UTF-8 text file holding one word/phrase per line.\n",
    "\n",
    "    Returns:\n",
    "        set[str]: The stripped, NFC-normalised, non-empty lines.\n",
    "\n",
    "    Raises:\n",
    "        OSError: If the file cannot be opened.\n",
    "    \"\"\"\n",
    "    import unicodedata  # local import keeps this cell runnable on its own\n",
    "\n",
    "    # \"utf-8-sig\" drops a leading BOM (plain UTF-8 decodes the same way), so the\n",
    "    # first entry is not stored with a \\ufeff prefix and silently never matched.\n",
    "    with open(path, \"r\", encoding=\"utf-8-sig\") as f:\n",
    "        # The text being filtered is NFC-normalised by normalize_text_keep_words;\n",
    "        # normalise entries the same way so identical-looking words compare equal.\n",
    "        return {\n",
    "            unicodedata.normalize(\"NFC\", entry)\n",
    "            for entry in (line.strip() for line in f)\n",
    "            if entry\n",
    "        }\n",
    "\n",
    "# Module-level stopword set; segment_and_remove_stopwords reads it as a global.\n",
    "stopwords_path = \"vietnamese-stopwords-dash.txt\"  # path to your stopword file\n",
    "vietnamese_stopwords = load_stopwords(stopwords_path)\n",
    "\n",
    "# === 2. Word segmentation and stopword removal ===\n",
    "def segment_and_remove_stopwords(text):\n",
    "    \"\"\"Segment ``text`` with PyVi and drop tokens found in ``vietnamese_stopwords``.\n",
    "\n",
    "    Non-string input yields an empty string. The surviving tokens are returned\n",
    "    joined by single spaces.\n",
    "    \"\"\"\n",
    "    if isinstance(text, str):\n",
    "        words = ViTokenizer.tokenize(text).split()\n",
    "        return \" \".join(w for w in words if w not in vietnamese_stopwords)\n",
    "    return \"\"\n",
    "\n",
    "# === 3. Apply to the preprocessed data (df_out from step 1) ===\n",
    "# df_out = preprocess_step1_no_stopwords(df_raw)  # from step 1\n",
    "df_out[\"tokenized_text\"] = df_out[\"combined_text\"].apply(segment_and_remove_stopwords)\n",
    "\n",
    "# === 4. Inspect a few result rows ===\n",
    "df_out[[\"combined_text\", \"tokenized_text\"]].head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1e823e7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_token_khu_vuc(text):\n",
    "    \"\"\"Drop every whitespace-separated token equal to \"khu_vực\" from ``text``.\n",
    "\n",
    "    Non-string input yields an empty string; the remaining tokens are re-joined\n",
    "    with single spaces.\n",
    "    \"\"\"\n",
    "    if isinstance(text, str):\n",
    "        return \" \".join(tok for tok in text.split() if tok != \"khu_vực\")\n",
    "    return \"\"\n",
    "\n",
    "# Strip the \"khu_vực\" token from every row, overwriting tokenized_text in place.\n",
    "df_out[\"tokenized_text\"] = df_out[\"tokenized_text\"].apply(remove_token_khu_vuc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "d77bcefe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Location', 'Job', 'label', 'Name', 'luuluong', 'Dientichngoai',\n",
       "       'Dientichtrong', 'location_clean', 'job_clean', 'combined_text',\n",
       "       'tokenized_text'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the columns now present on df_out.\n",
    "df_out.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "c11582b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Số dòng sau khi giữ label 1-3: (769, 11)\n",
      "label\n",
      "1    440\n",
      "2    223\n",
      "3    106\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Keep only labels 1, 2 and 3. This rebinds df_out, so rows with any other\n",
    "# label are gone for every cell executed afterwards.\n",
    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n",
    "\n",
    "print(\"Số dòng sau khi giữ label 1-3:\", df_out.shape)\n",
    "print(df_out[\"label\"].value_counts())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "a5dd7cf9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: (536, 11)\n",
      "Test : (172, 11)\n",
      "Hold-out: (61, 11)\n",
      "\n",
      "Số lượng theo từng tập:\n",
      "Train buildings: ['BIDV' 'CenterPoint ' 'HCO' 'Hong Kong' 'Sunred' '138A Giang Vo'\n",
      " 'VIGALCERA']\n",
      "Test  buildings: ['CMC' 'Keangnam']\n",
      "Hold-out building: ['HH4']\n"
     ]
    }
   ],
   "source": [
    "# ==== STEP 1: keep labels 1-3 only (a no-op if df_out is already filtered) ====\n",
    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n",
    "\n",
    "# ==== STEP 2: choose the test buildings and the hold-out building ====\n",
    "test_buildings = [\"Keangnam\", \"CMC\"]          # main test set\n",
    "holdout_building = [\"HH4\"]                  # external check set (change the building name as needed)\n",
    "\n",
    "# ==== STEP 3: split the data by building ====\n",
    "# Match on whitespace-stripped names: the sheet holds padded values such as\n",
    "# 'CenterPoint ', and a padded test/hold-out name would otherwise miss isin()\n",
    "# and fall through to the training split. The Name column itself is untouched.\n",
    "_name_key = df_out[\"Name\"].astype(str).str.strip()\n",
    "_is_test = _name_key.isin(test_buildings)\n",
    "_is_holdout = _name_key.isin(holdout_building)\n",
    "\n",
    "# Test set: the two main buildings\n",
    "test_df = df_out[_is_test].reset_index(drop=True)\n",
    "\n",
    "# Hold-out set: the separate building\n",
    "holdout_df = df_out[_is_holdout].reset_index(drop=True)\n",
    "\n",
    "# Train set: everything else\n",
    "train_df = df_out[~(_is_test | _is_holdout)].reset_index(drop=True)\n",
    "\n",
    "# The three splits must partition df_out with no building shared between them.\n",
    "assert not (_is_test & _is_holdout).any(), \"a building is listed in both test and hold-out\"\n",
    "assert len(train_df) + len(test_df) + len(holdout_df) == len(df_out)\n",
    "\n",
    "print(\"Train:\", train_df.shape)\n",
    "print(\"Test :\", test_df.shape)\n",
    "print(\"Hold-out:\", holdout_df.shape)\n",
    "\n",
    "print(\"\\nSố lượng theo từng tập:\")\n",
    "print(\"Train buildings:\", train_df[\"Name\"].unique())\n",
    "print(\"Test  buildings:\", test_df[\"Name\"].unique())\n",
    "print(\"Hold-out building:\", holdout_df[\"Name\"].unique())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "749be883",
   "metadata": {},
   "outputs": [],
   "source": [
    "def location_similarity(q_row, cand_row):\n",
    "    \"\"\"Jaccard overlap of the whitespace tokens of two rows' ``location_clean``.\n",
    "\n",
    "    Both values are coerced with ``str()`` before splitting. The result lies in\n",
    "    [0, 1] (higher means more similar) and is 0.0 when either side has no tokens.\n",
    "    \"\"\"\n",
    "    query_words, cand_words = (\n",
    "        set(str(row[\"location_clean\"]).split()) for row in (q_row, cand_row)\n",
    "    )\n",
    "    if query_words and cand_words:\n",
    "        return len(query_words & cand_words) / len(query_words | cand_words)\n",
    "    return 0.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "760cbf42",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModel\n",
    "import torch\n",
    "\n",
    "# Use the GPU when torch can see one, otherwise run on CPU.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# Load tokenizer and encoder from the same checkpoint; the model is moved to `device`.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"dangvantuan/vietnamese-embedding\")\n",
    "model = AutoModel.from_pretrained(\"dangvantuan/vietnamese-embedding\").to(device)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "81ab2321",
   "metadata": {},
   "outputs": [],
   "source": [
    "def embed_text(text, tokenizer, model, device=device):\n",
    "    \"\"\"Embed ``text`` by attention-masked mean pooling of the last hidden state.\n",
    "\n",
    "    ``text`` is tokenized with padding and truncation at 128 tokens, the encoded\n",
    "    batch is moved to ``device`` and the model runs under ``torch.no_grad()``.\n",
    "    Hidden states at masked-out positions are zeroed before averaging over the\n",
    "    sequence dimension.\n",
    "\n",
    "    Returns:\n",
    "        A CPU tensor of shape (batch, hidden); a single string gives one row.\n",
    "    \"\"\"\n",
    "    batch = tokenizer(\n",
    "        text,\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        max_length=128,\n",
    "        return_tensors=\"pt\",\n",
    "    ).to(device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        hidden = model(**batch).last_hidden_state    # [batch, seq_len, hidden]\n",
    "\n",
    "    # Broadcast the attention mask across the hidden dimension.\n",
    "    weights = batch[\"attention_mask\"].unsqueeze(-1).expand(hidden.size()).float()\n",
    "    pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1)\n",
    "    return pooled.cpu()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "afee70b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed the training texts one at a time (one forward pass per row).\n",
    "train_texts = train_df[\"tokenized_text\"].tolist()\n",
    "\n",
    "train_embeddings = []\n",
    "for txt in train_texts:\n",
    "    vec = embed_text(txt, tokenizer, model)\n",
    "    train_embeddings.append(vec.squeeze(0))   # drop the leading batch dimension\n",
    "\n",
    "# Row i corresponds to train_df.iloc[i]; predict_label_for_row relies on that order.\n",
    "train_embeddings = torch.stack(train_embeddings)   # shape: (N, hidden)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "b484bf16",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cosine_sim(a, b, eps=1e-12):\n",
    "    \"\"\"Pairwise cosine similarity between the rows of two 2-D tensors.\n",
    "\n",
    "    Args:\n",
    "        a: Tensor of shape (M, H).\n",
    "        b: Tensor of shape (N, H).\n",
    "        eps: Lower bound applied to each row norm before dividing.\n",
    "\n",
    "    Returns:\n",
    "        Tensor of shape (M, N) where entry [i, j] is cos(a[i], b[j]).\n",
    "    \"\"\"\n",
    "    # Without the clamp an all-zero row computes 0/0 and yields NaN, which then\n",
    "    # propagates into every ranking built on these similarities.\n",
    "    a = a / a.norm(dim=-1, keepdim=True).clamp_min(eps)\n",
    "    b = b / b.norm(dim=-1, keepdim=True).clamp_min(eps)\n",
    "    return torch.mm(a, b.t())   # (M,H) x (H,N) = (M,N)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "0e04d882",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):\n",
    "    \"\"\"Score how close two rows are on 'luuluong' and the two area columns.\n",
    "\n",
    "    The result is 0.5 * flow + 0.25 * outer + 0.25 * inner, where\n",
    "      - flow ('luuluong'): 1.0 when equal, 0.6 when the values differ by\n",
    "        exactly 1, otherwise 0.3\n",
    "      - outer ('Dientichngoai') / inner ('Dientichtrong'):\n",
    "        exp(-alpha * |log1p(q) - log1p(cand)|) with alpha_out / alpha_in\n",
    "    \"\"\"\n",
    "    q_flow, c_flow = q_row[\"luuluong\"], cand_row[\"luuluong\"]\n",
    "    c_luu = 1.0 if q_flow == c_flow else (0.6 if abs(q_flow - c_flow) == 1 else 0.3)\n",
    "\n",
    "    def _area_term(column, alpha):\n",
    "        # Distance on a log scale, mapped into (0, 1] by the exponential decay.\n",
    "        gap = abs(np.log1p(q_row[column]) - np.log1p(cand_row[column]))\n",
    "        return np.exp(-alpha * gap)\n",
    "\n",
    "    return (\n",
    "        0.5 * c_luu\n",
    "        + 0.25 * _area_term(\"Dientichngoai\", alpha_out)\n",
    "        + 0.25 * _area_term(\"Dientichtrong\", alpha_in)\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "9b5be7ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_label_for_row(q_row, train_df, train_embeddings,\n",
    "                          tokenizer, model,\n",
    "                          top_k=10, w_numeric=0.4, w_loc=0.6):\n",
    "    \"\"\"Predict a label for one query row with a two-stage neighbour vote.\n",
    "\n",
    "    Stage 1: embed q_row[\"tokenized_text\"] and take the top_k training rows by\n",
    "    cosine similarity. The similarity is used ONLY to select candidates.\n",
    "    Stage 2: each candidate adds\n",
    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "    to the running total of its own label, where\n",
    "        - loc_sim: location_similarity(q_row, cand_row), in [0, 1]\n",
    "        - num_c  : numeric_closeness(q_row, cand_row), at most 1.0\n",
    "    The label with the largest total wins.\n",
    "\n",
    "    Args:\n",
    "        q_row: Row providing tokenized_text, location_clean, luuluong,\n",
    "            Dientichngoai and Dientichtrong.\n",
    "        train_df: Training frame; row i must correspond to train_embeddings[i].\n",
    "        train_embeddings: Tensor with one embedding per training row.\n",
    "        tokenizer, model: Passed through to embed_text.\n",
    "        top_k: Number of neighbours to vote (capped at the training size).\n",
    "        w_numeric, w_loc: Weights of the numeric and location terms.\n",
    "\n",
    "    Returns:\n",
    "        (best_label, label_scores): the winning label as a string and the\n",
    "        per-label score totals; label_scores is {} when no neighbour voted.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If train_df and train_embeddings have different row counts.\n",
    "    \"\"\"\n",
    "    # Guard against stale kernel state: a re-split train_df with old embeddings\n",
    "    # would silently map neighbour indices to the wrong rows.\n",
    "    n_train = len(train_df)\n",
    "    if train_embeddings.shape[0] != n_train:\n",
    "        raise ValueError(\n",
    "            f\"train_embeddings has {train_embeddings.shape[0]} rows but train_df has \"\n",
    "            f\"{n_train}; recompute the embeddings after changing train_df\"\n",
    "        )\n",
    "\n",
    "    # 1) Embed the query.\n",
    "    q_vec = embed_text(q_row[\"tokenized_text\"], tokenizer, model)  # (1, H)\n",
    "\n",
    "    # 2) Cosine similarity against all training rows, for candidate selection only.\n",
    "    sims = cosine_sim(q_vec, train_embeddings)[0]  # (N,)\n",
    "\n",
    "    # 3) Positions of the top-k nearest training rows.\n",
    "    top_k = min(top_k, n_train)\n",
    "    top_idx = torch.topk(sims, k=top_k).indices\n",
    "\n",
    "    label_scores = {}\n",
    "\n",
    "    for idx in top_idx.tolist():\n",
    "        cand_row = train_df.iloc[idx]\n",
    "\n",
    "        # Location similarity carries the larger default weight.\n",
    "        loc_sim = location_similarity(q_row, cand_row)\n",
    "\n",
    "        # Numeric closeness (luuluong + areas).\n",
    "        num_c = numeric_closeness(q_row, cand_row)\n",
    "\n",
    "        # Vote contributed by this neighbour.\n",
    "        final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "\n",
    "        lbl = str(cand_row[\"label\"])\n",
    "        label_scores[lbl] = label_scores.get(lbl, 0.0) + final_score\n",
    "\n",
    "    # Fallback when no candidate voted: the most frequent training label.\n",
    "    if not label_scores:\n",
    "        majority_label = str(train_df[\"label\"].value_counts().idxmax())\n",
    "        return majority_label, {}\n",
    "\n",
    "    best_label = max(label_scores, key=label_scores.get)\n",
    "    return best_label, label_scores\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "8872e2bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "preds = []\n",
    "scores_debug = []\n",
    "\n",
    "# Predict row by row; the keyword arguments below override the function's\n",
    "# defaults (top_k=10, w_numeric=0.4, w_loc=0.6).\n",
    "for _, row in test_df.iterrows():\n",
    "    pred, sc = predict_label_for_row(\n",
    "        row,\n",
    "        train_df,\n",
    "        train_embeddings,\n",
    "        tokenizer,\n",
    "        model,\n",
    "        top_k=5,\n",
    "        w_numeric=0.3,\n",
    "        w_loc=0.7\n",
    "    )\n",
    "    preds.append(pred)\n",
    "    scores_debug.append(sc)\n",
    "\n",
    "# Store the predictions and the per-label score dicts as new columns on test_df.\n",
    "test_df[\"pred_label_hf\"] = preds\n",
    "test_df[\"score_details\"] = scores_debug\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "cbe8bcd2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1      0.804     0.857     0.829       105\n",
      "           2      0.600     0.447     0.512        47\n",
      "           3      0.400     0.500     0.444        20\n",
      "\n",
      "    accuracy                          0.703       172\n",
      "   macro avg      0.601     0.601     0.595       172\n",
      "weighted avg      0.701     0.703     0.698       172\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# Both sides are cast to str so true and predicted labels share one dtype.\n",
    "print(classification_report(\n",
    "    test_df[\"label\"].astype(str),\n",
    "    test_df[\"pred_label_hf\"].astype(str),\n",
    "    digits=3\n",
    "))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "90b212ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Số mẫu dự đoán sai: 53\n",
      "label pred_label_hf Name                                                                    Location  luuluong  Dientichngoai  Dientichtrong                                                      Job                                                                  tokenized_text\n",
      "    3             1  CMC                                       Khu vực ngoại cảnh, vỉa hè xung quanh         1            900           5235 Lau sạch các biển báo, biển tên công ty (độ cao dưới 4m)               ngoại_cảnh vỉa_hè xung_quanh lau sạch biển_báo biển công_ty độ 4m\n",
      "    3             1  CMC                                       Khu vực ngoại cảnh, vỉa hè xung quanh         1            900           5235                               Vệ sinh lối ra vào, vỉa hè                                 ngoại_cảnh vỉa_hè xung_quanh vệ_sinh lối vỉa_hè\n",
      "    3             2  CMC                                       Khu vực ngoại cảnh, vỉa hè xung quanh         1            900           5235                                  Vệ sinh rãnh thoát nước                                 ngoại_cảnh vỉa_hè xung_quanh vệ_sinh rãnh thoát\n",
      "    2             1  CMC                                         Khu vực sảnh chính tầng 1 và tầng G         1            900           5235                      Lau cửa ra vào, vách kính (dưới 4m)                                         sảnh tầng 1 tầng g lau cửa vách kính 4m\n",
      "    2             3  CMC                                         Khu vực sảnh chính tầng 1 và tầng G         1            900           5235                                           Quét mạng nhện                                               sảnh tầng 1 tầng g quét mạng_nhện\n",
      "    1             2  CMC Khu vực hành lang công cộng sảnh chờ thang máy (từ tầng hầm B2 đến tầng 18)         1            900           5235                     Tẩy điểm các vết bẩn tường (dưới 4m) hành_lang công_cộng sảnh chờ thang_máy tầng hầm b2 tầng 18 tẩy vết bẩn tường 4m\n",
      "    3             1  CMC Khu vực hành lang công cộng sảnh chờ thang máy (từ tầng hầm B2 đến tầng 18)         1            900           5235                                           Quét mạng nhện       hành_lang công_cộng sảnh chờ thang_máy tầng hầm b2 tầng 18 quét mạng_nhện\n",
      "    2             1  CMC                                                       Thang máy (liên tầng)         1            900           5235               Lau bụi trần, đèn trần bên trong thang máy                             thang_máy liên_tầng lau bụi_trần đèn trần thang_máy\n",
      "    1             2  CMC                                Cầu thang bộ và thang thoát hiểm (liên tầng)         1            900           5235                                        Lau bậc cầu thang                          cầu_thang thang thoát hiểm_liên tầng lau bậc cầu_thang\n",
      "    1             2  CMC                                Cầu thang bộ và thang thoát hiểm (liên tầng)         1            900           5235                             Lau mặt bậc thang thoát hiểm               cầu_thang thang thoát hiểm_liên tầng lau mặt bậc thang thoát hiểm\n"
     ]
    }
   ],
   "source": [
    "# Select the misclassified rows\n",
    "mis_df = test_df[\n",
    "    test_df[\"label\"].astype(str) != test_df[\"pred_label_hf\"].astype(str)\n",
    "].copy()\n",
    "\n",
    "print(\"Số mẫu dự đoán sai:\", len(mis_df))\n",
    "\n",
    "# Show only the first 10 rows to keep the output short\n",
    "cols_show = [\n",
    "    \"label\", \"pred_label_hf\",\n",
    "    \"Name\", \"Location\",\n",
    "    \"luuluong\", \"Dientichngoai\", \"Dientichtrong\",\n",
    "    \"Job\", \"tokenized_text\"\n",
    "]\n",
    "\n",
    "print(mis_df[cols_show].head(10).to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "6edef39a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "================================================================================\n",
      "❌ CASE #1\n",
      "  True label    : 3\n",
      "  Pred label    : 1\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực ngoại cảnh, vỉa hè xung quanh\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Lau sạch các biển báo, biển tên công ty (độ cao dưới 4m)\n",
      "  tokenized_text: ngoại_cảnh vỉa_hè xung_quanh lau sạch biển_báo biển công_ty độ 4m\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #2\n",
      "  True label    : 3\n",
      "  Pred label    : 1\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực ngoại cảnh, vỉa hè xung quanh\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Vệ sinh lối ra vào, vỉa hè\n",
      "  tokenized_text: ngoại_cảnh vỉa_hè xung_quanh vệ_sinh lối vỉa_hè\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #3\n",
      "  True label    : 3\n",
      "  Pred label    : 2\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực ngoại cảnh, vỉa hè xung quanh\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Vệ sinh rãnh thoát nước\n",
      "  tokenized_text: ngoại_cảnh vỉa_hè xung_quanh vệ_sinh rãnh thoát\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #4\n",
      "  True label    : 2\n",
      "  Pred label    : 1\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực sảnh chính tầng 1 và tầng G\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Lau cửa ra vào, vách kính (dưới 4m)\n",
      "  tokenized_text: sảnh tầng 1 tầng g lau cửa vách kính 4m\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #5\n",
      "  True label    : 2\n",
      "  Pred label    : 3\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực sảnh chính tầng 1 và tầng G\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Quét mạng nhện\n",
      "  tokenized_text: sảnh tầng 1 tầng g quét mạng_nhện\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #6\n",
      "  True label    : 1\n",
      "  Pred label    : 2\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực hành lang công cộng sảnh chờ thang máy (từ tầng hầm B2 đến tầng 18)\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Tẩy điểm các vết bẩn tường (dưới 4m)\n",
      "  tokenized_text: hành_lang công_cộng sảnh chờ thang_máy tầng hầm b2 tầng 18 tẩy vết bẩn tường 4m\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #7\n",
      "  True label    : 3\n",
      "  Pred label    : 1\n",
      "  Tòa           : CMC\n",
      "  Location      : Khu vực hành lang công cộng sảnh chờ thang máy (từ tầng hầm B2 đến tầng 18)\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Quét mạng nhện\n",
      "  tokenized_text: hành_lang công_cộng sảnh chờ thang_máy tầng hầm b2 tầng 18 quét mạng_nhện\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #8\n",
      "  True label    : 2\n",
      "  Pred label    : 1\n",
      "  Tòa           : CMC\n",
      "  Location      :  Thang máy (liên tầng)\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Lau bụi trần, đèn trần bên trong thang máy\n",
      "  tokenized_text: thang_máy liên_tầng lau bụi_trần đèn trần thang_máy\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #9\n",
      "  True label    : 1\n",
      "  Pred label    : 2\n",
      "  Tòa           : CMC\n",
      "  Location      : Cầu thang bộ và thang thoát hiểm (liên tầng)\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Lau bậc cầu thang\n",
      "  tokenized_text: cầu_thang thang thoát hiểm_liên tầng lau bậc cầu_thang\n",
      "\n",
      "================================================================================\n",
      "❌ CASE #10\n",
      "  True label    : 1\n",
      "  Pred label    : 2\n",
      "  Tòa           : CMC\n",
      "  Location      : Cầu thang bộ và thang thoát hiểm (liên tầng)\n",
      "  Lưu lượng     : 1\n",
      "  DT ngoài      : 900\n",
      "  DT trong      : 5235\n",
      "  Job raw       : Lau mặt bậc thang thoát hiểm\n",
      "  tokenized_text: cầu_thang thang thoát hiểm_liên tầng lau mặt bậc thang thoát hiểm\n"
     ]
    }
   ],
   "source": [
    "max_cases = 10  # đổi số nếu muốn xem nhiều hơn\n",
    "\n",
    "for i, (_, row) in enumerate(mis_df.iterrows(), start=1):\n",
    "    if i > max_cases:\n",
    "        break\n",
    "\n",
    "    print(\"\\n\" + \"=\"*80)\n",
    "    print(f\"❌ CASE #{i}\")\n",
    "    print(f\"  True label    : {row['label']}\")\n",
    "    print(f\"  Pred label    : {row['pred_label_hf']}\")\n",
    "    print(f\"  Tòa           : {row['Name']}\")\n",
    "    print(f\"  Location      : {row['Location']}\")\n",
    "    print(f\"  Lưu lượng     : {row['luuluong']}\")\n",
    "    print(f\"  DT ngoài      : {row['Dientichngoai']}\")\n",
    "    print(f\"  DT trong      : {row['Dientichtrong']}\")\n",
    "    print(\"  Job raw       :\", row['Job'])\n",
    "    print(\"  tokenized_text:\", row['tokenized_text'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "5935f240",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Số mẫu dự đoán sai: 54\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "\n",
    "mis_df = test_df[\n",
    "    test_df[\"label\"].astype(str) != test_df[\"pred_label_hf\"].astype(str)\n",
    "].copy()\n",
    "\n",
    "print(\"Số mẫu dự đoán sai:\", len(mis_df))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "2aaae928",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_neighbors(q_row,\n",
    "                      train_df,\n",
    "                      train_embeddings,\n",
    "                      tokenizer,\n",
    "                      model,\n",
    "                      top_k=10,\n",
    "                      w_numeric=0.3,\n",
    "                      w_loc=0.6):\n",
    "    q_vec = embed_text(q_row[\"tokenized_text\"], tokenizer, model)\n",
    "    sims = cosine_sim(q_vec, train_embeddings)[0]\n",
    "\n",
    "    top_k = min(top_k, len(train_df))\n",
    "    top_scores, top_idx = torch.topk(sims, k=top_k)\n",
    "\n",
    "    w_text = 1.0 - w_numeric - w_loc\n",
    "\n",
    "    rows = []\n",
    "    for score, idx in zip(top_scores, top_idx):\n",
    "        score = float(score.item())\n",
    "        cand = train_df.iloc[int(idx)].copy()\n",
    "\n",
    "        num_c   = numeric_closeness(q_row, cand)\n",
    "        loc_sim = location_similarity(q_row, cand)\n",
    "\n",
    "        final_score = (\n",
    "            w_loc    * loc_sim +\n",
    "            w_numeric * num_c +\n",
    "            w_text   * score\n",
    "        )\n",
    "\n",
    "        cand[\"cos_sim\"]       = score\n",
    "        cand[\"loc_sim\"]       = loc_sim\n",
    "        cand[\"num_closeness\"] = num_c\n",
    "        cand[\"final_score\"]   = final_score\n",
    "        rows.append(cand)\n",
    "\n",
    "    neighbors_df = pd.DataFrame(rows).sort_values(\"final_score\", ascending=False)\n",
    "    return neighbors_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "c9dca605",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "====================================================================================================\n",
      "❌ CASE SAI #50\n",
      "  True label      : 3\n",
      "  Pred label      : 2\n",
      "  Tòa (Name)      : Keangnam\n",
      "  Location        : KHU VỰC THANG BỘ (02 thang/ tháp)\n",
      "  Lưu lượng       : 2\n",
      "  DT ngoài (m2)   : 46056\n",
      "  DT trong (m2)   : 609673\n",
      "  Job raw         : · Lau quạt thông gió\n",
      "  tokenized_text  : thang 02 thang tháp lau quạt thông_gió\n",
      "\n",
      "  → Top 10 job tương đồng trong train:\n",
      "label         Name                           Location  luuluong  Dientichngoai  Dientichtrong                                                         Job                                         tokenized_text  cos_sim  num_closeness  final_score\n",
      "    2    Hong Kong        KHU VỰC THANG BỘ (4 thang )         2           9950          25630                                          · Lau lỗ thông gió                         thang 4 thang lau lỗ thông_gió 0.729290       0.612730     0.437822\n",
      "    1         BIDV     Khu vực thang bộ (02 thang bộ)         3           1144          11200            Làm sạch mặt ngoài cửa thông gió, biển hiệu tầng   thang 02 thang sạch mặt cửa thông gió biển_hiệu_tầng 0.819478       0.334063     0.396500\n",
      "    2          HH4   CẦU THANG BỘ VÀ THANG THOÁT HIỂM         2          43000           3000                                           Lau kính lấy sáng                    cầu_thang thang thoát hiểm_lau kính 0.614302       0.744329     0.356928\n",
      "    2          HH4   CẦU THANG BỘ VÀ THANG THOÁT HIỂM         2          43000           3000                                   Làm sạch thang thoát hiểm       cầu_thang thang thoát hiểm sạch thang thoát hiểm 0.576035       0.744329     0.334694\n",
      "    1         BIDV     Khu vực thang bộ (02 thang bộ)         3           1144          11200                                       Làm sạch bậc thang bộ                          thang 02 thang sạch bậc thang 0.673408       0.334063     0.325825\n",
      "    3         BIDV     Khu vực thang bộ (02 thang bộ)         3           1144          11200 Làm sạch các vách kính thang bộ (bên trong và cao dưới 4m).                 thang 02 thang sạch vách kính thang 4m 0.655837       0.334063     0.317323\n",
      "    2         BIDV     Khu vực thang bộ (02 thang bộ)         3           1144          11200          Làm sạch bên ngoài các hộp kỹ thuật, bình cứu hỏa.          thang 02 thang sạch hộp kỹ_thuật bình cứu_hỏa 0.630313       0.334063     0.304973\n",
      "    1         BIDV     Khu vực thang bộ (02 thang bộ)         3           1144          11200                                  Làm sạch tay vịn cầu thang                  thang 02 thang sạch tay_vịn cầu_thang 0.588724       0.334063     0.284851\n",
      "    2 CenterPoint  KHU VỰC THANG THOÁT HIỂM+ THANG BỘ         3            400           5379                                            Lau lỗ thông gió                thang thoát hiểm_thang lau lỗ thông_gió 0.708677       0.318151     0.264128\n",
      "    3         BIDV                 Khu vực Ngoại cảnh         3           1144          11200          Lau kính mặt dưới mái kính sảnh (02 mái kính sảnh) ngoại_cảnh lau kính mặt mái kính sảnh 02 mái kính sảnh 0.590167       0.334063     0.182270\n"
     ]
    }
   ],
   "source": [
    "max_cases = 54  # đổi nếu muốn xem nhiều hơn\n",
    "\n",
    "for i, (_, row) in enumerate(mis_df.iterrows(), start=1):\n",
    "    if i > max_cases:\n",
    "        break\n",
    "    if i!=50:\n",
    "       continue     \n",
    "    print(\"\\n\" + \"=\"*100)\n",
    "    print(f\"❌ CASE SAI #{i}\")\n",
    "    print(f\"  True label      : {row['label']}\")\n",
    "    print(f\"  Pred label      : {row['pred_label_hf']}\")\n",
    "    print(f\"  Tòa (Name)      : {row['Name']}\")\n",
    "    print(f\"  Location        : {row['Location']}\")\n",
    "    print(f\"  Lưu lượng       : {row['luuluong']}\")\n",
    "    print(f\"  DT ngoài (m2)   : {row['Dientichngoai']}\")\n",
    "    print(f\"  DT trong (m2)   : {row['Dientichtrong']}\")\n",
    "    print(\"  Job raw         :\", row[\"Job\"])\n",
    "    print(\"  tokenized_text  :\", row[\"tokenized_text\"])\n",
    "\n",
    "    # Lấy neighbors\n",
    "    neighbors = get_top_neighbors(\n",
    "        row,\n",
    "        train_df,\n",
    "        train_embeddings,\n",
    "        tokenizer,\n",
    "        model,\n",
    "        top_k=10,\n",
    "        w_numeric=0.7   # giữ giống lúc bạn train để dễ so\n",
    "    )\n",
    "\n",
    "    print(\"\\n  → Top 10 job tương đồng trong train:\")\n",
    "    cols_show = [\n",
    "        \"label\", \"Name\", \"Location\",\n",
    "        \"luuluong\", \"Dientichngoai\", \"Dientichtrong\",\n",
    "        \"Job\", \"tokenized_text\",\n",
    "        \"cos_sim\", \"num_closeness\", \"final_score\"\n",
    "    ]\n",
    "    print(neighbors[cols_show].to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9adbaebb",
   "metadata": {},
   "source": [
    "DỰ ĐOÁN TẬP HOLD OUT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b525a79",
   "metadata": {},
   "outputs": [],
   "source": [
    "holdout_preds = []\n",
    "holdout_scores_debug = []\n",
    "\n",
    "for _, row in holdout_df.iterrows():\n",
    "    pred, sc = predict_label_for_row(\n",
    "        row,\n",
    "        train_df,\n",
    "        train_embeddings,\n",
    "        tokenizer,\n",
    "        model,\n",
    "        top_k=10,          # số ứng viên lấy theo ST\n",
    "        w_numeric=0.3,     # bạn có thể tune\n",
    "        w_loc=0.7          # location quan trọng nhất\n",
    "    )\n",
    "    holdout_preds.append(pred)\n",
    "    holdout_scores_debug.append(sc)\n",
    "\n",
    "holdout_df[\"pred_label_hf\"] = holdout_preds\n",
    "holdout_df[\"score_details\"] = holdout_scores_debug\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0bcce32",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddeaa905",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86ec7ba5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2d3c66a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tainl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}