HoanMy_DetectTansuat/train.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7eaf4b18",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "187c8d47",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw task table; the workbook path is relative to the working directory.\n",
    "df = pd.read_excel(\"hoanmy_detect_task.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eaddf252",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Location</th>\n",
       "      <th>Job</th>\n",
       "      <th>label</th>\n",
       "      <th>Name</th>\n",
       "      <th>luuluong</th>\n",
       "      <th>Dientichngoai</th>\n",
       "      <th>Dientichtrong</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
       "      <td>1</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
       "      <td>2</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Location                                                Job  \\\n",
       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
       "\n",
       "  label  Name  luuluong  Dientichngoai  Dientichtrong  \n",
       "0     1  BIDV         3           1144          11200  \n",
       "1     1  BIDV         3           1144          11200  \n",
       "2     1  BIDV         3           1144          11200  \n",
       "3     1  BIDV         3           1144          11200  \n",
       "4     2  BIDV         3           1144          11200  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6fd4be0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pyvi import ViTokenizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "878456df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==== 1. Load stopword ====\n",
    "\n",
    "def load_stopwords(path):\n",
    "    \"\"\"Read a UTF-8 stopword file (one entry per line) into a set.\n",
    "\n",
    "    Each line is stripped of surrounding whitespace; lines that are empty\n",
    "    after stripping are skipped.\n",
    "    \"\"\"\n",
    "    entries = set()\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as handle:\n",
    "        for raw_line in handle:\n",
    "            entry = raw_line.strip()\n",
    "            if entry:\n",
    "                entries.add(entry)\n",
    "    return entries\n",
    "\n",
    "# Module-level stopword set consulted by segment_and_remove_stopwords.\n",
    "stopwords_path = \"vietnamese-stopwords-dash.txt\"   # change the file name if yours differs\n",
    "vietnamese_stopwords = load_stopwords(stopwords_path)\n",
    "\n",
    "\n",
    "# ==== 2. Tokenize + remove stopwords ====\n",
    "\n",
    "def segment_and_remove_stopwords(text):\n",
    "    \"\"\"Word-segment `text` with ViTokenizer and drop stopword tokens.\n",
    "\n",
    "    Non-string input yields an empty string. The segmented text is split on\n",
    "    whitespace, tokens found in the module-level `vietnamese_stopwords` set\n",
    "    are discarded, and the remaining tokens are joined with single spaces.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return \"\"\n",
    "    kept_tokens = (\n",
    "        token\n",
    "        for token in ViTokenizer.tokenize(text).split()\n",
    "        if token not in vietnamese_stopwords\n",
    "    )\n",
    "    return \" \".join(kept_tokens)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bf15213a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import unicodedata\n",
    "from pyvi import ViTokenizer\n",
    "\n",
    "def normalize_text_keep_words(s: str) -> str:\n",
    "    \"\"\"Lower-case and NFC-normalize `s`, replacing punctuation-like characters with spaces.\n",
    "\n",
    "    The input is coerced with str() first. Every character that is not 0-9,\n",
    "    a-z, inside the à-ỹ code-point range, or whitespace is replaced by a space;\n",
    "    whitespace runs are then collapsed to one space and the ends are trimmed.\n",
    "    \"\"\"\n",
    "    lowered = unicodedata.normalize('NFC', str(s)).lower()\n",
    "    words_only = re.sub(r\"[^0-9a-zà-ỹ\\s]\", \" \", lowered)\n",
    "    return re.sub(r\"\\s+\", \" \", words_only).strip()\n",
    "\n",
    "\n",
    "def preprocess_pipeline(df_raw: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Clean the raw task table and add normalized / segmented text columns.\n",
    "\n",
    "    - Location and Job are kept as separate fields (they are not concatenated).\n",
    "    - Rows without a label are dropped; labels are compared as stripped strings\n",
    "      and only \"1\", \"2\", \"3\", \"4\" are kept. A float-formatted label such as\n",
    "      \"2.0\" is accepted as \"2\".\n",
    "    - Missing Location becomes \"Unknown\"; missing Job becomes an empty string\n",
    "      instead of the literal text \"nan\".\n",
    "    - location_clean / job_clean come from normalize_text_keep_words.\n",
    "    - job_segmented is job_clean after word segmentation and stopword removal.\n",
    "\n",
    "    Returns a new DataFrame (the input is not modified) with a fresh 0..n-1\n",
    "    index, containing the text columns plus Name / luuluong / Dientichngoai /\n",
    "    Dientichtrong when those exist in the input.\n",
    "    \"\"\"\n",
    "    df = df_raw.dropna(subset=[\"label\"]).copy()\n",
    "    # A float-typed label column stringifies 1 as \"1.0\"; strip a trailing \".0\"\n",
    "    # so such labels are not silently filtered out by the isin() check below.\n",
    "    df[\"label\"] = (\n",
    "        df[\"label\"].astype(str).str.strip().str.replace(r\"\\.0+$\", \"\", regex=True)\n",
    "    )\n",
    "    df = df[df[\"label\"].isin([\"1\",\"2\",\"3\",\"4\"])].copy()\n",
    "\n",
    "    df[\"Location\"] = df[\"Location\"].fillna(\"Unknown\").astype(str)\n",
    "    # fillna first: astype(str) alone turns a missing Job into the text \"nan\",\n",
    "    # which would then be cleaned, segmented and embedded like a real task.\n",
    "    df[\"Job\"]      = df[\"Job\"].fillna(\"\").astype(str)\n",
    "\n",
    "    # Clean text\n",
    "    df[\"location_clean\"] = df[\"Location\"].apply(normalize_text_keep_words)\n",
    "    df[\"job_clean\"]      = df[\"Job\"].apply(normalize_text_keep_words)\n",
    "\n",
    "    # Word segment + remove stopword\n",
    "    df[\"job_segmented\"] = df[\"job_clean\"].apply(segment_and_remove_stopwords)\n",
    "\n",
    "    # Carry the building name and numeric feature columns along when present\n",
    "    keep_cols = [\n",
    "        \"Location\",\"Job\",\"label\",\n",
    "        \"location_clean\",\"job_clean\",\"job_segmented\"\n",
    "    ]\n",
    "    optional_cols = [\"Name\",\"luuluong\",\"Dientichngoai\",\"Dientichtrong\"]\n",
    "    keep_cols.extend(c for c in optional_cols if c in df.columns)\n",
    "\n",
    "    return df[keep_cols].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "90599a6c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Location</th>\n",
       "      <th>Job</th>\n",
       "      <th>label</th>\n",
       "      <th>location_clean</th>\n",
       "      <th>job_clean</th>\n",
       "      <th>job_segmented</th>\n",
       "      <th>Name</th>\n",
       "      <th>luuluong</th>\n",
       "      <th>Dientichngoai</th>\n",
       "      <th>Dientichtrong</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Quét lá rụng, thu gom rác lối đi lại, lối xe c...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>quét lá rụng thu gom rác lối đi lại lối xe chạ...</td>\n",
       "      <td>quét lá rụng thu_gom rác lối đi_lại lối xe chạ...</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...</td>\n",
       "      <td>nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Vệ sinh gạt tàn, thùng rác</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>vệ sinh gạt tàn thùng rác</td>\n",
       "      <td>vệ_sinh gạt_tàn thùng rác</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...</td>\n",
       "      <td>1</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...</td>\n",
       "      <td>lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Khu vực Ngoại cảnh</td>\n",
       "      <td>Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)</td>\n",
       "      <td>2</td>\n",
       "      <td>khu vực ngoại cảnh</td>\n",
       "      <td>lau tường đá và kính bên ngoài tòa nhà dưới 4m</td>\n",
       "      <td>lau tường đá kính tòa 4m</td>\n",
       "      <td>BIDV</td>\n",
       "      <td>3</td>\n",
       "      <td>1144</td>\n",
       "      <td>11200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Location                                                Job  \\\n",
       "0  Khu vực Ngoại cảnh  Quét lá rụng, thu gom rác lối đi lại, lối xe c...   \n",
       "1  Khu vực Ngoại cảnh  Nhặt rác bồn hoa cây cảnh, làm sạch gạch ốp xu...   \n",
       "2  Khu vực Ngoại cảnh                         Vệ sinh gạt tàn, thùng rác   \n",
       "3  Khu vực Ngoại cảnh  Lau các biển quảng cáo, biển chỉ dẫn (dưới 4m)...   \n",
       "4  Khu vực Ngoại cảnh   Lau tường đá và kính bên ngoài tòa nhà (dưới 4m)   \n",
       "\n",
       "  label      location_clean  \\\n",
       "0     1  khu vực ngoại cảnh   \n",
       "1     1  khu vực ngoại cảnh   \n",
       "2     1  khu vực ngoại cảnh   \n",
       "3     1  khu vực ngoại cảnh   \n",
       "4     2  khu vực ngoại cảnh   \n",
       "\n",
       "                                           job_clean  \\\n",
       "0  quét lá rụng thu gom rác lối đi lại lối xe chạ...   \n",
       "1  nhặt rác bồn hoa cây cảnh làm sạch gạch ốp xun...   \n",
       "2                          vệ sinh gạt tàn thùng rác   \n",
       "3  lau các biển quảng cáo biển chỉ dẫn dưới 4m ch...   \n",
       "4     lau tường đá và kính bên ngoài tòa nhà dưới 4m   \n",
       "\n",
       "                                       job_segmented  Name  luuluong  \\\n",
       "0  quét lá rụng thu_gom rác lối đi_lại lối xe chạ...  BIDV         3   \n",
       "1  nhặt rác bồn hoa cây_cảnh sạch gạch ốp xung_qu...  BIDV         3   \n",
       "2                          vệ_sinh gạt_tàn thùng rác  BIDV         3   \n",
       "3   lau biển quảng_cáo biển chỉ_dẫn 4m chân cột điện  BIDV         3   \n",
       "4                           lau tường đá kính tòa 4m  BIDV         3   \n",
       "\n",
       "   Dientichngoai  Dientichtrong  \n",
       "0           1144          11200  \n",
       "1           1144          11200  \n",
       "2           1144          11200  \n",
       "3           1144          11200  \n",
       "4           1144          11200  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the cleaned frame; preprocess_pipeline works on a copy, so `df` itself is left untouched.\n",
    "df_out = preprocess_pipeline(df)\n",
    "df_out.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1c472539",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: (514, 10)\n",
      "Test : (172, 10)\n",
      "Holdout: (83, 10)\n"
     ]
    }
   ],
   "source": [
    "# Restrict the data to the three label classes evaluated below.\n",
    "df_out = df_out.loc[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n",
    "\n",
    "\n",
    "# ============================================================\n",
    "# 3. SPLIT INTO 3 SETS: TRAIN / TEST / HOLDOUT (by building name)\n",
    "# ============================================================\n",
    "\n",
    "test_buildings    = [\"Keangnam\", \"CMC\"]   # main test set\n",
    "holdout_buildings = [\"VIGALCERA\"]             # separate building kept as holdout – adjust to the actual data\n",
    "\n",
    "# Every building that is not listed above goes to train.\n",
    "train_df   = df_out.loc[\n",
    "    ~df_out[\"Name\"].isin([*test_buildings, *holdout_buildings])\n",
    "].reset_index(drop=True)\n",
    "test_df    = df_out.loc[df_out[\"Name\"].isin(test_buildings)].reset_index(drop=True)\n",
    "holdout_df = df_out.loc[df_out[\"Name\"].isin(holdout_buildings)].reset_index(drop=True)\n",
    "\n",
    "print(\"Train:\", train_df.shape)\n",
    "print(\"Test :\", test_df.shape)\n",
    "print(\"Holdout:\", holdout_df.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bff38170",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this is the same label filter as the first line of the split cell above.\n",
    "# The execution counts show this cell ran before that one; on a top-to-bottom run it\n",
    "# re-applies a filter that has already been applied — consider removing the cell.\n",
    "df_out = df_out[df_out[\"label\"].astype(str).isin([\"1\", \"2\", \"3\"])].reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8fc94180",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import unicodedata\n",
    "from pyvi import ViTokenizer\n",
    "from transformers import AutoTokenizer, AutoModel\n",
    "import torch\n",
    "from sklearn.metrics import classification_report\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c41abe00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the GPU when one is available, otherwise fall back to the CPU.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "# Load the tokenizer and encoder of the same checkpoint; the model is moved to `device`.\n",
    "hf_tokenizer = AutoTokenizer.from_pretrained(\"dangvantuan/vietnamese-embedding\")\n",
    "hf_model = AutoModel.from_pretrained(\"dangvantuan/vietnamese-embedding\").to(device)\n",
    "\n",
    "def embed_text(text, tokenizer=hf_tokenizer, model=hf_model, device=device):\n",
    "    \"\"\"Return the attention-mask-weighted mean of the model's last hidden states.\n",
    "\n",
    "    `text` is tokenized with padding and truncation to at most 128 tokens and\n",
    "    run through `model` without gradient tracking. Hidden states are summed\n",
    "    over the positions where the attention mask is 1 and divided by the number\n",
    "    of such positions. The pooled tensor is returned on the CPU; for a single\n",
    "    input string it has shape (1, H).\n",
    "    \"\"\"\n",
    "    batch = tokenizer(\n",
    "        text,\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        max_length=128,\n",
    "        return_tensors=\"pt\"\n",
    "    ).to(device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        hidden_states = model(**batch).last_hidden_state\n",
    "\n",
    "    # Broadcast the mask over the hidden dimension so padded positions contribute 0.\n",
    "    weights = batch[\"attention_mask\"].unsqueeze(-1).expand(hidden_states.size()).float()\n",
    "    pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1)\n",
    "\n",
    "    return pooled.cpu()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "04ad0db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cosine_sim(a, b):\n",
    "    \"\"\"Cosine similarity between a query vector and every row of a matrix.\n",
    "\n",
    "    a: (1, H)\n",
    "    b: (N, H)\n",
    "    Returns a (1, N) tensor.\n",
    "\n",
    "    Rows are L2-normalized with torch.nn.functional.normalize, which clamps the\n",
    "    norm by a small epsilon, so an all-zero row yields similarity 0 instead of\n",
    "    NaN (a NaN score would distort the top-k selection done by callers).\n",
    "    \"\"\"\n",
    "    a_unit = torch.nn.functional.normalize(a, p=2, dim=-1)\n",
    "    b_unit = torch.nn.functional.normalize(b, p=2, dim=-1)\n",
    "    return torch.mm(a_unit, b_unit.t())  # (1, N)\n",
    "\n",
    "\n",
    "# Embed every train job_segmented text once, up front\n",
    "train_texts = train_df[\"job_segmented\"].tolist()\n",
    "\n",
    "# One embed_text call per text; squeeze(0) drops the leading batch axis before stacking.\n",
    "train_embeddings = torch.stack(\n",
    "    [embed_text(txt).squeeze(0) for txt in train_texts]\n",
    ")   # (N_train, H)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2f1a3319",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. LOCATION SIMILARITY + NUMERIC CLOSENESS FUNCTIONS\n",
    "# ============================================================\n",
    "\n",
    "def location_similarity(q_row, cand_row):\n",
    "    \"\"\"Jaccard overlap between the whitespace tokens of two `location_clean` values.\n",
    "\n",
    "    Returns (number of shared tokens) / (number of distinct tokens overall),\n",
    "    or 0.0 when either side has no tokens.\n",
    "    \"\"\"\n",
    "    query_words, cand_words = (\n",
    "        set(str(row[\"location_clean\"]).split()) for row in (q_row, cand_row)\n",
    "    )\n",
    "    if not (query_words and cand_words):\n",
    "        return 0.0\n",
    "    return len(query_words & cand_words) / len(query_words | cand_words)\n",
    "\n",
    "\n",
    "def numeric_closeness(q_row, cand_row, alpha_out=0.7, alpha_in=0.7):\n",
    "    \"\"\"Score how close two rows are on luuluong and the two area columns.\n",
    "\n",
    "    luuluong part: 1.0 when equal, 0.6 when the values differ by exactly 1,\n",
    "    0.3 otherwise, and 0.5 when either row lacks the field.\n",
    "    Area parts (Dientichngoai, Dientichtrong): exp(-alpha * |log1p(q) - log1p(c)|),\n",
    "    where a missing or NaN area is treated as 0.0.\n",
    "\n",
    "    Returns 0.5 * luuluong part + 0.25 * outer-area part + 0.25 * inner-area part.\n",
    "    \"\"\"\n",
    "    def area_of(row, col):\n",
    "        # Missing column or NaN counts as an area of 0.0.\n",
    "        if col in row and not pd.isna(row[col]):\n",
    "            return float(row[col])\n",
    "        return 0.0\n",
    "\n",
    "    def area_score(col, alpha):\n",
    "        # Distance on a log scale, mapped to (0, 1] with an exponential decay.\n",
    "        gap = abs(np.log1p(area_of(q_row, col)) - np.log1p(area_of(cand_row, col)))\n",
    "        return np.exp(-alpha * gap)\n",
    "\n",
    "    if \"luuluong\" in q_row and \"luuluong\" in cand_row:\n",
    "        q_flow, cand_flow = q_row[\"luuluong\"], cand_row[\"luuluong\"]\n",
    "        if q_flow == cand_flow:\n",
    "            flow_score = 1.0\n",
    "        elif abs(q_flow - cand_flow) == 1:\n",
    "            flow_score = 0.6\n",
    "        else:\n",
    "            flow_score = 0.3\n",
    "    else:\n",
    "        flow_score = 0.5\n",
    "\n",
    "    outer_score = area_score(\"Dientichngoai\", alpha_out)\n",
    "    inner_score = area_score(\"Dientichtrong\", alpha_in)\n",
    "\n",
    "    return 0.5 * flow_score + 0.25 * outer_score + 0.25 * inner_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9931151e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_label_for_row(q_row,\n",
    "                          train_df=train_df,\n",
    "                          train_embeddings=train_embeddings,\n",
    "                          tokenizer=hf_tokenizer,\n",
    "                          model=hf_model,\n",
    "                          top_k=10,\n",
    "                          w_numeric=0.4,\n",
    "                          w_loc=0.6):\n",
    "    \"\"\"Predict a label for one query row by voting among its nearest train rows.\n",
    "\n",
    "    Step 1: embed q_row[\"job_segmented\"] and pick the top_k train rows with the\n",
    "            highest cosine similarity. The similarity is used only to SELECT\n",
    "            candidates; it does not enter the score.\n",
    "    Step 2: each candidate votes for its own label with weight\n",
    "                final_score = w_loc * loc_sim + w_numeric * num_c\n",
    "            where loc_sim is location_similarity (in [0, 1]) and num_c is\n",
    "            numeric_closeness (luuluong + areas, at most 1.0).\n",
    "\n",
    "    Returns (best_label, label_scores), where label_scores maps each candidate\n",
    "    label to its summed weight. When no candidate is selected, the most\n",
    "    frequent train label is returned together with an empty dict.\n",
    "    \"\"\"\n",
    "    # 1) embed the query text\n",
    "    query_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1,H)\n",
    "\n",
    "    # 2) similarity against every train embedding, used for candidate selection only\n",
    "    similarities = cosine_sim(query_vec, train_embeddings)[0]  # (N,)\n",
    "\n",
    "    # 3) indices of the k most similar train rows (k capped at the train size)\n",
    "    k = min(top_k, len(train_df))\n",
    "    _, neighbor_idx = torch.topk(similarities, k=k)\n",
    "\n",
    "    label_scores = {}\n",
    "    for position in neighbor_idx.tolist():\n",
    "        neighbor = train_df.iloc[position]\n",
    "\n",
    "        loc_sim = location_similarity(q_row, neighbor)\n",
    "        num_c   = numeric_closeness(q_row, neighbor)\n",
    "        vote    = w_loc * loc_sim + w_numeric * num_c\n",
    "\n",
    "        label = str(neighbor[\"label\"])\n",
    "        label_scores[label] = label_scores.get(label, 0.0) + vote\n",
    "\n",
    "    if label_scores:\n",
    "        return max(label_scores, key=label_scores.get), label_scores\n",
    "\n",
    "    # No candidates: fall back to the majority label of the train set.\n",
    "    return str(train_df[\"label\"].value_counts().idxmax()), {}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "16b51693",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_on_df(df_in: pd.DataFrame,\n",
    "                  name: str,\n",
    "                  top_k=5,\n",
    "                  w_numeric=0.4,\n",
    "                  w_loc=0.6):\n",
    "    \"\"\"Predict a label for every row of `df_in` and print a classification report.\n",
    "\n",
    "    Works on a copy of `df_in`. Each row is passed to predict_label_for_row\n",
    "    together with the module-level train_df, train_embeddings, hf_tokenizer\n",
    "    and hf_model. `name` only appears in the printed report header.\n",
    "\n",
    "    Returns the copy with two added columns: `pred_label` (predicted label)\n",
    "    and `score_details` (per-label score dict for that row).\n",
    "    \"\"\"\n",
    "    df = df_in.copy()\n",
    "\n",
    "    # One (pred_label, score_details) pair per row, in row order.\n",
    "    results = [\n",
    "        predict_label_for_row(\n",
    "            row,\n",
    "            train_df=train_df,\n",
    "            train_embeddings=train_embeddings,\n",
    "            tokenizer=hf_tokenizer,\n",
    "            model=hf_model,\n",
    "            top_k=top_k,\n",
    "            w_numeric=w_numeric,\n",
    "            w_loc=w_loc\n",
    "        )\n",
    "        for _, row in df.iterrows()\n",
    "    ]\n",
    "\n",
    "    df[\"pred_label\"]    = [pred for pred, _ in results]\n",
    "    df[\"score_details\"] = [details for _, details in results]\n",
    "\n",
    "    print(f\"\\n========== KẾT QUẢ TRÊN {name} ==========\")\n",
    "    report = classification_report(\n",
    "        df[\"label\"].astype(str),\n",
    "        df[\"pred_label\"].astype(str),\n",
    "        digits=3\n",
    "    )\n",
    "    print(report)\n",
    "\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "dcc54593",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========== KẾT QUẢ TRÊN TEST (Keangnam + CMC) ==========\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1      0.788     0.848     0.817       105\n",
      "           2      0.545     0.511     0.527        47\n",
      "           3      0.667     0.500     0.571        20\n",
      "\n",
      "    accuracy                          0.715       172\n",
      "   macro avg      0.667     0.619     0.638       172\n",
      "weighted avg      0.707     0.715     0.709       172\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate on the test buildings with predict_on_df's defaults (top_k=5, w_numeric=0.4, w_loc=0.6).\n",
    "test_df_pred    = predict_on_df(test_df, \"TEST (Keangnam + CMC)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6f0cc888",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========== KẾT QUẢ TRÊN HOLDOUT ==========\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           1      0.684     0.907     0.780        43\n",
      "           2      0.429     0.261     0.324        23\n",
      "           3      0.667     0.471     0.552        17\n",
      "\n",
      "    accuracy                          0.639        83\n",
      "   macro avg      0.593     0.546     0.552        83\n",
      "weighted avg      0.610     0.639     0.607        83\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Evaluate on the holdout building with predict_on_df's defaults (top_k=5, w_numeric=0.4, w_loc=0.6).\n",
    "holdout_df_pred = predict_on_df(holdout_df, \"HOLDOUT\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "5f804672",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_neighbors_for_row(q_row,\n",
    "                              train_df,\n",
    "                              train_embeddings,\n",
    "                              tokenizer,\n",
    "                              model,\n",
    "                              top_k=5,\n",
    "                              w_numeric=0.4,\n",
    "                              w_loc=0.6):\n",
    "    \"\"\"Return the scored nearest train rows for one query row (for inspection).\n",
    "\n",
    "    Step 1: embed q_row[\"job_segmented\"] and take the top_k train rows with the\n",
    "            highest cosine similarity.\n",
    "    Step 2: annotate each of those rows with:\n",
    "        - cos_sim       : embedding similarity (for reference only)\n",
    "        - loc_sim       : similarity of location_clean\n",
    "        - num_closeness : closeness on luuluong + areas\n",
    "        - final_score   : w_loc * loc_sim + w_numeric * num_closeness\n",
    "\n",
    "    Returns a DataFrame of the neighbors sorted by final_score, descending.\n",
    "    \"\"\"\n",
    "    # 1) embed the query text\n",
    "    query_vec = embed_text(q_row[\"job_segmented\"], tokenizer, model)  # (1, H)\n",
    "\n",
    "    # 2) cosine similarity against every train embedding\n",
    "    similarities = cosine_sim(query_vec, train_embeddings)[0]  # (N,)\n",
    "\n",
    "    # 3) the k most similar train rows (k capped at the train size)\n",
    "    k = min(top_k, len(train_df))\n",
    "    best_sims, best_idx = torch.topk(similarities, k=k)\n",
    "\n",
    "    annotated = []\n",
    "    for cos_value, position in zip(best_sims.tolist(), best_idx.tolist()):\n",
    "        # Copy so the added fields never touch train_df itself.\n",
    "        neighbor = train_df.iloc[position].copy()\n",
    "\n",
    "        loc_sim = location_similarity(q_row, neighbor)\n",
    "        num_c   = numeric_closeness(q_row, neighbor)\n",
    "\n",
    "        neighbor[\"cos_sim\"]       = cos_value\n",
    "        neighbor[\"loc_sim\"]       = loc_sim\n",
    "        neighbor[\"num_closeness\"] = num_c\n",
    "        neighbor[\"final_score\"]   = w_loc * loc_sim + w_numeric * num_c\n",
    "        annotated.append(neighbor)\n",
    "\n",
    "    return pd.DataFrame(annotated).sort_values(\"final_score\", ascending=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "f224a244",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Số mẫu sai trên HOLDOUT: 30\n",
      "\n",
      "====================================================================================================\n",
      "❌ CASE SAI #3\n",
      "  True label    : 3\n",
      "  Pred label    : 1\n",
      "  Tòa (Name)    : VIGALCERA\n",
      "  Location      : Khu vực ngoại cảnh\n",
      "  Lưu lượng     : 3\n",
      "  DT ngoài      : 6000\n",
      "  DT trong      : 4520\n",
      "  Job raw       : - Làm vệ sinh khu vực đài phun nước\n",
      "  job_segmented : vệ_sinh khu_vực đài phun\n",
      "  location_clean: khu vực ngoại cảnh\n",
      "\n",
      "  → Top 10 hàng xóm trong train (sorted theo final_score):\n",
      "label         Name                           Location  luuluong  Dientichngoai  Dientichtrong                                                                                                                                Job                                                                         job_segmented  cos_sim  loc_sim  num_closeness  final_score\n",
      "    1         BIDV                 Khu vực Ngoại cảnh         3           1144          11200                                                                                                          Vệ sinh họng rác (nếu có)                                                                      vệ_sinh họng rác 0.694930 1.000000       0.710878     0.884351\n",
      "    1         BIDV                Khu vực Nhà vệ sinh         3           1144          11200                  Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước.               sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van 0.690451 0.285714       0.710878     0.455780\n",
      "    1 CenterPoint       KHU VỰC NHÀ VỆ SINH CÔNG CỘNG         3            400           5379   Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.222222       0.758955     0.436915\n",
      "    1    Hong Kong KHU VỰC NHÀ VỆ SINH (Ban quản lý )         2           9950          25630 · Làm sạch và khử mùi khu vệ sinh và các thiết bị bên trong nhà vệ sinh bao gồm: Bồn rửa, bệ xí, các vòi, van nước, hộp đựng giấy… sạch khử mùi khu vệ_sinh thiết_bị vệ_sinh bao_gồm bồn rửa bệ_xí vòi van hộp đựng giấy 0.671480 0.200000       0.549676     0.339870\n",
      "    1          HCO                        Nhà vệ sinh         3            800          13000                                                                                       Lau thùng đựng rác, thiết bị thoát nước thải                                           lau thùng đựng rác thiết_bị thoát nước_thải 0.706144 0.000000       0.680405     0.272162\n"
     ]
    }
   ],
   "source": [
    "# Select the mispredicted rows of the holdout set\n",
    "mis_holdout = holdout_df_pred[\n",
    "    holdout_df_pred[\"label\"].astype(str) != holdout_df_pred[\"pred_label\"].astype(str)\n",
    "].copy()\n",
    "\n",
    "print(\"Số mẫu sai trên HOLDOUT:\", len(mis_holdout))\n",
    "\n",
    "max_cases   = 5   # print at most this many cases to keep the output short; raise it to see more\n",
    "only_case   = 3   # show just this 1-based case number; set to None to show every case up to max_cases\n",
    "n_neighbors = 5   # number of candidates retrieved by embedding similarity\n",
    "\n",
    "for i, (_, row) in enumerate(mis_holdout.iterrows(), start=1):\n",
    "    if i > max_cases:\n",
    "        break\n",
    "    if only_case is not None and i != only_case:\n",
    "        continue\n",
    "    print(\"\\n\" + \"=\"*100)\n",
    "    print(f\"❌ CASE SAI #{i}\")\n",
    "    print(f\"  True label    : {row['label']}\")\n",
    "    print(f\"  Pred label    : {row['pred_label']}\")\n",
    "    print(f\"  Tòa (Name)    : {row.get('Name', 'N/A')}\")\n",
    "    print(f\"  Location      : {row['Location']}\")\n",
    "    print(f\"  Lưu lượng     : {row.get('luuluong', 'N/A')}\")\n",
    "    print(f\"  DT ngoài      : {row.get('Dientichngoai', 'N/A')}\")\n",
    "    print(f\"  DT trong      : {row.get('Dientichtrong', 'N/A')}\")\n",
    "    print(\"  Job raw       :\", row[\"Job\"])\n",
    "    print(\"  job_segmented :\", row[\"job_segmented\"])\n",
    "    print(\"  location_clean:\", row[\"location_clean\"])\n",
    "\n",
    "    # Fetch the scored neighbors for this case\n",
    "    neighbors = get_top_neighbors_for_row(\n",
    "        row,\n",
    "        train_df=train_df,\n",
    "        train_embeddings=train_embeddings,\n",
    "        tokenizer=hf_tokenizer,\n",
    "        model=hf_model,\n",
    "        top_k=n_neighbors,\n",
    "        w_numeric=0.4,\n",
    "        w_loc=0.6\n",
    "    )\n",
    "\n",
    "    # The header reports the number of neighbors actually requested\n",
    "    # (it previously said \"Top 10\" while only 5 were fetched).\n",
    "    print(f\"\\n  → Top {n_neighbors} hàng xóm trong train (sorted theo final_score):\")\n",
    "    cols_show = [\n",
    "        \"label\", \"Name\", \"Location\",\n",
    "        \"luuluong\", \"Dientichngoai\", \"Dientichtrong\",\n",
    "        \"Job\", \"job_segmented\",\n",
    "        \"cos_sim\", \"loc_sim\", \"num_closeness\", \"final_score\"\n",
    "    ]\n",
    "    # Only print columns that actually exist (in case some are missing)\n",
    "    cols_show = [c for c in cols_show if c in neighbors.columns]\n",
    "\n",
    "    print(neighbors[cols_show].head(n_neighbors).to_string(index=False))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tainl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}