predict_caLamviec_nhansu/final.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "76aa1b75",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Loaded: final_2.xlsx | sheet: final\n",
      "Shape (raw): (401, 42)\n",
      "\n",
      "=== TARGET SUMMARY (so_luong) ===\n",
      "count    401.000000\n",
      "mean       4.660848\n",
      "std        6.799242\n",
      "min        0.000000\n",
      "25%        1.000000\n",
      "50%        2.000000\n",
      "75%        5.000000\n",
      "max       64.000000\n",
      "Name: so_luong, dtype: float64\n",
      "Missing target: 0\n",
      "Negative target: 0\n",
      "Zero target: 3\n",
      "\n",
      "Duplicate full rows: 7\n",
      "Shape (dedup): (394, 42)\n",
      "\n",
      "Columns: 42\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ma_dia_diem</th>\n",
       "      <th>all_task_normal</th>\n",
       "      <th>all_task_dinhky</th>\n",
       "      <th>loai_ca</th>\n",
       "      <th>bat_dau</th>\n",
       "      <th>ket_thuc</th>\n",
       "      <th>tong_gio_lam</th>\n",
       "      <th>so_ca_cua_toa</th>\n",
       "      <th>so_luong</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>...</th>\n",
       "      <th>dien_tich_tham</th>\n",
       "      <th>doc_ham</th>\n",
       "      <th>vien_phan_quang</th>\n",
       "      <th>op_tuong</th>\n",
       "      <th>op_chan_tuong</th>\n",
       "      <th>ranh_thoat_nuoc</th>\n",
       "      <th>dien_tich_kinh</th>\n",
       "      <th>num_medical_tasks_total</th>\n",
       "      <th>num_indoor_room_tasks</th>\n",
       "      <th>is_tasks_text_missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>115-2</td>\n",
       "      <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Part time</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>10:30:00</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Hành chính</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>24</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Ca sáng</td>\n",
       "      <td>06:00:00</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  ma_dia_diem                                    all_task_normal  \\\n",
       "0       115-2  Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...   \n",
       "1       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "2       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "\n",
       "                                     all_task_dinhky     loai_ca   bat_dau  \\\n",
       "0                                                NaN   Part time  06:30:00   \n",
       "1  Lau bảng biển, bình cứu hỏa , cây nước hành la...  Hành chính  06:30:00   \n",
       "2  Lau bảng biển, bình cứu hỏa , cây nước hành la...     Ca sáng  06:00:00   \n",
       "\n",
       "   ket_thuc  tong_gio_lam  so_ca_cua_toa  so_luong  num_tasks  ...  \\\n",
       "0  10:30:00           4.0              1         1          7  ...   \n",
       "1  16:00:00           7.5              6        24        441  ...   \n",
       "2  14:00:00           8.0              6         3        441  ...   \n",
       "\n",
       "   dien_tich_tham  doc_ham  vien_phan_quang  op_tuong  op_chan_tuong  \\\n",
       "0             0.0        0                0       0.0            0.0   \n",
       "1             0.0       70                0    9176.0           89.0   \n",
       "2             0.0       70                0    9176.0           89.0   \n",
       "\n",
       "   ranh_thoat_nuoc  dien_tich_kinh  num_medical_tasks_total  \\\n",
       "0                0            20.0                        0   \n",
       "1               25           894.0                      112   \n",
       "2               25           894.0                      112   \n",
       "\n",
       "   num_indoor_room_tasks  is_tasks_text_missing  \n",
       "0                      1                      0  \n",
       "1                     39                      0  \n",
       "2                     39                      0  \n",
       "\n",
       "[3 rows x 42 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "DATA_PATH = \"final_2.xlsx\"\n",
    "SHEET = \"final\"\n",
    "TARGET = \"so_luong\"\n",
    "\n",
    "# 1) Load\n",
    "df = pd.read_excel(DATA_PATH, sheet_name=SHEET)\n",
    "print(\"✅ Loaded:\", DATA_PATH, \"| sheet:\", SHEET)\n",
    "print(\"Shape (raw):\", df.shape)\n",
    "\n",
    "# 2) Target sanity\n",
    "assert TARGET in df.columns, f\"❌ Missing target column: {TARGET}\"\n",
    "df[TARGET] = pd.to_numeric(df[TARGET], errors=\"coerce\")\n",
    "\n",
    "print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
    "print(df[TARGET].describe())\n",
    "print(\"Missing target:\", df[TARGET].isna().sum())\n",
    "print(\"Negative target:\", (df[TARGET] < 0).sum())\n",
    "print(\"Zero target:\", (df[TARGET] == 0).sum())\n",
    "\n",
    "# 3) Deduplicate full rows\n",
    "dup = df.duplicated().sum()\n",
    "print(\"\\nDuplicate full rows:\", dup)\n",
    "if dup > 0:\n",
    "    df = df.drop_duplicates().reset_index(drop=True)\n",
    "print(\"Shape (dedup):\", df.shape)\n",
    "\n",
    "# 4) Quick preview\n",
    "print(\"\\nColumns:\", len(df.columns))\n",
    "display(df.head(3))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "421b7556",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
       "       'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
       "       'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
       "       'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
       "       'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
       "       'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
       "       'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
       "       'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
       "       'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
       "       'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
       "       'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
       "       'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
       "       'is_tasks_text_missing'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "daf5a333",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ X shape: (394, 48) | y shape: (394,) | #buildings: 192\n",
      "Columns sample: ['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio']\n",
      "Any NaN left?: False\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tong_gio_lam</th>\n",
       "      <th>so_ca_cua_toa</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>num_cleaning_tasks</th>\n",
       "      <th>num_trash_collection_tasks</th>\n",
       "      <th>num_monitoring_tasks</th>\n",
       "      <th>num_deep_cleaning_tasks</th>\n",
       "      <th>num_support_tasks</th>\n",
       "      <th>num_other_tasks</th>\n",
       "      <th>num_wc_tasks</th>\n",
       "      <th>...</th>\n",
       "      <th>is_night_shift</th>\n",
       "      <th>is_morning_shift</th>\n",
       "      <th>is_afternoon_shift</th>\n",
       "      <th>is_evening_shift</th>\n",
       "      <th>loai_ca_Ca chiều</th>\n",
       "      <th>loai_ca_Ca gãy</th>\n",
       "      <th>loai_ca_Ca sáng</th>\n",
       "      <th>loai_ca_Ca đêm</th>\n",
       "      <th>loai_ca_Hành chính</th>\n",
       "      <th>loai_ca_Part time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>9.5</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>9.5</td>\n",
       "      <td>3</td>\n",
       "      <td>135</td>\n",
       "      <td>81</td>\n",
       "      <td>35</td>\n",
       "      <td>38</td>\n",
       "      <td>10</td>\n",
       "      <td>20</td>\n",
       "      <td>21</td>\n",
       "      <td>25</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>7.5</td>\n",
       "      <td>3</td>\n",
       "      <td>135</td>\n",
       "      <td>81</td>\n",
       "      <td>35</td>\n",
       "      <td>38</td>\n",
       "      <td>10</td>\n",
       "      <td>20</td>\n",
       "      <td>21</td>\n",
       "      <td>25</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7.5</td>\n",
       "      <td>3</td>\n",
       "      <td>135</td>\n",
       "      <td>81</td>\n",
       "      <td>35</td>\n",
       "      <td>38</td>\n",
       "      <td>10</td>\n",
       "      <td>20</td>\n",
       "      <td>21</td>\n",
       "      <td>25</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   tong_gio_lam  so_ca_cua_toa  num_tasks  num_cleaning_tasks  \\\n",
       "0           4.0              1          7                   7   \n",
       "1           7.5              6        441                 258   \n",
       "2           8.0              6        441                 258   \n",
       "3           8.0              6        441                 258   \n",
       "4           8.0              6        441                 258   \n",
       "5           7.5              6        441                 258   \n",
       "6           9.5              6        441                 258   \n",
       "7           9.5              3        135                  81   \n",
       "8           7.5              3        135                  81   \n",
       "9           7.5              3        135                  81   \n",
       "\n",
       "   num_trash_collection_tasks  num_monitoring_tasks  num_deep_cleaning_tasks  \\\n",
       "0                           1                     2                        1   \n",
       "1                         145                   134                       75   \n",
       "2                         145                   134                       75   \n",
       "3                         145                   134                       75   \n",
       "4                         145                   134                       75   \n",
       "5                         145                   134                       75   \n",
       "6                         145                   134                       75   \n",
       "7                          35                    38                       10   \n",
       "8                          35                    38                       10   \n",
       "9                          35                    38                       10   \n",
       "\n",
       "   num_support_tasks  num_other_tasks  num_wc_tasks  ...  is_night_shift  \\\n",
       "0                  0                0             4  ...               0   \n",
       "1                 57               45            89  ...               0   \n",
       "2                 57               45            89  ...               0   \n",
       "3                 57               45            89  ...               0   \n",
       "4                 57               45            89  ...               1   \n",
       "5                 57               45            89  ...               0   \n",
       "6                 57               45            89  ...               0   \n",
       "7                 20               21            25  ...               0   \n",
       "8                 20               21            25  ...               0   \n",
       "9                 20               21            25  ...               1   \n",
       "\n",
       "   is_morning_shift  is_afternoon_shift  is_evening_shift  loai_ca_Ca chiều  \\\n",
       "0                 1                   0                 0             False   \n",
       "1                 1                   0                 0             False   \n",
       "2                 1                   0                 0             False   \n",
       "3                 0                   1                 0              True   \n",
       "4                 0                   0                 0             False   \n",
       "5                 0                   1                 0             False   \n",
       "6                 1                   0                 0             False   \n",
       "7                 1                   0                 0             False   \n",
       "8                 1                   0                 0             False   \n",
       "9                 0                   0                 0             False   \n",
       "\n",
       "   loai_ca_Ca gãy  loai_ca_Ca sáng  loai_ca_Ca đêm  loai_ca_Hành chính  \\\n",
       "0           False            False           False               False   \n",
       "1           False            False           False                True   \n",
       "2           False             True           False               False   \n",
       "3           False            False           False               False   \n",
       "4           False            False            True               False   \n",
       "5            True            False           False               False   \n",
       "6           False            False           False                True   \n",
       "7           False            False           False                True   \n",
       "8            True            False           False               False   \n",
       "9           False            False            True               False   \n",
       "\n",
       "   loai_ca_Part time  \n",
       "0               True  \n",
       "1              False  \n",
       "2              False  \n",
       "3              False  \n",
       "4              False  \n",
       "5              False  \n",
       "6              False  \n",
       "7              False  \n",
       "8              False  \n",
       "9              False  \n",
       "\n",
       "[10 rows x 48 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "TARGET = \"so_luong\"\n",
    "GROUP_COL = \"ma_dia_diem\"\n",
    "\n",
    "# chỉ bỏ 4 cột bạn yêu cầu\n",
    "DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
    "\n",
    "# ---------- helpers ----------\n",
    "def parse_hour(t):\n",
    "    \"\"\"Convert 'HH:MM:SS' (or datetime-like) -> float hour in [0,24).\"\"\"\n",
    "    if pd.isna(t):\n",
    "        return np.nan\n",
    "    # pandas Timestamp / datetime\n",
    "    if hasattr(t, \"hour\"):\n",
    "        return float(t.hour) + float(getattr(t, \"minute\", 0))/60.0\n",
    "    s = str(t).strip()\n",
    "    # handle '06:30:00'\n",
    "    if \":\" in s:\n",
    "        parts = s.split(\":\")\n",
    "        try:\n",
    "            hh = int(float(parts[0]))\n",
    "            mm = int(float(parts[1])) if len(parts) > 1 else 0\n",
    "            return hh + mm/60.0\n",
    "        except:\n",
    "            return np.nan\n",
    "    # handle '6.5' etc\n",
    "    try:\n",
    "        return float(s)\n",
    "    except:\n",
    "        return np.nan\n",
    "\n",
    "# ---------- 1) y + groups ----------\n",
    "y = df[TARGET].astype(float).copy()\n",
    "groups = df[GROUP_COL].astype(str).copy()   # để split theo tòa ở cell sau\n",
    "\n",
    "# ---------- 2) time features (bat_dau/ket_thuc) ----------\n",
    "hour_start = df[\"bat_dau\"].apply(parse_hour)\n",
    "hour_end   = df[\"ket_thuc\"].apply(parse_hour)\n",
    "\n",
    "# cross-day (vd 22 -> 6)\n",
    "is_cross_day = ((hour_end < hour_start) & hour_start.notna() & hour_end.notna()).astype(int)\n",
    "\n",
    "# shift length in hours, safe modulo 24\n",
    "shift_length = ((hour_end - hour_start) % 24).fillna(0)\n",
    "\n",
    "# flags\n",
    "is_night_shift = ((hour_start >= 22) | (hour_start < 6)).fillna(False).astype(int)\n",
    "is_morning_shift = ((hour_start >= 6) & (hour_start < 12)).fillna(False).astype(int)\n",
    "is_afternoon_shift = ((hour_start >= 12) & (hour_start < 18)).fillna(False).astype(int)\n",
    "is_evening_shift = ((hour_start >= 18) & (hour_start < 22)).fillna(False).astype(int)\n",
    "\n",
    "# ---------- 3) Build X from df: drop forbidden + drop raw time cols + add time features ----------\n",
    "X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET], errors=\"ignore\").copy()\n",
    "\n",
    "# Drop raw time columns (không one-hot time)\n",
    "for c in [\"bat_dau\", \"ket_thuc\"]:\n",
    "    if c in X.columns:\n",
    "        X = X.drop(columns=[c])\n",
    "\n",
    "# Add engineered time features\n",
    "X[\"hour_start\"] = hour_start.fillna(0)\n",
    "X[\"hour_end\"] = hour_end.fillna(0)\n",
    "X[\"shift_length\"] = shift_length\n",
    "X[\"is_cross_day\"] = is_cross_day\n",
    "X[\"is_night_shift\"] = is_night_shift\n",
    "X[\"is_morning_shift\"] = is_morning_shift\n",
    "X[\"is_afternoon_shift\"] = is_afternoon_shift\n",
    "X[\"is_evening_shift\"] = is_evening_shift\n",
    "\n",
    "# ---------- 4) Fill NA numeric + one-hot ONLY loai_ca ----------\n",
    "# numeric fill\n",
    "num_cols = X.select_dtypes(include=[np.number]).columns\n",
    "X[num_cols] = X[num_cols].fillna(0)\n",
    "\n",
    "# One-hot loai_ca (nếu có)\n",
    "if \"loai_ca\" in X.columns:\n",
    "    X[\"loai_ca\"] = X[\"loai_ca\"].fillna(\"UNKNOWN\").astype(str)\n",
    "    X = pd.get_dummies(X, columns=[\"loai_ca\"], drop_first=True)\n",
    "\n",
    "print(\"✅ X shape:\", X.shape, \"| y shape:\", y.shape, \"| #buildings:\", groups.nunique())\n",
    "print(\"Columns sample:\", list(X.columns[:15]))\n",
    "print(\"Any NaN left?:\", X.isna().any().any())\n",
    "\n",
    "X.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0ad8de9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Final X shape: (394, 45) | y shape: (394,) | #buildings: 192\n",
      "Any NaN left in X?: False\n",
      "Time one-hot columns (should be 0): 0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tong_gio_lam</th>\n",
       "      <th>so_ca_cua_toa</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>num_cleaning_tasks</th>\n",
       "      <th>num_trash_collection_tasks</th>\n",
       "      <th>num_monitoring_tasks</th>\n",
       "      <th>num_deep_cleaning_tasks</th>\n",
       "      <th>num_support_tasks</th>\n",
       "      <th>num_other_tasks</th>\n",
       "      <th>num_wc_tasks</th>\n",
       "      <th>...</th>\n",
       "      <th>hour_end</th>\n",
       "      <th>shift_length</th>\n",
       "      <th>is_cross_day</th>\n",
       "      <th>is_night_shift</th>\n",
       "      <th>loai_ca_Ca chiều</th>\n",
       "      <th>loai_ca_Ca gãy</th>\n",
       "      <th>loai_ca_Ca sáng</th>\n",
       "      <th>loai_ca_Ca đêm</th>\n",
       "      <th>loai_ca_Hành chính</th>\n",
       "      <th>loai_ca_Part time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>10.5</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>16.0</td>\n",
       "      <td>9.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>441</td>\n",
       "      <td>258</td>\n",
       "      <td>145</td>\n",
       "      <td>134</td>\n",
       "      <td>75</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>89</td>\n",
       "      <td>...</td>\n",
       "      <td>14.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 45 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   tong_gio_lam  so_ca_cua_toa  num_tasks  num_cleaning_tasks  \\\n",
       "0           4.0              1          7                   7   \n",
       "1           7.5              6        441                 258   \n",
       "2           8.0              6        441                 258   \n",
       "\n",
       "   num_trash_collection_tasks  num_monitoring_tasks  num_deep_cleaning_tasks  \\\n",
       "0                           1                     2                        1   \n",
       "1                         145                   134                       75   \n",
       "2                         145                   134                       75   \n",
       "\n",
       "   num_support_tasks  num_other_tasks  num_wc_tasks  ...  hour_end  \\\n",
       "0                  0                0             4  ...      10.5   \n",
       "1                 57               45            89  ...      16.0   \n",
       "2                 57               45            89  ...      14.0   \n",
       "\n",
       "   shift_length  is_cross_day  is_night_shift  loai_ca_Ca chiều  \\\n",
       "0           4.0             0               0             False   \n",
       "1           9.5             0               0             False   \n",
       "2           8.0             0               0             False   \n",
       "\n",
       "   loai_ca_Ca gãy  loai_ca_Ca sáng  loai_ca_Ca đêm  loai_ca_Hành chính  \\\n",
       "0           False            False           False               False   \n",
       "1           False            False           False                True   \n",
       "2           False             True           False               False   \n",
       "\n",
       "   loai_ca_Part time  \n",
       "0               True  \n",
       "1              False  \n",
       "2              False  \n",
       "\n",
       "[3 rows x 45 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "TARGET = \"so_luong\"\n",
    "GROUP_COL = \"ma_dia_diem\"\n",
    "DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
    "\n",
    "def parse_hour(t):\n",
    "    \"\"\"Convert 'HH:MM:SS' or datetime-like -> float hour.\"\"\"\n",
    "    if pd.isna(t):\n",
    "        return np.nan\n",
    "    if hasattr(t, \"hour\"):\n",
    "        return float(t.hour) + float(getattr(t, \"minute\", 0))/60.0\n",
    "    s = str(t).strip()\n",
    "    if \":\" in s:\n",
    "        parts = s.split(\":\")\n",
    "        try:\n",
    "            hh = int(float(parts[0]))\n",
    "            mm = int(float(parts[1])) if len(parts) > 1 else 0\n",
    "            return hh + mm/60.0\n",
    "        except:\n",
    "            return np.nan\n",
    "    try:\n",
    "        return float(s)\n",
    "    except:\n",
    "        return np.nan\n",
    "\n",
    "# 1) y + groups (groups dùng ở cell 3 để split theo tòa)\n",
    "y = df[TARGET].astype(float).copy()\n",
    "groups = df[GROUP_COL].astype(str).copy()\n",
    "\n",
    "# 2) Time features từ bat_dau/ket_thuc (KHÔNG one-hot)\n",
    "hour_start = df[\"bat_dau\"].apply(parse_hour)\n",
    "hour_end   = df[\"ket_thuc\"].apply(parse_hour)\n",
    "\n",
    "is_cross_day = ((hour_end < hour_start) & hour_start.notna() & hour_end.notna()).astype(int)\n",
    "shift_length = ((hour_end - hour_start) % 24).fillna(0)\n",
    "is_night_shift = ((hour_start >= 22) | (hour_start < 6)).fillna(False).astype(int)\n",
    "\n",
    "# 3) Build X: drop 4 cols + drop target + drop raw time cols\n",
    "X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET], errors=\"ignore\").copy()\n",
    "X = X.drop(columns=[c for c in [\"bat_dau\", \"ket_thuc\"] if c in X.columns], errors=\"ignore\")\n",
    "\n",
    "# Add engineered time cols\n",
    "X[\"hour_start\"] = hour_start.fillna(0)\n",
    "X[\"hour_end\"] = hour_end.fillna(0)\n",
    "X[\"shift_length\"] = shift_length\n",
    "X[\"is_cross_day\"] = is_cross_day\n",
    "X[\"is_night_shift\"] = is_night_shift\n",
    "\n",
    "# 4) Fill NA numeric\n",
    "num_cols = X.select_dtypes(include=[np.number]).columns\n",
    "X[num_cols] = X[num_cols].fillna(0)\n",
    "\n",
    "# 5) One-hot ONLY loai_ca (nếu có)\n",
    "if \"loai_ca\" in X.columns:\n",
    "    X[\"loai_ca\"] = X[\"loai_ca\"].fillna(\"UNKNOWN\").astype(str)\n",
    "    X = pd.get_dummies(X, columns=[\"loai_ca\"], drop_first=True)\n",
    "\n",
    "print(\"✅ Final X shape:\", X.shape, \"| y shape:\", y.shape, \"| #buildings:\", groups.nunique())\n",
    "print(\"Any NaN left in X?:\", X.isna().any().any())\n",
    "\n",
    "# sanity: confirm no ket_thuc_* or bat_dau_* one-hot columns\n",
    "bad_cols = [c for c in X.columns if str(c).startswith(\"bat_dau_\") or str(c).startswith(\"ket_thuc_\")]\n",
    "print(\"Time one-hot columns (should be 0):\", len(bad_cols))\n",
    "\n",
    "X.head(3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2df3b609",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shapes:\n",
      "Train: (283, 45) (283,) | buildings: 134\n",
      "Val:   (58, 45) (58,) | buildings: 29\n",
      "Test:  (53, 45) (53,) | buildings: 29\n",
      "\n",
      "Leakage check (should all be 0):\n",
      "Train ∩ Val : 0\n",
      "Train ∩ Test: 0\n",
      "Val   ∩ Test: 0\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GroupShuffleSplit\n",
    "\n",
    "# ----- 1) Split: (train+val) vs test = 85% / 15% -----\n",
    "gss1 = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)\n",
    "trainval_idx, test_idx = next(gss1.split(X, y, groups=groups))\n",
    "\n",
    "X_trainval, X_test = X.iloc[trainval_idx].reset_index(drop=True), X.iloc[test_idx].reset_index(drop=True)\n",
    "y_trainval, y_test = y.iloc[trainval_idx].reset_index(drop=True), y.iloc[test_idx].reset_index(drop=True)\n",
    "groups_trainval = groups.iloc[trainval_idx].reset_index(drop=True)\n",
    "groups_test = groups.iloc[test_idx].reset_index(drop=True)\n",
    "\n",
    "# ----- 2) Split: train vs val inside trainval = 82.35% / 17.65%  -> overall 70% / 15% -----\n",
    "gss2 = GroupShuffleSplit(n_splits=1, test_size=0.1765, random_state=42)\n",
    "train_idx, val_idx = next(gss2.split(X_trainval, y_trainval, groups=groups_trainval))\n",
    "\n",
    "X_train, X_val = X_trainval.iloc[train_idx].reset_index(drop=True), X_trainval.iloc[val_idx].reset_index(drop=True)\n",
    "y_train, y_val = y_trainval.iloc[train_idx].reset_index(drop=True), y_trainval.iloc[val_idx].reset_index(drop=True)\n",
    "groups_train = groups_trainval.iloc[train_idx].reset_index(drop=True)\n",
    "groups_val = groups_trainval.iloc[val_idx].reset_index(drop=True)\n",
    "\n",
    "# ----- 3) Report -----\n",
    "print(\"Shapes:\")\n",
    "print(\"Train:\", X_train.shape, y_train.shape, \"| buildings:\", groups_train.nunique())\n",
    "print(\"Val:  \", X_val.shape,   y_val.shape,   \"| buildings:\", groups_val.nunique())\n",
    "print(\"Test: \", X_test.shape,  y_test.shape,  \"| buildings:\", groups_test.nunique())\n",
    "\n",
    "# Leakage check: ensure no building appears in multiple splits\n",
    "train_b = set(groups_train.unique())\n",
    "val_b   = set(groups_val.unique())\n",
    "test_b  = set(groups_test.unique())\n",
    "print(\"\\nLeakage check (should all be 0):\")\n",
    "print(\"Train ∩ Val :\", len(train_b & val_b))\n",
    "print(\"Train ∩ Test:\", len(train_b & test_b))\n",
    "print(\"Val   ∩ Test:\", len(val_b & test_b))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8cc64019",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Model: Ridge_log\n",
      "Train | MAE=2.995 | RMSE=5.484 | R2=0.484\n",
      "Val   | MAE=1.398 | RMSE=2.015 | R2=0.037\n",
      "Test  | MAE=2.744 | RMSE=4.416 | R2=0.155\n",
      "\n",
      "Model: GBR_log\n",
      "Train | MAE=1.201 | RMSE=2.466 | R2=0.896\n",
      "Val   | MAE=1.213 | RMSE=1.832 | R2=0.203\n",
      "Test  | MAE=2.979 | RMSE=4.810 | R2=-0.002\n",
      "\n",
      "✅ Saved: test_predictions_gbr.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>17.014321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>5.206821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.930329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.853829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.561030</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.867534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.601529</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.436945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.495240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.596085</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true     y_pred\n",
       "0    29.0  17.014321\n",
       "1     4.0   5.206821\n",
       "2     4.0   2.930329\n",
       "3     3.0   1.853829\n",
       "4     2.0   1.561030\n",
       "5     1.0   0.867534\n",
       "6     3.0   1.601529\n",
       "7     3.0   2.436945\n",
       "8    13.0   3.495240\n",
       "9     2.0   1.596085"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def eval_model(name, model, X_tr, y_tr, X_va, y_va, X_te, y_te, log_target=False):\n",
    "    # fit\n",
    "    model.fit(X_tr, np.log1p(y_tr) if log_target else y_tr)\n",
    "\n",
    "    # predict helper\n",
    "    def pred(m, X):\n",
    "        p = m.predict(X)\n",
    "        return np.expm1(p) if log_target else p\n",
    "\n",
    "    yhat_tr = pred(model, X_tr)\n",
    "    yhat_va = pred(model, X_va)\n",
    "    yhat_te = pred(model, X_te)\n",
    "\n",
    "    def metrics(y, yhat):\n",
    "        return {\n",
    "            \"MAE\": float(mean_absolute_error(y, yhat)),\n",
    "            \"RMSE\": rmse(y, yhat),\n",
    "            \"R2\": float(r2_score(y, yhat)),\n",
    "        }\n",
    "\n",
    "    res = {\n",
    "        \"model\": name,\n",
    "        \"Train\": metrics(y_tr, yhat_tr),\n",
    "        \"Val\":   metrics(y_va, yhat_va),\n",
    "        \"Test\":  metrics(y_te, yhat_te),\n",
    "    }\n",
    "    return res, yhat_te\n",
    "\n",
    "\n",
    "results = []\n",
    "\n",
    "# 1) Ridge (log target)\n",
    "ridge = Ridge(alpha=1.0, random_state=42)\n",
    "res_ridge, ridge_test_pred = eval_model(\n",
    "    \"Ridge_log\", ridge,\n",
    "    X_train, y_train,\n",
    "    X_val, y_val,\n",
    "    X_test, y_test,\n",
    "    log_target=True\n",
    ")\n",
    "results.append(res_ridge)\n",
    "\n",
    "# 2) Gradient Boosting (log target)\n",
    "gbr = GradientBoostingRegressor(\n",
    "    n_estimators=300,\n",
    "    learning_rate=0.05,\n",
    "    max_depth=3,\n",
    "    random_state=42\n",
    ")\n",
    "res_gbr, gbr_test_pred = eval_model(\n",
    "    \"GBR_log\", gbr,\n",
    "    X_train, y_train,\n",
    "    X_val, y_val,\n",
    "    X_test, y_test,\n",
    "    log_target=True\n",
    ")\n",
    "results.append(res_gbr)\n",
    "\n",
    "# Print results\n",
    "for r in results:\n",
    "    print(\"\\nModel:\", r[\"model\"])\n",
    "    for split in [\"Train\", \"Val\", \"Test\"]:\n",
    "        m = r[split]\n",
    "        print(f\"{split:5s} | MAE={m['MAE']:.3f} | RMSE={m['RMSE']:.3f} | R2={m['R2']:.3f}\")\n",
    "\n",
    "# Save TEST predictions (GBR)\n",
    "out = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred\": np.maximum(0, gbr_test_pred)  # clamp negative\n",
    "})\n",
    "out.to_excel(\"test_predictions_gbr.xlsx\", index=False)\n",
    "print(\"\\n✅ Saved: test_predictions_gbr.xlsx\")\n",
    "out.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e238b641",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== ML only (GBR_reg, log target) ===\n",
      "Train | MAE=2.145 | RMSE=4.539 | R2=0.647\n",
      "Val   | MAE=1.074 | RMSE=1.722 | R2=0.296\n",
      "Test  | MAE=2.669 | RMSE=4.565 | R2=0.097\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def report(name, y_true, y_pred):\n",
    "    print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
    "          f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "# GBR \"hiền\" hơn để giảm overfit\n",
    "gbr_reg = GradientBoostingRegressor(\n",
    "    n_estimators=600,\n",
    "    learning_rate=0.03,\n",
    "    max_depth=2,\n",
    "    min_samples_leaf=10,\n",
    "    min_samples_split=20,\n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "# Train với log1p(target)\n",
    "gbr_reg.fit(X_train, np.log1p(y_train))\n",
    "\n",
    "def predict_original_scale(model, X):\n",
    "    return np.maximum(0, np.expm1(model.predict(X)))\n",
    "\n",
    "pred_train = predict_original_scale(gbr_reg, X_train)\n",
    "pred_val   = predict_original_scale(gbr_reg, X_val)\n",
    "pred_test  = predict_original_scale(gbr_reg, X_test)\n",
    "\n",
    "print(\"=== ML only (GBR_reg, log target) ===\")\n",
    "report(\"Train\", y_train, pred_train)\n",
    "report(\"Val\",   y_val,   pred_val)\n",
    "report(\"Test\",  y_test,  pred_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "635bf672",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best K on VAL: {'K': 150, 'mae_val': 1.0616528195190562}\n",
      "\n",
      "=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\n",
      "Rule used: min_staff = ceil(num_tasks / 150)\n",
      "Train | MAE=2.259 | RMSE=4.601 | R2=0.637\n",
      "Val   | MAE=1.062 | RMSE=1.721 | R2=0.297\n",
      "Test  | MAE=2.602 | RMSE=4.527 | R2=0.112\n",
      "\n",
      "TEST big cases (y_true >= 10): 6\n",
      "ML    | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
      "Rule  | MAE=8.320 | RMSE=8.909 | R2=-0.899\n",
      "\n",
      "✅ Saved: test_predictions_ml_plus_rule.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred_ml</th>\n",
       "      <th>y_pred_final</th>\n",
       "      <th>num_tasks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>15.005548</td>\n",
       "      <td>15.005548</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>4.369563</td>\n",
       "      <td>4.369563</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.902404</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.591762</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>2.089570</td>\n",
       "      <td>2.089570</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.946909</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.167053</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.154357</td>\n",
       "      <td>2.154357</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.034284</td>\n",
       "      <td>3.034284</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.674387</td>\n",
       "      <td>1.674387</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true  y_pred_ml  y_pred_final  num_tasks\n",
       "0    29.0  15.005548     15.005548        593\n",
       "1     4.0   4.369563      4.369563        593\n",
       "2     4.0   2.902404      4.000000        593\n",
       "3     3.0   2.591762      4.000000        593\n",
       "4     2.0   2.089570      2.089570         33\n",
       "5     1.0   0.946909      1.000000         33\n",
       "6     3.0   1.167053      2.000000        182\n",
       "7     3.0   2.154357      2.154357        182\n",
       "8    13.0   3.034284      3.034284        182\n",
       "9     2.0   1.674387      1.674387        124"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def report(name, y_true, y_pred):\n",
    "    print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
    "          f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "# num_tasks phải tồn tại trong X_train/X_val/X_test\n",
    "assert \"num_tasks\" in X_train.columns, \"❌ Missing num_tasks in X_train\"\n",
    "\n",
    "nt_train = X_train[\"num_tasks\"].values\n",
    "nt_val   = X_val[\"num_tasks\"].values\n",
    "nt_test  = X_test[\"num_tasks\"].values\n",
    "\n",
    "# ---- tune K on VAL (KHÔNG đụng test khi tune) ----\n",
    "Ks = [30, 40, 50, 60, 70, 80, 100, 120, 150]\n",
    "best = None\n",
    "\n",
    "for K in Ks:\n",
    "    min_val = np.ceil(nt_val / K)\n",
    "    pred_val_rule = np.maximum(pred_val, min_val)\n",
    "    mae = mean_absolute_error(y_val, pred_val_rule)\n",
    "    if (best is None) or (mae < best[\"mae_val\"]):\n",
    "        best = {\"K\": K, \"mae_val\": mae}\n",
    "\n",
    "print(\"Best K on VAL:\", best)\n",
    "\n",
    "K_best = best[\"K\"]\n",
    "\n",
    "def apply_rule(pred, num_tasks, K):\n",
    "    min_staff = np.ceil(num_tasks / K)\n",
    "    return np.maximum(pred, min_staff)\n",
    "\n",
    "pred_train_rule = apply_rule(pred_train, nt_train, K_best)\n",
    "pred_val_rule   = apply_rule(pred_val,   nt_val,   K_best)\n",
    "pred_test_rule  = apply_rule(pred_test,  nt_test,  K_best)\n",
    "\n",
    "print(\"\\n=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\")\n",
    "print(\"Rule used: min_staff = ceil(num_tasks / %d)\" % K_best)\n",
    "\n",
    "report(\"Train\", y_train, pred_train_rule)\n",
    "report(\"Val\",   y_val,   pred_val_rule)\n",
    "report(\"Test\",  y_test,  pred_test_rule)\n",
    "\n",
    "# ---- big cases analysis on TEST ----\n",
    "mask_big = (y_test.values >= 10)\n",
    "print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
    "if mask_big.sum() > 0:\n",
    "    report(\"ML\",   y_test.values[mask_big], pred_test[mask_big])\n",
    "    report(\"Rule\", y_test.values[mask_big], pred_test_rule[mask_big])\n",
    "\n",
    "# save test predictions (after rule)\n",
    "out_rule = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred_ml\": pred_test,\n",
    "    \"y_pred_final\": pred_test_rule,\n",
    "    \"num_tasks\": nt_test\n",
    "})\n",
    "out_rule.to_excel(\"test_predictions_ml_plus_rule.xlsx\", index=False)\n",
    "print(\"\\n✅ Saved: test_predictions_ml_plus_rule.xlsx\")\n",
    "out_rule.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "70493591",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best (a,b) on VAL: {'a': 0, 'b': 0.1, 'mae_val': 1.0737151241156944}\n",
      "\n",
      "=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\n",
      "Rule used: ceil(0 + 0.1*sqrt(num_tasks))\n",
      "Train | MAE=2.185 | RMSE=4.551 | R2=0.645\n",
      "Val   | MAE=1.074 | RMSE=1.701 | R2=0.314\n",
      "Test  | MAE=2.599 | RMSE=4.556 | R2=0.101\n",
      "\n",
      "TEST big cases (y_true >= 10): 6\n",
      "ML    | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
      "Smooth | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
      "\n",
      "✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred_ml</th>\n",
       "      <th>y_pred_final</th>\n",
       "      <th>num_tasks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>15.005548</td>\n",
       "      <td>15.005548</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>4.369563</td>\n",
       "      <td>4.369563</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.902404</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.591762</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>2.089570</td>\n",
       "      <td>2.089570</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.946909</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.167053</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.154357</td>\n",
       "      <td>2.154357</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.034284</td>\n",
       "      <td>3.034284</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.674387</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true  y_pred_ml  y_pred_final  num_tasks\n",
       "0    29.0  15.005548     15.005548        593\n",
       "1     4.0   4.369563      4.369563        593\n",
       "2     4.0   2.902404      3.000000        593\n",
       "3     3.0   2.591762      3.000000        593\n",
       "4     2.0   2.089570      2.089570         33\n",
       "5     1.0   0.946909      1.000000         33\n",
       "6     3.0   1.167053      2.000000        182\n",
       "7     3.0   2.154357      2.154357        182\n",
       "8    13.0   3.034284      3.034284        182\n",
       "9     2.0   1.674387      2.000000        124"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def report(name, y_true, y_pred):\n",
    "    print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
    "          f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "nt_train = X_train[\"num_tasks\"].values\n",
    "nt_val   = X_val[\"num_tasks\"].values\n",
    "nt_test  = X_test[\"num_tasks\"].values\n",
    "\n",
    "# search grid (nhỏ thôi để bạn dễ đọc)\n",
    "a_list = [0, 1, 2]\n",
    "b_list = [0.10, 0.15, 0.20, 0.25, 0.30]\n",
    "\n",
    "best = None\n",
    "for a in a_list:\n",
    "    for b in b_list:\n",
    "        min_val = np.ceil(a + b * np.sqrt(nt_val))\n",
    "        pred_val_rule = np.maximum(pred_val, min_val)\n",
    "        mae = mean_absolute_error(y_val, pred_val_rule)\n",
    "        if (best is None) or (mae < best[\"mae_val\"]):\n",
    "            best = {\"a\": a, \"b\": b, \"mae_val\": mae}\n",
    "\n",
    "print(\"Best (a,b) on VAL:\", best)\n",
    "\n",
    "a_best, b_best = best[\"a\"], best[\"b\"]\n",
    "\n",
    "def apply_smooth_rule(pred, num_tasks, a, b):\n",
    "    min_staff = np.ceil(a + b * np.sqrt(num_tasks))\n",
    "    return np.maximum(pred, min_staff)\n",
    "\n",
    "pred_train_rule2 = apply_smooth_rule(pred_train, nt_train, a_best, b_best)\n",
    "pred_val_rule2   = apply_smooth_rule(pred_val,   nt_val,   a_best, b_best)\n",
    "pred_test_rule2  = apply_smooth_rule(pred_test,  nt_test,  a_best, b_best)\n",
    "\n",
    "print(\"\\n=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\")\n",
    "print(f\"Rule used: ceil({a_best} + {b_best}*sqrt(num_tasks))\")\n",
    "\n",
    "report(\"Train\", y_train, pred_train_rule2)\n",
    "report(\"Val\",   y_val,   pred_val_rule2)\n",
    "report(\"Test\",  y_test,  pred_test_rule2)\n",
    "\n",
    "mask_big = (y_test.values >= 10)\n",
    "print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
    "if mask_big.sum() > 0:\n",
    "    report(\"ML\",    y_test.values[mask_big], pred_test[mask_big])\n",
    "    report(\"Smooth\",y_test.values[mask_big], pred_test_rule2[mask_big])\n",
    "\n",
    "# save\n",
    "out_rule2 = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred_ml\": pred_test,\n",
    "    \"y_pred_final\": pred_test_rule2,\n",
    "    \"num_tasks\": nt_test\n",
    "})\n",
    "out_rule2.to_excel(\"test_predictions_ml_plus_smooth_rule.xlsx\", index=False)\n",
    "print(\"\\n✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\")\n",
    "out_rule2.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e71605c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Big-rate (positive class=1):\n",
      "Train: 0.14840989399293286 | count: 42 / 283\n",
      "Val:   0.017241379310344827 | count: 1 / 58\n",
      "Test:  0.11320754716981132 | count: 6 / 53\n",
      "\n",
      "================================================================================\n",
      "MODEL: LogReg_balanced\n",
      "\n",
      "[VAL] classification_report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.981     0.930     0.955        57\n",
      "           1      0.000     0.000     0.000         1\n",
      "\n",
      "    accuracy                          0.914        58\n",
      "   macro avg      0.491     0.465     0.477        58\n",
      "weighted avg      0.965     0.914     0.938        58\n",
      "\n",
      "VAL confusion_matrix:\n",
      " [[53  4]\n",
      " [ 1  0]]\n",
      "\n",
      "[TEST] classification_report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.881     0.787     0.831        47\n",
      "           1      0.091     0.167     0.118         6\n",
      "\n",
      "    accuracy                          0.717        53\n",
      "   macro avg      0.486     0.477     0.475        53\n",
      "weighted avg      0.792     0.717     0.751        53\n",
      "\n",
      "TEST confusion_matrix:\n",
      " [[37 10]\n",
      " [ 5  1]]\n",
      "\n",
      "================================================================================\n",
      "MODEL: GBC_depth2\n",
      "\n",
      "[VAL] classification_report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.983     1.000     0.991        57\n",
      "           1      0.000     0.000     0.000         1\n",
      "\n",
      "    accuracy                          0.983        58\n",
      "   macro avg      0.491     0.500     0.496        58\n",
      "weighted avg      0.966     0.983     0.974        58\n",
      "\n",
      "VAL confusion_matrix:\n",
      " [[57  0]\n",
      " [ 1  0]]\n",
      "\n",
      "[TEST] classification_report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.907     0.830     0.867        47\n",
      "           1      0.200     0.333     0.250         6\n",
      "\n",
      "    accuracy                          0.774        53\n",
      "   macro avg      0.553     0.582     0.558        53\n",
      "weighted avg      0.827     0.774     0.797        53\n",
      "\n",
      "TEST confusion_matrix:\n",
      " [[39  8]\n",
      " [ 4  2]]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    }
   ],
   "source": [
    "# Liệt kê các biến model đang tồn tại\n",
    "[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ce971deb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['gbr', 'res_gbr', 'gbr_test_pred', 'gbr_reg']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Liệt kê các biến model đang tồn tại\n",
    "[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "9ad22a15",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== TEST RESULTS ===\n",
      "ML only              | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
      "ML + Business Rules  | MAE=2.663 | RMSE=4.510 | R2=0.119\n",
      "✅ Saved: test_predictions_ml_plus_business_rules.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred_ml</th>\n",
       "      <th>min_staff_rule</th>\n",
       "      <th>y_pred_final</th>\n",
       "      <th>num_tasks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>4.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.902</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.592</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.947</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.167</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.674</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true  y_pred_ml  min_staff_rule  y_pred_final  num_tasks\n",
       "0    29.0     15.006             4.0        15.006        593\n",
       "1     4.0      4.370             4.0         4.370        593\n",
       "2     4.0      2.902             4.0         4.000        593\n",
       "3     3.0      2.592             6.0         6.000        593\n",
       "4     2.0      2.090             1.0         2.090         33\n",
       "5     1.0      0.947             1.0         1.000         33\n",
       "6     3.0      1.167             3.0         3.000        182\n",
       "7     3.0      2.154             2.0         2.154        182\n",
       "8    13.0      3.034             2.0         3.034        182\n",
       "9     2.0      1.674             3.0         3.000        124"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def eval_reg(y_true, y_pred, name):\n",
    "    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))\n",
    "    print(f\"{name:20s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse:.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "# -----------------------\n",
    "# 1) ML prediction (log target -> original)\n",
    "# -----------------------\n",
    "y_pred_ml = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
    "\n",
    "# -----------------------\n",
    "# 2) Business rules: minimum staffing\n",
    "# (các cột có thể có/không có nên check)\n",
    "# -----------------------\n",
    "min_staff = np.ceil(X_test[\"num_tasks\"] / 150)\n",
    "\n",
    "if \"num_wc_tasks\" in X_test.columns:\n",
    "    min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_wc_tasks\"] / 40))\n",
    "\n",
    "if \"num_outdoor_tasks\" in X_test.columns:\n",
    "    min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_outdoor_tasks\"] / 60))\n",
    "\n",
    "# Nếu bạn có cột shift (đêm / qua ngày) thì cộng thêm\n",
    "for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
    "    if col in X_test.columns:\n",
    "        min_staff = min_staff + X_test[col].astype(int)\n",
    "\n",
    "# -----------------------\n",
    "# 3) Final prediction\n",
    "# -----------------------\n",
    "y_pred_final = np.maximum(y_pred_ml, min_staff)\n",
    "\n",
    "# -----------------------\n",
    "# 4) Evaluate\n",
    "# -----------------------\n",
    "print(\"\\n=== TEST RESULTS ===\")\n",
    "eval_reg(y_test, y_pred_ml,   \"ML only\")\n",
    "eval_reg(y_test, y_pred_final,\"ML + Business Rules\")\n",
    "\n",
    "# -----------------------\n",
    "# 5) Save file\n",
    "# -----------------------\n",
    "out = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred_ml\": np.round(y_pred_ml, 3),\n",
    "    \"min_staff_rule\": min_staff.astype(float).values,\n",
    "    \"y_pred_final\": np.round(y_pred_final, 3),\n",
    "    \"num_tasks\": X_test[\"num_tasks\"].values\n",
    "})\n",
    "\n",
    "out.to_excel(\"test_predictions_ml_plus_business_rules.xlsx\", index=False)\n",
    "print(\"✅ Saved: test_predictions_ml_plus_business_rules.xlsx\")\n",
    "\n",
    "out.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "79387bd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Best rule on VAL: {'k_tasks': 150, 'k_wc': 30, 'k_out': 40, 'mae_val': 1.0984917361586377}\n",
      "\n",
      "=== TEST EVAL ===\n",
      "ML only                   | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
      "ML + tuned business rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
      "✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred_ml</th>\n",
       "      <th>min_staff_rule</th>\n",
       "      <th>y_pred_final</th>\n",
       "      <th>num_tasks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>4.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.902</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.592</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.000</td>\n",
       "      <td>593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.947</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.167</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.674</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true  y_pred_ml  min_staff_rule  y_pred_final  num_tasks\n",
       "0    29.0     15.006             4.0        15.006        593\n",
       "1     4.0      4.370             4.0         4.370        593\n",
       "2     4.0      2.902             4.0         4.000        593\n",
       "3     3.0      2.592             6.0         6.000        593\n",
       "4     2.0      2.090             1.0         2.090         33\n",
       "5     1.0      0.947             1.0         1.000         33\n",
       "6     3.0      1.167             3.0         3.000        182\n",
       "7     3.0      2.154             2.0         2.154        182\n",
       "8    13.0      3.034             2.0         3.034        182\n",
       "9     2.0      1.674             3.0         3.000        124"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def eval_reg(y_true, y_pred, name):\n",
    "    print(f\"{name:25s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "# ML preds (đã train gbr_reg)\n",
    "pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
    "pred_val   = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
    "pred_test  = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
    "\n",
    "def compute_min_staff(X, k_tasks, k_wc=None, k_out=None):\n",
    "    ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
    "\n",
    "    if (k_wc is not None) and (\"num_wc_tasks\" in X.columns):\n",
    "        ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
    "\n",
    "    if (k_out is not None) and (\"num_outdoor_tasks\" in X.columns):\n",
    "        ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
    "\n",
    "    for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
    "        if col in X.columns:\n",
    "            ms = ms + X[col].astype(int)\n",
    "    return ms\n",
    "\n",
    "# Grid search trên VAL\n",
    "k_tasks_list = [100, 120, 150, 180, 200]\n",
    "k_wc_list    = [25, 30, 40, 50, None]      # None = bỏ rule wc\n",
    "k_out_list   = [40, 60, 80, None]          # None = bỏ rule outdoor\n",
    "\n",
    "best = None\n",
    "\n",
    "for kt in k_tasks_list:\n",
    "    for kw in k_wc_list:\n",
    "        for ko in k_out_list:\n",
    "            ms_val = compute_min_staff(X_val, kt, kw, ko)\n",
    "            pred_val_final = np.maximum(pred_val, ms_val)\n",
    "            mae_val = mean_absolute_error(y_val, pred_val_final)\n",
    "\n",
    "            if (best is None) or (mae_val < best[\"mae_val\"]):\n",
    "                best = {\"k_tasks\": kt, \"k_wc\": kw, \"k_out\": ko, \"mae_val\": mae_val}\n",
    "\n",
    "print(\"✅ Best rule on VAL:\", best)\n",
    "\n",
    "# Apply best rule to TEST\n",
    "ms_test = compute_min_staff(X_test, best[\"k_tasks\"], best[\"k_wc\"], best[\"k_out\"])\n",
    "pred_test_final = np.maximum(pred_test, ms_test)\n",
    "\n",
    "print(\"\\n=== TEST EVAL ===\")\n",
    "eval_reg(y_test, pred_test, \"ML only\")\n",
    "eval_reg(y_test, pred_test_final, \"ML + tuned business rules\")\n",
    "\n",
    "# Save predictions\n",
    "out = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred_ml\": np.round(pred_test, 3),\n",
    "    \"min_staff_rule\": ms_test.astype(float).values,\n",
    "    \"y_pred_final\": np.round(pred_test_final, 3),\n",
    "    \"num_tasks\": X_test[\"num_tasks\"].values\n",
    "})\n",
    "out.to_excel(\"test_predictions_ml_plus_tuned_rules.xlsx\", index=False)\n",
    "print(\"✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\")\n",
    "out.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "6e9841d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class distribution:\n",
      "Train:\n",
      "so_luong\n",
      "0    207\n",
      "1     39\n",
      "2     37\n",
      "Name: count, dtype: int64\n",
      "Val:\n",
      "so_luong\n",
      "0    53\n",
      "1     4\n",
      "2     1\n",
      "Name: count, dtype: int64\n",
      "Test:\n",
      "so_luong\n",
      "0    43\n",
      "1     5\n",
      "2     5\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# ===== CELL 9: Create staff size class =====\n",
    "\n",
    "def staff_class(y):\n",
    "    if y <= 5:\n",
    "        return 0\n",
    "    elif y <= 10:\n",
    "        return 1\n",
    "    else:\n",
    "        return 2\n",
    "\n",
    "y_train_cls = y_train.apply(staff_class)\n",
    "y_val_cls   = y_val.apply(staff_class)\n",
    "y_test_cls  = y_test.apply(staff_class)\n",
    "\n",
    "print(\"Class distribution:\")\n",
    "for name, y in [(\"Train\", y_train_cls), (\"Val\", y_val_cls), (\"Test\", y_test_cls)]:\n",
    "    print(f\"{name}:\")\n",
    "    print(y.value_counts().sort_index())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "daf4acc7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
      "  warnings.warn(\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== VAL ===\n",
      "\n",
      "================================================================================\n",
      "MODEL: LR_balanced (VAL)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.956     0.811     0.878        53\n",
      "           1      0.111     0.250     0.154         4\n",
      "           2      0.000     0.000     0.000         1\n",
      "\n",
      "    accuracy                          0.759        58\n",
      "   macro avg      0.356     0.354     0.344        58\n",
      "weighted avg      0.881     0.759     0.813        58\n",
      "\n",
      "Confusion matrix:\n",
      " [[43  7  3]\n",
      " [ 2  1  1]\n",
      " [ 0  1  0]]\n",
      "\n",
      "================================================================================\n",
      "MODEL: HGB (VAL)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.946     1.000     0.972        53\n",
      "           1      0.500     0.250     0.333         4\n",
      "           2      0.000     0.000     0.000         1\n",
      "\n",
      "    accuracy                          0.931        58\n",
      "   macro avg      0.482     0.417     0.435        58\n",
      "weighted avg      0.899     0.931     0.912        58\n",
      "\n",
      "Confusion matrix:\n",
      " [[53  0  0]\n",
      " [ 3  1  0]\n",
      " [ 0  1  0]]\n",
      "\n",
      "=== TEST ===\n",
      "\n",
      "================================================================================\n",
      "MODEL: LR_balanced (TEST)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.914     0.744     0.821        43\n",
      "           1      0.231     0.600     0.333         5\n",
      "           2      0.000     0.000     0.000         5\n",
      "\n",
      "    accuracy                          0.660        53\n",
      "   macro avg      0.382     0.448     0.385        53\n",
      "weighted avg      0.764     0.660     0.697        53\n",
      "\n",
      "Confusion matrix:\n",
      " [[32  7  4]\n",
      " [ 1  3  1]\n",
      " [ 2  3  0]]\n",
      "\n",
      "================================================================================\n",
      "MODEL: HGB (TEST)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.925     0.860     0.892        43\n",
      "           1      0.200     0.200     0.200         5\n",
      "           2      0.125     0.200     0.154         5\n",
      "\n",
      "    accuracy                          0.736        53\n",
      "   macro avg      0.417     0.420     0.415        53\n",
      "weighted avg      0.781     0.736     0.757        53\n",
      "\n",
      "Confusion matrix:\n",
      " [[37  2  4]\n",
      " [ 1  1  3]\n",
      " [ 2  2  1]]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import HistGradientBoostingClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "\n",
    "# ---- 1) Models ----\n",
    "clf_lr = LogisticRegression(\n",
    "    max_iter=5000,\n",
    "    class_weight=\"balanced\",\n",
    "    solver=\"lbfgs\",\n",
    "    multi_class=\"auto\",\n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "clf_hgb = HistGradientBoostingClassifier(\n",
    "    max_depth=3,\n",
    "    learning_rate=0.05,\n",
    "    max_iter=500,\n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "# ---- 2) Fit ----\n",
    "clf_lr.fit(X_train, y_train_cls)\n",
    "clf_hgb.fit(X_train, y_train_cls)\n",
    "\n",
    "# ---- 3) Predict (proba) ----\n",
    "proba_val_lr  = clf_lr.predict_proba(X_val)\n",
    "proba_test_lr = clf_lr.predict_proba(X_test)\n",
    "\n",
    "proba_val_hgb  = clf_hgb.predict_proba(X_val)\n",
    "proba_test_hgb = clf_hgb.predict_proba(X_test)\n",
    "\n",
    "pred_val_lr  = np.argmax(proba_val_lr, axis=1)\n",
    "pred_test_lr = np.argmax(proba_test_lr, axis=1)\n",
    "\n",
    "pred_val_hgb  = np.argmax(proba_val_hgb, axis=1)\n",
    "pred_test_hgb = np.argmax(proba_test_hgb, axis=1)\n",
    "\n",
    "def show_clf(name, y_true, y_pred):\n",
    "    print(\"\\n\" + \"=\"*80)\n",
    "    print(\"MODEL:\", name)\n",
    "    print(classification_report(y_true, y_pred, digits=3))\n",
    "    print(\"Confusion matrix:\\n\", confusion_matrix(y_true, y_pred))\n",
    "\n",
    "print(\"\\n=== VAL ===\")\n",
    "show_clf(\"LR_balanced (VAL)\",  y_val_cls, pred_val_lr)\n",
    "show_clf(\"HGB (VAL)\",          y_val_cls, pred_val_hgb)\n",
    "\n",
    "print(\"\\n=== TEST ===\")\n",
    "show_clf(\"LR_balanced (TEST)\", y_test_cls, pred_test_lr)\n",
    "show_clf(\"HGB (TEST)\",         y_test_cls, pred_test_hgb)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "38c8d00e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Best (min_hc, min_2424) on VAL: {'min_hc': 0, 'min_2424': 0, 'mae_val': 1.0984917361586377}\n",
      "\n",
      "=== TEST EVAL ===\n",
      "ML only (GBR log)            | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
      "ML + tuned base rules        | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
      "ML + base + loai_ca rules    | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
      "✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred_ml</th>\n",
       "      <th>min_staff_base</th>\n",
       "      <th>min_staff_ca</th>\n",
       "      <th>min_staff_final</th>\n",
       "      <th>y_pred_final</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>loai_ca</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>15.006</td>\n",
       "      <td>593</td>\n",
       "      <td>Hành chính</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.370</td>\n",
       "      <td>593</td>\n",
       "      <td>Ca sáng</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.902</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.000</td>\n",
       "      <td>593</td>\n",
       "      <td>Ca chiều</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.592</td>\n",
       "      <td>6.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.000</td>\n",
       "      <td>593</td>\n",
       "      <td>Ca đêm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.090</td>\n",
       "      <td>33</td>\n",
       "      <td>Hành chính</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.947</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000</td>\n",
       "      <td>33</td>\n",
       "      <td>Part time</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.167</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>182</td>\n",
       "      <td>Ca sáng</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.154</td>\n",
       "      <td>182</td>\n",
       "      <td>Ca chiều</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>13.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.034</td>\n",
       "      <td>182</td>\n",
       "      <td>Hành chính</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.674</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000</td>\n",
       "      <td>124</td>\n",
       "      <td>Ca sáng</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y_true  y_pred_ml  min_staff_base  min_staff_ca  min_staff_final  \\\n",
       "0    29.0     15.006             4.0           0.0              4.0   \n",
       "1     4.0      4.370             4.0           0.0              4.0   \n",
       "2     4.0      2.902             4.0           0.0              4.0   \n",
       "3     3.0      2.592             6.0           0.0              6.0   \n",
       "4     2.0      2.090             1.0           0.0              1.0   \n",
       "5     1.0      0.947             1.0           0.0              1.0   \n",
       "6     3.0      1.167             3.0           0.0              3.0   \n",
       "7     3.0      2.154             2.0           0.0              2.0   \n",
       "8    13.0      3.034             2.0           0.0              2.0   \n",
       "9     2.0      1.674             3.0           0.0              3.0   \n",
       "\n",
       "   y_pred_final  num_tasks     loai_ca  \n",
       "0        15.006        593  Hành chính  \n",
       "1         4.370        593     Ca sáng  \n",
       "2         4.000        593    Ca chiều  \n",
       "3         6.000        593      Ca đêm  \n",
       "4         2.090         33  Hành chính  \n",
       "5         1.000         33   Part time  \n",
       "6         3.000        182     Ca sáng  \n",
       "7         2.154        182    Ca chiều  \n",
       "8         3.034        182  Hành chính  \n",
       "9         3.000        124     Ca sáng  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "def rmse(y, yhat):\n",
    "    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
    "\n",
    "def eval_reg(y_true, y_pred, name):\n",
    "    print(f\"{name:28s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
    "\n",
    "# ========= 1) ML predictions (GBR log-target) =========\n",
    "pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
    "pred_val   = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
    "pred_test  = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
    "\n",
    "# ========= 2) Base min-staff rule (tuned earlier) =========\n",
    "# dùng lại best bạn đã tìm: k_tasks=150, k_wc=30, k_out=40\n",
    "def compute_min_staff_base(X, k_tasks=150, k_wc=30, k_out=40):\n",
    "    ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
    "\n",
    "    if \"num_wc_tasks\" in X.columns:\n",
    "        ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
    "\n",
    "    if \"num_outdoor_tasks\" in X.columns:\n",
    "        ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
    "\n",
    "    # bonus theo ca đêm / cross-day nếu có\n",
    "    for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
    "        if col in X.columns:\n",
    "            ms = ms + X[col].astype(int)\n",
    "\n",
    "    return ms\n",
    "\n",
    "# ========= 3) Tune MIN staff theo loai_ca trên VAL =========\n",
    "# Ta tune 2 tham số: min_HC (Hành chính), min_2424 (24/24)\n",
    "# Nếu loai_ca đang one-hot, ta sẽ suy ra label từ cột gốc nếu có\n",
    "def get_loai_ca_series(X):\n",
    "    # ưu tiên nếu còn cột gốc 'loai_ca' (string)\n",
    "    if \"loai_ca\" in X.columns and X[\"loai_ca\"].dtype == \"object\":\n",
    "        return X[\"loai_ca\"].astype(str)\n",
    "    # nếu đã one-hot: tìm các cột bắt đầu bằng \"loai_ca_\"\n",
    "    onehot_cols = [c for c in X.columns if c.startswith(\"loai_ca_\")]\n",
    "    if onehot_cols:\n",
    "        # lấy tên category có value True/1\n",
    "        def decode_row(row):\n",
    "            for c in onehot_cols:\n",
    "                if row[c] == 1 or row[c] is True:\n",
    "                    return c.replace(\"loai_ca_\", \"\")\n",
    "            return \"UNKNOWN\"\n",
    "        return X[onehot_cols].apply(decode_row, axis=1)\n",
    "    return pd.Series([\"UNKNOWN\"] * len(X), index=X.index)\n",
    "\n",
    "loai_ca_val = get_loai_ca_series(X_val)\n",
    "loai_ca_test = get_loai_ca_series(X_test)\n",
    "loai_ca_train = get_loai_ca_series(X_train)\n",
    "\n",
    "def apply_ca_rule(loai_ca_series, min_hc, min_2424):\n",
    "    # map tên ca -> min staff\n",
    "    # bạn có thể thêm biến thể viết khác nếu dữ liệu có\n",
    "    lc = loai_ca_series.str.lower()\n",
    "    min_by_ca = np.zeros(len(lc), dtype=float)\n",
    "\n",
    "    # Hành chính\n",
    "    mask_hc = lc.str.contains(\"hành chính\") | lc.str.contains(\"hanh chinh\")\n",
    "    min_by_ca[mask_hc.values] = min_hc\n",
    "\n",
    "    # 24/24 hoặc 24-24\n",
    "    mask_2424 = lc.str.contains(\"24/24\") | lc.str.contains(\"24-24\") | lc.str.contains(\"24 24\")\n",
    "    min_by_ca[mask_2424.values] = np.maximum(min_by_ca[mask_2424.values], min_2424)\n",
    "\n",
    "    return min_by_ca\n",
    "\n",
    "best = None\n",
    "min_hc_list = [0, 4, 6, 8, 10]\n",
    "min_2424_list = [0, 6, 8, 10, 12]\n",
    "\n",
    "base_val = compute_min_staff_base(X_val, 150, 30, 40)\n",
    "\n",
    "for mhc in min_hc_list:\n",
    "    for m24 in min_2424_list:\n",
    "        min_ca = apply_ca_rule(loai_ca_val, mhc, m24)\n",
    "        min_staff = np.maximum(base_val, min_ca)\n",
    "        pred_final = np.maximum(pred_val, min_staff)\n",
    "\n",
    "        mae_val = mean_absolute_error(y_val, pred_final)\n",
    "        if (best is None) or (mae_val < best[\"mae_val\"]):\n",
    "            best = {\"min_hc\": mhc, \"min_2424\": m24, \"mae_val\": mae_val}\n",
    "\n",
    "print(\"✅ Best (min_hc, min_2424) on VAL:\", best)\n",
    "\n",
    "# ========= 4) Apply best rule to TEST =========\n",
    "base_test = compute_min_staff_base(X_test, 150, 30, 40)\n",
    "min_ca_test = apply_ca_rule(loai_ca_test, best[\"min_hc\"], best[\"min_2424\"])\n",
    "min_staff_test = np.maximum(base_test, min_ca_test)\n",
    "\n",
    "pred_test_final = np.maximum(pred_test, min_staff_test)\n",
    "\n",
    "print(\"\\n=== TEST EVAL ===\")\n",
    "eval_reg(y_test, pred_test, \"ML only (GBR log)\")\n",
    "eval_reg(y_test, np.maximum(pred_test, base_test), \"ML + tuned base rules\")\n",
    "eval_reg(y_test, pred_test_final, \"ML + base + loai_ca rules\")\n",
    "\n",
    "# ========= 5) Save excel =========\n",
    "out = pd.DataFrame({\n",
    "    \"y_true\": y_test.values,\n",
    "    \"y_pred_ml\": np.round(pred_test, 3),\n",
    "    \"min_staff_base\": base_test.astype(float),\n",
    "    \"min_staff_ca\": min_ca_test.astype(float),\n",
    "    \"min_staff_final\": min_staff_test.astype(float),\n",
    "    \"y_pred_final\": np.round(pred_test_final, 3),\n",
    "    \"num_tasks\": X_test[\"num_tasks\"].values if \"num_tasks\" in X_test.columns else np.nan,\n",
    "    \"loai_ca\": loai_ca_test.values\n",
    "})\n",
    "out.to_excel(\"test_predictions_ml_plus_rules_plus_ca.xlsx\", index=False)\n",
    "print(\"✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\")\n",
    "\n",
    "out.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "6dc15922",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded: final_2.xlsx | sheet: final\n",
      "Shape (raw): (401, 42)\n",
      "Shape (dedup): (394, 42)\n",
      "\n",
      "=== Target summary (so_luong) ===\n",
      "count    394.000000\n",
      "mean       4.710660\n",
      "std        6.848602\n",
      "min        0.000000\n",
      "25%        1.000000\n",
      "50%        2.000000\n",
      "75%        5.000000\n",
      "max       64.000000\n",
      "Name: so_luong, dtype: float64\n",
      "\n",
      "=== staff_band distribution ===\n",
      "staff_band\n",
      "0    216\n",
      "1     87\n",
      "2     48\n",
      "3     27\n",
      "4     16\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Sample rows:\n",
      "  ma_dia_diem     loai_ca  tong_gio_lam  num_tasks  so_luong  staff_band\n",
      "0       115-2   Part time           4.0          7         1           0\n",
      "1       101-1  Hành chính           7.5        441        24           4\n",
      "2       101-1     Ca sáng           8.0        441         3           1\n",
      "3       101-1    Ca chiều           8.0        441         5           1\n",
      "4       101-1      Ca đêm           8.0        441         1           0\n",
      "5       101-1      Ca gãy           7.5        441         1           0\n",
      "6       101-1  Hành chính           9.5        441        22           4\n",
      "7       101-2  Hành chính           9.5        135         8           2\n",
      "8       101-2      Ca gãy           7.5        135         1           0\n",
      "9       101-2      Ca đêm           7.5        135         1           0\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "DATA_PATH = \"final_2.xlsx\"\n",
    "SHEET_NAME = \"final\"\n",
    "\n",
    "df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
    "print(\"Loaded:\", DATA_PATH, \"| sheet:\", SHEET_NAME)\n",
    "print(\"Shape (raw):\", df.shape)\n",
    "\n",
    "# drop duplicates\n",
    "df = df.drop_duplicates()\n",
    "print(\"Shape (dedup):\", df.shape)\n",
    "\n",
    "# target\n",
    "assert \"so_luong\" in df.columns, \"Missing target so_luong\"\n",
    "\n",
    "# ---- Define ordinal bins (bậc nhân sự) ----\n",
    "# 0: 0-2  (rất nhỏ)\n",
    "# 1: 3-5  (nhỏ)\n",
    "# 2: 6-10 (trung bình)\n",
    "# 3: 11-20 (lớn)\n",
    "# 4: >20 (rất lớn)\n",
    "bins = [-0.1, 2, 5, 10, 20, 10**9]\n",
    "labels = [0, 1, 2, 3, 4]\n",
    "\n",
    "df[\"staff_band\"] = pd.cut(df[\"so_luong\"], bins=bins, labels=labels).astype(int)\n",
    "\n",
    "print(\"\\n=== Target summary (so_luong) ===\")\n",
    "print(df[\"so_luong\"].describe())\n",
    "\n",
    "print(\"\\n=== staff_band distribution ===\")\n",
    "print(df[\"staff_band\"].value_counts().sort_index())\n",
    "\n",
    "print(\"\\nSample rows:\")\n",
    "print(df[[\"ma_dia_diem\",\"loai_ca\",\"tong_gio_lam\",\"num_tasks\",\"so_luong\",\"staff_band\"]].head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "666be810",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== SPLIT SUMMARY (by ma_dia_diem) ===\n",
      "Buildings: 192\n",
      "Train buildings: 138 | rows: 282\n",
      "Val   buildings: 29 | rows: 56\n",
      "Test  buildings: 25 | rows: 56\n",
      "\n",
      "Leakage check (should be 0):\n",
      "Train ∩ Val : 0\n",
      "Train ∩ Test: 0\n",
      "Val   ∩ Test: 0\n",
      "\n",
      "=== staff_band distribution ===\n",
      "\n",
      "Train:\n",
      "staff_band\n",
      "0    155\n",
      "1     55\n",
      "2     38\n",
      "3     22\n",
      "4     12\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Val:\n",
      "staff_band\n",
      "0    28\n",
      "1    20\n",
      "2     5\n",
      "3     3\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Test:\n",
      "staff_band\n",
      "0    33\n",
      "1    12\n",
      "2     5\n",
      "3     2\n",
      "4     4\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "SEED = 42\n",
    "TRAIN_RATIO = 0.72\n",
    "VAL_RATIO   = 0.15\n",
    "TEST_RATIO  = 0.13\n",
    "\n",
    "assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9\n",
    "\n",
    "# 1) lấy danh sách tòa nhà (group)\n",
    "buildings = df[\"ma_dia_diem\"].astype(str).unique()\n",
    "rng = np.random.RandomState(SEED)\n",
    "rng.shuffle(buildings)\n",
    "\n",
    "n = len(buildings)\n",
    "n_train = int(round(n * TRAIN_RATIO))\n",
    "n_val   = int(round(n * VAL_RATIO))\n",
    "# phần còn lại là test\n",
    "train_b = set(buildings[:n_train])\n",
    "val_b   = set(buildings[n_train:n_train+n_val])\n",
    "test_b  = set(buildings[n_train+n_val:])\n",
    "\n",
    "# 2) tạo mask theo group\n",
    "train_mask = df[\"ma_dia_diem\"].astype(str).isin(train_b)\n",
    "val_mask   = df[\"ma_dia_diem\"].astype(str).isin(val_b)\n",
    "test_mask  = df[\"ma_dia_diem\"].astype(str).isin(test_b)\n",
    "\n",
    "df_train = df[train_mask].copy()\n",
    "df_val   = df[val_mask].copy()\n",
    "df_test  = df[test_mask].copy()\n",
    "\n",
    "print(\"=== SPLIT SUMMARY (by ma_dia_diem) ===\")\n",
    "print(\"Buildings:\", n)\n",
    "print(\"Train buildings:\", len(train_b), \"| rows:\", df_train.shape[0])\n",
    "print(\"Val   buildings:\", len(val_b),   \"| rows:\", df_val.shape[0])\n",
    "print(\"Test  buildings:\", len(test_b),  \"| rows:\", df_test.shape[0])\n",
    "\n",
    "# 3) leakage check (phải = 0)\n",
    "train_set = set(df_train[\"ma_dia_diem\"].astype(str).unique())\n",
    "val_set   = set(df_val[\"ma_dia_diem\"].astype(str).unique())\n",
    "test_set  = set(df_test[\"ma_dia_diem\"].astype(str).unique())\n",
    "\n",
    "print(\"\\nLeakage check (should be 0):\")\n",
    "print(\"Train ∩ Val :\", len(train_set & val_set))\n",
    "print(\"Train ∩ Test:\", len(train_set & test_set))\n",
    "print(\"Val   ∩ Test:\", len(val_set & test_set))\n",
    "\n",
    "# 4) distribution staff_band theo từng tập\n",
    "print(\"\\n=== staff_band distribution ===\")\n",
    "for name, d in [(\"Train\", df_train), (\"Val\", df_val), (\"Test\", df_test)]:\n",
    "    vc = d[\"staff_band\"].value_counts().sort_index()\n",
    "    print(f\"\\n{name}:\")\n",
    "    print(vc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "d898cfe6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Categorical cols: ['loai_ca', 'bat_dau', 'ket_thuc']\n",
      "\n",
      "Shapes:\n",
      "Train: (282, 115) (282,)\n",
      "Val  : (56, 115) (56,)\n",
      "Test : (56, 115) (56,)\n",
      "\n",
      "Sample feature columns (first 25):\n",
      "['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong']\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# ========= 1) Helper: parse time to hour float =========\n",
    "def time_to_hour(x):\n",
    "    if pd.isna(x):\n",
    "        return np.nan\n",
    "    # nếu là kiểu datetime/time của pandas\n",
    "    if hasattr(x, \"hour\"):\n",
    "        return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
    "    s = str(x).strip()\n",
    "    # format \"HH:MM:SS\" hoặc \"HH:MM\"\n",
    "    if \":\" in s:\n",
    "        parts = s.split(\":\")\n",
    "        try:\n",
    "            h = float(parts[0])\n",
    "            m = float(parts[1]) if len(parts) > 1 else 0.0\n",
    "            return h + m/60.0\n",
    "        except:\n",
    "            return np.nan\n",
    "    # fallback\n",
    "    try:\n",
    "        return float(s)\n",
    "    except:\n",
    "        return np.nan\n",
    "\n",
    "def add_time_features(d):\n",
    "    d = d.copy()\n",
    "    d[\"hour_start\"] = d[\"bat_dau\"].apply(time_to_hour)\n",
    "    d[\"hour_end\"]   = d[\"ket_thuc\"].apply(time_to_hour)\n",
    "\n",
    "    # shift length (handle cross-day)\n",
    "    # nếu end < start -> qua ngày: +24\n",
    "    end_adj = d[\"hour_end\"].copy()\n",
    "    mask_cross = (d[\"hour_end\"].notna()) & (d[\"hour_start\"].notna()) & (d[\"hour_end\"] < d[\"hour_start\"])\n",
    "    end_adj[mask_cross] = end_adj[mask_cross] + 24.0\n",
    "\n",
    "    d[\"shift_length\"] = (end_adj - d[\"hour_start\"]).clip(lower=0)\n",
    "    d[\"is_cross_day\"] = mask_cross.astype(int)\n",
    "\n",
    "    # buckets theo giờ bắt đầu\n",
    "    hs = d[\"hour_start\"].fillna(-1)\n",
    "    d[\"is_morning_shift\"]   = ((hs >= 6)  & (hs < 12)).astype(int)\n",
    "    d[\"is_afternoon_shift\"] = ((hs >= 12) & (hs < 18)).astype(int)\n",
    "    d[\"is_evening_shift\"]   = ((hs >= 18) & (hs < 24)).astype(int)\n",
    "    d[\"is_night_shift\"]     = ((hs >= 0)  & (hs < 6)).astype(int)\n",
    "\n",
    "    return d\n",
    "\n",
    "# ========= 2) Apply time features =========\n",
    "df_train_fe = add_time_features(df_train)\n",
    "df_val_fe   = add_time_features(df_val)\n",
    "df_test_fe  = add_time_features(df_test)\n",
    "\n",
    "# ========= 3) Drop columns (the ones you requested) + leakage columns =========\n",
    "DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
    "LEAK_COLS = [\"so_luong\"]   # label thật -> tuyệt đối không dùng feature\n",
    "\n",
    "# giữ lại staff_band làm y\n",
    "y_train = df_train_fe[\"staff_band\"].astype(int)\n",
    "y_val   = df_val_fe[\"staff_band\"].astype(int)\n",
    "y_test  = df_test_fe[\"staff_band\"].astype(int)\n",
    "\n",
    "X_train = df_train_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_train_fe.columns])\n",
    "X_val   = df_val_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_val_fe.columns])\n",
    "X_test  = df_test_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_test_fe.columns])\n",
    "\n",
    "# ========= 4) One-hot only categorical columns =========\n",
    "cat_cols = [c for c in X_train.columns if X_train[c].dtype == \"object\"]\n",
    "print(\"Categorical cols:\", cat_cols)\n",
    "\n",
    "X_train = pd.get_dummies(X_train, columns=cat_cols, dummy_na=True)\n",
    "X_val   = pd.get_dummies(X_val,   columns=cat_cols, dummy_na=True)\n",
    "X_test  = pd.get_dummies(X_test,  columns=cat_cols, dummy_na=True)\n",
    "\n",
    "# align columns across splits\n",
    "X_train, X_val = X_train.align(X_val, join=\"left\", axis=1, fill_value=0)\n",
    "X_train, X_test = X_train.align(X_test, join=\"left\", axis=1, fill_value=0)\n",
    "\n",
    "# fill NaN numeric\n",
    "X_train = X_train.fillna(0)\n",
    "X_val   = X_val.fillna(0)\n",
    "X_test  = X_test.fillna(0)\n",
    "\n",
    "print(\"\\nShapes:\")\n",
    "print(\"Train:\", X_train.shape, y_train.shape)\n",
    "print(\"Val  :\", X_val.shape, y_val.shape)\n",
    "print(\"Test :\", X_test.shape, y_test.shape)\n",
    "\n",
    "print(\"\\nSample feature columns (first 25):\")\n",
    "print(list(X_train.columns[:25]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "253b34f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "======================================================================\n",
      "MODEL: LR_balanced\n",
      "Train | Acc=0.624 | MacroF1=0.552\n",
      "Val   | Acc=0.429 | MacroF1=0.277\n",
      "Test  | Acc=0.518 | MacroF1=0.402\n",
      "\n",
      "[VAL] Confusion matrix:\n",
      "[[18  5  0  5]\n",
      " [ 6  4  0 10]\n",
      " [ 2  2  0  1]\n",
      " [ 0  1  0  2]]\n",
      "\n",
      "[VAL] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.692     0.643     0.667        28\n",
      "           1      0.333     0.200     0.250        20\n",
      "           2      0.000     0.000     0.000         5\n",
      "           3      0.111     0.667     0.190         3\n",
      "\n",
      "    accuracy                          0.429        56\n",
      "   macro avg      0.284     0.377     0.277        56\n",
      "weighted avg      0.471     0.429     0.433        56\n",
      "\n",
      "\n",
      "[TEST] Confusion matrix:\n",
      "[[20  8  1  2  2]\n",
      " [ 3  4  2  1  2]\n",
      " [ 0  1  1  3  0]\n",
      " [ 0  0  0  2  0]\n",
      " [ 0  1  0  1  2]]\n",
      "\n",
      "[TEST] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.870     0.606     0.714        33\n",
      "           1      0.286     0.333     0.308        12\n",
      "           2      0.250     0.200     0.222         5\n",
      "           3      0.222     1.000     0.364         2\n",
      "           4      0.333     0.500     0.400         4\n",
      "\n",
      "    accuracy                          0.518        56\n",
      "   macro avg      0.392     0.528     0.402        56\n",
      "weighted avg      0.628     0.518     0.548        56\n",
      "\n",
      "\n",
      "======================================================================\n",
      "MODEL: RF_balanced\n",
      "Train | Acc=0.894 | MacroF1=0.875\n",
      "Val   | Acc=0.607 | MacroF1=0.424\n",
      "Test  | Acc=0.625 | MacroF1=0.453\n",
      "\n",
      "[VAL] Confusion matrix:\n",
      "[[19  8  1  0]\n",
      " [ 6 13  1  0]\n",
      " [ 0  2  2  1]\n",
      " [ 0  2  1  0]]\n",
      "\n",
      "[VAL] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.760     0.679     0.717        28\n",
      "           1      0.520     0.650     0.578        20\n",
      "           2      0.400     0.400     0.400         5\n",
      "           3      0.000     0.000     0.000         3\n",
      "\n",
      "    accuracy                          0.607        56\n",
      "   macro avg      0.420     0.432     0.424        56\n",
      "weighted avg      0.601     0.607     0.601        56\n",
      "\n",
      "\n",
      "[TEST] Confusion matrix:\n",
      "[[23  2  8  0  0]\n",
      " [ 3  7  2  0  0]\n",
      " [ 0  0  4  1  0]\n",
      " [ 0  1  0  1  0]\n",
      " [ 0  0  4  0  0]]\n",
      "\n",
      "[TEST] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.885     0.697     0.780        33\n",
      "           1      0.700     0.583     0.636        12\n",
      "           2      0.222     0.800     0.348         5\n",
      "           3      0.500     0.500     0.500         2\n",
      "           4      0.000     0.000     0.000         4\n",
      "\n",
      "    accuracy                          0.625        56\n",
      "   macro avg      0.461     0.516     0.453        56\n",
      "weighted avg      0.709     0.625     0.645        56\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
      "c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "======================================================================\n",
      "MODEL: HGB\n",
      "Train | Acc=0.989 | MacroF1=0.989\n",
      "Val   | Acc=0.589 | MacroF1=0.405\n",
      "Test  | Acc=0.518 | MacroF1=0.326\n",
      "\n",
      "[VAL] Confusion matrix:\n",
      "[[22  5  1  0]\n",
      " [ 8  9  2  1]\n",
      " [ 1  1  2  1]\n",
      " [ 1  2  0  0]]\n",
      "\n",
      "[VAL] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.688     0.786     0.733        28\n",
      "           1      0.529     0.450     0.486        20\n",
      "           2      0.400     0.400     0.400         5\n",
      "           3      0.000     0.000     0.000         3\n",
      "\n",
      "    accuracy                          0.589        56\n",
      "   macro avg      0.404     0.409     0.405        56\n",
      "weighted avg      0.569     0.589     0.576        56\n",
      "\n",
      "\n",
      "[TEST] Confusion matrix:\n",
      "[[22  6  4  1  0]\n",
      " [ 4  5  3  0  0]\n",
      " [ 1  2  1  1  0]\n",
      " [ 0  1  1  0  0]\n",
      " [ 0  1  2  0  1]]\n",
      "\n",
      "[TEST] Classification report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0      0.815     0.667     0.733        33\n",
      "           1      0.333     0.417     0.370        12\n",
      "           2      0.091     0.200     0.125         5\n",
      "           3      0.000     0.000     0.000         2\n",
      "           4      1.000     0.250     0.400         4\n",
      "\n",
      "    accuracy                          0.518        56\n",
      "   macro avg      0.448     0.307     0.326        56\n",
      "weighted avg      0.631     0.518     0.551        56\n",
      "\n",
      "\n",
      "======================================================================\n",
      "SUMMARY (sorted by Val MacroF1):\n",
      "RF_balanced  | Val MacroF1=0.424 | Test MacroF1=0.453\n",
      "HGB          | Val MacroF1=0.405 | Test MacroF1=0.326\n",
      "LR_balanced  | Val MacroF1=0.277 | Test MacroF1=0.402\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier\n",
    "\n",
    "def eval_cls(name, model, Xtr, ytr, Xva, yva, Xte, yte):\n",
    "    model.fit(Xtr, ytr)\n",
    "\n",
    "    def _metrics(split_name, X, y):\n",
    "        pred = model.predict(X)\n",
    "        acc = accuracy_score(y, pred)\n",
    "        f1m = f1_score(y, pred, average=\"macro\")\n",
    "        return pred, acc, f1m\n",
    "\n",
    "    pred_tr, acc_tr, f1_tr = _metrics(\"Train\", Xtr, ytr)\n",
    "    pred_va, acc_va, f1_va = _metrics(\"Val\",   Xva, yva)\n",
    "    pred_te, acc_te, f1_te = _metrics(\"Test\",  Xte, yte)\n",
    "\n",
    "    print(\"\\n\" + \"=\"*70)\n",
    "    print(f\"MODEL: {name}\")\n",
    "    print(f\"Train | Acc={acc_tr:.3f} | MacroF1={f1_tr:.3f}\")\n",
    "    print(f\"Val   | Acc={acc_va:.3f} | MacroF1={f1_va:.3f}\")\n",
    "    print(f\"Test  | Acc={acc_te:.3f} | MacroF1={f1_te:.3f}\")\n",
    "\n",
    "    print(\"\\n[VAL] Confusion matrix:\")\n",
    "    print(confusion_matrix(yva, pred_va))\n",
    "    print(\"\\n[VAL] Classification report:\")\n",
    "    print(classification_report(yva, pred_va, digits=3))\n",
    "\n",
    "    print(\"\\n[TEST] Confusion matrix:\")\n",
    "    print(confusion_matrix(yte, pred_te))\n",
    "    print(\"\\n[TEST] Classification report:\")\n",
    "    print(classification_report(yte, pred_te, digits=3))\n",
    "\n",
    "    return {\n",
    "        \"name\": name,\n",
    "        \"model\": model,\n",
    "        \"val_macro_f1\": f1_va,\n",
    "        \"test_macro_f1\": f1_te\n",
    "    }\n",
    "\n",
    "# 1) Logistic Regression (balanced) - baseline mạnh cho tabular\n",
    "lr = LogisticRegression(\n",
    "    max_iter=5000,\n",
    "    class_weight=\"balanced\",\n",
    "    n_jobs=None\n",
    ")\n",
    "\n",
    "# 2) RandomForest (balanced_subsample)\n",
    "rf = RandomForestClassifier(\n",
    "    n_estimators=600,\n",
    "    max_depth=None,\n",
    "    min_samples_leaf=2,\n",
    "    random_state=42,\n",
    "    class_weight=\"balanced_subsample\",\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "# 3) HistGradientBoosting (mạnh cho tabular, chạy nhanh)\n",
    "hgb = HistGradientBoostingClassifier(\n",
    "    learning_rate=0.06,\n",
    "    max_depth=6,\n",
    "    max_iter=600,\n",
    "    random_state=42\n",
    ")\n",
    "\n",
    "results = []\n",
    "results.append(eval_cls(\"LR_balanced\", lr, X_train, y_train, X_val, y_val, X_test, y_test))\n",
    "results.append(eval_cls(\"RF_balanced\", rf, X_train, y_train, X_val, y_val, X_test, y_test))\n",
    "results.append(eval_cls(\"HGB\", hgb, X_train, y_train, X_val, y_val, X_test, y_test))\n",
    "\n",
    "# Summary\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"SUMMARY (sorted by Val MacroF1):\")\n",
    "for r in sorted(results, key=lambda x: x[\"val_macro_f1\"], reverse=True):\n",
    "    print(f\"{r['name']:12s} | Val MacroF1={r['val_macro_f1']:.3f} | Test MacroF1={r['test_macro_f1']:.3f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "e1851e78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded: final_2.xlsx | sheet: final\n",
      "Shape (raw): (401, 42)\n",
      "Shape (after dedup): (394, 42)\n",
      "\n",
      "=== TARGET SUMMARY (so_luong) ===\n",
      "count    394.000000\n",
      "mean       4.710660\n",
      "std        6.848602\n",
      "min        0.000000\n",
      "25%        1.000000\n",
      "50%        2.000000\n",
      "75%        5.000000\n",
      "max       64.000000\n",
      "Name: so_luong, dtype: float64\n",
      "Missing target: 0\n",
      "Negative target: 0\n",
      "Zero target: 3\n",
      "\n",
      "Sample rows:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ma_dia_diem</th>\n",
       "      <th>all_task_normal</th>\n",
       "      <th>all_task_dinhky</th>\n",
       "      <th>loai_ca</th>\n",
       "      <th>bat_dau</th>\n",
       "      <th>ket_thuc</th>\n",
       "      <th>tong_gio_lam</th>\n",
       "      <th>so_ca_cua_toa</th>\n",
       "      <th>so_luong</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>...</th>\n",
       "      <th>dien_tich_tham</th>\n",
       "      <th>doc_ham</th>\n",
       "      <th>vien_phan_quang</th>\n",
       "      <th>op_tuong</th>\n",
       "      <th>op_chan_tuong</th>\n",
       "      <th>ranh_thoat_nuoc</th>\n",
       "      <th>dien_tich_kinh</th>\n",
       "      <th>num_medical_tasks_total</th>\n",
       "      <th>num_indoor_room_tasks</th>\n",
       "      <th>is_tasks_text_missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>115-2</td>\n",
       "      <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Part time</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>10:30:00</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Hành chính</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>24</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Ca sáng</td>\n",
       "      <td>06:00:00</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  ma_dia_diem                                    all_task_normal  \\\n",
       "0       115-2  Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...   \n",
       "1       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "2       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "\n",
       "                                     all_task_dinhky     loai_ca   bat_dau  \\\n",
       "0                                                NaN   Part time  06:30:00   \n",
       "1  Lau bảng biển, bình cứu hỏa , cây nước hành la...  Hành chính  06:30:00   \n",
       "2  Lau bảng biển, bình cứu hỏa , cây nước hành la...     Ca sáng  06:00:00   \n",
       "\n",
       "   ket_thuc  tong_gio_lam  so_ca_cua_toa  so_luong  num_tasks  ...  \\\n",
       "0  10:30:00           4.0              1         1          7  ...   \n",
       "1  16:00:00           7.5              6        24        441  ...   \n",
       "2  14:00:00           8.0              6         3        441  ...   \n",
       "\n",
       "   dien_tich_tham  doc_ham  vien_phan_quang  op_tuong  op_chan_tuong  \\\n",
       "0             0.0        0                0       0.0            0.0   \n",
       "1             0.0       70                0    9176.0           89.0   \n",
       "2             0.0       70                0    9176.0           89.0   \n",
       "\n",
       "   ranh_thoat_nuoc  dien_tich_kinh  num_medical_tasks_total  \\\n",
       "0                0            20.0                        0   \n",
       "1               25           894.0                      112   \n",
       "2               25           894.0                      112   \n",
       "\n",
       "   num_indoor_room_tasks  is_tasks_text_missing  \n",
       "0                      1                      0  \n",
       "1                     39                      0  \n",
       "2                     39                      0  \n",
       "\n",
       "[3 rows x 42 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 1 — LOAD DATA & BASIC CLEAN\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "DATA_PATH = \"final_2.xlsx\"\n",
    "SHEET_NAME = \"final\"\n",
    "\n",
    "# 1. Load\n",
    "df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
    "print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
    "print(\"Shape (raw):\", df.shape)\n",
    "\n",
    "# 2. Drop duplicate full rows\n",
    "df = df.drop_duplicates().reset_index(drop=True)\n",
    "print(\"Shape (after dedup):\", df.shape)\n",
    "\n",
    "# 3. Check target\n",
    "assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
    "\n",
    "print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
    "print(df[\"so_luong\"].describe())\n",
    "print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n",
    "print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n",
    "print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n",
    "\n",
    "# 4. Peek data\n",
    "print(\"\\nSample rows:\")\n",
    "display(df.head(3))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c9268c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CELL 2 — FEATURE SELECTION (STRICT)\n",
    "\n",
    "# 1. Xem toàn bộ cột\n",
    "print(\"All columns:\")\n",
    "for i, c in enumerate(df.columns):\n",
    "    print(f\"{i:2d}: {c}\")\n",
    "\n",
    "# 2. Xác định cột cần loại bỏ (THEO THỎA THUẬN)\n",
    "DROP_COLS = [\n",
    "    df.columns[0],   # ma_dia_diem\n",
    "    df.columns[1],   # all_task_normal\n",
    "    df.columns[2],   # all_task_dinhky\n",
    "    df.columns[-1],  # is_tasks_text_missing\n",
    "]\n",
    "\n",
    "print(\"\\nDropped columns:\")\n",
    "for c in DROP_COLS:\n",
    "    print(\" -\", c)\n",
    "\n",
    "# 3. Tạo X, y\n",
    "X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
    "y = df[\"so_luong\"].astype(float)\n",
    "\n",
    "print(\"\\nShapes:\")\n",
    "print(\"X:\", X.shape)\n",
    "print(\"y:\", y.shape)\n",
    "\n",
    "# 4. Kiểm tra kiểu dữ liệu\n",
    "print(\"\\nFeature dtypes:\")\n",
    "display(X.dtypes.value_counts())\n",
    "\n",
    "# 5. Kiểm tra missing\n",
    "print(\"\\nMissing values in X:\")\n",
    "display(X.isna().sum().sort_values(ascending=False).head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b975f6cf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a595fe8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2fb9c84",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}