predict_caLamviec_nhansu/final.ipynb

3713 lines
136 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "76aa1b75",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Loaded: final_2.xlsx | sheet: final\n",
"Shape (raw): (401, 42)\n",
"\n",
"=== TARGET SUMMARY (so_luong) ===\n",
"count 401.000000\n",
"mean 4.660848\n",
"std 6.799242\n",
"min 0.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 64.000000\n",
"Name: so_luong, dtype: float64\n",
"Missing target: 0\n",
"Negative target: 0\n",
"Zero target: 3\n",
"\n",
"Duplicate full rows: 7\n",
"Shape (dedup): (394, 42)\n",
"\n",
"Columns: 42\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ma_dia_diem</th>\n",
" <th>all_task_normal</th>\n",
" <th>all_task_dinhky</th>\n",
" <th>loai_ca</th>\n",
" <th>bat_dau</th>\n",
" <th>ket_thuc</th>\n",
" <th>tong_gio_lam</th>\n",
" <th>so_ca_cua_toa</th>\n",
" <th>so_luong</th>\n",
" <th>num_tasks</th>\n",
" <th>...</th>\n",
" <th>dien_tich_tham</th>\n",
" <th>doc_ham</th>\n",
" <th>vien_phan_quang</th>\n",
" <th>op_tuong</th>\n",
" <th>op_chan_tuong</th>\n",
" <th>ranh_thoat_nuoc</th>\n",
" <th>dien_tich_kinh</th>\n",
" <th>num_medical_tasks_total</th>\n",
" <th>num_indoor_room_tasks</th>\n",
" <th>is_tasks_text_missing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>115-2</td>\n",
" <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
" <td>NaN</td>\n",
" <td>Part time</td>\n",
" <td>06:30:00</td>\n",
" <td>10:30:00</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Hành chính</td>\n",
" <td>06:30:00</td>\n",
" <td>16:00:00</td>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>24</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Ca sáng</td>\n",
" <td>06:00:00</td>\n",
" <td>14:00:00</td>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 42 columns</p>\n",
"</div>"
],
"text/plain": [
" ma_dia_diem all_task_normal \\\n",
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"\n",
" all_task_dinhky loai_ca bat_dau \\\n",
"0 NaN Part time 06:30:00 \n",
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
"\n",
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
"0 10:30:00 4.0 1 1 7 ... \n",
"1 16:00:00 7.5 6 24 441 ... \n",
"2 14:00:00 8.0 6 3 441 ... \n",
"\n",
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
"0 0.0 0 0 0.0 0.0 \n",
"1 0.0 70 0 9176.0 89.0 \n",
"2 0.0 70 0 9176.0 89.0 \n",
"\n",
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
"0 0 20.0 0 \n",
"1 25 894.0 112 \n",
"2 25 894.0 112 \n",
"\n",
" num_indoor_room_tasks is_tasks_text_missing \n",
"0 1 0 \n",
"1 39 0 \n",
"2 39 0 \n",
"\n",
"[3 rows x 42 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Config: workbook path, sheet name, and the regression target column.\n",
"DATA_PATH = \"final_2.xlsx\"\n",
"SHEET = \"final\"\n",
"TARGET = \"so_luong\"\n",
"\n",
"# 1) Load\n",
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET)\n",
"print(\"✅ Loaded:\", DATA_PATH, \"| sheet:\", SHEET)\n",
"print(\"Shape (raw):\", df.shape)\n",
"\n",
"# 2) Target sanity\n",
"# Coerce target to numeric; unparseable values become NaN and are counted below.\n",
"assert TARGET in df.columns, f\"❌ Missing target column: {TARGET}\"\n",
"df[TARGET] = pd.to_numeric(df[TARGET], errors=\"coerce\")\n",
"\n",
"print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
"print(df[TARGET].describe())\n",
"print(\"Missing target:\", df[TARGET].isna().sum())\n",
"print(\"Negative target:\", (df[TARGET] < 0).sum())\n",
"print(\"Zero target:\", (df[TARGET] == 0).sum())\n",
"\n",
"# 3) Deduplicate full rows\n",
"# Exact-duplicate rows would otherwise leak identical samples across splits.\n",
"dup = df.duplicated().sum()\n",
"print(\"\\nDuplicate full rows:\", dup)\n",
"if dup > 0:\n",
"    df = df.drop_duplicates().reset_index(drop=True)\n",
"print(\"Shape (dedup):\", df.shape)\n",
"\n",
"# 4) Quick preview\n",
"print(\"\\nColumns:\", len(df.columns))\n",
"display(df.head(3))\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "421b7556",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
" 'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
" 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
" 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
" 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
" 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
" 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
" 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
" 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
" 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
" 'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
" 'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
" 'is_tasks_text_missing'],\n",
" dtype='object')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the full column list of the deduplicated frame\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "daf5a333",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ X shape: (394, 48) | y shape: (394,) | #buildings: 192\n",
"Columns sample: ['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio']\n",
"Any NaN left?: False\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tong_gio_lam</th>\n",
" <th>so_ca_cua_toa</th>\n",
" <th>num_tasks</th>\n",
" <th>num_cleaning_tasks</th>\n",
" <th>num_trash_collection_tasks</th>\n",
" <th>num_monitoring_tasks</th>\n",
" <th>num_deep_cleaning_tasks</th>\n",
" <th>num_support_tasks</th>\n",
" <th>num_other_tasks</th>\n",
" <th>num_wc_tasks</th>\n",
" <th>...</th>\n",
" <th>is_night_shift</th>\n",
" <th>is_morning_shift</th>\n",
" <th>is_afternoon_shift</th>\n",
" <th>is_evening_shift</th>\n",
" <th>loai_ca_Ca chiều</th>\n",
" <th>loai_ca_Ca gãy</th>\n",
" <th>loai_ca_Ca sáng</th>\n",
" <th>loai_ca_Ca đêm</th>\n",
" <th>loai_ca_Hành chính</th>\n",
" <th>loai_ca_Part time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>9.5</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>9.5</td>\n",
" <td>3</td>\n",
" <td>135</td>\n",
" <td>81</td>\n",
" <td>35</td>\n",
" <td>38</td>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>25</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7.5</td>\n",
" <td>3</td>\n",
" <td>135</td>\n",
" <td>81</td>\n",
" <td>35</td>\n",
" <td>38</td>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>25</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>7.5</td>\n",
" <td>3</td>\n",
" <td>135</td>\n",
" <td>81</td>\n",
" <td>35</td>\n",
" <td>38</td>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>21</td>\n",
" <td>25</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 48 columns</p>\n",
"</div>"
],
"text/plain": [
" tong_gio_lam so_ca_cua_toa num_tasks num_cleaning_tasks \\\n",
"0 4.0 1 7 7 \n",
"1 7.5 6 441 258 \n",
"2 8.0 6 441 258 \n",
"3 8.0 6 441 258 \n",
"4 8.0 6 441 258 \n",
"5 7.5 6 441 258 \n",
"6 9.5 6 441 258 \n",
"7 9.5 3 135 81 \n",
"8 7.5 3 135 81 \n",
"9 7.5 3 135 81 \n",
"\n",
" num_trash_collection_tasks num_monitoring_tasks num_deep_cleaning_tasks \\\n",
"0 1 2 1 \n",
"1 145 134 75 \n",
"2 145 134 75 \n",
"3 145 134 75 \n",
"4 145 134 75 \n",
"5 145 134 75 \n",
"6 145 134 75 \n",
"7 35 38 10 \n",
"8 35 38 10 \n",
"9 35 38 10 \n",
"\n",
" num_support_tasks num_other_tasks num_wc_tasks ... is_night_shift \\\n",
"0 0 0 4 ... 0 \n",
"1 57 45 89 ... 0 \n",
"2 57 45 89 ... 0 \n",
"3 57 45 89 ... 0 \n",
"4 57 45 89 ... 1 \n",
"5 57 45 89 ... 0 \n",
"6 57 45 89 ... 0 \n",
"7 20 21 25 ... 0 \n",
"8 20 21 25 ... 0 \n",
"9 20 21 25 ... 1 \n",
"\n",
" is_morning_shift is_afternoon_shift is_evening_shift loai_ca_Ca chiều \\\n",
"0 1 0 0 False \n",
"1 1 0 0 False \n",
"2 1 0 0 False \n",
"3 0 1 0 True \n",
"4 0 0 0 False \n",
"5 0 1 0 False \n",
"6 1 0 0 False \n",
"7 1 0 0 False \n",
"8 1 0 0 False \n",
"9 0 0 0 False \n",
"\n",
" loai_ca_Ca gãy loai_ca_Ca sáng loai_ca_Ca đêm loai_ca_Hành chính \\\n",
"0 False False False False \n",
"1 False False False True \n",
"2 False True False False \n",
"3 False False False False \n",
"4 False False True False \n",
"5 True False False False \n",
"6 False False False True \n",
"7 False False False True \n",
"8 True False False False \n",
"9 False False True False \n",
"\n",
" loai_ca_Part time \n",
"0 True \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False \n",
"5 False \n",
"6 False \n",
"7 False \n",
"8 False \n",
"9 False \n",
"\n",
"[10 rows x 48 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"TARGET = \"so_luong\"\n",
"GROUP_COL = \"ma_dia_diem\"\n",
"\n",
"# Only drop the four requested columns (identifier + raw task text + flag).\n",
"DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
"\n",
"# ---------- helpers ----------\n",
"def parse_hour(t):\n",
"    \"\"\"Convert 'HH:MM:SS' (or datetime-like) -> float hour in [0,24).\n",
"\n",
"    Returns NaN for missing or unparseable values.\n",
"    \"\"\"\n",
"    if pd.isna(t):\n",
"        return np.nan\n",
"    # pandas Timestamp / datetime.time expose .hour/.minute directly\n",
"    if hasattr(t, \"hour\"):\n",
"        return float(t.hour) + float(getattr(t, \"minute\", 0)) / 60.0\n",
"    s = str(t).strip()\n",
"    # handle '06:30:00'\n",
"    if \":\" in s:\n",
"        parts = s.split(\":\")\n",
"        try:\n",
"            hh = int(float(parts[0]))\n",
"            mm = int(float(parts[1])) if len(parts) > 1 else 0\n",
"            return hh + mm / 60.0\n",
"        except (TypeError, ValueError):  # malformed time string -> NaN (was bare except)\n",
"            return np.nan\n",
"    # handle plain numerics like '6.5'\n",
"    try:\n",
"        return float(s)\n",
"    except (TypeError, ValueError):\n",
"        return np.nan\n",
"\n",
"# ---------- 1) y + groups ----------\n",
"y = df[TARGET].astype(float).copy()\n",
"groups = df[GROUP_COL].astype(str).copy()  # used later to split by building\n",
"\n",
"# ---------- 2) time features (bat_dau/ket_thuc) ----------\n",
"hour_start = df[\"bat_dau\"].apply(parse_hour)\n",
"hour_end = df[\"ket_thuc\"].apply(parse_hour)\n",
"\n",
"# cross-day shift (e.g. starts 22:00, ends 06:00 next day)\n",
"is_cross_day = ((hour_end < hour_start) & hour_start.notna() & hour_end.notna()).astype(int)\n",
"\n",
"# shift length in hours; modulo 24 keeps cross-day shifts positive\n",
"shift_length = ((hour_end - hour_start) % 24).fillna(0)\n",
"\n",
"# coarse time-of-day flags from the start hour\n",
"is_night_shift = ((hour_start >= 22) | (hour_start < 6)).fillna(False).astype(int)\n",
"is_morning_shift = ((hour_start >= 6) & (hour_start < 12)).fillna(False).astype(int)\n",
"is_afternoon_shift = ((hour_start >= 12) & (hour_start < 18)).fillna(False).astype(int)\n",
"is_evening_shift = ((hour_start >= 18) & (hour_start < 22)).fillna(False).astype(int)\n",
"\n",
"# ---------- 3) Build X from df: drop forbidden + raw time cols + add time features ----------\n",
"X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET], errors=\"ignore\").copy()\n",
"\n",
"# Drop raw time columns (the clock times are NOT one-hot encoded)\n",
"X = X.drop(columns=[c for c in [\"bat_dau\", \"ket_thuc\"] if c in X.columns])\n",
"\n",
"# Add engineered time features\n",
"X[\"hour_start\"] = hour_start.fillna(0)\n",
"X[\"hour_end\"] = hour_end.fillna(0)\n",
"X[\"shift_length\"] = shift_length\n",
"X[\"is_cross_day\"] = is_cross_day\n",
"X[\"is_night_shift\"] = is_night_shift\n",
"X[\"is_morning_shift\"] = is_morning_shift\n",
"X[\"is_afternoon_shift\"] = is_afternoon_shift\n",
"X[\"is_evening_shift\"] = is_evening_shift\n",
"\n",
"# ---------- 4) Fill NA numeric + one-hot ONLY loai_ca ----------\n",
"num_cols = X.select_dtypes(include=[np.number]).columns\n",
"X[num_cols] = X[num_cols].fillna(0)\n",
"\n",
"# One-hot encode shift type (if present)\n",
"if \"loai_ca\" in X.columns:\n",
"    X[\"loai_ca\"] = X[\"loai_ca\"].fillna(\"UNKNOWN\").astype(str)\n",
"    X = pd.get_dummies(X, columns=[\"loai_ca\"], drop_first=True)\n",
"\n",
"print(\"✅ X shape:\", X.shape, \"| y shape:\", y.shape, \"| #buildings:\", groups.nunique())\n",
"print(\"Columns sample:\", list(X.columns[:15]))\n",
"print(\"Any NaN left?:\", X.isna().any().any())\n",
"\n",
"X.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "0ad8de9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Final X shape: (394, 45) | y shape: (394,) | #buildings: 192\n",
"Any NaN left in X?: False\n",
"Time one-hot columns (should be 0): 0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tong_gio_lam</th>\n",
" <th>so_ca_cua_toa</th>\n",
" <th>num_tasks</th>\n",
" <th>num_cleaning_tasks</th>\n",
" <th>num_trash_collection_tasks</th>\n",
" <th>num_monitoring_tasks</th>\n",
" <th>num_deep_cleaning_tasks</th>\n",
" <th>num_support_tasks</th>\n",
" <th>num_other_tasks</th>\n",
" <th>num_wc_tasks</th>\n",
" <th>...</th>\n",
" <th>hour_end</th>\n",
" <th>shift_length</th>\n",
" <th>is_cross_day</th>\n",
" <th>is_night_shift</th>\n",
" <th>loai_ca_Ca chiều</th>\n",
" <th>loai_ca_Ca gãy</th>\n",
" <th>loai_ca_Ca sáng</th>\n",
" <th>loai_ca_Ca đêm</th>\n",
" <th>loai_ca_Hành chính</th>\n",
" <th>loai_ca_Part time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>10.5</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>9.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>441</td>\n",
" <td>258</td>\n",
" <td>145</td>\n",
" <td>134</td>\n",
" <td>75</td>\n",
" <td>57</td>\n",
" <td>45</td>\n",
" <td>89</td>\n",
" <td>...</td>\n",
" <td>14.0</td>\n",
" <td>8.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 45 columns</p>\n",
"</div>"
],
"text/plain": [
" tong_gio_lam so_ca_cua_toa num_tasks num_cleaning_tasks \\\n",
"0 4.0 1 7 7 \n",
"1 7.5 6 441 258 \n",
"2 8.0 6 441 258 \n",
"\n",
" num_trash_collection_tasks num_monitoring_tasks num_deep_cleaning_tasks \\\n",
"0 1 2 1 \n",
"1 145 134 75 \n",
"2 145 134 75 \n",
"\n",
" num_support_tasks num_other_tasks num_wc_tasks ... hour_end \\\n",
"0 0 0 4 ... 10.5 \n",
"1 57 45 89 ... 16.0 \n",
"2 57 45 89 ... 14.0 \n",
"\n",
" shift_length is_cross_day is_night_shift loai_ca_Ca chiều \\\n",
"0 4.0 0 0 False \n",
"1 9.5 0 0 False \n",
"2 8.0 0 0 False \n",
"\n",
" loai_ca_Ca gãy loai_ca_Ca sáng loai_ca_Ca đêm loai_ca_Hành chính \\\n",
"0 False False False False \n",
"1 False False False True \n",
"2 False True False False \n",
"\n",
" loai_ca_Part time \n",
"0 True \n",
"1 False \n",
"2 False \n",
"\n",
"[3 rows x 45 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"TARGET = \"so_luong\"\n",
"GROUP_COL = \"ma_dia_diem\"\n",
"DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
"\n",
"def parse_hour(t):\n",
"    \"\"\"Convert 'HH:MM:SS' or datetime-like -> float hour; NaN if unparseable.\"\"\"\n",
"    if pd.isna(t):\n",
"        return np.nan\n",
"    if hasattr(t, \"hour\"):\n",
"        return float(t.hour) + float(getattr(t, \"minute\", 0)) / 60.0\n",
"    s = str(t).strip()\n",
"    if \":\" in s:\n",
"        parts = s.split(\":\")\n",
"        try:\n",
"            hh = int(float(parts[0]))\n",
"            mm = int(float(parts[1])) if len(parts) > 1 else 0\n",
"            return hh + mm / 60.0\n",
"        except (TypeError, ValueError):  # malformed time string -> NaN (was bare except)\n",
"            return np.nan\n",
"    try:\n",
"        return float(s)\n",
"    except (TypeError, ValueError):\n",
"        return np.nan\n",
"\n",
"# 1) y + groups (groups are reused in the split cell to group rows by building)\n",
"y = df[TARGET].astype(float).copy()\n",
"groups = df[GROUP_COL].astype(str).copy()\n",
"\n",
"# 2) Time features from bat_dau/ket_thuc (engineered, NOT one-hot encoded)\n",
"hour_start = df[\"bat_dau\"].apply(parse_hour)\n",
"hour_end = df[\"ket_thuc\"].apply(parse_hour)\n",
"\n",
"is_cross_day = ((hour_end < hour_start) & hour_start.notna() & hour_end.notna()).astype(int)\n",
"shift_length = ((hour_end - hour_start) % 24).fillna(0)\n",
"is_night_shift = ((hour_start >= 22) | (hour_start < 6)).fillna(False).astype(int)\n",
"\n",
"# 3) Build X: drop the 4 excluded cols + target + raw time cols\n",
"X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET], errors=\"ignore\").copy()\n",
"X = X.drop(columns=[c for c in [\"bat_dau\", \"ket_thuc\"] if c in X.columns], errors=\"ignore\")\n",
"\n",
"# Add engineered time columns\n",
"X[\"hour_start\"] = hour_start.fillna(0)\n",
"X[\"hour_end\"] = hour_end.fillna(0)\n",
"X[\"shift_length\"] = shift_length\n",
"X[\"is_cross_day\"] = is_cross_day\n",
"X[\"is_night_shift\"] = is_night_shift\n",
"\n",
"# 4) Fill NA in numeric columns\n",
"num_cols = X.select_dtypes(include=[np.number]).columns\n",
"X[num_cols] = X[num_cols].fillna(0)\n",
"\n",
"# 5) One-hot encode ONLY loai_ca (if present)\n",
"if \"loai_ca\" in X.columns:\n",
"    X[\"loai_ca\"] = X[\"loai_ca\"].fillna(\"UNKNOWN\").astype(str)\n",
"    X = pd.get_dummies(X, columns=[\"loai_ca\"], drop_first=True)\n",
"\n",
"print(\"✅ Final X shape:\", X.shape, \"| y shape:\", y.shape, \"| #buildings:\", groups.nunique())\n",
"print(\"Any NaN left in X?:\", X.isna().any().any())\n",
"\n",
"# sanity: confirm no bat_dau_*/ket_thuc_* one-hot columns slipped in\n",
"bad_cols = [c for c in X.columns if str(c).startswith(\"bat_dau_\") or str(c).startswith(\"ket_thuc_\")]\n",
"print(\"Time one-hot columns (should be 0):\", len(bad_cols))\n",
"\n",
"X.head(3)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2df3b609",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"Train: (283, 45) (283,) | buildings: 134\n",
"Val: (58, 45) (58,) | buildings: 29\n",
"Test: (53, 45) (53,) | buildings: 29\n",
"\n",
"Leakage check (should all be 0):\n",
"Train ∩ Val : 0\n",
"Train ∩ Test: 0\n",
"Val ∩ Test: 0\n"
]
}
],
"source": [
"from sklearn.model_selection import GroupShuffleSplit\n",
"\n",
"# Grouped splits keep every shift of one building (ma_dia_diem) inside a\n",
"# single split, preventing building-level leakage between train/val/test.\n",
"\n",
"# ----- 1) Split: (train+val) vs test = 85% / 15% -----\n",
"gss1 = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)\n",
"trainval_idx, test_idx = next(gss1.split(X, y, groups=groups))\n",
"\n",
"X_trainval, X_test = X.iloc[trainval_idx].reset_index(drop=True), X.iloc[test_idx].reset_index(drop=True)\n",
"y_trainval, y_test = y.iloc[trainval_idx].reset_index(drop=True), y.iloc[test_idx].reset_index(drop=True)\n",
"groups_trainval = groups.iloc[trainval_idx].reset_index(drop=True)\n",
"groups_test = groups.iloc[test_idx].reset_index(drop=True)\n",
"\n",
"# ----- 2) Split: train vs val inside trainval = 82.35% / 17.65% -> overall 70% / 15% -----\n",
"# 0.1765 ≈ 0.15 / 0.85, so val is ~15% of the full data.\n",
"gss2 = GroupShuffleSplit(n_splits=1, test_size=0.1765, random_state=42)\n",
"train_idx, val_idx = next(gss2.split(X_trainval, y_trainval, groups=groups_trainval))\n",
"\n",
"X_train, X_val = X_trainval.iloc[train_idx].reset_index(drop=True), X_trainval.iloc[val_idx].reset_index(drop=True)\n",
"y_train, y_val = y_trainval.iloc[train_idx].reset_index(drop=True), y_trainval.iloc[val_idx].reset_index(drop=True)\n",
"groups_train = groups_trainval.iloc[train_idx].reset_index(drop=True)\n",
"groups_val = groups_trainval.iloc[val_idx].reset_index(drop=True)\n",
"\n",
"# ----- 3) Report -----\n",
"print(\"Shapes:\")\n",
"print(\"Train:\", X_train.shape, y_train.shape, \"| buildings:\", groups_train.nunique())\n",
"print(\"Val: \", X_val.shape, y_val.shape, \"| buildings:\", groups_val.nunique())\n",
"print(\"Test: \", X_test.shape, y_test.shape, \"| buildings:\", groups_test.nunique())\n",
"\n",
"# Leakage check: ensure no building appears in multiple splits\n",
"train_b = set(groups_train.unique())\n",
"val_b = set(groups_val.unique())\n",
"test_b = set(groups_test.unique())\n",
"print(\"\\nLeakage check (should all be 0):\")\n",
"print(\"Train ∩ Val :\", len(train_b & val_b))\n",
"print(\"Train ∩ Test:\", len(train_b & test_b))\n",
"print(\"Val ∩ Test:\", len(val_b & test_b))\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8cc64019",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Model: Ridge_log\n",
"Train | MAE=2.995 | RMSE=5.484 | R2=0.484\n",
"Val | MAE=1.398 | RMSE=2.015 | R2=0.037\n",
"Test | MAE=2.744 | RMSE=4.416 | R2=0.155\n",
"\n",
"Model: GBR_log\n",
"Train | MAE=1.201 | RMSE=2.466 | R2=0.896\n",
"Val | MAE=1.213 | RMSE=1.832 | R2=0.203\n",
"Test | MAE=2.979 | RMSE=4.810 | R2=-0.002\n",
"\n",
"✅ Saved: test_predictions_gbr.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>17.014321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>5.206821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.930329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>1.853829</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.561030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.867534</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.601529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.436945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.495240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.596085</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred\n",
"0 29.0 17.014321\n",
"1 4.0 5.206821\n",
"2 4.0 2.930329\n",
"3 3.0 1.853829\n",
"4 2.0 1.561030\n",
"5 1.0 0.867534\n",
"6 3.0 1.601529\n",
"7 3.0 2.436945\n",
"8 13.0 3.495240\n",
"9 2.0 1.596085"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"def rmse(y, yhat):\n",
"    \"\"\"Root mean squared error as a plain float.\"\"\"\n",
"    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def eval_model(name, model, X_tr, y_tr, X_va, y_va, X_te, y_te, log_target=False):\n",
"    \"\"\"Fit `model` on train and compute MAE/RMSE/R2 on train/val/test.\n",
"\n",
"    If log_target=True the model is fit on log1p(y) and predictions are\n",
"    mapped back with expm1 before computing metrics.\n",
"    Returns (results_dict, test_predictions).\n",
"    \"\"\"\n",
"    # fit (optionally on the log-transformed target)\n",
"    model.fit(X_tr, np.log1p(y_tr) if log_target else y_tr)\n",
"\n",
"    # predict helper: inverts the log transform when needed\n",
"    def pred(m, X):\n",
"        p = m.predict(X)\n",
"        return np.expm1(p) if log_target else p\n",
"\n",
"    yhat_tr = pred(model, X_tr)\n",
"    yhat_va = pred(model, X_va)\n",
"    yhat_te = pred(model, X_te)\n",
"\n",
"    def metrics(y, yhat):\n",
"        return {\n",
"            \"MAE\": float(mean_absolute_error(y, yhat)),\n",
"            \"RMSE\": rmse(y, yhat),\n",
"            \"R2\": float(r2_score(y, yhat)),\n",
"        }\n",
"\n",
"    res = {\n",
"        \"model\": name,\n",
"        \"Train\": metrics(y_tr, yhat_tr),\n",
"        \"Val\": metrics(y_va, yhat_va),\n",
"        \"Test\": metrics(y_te, yhat_te),\n",
"    }\n",
"    return res, yhat_te\n",
"\n",
"\n",
"results = []\n",
"\n",
"# 1) Ridge (log target)\n",
"# NOTE(review): Ridge's random_state only matters for sag/saga solvers — presumably harmless here; confirm.\n",
"ridge = Ridge(alpha=1.0, random_state=42)\n",
"res_ridge, ridge_test_pred = eval_model(\n",
"    \"Ridge_log\", ridge,\n",
"    X_train, y_train,\n",
"    X_val, y_val,\n",
"    X_test, y_test,\n",
"    log_target=True\n",
")\n",
"results.append(res_ridge)\n",
"\n",
"# 2) Gradient Boosting (log target)\n",
"gbr = GradientBoostingRegressor(\n",
"    n_estimators=300,\n",
"    learning_rate=0.05,\n",
"    max_depth=3,\n",
"    random_state=42\n",
")\n",
"res_gbr, gbr_test_pred = eval_model(\n",
"    \"GBR_log\", gbr,\n",
"    X_train, y_train,\n",
"    X_val, y_val,\n",
"    X_test, y_test,\n",
"    log_target=True\n",
")\n",
"results.append(res_gbr)\n",
"\n",
"# Print per-split metrics for each model\n",
"for r in results:\n",
"    print(\"\\nModel:\", r[\"model\"])\n",
"    for split in [\"Train\", \"Val\", \"Test\"]:\n",
"        m = r[split]\n",
"        print(f\"{split:5s} | MAE={m['MAE']:.3f} | RMSE={m['RMSE']:.3f} | R2={m['R2']:.3f}\")\n",
"\n",
"# Save TEST predictions (GBR). Note: metrics above use the raw (unclamped)\n",
"# predictions; only the exported file clamps negatives to 0.\n",
"out = pd.DataFrame({\n",
"    \"y_true\": y_test.values,\n",
"    \"y_pred\": np.maximum(0, gbr_test_pred)  # clamp negative\n",
"})\n",
"out.to_excel(\"test_predictions_gbr.xlsx\", index=False)\n",
"print(\"\\n✅ Saved: test_predictions_gbr.xlsx\")\n",
"out.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e238b641",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== ML only (GBR_reg, log target) ===\n",
"Train | MAE=2.145 | RMSE=4.539 | R2=0.647\n",
"Val | MAE=1.074 | RMSE=1.722 | R2=0.296\n",
"Test | MAE=2.669 | RMSE=4.565 | R2=0.097\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def rmse(y, yhat):\n",
"    \"\"\"Root mean squared error as a plain float.\"\"\"\n",
"    return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def report(name, y_true, y_pred):\n",
"    \"\"\"Print MAE / RMSE / R2 for one split on a single line.\"\"\"\n",
"    print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
"          f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"# A gentler GBR (shallower trees, stronger leaf constraints) to reduce overfitting\n",
"gbr_reg = GradientBoostingRegressor(\n",
"    n_estimators=600,\n",
"    learning_rate=0.03,\n",
"    max_depth=2,\n",
"    min_samples_leaf=10,\n",
"    min_samples_split=20,\n",
"    random_state=42\n",
")\n",
"\n",
"# Train on log1p(target)\n",
"gbr_reg.fit(X_train, np.log1p(y_train))\n",
"\n",
"def predict_original_scale(model, X):\n",
"    \"\"\"Invert the log1p transform and clamp predictions at 0.\"\"\"\n",
"    return np.maximum(0, np.expm1(model.predict(X)))\n",
"\n",
"pred_train = predict_original_scale(gbr_reg, X_train)\n",
"pred_val = predict_original_scale(gbr_reg, X_val)\n",
"pred_test = predict_original_scale(gbr_reg, X_test)\n",
"\n",
"print(\"=== ML only (GBR_reg, log target) ===\")\n",
"report(\"Train\", y_train, pred_train)\n",
"report(\"Val\", y_val, pred_val)\n",
"report(\"Test\", y_test, pred_test)\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "635bf672",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best K on VAL: {'K': 150, 'mae_val': 1.0616528195190562}\n",
"\n",
"=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\n",
"Rule used: min_staff = ceil(num_tasks / 150)\n",
"Train | MAE=2.259 | RMSE=4.601 | R2=0.637\n",
"Val | MAE=1.062 | RMSE=1.721 | R2=0.297\n",
"Test | MAE=2.602 | RMSE=4.527 | R2=0.112\n",
"\n",
"TEST big cases (y_true >= 10): 6\n",
"ML | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
"Rule | MAE=8.320 | RMSE=8.909 | R2=-0.899\n",
"\n",
"✅ Saved: test_predictions_ml_plus_rule.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred_ml</th>\n",
" <th>y_pred_final</th>\n",
" <th>num_tasks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>15.005548</td>\n",
" <td>15.005548</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>4.369563</td>\n",
" <td>4.369563</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.902404</td>\n",
" <td>4.000000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>2.591762</td>\n",
" <td>4.000000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2.089570</td>\n",
" <td>2.089570</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.946909</td>\n",
" <td>1.000000</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.167053</td>\n",
" <td>2.000000</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.154357</td>\n",
" <td>2.154357</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.034284</td>\n",
" <td>3.034284</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.674387</td>\n",
" <td>1.674387</td>\n",
" <td>124</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred_ml y_pred_final num_tasks\n",
"0 29.0 15.005548 15.005548 593\n",
"1 4.0 4.369563 4.369563 593\n",
"2 4.0 2.902404 4.000000 593\n",
"3 3.0 2.591762 4.000000 593\n",
"4 2.0 2.089570 2.089570 33\n",
"5 1.0 0.946909 1.000000 33\n",
"6 3.0 1.167053 2.000000 182\n",
"7 3.0 2.154357 2.154357 182\n",
"8 13.0 3.034284 3.034284 182\n",
"9 2.0 1.674387 1.674387 124"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def rmse(y, yhat):\n",
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def report(name, y_true, y_pred):\n",
" print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
" f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"# num_tasks phải tồn tại trong X_train/X_val/X_test\n",
"assert \"num_tasks\" in X_train.columns, \"❌ Missing num_tasks in X_train\"\n",
"\n",
"nt_train = X_train[\"num_tasks\"].values\n",
"nt_val = X_val[\"num_tasks\"].values\n",
"nt_test = X_test[\"num_tasks\"].values\n",
"\n",
"# ---- tune K on VAL (KHÔNG đụng test khi tune) ----\n",
"Ks = [30, 40, 50, 60, 70, 80, 100, 120, 150]\n",
"best = None\n",
"\n",
"for K in Ks:\n",
" min_val = np.ceil(nt_val / K)\n",
" pred_val_rule = np.maximum(pred_val, min_val)\n",
" mae = mean_absolute_error(y_val, pred_val_rule)\n",
" if (best is None) or (mae < best[\"mae_val\"]):\n",
" best = {\"K\": K, \"mae_val\": mae}\n",
"\n",
"print(\"Best K on VAL:\", best)\n",
"\n",
"K_best = best[\"K\"]\n",
"\n",
"def apply_rule(pred, num_tasks, K):\n",
" min_staff = np.ceil(num_tasks / K)\n",
" return np.maximum(pred, min_staff)\n",
"\n",
"pred_train_rule = apply_rule(pred_train, nt_train, K_best)\n",
"pred_val_rule = apply_rule(pred_val, nt_val, K_best)\n",
"pred_test_rule = apply_rule(pred_test, nt_test, K_best)\n",
"\n",
"print(\"\\n=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\")\n",
"print(\"Rule used: min_staff = ceil(num_tasks / %d)\" % K_best)\n",
"\n",
"report(\"Train\", y_train, pred_train_rule)\n",
"report(\"Val\", y_val, pred_val_rule)\n",
"report(\"Test\", y_test, pred_test_rule)\n",
"\n",
"# ---- big cases analysis on TEST ----\n",
"mask_big = (y_test.values >= 10)\n",
"print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
"if mask_big.sum() > 0:\n",
" report(\"ML\", y_test.values[mask_big], pred_test[mask_big])\n",
" report(\"Rule\", y_test.values[mask_big], pred_test_rule[mask_big])\n",
"\n",
"# save test predictions (after rule)\n",
"out_rule = pd.DataFrame({\n",
" \"y_true\": y_test.values,\n",
" \"y_pred_ml\": pred_test,\n",
" \"y_pred_final\": pred_test_rule,\n",
" \"num_tasks\": nt_test\n",
"})\n",
"out_rule.to_excel(\"test_predictions_ml_plus_rule.xlsx\", index=False)\n",
"print(\"\\n✅ Saved: test_predictions_ml_plus_rule.xlsx\")\n",
"out_rule.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "70493591",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best (a,b) on VAL: {'a': 0, 'b': 0.1, 'mae_val': 1.0737151241156944}\n",
"\n",
"=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\n",
"Rule used: ceil(0 + 0.1*sqrt(num_tasks))\n",
"Train | MAE=2.185 | RMSE=4.551 | R2=0.645\n",
"Val | MAE=1.074 | RMSE=1.701 | R2=0.314\n",
"Test | MAE=2.599 | RMSE=4.556 | R2=0.101\n",
"\n",
"TEST big cases (y_true >= 10): 6\n",
"ML | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
"Smooth | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
"\n",
"✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred_ml</th>\n",
" <th>y_pred_final</th>\n",
" <th>num_tasks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>15.005548</td>\n",
" <td>15.005548</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>4.369563</td>\n",
" <td>4.369563</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.902404</td>\n",
" <td>3.000000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>2.591762</td>\n",
" <td>3.000000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2.089570</td>\n",
" <td>2.089570</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.946909</td>\n",
" <td>1.000000</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.167053</td>\n",
" <td>2.000000</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.154357</td>\n",
" <td>2.154357</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.034284</td>\n",
" <td>3.034284</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.674387</td>\n",
" <td>2.000000</td>\n",
" <td>124</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred_ml y_pred_final num_tasks\n",
"0 29.0 15.005548 15.005548 593\n",
"1 4.0 4.369563 4.369563 593\n",
"2 4.0 2.902404 3.000000 593\n",
"3 3.0 2.591762 3.000000 593\n",
"4 2.0 2.089570 2.089570 33\n",
"5 1.0 0.946909 1.000000 33\n",
"6 3.0 1.167053 2.000000 182\n",
"7 3.0 2.154357 2.154357 182\n",
"8 13.0 3.034284 3.034284 182\n",
"9 2.0 1.674387 2.000000 124"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def rmse(y, yhat):\n",
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def report(name, y_true, y_pred):\n",
" print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
" f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"nt_train = X_train[\"num_tasks\"].values\n",
"nt_val = X_val[\"num_tasks\"].values\n",
"nt_test = X_test[\"num_tasks\"].values\n",
"\n",
"# search grid (nhỏ thôi để bạn dễ đọc)\n",
"a_list = [0, 1, 2]\n",
"b_list = [0.10, 0.15, 0.20, 0.25, 0.30]\n",
"\n",
"best = None\n",
"for a in a_list:\n",
" for b in b_list:\n",
" min_val = np.ceil(a + b * np.sqrt(nt_val))\n",
" pred_val_rule = np.maximum(pred_val, min_val)\n",
" mae = mean_absolute_error(y_val, pred_val_rule)\n",
" if (best is None) or (mae < best[\"mae_val\"]):\n",
" best = {\"a\": a, \"b\": b, \"mae_val\": mae}\n",
"\n",
"print(\"Best (a,b) on VAL:\", best)\n",
"\n",
"a_best, b_best = best[\"a\"], best[\"b\"]\n",
"\n",
"def apply_smooth_rule(pred, num_tasks, a, b):\n",
" min_staff = np.ceil(a + b * np.sqrt(num_tasks))\n",
" return np.maximum(pred, min_staff)\n",
"\n",
"pred_train_rule2 = apply_smooth_rule(pred_train, nt_train, a_best, b_best)\n",
"pred_val_rule2 = apply_smooth_rule(pred_val, nt_val, a_best, b_best)\n",
"pred_test_rule2 = apply_smooth_rule(pred_test, nt_test, a_best, b_best)\n",
"\n",
"print(\"\\n=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\")\n",
"print(f\"Rule used: ceil({a_best} + {b_best}*sqrt(num_tasks))\")\n",
"\n",
"report(\"Train\", y_train, pred_train_rule2)\n",
"report(\"Val\", y_val, pred_val_rule2)\n",
"report(\"Test\", y_test, pred_test_rule2)\n",
"\n",
"mask_big = (y_test.values >= 10)\n",
"print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
"if mask_big.sum() > 0:\n",
" report(\"ML\", y_test.values[mask_big], pred_test[mask_big])\n",
" report(\"Smooth\",y_test.values[mask_big], pred_test_rule2[mask_big])\n",
"\n",
"# save\n",
"out_rule2 = pd.DataFrame({\n",
" \"y_true\": y_test.values,\n",
" \"y_pred_ml\": pred_test,\n",
" \"y_pred_final\": pred_test_rule2,\n",
" \"num_tasks\": nt_test\n",
"})\n",
"out_rule2.to_excel(\"test_predictions_ml_plus_smooth_rule.xlsx\", index=False)\n",
"print(\"\\n✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\")\n",
"out_rule2.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e71605c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Big-rate (positive class=1):\n",
"Train: 0.14840989399293286 | count: 42 / 283\n",
"Val: 0.017241379310344827 | count: 1 / 58\n",
"Test: 0.11320754716981132 | count: 6 / 53\n",
"\n",
"================================================================================\n",
"MODEL: LogReg_balanced\n",
"\n",
"[VAL] classification_report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.981 0.930 0.955 57\n",
" 1 0.000 0.000 0.000 1\n",
"\n",
" accuracy 0.914 58\n",
" macro avg 0.491 0.465 0.477 58\n",
"weighted avg 0.965 0.914 0.938 58\n",
"\n",
"VAL confusion_matrix:\n",
" [[53 4]\n",
" [ 1 0]]\n",
"\n",
"[TEST] classification_report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.881 0.787 0.831 47\n",
" 1 0.091 0.167 0.118 6\n",
"\n",
" accuracy 0.717 53\n",
" macro avg 0.486 0.477 0.475 53\n",
"weighted avg 0.792 0.717 0.751 53\n",
"\n",
"TEST confusion_matrix:\n",
" [[37 10]\n",
" [ 5 1]]\n",
"\n",
"================================================================================\n",
"MODEL: GBC_depth2\n",
"\n",
"[VAL] classification_report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.983 1.000 0.991 57\n",
" 1 0.000 0.000 0.000 1\n",
"\n",
" accuracy 0.983 58\n",
" macro avg 0.491 0.500 0.496 58\n",
"weighted avg 0.966 0.983 0.974 58\n",
"\n",
"VAL confusion_matrix:\n",
" [[57 0]\n",
" [ 1 0]]\n",
"\n",
"[TEST] classification_report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.907 0.830 0.867 47\n",
" 1 0.200 0.333 0.250 6\n",
"\n",
" accuracy 0.774 53\n",
" macro avg 0.553 0.582 0.558 53\n",
"weighted avg 0.827 0.774 0.797 53\n",
"\n",
"TEST confusion_matrix:\n",
" [[39 8]\n",
" [ 4 2]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
}
],
"source": [
"# Liệt kê các biến model đang tồn tại\n",
"[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ce971deb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['gbr', 'res_gbr', 'gbr_test_pred', 'gbr_reg']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Liệt kê các biến model đang tồn tại\n",
"[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "9ad22a15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== TEST RESULTS ===\n",
"ML only | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
"ML + Business Rules | MAE=2.663 | RMSE=4.510 | R2=0.119\n",
"✅ Saved: test_predictions_ml_plus_business_rules.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred_ml</th>\n",
" <th>min_staff_rule</th>\n",
" <th>y_pred_final</th>\n",
" <th>num_tasks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>15.006</td>\n",
" <td>4.0</td>\n",
" <td>15.006</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.902</td>\n",
" <td>4.0</td>\n",
" <td>4.000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>2.592</td>\n",
" <td>6.0</td>\n",
" <td>6.000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2.090</td>\n",
" <td>1.0</td>\n",
" <td>2.090</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.947</td>\n",
" <td>1.0</td>\n",
" <td>1.000</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.167</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.154</td>\n",
" <td>2.0</td>\n",
" <td>2.154</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.034</td>\n",
" <td>2.0</td>\n",
" <td>3.034</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.674</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>124</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred_ml min_staff_rule y_pred_final num_tasks\n",
"0 29.0 15.006 4.0 15.006 593\n",
"1 4.0 4.370 4.0 4.370 593\n",
"2 4.0 2.902 4.0 4.000 593\n",
"3 3.0 2.592 6.0 6.000 593\n",
"4 2.0 2.090 1.0 2.090 33\n",
"5 1.0 0.947 1.0 1.000 33\n",
"6 3.0 1.167 3.0 3.000 182\n",
"7 3.0 2.154 2.0 2.154 182\n",
"8 13.0 3.034 2.0 3.034 182\n",
"9 2.0 1.674 3.0 3.000 124"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def eval_reg(y_true, y_pred, name):\n",
" rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))\n",
" print(f\"{name:20s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse:.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"# -----------------------\n",
"# 1) ML prediction (log target -> original)\n",
"# -----------------------\n",
"y_pred_ml = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
"\n",
"# -----------------------\n",
"# 2) Business rules: minimum staffing\n",
"# (các cột có thể có/không có nên check)\n",
"# -----------------------\n",
"min_staff = np.ceil(X_test[\"num_tasks\"] / 150)\n",
"\n",
"if \"num_wc_tasks\" in X_test.columns:\n",
" min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_wc_tasks\"] / 40))\n",
"\n",
"if \"num_outdoor_tasks\" in X_test.columns:\n",
" min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_outdoor_tasks\"] / 60))\n",
"\n",
"# Nếu bạn có cột shift (đêm / qua ngày) thì cộng thêm\n",
"for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
" if col in X_test.columns:\n",
" min_staff = min_staff + X_test[col].astype(int)\n",
"\n",
"# -----------------------\n",
"# 3) Final prediction\n",
"# -----------------------\n",
"y_pred_final = np.maximum(y_pred_ml, min_staff)\n",
"\n",
"# -----------------------\n",
"# 4) Evaluate\n",
"# -----------------------\n",
"print(\"\\n=== TEST RESULTS ===\")\n",
"eval_reg(y_test, y_pred_ml, \"ML only\")\n",
"eval_reg(y_test, y_pred_final,\"ML + Business Rules\")\n",
"\n",
"# -----------------------\n",
"# 5) Save file\n",
"# -----------------------\n",
"out = pd.DataFrame({\n",
" \"y_true\": y_test.values,\n",
" \"y_pred_ml\": np.round(y_pred_ml, 3),\n",
" \"min_staff_rule\": min_staff.astype(float).values,\n",
" \"y_pred_final\": np.round(y_pred_final, 3),\n",
" \"num_tasks\": X_test[\"num_tasks\"].values\n",
"})\n",
"\n",
"out.to_excel(\"test_predictions_ml_plus_business_rules.xlsx\", index=False)\n",
"print(\"✅ Saved: test_predictions_ml_plus_business_rules.xlsx\")\n",
"\n",
"out.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "79387bd4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Best rule on VAL: {'k_tasks': 150, 'k_wc': 30, 'k_out': 40, 'mae_val': 1.0984917361586377}\n",
"\n",
"=== TEST EVAL ===\n",
"ML only | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
"ML + tuned business rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
"✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred_ml</th>\n",
" <th>min_staff_rule</th>\n",
" <th>y_pred_final</th>\n",
" <th>num_tasks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>15.006</td>\n",
" <td>4.0</td>\n",
" <td>15.006</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.902</td>\n",
" <td>4.0</td>\n",
" <td>4.000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>2.592</td>\n",
" <td>6.0</td>\n",
" <td>6.000</td>\n",
" <td>593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2.090</td>\n",
" <td>1.0</td>\n",
" <td>2.090</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.947</td>\n",
" <td>1.0</td>\n",
" <td>1.000</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.167</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.154</td>\n",
" <td>2.0</td>\n",
" <td>2.154</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.034</td>\n",
" <td>2.0</td>\n",
" <td>3.034</td>\n",
" <td>182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.674</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>124</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred_ml min_staff_rule y_pred_final num_tasks\n",
"0 29.0 15.006 4.0 15.006 593\n",
"1 4.0 4.370 4.0 4.370 593\n",
"2 4.0 2.902 4.0 4.000 593\n",
"3 3.0 2.592 6.0 6.000 593\n",
"4 2.0 2.090 1.0 2.090 33\n",
"5 1.0 0.947 1.0 1.000 33\n",
"6 3.0 1.167 3.0 3.000 182\n",
"7 3.0 2.154 2.0 2.154 182\n",
"8 13.0 3.034 2.0 3.034 182\n",
"9 2.0 1.674 3.0 3.000 124"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def rmse(y, yhat):\n",
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def eval_reg(y_true, y_pred, name):\n",
" print(f\"{name:25s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"# ML preds (đã train gbr_reg)\n",
"pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
"pred_val = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
"pred_test = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
"\n",
"def compute_min_staff(X, k_tasks, k_wc=None, k_out=None):\n",
" ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
"\n",
" if (k_wc is not None) and (\"num_wc_tasks\" in X.columns):\n",
" ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
"\n",
" if (k_out is not None) and (\"num_outdoor_tasks\" in X.columns):\n",
" ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
"\n",
" for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
" if col in X.columns:\n",
" ms = ms + X[col].astype(int)\n",
" return ms\n",
"\n",
"# Grid search trên VAL\n",
"k_tasks_list = [100, 120, 150, 180, 200]\n",
"k_wc_list = [25, 30, 40, 50, None] # None = bỏ rule wc\n",
"k_out_list = [40, 60, 80, None] # None = bỏ rule outdoor\n",
"\n",
"best = None\n",
"\n",
"for kt in k_tasks_list:\n",
" for kw in k_wc_list:\n",
" for ko in k_out_list:\n",
" ms_val = compute_min_staff(X_val, kt, kw, ko)\n",
" pred_val_final = np.maximum(pred_val, ms_val)\n",
" mae_val = mean_absolute_error(y_val, pred_val_final)\n",
"\n",
" if (best is None) or (mae_val < best[\"mae_val\"]):\n",
" best = {\"k_tasks\": kt, \"k_wc\": kw, \"k_out\": ko, \"mae_val\": mae_val}\n",
"\n",
"print(\"✅ Best rule on VAL:\", best)\n",
"\n",
"# Apply best rule to TEST\n",
"ms_test = compute_min_staff(X_test, best[\"k_tasks\"], best[\"k_wc\"], best[\"k_out\"])\n",
"pred_test_final = np.maximum(pred_test, ms_test)\n",
"\n",
"print(\"\\n=== TEST EVAL ===\")\n",
"eval_reg(y_test, pred_test, \"ML only\")\n",
"eval_reg(y_test, pred_test_final, \"ML + tuned business rules\")\n",
"\n",
"# Save predictions\n",
"out = pd.DataFrame({\n",
" \"y_true\": y_test.values,\n",
" \"y_pred_ml\": np.round(pred_test, 3),\n",
" \"min_staff_rule\": ms_test.astype(float).values,\n",
" \"y_pred_final\": np.round(pred_test_final, 3),\n",
" \"num_tasks\": X_test[\"num_tasks\"].values\n",
"})\n",
"out.to_excel(\"test_predictions_ml_plus_tuned_rules.xlsx\", index=False)\n",
"print(\"✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\")\n",
"out.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "6e9841d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution:\n",
"Train:\n",
"so_luong\n",
"0 207\n",
"1 39\n",
"2 37\n",
"Name: count, dtype: int64\n",
"Val:\n",
"so_luong\n",
"0 53\n",
"1 4\n",
"2 1\n",
"Name: count, dtype: int64\n",
"Test:\n",
"so_luong\n",
"0 43\n",
"1 5\n",
"2 5\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# ===== CELL 9: Create staff size class =====\n",
"\n",
"def staff_class(y):\n",
" if y <= 5:\n",
" return 0\n",
" elif y <= 10:\n",
" return 1\n",
" else:\n",
" return 2\n",
"\n",
"y_train_cls = y_train.apply(staff_class)\n",
"y_val_cls = y_val.apply(staff_class)\n",
"y_test_cls = y_test.apply(staff_class)\n",
"\n",
"print(\"Class distribution:\")\n",
"for name, y in [(\"Train\", y_train_cls), (\"Val\", y_val_cls), (\"Test\", y_test_cls)]:\n",
" print(f\"{name}:\")\n",
" print(y.value_counts().sort_index())\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "daf4acc7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
" warnings.warn(\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== VAL ===\n",
"\n",
"================================================================================\n",
"MODEL: LR_balanced (VAL)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.956 0.811 0.878 53\n",
" 1 0.111 0.250 0.154 4\n",
" 2 0.000 0.000 0.000 1\n",
"\n",
" accuracy 0.759 58\n",
" macro avg 0.356 0.354 0.344 58\n",
"weighted avg 0.881 0.759 0.813 58\n",
"\n",
"Confusion matrix:\n",
" [[43 7 3]\n",
" [ 2 1 1]\n",
" [ 0 1 0]]\n",
"\n",
"================================================================================\n",
"MODEL: HGB (VAL)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.946 1.000 0.972 53\n",
" 1 0.500 0.250 0.333 4\n",
" 2 0.000 0.000 0.000 1\n",
"\n",
" accuracy 0.931 58\n",
" macro avg 0.482 0.417 0.435 58\n",
"weighted avg 0.899 0.931 0.912 58\n",
"\n",
"Confusion matrix:\n",
" [[53 0 0]\n",
" [ 3 1 0]\n",
" [ 0 1 0]]\n",
"\n",
"=== TEST ===\n",
"\n",
"================================================================================\n",
"MODEL: LR_balanced (TEST)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.914 0.744 0.821 43\n",
" 1 0.231 0.600 0.333 5\n",
" 2 0.000 0.000 0.000 5\n",
"\n",
" accuracy 0.660 53\n",
" macro avg 0.382 0.448 0.385 53\n",
"weighted avg 0.764 0.660 0.697 53\n",
"\n",
"Confusion matrix:\n",
" [[32 7 4]\n",
" [ 1 3 1]\n",
" [ 2 3 0]]\n",
"\n",
"================================================================================\n",
"MODEL: HGB (TEST)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.925 0.860 0.892 43\n",
" 1 0.200 0.200 0.200 5\n",
" 2 0.125 0.200 0.154 5\n",
"\n",
" accuracy 0.736 53\n",
" macro avg 0.417 0.420 0.415 53\n",
"weighted avg 0.781 0.736 0.757 53\n",
"\n",
"Confusion matrix:\n",
" [[37 2 4]\n",
" [ 1 1 3]\n",
" [ 2 2 1]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# ---- 1) Models ----\n",
"# NOTE: multi_class=\"auto\" is deprecated in recent scikit-learn and was the\n",
"# default anyway, so it is omitted here (behavior unchanged).\n",
"clf_lr = LogisticRegression(\n",
"    max_iter=5000,\n",
"    class_weight=\"balanced\",\n",
"    solver=\"lbfgs\",\n",
"    random_state=42\n",
")\n",
"\n",
"clf_hgb = HistGradientBoostingClassifier(\n",
"    max_depth=3,\n",
"    learning_rate=0.05,\n",
"    max_iter=500,\n",
"    random_state=42\n",
")\n",
"\n",
"# ---- 2) Fit ----\n",
"clf_lr.fit(X_train, y_train_cls)\n",
"clf_hgb.fit(X_train, y_train_cls)\n",
"\n",
"# ---- 3) Predict (proba) ----\n",
"proba_val_lr = clf_lr.predict_proba(X_val)\n",
"proba_test_lr = clf_lr.predict_proba(X_test)\n",
"\n",
"proba_val_hgb = clf_hgb.predict_proba(X_val)\n",
"proba_test_hgb = clf_hgb.predict_proba(X_test)\n",
"\n",
"# BUG FIX: np.argmax yields *column indices* of predict_proba, which equal\n",
"# the class labels only when the labels happen to be 0..k-1. Mapping the\n",
"# indices through classes_ is correct for any label encoding.\n",
"pred_val_lr = clf_lr.classes_[np.argmax(proba_val_lr, axis=1)]\n",
"pred_test_lr = clf_lr.classes_[np.argmax(proba_test_lr, axis=1)]\n",
"\n",
"pred_val_hgb = clf_hgb.classes_[np.argmax(proba_val_hgb, axis=1)]\n",
"pred_test_hgb = clf_hgb.classes_[np.argmax(proba_test_hgb, axis=1)]\n",
"\n",
"def show_clf(name, y_true, y_pred):\n",
"    \"\"\"Print classification report and confusion matrix for one model/split.\"\"\"\n",
"    print(\"\\n\" + \"=\"*80)\n",
"    print(\"MODEL:\", name)\n",
"    # zero_division=0 reports the same 0.0 precision values while silencing\n",
"    # the UndefinedMetricWarning for classes that were never predicted.\n",
"    print(classification_report(y_true, y_pred, digits=3, zero_division=0))\n",
"    print(\"Confusion matrix:\\n\", confusion_matrix(y_true, y_pred))\n",
"\n",
"print(\"\\n=== VAL ===\")\n",
"show_clf(\"LR_balanced (VAL)\", y_val_cls, pred_val_lr)\n",
"show_clf(\"HGB (VAL)\", y_val_cls, pred_val_hgb)\n",
"\n",
"print(\"\\n=== TEST ===\")\n",
"show_clf(\"LR_balanced (TEST)\", y_test_cls, pred_test_lr)\n",
"show_clf(\"HGB (TEST)\", y_test_cls, pred_test_hgb)\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "38c8d00e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Best (min_hc, min_2424) on VAL: {'min_hc': 0, 'min_2424': 0, 'mae_val': 1.0984917361586377}\n",
"\n",
"=== TEST EVAL ===\n",
"ML only (GBR log) | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
"ML + tuned base rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
"ML + base + loai_ca rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
"✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred_ml</th>\n",
" <th>min_staff_base</th>\n",
" <th>min_staff_ca</th>\n",
" <th>min_staff_final</th>\n",
" <th>y_pred_final</th>\n",
" <th>num_tasks</th>\n",
" <th>loai_ca</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>15.006</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>15.006</td>\n",
" <td>593</td>\n",
" <td>Hành chính</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>4.370</td>\n",
" <td>593</td>\n",
" <td>Ca sáng</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>2.902</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>4.000</td>\n",
" <td>593</td>\n",
" <td>Ca chiều</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>2.592</td>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>6.000</td>\n",
" <td>593</td>\n",
" <td>Ca đêm</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>2.090</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.090</td>\n",
" <td>33</td>\n",
" <td>Hành chính</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>0.947</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.000</td>\n",
" <td>33</td>\n",
" <td>Part time</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.0</td>\n",
" <td>1.167</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>182</td>\n",
" <td>Ca sáng</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3.0</td>\n",
" <td>2.154</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.154</td>\n",
" <td>182</td>\n",
" <td>Ca chiều</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13.0</td>\n",
" <td>3.034</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>3.034</td>\n",
" <td>182</td>\n",
" <td>Hành chính</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.0</td>\n",
" <td>1.674</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>3.000</td>\n",
" <td>124</td>\n",
" <td>Ca sáng</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred_ml min_staff_base min_staff_ca min_staff_final \\\n",
"0 29.0 15.006 4.0 0.0 4.0 \n",
"1 4.0 4.370 4.0 0.0 4.0 \n",
"2 4.0 2.902 4.0 0.0 4.0 \n",
"3 3.0 2.592 6.0 0.0 6.0 \n",
"4 2.0 2.090 1.0 0.0 1.0 \n",
"5 1.0 0.947 1.0 0.0 1.0 \n",
"6 3.0 1.167 3.0 0.0 3.0 \n",
"7 3.0 2.154 2.0 0.0 2.0 \n",
"8 13.0 3.034 2.0 0.0 2.0 \n",
"9 2.0 1.674 3.0 0.0 3.0 \n",
"\n",
" y_pred_final num_tasks loai_ca \n",
"0 15.006 593 Hành chính \n",
"1 4.370 593 Ca sáng \n",
"2 4.000 593 Ca chiều \n",
"3 6.000 593 Ca đêm \n",
"4 2.090 33 Hành chính \n",
"5 1.000 33 Part time \n",
"6 3.000 182 Ca sáng \n",
"7 2.154 182 Ca chiều \n",
"8 3.034 182 Hành chính \n",
"9 3.000 124 Ca sáng "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def rmse(y, yhat):\n",
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
"\n",
"def eval_reg(y_true, y_pred, name):\n",
" print(f\"{name:28s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
"\n",
"# ========= 1) ML predictions (GBR log-target) =========\n",
"pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
"pred_val = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
"pred_test = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
"\n",
"# ========= 2) Base min-staff rule (tuned earlier) =========\n",
"# dùng lại best bạn đã tìm: k_tasks=150, k_wc=30, k_out=40\n",
"def compute_min_staff_base(X, k_tasks=150, k_wc=30, k_out=40):\n",
" ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
"\n",
" if \"num_wc_tasks\" in X.columns:\n",
" ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
"\n",
" if \"num_outdoor_tasks\" in X.columns:\n",
" ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
"\n",
" # bonus theo ca đêm / cross-day nếu có\n",
" for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
" if col in X.columns:\n",
" ms = ms + X[col].astype(int)\n",
"\n",
" return ms\n",
"\n",
"# ========= 3) Tune MIN staff theo loai_ca trên VAL =========\n",
"# Ta tune 2 tham số: min_HC (Hành chính), min_2424 (24/24)\n",
"# Nếu loai_ca đang one-hot, ta sẽ suy ra label từ cột gốc nếu có\n",
"def get_loai_ca_series(X):\n",
" # ưu tiên nếu còn cột gốc 'loai_ca' (string)\n",
" if \"loai_ca\" in X.columns and X[\"loai_ca\"].dtype == \"object\":\n",
" return X[\"loai_ca\"].astype(str)\n",
" # nếu đã one-hot: tìm các cột bắt đầu bằng \"loai_ca_\"\n",
" onehot_cols = [c for c in X.columns if c.startswith(\"loai_ca_\")]\n",
" if onehot_cols:\n",
" # lấy tên category có value True/1\n",
" def decode_row(row):\n",
" for c in onehot_cols:\n",
" if row[c] == 1 or row[c] is True:\n",
" return c.replace(\"loai_ca_\", \"\")\n",
" return \"UNKNOWN\"\n",
" return X[onehot_cols].apply(decode_row, axis=1)\n",
" return pd.Series([\"UNKNOWN\"] * len(X), index=X.index)\n",
"\n",
"loai_ca_val = get_loai_ca_series(X_val)\n",
"loai_ca_test = get_loai_ca_series(X_test)\n",
"loai_ca_train = get_loai_ca_series(X_train)\n",
"\n",
"def apply_ca_rule(loai_ca_series, min_hc, min_2424):\n",
" # map tên ca -> min staff\n",
" # bạn có thể thêm biến thể viết khác nếu dữ liệu có\n",
" lc = loai_ca_series.str.lower()\n",
" min_by_ca = np.zeros(len(lc), dtype=float)\n",
"\n",
" # Hành chính\n",
" mask_hc = lc.str.contains(\"hành chính\") | lc.str.contains(\"hanh chinh\")\n",
" min_by_ca[mask_hc.values] = min_hc\n",
"\n",
" # 24/24 hoặc 24-24\n",
" mask_2424 = lc.str.contains(\"24/24\") | lc.str.contains(\"24-24\") | lc.str.contains(\"24 24\")\n",
" min_by_ca[mask_2424.values] = np.maximum(min_by_ca[mask_2424.values], min_2424)\n",
"\n",
" return min_by_ca\n",
"\n",
"best = None\n",
"min_hc_list = [0, 4, 6, 8, 10]\n",
"min_2424_list = [0, 6, 8, 10, 12]\n",
"\n",
"base_val = compute_min_staff_base(X_val, 150, 30, 40)\n",
"\n",
"for mhc in min_hc_list:\n",
" for m24 in min_2424_list:\n",
" min_ca = apply_ca_rule(loai_ca_val, mhc, m24)\n",
" min_staff = np.maximum(base_val, min_ca)\n",
" pred_final = np.maximum(pred_val, min_staff)\n",
"\n",
" mae_val = mean_absolute_error(y_val, pred_final)\n",
" if (best is None) or (mae_val < best[\"mae_val\"]):\n",
" best = {\"min_hc\": mhc, \"min_2424\": m24, \"mae_val\": mae_val}\n",
"\n",
"print(\"✅ Best (min_hc, min_2424) on VAL:\", best)\n",
"\n",
"# ========= 4) Apply best rule to TEST =========\n",
"base_test = compute_min_staff_base(X_test, 150, 30, 40)\n",
"min_ca_test = apply_ca_rule(loai_ca_test, best[\"min_hc\"], best[\"min_2424\"])\n",
"min_staff_test = np.maximum(base_test, min_ca_test)\n",
"\n",
"pred_test_final = np.maximum(pred_test, min_staff_test)\n",
"\n",
"print(\"\\n=== TEST EVAL ===\")\n",
"eval_reg(y_test, pred_test, \"ML only (GBR log)\")\n",
"eval_reg(y_test, np.maximum(pred_test, base_test), \"ML + tuned base rules\")\n",
"eval_reg(y_test, pred_test_final, \"ML + base + loai_ca rules\")\n",
"\n",
"# ========= 5) Save excel =========\n",
"out = pd.DataFrame({\n",
" \"y_true\": y_test.values,\n",
" \"y_pred_ml\": np.round(pred_test, 3),\n",
" \"min_staff_base\": base_test.astype(float),\n",
" \"min_staff_ca\": min_ca_test.astype(float),\n",
" \"min_staff_final\": min_staff_test.astype(float),\n",
" \"y_pred_final\": np.round(pred_test_final, 3),\n",
" \"num_tasks\": X_test[\"num_tasks\"].values if \"num_tasks\" in X_test.columns else np.nan,\n",
" \"loai_ca\": loai_ca_test.values\n",
"})\n",
"out.to_excel(\"test_predictions_ml_plus_rules_plus_ca.xlsx\", index=False)\n",
"print(\"✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\")\n",
"\n",
"out.head(10)\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "6dc15922",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded: final_2.xlsx | sheet: final\n",
"Shape (raw): (401, 42)\n",
"Shape (dedup): (394, 42)\n",
"\n",
"=== Target summary (so_luong) ===\n",
"count 394.000000\n",
"mean 4.710660\n",
"std 6.848602\n",
"min 0.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 64.000000\n",
"Name: so_luong, dtype: float64\n",
"\n",
"=== staff_band distribution ===\n",
"staff_band\n",
"0 216\n",
"1 87\n",
"2 48\n",
"3 27\n",
"4 16\n",
"Name: count, dtype: int64\n",
"\n",
"Sample rows:\n",
" ma_dia_diem loai_ca tong_gio_lam num_tasks so_luong staff_band\n",
"0 115-2 Part time 4.0 7 1 0\n",
"1 101-1 Hành chính 7.5 441 24 4\n",
"2 101-1 Ca sáng 8.0 441 3 1\n",
"3 101-1 Ca chiều 8.0 441 5 1\n",
"4 101-1 Ca đêm 8.0 441 1 0\n",
"5 101-1 Ca gãy 7.5 441 1 0\n",
"6 101-1 Hành chính 9.5 441 22 4\n",
"7 101-2 Hành chính 9.5 135 8 2\n",
"8 101-2 Ca gãy 7.5 135 1 0\n",
"9 101-2 Ca đêm 7.5 135 1 0\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"DATA_PATH = \"final_2.xlsx\"\n",
"SHEET_NAME = \"final\"\n",
"\n",
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
"print(\"Loaded:\", DATA_PATH, \"| sheet:\", SHEET_NAME)\n",
"print(\"Shape (raw):\", df.shape)\n",
"\n",
"# drop exact duplicate rows; reset the index so positional access stays\n",
"# consistent downstream (matches the other load cell in this notebook)\n",
"df = df.drop_duplicates().reset_index(drop=True)\n",
"print(\"Shape (dedup):\", df.shape)\n",
"\n",
"# target column must exist\n",
"assert \"so_luong\" in df.columns, \"Missing target so_luong\"\n",
"\n",
"# ---- Define ordinal staff-size bands ----\n",
"# 0: 0-2   (very small)\n",
"# 1: 3-5   (small)\n",
"# 2: 6-10  (medium)\n",
"# 3: 11-20 (large)\n",
"# 4: >20   (very large)\n",
"bins = [-0.1, 2, 5, 10, 20, 10**9]\n",
"labels = [0, 1, 2, 3, 4]\n",
"\n",
"df[\"staff_band\"] = pd.cut(df[\"so_luong\"], bins=bins, labels=labels).astype(int)\n",
"\n",
"print(\"\\n=== Target summary (so_luong) ===\")\n",
"print(df[\"so_luong\"].describe())\n",
"\n",
"print(\"\\n=== staff_band distribution ===\")\n",
"print(df[\"staff_band\"].value_counts().sort_index())\n",
"\n",
"print(\"\\nSample rows:\")\n",
"print(df[[\"ma_dia_diem\",\"loai_ca\",\"tong_gio_lam\",\"num_tasks\",\"so_luong\",\"staff_band\"]].head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "666be810",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== SPLIT SUMMARY (by ma_dia_diem) ===\n",
"Buildings: 192\n",
"Train buildings: 138 | rows: 282\n",
"Val buildings: 29 | rows: 56\n",
"Test buildings: 25 | rows: 56\n",
"\n",
"Leakage check (should be 0):\n",
"Train ∩ Val : 0\n",
"Train ∩ Test: 0\n",
"Val ∩ Test: 0\n",
"\n",
"=== staff_band distribution ===\n",
"\n",
"Train:\n",
"staff_band\n",
"0 155\n",
"1 55\n",
"2 38\n",
"3 22\n",
"4 12\n",
"Name: count, dtype: int64\n",
"\n",
"Val:\n",
"staff_band\n",
"0 28\n",
"1 20\n",
"2 5\n",
"3 3\n",
"Name: count, dtype: int64\n",
"\n",
"Test:\n",
"staff_band\n",
"0 33\n",
"1 12\n",
"2 5\n",
"3 2\n",
"4 4\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"SEED = 42\n",
"TRAIN_RATIO = 0.72\n",
"VAL_RATIO = 0.15\n",
"TEST_RATIO = 0.13\n",
"\n",
"# the three ratios must cover the whole dataset\n",
"assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9\n",
"\n",
"# 1) reproducibly shuffle the building ids (the grouping key)\n",
"buildings = df[\"ma_dia_diem\"].astype(str).unique()\n",
"rng = np.random.RandomState(SEED)\n",
"rng.shuffle(buildings)\n",
"\n",
"n = len(buildings)\n",
"n_train = int(round(n * TRAIN_RATIO))\n",
"n_val = int(round(n * VAL_RATIO))\n",
"# slice the shuffled ids; whatever remains after train+val becomes test\n",
"train_b, val_b, test_b = (\n",
"    set(buildings[:n_train]),\n",
"    set(buildings[n_train:n_train + n_val]),\n",
"    set(buildings[n_train + n_val:]),\n",
")\n",
"\n",
"# 2) row-level membership masks: every row follows its building\n",
"_key = df[\"ma_dia_diem\"].astype(str)\n",
"train_mask = _key.isin(train_b)\n",
"val_mask = _key.isin(val_b)\n",
"test_mask = _key.isin(test_b)\n",
"\n",
"df_train = df[train_mask].copy()\n",
"df_val = df[val_mask].copy()\n",
"df_test = df[test_mask].copy()\n",
"\n",
"print(\"=== SPLIT SUMMARY (by ma_dia_diem) ===\")\n",
"print(\"Buildings:\", n)\n",
"for _label, _bset, _part in [(\"Train\", train_b, df_train),\n",
"                             (\"Val\", val_b, df_val),\n",
"                             (\"Test\", test_b, df_test)]:\n",
"    print(f\"{_label} buildings:\", len(_bset), \"| rows:\", _part.shape[0])\n",
"\n",
"# 3) leakage check: a building must never appear in two splits (all 0)\n",
"train_set = set(df_train[\"ma_dia_diem\"].astype(str).unique())\n",
"val_set = set(df_val[\"ma_dia_diem\"].astype(str).unique())\n",
"test_set = set(df_test[\"ma_dia_diem\"].astype(str).unique())\n",
"\n",
"print(\"\\nLeakage check (should be 0):\")\n",
"print(\"Train ∩ Val :\", len(train_set & val_set))\n",
"print(\"Train ∩ Test:\", len(train_set & test_set))\n",
"print(\"Val ∩ Test:\", len(val_set & test_set))\n",
"\n",
"# 4) staff_band class balance within each split\n",
"print(\"\\n=== staff_band distribution ===\")\n",
"for name, d in [(\"Train\", df_train), (\"Val\", df_val), (\"Test\", df_test)]:\n",
"    vc = d[\"staff_band\"].value_counts().sort_index()\n",
"    print(f\"\\n{name}:\")\n",
"    print(vc)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "d898cfe6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categorical cols: ['loai_ca', 'bat_dau', 'ket_thuc']\n",
"\n",
"Shapes:\n",
"Train: (282, 115) (282,)\n",
"Val : (56, 115) (56,)\n",
"Test : (56, 115) (56,)\n",
"\n",
"Sample feature columns (first 25):\n",
"['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong']\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# ========= 1) Helper: parse time to hour float =========\n",
"def time_to_hour(x):\n",
"    \"\"\"Parse a time-like value into a float hour (e.g. '06:30' -> 6.5).\n",
"\n",
"    Accepts datetime/time objects, 'HH:MM[:SS]' strings, or plain numbers.\n",
"    Returns np.nan when the value cannot be interpreted.\n",
"    \"\"\"\n",
"    if pd.isna(x):\n",
"        return np.nan\n",
"    # pandas/datetime time objects expose .hour / .minute directly\n",
"    if hasattr(x, \"hour\"):\n",
"        return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
"    s = str(x).strip()\n",
"    # \"HH:MM:SS\" or \"HH:MM\" strings\n",
"    if \":\" in s:\n",
"        parts = s.split(\":\")\n",
"        # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;\n",
"        # only conversion failures should fall through to NaN.\n",
"        try:\n",
"            h = float(parts[0])\n",
"            m = float(parts[1]) if len(parts) > 1 else 0.0\n",
"            return h + m/60.0\n",
"        except (TypeError, ValueError):\n",
"            return np.nan\n",
"    # fallback: value is already numeric-like\n",
"    try:\n",
"        return float(s)\n",
"    except (TypeError, ValueError):\n",
"        return np.nan\n",
"\n",
"def add_time_features(d):\n",
"    \"\"\"Return a copy of `d` with shift-time features derived from the\n",
"    'bat_dau' (start) and 'ket_thuc' (end) columns.\"\"\"\n",
"    d = d.copy()\n",
"    d[\"hour_start\"] = d[\"bat_dau\"].apply(time_to_hour)\n",
"    d[\"hour_end\"] = d[\"ket_thuc\"].apply(time_to_hour)\n",
"\n",
"    # shift length (handle cross-day):\n",
"    # if end < start the shift wraps past midnight -> add 24h to the end\n",
"    end_adj = d[\"hour_end\"].copy()\n",
"    mask_cross = (d[\"hour_end\"].notna()) & (d[\"hour_start\"].notna()) & (d[\"hour_end\"] < d[\"hour_start\"])\n",
"    end_adj[mask_cross] = end_adj[mask_cross] + 24.0\n",
"\n",
"    d[\"shift_length\"] = (end_adj - d[\"hour_start\"]).clip(lower=0)\n",
"    d[\"is_cross_day\"] = mask_cross.astype(int)\n",
"\n",
"    # coarse buckets by start hour (missing start -> -1 -> all buckets 0)\n",
"    hs = d[\"hour_start\"].fillna(-1)\n",
"    d[\"is_morning_shift\"] = ((hs >= 6) & (hs < 12)).astype(int)\n",
"    d[\"is_afternoon_shift\"] = ((hs >= 12) & (hs < 18)).astype(int)\n",
"    d[\"is_evening_shift\"] = ((hs >= 18) & (hs < 24)).astype(int)\n",
"    d[\"is_night_shift\"] = ((hs >= 0) & (hs < 6)).astype(int)\n",
"\n",
"    return d\n",
"\n",
"# ========= 2) Apply time features =========\n",
"df_train_fe = add_time_features(df_train)\n",
"df_val_fe = add_time_features(df_val)\n",
"df_test_fe = add_time_features(df_test)\n",
"\n",
"# ========= 3) Drop columns (identifiers / free text) + leakage columns =========\n",
"DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
"LEAK_COLS = [\"so_luong\"]  # the true label -- must never be used as a feature\n",
"\n",
"# staff_band is kept aside as the target y\n",
"y_train = df_train_fe[\"staff_band\"].astype(int)\n",
"y_val = df_val_fe[\"staff_band\"].astype(int)\n",
"y_test = df_test_fe[\"staff_band\"].astype(int)\n",
"\n",
"X_train = df_train_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_train_fe.columns])\n",
"X_val = df_val_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_val_fe.columns])\n",
"X_test = df_test_fe.drop(columns=[c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in df_test_fe.columns])\n",
"\n",
"# ========= 4) One-hot only categorical columns =========\n",
"cat_cols = [c for c in X_train.columns if X_train[c].dtype == \"object\"]\n",
"print(\"Categorical cols:\", cat_cols)\n",
"\n",
"X_train = pd.get_dummies(X_train, columns=cat_cols, dummy_na=True)\n",
"X_val = pd.get_dummies(X_val, columns=cat_cols, dummy_na=True)\n",
"X_test = pd.get_dummies(X_test, columns=cat_cols, dummy_na=True)\n",
"\n",
"# align columns across splits; join=\"left\" keeps the train schema, so\n",
"# categories seen only in val/test are dropped (they carry no trained signal)\n",
"X_train, X_val = X_train.align(X_val, join=\"left\", axis=1, fill_value=0)\n",
"X_train, X_test = X_train.align(X_test, join=\"left\", axis=1, fill_value=0)\n",
"\n",
"# fill NaN numeric\n",
"X_train = X_train.fillna(0)\n",
"X_val = X_val.fillna(0)\n",
"X_test = X_test.fillna(0)\n",
"\n",
"print(\"\\nShapes:\")\n",
"print(\"Train:\", X_train.shape, y_train.shape)\n",
"print(\"Val :\", X_val.shape, y_val.shape)\n",
"print(\"Test :\", X_test.shape, y_test.shape)\n",
"\n",
"print(\"\\nSample feature columns (first 25):\")\n",
"print(list(X_train.columns[:25]))\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "253b34f1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"MODEL: LR_balanced\n",
"Train | Acc=0.624 | MacroF1=0.552\n",
"Val | Acc=0.429 | MacroF1=0.277\n",
"Test | Acc=0.518 | MacroF1=0.402\n",
"\n",
"[VAL] Confusion matrix:\n",
"[[18 5 0 5]\n",
" [ 6 4 0 10]\n",
" [ 2 2 0 1]\n",
" [ 0 1 0 2]]\n",
"\n",
"[VAL] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.692 0.643 0.667 28\n",
" 1 0.333 0.200 0.250 20\n",
" 2 0.000 0.000 0.000 5\n",
" 3 0.111 0.667 0.190 3\n",
"\n",
" accuracy 0.429 56\n",
" macro avg 0.284 0.377 0.277 56\n",
"weighted avg 0.471 0.429 0.433 56\n",
"\n",
"\n",
"[TEST] Confusion matrix:\n",
"[[20 8 1 2 2]\n",
" [ 3 4 2 1 2]\n",
" [ 0 1 1 3 0]\n",
" [ 0 0 0 2 0]\n",
" [ 0 1 0 1 2]]\n",
"\n",
"[TEST] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.870 0.606 0.714 33\n",
" 1 0.286 0.333 0.308 12\n",
" 2 0.250 0.200 0.222 5\n",
" 3 0.222 1.000 0.364 2\n",
" 4 0.333 0.500 0.400 4\n",
"\n",
" accuracy 0.518 56\n",
" macro avg 0.392 0.528 0.402 56\n",
"weighted avg 0.628 0.518 0.548 56\n",
"\n",
"\n",
"======================================================================\n",
"MODEL: RF_balanced\n",
"Train | Acc=0.894 | MacroF1=0.875\n",
"Val | Acc=0.607 | MacroF1=0.424\n",
"Test | Acc=0.625 | MacroF1=0.453\n",
"\n",
"[VAL] Confusion matrix:\n",
"[[19 8 1 0]\n",
" [ 6 13 1 0]\n",
" [ 0 2 2 1]\n",
" [ 0 2 1 0]]\n",
"\n",
"[VAL] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.760 0.679 0.717 28\n",
" 1 0.520 0.650 0.578 20\n",
" 2 0.400 0.400 0.400 5\n",
" 3 0.000 0.000 0.000 3\n",
"\n",
" accuracy 0.607 56\n",
" macro avg 0.420 0.432 0.424 56\n",
"weighted avg 0.601 0.607 0.601 56\n",
"\n",
"\n",
"[TEST] Confusion matrix:\n",
"[[23 2 8 0 0]\n",
" [ 3 7 2 0 0]\n",
" [ 0 0 4 1 0]\n",
" [ 0 1 0 1 0]\n",
" [ 0 0 4 0 0]]\n",
"\n",
"[TEST] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.885 0.697 0.780 33\n",
" 1 0.700 0.583 0.636 12\n",
" 2 0.222 0.800 0.348 5\n",
" 3 0.500 0.500 0.500 2\n",
" 4 0.000 0.000 0.000 4\n",
"\n",
" accuracy 0.625 56\n",
" macro avg 0.461 0.516 0.453 56\n",
"weighted avg 0.709 0.625 0.645 56\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================================================================\n",
"MODEL: HGB\n",
"Train | Acc=0.989 | MacroF1=0.989\n",
"Val | Acc=0.589 | MacroF1=0.405\n",
"Test | Acc=0.518 | MacroF1=0.326\n",
"\n",
"[VAL] Confusion matrix:\n",
"[[22 5 1 0]\n",
" [ 8 9 2 1]\n",
" [ 1 1 2 1]\n",
" [ 1 2 0 0]]\n",
"\n",
"[VAL] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.688 0.786 0.733 28\n",
" 1 0.529 0.450 0.486 20\n",
" 2 0.400 0.400 0.400 5\n",
" 3 0.000 0.000 0.000 3\n",
"\n",
" accuracy 0.589 56\n",
" macro avg 0.404 0.409 0.405 56\n",
"weighted avg 0.569 0.589 0.576 56\n",
"\n",
"\n",
"[TEST] Confusion matrix:\n",
"[[22 6 4 1 0]\n",
" [ 4 5 3 0 0]\n",
" [ 1 2 1 1 0]\n",
" [ 0 1 1 0 0]\n",
" [ 0 1 2 0 1]]\n",
"\n",
"[TEST] Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.815 0.667 0.733 33\n",
" 1 0.333 0.417 0.370 12\n",
" 2 0.091 0.200 0.125 5\n",
" 3 0.000 0.000 0.000 2\n",
" 4 1.000 0.250 0.400 4\n",
"\n",
" accuracy 0.518 56\n",
" macro avg 0.448 0.307 0.326 56\n",
"weighted avg 0.631 0.518 0.551 56\n",
"\n",
"\n",
"======================================================================\n",
"SUMMARY (sorted by Val MacroF1):\n",
"RF_balanced | Val MacroF1=0.424 | Test MacroF1=0.453\n",
"HGB | Val MacroF1=0.405 | Test MacroF1=0.326\n",
"LR_balanced | Val MacroF1=0.277 | Test MacroF1=0.402\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier\n",
"\n",
"def eval_cls(name, model, Xtr, ytr, Xva, yva, Xte, yte):\n",
"    \"\"\"Fit `model` on the train split, print Acc/MacroF1 for all three splits\n",
"    plus VAL/TEST confusion matrices and reports; return a summary dict.\"\"\"\n",
"    model.fit(Xtr, ytr)\n",
"\n",
"    def _metrics(X, y):\n",
"        # FIX: removed the unused `split_name` parameter\n",
"        pred = model.predict(X)\n",
"        acc = accuracy_score(y, pred)\n",
"        f1m = f1_score(y, pred, average=\"macro\")\n",
"        return pred, acc, f1m\n",
"\n",
"    pred_tr, acc_tr, f1_tr = _metrics(Xtr, ytr)\n",
"    pred_va, acc_va, f1_va = _metrics(Xva, yva)\n",
"    pred_te, acc_te, f1_te = _metrics(Xte, yte)\n",
"\n",
"    print(\"\\n\" + \"=\"*70)\n",
"    print(f\"MODEL: {name}\")\n",
"    print(f\"Train | Acc={acc_tr:.3f} | MacroF1={f1_tr:.3f}\")\n",
"    print(f\"Val | Acc={acc_va:.3f} | MacroF1={f1_va:.3f}\")\n",
"    print(f\"Test | Acc={acc_te:.3f} | MacroF1={f1_te:.3f}\")\n",
"\n",
"    print(\"\\n[VAL] Confusion matrix:\")\n",
"    print(confusion_matrix(yva, pred_va))\n",
"    print(\"\\n[VAL] Classification report:\")\n",
"    # zero_division=0 reports the same numbers while silencing the\n",
"    # UndefinedMetricWarning for classes with no predicted samples\n",
"    print(classification_report(yva, pred_va, digits=3, zero_division=0))\n",
"\n",
"    print(\"\\n[TEST] Confusion matrix:\")\n",
"    print(confusion_matrix(yte, pred_te))\n",
"    print(\"\\n[TEST] Classification report:\")\n",
"    print(classification_report(yte, pred_te, digits=3, zero_division=0))\n",
"\n",
"    return {\n",
"        \"name\": name,\n",
"        \"model\": model,\n",
"        \"val_macro_f1\": f1_va,\n",
"        \"test_macro_f1\": f1_te\n",
"    }\n",
"\n",
"# 1) Logistic Regression (balanced) - strong linear baseline for tabular data\n",
"# FIX: dropped the no-op n_jobs=None argument\n",
"lr = LogisticRegression(\n",
"    max_iter=5000,\n",
"    class_weight=\"balanced\"\n",
")\n",
"\n",
"# 2) RandomForest (balanced_subsample class weights)\n",
"rf = RandomForestClassifier(\n",
"    n_estimators=600,\n",
"    max_depth=None,\n",
"    min_samples_leaf=2,\n",
"    random_state=42,\n",
"    class_weight=\"balanced_subsample\",\n",
"    n_jobs=-1\n",
")\n",
"\n",
"# 3) HistGradientBoosting (strong and fast on tabular data)\n",
"hgb = HistGradientBoostingClassifier(\n",
"    learning_rate=0.06,\n",
"    max_depth=6,\n",
"    max_iter=600,\n",
"    random_state=42\n",
")\n",
"\n",
"results = []\n",
"results.append(eval_cls(\"LR_balanced\", lr, X_train, y_train, X_val, y_val, X_test, y_test))\n",
"results.append(eval_cls(\"RF_balanced\", rf, X_train, y_train, X_val, y_val, X_test, y_test))\n",
"results.append(eval_cls(\"HGB\", hgb, X_train, y_train, X_val, y_val, X_test, y_test))\n",
"\n",
"# Summary\n",
"print(\"\\n\" + \"=\"*70)\n",
"print(\"SUMMARY (sorted by Val MacroF1):\")\n",
"for r in sorted(results, key=lambda x: x[\"val_macro_f1\"], reverse=True):\n",
"    print(f\"{r['name']:12s} | Val MacroF1={r['val_macro_f1']:.3f} | Test MacroF1={r['test_macro_f1']:.3f}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "e1851e78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded: final_2.xlsx | sheet: final\n",
"Shape (raw): (401, 42)\n",
"Shape (after dedup): (394, 42)\n",
"\n",
"=== TARGET SUMMARY (so_luong) ===\n",
"count 394.000000\n",
"mean 4.710660\n",
"std 6.848602\n",
"min 0.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 64.000000\n",
"Name: so_luong, dtype: float64\n",
"Missing target: 0\n",
"Negative target: 0\n",
"Zero target: 3\n",
"\n",
"Sample rows:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ma_dia_diem</th>\n",
" <th>all_task_normal</th>\n",
" <th>all_task_dinhky</th>\n",
" <th>loai_ca</th>\n",
" <th>bat_dau</th>\n",
" <th>ket_thuc</th>\n",
" <th>tong_gio_lam</th>\n",
" <th>so_ca_cua_toa</th>\n",
" <th>so_luong</th>\n",
" <th>num_tasks</th>\n",
" <th>...</th>\n",
" <th>dien_tich_tham</th>\n",
" <th>doc_ham</th>\n",
" <th>vien_phan_quang</th>\n",
" <th>op_tuong</th>\n",
" <th>op_chan_tuong</th>\n",
" <th>ranh_thoat_nuoc</th>\n",
" <th>dien_tich_kinh</th>\n",
" <th>num_medical_tasks_total</th>\n",
" <th>num_indoor_room_tasks</th>\n",
" <th>is_tasks_text_missing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>115-2</td>\n",
" <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
" <td>NaN</td>\n",
" <td>Part time</td>\n",
" <td>06:30:00</td>\n",
" <td>10:30:00</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Hành chính</td>\n",
" <td>06:30:00</td>\n",
" <td>16:00:00</td>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>24</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Ca sáng</td>\n",
" <td>06:00:00</td>\n",
" <td>14:00:00</td>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 42 columns</p>\n",
"</div>"
],
"text/plain": [
" ma_dia_diem all_task_normal \\\n",
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"\n",
" all_task_dinhky loai_ca bat_dau \\\n",
"0 NaN Part time 06:30:00 \n",
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
"\n",
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
"0 10:30:00 4.0 1 1 7 ... \n",
"1 16:00:00 7.5 6 24 441 ... \n",
"2 14:00:00 8.0 6 3 441 ... \n",
"\n",
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
"0 0.0 0 0 0.0 0.0 \n",
"1 0.0 70 0 9176.0 89.0 \n",
"2 0.0 70 0 9176.0 89.0 \n",
"\n",
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
"0 0 20.0 0 \n",
"1 25 894.0 112 \n",
"2 25 894.0 112 \n",
"\n",
" num_indoor_room_tasks is_tasks_text_missing \n",
"0 1 0 \n",
"1 39 0 \n",
"2 39 0 \n",
"\n",
"[3 rows x 42 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 1 — LOAD DATA & BASIC CLEAN\n",
"\n",
"import pandas as pd\n",
"\n",
"DATA_PATH = \"final_2.xlsx\"\n",
"SHEET_NAME = \"final\"\n",
"\n",
"# 1. Load\n",
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
"print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
"print(\"Shape (raw):\", df.shape)\n",
"\n",
"# 2. Drop duplicate full rows\n",
"df = df.drop_duplicates().reset_index(drop=True)\n",
"print(\"Shape (after dedup):\", df.shape)\n",
"\n",
"# 3. Check target\n",
"assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
"\n",
"print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
"print(df[\"so_luong\"].describe())\n",
"print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n",
"print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n",
"print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n",
"\n",
"# 4. Peek data\n",
"print(\"\\nSample rows:\")\n",
"display(df.head(3))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c9268c5",
"metadata": {},
"outputs": [],
"source": [
"# CELL 2 — FEATURE SELECTION (STRICT)\n",
"\n",
"# 1. Xem toàn bộ cột\n",
"print(\"All columns:\")\n",
"for i, c in enumerate(df.columns):\n",
" print(f\"{i:2d}: {c}\")\n",
"\n",
"# 2. Xác định cột cần loại bỏ (THEO THỎA THUẬN)\n",
"DROP_COLS = [\n",
" df.columns[0], # ma_dia_diem\n",
" df.columns[1], # all_task_normal\n",
" df.columns[2], # all_task_dinhky\n",
" df.columns[-1], # is_tasks_text_missing\n",
"]\n",
"\n",
"print(\"\\nDropped columns:\")\n",
"for c in DROP_COLS:\n",
" print(\" -\", c)\n",
"\n",
"# 3. Tạo X, y\n",
"X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
"y = df[\"so_luong\"].astype(float)\n",
"\n",
"print(\"\\nShapes:\")\n",
"print(\"X:\", X.shape)\n",
"print(\"y:\", y.shape)\n",
"\n",
"# 4. Kiểm tra kiểu dữ liệu\n",
"print(\"\\nFeature dtypes:\")\n",
"display(X.dtypes.value_counts())\n",
"\n",
"# 5. Kiểm tra missing\n",
"print(\"\\nMissing values in X:\")\n",
"display(X.isna().sum().sort_values(ascending=False).head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b975f6cf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a595fe8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2fb9c84",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}