3713 lines
136 KiB
Plaintext
3713 lines
136 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "76aa1b75",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"✅ Loaded: final_2.xlsx | sheet: final\n",
|
||
"Shape (raw): (401, 42)\n",
|
||
"\n",
|
||
"=== TARGET SUMMARY (so_luong) ===\n",
|
||
"count 401.000000\n",
|
||
"mean 4.660848\n",
|
||
"std 6.799242\n",
|
||
"min 0.000000\n",
|
||
"25% 1.000000\n",
|
||
"50% 2.000000\n",
|
||
"75% 5.000000\n",
|
||
"max 64.000000\n",
|
||
"Name: so_luong, dtype: float64\n",
|
||
"Missing target: 0\n",
|
||
"Negative target: 0\n",
|
||
"Zero target: 3\n",
|
||
"\n",
|
||
"Duplicate full rows: 7\n",
|
||
"Shape (dedup): (394, 42)\n",
|
||
"\n",
|
||
"Columns: 42\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ma_dia_diem</th>\n",
|
||
" <th>all_task_normal</th>\n",
|
||
" <th>all_task_dinhky</th>\n",
|
||
" <th>loai_ca</th>\n",
|
||
" <th>bat_dau</th>\n",
|
||
" <th>ket_thuc</th>\n",
|
||
" <th>tong_gio_lam</th>\n",
|
||
" <th>so_ca_cua_toa</th>\n",
|
||
" <th>so_luong</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dien_tich_tham</th>\n",
|
||
" <th>doc_ham</th>\n",
|
||
" <th>vien_phan_quang</th>\n",
|
||
" <th>op_tuong</th>\n",
|
||
" <th>op_chan_tuong</th>\n",
|
||
" <th>ranh_thoat_nuoc</th>\n",
|
||
" <th>dien_tich_kinh</th>\n",
|
||
" <th>num_medical_tasks_total</th>\n",
|
||
" <th>num_indoor_room_tasks</th>\n",
|
||
" <th>is_tasks_text_missing</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>115-2</td>\n",
|
||
" <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Part time</td>\n",
|
||
" <td>06:30:00</td>\n",
|
||
" <td>10:30:00</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>101-1</td>\n",
|
||
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
|
||
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
|
||
" <td>Hành chính</td>\n",
|
||
" <td>06:30:00</td>\n",
|
||
" <td>16:00:00</td>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>24</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9176.0</td>\n",
|
||
" <td>89.0</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>894.0</td>\n",
|
||
" <td>112</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>101-1</td>\n",
|
||
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
|
||
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
|
||
" <td>Ca sáng</td>\n",
|
||
" <td>06:00:00</td>\n",
|
||
" <td>14:00:00</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9176.0</td>\n",
|
||
" <td>89.0</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>894.0</td>\n",
|
||
" <td>112</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3 rows × 42 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ma_dia_diem all_task_normal \\\n",
|
||
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
|
||
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
|
||
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
|
||
"\n",
|
||
" all_task_dinhky loai_ca bat_dau \\\n",
|
||
"0 NaN Part time 06:30:00 \n",
|
||
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
|
||
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
|
||
"\n",
|
||
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
|
||
"0 10:30:00 4.0 1 1 7 ... \n",
|
||
"1 16:00:00 7.5 6 24 441 ... \n",
|
||
"2 14:00:00 8.0 6 3 441 ... \n",
|
||
"\n",
|
||
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
|
||
"0 0.0 0 0 0.0 0.0 \n",
|
||
"1 0.0 70 0 9176.0 89.0 \n",
|
||
"2 0.0 70 0 9176.0 89.0 \n",
|
||
"\n",
|
||
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
|
||
"0 0 20.0 0 \n",
|
||
"1 25 894.0 112 \n",
|
||
"2 25 894.0 112 \n",
|
||
"\n",
|
||
" num_indoor_room_tasks is_tasks_text_missing \n",
|
||
"0 1 0 \n",
|
||
"1 39 0 \n",
|
||
"2 39 0 \n",
|
||
"\n",
|
||
"[3 rows x 42 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
DATA_PATH = "final_2.xlsx"
SHEET = "final"
TARGET = "so_luong"

# 1) Load the requested sheet of the workbook.
df = pd.read_excel(DATA_PATH, sheet_name=SHEET)
print("✅ Loaded:", DATA_PATH, "| sheet:", SHEET)
print("Shape (raw):", df.shape)

# 2) Target sanity: coerce to numeric so unparseable entries become NaN
#    instead of raising, then report the basic distribution.
assert TARGET in df.columns, f"❌ Missing target column: {TARGET}"
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")

# The heading is built from TARGET so it cannot drift from the column in use.
print(f"\n=== TARGET SUMMARY ({TARGET}) ===")
print(df[TARGET].describe())
n_missing_target = int(df[TARGET].isna().sum())
print("Missing target:", n_missing_target)
print("Negative target:", (df[TARGET] < 0).sum())
print("Zero target:", (df[TARGET] == 0).sum())

# Rows without a usable target would put NaN into the label vector built by
# later cells, so drop them here (and say so) rather than only counting them.
if n_missing_target > 0:
    df = df.dropna(subset=[TARGET]).reset_index(drop=True)
    print("Dropped rows with missing target:", n_missing_target)

# 3) Remove rows that are exact duplicates across every column.
dup = df.duplicated().sum()
print("\nDuplicate full rows:", dup)
if dup > 0:
    df = df.drop_duplicates().reset_index(drop=True)
print("Shape (dedup):", df.shape)

# 4) Quick preview of the cleaned frame.
print("\nColumns:", len(df.columns))
display(df.head(3))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "421b7556",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
|
||
" 'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
|
||
" 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
|
||
" 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
|
||
" 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
|
||
" 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
|
||
" 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
|
||
" 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
|
||
" 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
|
||
" 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
|
||
" 'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
|
||
" 'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
|
||
" 'is_tasks_text_missing'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
# List every column label of `df` (rendered as the cell's output).
df.columns
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "daf5a333",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"✅ X shape: (394, 48) | y shape: (394,) | #buildings: 192\n",
|
||
"Columns sample: ['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio']\n",
|
||
"Any NaN left?: False\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tong_gio_lam</th>\n",
|
||
" <th>so_ca_cua_toa</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" <th>num_cleaning_tasks</th>\n",
|
||
" <th>num_trash_collection_tasks</th>\n",
|
||
" <th>num_monitoring_tasks</th>\n",
|
||
" <th>num_deep_cleaning_tasks</th>\n",
|
||
" <th>num_support_tasks</th>\n",
|
||
" <th>num_other_tasks</th>\n",
|
||
" <th>num_wc_tasks</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>is_night_shift</th>\n",
|
||
" <th>is_morning_shift</th>\n",
|
||
" <th>is_afternoon_shift</th>\n",
|
||
" <th>is_evening_shift</th>\n",
|
||
" <th>loai_ca_Ca chiều</th>\n",
|
||
" <th>loai_ca_Ca gãy</th>\n",
|
||
" <th>loai_ca_Ca sáng</th>\n",
|
||
" <th>loai_ca_Ca đêm</th>\n",
|
||
" <th>loai_ca_Hành chính</th>\n",
|
||
" <th>loai_ca_Part time</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>135</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>35</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>135</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>35</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>135</td>\n",
|
||
" <td>81</td>\n",
|
||
" <td>35</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>21</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10 rows × 48 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tong_gio_lam so_ca_cua_toa num_tasks num_cleaning_tasks \\\n",
|
||
"0 4.0 1 7 7 \n",
|
||
"1 7.5 6 441 258 \n",
|
||
"2 8.0 6 441 258 \n",
|
||
"3 8.0 6 441 258 \n",
|
||
"4 8.0 6 441 258 \n",
|
||
"5 7.5 6 441 258 \n",
|
||
"6 9.5 6 441 258 \n",
|
||
"7 9.5 3 135 81 \n",
|
||
"8 7.5 3 135 81 \n",
|
||
"9 7.5 3 135 81 \n",
|
||
"\n",
|
||
" num_trash_collection_tasks num_monitoring_tasks num_deep_cleaning_tasks \\\n",
|
||
"0 1 2 1 \n",
|
||
"1 145 134 75 \n",
|
||
"2 145 134 75 \n",
|
||
"3 145 134 75 \n",
|
||
"4 145 134 75 \n",
|
||
"5 145 134 75 \n",
|
||
"6 145 134 75 \n",
|
||
"7 35 38 10 \n",
|
||
"8 35 38 10 \n",
|
||
"9 35 38 10 \n",
|
||
"\n",
|
||
" num_support_tasks num_other_tasks num_wc_tasks ... is_night_shift \\\n",
|
||
"0 0 0 4 ... 0 \n",
|
||
"1 57 45 89 ... 0 \n",
|
||
"2 57 45 89 ... 0 \n",
|
||
"3 57 45 89 ... 0 \n",
|
||
"4 57 45 89 ... 1 \n",
|
||
"5 57 45 89 ... 0 \n",
|
||
"6 57 45 89 ... 0 \n",
|
||
"7 20 21 25 ... 0 \n",
|
||
"8 20 21 25 ... 0 \n",
|
||
"9 20 21 25 ... 1 \n",
|
||
"\n",
|
||
" is_morning_shift is_afternoon_shift is_evening_shift loai_ca_Ca chiều \\\n",
|
||
"0 1 0 0 False \n",
|
||
"1 1 0 0 False \n",
|
||
"2 1 0 0 False \n",
|
||
"3 0 1 0 True \n",
|
||
"4 0 0 0 False \n",
|
||
"5 0 1 0 False \n",
|
||
"6 1 0 0 False \n",
|
||
"7 1 0 0 False \n",
|
||
"8 1 0 0 False \n",
|
||
"9 0 0 0 False \n",
|
||
"\n",
|
||
" loai_ca_Ca gãy loai_ca_Ca sáng loai_ca_Ca đêm loai_ca_Hành chính \\\n",
|
||
"0 False False False False \n",
|
||
"1 False False False True \n",
|
||
"2 False True False False \n",
|
||
"3 False False False False \n",
|
||
"4 False False True False \n",
|
||
"5 True False False False \n",
|
||
"6 False False False True \n",
|
||
"7 False False False True \n",
|
||
"8 True False False False \n",
|
||
"9 False False True False \n",
|
||
"\n",
|
||
" loai_ca_Part time \n",
|
||
"0 True \n",
|
||
"1 False \n",
|
||
"2 False \n",
|
||
"3 False \n",
|
||
"4 False \n",
|
||
"5 False \n",
|
||
"6 False \n",
|
||
"7 False \n",
|
||
"8 False \n",
|
||
"9 False \n",
|
||
"\n",
|
||
"[10 rows x 48 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
TARGET = "so_luong"        # column used as the prediction target (y)
GROUP_COL = "ma_dia_diem"  # building identifier; used to group rows when splitting

# The four columns explicitly excluded from the feature matrix
# (the target and the raw time columns are removed separately).
DROP_COLS = [
    "ma_dia_diem",
    "all_task_normal",
    "all_task_dinhky",
    "is_tasks_text_missing",
]
|
||
"\n",
|
||
"# ---------- helpers ----------\n",
def parse_hour(t):
    """Convert a clock time into fractional hours.

    Parameters
    ----------
    t : object
        One of: a missing value (None / NaN / NaT); an object exposing
        ``.hour`` and optionally ``.minute`` (e.g. ``datetime.time``,
        ``pandas.Timestamp``); a string such as ``'06:30:00'``; or a
        number / numeric string such as ``'6.5'``.

    Returns
    -------
    float
        ``hour + minute / 60``. Seconds are ignored. ``np.nan`` is returned
        for missing or unparseable input. No range check is applied, so a
        string like ``'24:00'`` yields ``24.0``.
    """
    if pd.isna(t):
        return np.nan

    # datetime.time / datetime / pandas Timestamp
    if hasattr(t, "hour"):
        return float(t.hour) + float(getattr(t, "minute", 0)) / 60.0

    s = str(t).strip()

    # 'HH:MM' or 'HH:MM:SS' -- splitting on ':' always yields >= 2 parts here.
    if ":" in s:
        parts = s.split(":")
        try:
            hh = int(float(parts[0]))
            mm = int(float(parts[1]))
        except (ValueError, OverflowError):
            # ValueError: non-numeric / empty / NaN field; OverflowError: 'inf'.
            return np.nan
        return hh + mm / 60.0

    # Plain numeric text such as '6.5'.
    try:
        return float(s)
    except ValueError:
        return np.nan
|
||
"\n",
# ---------- 1) Target vector and per-row building id ----------
y = df[TARGET].astype(float).copy()
groups = df[GROUP_COL].astype(str).copy()  # kept so a later cell can split by building

# ---------- 2) Time features derived from bat_dau / ket_thuc ----------
hour_start = df["bat_dau"].apply(parse_hour)
hour_end = df["ket_thuc"].apply(parse_hour)

# A shift crosses midnight (e.g. 22 -> 6) when both hours are known and the
# end hour is earlier than the start hour.
both_known = hour_start.notna() & hour_end.notna()
is_cross_day = (hour_end.lt(hour_start) & both_known).astype(int)

# Duration in hours; the modulo keeps overnight shifts non-negative.
# Rows with an unknown start or end get 0.
shift_length = hour_end.sub(hour_start).mod(24).fillna(0)

# 0/1 flags for the part of the day in which the shift starts.
is_night_shift = (hour_start.ge(22) | hour_start.lt(6)).fillna(False).astype(int)
is_morning_shift = (hour_start.ge(6) & hour_start.lt(12)).fillna(False).astype(int)
is_afternoon_shift = (hour_start.ge(12) & hour_start.lt(18)).fillna(False).astype(int)
is_evening_shift = (hour_start.ge(18) & hour_start.lt(22)).fillna(False).astype(int)

# ---------- 3) Feature matrix: drop excluded columns, the target and the raw times ----------
# The raw time columns are dropped so they are never one-hot encoded.
X = df.drop(
    columns=[*DROP_COLS, TARGET, "bat_dau", "ket_thuc"],
    errors="ignore",
).copy()

# Append the engineered time features (unknown hours are imputed with 0).
X = X.assign(
    hour_start=hour_start.fillna(0),
    hour_end=hour_end.fillna(0),
    shift_length=shift_length,
    is_cross_day=is_cross_day,
    is_night_shift=is_night_shift,
    is_morning_shift=is_morning_shift,
    is_afternoon_shift=is_afternoon_shift,
    is_evening_shift=is_evening_shift,
)

# ---------- 4) Fill numeric NaN, then one-hot encode ONLY loai_ca ----------
num_cols = X.select_dtypes(include=[np.number]).columns
X[num_cols] = X[num_cols].fillna(0)

# One-hot loai_ca when present; missing shift types become the "UNKNOWN" level.
if "loai_ca" in X.columns:
    X = pd.get_dummies(
        X.assign(loai_ca=X["loai_ca"].fillna("UNKNOWN").astype(str)),
        columns=["loai_ca"],
        drop_first=True,
    )

print("✅ X shape:", X.shape, "| y shape:", y.shape, "| #buildings:", groups.nunique())
print("Columns sample:", list(X.columns[:15]))
print("Any NaN left?:", X.isna().any().any())

X.head(10)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "0ad8de9d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"✅ Final X shape: (394, 45) | y shape: (394,) | #buildings: 192\n",
|
||
"Any NaN left in X?: False\n",
|
||
"Time one-hot columns (should be 0): 0\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>tong_gio_lam</th>\n",
|
||
" <th>so_ca_cua_toa</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" <th>num_cleaning_tasks</th>\n",
|
||
" <th>num_trash_collection_tasks</th>\n",
|
||
" <th>num_monitoring_tasks</th>\n",
|
||
" <th>num_deep_cleaning_tasks</th>\n",
|
||
" <th>num_support_tasks</th>\n",
|
||
" <th>num_other_tasks</th>\n",
|
||
" <th>num_wc_tasks</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>hour_end</th>\n",
|
||
" <th>shift_length</th>\n",
|
||
" <th>is_cross_day</th>\n",
|
||
" <th>is_night_shift</th>\n",
|
||
" <th>loai_ca_Ca chiều</th>\n",
|
||
" <th>loai_ca_Ca gãy</th>\n",
|
||
" <th>loai_ca_Ca sáng</th>\n",
|
||
" <th>loai_ca_Ca đêm</th>\n",
|
||
" <th>loai_ca_Hành chính</th>\n",
|
||
" <th>loai_ca_Part time</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>10.5</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>16.0</td>\n",
|
||
" <td>9.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>145</td>\n",
|
||
" <td>134</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>45</td>\n",
|
||
" <td>89</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3 rows × 45 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" tong_gio_lam so_ca_cua_toa num_tasks num_cleaning_tasks \\\n",
|
||
"0 4.0 1 7 7 \n",
|
||
"1 7.5 6 441 258 \n",
|
||
"2 8.0 6 441 258 \n",
|
||
"\n",
|
||
" num_trash_collection_tasks num_monitoring_tasks num_deep_cleaning_tasks \\\n",
|
||
"0 1 2 1 \n",
|
||
"1 145 134 75 \n",
|
||
"2 145 134 75 \n",
|
||
"\n",
|
||
" num_support_tasks num_other_tasks num_wc_tasks ... hour_end \\\n",
|
||
"0 0 0 4 ... 10.5 \n",
|
||
"1 57 45 89 ... 16.0 \n",
|
||
"2 57 45 89 ... 14.0 \n",
|
||
"\n",
|
||
" shift_length is_cross_day is_night_shift loai_ca_Ca chiều \\\n",
|
||
"0 4.0 0 0 False \n",
|
||
"1 9.5 0 0 False \n",
|
||
"2 8.0 0 0 False \n",
|
||
"\n",
|
||
" loai_ca_Ca gãy loai_ca_Ca sáng loai_ca_Ca đêm loai_ca_Hành chính \\\n",
|
||
"0 False False False False \n",
|
||
"1 False False False True \n",
|
||
"2 False True False False \n",
|
||
"\n",
|
||
" loai_ca_Part time \n",
|
||
"0 True \n",
|
||
"1 False \n",
|
||
"2 False \n",
|
||
"\n",
|
||
"[3 rows x 45 columns]"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"\n",
TARGET = "so_luong"        # column used as the prediction target (y)
GROUP_COL = "ma_dia_diem"  # building identifier; used to group rows when splitting
# Columns explicitly excluded from the feature matrix.
DROP_COLS = [
    "ma_dia_diem",
    "all_task_normal",
    "all_task_dinhky",
    "is_tasks_text_missing",
]
|
||
"\n",
# NOTE(review): the preceding cell defines a function with this same name;
# whichever cell runs last wins. Consider keeping a single definition.
def parse_hour(t):
    """Convert 'HH:MM:SS', a datetime-like object, or numeric text to a float hour.

    Parameters
    ----------
    t : object
        A missing value, an object with ``.hour`` (and optionally
        ``.minute``), a colon-separated time string, or a number /
        numeric string.

    Returns
    -------
    float
        ``hour + minute / 60`` (seconds are ignored), or ``np.nan`` when the
        value is missing or cannot be parsed.
    """
    if pd.isna(t):
        return np.nan

    # Objects such as datetime.time or pandas.Timestamp carry the fields directly.
    if hasattr(t, "hour"):
        return float(t.hour) + float(getattr(t, "minute", 0)) / 60.0

    s = str(t).strip()

    if ":" in s:
        # A string containing ':' always splits into at least two fields.
        fields = s.split(":")
        try:
            hh = int(float(fields[0]))
            mm = int(float(fields[1]))
        except (ValueError, OverflowError):
            # Only conversion failures are treated as "unparseable"; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            return np.nan
        return hh + mm / 60.0

    try:
        return float(s)
    except ValueError:
        return np.nan
|
||
"\n",
# 1) Target vector and building ids (groups are used by the next cell to split by building).
y = df[TARGET].astype(float).copy()
groups = df[GROUP_COL].astype(str).copy()

# 2) Time features from bat_dau / ket_thuc (numeric, NOT one-hot encoded).
hour_start = df["bat_dau"].apply(parse_hour)
hour_end = df["ket_thuc"].apply(parse_hour)

# Overnight shift: both hours known and the end hour earlier than the start hour.
both_known = hour_start.notna() & hour_end.notna()
is_cross_day = (hour_end.lt(hour_start) & both_known).astype(int)
# Duration in hours, wrapped with modulo 24; unknown -> 0.
shift_length = hour_end.sub(hour_start).mod(24).fillna(0)
# Shift starts at 22:00 or later, or before 06:00.
is_night_shift = (hour_start.ge(22) | hour_start.lt(6)).fillna(False).astype(int)

# 3) Feature matrix: remove the excluded columns, the target and the raw time columns.
X = df.drop(
    columns=[*DROP_COLS, TARGET, "bat_dau", "ket_thuc"],
    errors="ignore",
).copy()

# Append the engineered time columns (unknown hours are imputed with 0).
X = X.assign(
    hour_start=hour_start.fillna(0),
    hour_end=hour_end.fillna(0),
    shift_length=shift_length,
    is_cross_day=is_cross_day,
    is_night_shift=is_night_shift,
)

# 4) Fill missing values in numeric columns with 0.
num_cols = X.select_dtypes(include=[np.number]).columns
X[num_cols] = X[num_cols].fillna(0)

# 5) One-hot encode ONLY loai_ca (when present); missing values become "UNKNOWN".
if "loai_ca" in X.columns:
    X = pd.get_dummies(
        X.assign(loai_ca=X["loai_ca"].fillna("UNKNOWN").astype(str)),
        columns=["loai_ca"],
        drop_first=True,
    )

print("✅ Final X shape:", X.shape, "| y shape:", y.shape, "| #buildings:", groups.nunique())
print("Any NaN left in X?:", X.isna().any().any())

# Sanity: no one-hot columns derived from the raw time fields should exist.
bad_cols = [c for c in X.columns if str(c).startswith(("bat_dau_", "ket_thuc_"))]
print("Time one-hot columns (should be 0):", len(bad_cols))

X.head(3)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "2df3b609",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Shapes:\n",
|
||
"Train: (283, 45) (283,) | buildings: 134\n",
|
||
"Val: (58, 45) (58,) | buildings: 29\n",
|
||
"Test: (53, 45) (53,) | buildings: 29\n",
|
||
"\n",
|
||
"Leakage check (should all be 0):\n",
|
||
"Train ∩ Val : 0\n",
|
||
"Train ∩ Test: 0\n",
|
||
"Val ∩ Test: 0\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.model_selection import GroupShuffleSplit\n",
|
||
"\n",
# Split configuration. GroupShuffleSplit applies a float `test_size` to the
# number of GROUPS (buildings), so the row-level proportions are approximate.
TEST_FRACTION = 0.15               # share of buildings held out for testing
VAL_FRACTION_OF_TRAINVAL = 0.1765  # ~0.15 / 0.85 -> about 15% of all buildings
SPLIT_SEED = 42


def _take(obj, positions):
    """Return the rows of `obj` at integer `positions` with a fresh 0..n-1 index."""
    return obj.iloc[positions].reset_index(drop=True)


# ----- 1) Split: (train+val) vs test -----
gss1 = GroupShuffleSplit(n_splits=1, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
trainval_idx, test_idx = next(gss1.split(X, y, groups=groups))

X_trainval, X_test = _take(X, trainval_idx), _take(X, test_idx)
y_trainval, y_test = _take(y, trainval_idx), _take(y, test_idx)
groups_trainval = _take(groups, trainval_idx)
groups_test = _take(groups, test_idx)

# ----- 2) Split: train vs val inside train+val -----
gss2 = GroupShuffleSplit(n_splits=1, test_size=VAL_FRACTION_OF_TRAINVAL, random_state=SPLIT_SEED)
train_idx, val_idx = next(gss2.split(X_trainval, y_trainval, groups=groups_trainval))

X_train, X_val = _take(X_trainval, train_idx), _take(X_trainval, val_idx)
y_train, y_val = _take(y_trainval, train_idx), _take(y_trainval, val_idx)
groups_train = _take(groups_trainval, train_idx)
groups_val = _take(groups_trainval, val_idx)

# ----- 3) Report -----
print("Shapes:")
print("Train:", X_train.shape, y_train.shape, "| buildings:", groups_train.nunique())
print("Val:  ", X_val.shape, y_val.shape, "| buildings:", groups_val.nunique())
print("Test: ", X_test.shape, y_test.shape, "| buildings:", groups_test.nunique())

# Leakage check: no building may appear in more than one split.
train_b = set(groups_train.unique())
val_b = set(groups_val.unique())
test_b = set(groups_test.unique())
print("\nLeakage check (should all be 0):")
print("Train ∩ Val :", len(train_b & val_b))
print("Train ∩ Test:", len(train_b & test_b))
print("Val   ∩ Test:", len(val_b & test_b))

# Fail loudly instead of relying on someone reading the printed counts.
assert not (train_b & val_b), "Buildings shared between train and val"
assert not (train_b & test_b), "Buildings shared between train and test"
assert not (val_b & test_b), "Buildings shared between val and test"
assert len(X_train) + len(X_val) + len(X_test) == len(X), "Split sizes do not add up"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "8cc64019",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"Model: Ridge_log\n",
|
||
"Train | MAE=2.995 | RMSE=5.484 | R2=0.484\n",
|
||
"Val | MAE=1.398 | RMSE=2.015 | R2=0.037\n",
|
||
"Test | MAE=2.744 | RMSE=4.416 | R2=0.155\n",
|
||
"\n",
|
||
"Model: GBR_log\n",
|
||
"Train | MAE=1.201 | RMSE=2.466 | R2=0.896\n",
|
||
"Val | MAE=1.213 | RMSE=1.832 | R2=0.203\n",
|
||
"Test | MAE=2.979 | RMSE=4.810 | R2=-0.002\n",
|
||
"\n",
|
||
"✅ Saved: test_predictions_gbr.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>17.014321</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>5.206821</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.930329</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.853829</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.561030</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.867534</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.601529</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.436945</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.495240</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.596085</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred\n",
|
||
"0 29.0 17.014321\n",
|
||
"1 4.0 5.206821\n",
|
||
"2 4.0 2.930329\n",
|
||
"3 3.0 1.853829\n",
|
||
"4 2.0 1.561030\n",
|
||
"5 1.0 0.867534\n",
|
||
"6 3.0 1.601529\n",
|
||
"7 3.0 2.436945\n",
|
||
"8 13.0 3.495240\n",
|
||
"9 2.0 1.596085"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"from sklearn.linear_model import Ridge\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def eval_model(name, model, X_tr, y_tr, X_va, y_va, X_te, y_te, log_target=False):\n",
|
||
" # fit\n",
|
||
" model.fit(X_tr, np.log1p(y_tr) if log_target else y_tr)\n",
|
||
"\n",
|
||
" # predict helper\n",
|
||
" def pred(m, X):\n",
|
||
" p = m.predict(X)\n",
|
||
" return np.expm1(p) if log_target else p\n",
|
||
"\n",
|
||
" yhat_tr = pred(model, X_tr)\n",
|
||
" yhat_va = pred(model, X_va)\n",
|
||
" yhat_te = pred(model, X_te)\n",
|
||
"\n",
|
||
" def metrics(y, yhat):\n",
|
||
" return {\n",
|
||
" \"MAE\": float(mean_absolute_error(y, yhat)),\n",
|
||
" \"RMSE\": rmse(y, yhat),\n",
|
||
" \"R2\": float(r2_score(y, yhat)),\n",
|
||
" }\n",
|
||
"\n",
|
||
" res = {\n",
|
||
" \"model\": name,\n",
|
||
" \"Train\": metrics(y_tr, yhat_tr),\n",
|
||
" \"Val\": metrics(y_va, yhat_va),\n",
|
||
" \"Test\": metrics(y_te, yhat_te),\n",
|
||
" }\n",
|
||
" return res, yhat_te\n",
|
||
"\n",
|
||
"\n",
|
||
"results = []\n",
|
||
"\n",
|
||
"# 1) Ridge (log target)\n",
|
||
"ridge = Ridge(alpha=1.0, random_state=42)\n",
|
||
"res_ridge, ridge_test_pred = eval_model(\n",
|
||
" \"Ridge_log\", ridge,\n",
|
||
" X_train, y_train,\n",
|
||
" X_val, y_val,\n",
|
||
" X_test, y_test,\n",
|
||
" log_target=True\n",
|
||
")\n",
|
||
"results.append(res_ridge)\n",
|
||
"\n",
|
||
"# 2) Gradient Boosting (log target)\n",
|
||
"gbr = GradientBoostingRegressor(\n",
|
||
" n_estimators=300,\n",
|
||
" learning_rate=0.05,\n",
|
||
" max_depth=3,\n",
|
||
" random_state=42\n",
|
||
")\n",
|
||
"res_gbr, gbr_test_pred = eval_model(\n",
|
||
" \"GBR_log\", gbr,\n",
|
||
" X_train, y_train,\n",
|
||
" X_val, y_val,\n",
|
||
" X_test, y_test,\n",
|
||
" log_target=True\n",
|
||
")\n",
|
||
"results.append(res_gbr)\n",
|
||
"\n",
|
||
"# Print results\n",
|
||
"for r in results:\n",
|
||
" print(\"\\nModel:\", r[\"model\"])\n",
|
||
" for split in [\"Train\", \"Val\", \"Test\"]:\n",
|
||
" m = r[split]\n",
|
||
" print(f\"{split:5s} | MAE={m['MAE']:.3f} | RMSE={m['RMSE']:.3f} | R2={m['R2']:.3f}\")\n",
|
||
"\n",
|
||
"# Save TEST predictions (GBR)\n",
|
||
"out = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred\": np.maximum(0, gbr_test_pred) # clamp negative\n",
|
||
"})\n",
|
||
"out.to_excel(\"test_predictions_gbr.xlsx\", index=False)\n",
|
||
"print(\"\\n✅ Saved: test_predictions_gbr.xlsx\")\n",
|
||
"out.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"id": "e238b641",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"=== ML only (GBR_reg, log target) ===\n",
|
||
"Train | MAE=2.145 | RMSE=4.539 | R2=0.647\n",
|
||
"Val | MAE=1.074 | RMSE=1.722 | R2=0.296\n",
|
||
"Test | MAE=2.669 | RMSE=4.565 | R2=0.097\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def report(name, y_true, y_pred):\n",
|
||
" print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
|
||
" f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"# GBR \"hiền\" hơn để giảm overfit\n",
|
||
"gbr_reg = GradientBoostingRegressor(\n",
|
||
" n_estimators=600,\n",
|
||
" learning_rate=0.03,\n",
|
||
" max_depth=2,\n",
|
||
" min_samples_leaf=10,\n",
|
||
" min_samples_split=20,\n",
|
||
" random_state=42\n",
|
||
")\n",
|
||
"\n",
|
||
"# Train với log1p(target)\n",
|
||
"gbr_reg.fit(X_train, np.log1p(y_train))\n",
|
||
"\n",
|
||
"def predict_original_scale(model, X):\n",
|
||
" return np.maximum(0, np.expm1(model.predict(X)))\n",
|
||
"\n",
|
||
"pred_train = predict_original_scale(gbr_reg, X_train)\n",
|
||
"pred_val = predict_original_scale(gbr_reg, X_val)\n",
|
||
"pred_test = predict_original_scale(gbr_reg, X_test)\n",
|
||
"\n",
|
||
"print(\"=== ML only (GBR_reg, log target) ===\")\n",
|
||
"report(\"Train\", y_train, pred_train)\n",
|
||
"report(\"Val\", y_val, pred_val)\n",
|
||
"report(\"Test\", y_test, pred_test)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "635bf672",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Best K on VAL: {'K': 150, 'mae_val': 1.0616528195190562}\n",
|
||
"\n",
|
||
"=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\n",
|
||
"Rule used: min_staff = ceil(num_tasks / 150)\n",
|
||
"Train | MAE=2.259 | RMSE=4.601 | R2=0.637\n",
|
||
"Val | MAE=1.062 | RMSE=1.721 | R2=0.297\n",
|
||
"Test | MAE=2.602 | RMSE=4.527 | R2=0.112\n",
|
||
"\n",
|
||
"TEST big cases (y_true >= 10): 6\n",
|
||
"ML | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
|
||
"Rule | MAE=8.320 | RMSE=8.909 | R2=-0.899\n",
|
||
"\n",
|
||
"✅ Saved: test_predictions_ml_plus_rule.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred_ml</th>\n",
|
||
" <th>y_pred_final</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>15.005548</td>\n",
|
||
" <td>15.005548</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.369563</td>\n",
|
||
" <td>4.369563</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.902404</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.591762</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.089570</td>\n",
|
||
" <td>2.089570</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.946909</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.167053</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.154357</td>\n",
|
||
" <td>2.154357</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.034284</td>\n",
|
||
" <td>3.034284</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.674387</td>\n",
|
||
" <td>1.674387</td>\n",
|
||
" <td>124</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred_ml y_pred_final num_tasks\n",
|
||
"0 29.0 15.005548 15.005548 593\n",
|
||
"1 4.0 4.369563 4.369563 593\n",
|
||
"2 4.0 2.902404 4.000000 593\n",
|
||
"3 3.0 2.591762 4.000000 593\n",
|
||
"4 2.0 2.089570 2.089570 33\n",
|
||
"5 1.0 0.946909 1.000000 33\n",
|
||
"6 3.0 1.167053 2.000000 182\n",
|
||
"7 3.0 2.154357 2.154357 182\n",
|
||
"8 13.0 3.034284 3.034284 182\n",
|
||
"9 2.0 1.674387 1.674387 124"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def report(name, y_true, y_pred):\n",
|
||
" print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
|
||
" f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"# num_tasks phải tồn tại trong X_train/X_val/X_test\n",
|
||
"assert \"num_tasks\" in X_train.columns, \"❌ Missing num_tasks in X_train\"\n",
|
||
"\n",
|
||
"nt_train = X_train[\"num_tasks\"].values\n",
|
||
"nt_val = X_val[\"num_tasks\"].values\n",
|
||
"nt_test = X_test[\"num_tasks\"].values\n",
|
||
"\n",
|
||
"# ---- tune K on VAL (KHÔNG đụng test khi tune) ----\n",
|
||
"Ks = [30, 40, 50, 60, 70, 80, 100, 120, 150]\n",
|
||
"best = None\n",
|
||
"\n",
|
||
"for K in Ks:\n",
|
||
" min_val = np.ceil(nt_val / K)\n",
|
||
" pred_val_rule = np.maximum(pred_val, min_val)\n",
|
||
" mae = mean_absolute_error(y_val, pred_val_rule)\n",
|
||
" if (best is None) or (mae < best[\"mae_val\"]):\n",
|
||
" best = {\"K\": K, \"mae_val\": mae}\n",
|
||
"\n",
|
||
"print(\"Best K on VAL:\", best)\n",
|
||
"\n",
|
||
"K_best = best[\"K\"]\n",
|
||
"\n",
|
||
"def apply_rule(pred, num_tasks, K):\n",
|
||
" min_staff = np.ceil(num_tasks / K)\n",
|
||
" return np.maximum(pred, min_staff)\n",
|
||
"\n",
|
||
"pred_train_rule = apply_rule(pred_train, nt_train, K_best)\n",
|
||
"pred_val_rule = apply_rule(pred_val, nt_val, K_best)\n",
|
||
"pred_test_rule = apply_rule(pred_test, nt_test, K_best)\n",
|
||
"\n",
|
||
"print(\"\\n=== After Rule: y_pred_final = max(ML_pred, ceil(num_tasks/K)) ===\")\n",
|
||
"print(\"Rule used: min_staff = ceil(num_tasks / %d)\" % K_best)\n",
|
||
"\n",
|
||
"report(\"Train\", y_train, pred_train_rule)\n",
|
||
"report(\"Val\", y_val, pred_val_rule)\n",
|
||
"report(\"Test\", y_test, pred_test_rule)\n",
|
||
"\n",
|
||
"# ---- big cases analysis on TEST ----\n",
|
||
"mask_big = (y_test.values >= 10)\n",
|
||
"print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
|
||
"if mask_big.sum() > 0:\n",
|
||
" report(\"ML\", y_test.values[mask_big], pred_test[mask_big])\n",
|
||
" report(\"Rule\", y_test.values[mask_big], pred_test_rule[mask_big])\n",
|
||
"\n",
|
||
"# save test predictions (after rule)\n",
|
||
"out_rule = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred_ml\": pred_test,\n",
|
||
" \"y_pred_final\": pred_test_rule,\n",
|
||
" \"num_tasks\": nt_test\n",
|
||
"})\n",
|
||
"out_rule.to_excel(\"test_predictions_ml_plus_rule.xlsx\", index=False)\n",
|
||
"print(\"\\n✅ Saved: test_predictions_ml_plus_rule.xlsx\")\n",
|
||
"out_rule.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "70493591",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Best (a,b) on VAL: {'a': 0, 'b': 0.1, 'mae_val': 1.0737151241156944}\n",
|
||
"\n",
|
||
"=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\n",
|
||
"Rule used: ceil(0 + 0.1*sqrt(num_tasks))\n",
|
||
"Train | MAE=2.185 | RMSE=4.551 | R2=0.645\n",
|
||
"Val | MAE=1.074 | RMSE=1.701 | R2=0.314\n",
|
||
"Test | MAE=2.599 | RMSE=4.556 | R2=0.101\n",
|
||
"\n",
|
||
"TEST big cases (y_true >= 10): 6\n",
|
||
"ML | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
|
||
"Smooth | MAE=8.512 | RMSE=9.050 | R2=-0.959\n",
|
||
"\n",
|
||
"✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred_ml</th>\n",
|
||
" <th>y_pred_final</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>15.005548</td>\n",
|
||
" <td>15.005548</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.369563</td>\n",
|
||
" <td>4.369563</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.902404</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.591762</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.089570</td>\n",
|
||
" <td>2.089570</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.946909</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.167053</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.154357</td>\n",
|
||
" <td>2.154357</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.034284</td>\n",
|
||
" <td>3.034284</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.674387</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>124</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred_ml y_pred_final num_tasks\n",
|
||
"0 29.0 15.005548 15.005548 593\n",
|
||
"1 4.0 4.369563 4.369563 593\n",
|
||
"2 4.0 2.902404 3.000000 593\n",
|
||
"3 3.0 2.591762 3.000000 593\n",
|
||
"4 2.0 2.089570 2.089570 33\n",
|
||
"5 1.0 0.946909 1.000000 33\n",
|
||
"6 3.0 1.167053 2.000000 182\n",
|
||
"7 3.0 2.154357 2.154357 182\n",
|
||
"8 13.0 3.034284 3.034284 182\n",
|
||
"9 2.0 1.674387 2.000000 124"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def report(name, y_true, y_pred):\n",
|
||
" print(f\"{name:5s} | MAE={mean_absolute_error(y_true, y_pred):.3f} \"\n",
|
||
" f\"| RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"nt_train = X_train[\"num_tasks\"].values\n",
|
||
"nt_val = X_val[\"num_tasks\"].values\n",
|
||
"nt_test = X_test[\"num_tasks\"].values\n",
|
||
"\n",
|
||
"# search grid (nhỏ thôi để bạn dễ đọc)\n",
|
||
"a_list = [0, 1, 2]\n",
|
||
"b_list = [0.10, 0.15, 0.20, 0.25, 0.30]\n",
|
||
"\n",
|
||
"best = None\n",
|
||
"for a in a_list:\n",
|
||
" for b in b_list:\n",
|
||
" min_val = np.ceil(a + b * np.sqrt(nt_val))\n",
|
||
" pred_val_rule = np.maximum(pred_val, min_val)\n",
|
||
" mae = mean_absolute_error(y_val, pred_val_rule)\n",
|
||
" if (best is None) or (mae < best[\"mae_val\"]):\n",
|
||
" best = {\"a\": a, \"b\": b, \"mae_val\": mae}\n",
|
||
"\n",
|
||
"print(\"Best (a,b) on VAL:\", best)\n",
|
||
"\n",
|
||
"a_best, b_best = best[\"a\"], best[\"b\"]\n",
|
||
"\n",
|
||
"def apply_smooth_rule(pred, num_tasks, a, b):\n",
|
||
" min_staff = np.ceil(a + b * np.sqrt(num_tasks))\n",
|
||
" return np.maximum(pred, min_staff)\n",
|
||
"\n",
|
||
"pred_train_rule2 = apply_smooth_rule(pred_train, nt_train, a_best, b_best)\n",
|
||
"pred_val_rule2 = apply_smooth_rule(pred_val, nt_val, a_best, b_best)\n",
|
||
"pred_test_rule2 = apply_smooth_rule(pred_test, nt_test, a_best, b_best)\n",
|
||
"\n",
|
||
"print(\"\\n=== After Smooth Rule: y_pred_final = max(ML_pred, ceil(a + b*sqrt(num_tasks))) ===\")\n",
|
||
"print(f\"Rule used: ceil({a_best} + {b_best}*sqrt(num_tasks))\")\n",
|
||
"\n",
|
||
"report(\"Train\", y_train, pred_train_rule2)\n",
|
||
"report(\"Val\", y_val, pred_val_rule2)\n",
|
||
"report(\"Test\", y_test, pred_test_rule2)\n",
|
||
"\n",
|
||
"mask_big = (y_test.values >= 10)\n",
|
||
"print(\"\\nTEST big cases (y_true >= 10):\", int(mask_big.sum()))\n",
|
||
"if mask_big.sum() > 0:\n",
|
||
" report(\"ML\", y_test.values[mask_big], pred_test[mask_big])\n",
|
||
" report(\"Smooth\",y_test.values[mask_big], pred_test_rule2[mask_big])\n",
|
||
"\n",
|
||
"# save\n",
|
||
"out_rule2 = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred_ml\": pred_test,\n",
|
||
" \"y_pred_final\": pred_test_rule2,\n",
|
||
" \"num_tasks\": nt_test\n",
|
||
"})\n",
|
||
"out_rule2.to_excel(\"test_predictions_ml_plus_smooth_rule.xlsx\", index=False)\n",
|
||
"print(\"\\n✅ Saved: test_predictions_ml_plus_smooth_rule.xlsx\")\n",
|
||
"out_rule2.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e71605c4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Big-rate (positive class=1):\n",
|
||
"Train: 0.14840989399293286 | count: 42 / 283\n",
|
||
"Val: 0.017241379310344827 | count: 1 / 58\n",
|
||
"Test: 0.11320754716981132 | count: 6 / 53\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: LogReg_balanced\n",
|
||
"\n",
|
||
"[VAL] classification_report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.981 0.930 0.955 57\n",
|
||
" 1 0.000 0.000 0.000 1\n",
|
||
"\n",
|
||
" accuracy 0.914 58\n",
|
||
" macro avg 0.491 0.465 0.477 58\n",
|
||
"weighted avg 0.965 0.914 0.938 58\n",
|
||
"\n",
|
||
"VAL confusion_matrix:\n",
|
||
" [[53 4]\n",
|
||
" [ 1 0]]\n",
|
||
"\n",
|
||
"[TEST] classification_report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.881 0.787 0.831 47\n",
|
||
" 1 0.091 0.167 0.118 6\n",
|
||
"\n",
|
||
" accuracy 0.717 53\n",
|
||
" macro avg 0.486 0.477 0.475 53\n",
|
||
"weighted avg 0.792 0.717 0.751 53\n",
|
||
"\n",
|
||
"TEST confusion_matrix:\n",
|
||
" [[37 10]\n",
|
||
" [ 5 1]]\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: GBC_depth2\n",
|
||
"\n",
|
||
"[VAL] classification_report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.983 1.000 0.991 57\n",
|
||
" 1 0.000 0.000 0.000 1\n",
|
||
"\n",
|
||
" accuracy 0.983 58\n",
|
||
" macro avg 0.491 0.500 0.496 58\n",
|
||
"weighted avg 0.966 0.983 0.974 58\n",
|
||
"\n",
|
||
"VAL confusion_matrix:\n",
|
||
" [[57 0]\n",
|
||
" [ 1 0]]\n",
|
||
"\n",
|
||
"[TEST] classification_report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.907 0.830 0.867 47\n",
|
||
" 1 0.200 0.333 0.250 6\n",
|
||
"\n",
|
||
" accuracy 0.774 53\n",
|
||
" macro avg 0.553 0.582 0.558 53\n",
|
||
"weighted avg 0.827 0.774 0.797 53\n",
|
||
"\n",
|
||
"TEST confusion_matrix:\n",
|
||
" [[39 8]\n",
|
||
" [ 4 2]]\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Liệt kê các biến model đang tồn tại\n",
|
||
"[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "ce971deb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['gbr', 'res_gbr', 'gbr_test_pred', 'gbr_reg']"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Liệt kê các biến model đang tồn tại\n",
|
||
"[name for name in globals().keys() if \"gbr\" in name.lower()]\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "9ad22a15",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== TEST RESULTS ===\n",
|
||
"ML only | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
|
||
"ML + Business Rules | MAE=2.663 | RMSE=4.510 | R2=0.119\n",
|
||
"✅ Saved: test_predictions_ml_plus_business_rules.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred_ml</th>\n",
|
||
" <th>min_staff_rule</th>\n",
|
||
" <th>y_pred_final</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.902</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.592</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>6.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.000</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.167</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.674</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>124</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred_ml min_staff_rule y_pred_final num_tasks\n",
|
||
"0 29.0 15.006 4.0 15.006 593\n",
|
||
"1 4.0 4.370 4.0 4.370 593\n",
|
||
"2 4.0 2.902 4.0 4.000 593\n",
|
||
"3 3.0 2.592 6.0 6.000 593\n",
|
||
"4 2.0 2.090 1.0 2.090 33\n",
|
||
"5 1.0 0.947 1.0 1.000 33\n",
|
||
"6 3.0 1.167 3.0 3.000 182\n",
|
||
"7 3.0 2.154 2.0 2.154 182\n",
|
||
"8 13.0 3.034 2.0 3.034 182\n",
|
||
"9 2.0 1.674 3.0 3.000 124"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def eval_reg(y_true, y_pred, name):\n",
|
||
" rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))\n",
|
||
" print(f\"{name:20s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse:.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"# -----------------------\n",
|
||
"# 1) ML prediction (log target -> original)\n",
|
||
"# -----------------------\n",
|
||
"y_pred_ml = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
|
||
"\n",
|
||
"# -----------------------\n",
|
||
"# 2) Business rules: minimum staffing\n",
|
||
"# (các cột có thể có/không có nên check)\n",
|
||
"# -----------------------\n",
|
||
"min_staff = np.ceil(X_test[\"num_tasks\"] / 150)\n",
|
||
"\n",
|
||
"if \"num_wc_tasks\" in X_test.columns:\n",
|
||
" min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_wc_tasks\"] / 40))\n",
|
||
"\n",
|
||
"if \"num_outdoor_tasks\" in X_test.columns:\n",
|
||
" min_staff = np.maximum(min_staff, np.ceil(X_test[\"num_outdoor_tasks\"] / 60))\n",
|
||
"\n",
|
||
"# Nếu bạn có cột shift (đêm / qua ngày) thì cộng thêm\n",
|
||
"for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
|
||
" if col in X_test.columns:\n",
|
||
" min_staff = min_staff + X_test[col].astype(int)\n",
|
||
"\n",
|
||
"# -----------------------\n",
|
||
"# 3) Final prediction\n",
|
||
"# -----------------------\n",
|
||
"y_pred_final = np.maximum(y_pred_ml, min_staff)\n",
|
||
"\n",
|
||
"# -----------------------\n",
|
||
"# 4) Evaluate\n",
|
||
"# -----------------------\n",
|
||
"print(\"\\n=== TEST RESULTS ===\")\n",
|
||
"eval_reg(y_test, y_pred_ml, \"ML only\")\n",
|
||
"eval_reg(y_test, y_pred_final,\"ML + Business Rules\")\n",
|
||
"\n",
|
||
"# -----------------------\n",
|
||
"# 5) Save file\n",
|
||
"# -----------------------\n",
|
||
"out = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred_ml\": np.round(y_pred_ml, 3),\n",
|
||
" \"min_staff_rule\": min_staff.astype(float).values,\n",
|
||
" \"y_pred_final\": np.round(y_pred_final, 3),\n",
|
||
" \"num_tasks\": X_test[\"num_tasks\"].values\n",
|
||
"})\n",
|
||
"\n",
|
||
"out.to_excel(\"test_predictions_ml_plus_business_rules.xlsx\", index=False)\n",
|
||
"print(\"✅ Saved: test_predictions_ml_plus_business_rules.xlsx\")\n",
|
||
"\n",
|
||
"out.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"id": "79387bd4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"✅ Best rule on VAL: {'k_tasks': 150, 'k_wc': 30, 'k_out': 40, 'mae_val': 1.0984917361586377}\n",
|
||
"\n",
|
||
"=== TEST EVAL ===\n",
|
||
"ML only | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
|
||
"ML + tuned business rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
|
||
"✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred_ml</th>\n",
|
||
" <th>min_staff_rule</th>\n",
|
||
" <th>y_pred_final</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.902</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.592</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>6.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.000</td>\n",
|
||
" <td>33</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.167</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>182</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.674</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>124</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred_ml min_staff_rule y_pred_final num_tasks\n",
|
||
"0 29.0 15.006 4.0 15.006 593\n",
|
||
"1 4.0 4.370 4.0 4.370 593\n",
|
||
"2 4.0 2.902 4.0 4.000 593\n",
|
||
"3 3.0 2.592 6.0 6.000 593\n",
|
||
"4 2.0 2.090 1.0 2.090 33\n",
|
||
"5 1.0 0.947 1.0 1.000 33\n",
|
||
"6 3.0 1.167 3.0 3.000 182\n",
|
||
"7 3.0 2.154 2.0 2.154 182\n",
|
||
"8 13.0 3.034 2.0 3.034 182\n",
|
||
"9 2.0 1.674 3.0 3.000 124"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def eval_reg(y_true, y_pred, name):\n",
|
||
" print(f\"{name:25s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"# ML preds (đã train gbr_reg)\n",
|
||
"pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
|
||
"pred_val = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
|
||
"pred_test = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
|
||
"\n",
|
||
"def compute_min_staff(X, k_tasks, k_wc=None, k_out=None):\n",
|
||
" ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
|
||
"\n",
|
||
" if (k_wc is not None) and (\"num_wc_tasks\" in X.columns):\n",
|
||
" ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
|
||
"\n",
|
||
" if (k_out is not None) and (\"num_outdoor_tasks\" in X.columns):\n",
|
||
" ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
|
||
"\n",
|
||
" for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
|
||
" if col in X.columns:\n",
|
||
" ms = ms + X[col].astype(int)\n",
|
||
" return ms\n",
|
||
"\n",
|
||
"# Grid search trên VAL\n",
|
||
"k_tasks_list = [100, 120, 150, 180, 200]\n",
|
||
"k_wc_list = [25, 30, 40, 50, None] # None = bỏ rule wc\n",
|
||
"k_out_list = [40, 60, 80, None] # None = bỏ rule outdoor\n",
|
||
"\n",
|
||
"best = None\n",
|
||
"\n",
|
||
"for kt in k_tasks_list:\n",
|
||
" for kw in k_wc_list:\n",
|
||
" for ko in k_out_list:\n",
|
||
" ms_val = compute_min_staff(X_val, kt, kw, ko)\n",
|
||
" pred_val_final = np.maximum(pred_val, ms_val)\n",
|
||
" mae_val = mean_absolute_error(y_val, pred_val_final)\n",
|
||
"\n",
|
||
" if (best is None) or (mae_val < best[\"mae_val\"]):\n",
|
||
" best = {\"k_tasks\": kt, \"k_wc\": kw, \"k_out\": ko, \"mae_val\": mae_val}\n",
|
||
"\n",
|
||
"print(\"✅ Best rule on VAL:\", best)\n",
|
||
"\n",
|
||
"# Apply best rule to TEST\n",
|
||
"ms_test = compute_min_staff(X_test, best[\"k_tasks\"], best[\"k_wc\"], best[\"k_out\"])\n",
|
||
"pred_test_final = np.maximum(pred_test, ms_test)\n",
|
||
"\n",
|
||
"print(\"\\n=== TEST EVAL ===\")\n",
|
||
"eval_reg(y_test, pred_test, \"ML only\")\n",
|
||
"eval_reg(y_test, pred_test_final, \"ML + tuned business rules\")\n",
|
||
"\n",
|
||
"# Save predictions\n",
|
||
"out = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred_ml\": np.round(pred_test, 3),\n",
|
||
" \"min_staff_rule\": ms_test.astype(float).values,\n",
|
||
" \"y_pred_final\": np.round(pred_test_final, 3),\n",
|
||
" \"num_tasks\": X_test[\"num_tasks\"].values\n",
|
||
"})\n",
|
||
"out.to_excel(\"test_predictions_ml_plus_tuned_rules.xlsx\", index=False)\n",
|
||
"print(\"✅ Saved: test_predictions_ml_plus_tuned_rules.xlsx\")\n",
|
||
"out.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"id": "6e9841d0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Class distribution:\n",
|
||
"Train:\n",
|
||
"so_luong\n",
|
||
"0 207\n",
|
||
"1 39\n",
|
||
"2 37\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Val:\n",
|
||
"so_luong\n",
|
||
"0 53\n",
|
||
"1 4\n",
|
||
"2 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Test:\n",
|
||
"so_luong\n",
|
||
"0 43\n",
|
||
"1 5\n",
|
||
"2 5\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ===== CELL 9: Create staff size class =====\n",
|
||
"\n",
|
||
"def staff_class(y):\n",
|
||
" if y <= 5:\n",
|
||
" return 0\n",
|
||
" elif y <= 10:\n",
|
||
" return 1\n",
|
||
" else:\n",
|
||
" return 2\n",
|
||
"\n",
|
||
"y_train_cls = y_train.apply(staff_class)\n",
|
||
"y_val_cls = y_val.apply(staff_class)\n",
|
||
"y_test_cls = y_test.apply(staff_class)\n",
|
||
"\n",
|
||
"print(\"Class distribution:\")\n",
|
||
"for name, y in [(\"Train\", y_train_cls), (\"Val\", y_val_cls), (\"Test\", y_test_cls)]:\n",
|
||
" print(f\"{name}:\")\n",
|
||
" print(y.value_counts().sort_index())\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "daf4acc7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
||
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
|
||
"\n",
|
||
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
||
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
||
"Please also refer to the documentation for alternative solver options:\n",
|
||
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
||
" n_iter_i = _check_optimize_result(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== VAL ===\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: LR_balanced (VAL)\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.956 0.811 0.878 53\n",
|
||
" 1 0.111 0.250 0.154 4\n",
|
||
" 2 0.000 0.000 0.000 1\n",
|
||
"\n",
|
||
" accuracy 0.759 58\n",
|
||
" macro avg 0.356 0.354 0.344 58\n",
|
||
"weighted avg 0.881 0.759 0.813 58\n",
|
||
"\n",
|
||
"Confusion matrix:\n",
|
||
" [[43 7 3]\n",
|
||
" [ 2 1 1]\n",
|
||
" [ 0 1 0]]\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: HGB (VAL)\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.946 1.000 0.972 53\n",
|
||
" 1 0.500 0.250 0.333 4\n",
|
||
" 2 0.000 0.000 0.000 1\n",
|
||
"\n",
|
||
" accuracy 0.931 58\n",
|
||
" macro avg 0.482 0.417 0.435 58\n",
|
||
"weighted avg 0.899 0.931 0.912 58\n",
|
||
"\n",
|
||
"Confusion matrix:\n",
|
||
" [[53 0 0]\n",
|
||
" [ 3 1 0]\n",
|
||
" [ 0 1 0]]\n",
|
||
"\n",
|
||
"=== TEST ===\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: LR_balanced (TEST)\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.914 0.744 0.821 43\n",
|
||
" 1 0.231 0.600 0.333 5\n",
|
||
" 2 0.000 0.000 0.000 5\n",
|
||
"\n",
|
||
" accuracy 0.660 53\n",
|
||
" macro avg 0.382 0.448 0.385 53\n",
|
||
"weighted avg 0.764 0.660 0.697 53\n",
|
||
"\n",
|
||
"Confusion matrix:\n",
|
||
" [[32 7 4]\n",
|
||
" [ 1 3 1]\n",
|
||
" [ 2 3 0]]\n",
|
||
"\n",
|
||
"================================================================================\n",
|
||
"MODEL: HGB (TEST)\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.925 0.860 0.892 43\n",
|
||
" 1 0.200 0.200 0.200 5\n",
|
||
" 2 0.125 0.200 0.154 5\n",
|
||
"\n",
|
||
" accuracy 0.736 53\n",
|
||
" macro avg 0.417 0.420 0.415 53\n",
|
||
"weighted avg 0.781 0.736 0.757 53\n",
|
||
"\n",
|
||
"Confusion matrix:\n",
|
||
" [[37 2 4]\n",
|
||
" [ 1 1 3]\n",
|
||
" [ 2 2 1]]\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
|
||
"from sklearn.metrics import classification_report, confusion_matrix\n",
|
||
"\n",
|
||
"# ---- 1) Models ----\n",
|
||
"clf_lr = LogisticRegression(\n",
|
||
" max_iter=5000,\n",
|
||
" class_weight=\"balanced\",\n",
|
||
" solver=\"lbfgs\",\n",
|
||
" multi_class=\"auto\",\n",
|
||
" random_state=42\n",
|
||
")\n",
|
||
"\n",
|
||
"clf_hgb = HistGradientBoostingClassifier(\n",
|
||
" max_depth=3,\n",
|
||
" learning_rate=0.05,\n",
|
||
" max_iter=500,\n",
|
||
" random_state=42\n",
|
||
")\n",
|
||
"\n",
|
||
"# ---- 2) Fit ----\n",
|
||
"clf_lr.fit(X_train, y_train_cls)\n",
|
||
"clf_hgb.fit(X_train, y_train_cls)\n",
|
||
"\n",
|
||
"# ---- 3) Predict (proba) ----\n",
|
||
"proba_val_lr = clf_lr.predict_proba(X_val)\n",
|
||
"proba_test_lr = clf_lr.predict_proba(X_test)\n",
|
||
"\n",
|
||
"proba_val_hgb = clf_hgb.predict_proba(X_val)\n",
|
||
"proba_test_hgb = clf_hgb.predict_proba(X_test)\n",
|
||
"\n",
|
||
"pred_val_lr = np.argmax(proba_val_lr, axis=1)\n",
|
||
"pred_test_lr = np.argmax(proba_test_lr, axis=1)\n",
|
||
"\n",
|
||
"pred_val_hgb = np.argmax(proba_val_hgb, axis=1)\n",
|
||
"pred_test_hgb = np.argmax(proba_test_hgb, axis=1)\n",
|
||
"\n",
|
||
"def show_clf(name, y_true, y_pred):\n",
|
||
" print(\"\\n\" + \"=\"*80)\n",
|
||
" print(\"MODEL:\", name)\n",
|
||
" print(classification_report(y_true, y_pred, digits=3))\n",
|
||
" print(\"Confusion matrix:\\n\", confusion_matrix(y_true, y_pred))\n",
|
||
"\n",
|
||
"print(\"\\n=== VAL ===\")\n",
|
||
"show_clf(\"LR_balanced (VAL)\", y_val_cls, pred_val_lr)\n",
|
||
"show_clf(\"HGB (VAL)\", y_val_cls, pred_val_hgb)\n",
|
||
"\n",
|
||
"print(\"\\n=== TEST ===\")\n",
|
||
"show_clf(\"LR_balanced (TEST)\", y_test_cls, pred_test_lr)\n",
|
||
"show_clf(\"HGB (TEST)\", y_test_cls, pred_test_hgb)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "38c8d00e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"✅ Best (min_hc, min_2424) on VAL: {'min_hc': 0, 'min_2424': 0, 'mae_val': 1.0984917361586377}\n",
|
||
"\n",
|
||
"=== TEST EVAL ===\n",
|
||
"ML only (GBR log) | MAE=2.669 | RMSE=4.565 | R2=0.097\n",
|
||
"ML + tuned base rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
|
||
"ML + base + loai_ca rules | MAE=2.600 | RMSE=4.440 | R2=0.146\n",
|
||
"✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>y_true</th>\n",
|
||
" <th>y_pred_ml</th>\n",
|
||
" <th>min_staff_base</th>\n",
|
||
" <th>min_staff_ca</th>\n",
|
||
" <th>min_staff_final</th>\n",
|
||
" <th>y_pred_final</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" <th>loai_ca</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15.006</td>\n",
|
||
" <td>593</td>\n",
|
||
" <td>Hành chính</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.370</td>\n",
|
||
" <td>593</td>\n",
|
||
" <td>Ca sáng</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.902</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>4.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" <td>Ca chiều</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.592</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>6.000</td>\n",
|
||
" <td>593</td>\n",
|
||
" <td>Ca đêm</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2.090</td>\n",
|
||
" <td>33</td>\n",
|
||
" <td>Hành chính</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.000</td>\n",
|
||
" <td>33</td>\n",
|
||
" <td>Part time</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1.167</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>182</td>\n",
|
||
" <td>Ca sáng</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.154</td>\n",
|
||
" <td>182</td>\n",
|
||
" <td>Ca chiều</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3.034</td>\n",
|
||
" <td>182</td>\n",
|
||
" <td>Hành chính</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>1.674</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>3.000</td>\n",
|
||
" <td>124</td>\n",
|
||
" <td>Ca sáng</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" y_true y_pred_ml min_staff_base min_staff_ca min_staff_final \\\n",
|
||
"0 29.0 15.006 4.0 0.0 4.0 \n",
|
||
"1 4.0 4.370 4.0 0.0 4.0 \n",
|
||
"2 4.0 2.902 4.0 0.0 4.0 \n",
|
||
"3 3.0 2.592 6.0 0.0 6.0 \n",
|
||
"4 2.0 2.090 1.0 0.0 1.0 \n",
|
||
"5 1.0 0.947 1.0 0.0 1.0 \n",
|
||
"6 3.0 1.167 3.0 0.0 3.0 \n",
|
||
"7 3.0 2.154 2.0 0.0 2.0 \n",
|
||
"8 13.0 3.034 2.0 0.0 2.0 \n",
|
||
"9 2.0 1.674 3.0 0.0 3.0 \n",
|
||
"\n",
|
||
" y_pred_final num_tasks loai_ca \n",
|
||
"0 15.006 593 Hành chính \n",
|
||
"1 4.370 593 Ca sáng \n",
|
||
"2 4.000 593 Ca chiều \n",
|
||
"3 6.000 593 Ca đêm \n",
|
||
"4 2.090 33 Hành chính \n",
|
||
"5 1.000 33 Part time \n",
|
||
"6 3.000 182 Ca sáng \n",
|
||
"7 2.154 182 Ca chiều \n",
|
||
"8 3.034 182 Hành chính \n",
|
||
"9 3.000 124 Ca sáng "
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"def rmse(y, yhat):\n",
|
||
" return float(np.sqrt(mean_squared_error(y, yhat)))\n",
|
||
"\n",
|
||
"def eval_reg(y_true, y_pred, name):\n",
|
||
" print(f\"{name:28s} | MAE={mean_absolute_error(y_true, y_pred):.3f} | RMSE={rmse(y_true, y_pred):.3f} | R2={r2_score(y_true, y_pred):.3f}\")\n",
|
||
"\n",
|
||
"# ========= 1) ML predictions (GBR log-target) =========\n",
|
||
"pred_train = np.maximum(0, np.expm1(gbr_reg.predict(X_train)))\n",
|
||
"pred_val = np.maximum(0, np.expm1(gbr_reg.predict(X_val)))\n",
|
||
"pred_test = np.maximum(0, np.expm1(gbr_reg.predict(X_test)))\n",
|
||
"\n",
|
||
"# ========= 2) Base min-staff rule (tuned earlier) =========\n",
|
||
"# dùng lại best bạn đã tìm: k_tasks=150, k_wc=30, k_out=40\n",
|
||
"def compute_min_staff_base(X, k_tasks=150, k_wc=30, k_out=40):\n",
|
||
" ms = np.ceil(X[\"num_tasks\"] / k_tasks)\n",
|
||
"\n",
|
||
" if \"num_wc_tasks\" in X.columns:\n",
|
||
" ms = np.maximum(ms, np.ceil(X[\"num_wc_tasks\"] / k_wc))\n",
|
||
"\n",
|
||
" if \"num_outdoor_tasks\" in X.columns:\n",
|
||
" ms = np.maximum(ms, np.ceil(X[\"num_outdoor_tasks\"] / k_out))\n",
|
||
"\n",
|
||
" # bonus theo ca đêm / cross-day nếu có\n",
|
||
" for col in [\"is_night_shift\", \"is_cross_day\"]:\n",
|
||
" if col in X.columns:\n",
|
||
" ms = ms + X[col].astype(int)\n",
|
||
"\n",
|
||
" return ms\n",
|
||
"\n",
|
||
"# ========= 3) Tune MIN staff theo loai_ca trên VAL =========\n",
|
||
"# Ta tune 2 tham số: min_HC (Hành chính), min_2424 (24/24)\n",
|
||
"# Nếu loai_ca đang one-hot, ta sẽ suy ra label từ cột gốc nếu có\n",
|
||
"def get_loai_ca_series(X):\n",
|
||
" # ưu tiên nếu còn cột gốc 'loai_ca' (string)\n",
|
||
" if \"loai_ca\" in X.columns and X[\"loai_ca\"].dtype == \"object\":\n",
|
||
" return X[\"loai_ca\"].astype(str)\n",
|
||
" # nếu đã one-hot: tìm các cột bắt đầu bằng \"loai_ca_\"\n",
|
||
" onehot_cols = [c for c in X.columns if c.startswith(\"loai_ca_\")]\n",
|
||
" if onehot_cols:\n",
|
||
" # lấy tên category có value True/1\n",
|
||
" def decode_row(row):\n",
|
||
" for c in onehot_cols:\n",
|
||
" if row[c] == 1 or row[c] is True:\n",
|
||
" return c.replace(\"loai_ca_\", \"\")\n",
|
||
" return \"UNKNOWN\"\n",
|
||
" return X[onehot_cols].apply(decode_row, axis=1)\n",
|
||
" return pd.Series([\"UNKNOWN\"] * len(X), index=X.index)\n",
|
||
"\n",
|
||
"loai_ca_val = get_loai_ca_series(X_val)\n",
|
||
"loai_ca_test = get_loai_ca_series(X_test)\n",
|
||
"loai_ca_train = get_loai_ca_series(X_train)\n",
|
||
"\n",
|
||
"def apply_ca_rule(loai_ca_series, min_hc, min_2424):\n",
|
||
" # map tên ca -> min staff\n",
|
||
" # bạn có thể thêm biến thể viết khác nếu dữ liệu có\n",
|
||
" lc = loai_ca_series.str.lower()\n",
|
||
" min_by_ca = np.zeros(len(lc), dtype=float)\n",
|
||
"\n",
|
||
" # Hành chính\n",
|
||
" mask_hc = lc.str.contains(\"hành chính\") | lc.str.contains(\"hanh chinh\")\n",
|
||
" min_by_ca[mask_hc.values] = min_hc\n",
|
||
"\n",
|
||
" # 24/24 hoặc 24-24\n",
|
||
" mask_2424 = lc.str.contains(\"24/24\") | lc.str.contains(\"24-24\") | lc.str.contains(\"24 24\")\n",
|
||
" min_by_ca[mask_2424.values] = np.maximum(min_by_ca[mask_2424.values], min_2424)\n",
|
||
"\n",
|
||
" return min_by_ca\n",
|
||
"\n",
|
||
"best = None\n",
|
||
"min_hc_list = [0, 4, 6, 8, 10]\n",
|
||
"min_2424_list = [0, 6, 8, 10, 12]\n",
|
||
"\n",
|
||
"base_val = compute_min_staff_base(X_val, 150, 30, 40)\n",
|
||
"\n",
|
||
"for mhc in min_hc_list:\n",
|
||
" for m24 in min_2424_list:\n",
|
||
" min_ca = apply_ca_rule(loai_ca_val, mhc, m24)\n",
|
||
" min_staff = np.maximum(base_val, min_ca)\n",
|
||
" pred_final = np.maximum(pred_val, min_staff)\n",
|
||
"\n",
|
||
" mae_val = mean_absolute_error(y_val, pred_final)\n",
|
||
" if (best is None) or (mae_val < best[\"mae_val\"]):\n",
|
||
" best = {\"min_hc\": mhc, \"min_2424\": m24, \"mae_val\": mae_val}\n",
|
||
"\n",
|
||
"print(\"✅ Best (min_hc, min_2424) on VAL:\", best)\n",
|
||
"\n",
|
||
"# ========= 4) Apply best rule to TEST =========\n",
|
||
"base_test = compute_min_staff_base(X_test, 150, 30, 40)\n",
|
||
"min_ca_test = apply_ca_rule(loai_ca_test, best[\"min_hc\"], best[\"min_2424\"])\n",
|
||
"min_staff_test = np.maximum(base_test, min_ca_test)\n",
|
||
"\n",
|
||
"pred_test_final = np.maximum(pred_test, min_staff_test)\n",
|
||
"\n",
|
||
"print(\"\\n=== TEST EVAL ===\")\n",
|
||
"eval_reg(y_test, pred_test, \"ML only (GBR log)\")\n",
|
||
"eval_reg(y_test, np.maximum(pred_test, base_test), \"ML + tuned base rules\")\n",
|
||
"eval_reg(y_test, pred_test_final, \"ML + base + loai_ca rules\")\n",
|
||
"\n",
|
||
"# ========= 5) Save excel =========\n",
|
||
"out = pd.DataFrame({\n",
|
||
" \"y_true\": y_test.values,\n",
|
||
" \"y_pred_ml\": np.round(pred_test, 3),\n",
|
||
" \"min_staff_base\": base_test.astype(float),\n",
|
||
" \"min_staff_ca\": min_ca_test.astype(float),\n",
|
||
" \"min_staff_final\": min_staff_test.astype(float),\n",
|
||
" \"y_pred_final\": np.round(pred_test_final, 3),\n",
|
||
" \"num_tasks\": X_test[\"num_tasks\"].values if \"num_tasks\" in X_test.columns else np.nan,\n",
|
||
" \"loai_ca\": loai_ca_test.values\n",
|
||
"})\n",
|
||
"out.to_excel(\"test_predictions_ml_plus_rules_plus_ca.xlsx\", index=False)\n",
|
||
"print(\"✅ Saved: test_predictions_ml_plus_rules_plus_ca.xlsx\")\n",
|
||
"\n",
|
||
"out.head(10)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "6dc15922",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Loaded: final_2.xlsx | sheet: final\n",
|
||
"Shape (raw): (401, 42)\n",
|
||
"Shape (dedup): (394, 42)\n",
|
||
"\n",
|
||
"=== Target summary (so_luong) ===\n",
|
||
"count 394.000000\n",
|
||
"mean 4.710660\n",
|
||
"std 6.848602\n",
|
||
"min 0.000000\n",
|
||
"25% 1.000000\n",
|
||
"50% 2.000000\n",
|
||
"75% 5.000000\n",
|
||
"max 64.000000\n",
|
||
"Name: so_luong, dtype: float64\n",
|
||
"\n",
|
||
"=== staff_band distribution ===\n",
|
||
"staff_band\n",
|
||
"0 216\n",
|
||
"1 87\n",
|
||
"2 48\n",
|
||
"3 27\n",
|
||
"4 16\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Sample rows:\n",
|
||
" ma_dia_diem loai_ca tong_gio_lam num_tasks so_luong staff_band\n",
|
||
"0 115-2 Part time 4.0 7 1 0\n",
|
||
"1 101-1 Hành chính 7.5 441 24 4\n",
|
||
"2 101-1 Ca sáng 8.0 441 3 1\n",
|
||
"3 101-1 Ca chiều 8.0 441 5 1\n",
|
||
"4 101-1 Ca đêm 8.0 441 1 0\n",
|
||
"5 101-1 Ca gãy 7.5 441 1 0\n",
|
||
"6 101-1 Hành chính 9.5 441 22 4\n",
|
||
"7 101-2 Hành chính 9.5 135 8 2\n",
|
||
"8 101-2 Ca gãy 7.5 135 1 0\n",
|
||
"9 101-2 Ca đêm 7.5 135 1 0\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"DATA_PATH = \"final_2.xlsx\"\n",
|
||
"SHEET_NAME = \"final\"\n",
|
||
"\n",
|
||
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
|
||
"print(\"Loaded:\", DATA_PATH, \"| sheet:\", SHEET_NAME)\n",
|
||
"print(\"Shape (raw):\", df.shape)\n",
|
||
"\n",
|
||
"# drop duplicates\n",
|
||
"df = df.drop_duplicates()\n",
|
||
"print(\"Shape (dedup):\", df.shape)\n",
|
||
"\n",
|
||
"# target\n",
|
||
"assert \"so_luong\" in df.columns, \"Missing target so_luong\"\n",
|
||
"\n",
|
||
"# ---- Define ordinal bins (bậc nhân sự) ----\n",
|
||
"# 0: 0-2 (rất nhỏ)\n",
|
||
"# 1: 3-5 (nhỏ)\n",
|
||
"# 2: 6-10 (trung bình)\n",
|
||
"# 3: 11-20 (lớn)\n",
|
||
"# 4: >20 (rất lớn)\n",
|
||
"bins = [-0.1, 2, 5, 10, 20, 10**9]\n",
|
||
"labels = [0, 1, 2, 3, 4]\n",
|
||
"\n",
|
||
"df[\"staff_band\"] = pd.cut(df[\"so_luong\"], bins=bins, labels=labels).astype(int)\n",
|
||
"\n",
|
||
"print(\"\\n=== Target summary (so_luong) ===\")\n",
|
||
"print(df[\"so_luong\"].describe())\n",
|
||
"\n",
|
||
"print(\"\\n=== staff_band distribution ===\")\n",
|
||
"print(df[\"staff_band\"].value_counts().sort_index())\n",
|
||
"\n",
|
||
"print(\"\\nSample rows:\")\n",
|
||
"print(df[[\"ma_dia_diem\",\"loai_ca\",\"tong_gio_lam\",\"num_tasks\",\"so_luong\",\"staff_band\"]].head(10))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"id": "666be810",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"=== SPLIT SUMMARY (by ma_dia_diem) ===\n",
|
||
"Buildings: 192\n",
|
||
"Train buildings: 138 | rows: 282\n",
|
||
"Val buildings: 29 | rows: 56\n",
|
||
"Test buildings: 25 | rows: 56\n",
|
||
"\n",
|
||
"Leakage check (should be 0):\n",
|
||
"Train ∩ Val : 0\n",
|
||
"Train ∩ Test: 0\n",
|
||
"Val ∩ Test: 0\n",
|
||
"\n",
|
||
"=== staff_band distribution ===\n",
|
||
"\n",
|
||
"Train:\n",
|
||
"staff_band\n",
|
||
"0 155\n",
|
||
"1 55\n",
|
||
"2 38\n",
|
||
"3 22\n",
|
||
"4 12\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Val:\n",
|
||
"staff_band\n",
|
||
"0 28\n",
|
||
"1 20\n",
|
||
"2 5\n",
|
||
"3 3\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Test:\n",
|
||
"staff_band\n",
|
||
"0 33\n",
|
||
"1 12\n",
|
||
"2 5\n",
|
||
"3 2\n",
|
||
"4 4\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"SEED = 42\n",
|
||
"TRAIN_RATIO = 0.72\n",
|
||
"VAL_RATIO = 0.15\n",
|
||
"TEST_RATIO = 0.13\n",
|
||
"\n",
|
||
"assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9\n",
|
||
"\n",
|
||
"# 1) lấy danh sách tòa nhà (group)\n",
|
||
"buildings = df[\"ma_dia_diem\"].astype(str).unique()\n",
|
||
"rng = np.random.RandomState(SEED)\n",
|
||
"rng.shuffle(buildings)\n",
|
||
"\n",
|
||
"n = len(buildings)\n",
|
||
"n_train = int(round(n * TRAIN_RATIO))\n",
|
||
"n_val = int(round(n * VAL_RATIO))\n",
|
||
"# phần còn lại là test\n",
|
||
"train_b = set(buildings[:n_train])\n",
|
||
"val_b = set(buildings[n_train:n_train+n_val])\n",
|
||
"test_b = set(buildings[n_train+n_val:])\n",
|
||
"\n",
|
||
"# 2) tạo mask theo group\n",
|
||
"train_mask = df[\"ma_dia_diem\"].astype(str).isin(train_b)\n",
|
||
"val_mask = df[\"ma_dia_diem\"].astype(str).isin(val_b)\n",
|
||
"test_mask = df[\"ma_dia_diem\"].astype(str).isin(test_b)\n",
|
||
"\n",
|
||
"df_train = df[train_mask].copy()\n",
|
||
"df_val = df[val_mask].copy()\n",
|
||
"df_test = df[test_mask].copy()\n",
|
||
"\n",
|
||
"print(\"=== SPLIT SUMMARY (by ma_dia_diem) ===\")\n",
|
||
"print(\"Buildings:\", n)\n",
|
||
"print(\"Train buildings:\", len(train_b), \"| rows:\", df_train.shape[0])\n",
|
||
"print(\"Val buildings:\", len(val_b), \"| rows:\", df_val.shape[0])\n",
|
||
"print(\"Test buildings:\", len(test_b), \"| rows:\", df_test.shape[0])\n",
|
||
"\n",
|
||
"# 3) leakage check (phải = 0)\n",
|
||
"train_set = set(df_train[\"ma_dia_diem\"].astype(str).unique())\n",
|
||
"val_set = set(df_val[\"ma_dia_diem\"].astype(str).unique())\n",
|
||
"test_set = set(df_test[\"ma_dia_diem\"].astype(str).unique())\n",
|
||
"\n",
|
||
"print(\"\\nLeakage check (should be 0):\")\n",
|
||
"print(\"Train ∩ Val :\", len(train_set & val_set))\n",
|
||
"print(\"Train ∩ Test:\", len(train_set & test_set))\n",
|
||
"print(\"Val ∩ Test:\", len(val_set & test_set))\n",
|
||
"\n",
|
||
"# 4) distribution staff_band theo từng tập\n",
|
||
"print(\"\\n=== staff_band distribution ===\")\n",
|
||
"for name, d in [(\"Train\", df_train), (\"Val\", df_val), (\"Test\", df_test)]:\n",
|
||
" vc = d[\"staff_band\"].value_counts().sort_index()\n",
|
||
" print(f\"\\n{name}:\")\n",
|
||
" print(vc)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"id": "d898cfe6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Categorical cols: ['loai_ca', 'bat_dau', 'ket_thuc']\n",
|
||
"\n",
|
||
"Shapes:\n",
|
||
"Train: (282, 115) (282,)\n",
|
||
"Val : (56, 115) (56,)\n",
|
||
"Test : (56, 115) (56,)\n",
|
||
"\n",
|
||
"Sample feature columns (first 25):\n",
|
||
"['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# ========= 1) Helper: parse time to hour float =========\n",
|
||
"def time_to_hour(x):\n",
|
||
" if pd.isna(x):\n",
|
||
" return np.nan\n",
|
||
" # nếu là kiểu datetime/time của pandas\n",
|
||
" if hasattr(x, \"hour\"):\n",
|
||
" return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
|
||
" s = str(x).strip()\n",
|
||
" # format \"HH:MM:SS\" hoặc \"HH:MM\"\n",
|
||
" if \":\" in s:\n",
|
||
" parts = s.split(\":\")\n",
|
||
" try:\n",
|
||
" h = float(parts[0])\n",
|
||
" m = float(parts[1]) if len(parts) > 1 else 0.0\n",
|
||
" return h + m/60.0\n",
|
||
" except:\n",
|
||
" return np.nan\n",
|
||
" # fallback\n",
|
||
" try:\n",
|
||
" return float(s)\n",
|
||
" except:\n",
|
||
" return np.nan\n",
|
||
"\n",
|
||
"def add_time_features(d):\n",
|
||
"    \"\"\"Return a copy of ``d`` extended with shift-time features.\n",
|
||
"\n",
|
||
"    Reads the ``bat_dau`` and ``ket_thuc`` columns and adds ``hour_start``,\n",
|
||
"    ``hour_end``, ``shift_length``, ``is_cross_day`` and four 0/1 flags that\n",
|
||
"    bucket the start hour. The input frame itself is not modified.\n",
|
||
"    \"\"\"\n",
|
||
"    out = d.copy()\n",
|
||
"    out[\"hour_start\"] = out[\"bat_dau\"].apply(time_to_hour)\n",
|
||
"    out[\"hour_end\"] = out[\"ket_thuc\"].apply(time_to_hour)\n",
|
||
"    start, end = out[\"hour_start\"], out[\"hour_end\"]\n",
|
||
"\n",
|
||
"    # An end earlier than the start means the shift crosses midnight, so the\n",
|
||
"    # end is moved forward by 24 hours before the length is computed.\n",
|
||
"    mask_cross = end.notna() & start.notna() & (end < start)\n",
|
||
"    end_adj = end.where(~mask_cross, end + 24.0)\n",
|
||
"\n",
|
||
"    out[\"shift_length\"] = (end_adj - start).clip(lower=0)\n",
|
||
"    out[\"is_cross_day\"] = mask_cross.astype(int)\n",
|
||
"\n",
|
||
"    # Start-hour buckets; a missing start becomes -1 and matches no bucket.\n",
|
||
"    hs = start.fillna(-1)\n",
|
||
"    buckets = (\n",
|
||
"        (\"is_morning_shift\", 6, 12),\n",
|
||
"        (\"is_afternoon_shift\", 12, 18),\n",
|
||
"        (\"is_evening_shift\", 18, 24),\n",
|
||
"        (\"is_night_shift\", 0, 6),\n",
|
||
"    )\n",
|
||
"    for flag, lo, hi in buckets:\n",
|
||
"        out[flag] = ((hs >= lo) & (hs < hi)).astype(int)\n",
|
||
"\n",
|
||
"    return out\n",
|
||
"\n",
|
||
"# ========= 2) Apply time features =========\n",
|
||
"df_train_fe, df_val_fe, df_test_fe = (\n",
|
||
"    add_time_features(part) for part in (df_train, df_val, df_test)\n",
|
||
")\n",
|
||
"\n",
|
||
"# ========= 3) Drop columns (the ones you requested) + leakage columns =========\n",
|
||
"DROP_COLS = [\"ma_dia_diem\", \"all_task_normal\", \"all_task_dinhky\", \"is_tasks_text_missing\"]\n",
|
||
"LEAK_COLS = [\"so_luong\"]  # true label -> must never be used as a feature\n",
|
||
"\n",
|
||
"\n",
|
||
"def _split_features_target(frame):\n",
|
||
"    \"\"\"Split one engineered frame into (X, y), keeping ``staff_band`` as y.\"\"\"\n",
|
||
"    unwanted = [c for c in (DROP_COLS + LEAK_COLS + [\"staff_band\"]) if c in frame.columns]\n",
|
||
"    return frame.drop(columns=unwanted), frame[\"staff_band\"].astype(int)\n",
|
||
"\n",
|
||
"\n",
|
||
"X_train, y_train = _split_features_target(df_train_fe)\n",
|
||
"X_val, y_val = _split_features_target(df_val_fe)\n",
|
||
"X_test, y_test = _split_features_target(df_test_fe)\n",
|
||
"\n",
|
||
"# ========= 4) One-hot only categorical columns =========\n",
|
||
"# NOTE(review): if bat_dau / ket_thuc are object columns they are one-hot\n",
|
||
"# encoded here in addition to the derived hour features - confirm intended.\n",
|
||
"cat_cols = [col for col, col_dtype in X_train.dtypes.items() if col_dtype == \"object\"]\n",
|
||
"print(\"Categorical cols:\", cat_cols)\n",
|
||
"\n",
|
||
"X_train, X_val, X_test = (\n",
|
||
"    pd.get_dummies(part, columns=cat_cols, dummy_na=True)\n",
|
||
"    for part in (X_train, X_val, X_test)\n",
|
||
")\n",
|
||
"\n",
|
||
"# Give val/test exactly the training columns: dummies that only occur in\n",
|
||
"# val/test are dropped, dummies missing there are added as all-zero columns.\n",
|
||
"X_train, X_val = X_train.align(X_val, join=\"left\", axis=1, fill_value=0)\n",
|
||
"X_train, X_test = X_train.align(X_test, join=\"left\", axis=1, fill_value=0)\n",
|
||
"\n",
|
||
"# Remaining NaNs become 0\n",
|
||
"X_train, X_val, X_test = (part.fillna(0) for part in (X_train, X_val, X_test))\n",
|
||
"\n",
|
||
"print(\"\\nShapes:\")\n",
|
||
"for _label, _X, _y in (\n",
|
||
"    (\"Train:\", X_train, y_train),\n",
|
||
"    (\"Val :\", X_val, y_val),\n",
|
||
"    (\"Test :\", X_test, y_test),\n",
|
||
"):\n",
|
||
"    print(_label, _X.shape, _y.shape)\n",
|
||
"\n",
|
||
"print(\"\\nSample feature columns (first 25):\")\n",
|
||
"print(X_train.columns[:25].tolist())\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "253b34f1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
||
"STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.\n",
|
||
"\n",
|
||
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
||
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
||
"Please also refer to the documentation for alternative solver options:\n",
|
||
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
||
" n_iter_i = _check_optimize_result(\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"======================================================================\n",
|
||
"MODEL: LR_balanced\n",
|
||
"Train | Acc=0.624 | MacroF1=0.552\n",
|
||
"Val | Acc=0.429 | MacroF1=0.277\n",
|
||
"Test | Acc=0.518 | MacroF1=0.402\n",
|
||
"\n",
|
||
"[VAL] Confusion matrix:\n",
|
||
"[[18 5 0 5]\n",
|
||
" [ 6 4 0 10]\n",
|
||
" [ 2 2 0 1]\n",
|
||
" [ 0 1 0 2]]\n",
|
||
"\n",
|
||
"[VAL] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.692 0.643 0.667 28\n",
|
||
" 1 0.333 0.200 0.250 20\n",
|
||
" 2 0.000 0.000 0.000 5\n",
|
||
" 3 0.111 0.667 0.190 3\n",
|
||
"\n",
|
||
" accuracy 0.429 56\n",
|
||
" macro avg 0.284 0.377 0.277 56\n",
|
||
"weighted avg 0.471 0.429 0.433 56\n",
|
||
"\n",
|
||
"\n",
|
||
"[TEST] Confusion matrix:\n",
|
||
"[[20 8 1 2 2]\n",
|
||
" [ 3 4 2 1 2]\n",
|
||
" [ 0 1 1 3 0]\n",
|
||
" [ 0 0 0 2 0]\n",
|
||
" [ 0 1 0 1 2]]\n",
|
||
"\n",
|
||
"[TEST] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.870 0.606 0.714 33\n",
|
||
" 1 0.286 0.333 0.308 12\n",
|
||
" 2 0.250 0.200 0.222 5\n",
|
||
" 3 0.222 1.000 0.364 2\n",
|
||
" 4 0.333 0.500 0.400 4\n",
|
||
"\n",
|
||
" accuracy 0.518 56\n",
|
||
" macro avg 0.392 0.528 0.402 56\n",
|
||
"weighted avg 0.628 0.518 0.548 56\n",
|
||
"\n",
|
||
"\n",
|
||
"======================================================================\n",
|
||
"MODEL: RF_balanced\n",
|
||
"Train | Acc=0.894 | MacroF1=0.875\n",
|
||
"Val | Acc=0.607 | MacroF1=0.424\n",
|
||
"Test | Acc=0.625 | MacroF1=0.453\n",
|
||
"\n",
|
||
"[VAL] Confusion matrix:\n",
|
||
"[[19 8 1 0]\n",
|
||
" [ 6 13 1 0]\n",
|
||
" [ 0 2 2 1]\n",
|
||
" [ 0 2 1 0]]\n",
|
||
"\n",
|
||
"[VAL] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.760 0.679 0.717 28\n",
|
||
" 1 0.520 0.650 0.578 20\n",
|
||
" 2 0.400 0.400 0.400 5\n",
|
||
" 3 0.000 0.000 0.000 3\n",
|
||
"\n",
|
||
" accuracy 0.607 56\n",
|
||
" macro avg 0.420 0.432 0.424 56\n",
|
||
"weighted avg 0.601 0.607 0.601 56\n",
|
||
"\n",
|
||
"\n",
|
||
"[TEST] Confusion matrix:\n",
|
||
"[[23 2 8 0 0]\n",
|
||
" [ 3 7 2 0 0]\n",
|
||
" [ 0 0 4 1 0]\n",
|
||
" [ 0 1 0 1 0]\n",
|
||
" [ 0 0 4 0 0]]\n",
|
||
"\n",
|
||
"[TEST] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.885 0.697 0.780 33\n",
|
||
" 1 0.700 0.583 0.636 12\n",
|
||
" 2 0.222 0.800 0.348 5\n",
|
||
" 3 0.500 0.500 0.500 2\n",
|
||
" 4 0.000 0.000 0.000 4\n",
|
||
"\n",
|
||
" accuracy 0.625 56\n",
|
||
" macro avg 0.461 0.516 0.453 56\n",
|
||
"weighted avg 0.709 0.625 0.645 56\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
|
||
"c:\\Users\\SLG PC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"======================================================================\n",
|
||
"MODEL: HGB\n",
|
||
"Train | Acc=0.989 | MacroF1=0.989\n",
|
||
"Val | Acc=0.589 | MacroF1=0.405\n",
|
||
"Test | Acc=0.518 | MacroF1=0.326\n",
|
||
"\n",
|
||
"[VAL] Confusion matrix:\n",
|
||
"[[22 5 1 0]\n",
|
||
" [ 8 9 2 1]\n",
|
||
" [ 1 1 2 1]\n",
|
||
" [ 1 2 0 0]]\n",
|
||
"\n",
|
||
"[VAL] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.688 0.786 0.733 28\n",
|
||
" 1 0.529 0.450 0.486 20\n",
|
||
" 2 0.400 0.400 0.400 5\n",
|
||
" 3 0.000 0.000 0.000 3\n",
|
||
"\n",
|
||
" accuracy 0.589 56\n",
|
||
" macro avg 0.404 0.409 0.405 56\n",
|
||
"weighted avg 0.569 0.589 0.576 56\n",
|
||
"\n",
|
||
"\n",
|
||
"[TEST] Confusion matrix:\n",
|
||
"[[22 6 4 1 0]\n",
|
||
" [ 4 5 3 0 0]\n",
|
||
" [ 1 2 1 1 0]\n",
|
||
" [ 0 1 1 0 0]\n",
|
||
" [ 0 1 2 0 1]]\n",
|
||
"\n",
|
||
"[TEST] Classification report:\n",
|
||
" precision recall f1-score support\n",
|
||
"\n",
|
||
" 0 0.815 0.667 0.733 33\n",
|
||
" 1 0.333 0.417 0.370 12\n",
|
||
" 2 0.091 0.200 0.125 5\n",
|
||
" 3 0.000 0.000 0.000 2\n",
|
||
" 4 1.000 0.250 0.400 4\n",
|
||
"\n",
|
||
" accuracy 0.518 56\n",
|
||
" macro avg 0.448 0.307 0.326 56\n",
|
||
"weighted avg 0.631 0.518 0.551 56\n",
|
||
"\n",
|
||
"\n",
|
||
"======================================================================\n",
|
||
"SUMMARY (sorted by Val MacroF1):\n",
|
||
"RF_balanced | Val MacroF1=0.424 | Test MacroF1=0.453\n",
|
||
"HGB | Val MacroF1=0.405 | Test MacroF1=0.326\n",
|
||
"LR_balanced | Val MacroF1=0.277 | Test MacroF1=0.402\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score\n",
|
||
"from sklearn.pipeline import make_pipeline\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"\n",
|
||
"def eval_cls(name, model, Xtr, ytr, Xva, yva, Xte, yte):\n",
|
||
"    \"\"\"Fit ``model`` on the training split and print metrics for every split.\n",
|
||
"\n",
|
||
"    Prints accuracy and macro-F1 for train/val/test, then the confusion matrix\n",
|
||
"    and classification report for the validation and test splits.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        name: label used in the printed header and in the returned dict.\n",
|
||
"        model: classifier exposing ``fit``/``predict``; it is fitted in place.\n",
|
||
"        Xtr, ytr, Xva, yva, Xte, yte: features and labels of the three splits.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        dict with keys ``name``, ``model`` (the fitted estimator),\n",
|
||
"        ``val_macro_f1`` and ``test_macro_f1``.\n",
|
||
"    \"\"\"\n",
|
||
"    model.fit(Xtr, ytr)\n",
|
||
"\n",
|
||
"    # One shared, sorted label set for both confusion matrices: without it each\n",
|
||
"    # matrix is sized from the labels present in that split alone, so the val\n",
|
||
"    # and test matrices can differ in shape and cannot be compared.\n",
|
||
"    labels = np.unique(np.concatenate([np.asarray(ytr), np.asarray(yva), np.asarray(yte)]))\n",
|
||
"\n",
|
||
"    def _metrics(X, y):\n",
|
||
"        pred = model.predict(X)\n",
|
||
"        acc = accuracy_score(y, pred)\n",
|
||
"        # zero_division=0 is the value sklearn already falls back to; passing\n",
|
||
"        # it explicitly only silences the UndefinedMetricWarning.\n",
|
||
"        f1m = f1_score(y, pred, average=\"macro\", zero_division=0)\n",
|
||
"        return pred, acc, f1m\n",
|
||
"\n",
|
||
"    _, acc_tr, f1_tr = _metrics(Xtr, ytr)\n",
|
||
"    pred_va, acc_va, f1_va = _metrics(Xva, yva)\n",
|
||
"    pred_te, acc_te, f1_te = _metrics(Xte, yte)\n",
|
||
"\n",
|
||
"    print(\"\\n\" + \"=\"*70)\n",
|
||
"    print(f\"MODEL: {name}\")\n",
|
||
"    print(f\"Train | Acc={acc_tr:.3f} | MacroF1={f1_tr:.3f}\")\n",
|
||
"    print(f\"Val | Acc={acc_va:.3f} | MacroF1={f1_va:.3f}\")\n",
|
||
"    print(f\"Test | Acc={acc_te:.3f} | MacroF1={f1_te:.3f}\")\n",
|
||
"\n",
|
||
"    print(\"\\n[VAL] Confusion matrix:\")\n",
|
||
"    print(confusion_matrix(yva, pred_va, labels=labels))\n",
|
||
"    print(\"\\n[VAL] Classification report:\")\n",
|
||
"    print(classification_report(yva, pred_va, digits=3, zero_division=0))\n",
|
||
"\n",
|
||
"    print(\"\\n[TEST] Confusion matrix:\")\n",
|
||
"    print(confusion_matrix(yte, pred_te, labels=labels))\n",
|
||
"    print(\"\\n[TEST] Classification report:\")\n",
|
||
"    print(classification_report(yte, pred_te, digits=3, zero_division=0))\n",
|
||
"\n",
|
||
"    return {\n",
|
||
"        \"name\": name,\n",
|
||
"        \"model\": model,\n",
|
||
"        \"val_macro_f1\": f1_va,\n",
|
||
"        \"test_macro_f1\": f1_te\n",
|
||
"    }\n",
|
||
"\n",
|
||
"# 1) Logistic Regression (balanced) - strong baseline for tabular data.\n",
|
||
"# Features are standardized first: on the raw columns, whose scales differ by\n",
|
||
"# orders of magnitude, lbfgs hit the iteration limit even with max_iter=5000.\n",
|
||
"lr = make_pipeline(\n",
|
||
"    StandardScaler(),\n",
|
||
"    LogisticRegression(max_iter=5000, class_weight=\"balanced\"),\n",
|
||
")\n",
|
||
"\n",
|
||
"# 2) RandomForest (balanced_subsample)\n",
|
||
"rf = RandomForestClassifier(\n",
|
||
"    n_estimators=600,\n",
|
||
"    max_depth=None,\n",
|
||
"    min_samples_leaf=2,\n",
|
||
"    random_state=42,\n",
|
||
"    class_weight=\"balanced_subsample\",\n",
|
||
"    n_jobs=-1\n",
|
||
")\n",
|
||
"\n",
|
||
"# 3) HistGradientBoosting (strong on tabular data, fast to train)\n",
|
||
"hgb = HistGradientBoostingClassifier(\n",
|
||
"    learning_rate=0.06,\n",
|
||
"    max_depth=6,\n",
|
||
"    max_iter=600,\n",
|
||
"    random_state=42\n",
|
||
")\n",
|
||
"\n",
|
||
"# Fit and evaluate every candidate on the same splits\n",
|
||
"results = [\n",
|
||
"    eval_cls(label, clf, X_train, y_train, X_val, y_val, X_test, y_test)\n",
|
||
"    for label, clf in ((\"LR_balanced\", lr), (\"RF_balanced\", rf), (\"HGB\", hgb))\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Summary\n",
|
||
"print(\"\\n\" + \"=\"*70)\n",
|
||
"print(\"SUMMARY (sorted by Val MacroF1):\")\n",
|
||
"for r in sorted(results, key=lambda x: x[\"val_macro_f1\"], reverse=True):\n",
|
||
"    print(f\"{r['name']:12s} | Val MacroF1={r['val_macro_f1']:.3f} | Test MacroF1={r['test_macro_f1']:.3f}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "e1851e78",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Loaded: final_2.xlsx | sheet: final\n",
|
||
"Shape (raw): (401, 42)\n",
|
||
"Shape (after dedup): (394, 42)\n",
|
||
"\n",
|
||
"=== TARGET SUMMARY (so_luong) ===\n",
|
||
"count 394.000000\n",
|
||
"mean 4.710660\n",
|
||
"std 6.848602\n",
|
||
"min 0.000000\n",
|
||
"25% 1.000000\n",
|
||
"50% 2.000000\n",
|
||
"75% 5.000000\n",
|
||
"max 64.000000\n",
|
||
"Name: so_luong, dtype: float64\n",
|
||
"Missing target: 0\n",
|
||
"Negative target: 0\n",
|
||
"Zero target: 3\n",
|
||
"\n",
|
||
"Sample rows:\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ma_dia_diem</th>\n",
|
||
" <th>all_task_normal</th>\n",
|
||
" <th>all_task_dinhky</th>\n",
|
||
" <th>loai_ca</th>\n",
|
||
" <th>bat_dau</th>\n",
|
||
" <th>ket_thuc</th>\n",
|
||
" <th>tong_gio_lam</th>\n",
|
||
" <th>so_ca_cua_toa</th>\n",
|
||
" <th>so_luong</th>\n",
|
||
" <th>num_tasks</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dien_tich_tham</th>\n",
|
||
" <th>doc_ham</th>\n",
|
||
" <th>vien_phan_quang</th>\n",
|
||
" <th>op_tuong</th>\n",
|
||
" <th>op_chan_tuong</th>\n",
|
||
" <th>ranh_thoat_nuoc</th>\n",
|
||
" <th>dien_tich_kinh</th>\n",
|
||
" <th>num_medical_tasks_total</th>\n",
|
||
" <th>num_indoor_room_tasks</th>\n",
|
||
" <th>is_tasks_text_missing</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>115-2</td>\n",
|
||
" <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Part time</td>\n",
|
||
" <td>06:30:00</td>\n",
|
||
" <td>10:30:00</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>20.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>101-1</td>\n",
|
||
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
|
||
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
|
||
" <td>Hành chính</td>\n",
|
||
" <td>06:30:00</td>\n",
|
||
" <td>16:00:00</td>\n",
|
||
" <td>7.5</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>24</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9176.0</td>\n",
|
||
" <td>89.0</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>894.0</td>\n",
|
||
" <td>112</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>101-1</td>\n",
|
||
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
|
||
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
|
||
" <td>Ca sáng</td>\n",
|
||
" <td>06:00:00</td>\n",
|
||
" <td>14:00:00</td>\n",
|
||
" <td>8.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>441</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>70</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9176.0</td>\n",
|
||
" <td>89.0</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>894.0</td>\n",
|
||
" <td>112</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3 rows × 42 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ma_dia_diem all_task_normal \\\n",
|
||
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
|
||
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
|
||
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
|
||
"\n",
|
||
" all_task_dinhky loai_ca bat_dau \\\n",
|
||
"0 NaN Part time 06:30:00 \n",
|
||
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
|
||
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
|
||
"\n",
|
||
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
|
||
"0 10:30:00 4.0 1 1 7 ... \n",
|
||
"1 16:00:00 7.5 6 24 441 ... \n",
|
||
"2 14:00:00 8.0 6 3 441 ... \n",
|
||
"\n",
|
||
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
|
||
"0 0.0 0 0 0.0 0.0 \n",
|
||
"1 0.0 70 0 9176.0 89.0 \n",
|
||
"2 0.0 70 0 9176.0 89.0 \n",
|
||
"\n",
|
||
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
|
||
"0 0 20.0 0 \n",
|
||
"1 25 894.0 112 \n",
|
||
"2 25 894.0 112 \n",
|
||
"\n",
|
||
" num_indoor_room_tasks is_tasks_text_missing \n",
|
||
"0 1 0 \n",
|
||
"1 39 0 \n",
|
||
"2 39 0 \n",
|
||
"\n",
|
||
"[3 rows x 42 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# CELL 1 — LOAD DATA & BASIC CLEAN\n",
|
||
"\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"DATA_PATH = \"final_2.xlsx\"\n",
|
||
"SHEET_NAME = \"final\"\n",
|
||
"\n",
|
||
"# 1. Read the raw sheet\n",
|
||
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
|
||
"print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
|
||
"print(\"Shape (raw):\", df.shape)\n",
|
||
"\n",
|
||
"# 2. Remove rows that are exact duplicates and renumber the index\n",
|
||
"df = df.drop_duplicates()\n",
|
||
"df = df.reset_index(drop=True)\n",
|
||
"print(\"Shape (after dedup):\", df.shape)\n",
|
||
"\n",
|
||
"# 3. The target column must exist before it is summarized\n",
|
||
"assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
|
||
"_target = df[\"so_luong\"]\n",
|
||
"\n",
|
||
"print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
|
||
"print(_target.describe())\n",
|
||
"print(\"Missing target:\", _target.isna().sum())\n",
|
||
"print(\"Negative target:\", _target.lt(0).sum())\n",
|
||
"print(\"Zero target:\", _target.eq(0).sum())\n",
|
||
"\n",
|
||
"# 4. Show the first rows\n",
|
||
"print(\"\\nSample rows:\")\n",
|
||
"display(df.iloc[:3])\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8c9268c5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# CELL 2 — FEATURE SELECTION (STRICT)\n",
|
||
"\n",
|
||
"# 1. List every column with its position\n",
|
||
"print(\"All columns:\")\n",
|
||
"for i, c in enumerate(df.columns):\n",
|
||
"    print(f\"{i:2d}: {c}\")\n",
|
||
"\n",
|
||
"# 2. Columns to exclude (as agreed). They are referenced by name, not by\n",
|
||
"#    position, so a reordered or extended sheet cannot silently drop the wrong\n",
|
||
"#    columns; a missing name makes the drop below raise KeyError instead.\n",
|
||
"DROP_COLS = [\n",
|
||
"    \"ma_dia_diem\",\n",
|
||
"    \"all_task_normal\",\n",
|
||
"    \"all_task_dinhky\",\n",
|
||
"    \"is_tasks_text_missing\",\n",
|
||
"]\n",
|
||
"\n",
|
||
"print(\"\\nDropped columns:\")\n",
|
||
"for c in DROP_COLS:\n",
|
||
"    print(\" -\", c)\n",
|
||
"\n",
|
||
"# 3. Build X, y\n",
|
||
"X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
|
||
"y = df[\"so_luong\"].astype(float)\n",
|
||
"\n",
|
||
"print(\"\\nShapes:\")\n",
|
||
"print(\"X:\", X.shape)\n",
|
||
"print(\"y:\", y.shape)\n",
|
||
"\n",
|
||
"# 4. Check feature dtypes\n",
|
||
"print(\"\\nFeature dtypes:\")\n",
|
||
"display(X.dtypes.value_counts())\n",
|
||
"\n",
|
||
"# 5. Check missing values\n",
|
||
"print(\"\\nMissing values in X:\")\n",
|
||
"display(X.isna().sum().sort_values(ascending=False).head(10))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b975f6cf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1a595fe8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b2fb9c84",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "base",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|