predict_caLamviec_nhansu/train.ipynb

2158 lines
70 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"id": "e1667110",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded: final_2.xlsx | sheet: final\n",
"Shape (raw): (401, 42)\n",
"Shape (after dedup): (394, 42)\n",
"\n",
"=== TARGET SUMMARY (so_luong) ===\n",
"count 394.000000\n",
"mean 4.710660\n",
"std 6.848602\n",
"min 0.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 64.000000\n",
"Name: so_luong, dtype: float64\n",
"Missing target: 0\n",
"Negative target: 0\n",
"Zero target: 3\n",
"\n",
"Sample rows:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ma_dia_diem</th>\n",
" <th>all_task_normal</th>\n",
" <th>all_task_dinhky</th>\n",
" <th>loai_ca</th>\n",
" <th>bat_dau</th>\n",
" <th>ket_thuc</th>\n",
" <th>tong_gio_lam</th>\n",
" <th>so_ca_cua_toa</th>\n",
" <th>so_luong</th>\n",
" <th>num_tasks</th>\n",
" <th>...</th>\n",
" <th>dien_tich_tham</th>\n",
" <th>doc_ham</th>\n",
" <th>vien_phan_quang</th>\n",
" <th>op_tuong</th>\n",
" <th>op_chan_tuong</th>\n",
" <th>ranh_thoat_nuoc</th>\n",
" <th>dien_tich_kinh</th>\n",
" <th>num_medical_tasks_total</th>\n",
" <th>num_indoor_room_tasks</th>\n",
" <th>is_tasks_text_missing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>115-2</td>\n",
" <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
" <td>NaN</td>\n",
" <td>Part time</td>\n",
" <td>06:30:00</td>\n",
" <td>10:30:00</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Hành chính</td>\n",
" <td>06:30:00</td>\n",
" <td>16:00:00</td>\n",
" <td>7.5</td>\n",
" <td>6</td>\n",
" <td>24</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>101-1</td>\n",
" <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
" <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
" <td>Ca sáng</td>\n",
" <td>06:00:00</td>\n",
" <td>14:00:00</td>\n",
" <td>8.0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>441</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>70</td>\n",
" <td>0</td>\n",
" <td>9176.0</td>\n",
" <td>89.0</td>\n",
" <td>25</td>\n",
" <td>894.0</td>\n",
" <td>112</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 42 columns</p>\n",
"</div>"
],
"text/plain": [
" ma_dia_diem all_task_normal \\\n",
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"\n",
" all_task_dinhky loai_ca bat_dau \\\n",
"0 NaN Part time 06:30:00 \n",
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
"\n",
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
"0 10:30:00 4.0 1 1 7 ... \n",
"1 16:00:00 7.5 6 24 441 ... \n",
"2 14:00:00 8.0 6 3 441 ... \n",
"\n",
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
"0 0.0 0 0 0.0 0.0 \n",
"1 0.0 70 0 9176.0 89.0 \n",
"2 0.0 70 0 9176.0 89.0 \n",
"\n",
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
"0 0 20.0 0 \n",
"1 25 894.0 112 \n",
"2 25 894.0 112 \n",
"\n",
" num_indoor_room_tasks is_tasks_text_missing \n",
"0 1 0 \n",
"1 39 0 \n",
"2 39 0 \n",
"\n",
"[3 rows x 42 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 1 — LOAD DATA & BASIC CLEAN\n",
"\n",
"import pandas as pd\n",
"\n",
"DATA_PATH = \"final_2.xlsx\"\n",
"SHEET_NAME = \"final\"\n",
"\n",
"# 1. Load\n",
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
"print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
"print(\"Shape (raw):\", df.shape)\n",
"\n",
"# 2. Drop duplicate full rows\n",
"df = df.drop_duplicates().reset_index(drop=True)\n",
"print(\"Shape (after dedup):\", df.shape)\n",
"\n",
"# 3. Check target\n",
"assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
"\n",
"print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
"print(df[\"so_luong\"].describe())\n",
"print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n",
"print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n",
"print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n",
"\n",
"# 4. Peek data\n",
"print(\"\\nSample rows:\")\n",
"display(df.head(3))\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "5601efad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All columns:\n",
" 0: ma_dia_diem\n",
" 1: all_task_normal\n",
" 2: all_task_dinhky\n",
" 3: loai_ca\n",
" 4: bat_dau\n",
" 5: ket_thuc\n",
" 6: tong_gio_lam\n",
" 7: so_ca_cua_toa\n",
" 8: so_luong\n",
" 9: num_tasks\n",
"10: num_cleaning_tasks\n",
"11: num_trash_collection_tasks\n",
"12: num_monitoring_tasks\n",
"13: num_deep_cleaning_tasks\n",
"14: num_support_tasks\n",
"15: num_other_tasks\n",
"16: num_wc_tasks\n",
"17: num_hallway_tasks\n",
"18: num_lobby_tasks\n",
"19: num_outdoor_tasks\n",
"20: num_elevator_tasks\n",
"21: cleaning_ratio\n",
"22: trash_collection_ratio\n",
"23: monitoring_ratio\n",
"24: area_diversity\n",
"25: so_tang\n",
"26: so_cua_thang_may\n",
"27: dien_tich_ngoai_canh\n",
"28: dien_tich_sanh\n",
"29: dien_tich_hanh_lang\n",
"30: dien_tich_wc\n",
"31: dien_tich_phong\n",
"32: dien_tich_tham\n",
"33: doc_ham\n",
"34: vien_phan_quang\n",
"35: op_tuong\n",
"36: op_chan_tuong\n",
"37: ranh_thoat_nuoc\n",
"38: dien_tich_kinh\n",
"39: num_medical_tasks_total\n",
"40: num_indoor_room_tasks\n",
"41: is_tasks_text_missing\n",
"\n",
"Dropped columns:\n",
" - ma_dia_diem\n",
" - all_task_normal\n",
" - all_task_dinhky\n",
" - is_tasks_text_missing\n",
"\n",
"Shapes:\n",
"X: (394, 37)\n",
"y: (394,)\n",
"\n",
"Feature dtypes:\n"
]
},
{
"data": {
"text/plain": [
"int64 21\n",
"float64 13\n",
"object 3\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Missing values in X:\n"
]
},
{
"data": {
"text/plain": [
"loai_ca 0\n",
"bat_dau 0\n",
"ket_thuc 0\n",
"tong_gio_lam 0\n",
"so_ca_cua_toa 0\n",
"num_tasks 0\n",
"num_cleaning_tasks 0\n",
"num_trash_collection_tasks 0\n",
"num_monitoring_tasks 0\n",
"num_deep_cleaning_tasks 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 2 — FEATURE SELECTION (STRICT)\n",
"\n",
"# 1. Xem toàn bộ cột\n",
"print(\"All columns:\")\n",
"for i, c in enumerate(df.columns):\n",
" print(f\"{i:2d}: {c}\")\n",
"\n",
"# 2. Xác định cột cần loại bỏ (THEO THỎA THUẬN)\n",
"DROP_COLS = [\n",
" df.columns[0], # ma_dia_diem\n",
" df.columns[1], # all_task_normal\n",
" df.columns[2], # all_task_dinhky\n",
" df.columns[-1], # is_tasks_text_missing\n",
"]\n",
"\n",
"print(\"\\nDropped columns:\")\n",
"for c in DROP_COLS:\n",
" print(\" -\", c)\n",
"\n",
"# 3. Tạo X, y\n",
"X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
"y = df[\"so_luong\"].astype(float)\n",
"\n",
"print(\"\\nShapes:\")\n",
"print(\"X:\", X.shape)\n",
"print(\"y:\", y.shape)\n",
"\n",
"# 4. Kiểm tra kiểu dữ liệu\n",
"print(\"\\nFeature dtypes:\")\n",
"display(X.dtypes.value_counts())\n",
"\n",
"# 5. Kiểm tra missing\n",
"print(\"\\nMissing values in X:\")\n",
"display(X.isna().sum().sort_values(ascending=False).head(10))\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "bb467e4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categorical columns: ['loai_ca']\n",
"\n",
"After preprocess:\n",
"X_proc shape: (394, 46)\n",
"Any non-numeric dtypes? False\n",
"\n",
"Sample columns (first 30):\n",
"['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong', 'op_chan_tuong']\n"
]
}
],
"source": [
"# CELL 3 — PREPROCESS (TIME + CATEGORICAL) WITHOUT JUNK\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# ---------- 1) Time parsing ----------\n",
"def time_to_hour(x):\n",
" if pd.isna(x):\n",
" return np.nan\n",
"\n",
" # datetime/time object\n",
" if hasattr(x, \"hour\"):\n",
" return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
"\n",
" s = str(x).strip()\n",
" # \"YYYY-MM-DD HH:MM:SS\"\n",
" if \" \" in s and \":\" in s:\n",
" s = s.split(\" \", 1)[1].strip()\n",
"\n",
" # \"HH:MM\" or \"HH:MM:SS\"\n",
" if \":\" in s:\n",
" parts = s.split(\":\")\n",
" try:\n",
" h = float(parts[0])\n",
" m = float(parts[1]) if len(parts) > 1 else 0.0\n",
" return h + m/60.0\n",
" except:\n",
" return np.nan\n",
"\n",
" # numeric fallback\n",
" try:\n",
" return float(s)\n",
" except:\n",
" return np.nan\n",
"\n",
"# Create new numeric time features (do NOT one-hot time)\n",
"X_proc = X.copy()\n",
"\n",
"if \"bat_dau\" in X_proc.columns:\n",
" X_proc[\"hour_start\"] = X_proc[\"bat_dau\"].apply(time_to_hour)\n",
"if \"ket_thuc\" in X_proc.columns:\n",
" X_proc[\"hour_end\"] = X_proc[\"ket_thuc\"].apply(time_to_hour)\n",
"\n",
"# shift_length + cross day\n",
"if (\"hour_start\" in X_proc.columns) and (\"hour_end\" in X_proc.columns):\n",
" end_adj = X_proc[\"hour_end\"].copy()\n",
" cross = (X_proc[\"hour_start\"].notna()) & (X_proc[\"hour_end\"].notna()) & (X_proc[\"hour_end\"] < X_proc[\"hour_start\"])\n",
" end_adj[cross] = end_adj[cross] + 24.0\n",
"\n",
" X_proc[\"shift_length\"] = (end_adj - X_proc[\"hour_start\"]).clip(lower=0)\n",
" X_proc[\"is_cross_day\"] = cross.astype(int)\n",
"\n",
"# Drop raw time cols to avoid junk\n",
"for c in [\"bat_dau\", \"ket_thuc\"]:\n",
" if c in X_proc.columns:\n",
" X_proc = X_proc.drop(columns=[c])\n",
"\n",
"# ---------- 2) One-hot categorical ----------\n",
"cat_cols = [c for c in X_proc.columns if X_proc[c].dtype == \"object\"]\n",
"print(\"Categorical columns:\", cat_cols)\n",
"\n",
"X_proc = pd.get_dummies(X_proc, columns=cat_cols, dummy_na=True)\n",
"\n",
"# ---------- 3) Fill missing ----------\n",
"X_proc = X_proc.replace([np.inf, -np.inf], np.nan).fillna(0)\n",
"\n",
"print(\"\\nAfter preprocess:\")\n",
"print(\"X_proc shape:\", X_proc.shape)\n",
"print(\"Any non-numeric dtypes?\", any(dt == \"object\" for dt in X_proc.dtypes))\n",
"\n",
"print(\"\\nSample columns (first 30):\")\n",
"print(list(X_proc.columns[:30]))\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "3cd119b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"Train: (326, 46) (326,)\n",
"Val : (68, 46) (68,)\n",
"\n",
"ElasticNet(alpha=0.01, l1_ratio=0.5)\n",
" Train | MAE=2.754 | RMSE=5.200 | R2=0.458\n",
" Val | MAE=2.420 | RMSE=4.792 | R2=0.259\n",
"\n",
"DecisionTree(max_depth=8, min_samples_leaf=5)\n",
" Train | MAE=2.187 | RMSE=5.313 | R2=0.434\n",
" Val | MAE=2.312 | RMSE=4.149 | R2=0.445\n",
"\n",
"RandomForest(n_estimators=600, min_samples_leaf=3)\n",
" Train | MAE=1.894 | RMSE=4.820 | R2=0.535\n",
" Val | MAE=2.402 | RMSE=4.467 | R2=0.356\n",
"\n",
"ExtraTrees(n_estimators=800, min_samples_leaf=2)\n",
" Train | MAE=1.095 | RMSE=3.320 | R2=0.779\n",
" Val | MAE=1.968 | RMSE=3.461 | R2=0.614\n",
"\n",
"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\n",
" Train | MAE=1.990 | RMSE=4.871 | R2=0.525\n",
" Val | MAE=2.406 | RMSE=4.599 | R2=0.318\n",
"\n",
"=== HOLDOUT VAL MAE summary (lower is better) ===\n",
"ExtraTrees : Val MAE = 1.968\n",
"DecisionTree: Val MAE = 2.312\n",
"RandomForest: Val MAE = 2.402\n",
"HGBR : Val MAE = 2.406\n",
"ElasticNet : Val MAE = 2.420\n"
]
}
],
"source": [
"# CELL 4 (EXTENDED) — ADD TREE/ENSEMBLE MODELS TO HOLDOUT VAL\n",
"\n",
"import numpy as np\n",
"from sklearn.model_selection import GroupShuffleSplit\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# ---------- 1) Group split (80% train, 20% val) ----------\n",
"groups = df[\"ma_dia_diem\"].astype(str)\n",
"\n",
"gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)\n",
"train_idx, val_idx = next(gss.split(X_proc, y, groups))\n",
"\n",
"X_train = X_proc.iloc[train_idx]\n",
"y_train = y.iloc[train_idx]\n",
"\n",
"X_val = X_proc.iloc[val_idx]\n",
"y_val = y.iloc[val_idx]\n",
"\n",
"print(\"Shapes:\")\n",
"print(\"Train:\", X_train.shape, y_train.shape)\n",
"print(\"Val :\", X_val.shape, y_val.shape)\n",
"\n",
"# ---------- 2) Scale features (for linear models) ----------\n",
"scaler = StandardScaler()\n",
"X_train_s = scaler.fit_transform(X_train)\n",
"X_val_s = scaler.transform(X_val)\n",
"\n",
"# ---------- 2b) No-scale matrices (for tree models) ----------\n",
"X_train_ns = X_train.values\n",
"X_val_ns = X_val.values\n",
"\n",
"# ---------- 3) Log-transform target ----------\n",
"y_train_log = np.log1p(y_train)\n",
"y_val_log = np.log1p(y_val)\n",
"\n",
"# ---------- 4) Evaluation helper (support scale/no-scale) ----------\n",
"def eval_reg_any(name, model, use_scaled_X=True):\n",
" Xtr = X_train_s if use_scaled_X else X_train_ns\n",
" Xva = X_val_s if use_scaled_X else X_val_ns\n",
"\n",
" model.fit(Xtr, y_train_log)\n",
"\n",
" pred_train = np.maximum(0, np.expm1(model.predict(Xtr)))\n",
" pred_val = np.maximum(0, np.expm1(model.predict(Xva)))\n",
"\n",
" def _m(y_true, y_pred):\n",
" return (\n",
" mean_absolute_error(y_true, y_pred),\n",
" mean_squared_error(y_true, y_pred) ** 0.5,\n",
" r2_score(y_true, y_pred),\n",
" )\n",
"\n",
" tr = _m(y_train, pred_train)\n",
" va = _m(y_val, pred_val)\n",
"\n",
" print(f\"\\n{name}\")\n",
" print(f\" Train | MAE={tr[0]:.3f} | RMSE={tr[1]:.3f} | R2={tr[2]:.3f}\")\n",
" print(f\" Val | MAE={va[0]:.3f} | RMSE={va[1]:.3f} | R2={va[2]:.3f}\")\n",
"\n",
" return va[0]\n",
"\n",
"# ---------- 5) Train baselines + tree/ensemble ----------\n",
"results = {}\n",
"\n",
"# Linear (scaled)\n",
"# results[\"Linear\"] = eval_reg_any(\"LinearRegression\", LinearRegression(), use_scaled_X=True)\n",
"# results[\"Ridge\"] = eval_reg_any(\"Ridge(alpha=1.0)\", Ridge(alpha=1.0), use_scaled_X=True)\n",
"# results[\"Lasso\"] = eval_reg_any(\"Lasso(alpha=0.01)\", Lasso(alpha=0.01, max_iter=5000), use_scaled_X=True)\n",
"results[\"ElasticNet\"] = eval_reg_any(\n",
" \"ElasticNet(alpha=0.01, l1_ratio=0.5)\",\n",
" ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
" use_scaled_X=True\n",
")\n",
"\n",
"# Tree / Ensemble (no scale)\n",
"results[\"DecisionTree\"] = eval_reg_any(\n",
" \"DecisionTree(max_depth=8, min_samples_leaf=5)\",\n",
" DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"RandomForest\"] = eval_reg_any(\n",
" \"RandomForest(n_estimators=600, min_samples_leaf=3)\",\n",
" RandomForestRegressor(\n",
" n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"ExtraTrees\"] = eval_reg_any(\n",
" \"ExtraTrees(n_estimators=800, min_samples_leaf=2)\",\n",
" ExtraTreesRegressor(\n",
" n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"HGBR\"] = eval_reg_any(\n",
" \"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\",\n",
" HistGradientBoostingRegressor(\n",
" learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"print(\"\\n=== HOLDOUT VAL MAE summary (lower is better) ===\")\n",
"for k, v in sorted(results.items(), key=lambda x: x[1]):\n",
" print(f\"{k:12s}: Val MAE = {v:.3f}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "106e557f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train for CV: (326, 46) (326,) | unique groups: 153\n",
"\n",
"=== ExtraTrees GroupKFold (TRAIN ONLY) ===\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MAE</th>\n",
" <th>RMSE</th>\n",
" <th>R2</th>\n",
" <th>fold</th>\n",
" <th>n_val</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3.677199</td>\n",
" <td>7.263821</td>\n",
" <td>0.048081</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.673333</td>\n",
" <td>6.600086</td>\n",
" <td>0.104487</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.092924</td>\n",
" <td>8.986669</td>\n",
" <td>0.087366</td>\n",
" <td>3</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.100641</td>\n",
" <td>3.499245</td>\n",
" <td>0.291277</td>\n",
" <td>4</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.550497</td>\n",
" <td>4.739900</td>\n",
" <td>0.407946</td>\n",
" <td>5</td>\n",
" <td>65</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MAE RMSE R2 fold n_val\n",
"0 3.677199 7.263821 0.048081 1 66\n",
"1 3.673333 6.600086 0.104487 2 65\n",
"2 3.092924 8.986669 0.087366 3 65\n",
"3 2.100641 3.499245 0.291277 4 65\n",
"4 2.550497 4.739900 0.407946 5 65"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== CV SUMMARY ===\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MAE</th>\n",
" <th>RMSE</th>\n",
" <th>R2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>3.018919</td>\n",
" <td>6.217944</td>\n",
" <td>0.187832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.694572</td>\n",
" <td>2.149515</td>\n",
" <td>0.154694</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MAE RMSE R2\n",
"mean 3.018919 6.217944 0.187832\n",
"std 0.694572 2.149515 0.154694"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Worst fold (highest MAE):\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MAE</th>\n",
" <th>RMSE</th>\n",
" <th>R2</th>\n",
" <th>fold</th>\n",
" <th>n_val</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3.677199</td>\n",
" <td>7.263821</td>\n",
" <td>0.048081</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MAE RMSE R2 fold n_val\n",
"0 3.677199 7.263821 0.048081 1 66"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 5 — GROUP K-FOLD CV (TRAIN ONLY) FOR EXTRA TREES (NO LEAKAGE by ma_dia_diem)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.base import clone\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# =========================\n",
"# 1) Prepare TRAIN data + groups\n",
"# =========================\n",
"X_tr = X_train.values # no scaling for tree\n",
"y_tr = y_train.values.astype(float)\n",
"g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n",
"\n",
"print(\"Train for CV:\", X_tr.shape, y_tr.shape, \"| unique groups:\", len(np.unique(g_tr)))\n",
"\n",
"# =========================\n",
"# 2) Metric helper (evaluate on original scale)\n",
"# =========================\n",
"def metrics(y_true, y_pred):\n",
" y_pred = np.maximum(0, y_pred)\n",
" return {\n",
" \"MAE\": mean_absolute_error(y_true, y_pred),\n",
" \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
" \"R2\": r2_score(y_true, y_pred),\n",
" }\n",
"\n",
"# =========================\n",
"# 3) ExtraTrees config (same as your holdout baseline)\n",
"# =========================\n",
"base_model = ExtraTreesRegressor(\n",
" n_estimators=800,\n",
" min_samples_leaf=2,\n",
" random_state=42,\n",
" n_jobs=-1\n",
")\n",
"\n",
"# =========================\n",
"# 4) GroupKFold CV (fit log1p(y) -> expm1(pred))\n",
"# =========================\n",
"gkf = GroupKFold(n_splits=5)\n",
"\n",
"rows = []\n",
"for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_tr, y_tr, groups=g_tr), start=1):\n",
" model = clone(base_model)\n",
"\n",
" Xtr_f, Xva_f = X_tr[tr_idx], X_tr[va_idx]\n",
" ytr_f, yva_f = y_tr[tr_idx], y_tr[va_idx]\n",
"\n",
" model.fit(Xtr_f, np.log1p(ytr_f))\n",
" pred_va = np.expm1(model.predict(Xva_f))\n",
" pred_va = np.maximum(0, pred_va)\n",
"\n",
" m = metrics(yva_f, pred_va)\n",
" m[\"fold\"] = fold\n",
" m[\"n_val\"] = len(va_idx)\n",
" rows.append(m)\n",
"\n",
"cv_df = pd.DataFrame(rows)\n",
"\n",
"print(\"\\n=== ExtraTrees GroupKFold (TRAIN ONLY) ===\")\n",
"display(cv_df)\n",
"\n",
"summary = cv_df[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
"print(\"\\n=== CV SUMMARY ===\")\n",
"display(summary)\n",
"\n",
"best_worst = cv_df.sort_values(\"MAE\", ascending=False)\n",
"print(\"\\nWorst fold (highest MAE):\")\n",
"display(best_worst.head(1))\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ee66e389",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear | MAE = 4.971 ± 2.813\n",
"Lasso_0.01 | MAE = 7.110 ± 7.917\n",
"ElasticNet_0.01 | MAE = 6.386 ± 6.240\n",
"\n",
"=== CV SUMMARY ===\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>model</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Linear</th>\n",
" <td>4.970868</td>\n",
" <td>3.145106</td>\n",
" <td>2.279954</td>\n",
" <td>10.321992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ElasticNet_0.01</th>\n",
" <td>6.386372</td>\n",
" <td>6.976535</td>\n",
" <td>2.104239</td>\n",
" <td>18.765868</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lasso_0.01</th>\n",
" <td>7.110267</td>\n",
" <td>8.851455</td>\n",
" <td>2.113510</td>\n",
" <td>22.876806</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean std min max\n",
"model \n",
"Linear 4.970868 3.145106 2.279954 10.321992\n",
"ElasticNet_0.01 6.386372 6.976535 2.104239 18.765868\n",
"Lasso_0.01 7.110267 8.851455 2.113510 22.876806"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 5 — GROUP K-FOLD CONFIRMATION (TOP LINEAR MODELS)\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, Lasso, ElasticNet\n",
"from sklearn.metrics import mean_absolute_error\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"models = {\n",
" \"Linear\": LinearRegression(),\n",
" \"Lasso_0.01\": Lasso(alpha=0.01, max_iter=5000),\n",
" \"ElasticNet_0.01\": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
"}\n",
"\n",
"groups = df[\"ma_dia_diem\"].astype(str)\n",
"gkf = GroupKFold(n_splits=5)\n",
"\n",
"rows = []\n",
"\n",
"for name, model in models.items():\n",
" maes = []\n",
" for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_proc, y, groups)):\n",
" X_tr, X_va = X_proc.iloc[tr_idx], X_proc.iloc[va_idx]\n",
" y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]\n",
"\n",
" scaler = StandardScaler()\n",
" X_tr_s = scaler.fit_transform(X_tr)\n",
" X_va_s = scaler.transform(X_va)\n",
"\n",
" model.fit(X_tr_s, np.log1p(y_tr))\n",
" pred_va = np.maximum(0, np.expm1(model.predict(X_va_s)))\n",
"\n",
" mae = mean_absolute_error(y_va, pred_va)\n",
" maes.append(mae)\n",
"\n",
" rows.append({\n",
" \"model\": name,\n",
" \"fold\": fold,\n",
" \"MAE\": mae,\n",
" \"n_val\": len(va_idx),\n",
" })\n",
"\n",
" print(f\"{name:12s} | MAE = {np.mean(maes):.3f} ± {np.std(maes):.3f}\")\n",
"\n",
"cv_df = pd.DataFrame(rows)\n",
"\n",
"print(\"\\n=== CV SUMMARY ===\")\n",
"display(\n",
" cv_df.groupby(\"model\")[\"MAE\"]\n",
" .agg([\"mean\", \"std\", \"min\", \"max\"])\n",
" .sort_values(\"mean\")\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "73a31e6e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>model</th>\n",
" <th>MAE_mean</th>\n",
" <th>MAE_std</th>\n",
" <th>RMSE_mean</th>\n",
" <th>R2_mean</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ExtraTrees</td>\n",
" <td>3.018919</td>\n",
" <td>0.694572</td>\n",
" <td>6.217944</td>\n",
" <td>0.187832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>RandomForest</td>\n",
" <td>3.052084</td>\n",
" <td>0.715409</td>\n",
" <td>6.243409</td>\n",
" <td>0.187735</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>HGBR</td>\n",
" <td>3.105344</td>\n",
" <td>0.673739</td>\n",
" <td>6.344961</td>\n",
" <td>0.156316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DecisionTree</td>\n",
" <td>3.681985</td>\n",
" <td>0.668149</td>\n",
" <td>6.861043</td>\n",
" <td>-0.017019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Lasso</td>\n",
" <td>4.366601</td>\n",
" <td>1.709182</td>\n",
" <td>10.584715</td>\n",
" <td>-2.703761</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ElasticNet</td>\n",
" <td>4.918275</td>\n",
" <td>2.414753</td>\n",
" <td>13.602627</td>\n",
" <td>-15.883303</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" model MAE_mean MAE_std RMSE_mean R2_mean\n",
"4 ExtraTrees 3.018919 0.694572 6.217944 0.187832\n",
"3 RandomForest 3.052084 0.715409 6.243409 0.187735\n",
"5 HGBR 3.105344 0.673739 6.344961 0.156316\n",
"2 DecisionTree 3.681985 0.668149 6.861043 -0.017019\n",
"0 Lasso 4.366601 1.709182 10.584715 -2.703761\n",
"1 ElasticNet 4.918275 2.414753 13.602627 -15.883303"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Per-fold details (worst MAE first) ===\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MAE</th>\n",
" <th>RMSE</th>\n",
" <th>R2</th>\n",
" <th>model</th>\n",
" <th>fold</th>\n",
" <th>n_val</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8.972213</td>\n",
" <td>37.201319</td>\n",
" <td>-79.102177</td>\n",
" <td>ElasticNet</td>\n",
" <td>4</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7.342893</td>\n",
" <td>19.028144</td>\n",
" <td>-5.532246</td>\n",
" <td>Lasso</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5.347237</td>\n",
" <td>9.838825</td>\n",
" <td>-0.746450</td>\n",
" <td>ElasticNet</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4.347409</td>\n",
" <td>7.407078</td>\n",
" <td>0.010163</td>\n",
" <td>DecisionTree</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.288895</td>\n",
" <td>12.729106</td>\n",
" <td>-8.378285</td>\n",
" <td>Lasso</td>\n",
" <td>4</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4.144338</td>\n",
" <td>9.678611</td>\n",
" <td>-0.058584</td>\n",
" <td>DecisionTree</td>\n",
" <td>3</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>3.772958</td>\n",
" <td>7.109340</td>\n",
" <td>-0.039038</td>\n",
" <td>DecisionTree</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>3.742208</td>\n",
" <td>6.724452</td>\n",
" <td>0.070421</td>\n",
" <td>RandomForest</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>3.721214</td>\n",
" <td>7.376454</td>\n",
" <td>0.018331</td>\n",
" <td>HGBR</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>3.684392</td>\n",
" <td>6.736473</td>\n",
" <td>0.067094</td>\n",
" <td>HGBR</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>3.677199</td>\n",
" <td>7.263821</td>\n",
" <td>0.048081</td>\n",
" <td>ExtraTrees</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>3.673333</td>\n",
" <td>6.600086</td>\n",
" <td>0.104487</td>\n",
" <td>ExtraTrees</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>3.557926</td>\n",
" <td>7.090878</td>\n",
" <td>0.092870</td>\n",
" <td>RandomForest</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.511097</td>\n",
" <td>6.344130</td>\n",
" <td>0.172598</td>\n",
" <td>Lasso</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>3.507451</td>\n",
" <td>5.627415</td>\n",
" <td>0.165473</td>\n",
" <td>DecisionTree</td>\n",
" <td>5</td>\n",
" <td>65</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MAE RMSE R2 model fold n_val\n",
"8 8.972213 37.201319 -79.102177 ElasticNet 4 65\n",
"0 7.342893 19.028144 -5.532246 Lasso 1 66\n",
"5 5.347237 9.838825 -0.746450 ElasticNet 1 66\n",
"10 4.347409 7.407078 0.010163 DecisionTree 1 66\n",
"3 4.288895 12.729106 -8.378285 Lasso 4 65\n",
"12 4.144338 9.678611 -0.058584 DecisionTree 3 65\n",
"11 3.772958 7.109340 -0.039038 DecisionTree 2 65\n",
"16 3.742208 6.724452 0.070421 RandomForest 2 65\n",
"25 3.721214 7.376454 0.018331 HGBR 1 66\n",
"26 3.684392 6.736473 0.067094 HGBR 2 65\n",
"20 3.677199 7.263821 0.048081 ExtraTrees 1 66\n",
"21 3.673333 6.600086 0.104487 ExtraTrees 2 65\n",
"15 3.557926 7.090878 0.092870 RandomForest 1 66\n",
"1 3.511097 6.344130 0.172598 Lasso 2 65\n",
"14 3.507451 5.627415 0.165473 DecisionTree 5 65"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 6 — GROUP K-FOLD CV: TRY MULTIPLE MODELS (FAIR COMPARISON)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.base import clone\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"from sklearn.linear_model import Lasso, ElasticNet\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
"\n",
"# =========================\n",
"# 1) Training inputs for CV: feature frame, target vector, group labels\n",
"# =========================\n",
"# Copy so nothing in this cell can modify the X_train frame itself.\n",
"X_tr_df = X_train.copy()\n",
"\n",
"# Target as a plain float array (cv_eval_model indexes it by fold position).\n",
"y_tr = y_train.astype(float).values\n",
"\n",
"# Group label for GroupKFold: the `ma_dia_diem` value of each row, as str.\n",
"# NOTE(review): assumes train_idx is in the same row order as X_train — confirm.\n",
"g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n",
"\n",
"# =========================\n",
"# 2) Metric helper\n",
"# =========================\n",
"def metrics(y_true, y_pred):\n",
"    \"\"\"Return MAE, RMSE and R2 for predictions floored at zero.\n",
"\n",
"    Negative predictions are raised to 0 before any score is computed.\n",
"    \"\"\"\n",
"    floored = np.maximum(0, y_pred)\n",
"    scores = {}\n",
"    scores[\"MAE\"] = mean_absolute_error(y_true, floored)\n",
"    # RMSE is taken as the square root of the mean squared error.\n",
"    scores[\"RMSE\"] = mean_squared_error(y_true, floored) ** 0.5\n",
"    scores[\"R2\"] = r2_score(y_true, floored)\n",
"    return scores\n",
"\n",
"def cv_eval_model(name, model, X_df, y, groups, n_splits=5, use_log=True):\n",
"    \"\"\"Score one estimator with grouped K-fold cross-validation.\n",
"\n",
"    A fresh clone of ``model`` is fitted on every fold, so the estimator\n",
"    passed in is never fitted by this function. Folds are produced by\n",
"    ``GroupKFold`` from ``groups``; each fold is scored with ``metrics``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name : label written to the ``model`` column of the per-fold table.\n",
"    model : unfitted estimator that supports ``clone``, ``fit`` and ``predict``.\n",
"    X_df : feature DataFrame; rows are selected by position via ``.iloc``.\n",
"    y : target values, one per row of ``X_df`` (ndarray, list or Series).\n",
"    groups : group label for each row of ``X_df``.\n",
"    n_splits : number of folds passed to ``GroupKFold``.\n",
"    use_log : if True, fit on ``log1p(y)`` and map predictions back with\n",
"        ``expm1``; otherwise fit on the raw target.\n",
"\n",
"    Returns\n",
"    -------\n",
"    (out, summ) : ``out`` has one row per fold (MAE, RMSE, R2, model, fold,\n",
"        n_val); ``summ`` holds the mean and std of MAE, RMSE and R2.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError : if ``y`` or ``groups`` does not have one entry per row.\n",
"    \"\"\"\n",
"    # Force positional ndarrays: indexing a pandas Series with fold positions\n",
"    # would be a label lookup and could select the wrong rows or raise KeyError.\n",
"    y = np.asarray(y, dtype=float)\n",
"    groups = np.asarray(groups)\n",
"    if len(y) != len(X_df) or len(groups) != len(X_df):\n",
"        raise ValueError(\n",
"            f\"X_df, y and groups must have the same length, got \"\n",
"            f\"{len(X_df)}, {len(y)} and {len(groups)}\"\n",
"        )\n",
"\n",
"    gkf = GroupKFold(n_splits=n_splits)\n",
"    rows = []\n",
"\n",
"    for fold, (tr_i, va_i) in enumerate(gkf.split(X_df, y, groups=groups), start=1):\n",
"        m = clone(model)\n",
"\n",
"        Xtr, Xva = X_df.iloc[tr_i], X_df.iloc[va_i]\n",
"        ytr, yva = y[tr_i], y[va_i]\n",
"\n",
"        if use_log:\n",
"            # Train in log1p space, then bring predictions back to the raw scale.\n",
"            m.fit(Xtr, np.log1p(ytr))\n",
"            pred = np.expm1(m.predict(Xva))\n",
"        else:\n",
"            m.fit(Xtr, ytr)\n",
"            pred = m.predict(Xva)\n",
"\n",
"        mm = metrics(yva, pred)\n",
"        mm.update({\"model\": name, \"fold\": fold, \"n_val\": len(va_i)})\n",
"        rows.append(mm)\n",
"\n",
"    out = pd.DataFrame(rows)\n",
"    summ = out[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
"    return out, summ\n",
"\n",
"# =========================\n",
"# 3) Define candidates\n",
"# =========================\n",
"# Each entry: (leaderboard name, unfitted estimator, fit-on-log1p flag).\n",
"candidates = [\n",
"    # Linear models: features are standardized inside the pipeline.\n",
"    (\n",
"        \"Lasso\",\n",
"        Pipeline([\n",
"            (\"scaler\", StandardScaler()),\n",
"            (\"model\", Lasso(alpha=0.01, max_iter=5000, random_state=42)),\n",
"        ]),\n",
"        True,\n",
"    ),\n",
"    (\n",
"        \"ElasticNet\",\n",
"        Pipeline([\n",
"            (\"scaler\", StandardScaler()),\n",
"            (\"model\", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=8000, random_state=42)),\n",
"        ]),\n",
"        True,\n",
"    ),\n",
"    # Tree and ensemble models: used directly, without a scaler.\n",
"    (\n",
"        \"DecisionTree\",\n",
"        DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n",
"        True,\n",
"    ),\n",
"    (\n",
"        \"RandomForest\",\n",
"        RandomForestRegressor(n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1),\n",
"        True,\n",
"    ),\n",
"    (\n",
"        \"ExtraTrees\",\n",
"        ExtraTreesRegressor(n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1),\n",
"        True,\n",
"    ),\n",
"    (\n",
"        \"HGBR\",\n",
"        HistGradientBoostingRegressor(\n",
"            learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
"        ),\n",
"        True,\n",
"    ),\n",
"]\n",
"\n",
"# =========================\n",
"# 4) Run CV for all models\n",
"# =========================\n",
"all_fold = []  # per-fold score tables, one DataFrame per candidate\n",
"rows_lb = []   # one leaderboard row per candidate\n",
"\n",
"for name, model, use_log in candidates:\n",
"    fold_df, summ = cv_eval_model(\n",
"        name, model, X_tr_df, y_tr, g_tr, n_splits=5, use_log=use_log\n",
"    )\n",
"    all_fold.append(fold_df)\n",
"\n",
"    # Fold-averaged metrics, plus the fold-to-fold spread of MAE.\n",
"    rows_lb.append({\n",
"        \"model\": name,\n",
"        \"MAE_mean\": summ.at[\"mean\", \"MAE\"],\n",
"        \"MAE_std\": summ.at[\"std\", \"MAE\"],\n",
"        \"RMSE_mean\": summ.at[\"mean\", \"RMSE\"],\n",
"        \"R2_mean\": summ.at[\"mean\", \"R2\"],\n",
"    })\n",
"\n",
"cv_all = pd.concat(all_fold, ignore_index=True)\n",
"\n",
"# Lowest mean MAE first; mean RMSE breaks ties.\n",
"leaderboard = pd.DataFrame(rows_lb).sort_values(by=[\"MAE_mean\", \"RMSE_mean\"])\n",
"\n",
"print(\"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\")\n",
"display(leaderboard)\n",
"\n",
"print(\"\\n=== Per-fold details (worst MAE first) ===\")\n",
"worst_folds = cv_all.sort_values(\"MAE\", ascending=False)\n",
"display(worst_folds.head(15))\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1d408c02",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\n",
"\n",
"[VAL (raw)]\n",
"MAE : 1.968\n",
"RMSE: 3.461\n",
"R2 : 0.614\n",
"\n",
"[VAL (rounded)]\n",
"MAE : 1.971\n",
"RMSE: 3.502\n",
"R2 : 0.604\n",
"\n",
"Worst 15 samples (by abs error):\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_true</th>\n",
" <th>y_pred</th>\n",
" <th>y_pred_round</th>\n",
" <th>abs_err</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>32.0</td>\n",
" <td>16.876830</td>\n",
" <td>17</td>\n",
" <td>15.123170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29.0</td>\n",
" <td>16.242619</td>\n",
" <td>16</td>\n",
" <td>12.757381</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>13.0</td>\n",
" <td>5.239388</td>\n",
" <td>5</td>\n",
" <td>7.760612</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>1.0</td>\n",
" <td>7.994058</td>\n",
" <td>8</td>\n",
" <td>6.994058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>14.0</td>\n",
" <td>7.210027</td>\n",
" <td>7</td>\n",
" <td>6.789973</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>12.0</td>\n",
" <td>5.706020</td>\n",
" <td>6</td>\n",
" <td>6.293980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>1.0</td>\n",
" <td>6.108304</td>\n",
" <td>6</td>\n",
" <td>5.108304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>1.0</td>\n",
" <td>5.948221</td>\n",
" <td>6</td>\n",
" <td>4.948221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>2.0</td>\n",
" <td>6.426550</td>\n",
" <td>6</td>\n",
" <td>4.426550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>11.0</td>\n",
" <td>6.758044</td>\n",
" <td>7</td>\n",
" <td>4.241956</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>10.0</td>\n",
" <td>6.354210</td>\n",
" <td>6</td>\n",
" <td>3.645790</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>6.0</td>\n",
" <td>2.356056</td>\n",
" <td>2</td>\n",
" <td>3.643944</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>7.0</td>\n",
" <td>3.442200</td>\n",
" <td>3</td>\n",
" <td>3.557800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>2.0</td>\n",
" <td>5.504394</td>\n",
" <td>6</td>\n",
" <td>3.504394</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>3.0</td>\n",
" <td>6.087934</td>\n",
" <td>6</td>\n",
" <td>3.087934</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" y_true y_pred y_pred_round abs_err\n",
"34 32.0 16.876830 17 15.123170\n",
"0 29.0 16.242619 16 12.757381\n",
"14 13.0 5.239388 5 7.760612\n",
"65 1.0 7.994058 8 6.994058\n",
"28 14.0 7.210027 7 6.789973\n",
"29 12.0 5.706020 6 6.293980\n",
"60 1.0 6.108304 6 5.108304\n",
"61 1.0 5.948221 6 4.948221\n",
"56 2.0 6.426550 6 4.426550\n",
"17 11.0 6.758044 7 4.241956\n",
"58 10.0 6.354210 6 3.645790\n",
"10 6.0 2.356056 2 3.643944\n",
"23 7.0 3.442200 3 3.557800\n",
"20 2.0 5.504394 6 3.504394\n",
"62 3.0 6.087934 6 3.087934"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved:\n",
" - Model : ./artifacts\\extratrees_log1p.joblib\n",
" - Columns: ./artifacts\\X_proc_columns.joblib\n"
]
}
],
"source": [
"# CELL 7 — FINALIZE BEST MODEL (ExtraTrees) + EVAL ON HOLDOUT VAL + SAVE ARTIFACTS\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"import joblib\n",
"\n",
"# =========================\n",
"# 1) Fit the final ExtraTrees on the whole TRAIN split (log1p target)\n",
"# =========================\n",
"final_model = ExtraTreesRegressor(\n",
"    n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
")\n",
"final_model.fit(X_train.values, np.log1p(y_train.values))\n",
"\n",
"# =========================\n",
"# 2) Predict on HOLDOUT VAL\n",
"# =========================\n",
"# Undo the log1p transform, then floor at zero.\n",
"pred_val = np.maximum(0, np.expm1(final_model.predict(X_val.values)))\n",
"\n",
"# Integer version of the same predictions (nearest integer).\n",
"pred_val_round = np.rint(pred_val).astype(int)\n",
"\n",
"# =========================\n",
"# 3) Metrics (raw vs rounded)\n",
"# =========================\n",
"def print_metrics(tag, y_true, y_pred):\n",
"    \"\"\"Print MAE, RMSE and R2 under a ``[tag]`` header and return them as a dict.\"\"\"\n",
"    scores = {\n",
"        \"MAE\": mean_absolute_error(y_true, y_pred),\n",
"        \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
"        \"R2\": r2_score(y_true, y_pred),\n",
"    }\n",
"    print(f\"\\n[{tag}]\")\n",
"    print(f\"MAE : {scores['MAE']:.3f}\")\n",
"    print(f\"RMSE: {scores['RMSE']:.3f}\")\n",
"    print(f\"R2 : {scores['R2']:.3f}\")\n",
"    return scores\n",
"\n",
"print(\"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\")\n",
"m_raw = print_metrics(\"VAL (raw)\", y_val.values, pred_val)\n",
"m_int = print_metrics(\"VAL (rounded)\", y_val.values, pred_val_round)\n",
"\n",
"# =========================\n",
"# 4) Error table: largest absolute misses on the raw predictions\n",
"# =========================\n",
"err_df = pd.DataFrame({\n",
"    \"y_true\": y_val.values,\n",
"    \"y_pred\": pred_val,\n",
"    \"y_pred_round\": pred_val_round,\n",
"})\n",
"err_df[\"abs_err\"] = (err_df[\"y_true\"] - err_df[\"y_pred\"]).abs()\n",
"\n",
"print(\"\\nWorst 15 samples (by abs error):\")\n",
"display(err_df.sort_values(\"abs_err\", ascending=False).head(15))\n",
"\n",
"# =========================\n",
"# 5) Persist the fitted model and the training column order\n",
"# =========================\n",
"import os\n",
"\n",
"ARTIFACT_DIR = \"./artifacts\"\n",
"os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
"\n",
"model_path = os.path.join(ARTIFACT_DIR, \"extratrees_log1p.joblib\")\n",
"cols_path = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
"\n",
"joblib.dump(final_model, model_path)\n",
"# The model was fitted on X_train.values, so the column order is stored separately.\n",
"joblib.dump(X_train.columns.tolist(), cols_path)\n",
"\n",
"print(\"\\nSaved:\")\n",
"print(\" - Model :\", model_path)\n",
"print(\" - Columns:\", cols_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "9cff151d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Preview:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ma_dia_diem</th>\n",
" <th>so_luong_thuc_te</th>\n",
" <th>so_luong_du_doan_raw</th>\n",
" <th>so_luong_du_doan_round</th>\n",
" <th>abs_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>579-1</td>\n",
" <td>32.0</td>\n",
" <td>16.876830</td>\n",
" <td>17</td>\n",
" <td>15.123170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>114-1</td>\n",
" <td>29.0</td>\n",
" <td>16.242619</td>\n",
" <td>16</td>\n",
" <td>12.757381</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>121-3</td>\n",
" <td>13.0</td>\n",
" <td>5.239388</td>\n",
" <td>5</td>\n",
" <td>7.760612</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>227-1</td>\n",
" <td>1.0</td>\n",
" <td>7.994058</td>\n",
" <td>8</td>\n",
" <td>6.994058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>55-1</td>\n",
" <td>14.0</td>\n",
" <td>7.210027</td>\n",
" <td>7</td>\n",
" <td>6.789973</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>55-1</td>\n",
" <td>12.0</td>\n",
" <td>5.706020</td>\n",
" <td>6</td>\n",
" <td>6.293980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>236-1</td>\n",
" <td>1.0</td>\n",
" <td>6.108304</td>\n",
" <td>6</td>\n",
" <td>5.108304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>236-1</td>\n",
" <td>1.0</td>\n",
" <td>5.948221</td>\n",
" <td>6</td>\n",
" <td>4.948221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>236-1</td>\n",
" <td>2.0</td>\n",
" <td>6.426550</td>\n",
" <td>6</td>\n",
" <td>4.426550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>121-4</td>\n",
" <td>11.0</td>\n",
" <td>6.758044</td>\n",
" <td>7</td>\n",
" <td>4.241956</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ma_dia_diem so_luong_thuc_te so_luong_du_doan_raw \\\n",
"34 579-1 32.0 16.876830 \n",
"0 114-1 29.0 16.242619 \n",
"14 121-3 13.0 5.239388 \n",
"65 227-1 1.0 7.994058 \n",
"28 55-1 14.0 7.210027 \n",
"29 55-1 12.0 5.706020 \n",
"60 236-1 1.0 6.108304 \n",
"61 236-1 1.0 5.948221 \n",
"56 236-1 2.0 6.426550 \n",
"17 121-4 11.0 6.758044 \n",
"\n",
" so_luong_du_doan_round abs_error \n",
"34 17 15.123170 \n",
"0 16 12.757381 \n",
"14 5 7.760612 \n",
"65 8 6.994058 \n",
"28 7 6.789973 \n",
"29 6 6.293980 \n",
"60 6 5.108304 \n",
"61 6 4.948221 \n",
"56 6 4.426550 \n",
"17 7 4.241956 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved CSV to: ./outputs\\val_predictions_extratrees.csv\n",
"Rows: 68\n"
]
}
],
"source": [
"# CELL 8 — EXPORT VAL PREDICTIONS TO CSV (FOR ANALYSIS / BUSINESS REVIEW)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"\n",
"# =========================\n",
"# 1) Recompute the holdout predictions inside this cell\n",
"# =========================\n",
"# Back-transform from log1p space and floor at zero.\n",
"pred_val_raw = np.maximum(0, np.expm1(final_model.predict(X_val.values)))\n",
"pred_val_round = np.rint(pred_val_raw).astype(int)\n",
"\n",
"# =========================\n",
"# 2) One row per validation sample\n",
"# =========================\n",
"# NOTE(review): assumes val_idx lists rows in the same order as X_val / y_val — confirm.\n",
"val_result = pd.DataFrame({\n",
"    \"ma_dia_diem\": df.loc[val_idx, \"ma_dia_diem\"].values,\n",
"    \"so_luong_thuc_te\": y_val.values,\n",
"    \"so_luong_du_doan_raw\": pred_val_raw,\n",
"    \"so_luong_du_doan_round\": pred_val_round,\n",
"})\n",
"val_result[\"abs_error\"] = (\n",
"    val_result[\"so_luong_thuc_te\"] - val_result[\"so_luong_du_doan_raw\"]\n",
").abs()\n",
"\n",
"# Largest misses first, so the worst cases are reviewed first.\n",
"val_result = val_result.sort_values(\"abs_error\", ascending=False)\n",
"\n",
"print(\"Preview:\")\n",
"display(val_result.head(10))\n",
"\n",
"# =========================\n",
"# 3) Write the table to CSV\n",
"# =========================\n",
"OUTPUT_DIR = \"./outputs\"\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"csv_path = os.path.join(OUTPUT_DIR, \"val_predictions_extratrees.csv\")\n",
"val_result.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"\\nSaved CSV to:\", csv_path)\n",
"print(\"Rows:\", len(val_result))\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "8cb3cde1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== MODEL SAVED SUCCESSFULLY ===\n",
"Model file : ./artifacts\\extratrees_staff_model.joblib\n",
"Columns file : ./artifacts\\X_proc_columns.joblib\n",
"Meta file : ./artifacts\\model_meta.joblib\n"
]
}
],
"source": [
"# CELL 9 — SAVE TRAINED MODEL + FEATURE SCHEMA + METADATA\n",
"\n",
"import os\n",
"import joblib\n",
"\n",
"# =========================\n",
"# 1) Artifact directory and file paths\n",
"# =========================\n",
"ARTIFACT_DIR = \"./artifacts\"\n",
"os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
"\n",
"MODEL_PATH = os.path.join(ARTIFACT_DIR, \"extratrees_staff_model.joblib\")\n",
"COLUMNS_PATH = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
"META_PATH = os.path.join(ARTIFACT_DIR, \"model_meta.joblib\")\n",
"\n",
"# =========================\n",
"# 2) Metadata describing the saved model\n",
"# =========================\n",
"meta = dict(\n",
"    model_type=\"ExtraTreesRegressor\",\n",
"    target=\"so_luong\",\n",
"    target_transform=\"log1p -> expm1\",\n",
"    train_size=len(X_train),\n",
"    val_size=len(X_val),\n",
"    features=X_train.shape[1],\n",
"    note=\"Predict staff headcount per shift\",\n",
")\n",
"\n",
"# =========================\n",
"# 3) Write model, feature schema and metadata\n",
"# =========================\n",
"joblib.dump(final_model, MODEL_PATH)\n",
"# Column names in training order; the saved model itself was fitted on X_train.values.\n",
"joblib.dump(list(X_train.columns), COLUMNS_PATH)\n",
"joblib.dump(meta, META_PATH)\n",
"\n",
"print(\"=== MODEL SAVED SUCCESSFULLY ===\")\n",
"print(\"Model file :\", MODEL_PATH)\n",
"print(\"Columns file :\", COLUMNS_PATH)\n",
"print(\"Meta file :\", META_PATH)\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "0eab135b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks',\n",
" 'num_trash_collection_tasks', 'num_monitoring_tasks',\n",
" 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks',\n",
" 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks',\n",
" 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio',\n",
" 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity',\n",
" 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh',\n",
" 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong',\n",
" 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong',\n",
" 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
" 'num_medical_tasks_total', 'num_indoor_room_tasks', 'hour_start',\n",
" 'hour_end', 'shift_length', 'is_cross_day', 'loai_ca_24/24',\n",
" 'loai_ca_Ca chiều', 'loai_ca_Ca gãy', 'loai_ca_Ca sáng',\n",
" 'loai_ca_Ca đêm', 'loai_ca_Hành chính', 'loai_ca_Part time',\n",
" 'loai_ca_nan'],\n",
" dtype='object')"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_val.columns"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "1dd44caa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
" 'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
" 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
" 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
" 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
" 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
" 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
" 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
" 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
" 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
" 'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
" 'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
" 'is_tasks_text_missing'],\n",
" dtype='object')"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "markdown",
"id": "a7036167",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "d8dbd670",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "3ebac85b",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "22866fc4",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}