{
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"id": "e1667110",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded: final_2.xlsx | sheet: final\n",
"Shape (raw): (401, 42)\n",
"Shape (after dedup): (394, 42)\n",
"\n",
"=== TARGET SUMMARY (so_luong) ===\n",
"count 394.000000\n",
"mean 4.710660\n",
"std 6.848602\n",
"min 0.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 64.000000\n",
"Name: so_luong, dtype: float64\n",
"Missing target: 0\n",
"Negative target: 0\n",
"Zero target: 3\n",
"\n",
"Sample rows:\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ma_dia_diem | \n",
" all_task_normal | \n",
" all_task_dinhky | \n",
" loai_ca | \n",
" bat_dau | \n",
" ket_thuc | \n",
" tong_gio_lam | \n",
" so_ca_cua_toa | \n",
" so_luong | \n",
" num_tasks | \n",
" ... | \n",
" dien_tich_tham | \n",
" doc_ham | \n",
" vien_phan_quang | \n",
" op_tuong | \n",
" op_chan_tuong | \n",
" ranh_thoat_nuoc | \n",
" dien_tich_kinh | \n",
" num_medical_tasks_total | \n",
" num_indoor_room_tasks | \n",
" is_tasks_text_missing | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 115-2 | \n",
" Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... | \n",
" NaN | \n",
" Part time | \n",
" 06:30:00 | \n",
" 10:30:00 | \n",
" 4.0 | \n",
" 1 | \n",
" 1 | \n",
" 7 | \n",
" ... | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0 | \n",
" 20.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 101-1 | \n",
" Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... | \n",
" Lau bảng biển, bình cứu hỏa , cây nước hành la... | \n",
" Hành chính | \n",
" 06:30:00 | \n",
" 16:00:00 | \n",
" 7.5 | \n",
" 6 | \n",
" 24 | \n",
" 441 | \n",
" ... | \n",
" 0.0 | \n",
" 70 | \n",
" 0 | \n",
" 9176.0 | \n",
" 89.0 | \n",
" 25 | \n",
" 894.0 | \n",
" 112 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 101-1 | \n",
" Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... | \n",
" Lau bảng biển, bình cứu hỏa , cây nước hành la... | \n",
" Ca sáng | \n",
" 06:00:00 | \n",
" 14:00:00 | \n",
" 8.0 | \n",
" 6 | \n",
" 3 | \n",
" 441 | \n",
" ... | \n",
" 0.0 | \n",
" 70 | \n",
" 0 | \n",
" 9176.0 | \n",
" 89.0 | \n",
" 25 | \n",
" 894.0 | \n",
" 112 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 42 columns
\n",
"
"
],
"text/plain": [
" ma_dia_diem all_task_normal \\\n",
"0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n",
"1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n",
"\n",
" all_task_dinhky loai_ca bat_dau \\\n",
"0 NaN Part time 06:30:00 \n",
"1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n",
"2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n",
"\n",
" ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n",
"0 10:30:00 4.0 1 1 7 ... \n",
"1 16:00:00 7.5 6 24 441 ... \n",
"2 14:00:00 8.0 6 3 441 ... \n",
"\n",
" dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n",
"0 0.0 0 0 0.0 0.0 \n",
"1 0.0 70 0 9176.0 89.0 \n",
"2 0.0 70 0 9176.0 89.0 \n",
"\n",
" ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n",
"0 0 20.0 0 \n",
"1 25 894.0 112 \n",
"2 25 894.0 112 \n",
"\n",
" num_indoor_room_tasks is_tasks_text_missing \n",
"0 1 0 \n",
"1 39 0 \n",
"2 39 0 \n",
"\n",
"[3 rows x 42 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 1 — LOAD DATA & BASIC CLEAN\n",
"\n",
"import pandas as pd\n",
"\n",
"DATA_PATH = \"final_2.xlsx\"\n",
"SHEET_NAME = \"final\"\n",
"\n",
"# 1. Load\n",
"df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
"print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
"print(\"Shape (raw):\", df.shape)\n",
"\n",
"# 2. Drop duplicate full rows\n",
"df = df.drop_duplicates().reset_index(drop=True)\n",
"print(\"Shape (after dedup):\", df.shape)\n",
"\n",
"# 3. Check target\n",
"assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
"\n",
"print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
"print(df[\"so_luong\"].describe())\n",
"print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n",
"print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n",
"print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n",
"\n",
"# 4. Peek data\n",
"print(\"\\nSample rows:\")\n",
"display(df.head(3))\n"
]
},
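{
"cell_type": "code",
"execution_count": null,
"id": "ad0e11aa",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — ROWS PER LOCATION (added sketch)\n",
"# The group-aware splits below key on ma_dia_diem, so it helps to see how\n",
"# many shift rows each location contributes. Assumes df from CELL 1.\n",
"\n",
"counts = df[\"ma_dia_diem\"].value_counts()\n",
"print(\"Unique locations:\", counts.size)\n",
"print(counts.describe())\n"
]
},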
{
"cell_type": "code",
"execution_count": 25,
"id": "5601efad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All columns:\n",
" 0: ma_dia_diem\n",
" 1: all_task_normal\n",
" 2: all_task_dinhky\n",
" 3: loai_ca\n",
" 4: bat_dau\n",
" 5: ket_thuc\n",
" 6: tong_gio_lam\n",
" 7: so_ca_cua_toa\n",
" 8: so_luong\n",
" 9: num_tasks\n",
"10: num_cleaning_tasks\n",
"11: num_trash_collection_tasks\n",
"12: num_monitoring_tasks\n",
"13: num_deep_cleaning_tasks\n",
"14: num_support_tasks\n",
"15: num_other_tasks\n",
"16: num_wc_tasks\n",
"17: num_hallway_tasks\n",
"18: num_lobby_tasks\n",
"19: num_outdoor_tasks\n",
"20: num_elevator_tasks\n",
"21: cleaning_ratio\n",
"22: trash_collection_ratio\n",
"23: monitoring_ratio\n",
"24: area_diversity\n",
"25: so_tang\n",
"26: so_cua_thang_may\n",
"27: dien_tich_ngoai_canh\n",
"28: dien_tich_sanh\n",
"29: dien_tich_hanh_lang\n",
"30: dien_tich_wc\n",
"31: dien_tich_phong\n",
"32: dien_tich_tham\n",
"33: doc_ham\n",
"34: vien_phan_quang\n",
"35: op_tuong\n",
"36: op_chan_tuong\n",
"37: ranh_thoat_nuoc\n",
"38: dien_tich_kinh\n",
"39: num_medical_tasks_total\n",
"40: num_indoor_room_tasks\n",
"41: is_tasks_text_missing\n",
"\n",
"Dropped columns:\n",
" - ma_dia_diem\n",
" - all_task_normal\n",
" - all_task_dinhky\n",
" - is_tasks_text_missing\n",
"\n",
"Shapes:\n",
"X: (394, 37)\n",
"y: (394,)\n",
"\n",
"Feature dtypes:\n"
]
},
{
"data": {
"text/plain": [
"int64 21\n",
"float64 13\n",
"object 3\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Missing values in X:\n"
]
},
{
"data": {
"text/plain": [
"loai_ca 0\n",
"bat_dau 0\n",
"ket_thuc 0\n",
"tong_gio_lam 0\n",
"so_ca_cua_toa 0\n",
"num_tasks 0\n",
"num_cleaning_tasks 0\n",
"num_trash_collection_tasks 0\n",
"num_monitoring_tasks 0\n",
"num_deep_cleaning_tasks 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 2 — FEATURE SELECTION (STRICT)\n",
"\n",
"# 1. Xem toàn bộ cột\n",
"print(\"All columns:\")\n",
"for i, c in enumerate(df.columns):\n",
" print(f\"{i:2d}: {c}\")\n",
"\n",
"# 2. Xác định cột cần loại bỏ (THEO THỎA THUẬN)\n",
"DROP_COLS = [\n",
" df.columns[0], # ma_dia_diem\n",
" df.columns[1], # all_task_normal\n",
" df.columns[2], # all_task_dinhky\n",
" df.columns[-1], # is_tasks_text_missing\n",
"]\n",
"\n",
"print(\"\\nDropped columns:\")\n",
"for c in DROP_COLS:\n",
" print(\" -\", c)\n",
"\n",
"# 3. Tạo X, y\n",
"X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
"y = df[\"so_luong\"].astype(float)\n",
"\n",
"print(\"\\nShapes:\")\n",
"print(\"X:\", X.shape)\n",
"print(\"y:\", y.shape)\n",
"\n",
"# 4. Kiểm tra kiểu dữ liệu\n",
"print(\"\\nFeature dtypes:\")\n",
"display(X.dtypes.value_counts())\n",
"\n",
"# 5. Kiểm tra missing\n",
"print(\"\\nMissing values in X:\")\n",
"display(X.isna().sum().sort_values(ascending=False).head(10))\n"
]
},
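{
"cell_type": "code",
"execution_count": null,
"id": "b2e4d8c1",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — LIST THE REMAINING NON-NUMERIC COLUMNS (added sketch)\n",
"# The dtype summary above reports 3 object columns; listing them confirms\n",
"# they are exactly the ones CELL 3 handles (loai_ca, bat_dau, ket_thuc).\n",
"\n",
"obj_cols = X.select_dtypes(include=\"object\").columns.tolist()\n",
"print(\"Object columns:\", obj_cols)\n"
]
},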
{
"cell_type": "code",
"execution_count": 26,
"id": "bb467e4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categorical columns: ['loai_ca']\n",
"\n",
"After preprocess:\n",
"X_proc shape: (394, 46)\n",
"Any non-numeric dtypes? False\n",
"\n",
"Sample columns (first 30):\n",
"['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong', 'op_chan_tuong']\n"
]
}
],
"source": [
"# CELL 3 — PREPROCESS (TIME + CATEGORICAL) WITHOUT JUNK\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# ---------- 1) Time parsing ----------\n",
"def time_to_hour(x):\n",
" if pd.isna(x):\n",
" return np.nan\n",
"\n",
" # datetime/time object\n",
" if hasattr(x, \"hour\"):\n",
" return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
"\n",
" s = str(x).strip()\n",
" # \"YYYY-MM-DD HH:MM:SS\"\n",
" if \" \" in s and \":\" in s:\n",
" s = s.split(\" \", 1)[1].strip()\n",
"\n",
" # \"HH:MM\" or \"HH:MM:SS\"\n",
" if \":\" in s:\n",
" parts = s.split(\":\")\n",
" try:\n",
" h = float(parts[0])\n",
" m = float(parts[1]) if len(parts) > 1 else 0.0\n",
" return h + m/60.0\n",
" except:\n",
" return np.nan\n",
"\n",
" # numeric fallback\n",
" try:\n",
" return float(s)\n",
" except:\n",
" return np.nan\n",
"\n",
"# Create new numeric time features (do NOT one-hot time)\n",
"X_proc = X.copy()\n",
"\n",
"if \"bat_dau\" in X_proc.columns:\n",
" X_proc[\"hour_start\"] = X_proc[\"bat_dau\"].apply(time_to_hour)\n",
"if \"ket_thuc\" in X_proc.columns:\n",
" X_proc[\"hour_end\"] = X_proc[\"ket_thuc\"].apply(time_to_hour)\n",
"\n",
"# shift_length + cross day\n",
"if (\"hour_start\" in X_proc.columns) and (\"hour_end\" in X_proc.columns):\n",
" end_adj = X_proc[\"hour_end\"].copy()\n",
" cross = (X_proc[\"hour_start\"].notna()) & (X_proc[\"hour_end\"].notna()) & (X_proc[\"hour_end\"] < X_proc[\"hour_start\"])\n",
" end_adj[cross] = end_adj[cross] + 24.0\n",
"\n",
" X_proc[\"shift_length\"] = (end_adj - X_proc[\"hour_start\"]).clip(lower=0)\n",
" X_proc[\"is_cross_day\"] = cross.astype(int)\n",
"\n",
"# Drop raw time cols to avoid junk\n",
"for c in [\"bat_dau\", \"ket_thuc\"]:\n",
" if c in X_proc.columns:\n",
" X_proc = X_proc.drop(columns=[c])\n",
"\n",
"# ---------- 2) One-hot categorical ----------\n",
"cat_cols = [c for c in X_proc.columns if X_proc[c].dtype == \"object\"]\n",
"print(\"Categorical columns:\", cat_cols)\n",
"\n",
"X_proc = pd.get_dummies(X_proc, columns=cat_cols, dummy_na=True)\n",
"\n",
"# ---------- 3) Fill missing ----------\n",
"X_proc = X_proc.replace([np.inf, -np.inf], np.nan).fillna(0)\n",
"\n",
"print(\"\\nAfter preprocess:\")\n",
"print(\"X_proc shape:\", X_proc.shape)\n",
"print(\"Any non-numeric dtypes?\", any(dt == \"object\" for dt in X_proc.dtypes))\n",
"\n",
"print(\"\\nSample columns (first 30):\")\n",
"print(list(X_proc.columns[:30]))\n"
]
},
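{
"cell_type": "code",
"execution_count": null,
"id": "c3b5a7f2",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — QUICK SANITY CHECK FOR time_to_hour (added sketch)\n",
"# A minimal check that the parser above behaves as intended. The sample\n",
"# inputs are hypothetical, not rows from final_2.xlsx.\n",
"\n",
"import datetime\n",
"\n",
"samples = [\n",
"    \"06:30:00\",            # plain HH:MM:SS string -> 6.5\n",
"    \"2024-01-01 22:15\",    # datetime-like string -> time part -> 22.25\n",
"    datetime.time(8, 45),  # time object -> 8.75\n",
"    7.5,                   # numeric fallback -> 7.5\n",
"    None,                  # missing -> NaN\n",
"]\n",
"for s in samples:\n",
"    print(repr(s), \"->\", time_to_hour(s))\n"
]
},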
{
"cell_type": "code",
"execution_count": 27,
"id": "3cd119b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"Train: (326, 46) (326,)\n",
"Val : (68, 46) (68,)\n",
"\n",
"ElasticNet(alpha=0.01, l1_ratio=0.5)\n",
" Train | MAE=2.754 | RMSE=5.200 | R2=0.458\n",
" Val | MAE=2.420 | RMSE=4.792 | R2=0.259\n",
"\n",
"DecisionTree(max_depth=8, min_samples_leaf=5)\n",
" Train | MAE=2.187 | RMSE=5.313 | R2=0.434\n",
" Val | MAE=2.312 | RMSE=4.149 | R2=0.445\n",
"\n",
"RandomForest(n_estimators=600, min_samples_leaf=3)\n",
" Train | MAE=1.894 | RMSE=4.820 | R2=0.535\n",
" Val | MAE=2.402 | RMSE=4.467 | R2=0.356\n",
"\n",
"ExtraTrees(n_estimators=800, min_samples_leaf=2)\n",
" Train | MAE=1.095 | RMSE=3.320 | R2=0.779\n",
" Val | MAE=1.968 | RMSE=3.461 | R2=0.614\n",
"\n",
"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\n",
" Train | MAE=1.990 | RMSE=4.871 | R2=0.525\n",
" Val | MAE=2.406 | RMSE=4.599 | R2=0.318\n",
"\n",
"=== HOLDOUT VAL MAE summary (lower is better) ===\n",
"ExtraTrees : Val MAE = 1.968\n",
"DecisionTree: Val MAE = 2.312\n",
"RandomForest: Val MAE = 2.402\n",
"HGBR : Val MAE = 2.406\n",
"ElasticNet : Val MAE = 2.420\n"
]
}
],
"source": [
"# CELL 4 (EXTENDED) — ADD TREE/ENSEMBLE MODELS TO HOLDOUT VAL\n",
"\n",
"import numpy as np\n",
"from sklearn.model_selection import GroupShuffleSplit\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# ---------- 1) Group split (80% train, 20% val) ----------\n",
"groups = df[\"ma_dia_diem\"].astype(str)\n",
"\n",
"gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)\n",
"train_idx, val_idx = next(gss.split(X_proc, y, groups))\n",
"\n",
"X_train = X_proc.iloc[train_idx]\n",
"y_train = y.iloc[train_idx]\n",
"\n",
"X_val = X_proc.iloc[val_idx]\n",
"y_val = y.iloc[val_idx]\n",
"\n",
"print(\"Shapes:\")\n",
"print(\"Train:\", X_train.shape, y_train.shape)\n",
"print(\"Val :\", X_val.shape, y_val.shape)\n",
"\n",
"# ---------- 2) Scale features (for linear models) ----------\n",
"scaler = StandardScaler()\n",
"X_train_s = scaler.fit_transform(X_train)\n",
"X_val_s = scaler.transform(X_val)\n",
"\n",
"# ---------- 2b) No-scale matrices (for tree models) ----------\n",
"X_train_ns = X_train.values\n",
"X_val_ns = X_val.values\n",
"\n",
"# ---------- 3) Log-transform target ----------\n",
"y_train_log = np.log1p(y_train)\n",
"y_val_log = np.log1p(y_val)\n",
"\n",
"# ---------- 4) Evaluation helper (support scale/no-scale) ----------\n",
"def eval_reg_any(name, model, use_scaled_X=True):\n",
" Xtr = X_train_s if use_scaled_X else X_train_ns\n",
" Xva = X_val_s if use_scaled_X else X_val_ns\n",
"\n",
" model.fit(Xtr, y_train_log)\n",
"\n",
" pred_train = np.maximum(0, np.expm1(model.predict(Xtr)))\n",
" pred_val = np.maximum(0, np.expm1(model.predict(Xva)))\n",
"\n",
" def _m(y_true, y_pred):\n",
" return (\n",
" mean_absolute_error(y_true, y_pred),\n",
" mean_squared_error(y_true, y_pred) ** 0.5,\n",
" r2_score(y_true, y_pred),\n",
" )\n",
"\n",
" tr = _m(y_train, pred_train)\n",
" va = _m(y_val, pred_val)\n",
"\n",
" print(f\"\\n{name}\")\n",
" print(f\" Train | MAE={tr[0]:.3f} | RMSE={tr[1]:.3f} | R2={tr[2]:.3f}\")\n",
" print(f\" Val | MAE={va[0]:.3f} | RMSE={va[1]:.3f} | R2={va[2]:.3f}\")\n",
"\n",
" return va[0]\n",
"\n",
"# ---------- 5) Train baselines + tree/ensemble ----------\n",
"results = {}\n",
"\n",
"# Linear (scaled)\n",
"# results[\"Linear\"] = eval_reg_any(\"LinearRegression\", LinearRegression(), use_scaled_X=True)\n",
"# results[\"Ridge\"] = eval_reg_any(\"Ridge(alpha=1.0)\", Ridge(alpha=1.0), use_scaled_X=True)\n",
"# results[\"Lasso\"] = eval_reg_any(\"Lasso(alpha=0.01)\", Lasso(alpha=0.01, max_iter=5000), use_scaled_X=True)\n",
"results[\"ElasticNet\"] = eval_reg_any(\n",
" \"ElasticNet(alpha=0.01, l1_ratio=0.5)\",\n",
" ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
" use_scaled_X=True\n",
")\n",
"\n",
"# Tree / Ensemble (no scale)\n",
"results[\"DecisionTree\"] = eval_reg_any(\n",
" \"DecisionTree(max_depth=8, min_samples_leaf=5)\",\n",
" DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"RandomForest\"] = eval_reg_any(\n",
" \"RandomForest(n_estimators=600, min_samples_leaf=3)\",\n",
" RandomForestRegressor(\n",
" n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"ExtraTrees\"] = eval_reg_any(\n",
" \"ExtraTrees(n_estimators=800, min_samples_leaf=2)\",\n",
" ExtraTreesRegressor(\n",
" n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"results[\"HGBR\"] = eval_reg_any(\n",
" \"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\",\n",
" HistGradientBoostingRegressor(\n",
" learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
" ),\n",
" use_scaled_X=False\n",
")\n",
"\n",
"print(\"\\n=== HOLDOUT VAL MAE summary (lower is better) ===\")\n",
"for k, v in sorted(results.items(), key=lambda x: x[1]):\n",
" print(f\"{k:12s}: Val MAE = {v:.3f}\")\n"
]
},
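{
"cell_type": "code",
"execution_count": null,
"id": "d4c6b8e3",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — NAIVE BASELINE FOR CONTEXT (added sketch)\n",
"# A minimal reference point, assuming y_train/y_val from CELL 4 are in\n",
"# scope: predict the TRAIN median for every validation row. Any model\n",
"# worth keeping should beat this Val MAE.\n",
"\n",
"import numpy as np\n",
"from sklearn.metrics import mean_absolute_error\n",
"\n",
"baseline_pred = np.full(len(y_val), np.median(y_train))\n",
"print(f\"Median baseline | Val MAE = {mean_absolute_error(y_val, baseline_pred):.3f}\")\n"
]
},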
{
"cell_type": "code",
"execution_count": 28,
"id": "106e557f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train for CV: (326, 46) (326,) | unique groups: 153\n",
"\n",
"=== ExtraTrees GroupKFold (TRAIN ONLY) ===\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MAE | \n",
" RMSE | \n",
" R2 | \n",
" fold | \n",
" n_val | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3.677199 | \n",
" 7.263821 | \n",
" 0.048081 | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3.673333 | \n",
" 6.600086 | \n",
" 0.104487 | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3.092924 | \n",
" 8.986669 | \n",
" 0.087366 | \n",
" 3 | \n",
" 65 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2.100641 | \n",
" 3.499245 | \n",
" 0.291277 | \n",
" 4 | \n",
" 65 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2.550497 | \n",
" 4.739900 | \n",
" 0.407946 | \n",
" 5 | \n",
" 65 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MAE RMSE R2 fold n_val\n",
"0 3.677199 7.263821 0.048081 1 66\n",
"1 3.673333 6.600086 0.104487 2 65\n",
"2 3.092924 8.986669 0.087366 3 65\n",
"3 2.100641 3.499245 0.291277 4 65\n",
"4 2.550497 4.739900 0.407946 5 65"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== CV SUMMARY ===\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MAE | \n",
" RMSE | \n",
" R2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | mean | \n",
" 3.018919 | \n",
" 6.217944 | \n",
" 0.187832 | \n",
"
\n",
" \n",
" | std | \n",
" 0.694572 | \n",
" 2.149515 | \n",
" 0.154694 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MAE RMSE R2\n",
"mean 3.018919 6.217944 0.187832\n",
"std 0.694572 2.149515 0.154694"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Worst fold (highest MAE):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MAE | \n",
" RMSE | \n",
" R2 | \n",
" fold | \n",
" n_val | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3.677199 | \n",
" 7.263821 | \n",
" 0.048081 | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MAE RMSE R2 fold n_val\n",
"0 3.677199 7.263821 0.048081 1 66"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 5 — GROUP K-FOLD CV (TRAIN ONLY) FOR EXTRA TREES (NO LEAKAGE by ma_dia_diem)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.base import clone\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# =========================\n",
"# 1) Prepare TRAIN data + groups\n",
"# =========================\n",
"X_tr = X_train.values # no scaling for tree\n",
"y_tr = y_train.values.astype(float)\n",
"g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n",
"\n",
"print(\"Train for CV:\", X_tr.shape, y_tr.shape, \"| unique groups:\", len(np.unique(g_tr)))\n",
"\n",
"# =========================\n",
"# 2) Metric helper (evaluate on original scale)\n",
"# =========================\n",
"def metrics(y_true, y_pred):\n",
" y_pred = np.maximum(0, y_pred)\n",
" return {\n",
" \"MAE\": mean_absolute_error(y_true, y_pred),\n",
" \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
" \"R2\": r2_score(y_true, y_pred),\n",
" }\n",
"\n",
"# =========================\n",
"# 3) ExtraTrees config (same as your holdout baseline)\n",
"# =========================\n",
"base_model = ExtraTreesRegressor(\n",
" n_estimators=800,\n",
" min_samples_leaf=2,\n",
" random_state=42,\n",
" n_jobs=-1\n",
")\n",
"\n",
"# =========================\n",
"# 4) GroupKFold CV (fit log1p(y) -> expm1(pred))\n",
"# =========================\n",
"gkf = GroupKFold(n_splits=5)\n",
"\n",
"rows = []\n",
"for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_tr, y_tr, groups=g_tr), start=1):\n",
" model = clone(base_model)\n",
"\n",
" Xtr_f, Xva_f = X_tr[tr_idx], X_tr[va_idx]\n",
" ytr_f, yva_f = y_tr[tr_idx], y_tr[va_idx]\n",
"\n",
" model.fit(Xtr_f, np.log1p(ytr_f))\n",
" pred_va = np.expm1(model.predict(Xva_f))\n",
" pred_va = np.maximum(0, pred_va)\n",
"\n",
" m = metrics(yva_f, pred_va)\n",
" m[\"fold\"] = fold\n",
" m[\"n_val\"] = len(va_idx)\n",
" rows.append(m)\n",
"\n",
"cv_df = pd.DataFrame(rows)\n",
"\n",
"print(\"\\n=== ExtraTrees GroupKFold (TRAIN ONLY) ===\")\n",
"display(cv_df)\n",
"\n",
"summary = cv_df[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
"print(\"\\n=== CV SUMMARY ===\")\n",
"display(summary)\n",
"\n",
"best_worst = cv_df.sort_values(\"MAE\", ascending=False)\n",
"print(\"\\nWorst fold (highest MAE):\")\n",
"display(best_worst.head(1))\n"
]
},
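{
"cell_type": "code",
"execution_count": null,
"id": "e5d7c9f4",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — COMPACT CV EQUIVALENT (added sketch)\n",
"# A near-equivalent of the manual GroupKFold loop above, using sklearn's\n",
"# TransformedTargetRegressor + cross_validate. Note: no max(0, .) clipping\n",
"# here, so scores can differ marginally from the loop. Assumes X_tr, y_tr,\n",
"# g_tr from CELL 5 are in scope.\n",
"\n",
"import numpy as np\n",
"from sklearn.compose import TransformedTargetRegressor\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.model_selection import cross_validate, GroupKFold\n",
"\n",
"tt_model = TransformedTargetRegressor(\n",
"    regressor=ExtraTreesRegressor(\n",
"        n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
"    ),\n",
"    func=np.log1p,\n",
"    inverse_func=np.expm1,\n",
")\n",
"\n",
"cv_res = cross_validate(\n",
"    tt_model, X_tr, y_tr,\n",
"    groups=g_tr,\n",
"    cv=GroupKFold(n_splits=5),\n",
"    scoring=\"neg_mean_absolute_error\",\n",
")\n",
"mae_folds = -cv_res[\"test_score\"]\n",
"print(f\"ExtraTrees (compact CV) | MAE = {mae_folds.mean():.3f} ± {mae_folds.std():.3f}\")\n"
]
},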
{
"cell_type": "code",
"execution_count": 5,
"id": "ee66e389",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear | MAE = 4.971 ± 2.813\n",
"Lasso_0.01 | MAE = 7.110 ± 7.917\n",
"ElasticNet_0.01 | MAE = 6.386 ± 6.240\n",
"\n",
"=== CV SUMMARY ===\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mean | \n",
" std | \n",
" min | \n",
" max | \n",
"
\n",
" \n",
" | model | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | Linear | \n",
" 4.970868 | \n",
" 3.145106 | \n",
" 2.279954 | \n",
" 10.321992 | \n",
"
\n",
" \n",
" | ElasticNet_0.01 | \n",
" 6.386372 | \n",
" 6.976535 | \n",
" 2.104239 | \n",
" 18.765868 | \n",
"
\n",
" \n",
" | Lasso_0.01 | \n",
" 7.110267 | \n",
" 8.851455 | \n",
" 2.113510 | \n",
" 22.876806 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mean std min max\n",
"model \n",
"Linear 4.970868 3.145106 2.279954 10.321992\n",
"ElasticNet_0.01 6.386372 6.976535 2.104239 18.765868\n",
"Lasso_0.01 7.110267 8.851455 2.113510 22.876806"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 5 — GROUP K-FOLD CONFIRMATION (TOP LINEAR MODELS)\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, Lasso, ElasticNet\n",
"from sklearn.metrics import mean_absolute_error\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"models = {\n",
" \"Linear\": LinearRegression(),\n",
" \"Lasso_0.01\": Lasso(alpha=0.01, max_iter=5000),\n",
" \"ElasticNet_0.01\": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
"}\n",
"\n",
"groups = df[\"ma_dia_diem\"].astype(str)\n",
"gkf = GroupKFold(n_splits=5)\n",
"\n",
"rows = []\n",
"\n",
"for name, model in models.items():\n",
" maes = []\n",
" for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_proc, y, groups)):\n",
" X_tr, X_va = X_proc.iloc[tr_idx], X_proc.iloc[va_idx]\n",
" y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]\n",
"\n",
" scaler = StandardScaler()\n",
" X_tr_s = scaler.fit_transform(X_tr)\n",
" X_va_s = scaler.transform(X_va)\n",
"\n",
" model.fit(X_tr_s, np.log1p(y_tr))\n",
" pred_va = np.maximum(0, np.expm1(model.predict(X_va_s)))\n",
"\n",
" mae = mean_absolute_error(y_va, pred_va)\n",
" maes.append(mae)\n",
"\n",
" rows.append({\n",
" \"model\": name,\n",
" \"fold\": fold,\n",
" \"MAE\": mae,\n",
" \"n_val\": len(va_idx),\n",
" })\n",
"\n",
" print(f\"{name:12s} | MAE = {np.mean(maes):.3f} ± {np.std(maes):.3f}\")\n",
"\n",
"cv_df = pd.DataFrame(rows)\n",
"\n",
"print(\"\\n=== CV SUMMARY ===\")\n",
"display(\n",
" cv_df.groupby(\"model\")[\"MAE\"]\n",
" .agg([\"mean\", \"std\", \"min\", \"max\"])\n",
" .sort_values(\"mean\")\n",
")\n"
]
},
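{
"cell_type": "code",
"execution_count": null,
"id": "f6e8d0a5",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — PIPELINE EQUIVALENT OF PER-FOLD SCALING (added sketch)\n",
"# The loop above refits StandardScaler inside each fold, which is the\n",
"# leakage-safe pattern; a Pipeline expresses the same thing declaratively.\n",
"# No max(0, .) clipping here, so fold scores can differ slightly.\n",
"\n",
"import numpy as np\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import ElasticNet\n",
"from sklearn.compose import TransformedTargetRegressor\n",
"from sklearn.model_selection import cross_validate, GroupKFold\n",
"\n",
"pipe = TransformedTargetRegressor(\n",
"    regressor=Pipeline([\n",
"        (\"scaler\", StandardScaler()),\n",
"        (\"model\", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000)),\n",
"    ]),\n",
"    func=np.log1p,\n",
"    inverse_func=np.expm1,\n",
")\n",
"\n",
"res = cross_validate(\n",
"    pipe, X_proc, y, groups=groups,\n",
"    cv=GroupKFold(n_splits=5), scoring=\"neg_mean_absolute_error\",\n",
")\n",
"print(f\"ElasticNet (pipeline CV) | MAE = {(-res['test_score']).mean():.3f}\")\n"
]
},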
{
"cell_type": "code",
"execution_count": 29,
"id": "73a31e6e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" model | \n",
" MAE_mean | \n",
" MAE_std | \n",
" RMSE_mean | \n",
" R2_mean | \n",
"
\n",
" \n",
" \n",
" \n",
" | 4 | \n",
" ExtraTrees | \n",
" 3.018919 | \n",
" 0.694572 | \n",
" 6.217944 | \n",
" 0.187832 | \n",
"
\n",
" \n",
" | 3 | \n",
" RandomForest | \n",
" 3.052084 | \n",
" 0.715409 | \n",
" 6.243409 | \n",
" 0.187735 | \n",
"
\n",
" \n",
" | 5 | \n",
" HGBR | \n",
" 3.105344 | \n",
" 0.673739 | \n",
" 6.344961 | \n",
" 0.156316 | \n",
"
\n",
" \n",
" | 2 | \n",
" DecisionTree | \n",
" 3.681985 | \n",
" 0.668149 | \n",
" 6.861043 | \n",
" -0.017019 | \n",
"
\n",
" \n",
" | 0 | \n",
" Lasso | \n",
" 4.366601 | \n",
" 1.709182 | \n",
" 10.584715 | \n",
" -2.703761 | \n",
"
\n",
" \n",
" | 1 | \n",
" ElasticNet | \n",
" 4.918275 | \n",
" 2.414753 | \n",
" 13.602627 | \n",
" -15.883303 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" model MAE_mean MAE_std RMSE_mean R2_mean\n",
"4 ExtraTrees 3.018919 0.694572 6.217944 0.187832\n",
"3 RandomForest 3.052084 0.715409 6.243409 0.187735\n",
"5 HGBR 3.105344 0.673739 6.344961 0.156316\n",
"2 DecisionTree 3.681985 0.668149 6.861043 -0.017019\n",
"0 Lasso 4.366601 1.709182 10.584715 -2.703761\n",
"1 ElasticNet 4.918275 2.414753 13.602627 -15.883303"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Per-fold details (worst MAE first) ===\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MAE | \n",
" RMSE | \n",
" R2 | \n",
" model | \n",
" fold | \n",
" n_val | \n",
"
\n",
" \n",
" \n",
" \n",
" | 8 | \n",
" 8.972213 | \n",
" 37.201319 | \n",
" -79.102177 | \n",
" ElasticNet | \n",
" 4 | \n",
" 65 | \n",
"
\n",
" \n",
" | 0 | \n",
" 7.342893 | \n",
" 19.028144 | \n",
" -5.532246 | \n",
" Lasso | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5.347237 | \n",
" 9.838825 | \n",
" -0.746450 | \n",
" ElasticNet | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 10 | \n",
" 4.347409 | \n",
" 7.407078 | \n",
" 0.010163 | \n",
" DecisionTree | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.288895 | \n",
" 12.729106 | \n",
" -8.378285 | \n",
" Lasso | \n",
" 4 | \n",
" 65 | \n",
"
\n",
" \n",
" | 12 | \n",
" 4.144338 | \n",
" 9.678611 | \n",
" -0.058584 | \n",
" DecisionTree | \n",
" 3 | \n",
" 65 | \n",
"
\n",
" \n",
" | 11 | \n",
" 3.772958 | \n",
" 7.109340 | \n",
" -0.039038 | \n",
" DecisionTree | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 16 | \n",
" 3.742208 | \n",
" 6.724452 | \n",
" 0.070421 | \n",
" RandomForest | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 25 | \n",
" 3.721214 | \n",
" 7.376454 | \n",
" 0.018331 | \n",
" HGBR | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 26 | \n",
" 3.684392 | \n",
" 6.736473 | \n",
" 0.067094 | \n",
" HGBR | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 20 | \n",
" 3.677199 | \n",
" 7.263821 | \n",
" 0.048081 | \n",
" ExtraTrees | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 21 | \n",
" 3.673333 | \n",
" 6.600086 | \n",
" 0.104487 | \n",
" ExtraTrees | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 15 | \n",
" 3.557926 | \n",
" 7.090878 | \n",
" 0.092870 | \n",
" RandomForest | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3.511097 | \n",
" 6.344130 | \n",
" 0.172598 | \n",
" Lasso | \n",
" 2 | \n",
" 65 | \n",
"
\n",
" \n",
" | 14 | \n",
" 3.507451 | \n",
" 5.627415 | \n",
" 0.165473 | \n",
" DecisionTree | \n",
" 5 | \n",
" 65 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MAE RMSE R2 model fold n_val\n",
"8 8.972213 37.201319 -79.102177 ElasticNet 4 65\n",
"0 7.342893 19.028144 -5.532246 Lasso 1 66\n",
"5 5.347237 9.838825 -0.746450 ElasticNet 1 66\n",
"10 4.347409 7.407078 0.010163 DecisionTree 1 66\n",
"3 4.288895 12.729106 -8.378285 Lasso 4 65\n",
"12 4.144338 9.678611 -0.058584 DecisionTree 3 65\n",
"11 3.772958 7.109340 -0.039038 DecisionTree 2 65\n",
"16 3.742208 6.724452 0.070421 RandomForest 2 65\n",
"25 3.721214 7.376454 0.018331 HGBR 1 66\n",
"26 3.684392 6.736473 0.067094 HGBR 2 65\n",
"20 3.677199 7.263821 0.048081 ExtraTrees 1 66\n",
"21 3.673333 6.600086 0.104487 ExtraTrees 2 65\n",
"15 3.557926 7.090878 0.092870 RandomForest 1 66\n",
"1 3.511097 6.344130 0.172598 Lasso 2 65\n",
"14 3.507451 5.627415 0.165473 DecisionTree 5 65"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# CELL 6 — GROUP K-FOLD CV: TRY MULTIPLE MODELS (FAIR COMPARISON)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"from sklearn.base import clone\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"from sklearn.linear_model import Lasso, ElasticNet\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
"\n",
"# =========================\n",
"# 1) Prepare TRAIN data + groups\n",
"# =========================\n",
"X_tr_df = X_train.copy() # keep dataframe\n",
"y_tr = y_train.astype(float).values\n",
"g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n",
"\n",
"# =========================\n",
"# 2) Metric helper\n",
"# =========================\n",
"def metrics(y_true, y_pred):\n",
" y_pred = np.maximum(0, y_pred)\n",
" return {\n",
" \"MAE\": mean_absolute_error(y_true, y_pred),\n",
" \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
" \"R2\": r2_score(y_true, y_pred),\n",
" }\n",
"\n",
"def cv_eval_model(name, model, X_df, y, groups, n_splits=5, use_log=True):\n",
" gkf = GroupKFold(n_splits=n_splits)\n",
" rows = []\n",
"\n",
" for fold, (tr_i, va_i) in enumerate(gkf.split(X_df, y, groups=groups), start=1):\n",
" m = clone(model)\n",
"\n",
" Xtr = X_df.iloc[tr_i]\n",
" Xva = X_df.iloc[va_i]\n",
" ytr = y[tr_i]\n",
" yva = y[va_i]\n",
"\n",
" if use_log:\n",
" m.fit(Xtr, np.log1p(ytr))\n",
" pred = np.expm1(m.predict(Xva))\n",
" else:\n",
" m.fit(Xtr, ytr)\n",
" pred = m.predict(Xva)\n",
"\n",
" mm = metrics(yva, pred)\n",
" mm.update({\"model\": name, \"fold\": fold, \"n_val\": len(va_i)})\n",
" rows.append(mm)\n",
"\n",
" out = pd.DataFrame(rows)\n",
" summ = out[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
" return out, summ\n",
"\n",
"# =========================\n",
"# 3) Define candidates\n",
"# =========================\n",
"candidates = []\n",
"\n",
"# Linear (need scale)\n",
"candidates.append((\"Lasso\", Pipeline([\n",
" (\"scaler\", StandardScaler()),\n",
" (\"model\", Lasso(alpha=0.01, max_iter=5000, random_state=42))\n",
"]), True))\n",
"\n",
"candidates.append((\"ElasticNet\", Pipeline([\n",
" (\"scaler\", StandardScaler()),\n",
" (\"model\", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=8000, random_state=42))\n",
"]), True))\n",
"\n",
"# Tree/Ensemble (no scale needed)\n",
"candidates.append((\"DecisionTree\", DecisionTreeRegressor(\n",
" max_depth=8, min_samples_leaf=5, random_state=42\n",
"), True))\n",
"\n",
"candidates.append((\"RandomForest\", RandomForestRegressor(\n",
" n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n",
"), True))\n",
"\n",
"candidates.append((\"ExtraTrees\", ExtraTreesRegressor(\n",
" n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
"), True))\n",
"\n",
"candidates.append((\"HGBR\", HistGradientBoostingRegressor(\n",
" learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
"), True))\n",
"\n",
"# =========================\n",
"# 4) Run CV for all models\n",
"# =========================\n",
"all_fold = []\n",
"rows_lb = []\n",
"\n",
"for name, model, use_log in candidates:\n",
" fold_df, summ = cv_eval_model(name, model, X_tr_df, y_tr, g_tr, n_splits=5, use_log=use_log)\n",
" all_fold.append(fold_df)\n",
"\n",
" rows_lb.append({\n",
" \"model\": name,\n",
" \"MAE_mean\": summ.loc[\"mean\", \"MAE\"],\n",
" \"MAE_std\": summ.loc[\"std\", \"MAE\"],\n",
" \"RMSE_mean\": summ.loc[\"mean\", \"RMSE\"],\n",
" \"R2_mean\": summ.loc[\"mean\", \"R2\"],\n",
" })\n",
"\n",
"cv_all = pd.concat(all_fold, ignore_index=True)\n",
"leaderboard = pd.DataFrame(rows_lb).sort_values([\"MAE_mean\", \"RMSE_mean\"], ascending=True)\n",
"\n",
"print(\"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\")\n",
"display(leaderboard)\n",
"\n",
"print(\"\\n=== Per-fold details (worst MAE first) ===\")\n",
"display(cv_all.sort_values(\"MAE\", ascending=False).head(15))\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1d408c02",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\n",
"\n",
"[VAL (raw)]\n",
"MAE : 1.968\n",
"RMSE: 3.461\n",
"R2 : 0.614\n",
"\n",
"[VAL (rounded)]\n",
"MAE : 1.971\n",
"RMSE: 3.502\n",
"R2 : 0.604\n",
"\n",
"Worst 15 samples (by abs error):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" y_true | \n",
" y_pred | \n",
" y_pred_round | \n",
" abs_err | \n",
"
\n",
" \n",
" \n",
" \n",
" | 34 | \n",
" 32.0 | \n",
" 16.876830 | \n",
" 17 | \n",
" 15.123170 | \n",
"
\n",
" \n",
" | 0 | \n",
" 29.0 | \n",
" 16.242619 | \n",
" 16 | \n",
" 12.757381 | \n",
"
\n",
" \n",
" | 14 | \n",
" 13.0 | \n",
" 5.239388 | \n",
" 5 | \n",
" 7.760612 | \n",
"
\n",
" \n",
" | 65 | \n",
" 1.0 | \n",
" 7.994058 | \n",
" 8 | \n",
" 6.994058 | \n",
"
\n",
" \n",
" | 28 | \n",
" 14.0 | \n",
" 7.210027 | \n",
" 7 | \n",
" 6.789973 | \n",
"
\n",
" \n",
" | 29 | \n",
" 12.0 | \n",
" 5.706020 | \n",
" 6 | \n",
" 6.293980 | \n",
"
\n",
" \n",
" | 60 | \n",
" 1.0 | \n",
" 6.108304 | \n",
" 6 | \n",
" 5.108304 | \n",
"
\n",
" \n",
" | 61 | \n",
" 1.0 | \n",
" 5.948221 | \n",
" 6 | \n",
" 4.948221 | \n",
"
\n",
" \n",
" | 56 | \n",
" 2.0 | \n",
" 6.426550 | \n",
" 6 | \n",
" 4.426550 | \n",
"
\n",
" \n",
" | 17 | \n",
" 11.0 | \n",
" 6.758044 | \n",
" 7 | \n",
" 4.241956 | \n",
"
\n",
" \n",
" | 58 | \n",
" 10.0 | \n",
" 6.354210 | \n",
" 6 | \n",
" 3.645790 | \n",
"
\n",
" \n",
" | 10 | \n",
" 6.0 | \n",
" 2.356056 | \n",
" 2 | \n",
" 3.643944 | \n",
"
\n",
" \n",
" | 23 | \n",
" 7.0 | \n",
" 3.442200 | \n",
" 3 | \n",
" 3.557800 | \n",
"
\n",
" \n",
" | 20 | \n",
" 2.0 | \n",
" 5.504394 | \n",
" 6 | \n",
" 3.504394 | \n",
"
\n",
" \n",
" | 62 | \n",
" 3.0 | \n",
" 6.087934 | \n",
" 6 | \n",
" 3.087934 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" y_true y_pred y_pred_round abs_err\n",
"34 32.0 16.876830 17 15.123170\n",
"0 29.0 16.242619 16 12.757381\n",
"14 13.0 5.239388 5 7.760612\n",
"65 1.0 7.994058 8 6.994058\n",
"28 14.0 7.210027 7 6.789973\n",
"29 12.0 5.706020 6 6.293980\n",
"60 1.0 6.108304 6 5.108304\n",
"61 1.0 5.948221 6 4.948221\n",
"56 2.0 6.426550 6 4.426550\n",
"17 11.0 6.758044 7 4.241956\n",
"58 10.0 6.354210 6 3.645790\n",
"10 6.0 2.356056 2 3.643944\n",
"23 7.0 3.442200 3 3.557800\n",
"20 2.0 5.504394 6 3.504394\n",
"62 3.0 6.087934 6 3.087934"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved:\n",
" - Model : ./artifacts\\extratrees_log1p.joblib\n",
" - Columns: ./artifacts\\X_proc_columns.joblib\n"
]
}
],
"source": [
"# CELL 7 — FINALIZE BEST MODEL (ExtraTrees) + EVAL ON HOLDOUT VAL + SAVE ARTIFACTS\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"import joblib\n",
"\n",
"# =========================\n",
"# 1) Train final ExtraTrees on FULL TRAIN\n",
"# =========================\n",
"final_model = ExtraTreesRegressor(\n",
" n_estimators=800,\n",
" min_samples_leaf=2,\n",
" random_state=42,\n",
" n_jobs=-1\n",
")\n",
"\n",
"final_model.fit(X_train.values, np.log1p(y_train.values))\n",
"\n",
"# =========================\n",
"# 2) Predict on HOLDOUT VAL\n",
"# =========================\n",
"pred_val = np.expm1(final_model.predict(X_val.values))\n",
"pred_val = np.maximum(0, pred_val)\n",
"\n",
"# optional: round to headcount integer\n",
"pred_val_round = np.rint(pred_val).astype(int)\n",
"\n",
"# =========================\n",
"# 3) Metrics (raw vs rounded)\n",
"# =========================\n",
"def print_metrics(tag, y_true, y_pred):\n",
" mae = mean_absolute_error(y_true, y_pred)\n",
" rmse = mean_squared_error(y_true, y_pred) ** 0.5\n",
" r2 = r2_score(y_true, y_pred)\n",
" print(f\"\\n[{tag}]\")\n",
" print(f\"MAE : {mae:.3f}\")\n",
" print(f\"RMSE: {rmse:.3f}\")\n",
" print(f\"R2 : {r2:.3f}\")\n",
" return {\"MAE\": mae, \"RMSE\": rmse, \"R2\": r2}\n",
"\n",
"print(\"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\")\n",
"m_raw = print_metrics(\"VAL (raw)\", y_val.values, pred_val)\n",
"m_int = print_metrics(\"VAL (rounded)\", y_val.values, pred_val_round)\n",
"\n",
"# =========================\n",
"# 4) Quick error analysis\n",
"# =========================\n",
"err_df = pd.DataFrame({\n",
" \"y_true\": y_val.values,\n",
" \"y_pred\": pred_val,\n",
" \"y_pred_round\": pred_val_round,\n",
" \"abs_err\": np.abs(y_val.values - pred_val),\n",
"})\n",
"print(\"\\nWorst 15 samples (by abs error):\")\n",
"display(err_df.sort_values(\"abs_err\", ascending=False).head(15))\n",
"\n",
"# =========================\n",
"# 5) Save model + schema (columns)\n",
"# =========================\n",
"ARTIFACT_DIR = \"./artifacts\"\n",
"import os\n",
"os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
"\n",
"model_path = os.path.join(ARTIFACT_DIR, \"extratrees_log1p.joblib\")\n",
"cols_path = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
"\n",
"joblib.dump(final_model, model_path)\n",
"joblib.dump(list(X_train.columns), cols_path)\n",
"\n",
"print(\"\\nSaved:\")\n",
"print(\" - Model :\", model_path)\n",
"print(\" - Columns:\", cols_path)\n"
]
},
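{
"cell_type": "code",
"execution_count": null,
"id": "a7f9e1b6",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — FEATURE IMPORTANCES OF THE FINAL MODEL (added sketch)\n",
"# Impurity-based importances from the fitted ExtraTrees. Quick to read,\n",
"# but known to favor high-cardinality/continuous features, so treat the\n",
"# ranking as a rough guide rather than ground truth.\n",
"\n",
"import pandas as pd\n",
"\n",
"imp = pd.Series(final_model.feature_importances_, index=X_train.columns)\n",
"display(imp.sort_values(ascending=False).head(15))\n"
]
},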
{
"cell_type": "code",
"execution_count": 31,
"id": "9cff151d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Preview:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ma_dia_diem | \n",
" so_luong_thuc_te | \n",
" so_luong_du_doan_raw | \n",
" so_luong_du_doan_round | \n",
" abs_error | \n",
"
\n",
" \n",
" \n",
" \n",
" | 34 | \n",
" 579-1 | \n",
" 32.0 | \n",
" 16.876830 | \n",
" 17 | \n",
" 15.123170 | \n",
"
\n",
" \n",
" | 0 | \n",
" 114-1 | \n",
" 29.0 | \n",
" 16.242619 | \n",
" 16 | \n",
" 12.757381 | \n",
"
\n",
" \n",
" | 14 | \n",
" 121-3 | \n",
" 13.0 | \n",
" 5.239388 | \n",
" 5 | \n",
" 7.760612 | \n",
"
\n",
" \n",
" | 65 | \n",
" 227-1 | \n",
" 1.0 | \n",
" 7.994058 | \n",
" 8 | \n",
" 6.994058 | \n",
"
\n",
" \n",
" | 28 | \n",
" 55-1 | \n",
" 14.0 | \n",
" 7.210027 | \n",
" 7 | \n",
" 6.789973 | \n",
"
\n",
" \n",
" | 29 | \n",
" 55-1 | \n",
" 12.0 | \n",
" 5.706020 | \n",
" 6 | \n",
" 6.293980 | \n",
"
\n",
" \n",
" | 60 | \n",
" 236-1 | \n",
" 1.0 | \n",
" 6.108304 | \n",
" 6 | \n",
" 5.108304 | \n",
"
\n",
" \n",
" | 61 | \n",
" 236-1 | \n",
" 1.0 | \n",
" 5.948221 | \n",
" 6 | \n",
" 4.948221 | \n",
"
\n",
" \n",
" | 56 | \n",
" 236-1 | \n",
" 2.0 | \n",
" 6.426550 | \n",
" 6 | \n",
" 4.426550 | \n",
"
\n",
" \n",
" | 17 | \n",
" 121-4 | \n",
" 11.0 | \n",
" 6.758044 | \n",
" 7 | \n",
" 4.241956 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ma_dia_diem so_luong_thuc_te so_luong_du_doan_raw \\\n",
"34 579-1 32.0 16.876830 \n",
"0 114-1 29.0 16.242619 \n",
"14 121-3 13.0 5.239388 \n",
"65 227-1 1.0 7.994058 \n",
"28 55-1 14.0 7.210027 \n",
"29 55-1 12.0 5.706020 \n",
"60 236-1 1.0 6.108304 \n",
"61 236-1 1.0 5.948221 \n",
"56 236-1 2.0 6.426550 \n",
"17 121-4 11.0 6.758044 \n",
"\n",
" so_luong_du_doan_round abs_error \n",
"34 17 15.123170 \n",
"0 16 12.757381 \n",
"14 5 7.760612 \n",
"65 8 6.994058 \n",
"28 7 6.789973 \n",
"29 6 6.293980 \n",
"60 6 5.108304 \n",
"61 6 4.948221 \n",
"56 6 4.426550 \n",
"17 7 4.241956 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved CSV to: ./outputs\\val_predictions_extratrees.csv\n",
"Rows: 68\n"
]
}
],
"source": [
"# CELL 8 — EXPORT VAL PREDICTIONS TO CSV (FOR ANALYSIS / BUSINESS REVIEW)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"\n",
"# =========================\n",
"# 1) Recompute predictions (safety, explicit)\n",
"# =========================\n",
"pred_val_raw = np.expm1(final_model.predict(X_val.values))\n",
"pred_val_raw = np.maximum(0, pred_val_raw)\n",
"\n",
"pred_val_round = np.rint(pred_val_raw).astype(int)\n",
"\n",
"# =========================\n",
"# 2) Build result DataFrame\n",
"# =========================\n",
"val_result = pd.DataFrame({\n",
" \"ma_dia_diem\": df.loc[val_idx, \"ma_dia_diem\"].values,\n",
" \"so_luong_thuc_te\": y_val.values,\n",
" \"so_luong_du_doan_raw\": pred_val_raw,\n",
" \"so_luong_du_doan_round\": pred_val_round,\n",
" \"abs_error\": np.abs(y_val.values - pred_val_raw),\n",
"})\n",
"\n",
"# (optional) sort by error to review bad cases first\n",
"val_result = val_result.sort_values(\"abs_error\", ascending=False)\n",
"\n",
"print(\"Preview:\")\n",
"display(val_result.head(10))\n",
"\n",
"# =========================\n",
"# 3) Save to CSV\n",
"# =========================\n",
"OUTPUT_DIR = \"./outputs\"\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"csv_path = os.path.join(OUTPUT_DIR, \"val_predictions_extratrees.csv\")\n",
"val_result.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(f\"\\nSaved CSV to: {csv_path}\")\n",
"print(f\"Rows: {len(val_result)}\")\n"
]
},
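{
"cell_type": "code",
"execution_count": null,
"id": "b8a0f2c7",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — ERROR BY LOCATION (added sketch)\n",
"# Aggregates the exported per-row errors by ma_dia_diem to spot locations\n",
"# that are systematically mis-predicted. Assumes val_result from the\n",
"# export cell above is in scope.\n",
"\n",
"by_loc = (\n",
"    val_result.groupby(\"ma_dia_diem\")[\"abs_error\"]\n",
"    .agg([\"mean\", \"max\", \"count\"])\n",
"    .sort_values(\"mean\", ascending=False)\n",
")\n",
"display(by_loc.head(10))\n"
]
},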
{
"cell_type": "code",
"execution_count": 32,
"id": "8cb3cde1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== MODEL SAVED SUCCESSFULLY ===\n",
"Model file : ./artifacts\\extratrees_staff_model.joblib\n",
"Columns file : ./artifacts\\X_proc_columns.joblib\n",
"Meta file : ./artifacts\\model_meta.joblib\n"
]
}
],
"source": [
"# CELL 8 — SAVE TRAINED MODEL (NO PREDICT YET)\n",
"\n",
"import os\n",
"import joblib\n",
"\n",
"# =========================\n",
"# 1) Create artifact directory\n",
"# =========================\n",
"ARTIFACT_DIR = \"./artifacts\"\n",
"os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
"\n",
"# =========================\n",
"# 2) Define paths\n",
"# =========================\n",
"MODEL_PATH = os.path.join(ARTIFACT_DIR, \"extratrees_staff_model.joblib\")\n",
"COLUMNS_PATH = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
"META_PATH = os.path.join(ARTIFACT_DIR, \"model_meta.joblib\")\n",
"\n",
"# =========================\n",
"# 3) Save model\n",
"# =========================\n",
"joblib.dump(final_model, MODEL_PATH)\n",
"\n",
"# =========================\n",
"# 4) Save feature schema (VERY IMPORTANT)\n",
"# =========================\n",
"joblib.dump(list(X_train.columns), COLUMNS_PATH)\n",
"\n",
"# =========================\n",
"# 5) Save metadata (optional but professional)\n",
"# =========================\n",
"meta = {\n",
" \"model_type\": \"ExtraTreesRegressor\",\n",
" \"target\": \"so_luong\",\n",
" \"target_transform\": \"log1p -> expm1\",\n",
" \"train_size\": len(X_train),\n",
" \"val_size\": len(X_val),\n",
" \"features\": X_train.shape[1],\n",
" \"note\": \"Predict staff headcount per shift\",\n",
"}\n",
"\n",
"joblib.dump(meta, META_PATH)\n",
"\n",
"print(\"=== MODEL SAVED SUCCESSFULLY ===\")\n",
"print(\"Model file :\", MODEL_PATH)\n",
"print(\"Columns file :\", COLUMNS_PATH)\n",
"print(\"Meta file :\", META_PATH)\n"
]
},
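{
"cell_type": "code",
"execution_count": null,
"id": "c9b1a3d8",
"metadata": {},
"outputs": [],
"source": [
"# EXTRA — RELOAD ARTIFACTS & PREDICT (added sketch)\n",
"# Shows the intended inference path for the files saved above. The reindex\n",
"# step aligns any freshly one-hot-encoded frame to the training schema\n",
"# (missing dummy columns become 0, unseen ones are dropped). X_val stands\n",
"# in for new data here.\n",
"\n",
"import joblib\n",
"import numpy as np\n",
"\n",
"loaded_model = joblib.load(MODEL_PATH)\n",
"loaded_cols = joblib.load(COLUMNS_PATH)\n",
"\n",
"X_new = X_val.reindex(columns=loaded_cols, fill_value=0)\n",
"pred_new = np.maximum(0, np.expm1(loaded_model.predict(X_new.values)))\n",
"print(\"Predicted headcount (first 5):\", np.rint(pred_new[:5]).astype(int))\n"
]
},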
{
"cell_type": "code",
"execution_count": 33,
"id": "0eab135b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks',\n",
" 'num_trash_collection_tasks', 'num_monitoring_tasks',\n",
" 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks',\n",
" 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks',\n",
" 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio',\n",
" 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity',\n",
" 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh',\n",
" 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong',\n",
" 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong',\n",
" 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
" 'num_medical_tasks_total', 'num_indoor_room_tasks', 'hour_start',\n",
" 'hour_end', 'shift_length', 'is_cross_day', 'loai_ca_24/24',\n",
" 'loai_ca_Ca chiều', 'loai_ca_Ca gãy', 'loai_ca_Ca sáng',\n",
" 'loai_ca_Ca đêm', 'loai_ca_Hành chính', 'loai_ca_Part time',\n",
" 'loai_ca_nan'],\n",
" dtype='object')"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_val.columns"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "1dd44caa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
" 'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
" 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
" 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
" 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
" 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
" 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
" 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
" 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
" 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
" 'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
" 'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
" 'is_tasks_text_missing'],\n",
" dtype='object')"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}