{ "cells": [ { "cell_type": "code", "execution_count": 24, "id": "e1667110", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded: final_2.xlsx | sheet: final\n", "Shape (raw): (401, 42)\n", "Shape (after dedup): (394, 42)\n", "\n", "=== TARGET SUMMARY (so_luong) ===\n", "count 394.000000\n", "mean 4.710660\n", "std 6.848602\n", "min 0.000000\n", "25% 1.000000\n", "50% 2.000000\n", "75% 5.000000\n", "max 64.000000\n", "Name: so_luong, dtype: float64\n", "Missing target: 0\n", "Negative target: 0\n", "Zero target: 3\n", "\n", "Sample rows:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ma_dia_diemall_task_normalall_task_dinhkyloai_cabat_dauket_thuctong_gio_lamso_ca_cua_toaso_luongnum_tasks...dien_tich_thamdoc_hamvien_phan_quangop_tuongop_chan_tuongranh_thoat_nuocdien_tich_kinhnum_medical_tasks_totalnum_indoor_room_tasksis_tasks_text_missing
0115-2Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...NaNPart time06:30:0010:30:004.0117...0.0000.00.0020.0010
1101-1Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...Lau bảng biển, bình cứu hỏa , cây nước hành la...Hành chính06:30:0016:00:007.5624441...0.07009176.089.025894.0112390
2101-1Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...Lau bảng biển, bình cứu hỏa , cây nước hành la...Ca sáng06:00:0014:00:008.063441...0.07009176.089.025894.0112390
\n", "

3 rows × 42 columns

\n", "
" ], "text/plain": [ " ma_dia_diem all_task_normal \\\n", "0 115-2 Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả... \n", "1 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n", "2 101-1 Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,... \n", "\n", " all_task_dinhky loai_ca bat_dau \\\n", "0 NaN Part time 06:30:00 \n", "1 Lau bảng biển, bình cứu hỏa , cây nước hành la... Hành chính 06:30:00 \n", "2 Lau bảng biển, bình cứu hỏa , cây nước hành la... Ca sáng 06:00:00 \n", "\n", " ket_thuc tong_gio_lam so_ca_cua_toa so_luong num_tasks ... \\\n", "0 10:30:00 4.0 1 1 7 ... \n", "1 16:00:00 7.5 6 24 441 ... \n", "2 14:00:00 8.0 6 3 441 ... \n", "\n", " dien_tich_tham doc_ham vien_phan_quang op_tuong op_chan_tuong \\\n", "0 0.0 0 0 0.0 0.0 \n", "1 0.0 70 0 9176.0 89.0 \n", "2 0.0 70 0 9176.0 89.0 \n", "\n", " ranh_thoat_nuoc dien_tich_kinh num_medical_tasks_total \\\n", "0 0 20.0 0 \n", "1 25 894.0 112 \n", "2 25 894.0 112 \n", "\n", " num_indoor_room_tasks is_tasks_text_missing \n", "0 1 0 \n", "1 39 0 \n", "2 39 0 \n", "\n", "[3 rows x 42 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# CELL 1 — LOAD DATA & BASIC CLEAN\n", "\n", "import pandas as pd\n", "\n", "DATA_PATH = \"final_2.xlsx\"\n", "SHEET_NAME = \"final\"\n", "\n", "# 1. Load\n", "df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n", "print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n", "print(\"Shape (raw):\", df.shape)\n", "\n", "# 2. Drop duplicate full rows\n", "df = df.drop_duplicates().reset_index(drop=True)\n", "print(\"Shape (after dedup):\", df.shape)\n", "\n", "# 3. Check target\n", "assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n", "\n", "print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n", "print(df[\"so_luong\"].describe())\n", "print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n", "print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n", "print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n", "\n", "# 4. 
print(\"\\nSample rows:\")\n", "display(df.head(3))\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "5601efad", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "All columns:\n", " 0: ma_dia_diem\n", " 1: all_task_normal\n", " 2: all_task_dinhky\n", " 3: loai_ca\n", " 4: bat_dau\n", " 5: ket_thuc\n", " 6: tong_gio_lam\n", " 7: so_ca_cua_toa\n", " 8: so_luong\n", " 9: num_tasks\n", "10: num_cleaning_tasks\n", "11: num_trash_collection_tasks\n", "12: num_monitoring_tasks\n", "13: num_deep_cleaning_tasks\n", "14: num_support_tasks\n", "15: num_other_tasks\n", "16: num_wc_tasks\n", "17: num_hallway_tasks\n", "18: num_lobby_tasks\n", "19: num_outdoor_tasks\n", "20: num_elevator_tasks\n", "21: cleaning_ratio\n", "22: trash_collection_ratio\n", "23: monitoring_ratio\n", "24: area_diversity\n", "25: so_tang\n", "26: so_cua_thang_may\n", "27: dien_tich_ngoai_canh\n", "28: dien_tich_sanh\n", "29: dien_tich_hanh_lang\n", "30: dien_tich_wc\n", "31: dien_tich_phong\n", "32: dien_tich_tham\n", "33: doc_ham\n", "34: vien_phan_quang\n", "35: op_tuong\n", "36: op_chan_tuong\n", "37: ranh_thoat_nuoc\n", "38: dien_tich_kinh\n", "39: num_medical_tasks_total\n", "40: num_indoor_room_tasks\n", "41: is_tasks_text_missing\n", "\n", "Dropped columns:\n", " - ma_dia_diem\n", " - all_task_normal\n", " - all_task_dinhky\n", " - is_tasks_text_missing\n", "\n", "Shapes:\n", "X: (394, 37)\n", "y: (394,)\n", "\n", "Feature dtypes:\n" ] }, { "data": { "text/plain": [ "int64 21\n", "float64 13\n", "object 3\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Missing values in X:\n" ] }, { "data": { "text/plain": [ "loai_ca 0\n", "bat_dau 0\n", "ket_thuc 0\n", "tong_gio_lam 0\n", "so_ca_cua_toa 0\n", "num_tasks 0\n", "num_cleaning_tasks 0\n", "num_trash_collection_tasks 0\n", "num_monitoring_tasks 0\n", "num_deep_cleaning_tasks 0\n", "dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# CELL 2 — FEATURE SELECTION (STRICT)\n", "\n", "# 1. View all columns\n", "print(\"All columns:\")\n", "for i, c in enumerate(df.columns):\n", " print(f\"{i:2d}: {c}\")\n", "\n", "# 2. Identify columns to drop (AS AGREED)\n", "DROP_COLS = [\n", " df.columns[0], # ma_dia_diem\n", " df.columns[1], # all_task_normal\n", " df.columns[2], # all_task_dinhky\n", " df.columns[-1], # is_tasks_text_missing\n", "]\n", "\n", "print(\"\\nDropped columns:\")\n", "for c in DROP_COLS:\n", " print(\" -\", c)\n", "\n", "# 3. Build X, y\n", "X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n", "y = df[\"so_luong\"].astype(float)\n", "\n", "print(\"\\nShapes:\")\n", "print(\"X:\", X.shape)\n", "print(\"y:\", y.shape)\n", "\n", "# 4. Check dtypes\n", "print(\"\\nFeature dtypes:\")\n", "display(X.dtypes.value_counts())\n", "\n", "# 5. Check missing values\n", "print(\"\\nMissing values in X:\")\n", "display(X.isna().sum().sort_values(ascending=False).head(10))\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "bb467e4c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Categorical columns: ['loai_ca']\n", "\n", "After preprocess:\n", "X_proc shape: (394, 46)\n", "Any non-numeric dtypes? False\n", "\n", "Sample columns (first 30):\n", "['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong', 'op_chan_tuong']\n" ] } ], "source": [ "# CELL 3 — PREPROCESS (TIME + CATEGORICAL) WITHOUT JUNK\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# ---------- 1) Time parsing ----------\n", "def time_to_hour(x):\n", " if pd.isna(x):\n", " return np.nan\n", "\n", " # datetime/time object\n", " if hasattr(x, \"hour\"):\n", " return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n", "\n", " s = str(x).strip()\n", " # \"YYYY-MM-DD HH:MM:SS\"\n", " if \" \" in s and \":\" in s:\n", " s = s.split(\" \", 1)[1].strip()\n", "\n", " # \"HH:MM\" or \"HH:MM:SS\"\n", " if \":\" in s:\n", " parts = s.split(\":\")\n", " try:\n", " h = float(parts[0])\n", " m = float(parts[1]) if len(parts) > 1 else 0.0\n", " return h + m/60.0\n", " except (ValueError, TypeError):\n", " return np.nan\n", "\n", " # numeric fallback\n", " try:\n", " return float(s)\n", " except (ValueError, TypeError):\n", " return np.nan\n", "\n",
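"# Worked example (illustrative): time_to_hour(\"06:30:00\") -> 6.5 and time_to_hour(\"22:00\") -> 22.0;\n", "# for a 22:00 -> 06:00 night shift below, hour_end gets +24, so shift_length = 8.0, is_cross_day = 1.\n", "\n",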
"# Create new numeric time features (do NOT one-hot time)\n", "X_proc = X.copy()\n", "\n", "if \"bat_dau\" in X_proc.columns:\n", " X_proc[\"hour_start\"] = X_proc[\"bat_dau\"].apply(time_to_hour)\n", "if \"ket_thuc\" in X_proc.columns:\n", " X_proc[\"hour_end\"] = X_proc[\"ket_thuc\"].apply(time_to_hour)\n", "\n", "# shift_length + cross day\n", "if (\"hour_start\" in X_proc.columns) and (\"hour_end\" in X_proc.columns):\n", " end_adj = X_proc[\"hour_end\"].copy()\n", " cross = (X_proc[\"hour_start\"].notna()) & (X_proc[\"hour_end\"].notna()) & (X_proc[\"hour_end\"] < X_proc[\"hour_start\"])\n", " end_adj[cross] = end_adj[cross] + 24.0\n", "\n", " X_proc[\"shift_length\"] = (end_adj - X_proc[\"hour_start\"]).clip(lower=0)\n", " X_proc[\"is_cross_day\"] = cross.astype(int)\n", "\n", "# Drop raw time cols to avoid junk\n", "for c in [\"bat_dau\", \"ket_thuc\"]:\n", " if c in X_proc.columns:\n", " X_proc = X_proc.drop(columns=[c])\n", "\n", "# ---------- 2) One-hot categorical ----------\n", "cat_cols = [c for c in X_proc.columns if X_proc[c].dtype == \"object\"]\n", "print(\"Categorical columns:\", cat_cols)\n", "\n", "X_proc = pd.get_dummies(X_proc, columns=cat_cols, dummy_na=True)\n", "\n", "# ---------- 3) Fill missing ----------\n", "X_proc = X_proc.replace([np.inf, -np.inf], np.nan).fillna(0)\n", "\n", "print(\"\\nAfter preprocess:\")\n", "print(\"X_proc shape:\", X_proc.shape)\n", "print(\"Any non-numeric dtypes?\", any(dt == \"object\" for dt in X_proc.dtypes))\n", "\n", "print(\"\\nSample columns (first 30):\")\n", "print(list(X_proc.columns[:30]))\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "3cd119b7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shapes:\n", "Train: (326, 46) (326,)\n", "Val : (68, 46) (68,)\n", "\n", "ElasticNet(alpha=0.01, l1_ratio=0.5)\n", " Train | MAE=2.754 | RMSE=5.200 | R2=0.458\n", " Val | MAE=2.420 | RMSE=4.792 | R2=0.259\n", "\n", "DecisionTree(max_depth=8, min_samples_leaf=5)\n", " Train | MAE=2.187 | RMSE=5.313 | R2=0.434\n", " 
Val | MAE=2.312 | RMSE=4.149 | R2=0.445\n", "\n", "RandomForest(n_estimators=600, min_samples_leaf=3)\n", " Train | MAE=1.894 | RMSE=4.820 | R2=0.535\n", " Val | MAE=2.402 | RMSE=4.467 | R2=0.356\n", "\n", "ExtraTrees(n_estimators=800, min_samples_leaf=2)\n", " Train | MAE=1.095 | RMSE=3.320 | R2=0.779\n", " Val | MAE=1.968 | RMSE=3.461 | R2=0.614\n", "\n", "HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\n", " Train | MAE=1.990 | RMSE=4.871 | R2=0.525\n", " Val | MAE=2.406 | RMSE=4.599 | R2=0.318\n", "\n", "=== HOLDOUT VAL MAE summary (lower is better) ===\n", "ExtraTrees : Val MAE = 1.968\n", "DecisionTree: Val MAE = 2.312\n", "RandomForest: Val MAE = 2.402\n", "HGBR : Val MAE = 2.406\n", "ElasticNet : Val MAE = 2.420\n" ] } ], "source": [ "# CELL 4 (EXTENDED) — ADD TREE/ENSEMBLE MODELS TO HOLDOUT VAL\n", "\n", "import numpy as np\n", "from sklearn.model_selection import GroupShuffleSplit\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", "# ---------- 1) Group split (80% train, 20% val) ----------\n", "groups = df[\"ma_dia_diem\"].astype(str)\n", "\n", "gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)\n", "train_idx, val_idx = next(gss.split(X_proc, y, groups))\n", "\n", "X_train = X_proc.iloc[train_idx]\n", "y_train = y.iloc[train_idx]\n", "\n", "X_val = X_proc.iloc[val_idx]\n", "y_val = y.iloc[val_idx]\n", "\n", "print(\"Shapes:\")\n", "print(\"Train:\", X_train.shape, y_train.shape)\n", "print(\"Val :\", X_val.shape, y_val.shape)\n", "\n", "# ---------- 2) Scale features (for linear models) ----------\n", "scaler = StandardScaler()\n", "X_train_s = scaler.fit_transform(X_train)\n", "X_val_s = scaler.transform(X_val)\n", "\n", "# ---------- 2b) No-scale matrices (for tree models) ----------\n", "X_train_ns = X_train.values\n", "X_val_ns = X_val.values\n", "\n", "# ---------- 3) Log-transform target ----------\n", "y_train_log = np.log1p(y_train)\n", "y_val_log = np.log1p(y_val)\n", "\n", "# ---------- 4) Evaluation helper (support scale/no-scale) ----------\n", "def eval_reg_any(name, model, use_scaled_X=True):\n", " Xtr = X_train_s if use_scaled_X else X_train_ns\n", " Xva = X_val_s if use_scaled_X else X_val_ns\n", "\n", " model.fit(Xtr, y_train_log)\n", "\n", " pred_train = np.maximum(0, np.expm1(model.predict(Xtr)))\n", " pred_val = np.maximum(0, np.expm1(model.predict(Xva)))\n", "\n", " def _m(y_true, y_pred):\n", " return (\n", " mean_absolute_error(y_true, y_pred),\n", " mean_squared_error(y_true, y_pred) ** 0.5,\n", " r2_score(y_true, y_pred),\n", " )\n", "\n", " tr = _m(y_train, pred_train)\n", " va = _m(y_val, pred_val)\n", "\n", " print(f\"\\n{name}\")\n", " print(f\" Train | MAE={tr[0]:.3f} | RMSE={tr[1]:.3f} | R2={tr[2]:.3f}\")\n", " print(f\" Val | MAE={va[0]:.3f} | RMSE={va[1]:.3f} | R2={va[2]:.3f}\")\n", "\n", " return va[0]\n", "\n", "# ---------- 5) Train baselines + tree/ensemble ----------\n", "results = {}\n", "\n", "# Linear (scaled)\n", "# results[\"Linear\"] = eval_reg_any(\"LinearRegression\", LinearRegression(), use_scaled_X=True)\n", "# results[\"Ridge\"] = eval_reg_any(\"Ridge(alpha=1.0)\", Ridge(alpha=1.0), use_scaled_X=True)\n", "# results[\"Lasso\"] = eval_reg_any(\"Lasso(alpha=0.01)\", 
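Lasso(alpha=0.01, max_iter=5000), use_scaled_X=True)\n", "\n", "# Note: all models fit log1p(so_luong); predictions are mapped back with expm1 and\n", "# clipped at zero, so the MAE/RMSE/R2 below are on the original headcount scale.\n", "# To restore the Lasso baseline, uncomment:\n", "# results[\"Lasso\"] = eval_reg_any(\"Lasso(alpha=0.01)\", 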
Lasso(alpha=0.01, max_iter=5000), use_scaled_X=True)\n", "results[\"ElasticNet\"] = eval_reg_any(\n", " \"ElasticNet(alpha=0.01, l1_ratio=0.5)\",\n", " ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n", " use_scaled_X=True\n", ")\n", "\n", "# Tree / Ensemble (no scale)\n", "results[\"DecisionTree\"] = eval_reg_any(\n", " \"DecisionTree(max_depth=8, min_samples_leaf=5)\",\n", " DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n", " use_scaled_X=False\n", ")\n", "\n", "results[\"RandomForest\"] = eval_reg_any(\n", " \"RandomForest(n_estimators=600, min_samples_leaf=3)\",\n", " RandomForestRegressor(\n", " n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n", " ),\n", " use_scaled_X=False\n", ")\n", "\n", "results[\"ExtraTrees\"] = eval_reg_any(\n", " \"ExtraTrees(n_estimators=800, min_samples_leaf=2)\",\n", " ExtraTreesRegressor(\n", " n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n", " ),\n", " use_scaled_X=False\n", ")\n", "\n", "results[\"HGBR\"] = eval_reg_any(\n", " \"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\",\n", " HistGradientBoostingRegressor(\n", " learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n", " ),\n", " use_scaled_X=False\n", ")\n", "\n", "print(\"\\n=== HOLDOUT VAL MAE summary (lower is better) ===\")\n", "for k, v in sorted(results.items(), key=lambda x: x[1]):\n", " print(f\"{k:12s}: Val MAE = {v:.3f}\")\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "106e557f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train for CV: (326, 46) (326,) | unique groups: 153\n", "\n", "=== ExtraTrees GroupKFold (TRAIN ONLY) ===\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MAERMSER2foldn_val
03.6771997.2638210.048081166
13.6733336.6000860.104487265
23.0929248.9866690.087366365
32.1006413.4992450.291277465
42.5504974.7399000.407946565
\n", "
" ], "text/plain": [ " MAE RMSE R2 fold n_val\n", "0 3.677199 7.263821 0.048081 1 66\n", "1 3.673333 6.600086 0.104487 2 65\n", "2 3.092924 8.986669 0.087366 3 65\n", "3 2.100641 3.499245 0.291277 4 65\n", "4 2.550497 4.739900 0.407946 5 65" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "=== CV SUMMARY ===\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MAERMSER2
mean3.0189196.2179440.187832
std0.6945722.1495150.154694
\n", "
" ], "text/plain": [ " MAE RMSE R2\n", "mean 3.018919 6.217944 0.187832\n", "std 0.694572 2.149515 0.154694" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Worst fold (highest MAE):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MAERMSER2foldn_val
03.6771997.2638210.048081166
\n", "
" ], "text/plain": [ " MAE RMSE R2 fold n_val\n", "0 3.677199 7.263821 0.048081 1 66" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# CELL 5 — GROUP K-FOLD CV (TRAIN ONLY) FOR EXTRA TREES (NO LEAKAGE by ma_dia_diem)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.model_selection import GroupKFold\n", "from sklearn.ensemble import ExtraTreesRegressor\n", "from sklearn.base import clone\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", "# =========================\n", "# 1) Prepare TRAIN data + groups\n", "# =========================\n", "X_tr = X_train.values # no scaling for tree\n", "y_tr = y_train.values.astype(float)\n", "g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n", "\n", "print(\"Train for CV:\", X_tr.shape, y_tr.shape, \"| unique groups:\", len(np.unique(g_tr)))\n", "\n", "# =========================\n", "# 2) Metric helper (evaluate on original scale)\n", "# =========================\n", "def metrics(y_true, y_pred):\n", " y_pred = np.maximum(0, y_pred)\n", " return {\n", " \"MAE\": mean_absolute_error(y_true, y_pred),\n", " \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n", " \"R2\": r2_score(y_true, y_pred),\n", " }\n", "\n", "# =========================\n", "# 3) ExtraTrees config (same as your holdout baseline)\n", "# =========================\n", "base_model = ExtraTreesRegressor(\n", " n_estimators=800,\n", " min_samples_leaf=2,\n", " random_state=42,\n", " n_jobs=-1\n", ")\n", "\n", "# =========================\n", "# 4) GroupKFold CV (fit log1p(y) -> expm1(pred))\n", "# =========================\n", "gkf = GroupKFold(n_splits=5)\n", "\n", "rows = []\n", "for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_tr, y_tr, groups=g_tr), start=1):\n", " model = clone(base_model)\n", "\n", " Xtr_f, Xva_f = X_tr[tr_idx], X_tr[va_idx]\n", " ytr_f, yva_f = y_tr[tr_idx], y_tr[va_idx]\n", "\n", " model.fit(Xtr_f, np.log1p(ytr_f))\n", " pred_va = np.expm1(model.predict(Xva_f))\n", " pred_va = np.maximum(0, pred_va)\n", "\n", " m = metrics(yva_f, pred_va)\n", " m[\"fold\"] = fold\n", " m[\"n_val\"] = len(va_idx)\n", " rows.append(m)\n", "\n", "cv_df = pd.DataFrame(rows)\n", "\n", "print(\"\\n=== ExtraTrees GroupKFold (TRAIN ONLY) ===\")\n", "display(cv_df)\n", "\n", "summary = cv_df[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n", "print(\"\\n=== CV SUMMARY ===\")\n", "display(summary)\n", "\n", "best_worst = cv_df.sort_values(\"MAE\", ascending=False)\n", "print(\"\\nWorst fold (highest MAE):\")\n", "display(best_worst.head(1))\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "ee66e389", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Linear | MAE = 4.971 ± 2.813\n", "Lasso_0.01 | MAE = 7.110 ± 7.917\n", "ElasticNet_0.01 | MAE = 6.386 ± 6.240\n", "\n", "=== CV SUMMARY ===\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meanstdminmax
model
Linear4.9708683.1451062.27995410.321992
ElasticNet_0.016.3863726.9765352.10423918.765868
Lasso_0.017.1102678.8514552.11351022.876806
\n", "
" ], "text/plain": [ " mean std min max\n", "model \n", "Linear 4.970868 3.145106 2.279954 10.321992\n", "ElasticNet_0.01 6.386372 6.976535 2.104239 18.765868\n", "Lasso_0.01 7.110267 8.851455 2.113510 22.876806" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# CELL 5 — GROUP K-FOLD CONFIRMATION (TOP LINEAR MODELS)\n", "\n", "from sklearn.model_selection import GroupKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LinearRegression, Lasso, ElasticNet\n", "from sklearn.metrics import mean_absolute_error\n", "import numpy as np\n", "import pandas as pd\n", "\n", "models = {\n", " \"Linear\": LinearRegression(),\n", " \"Lasso_0.01\": Lasso(alpha=0.01, max_iter=5000),\n", " \"ElasticNet_0.01\": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n", "}\n", "\n", "groups = df[\"ma_dia_diem\"].astype(str)\n", "gkf = GroupKFold(n_splits=5)\n", "\n", "rows = []\n", "\n", "for name, model in models.items():\n", " maes = []\n", " for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_proc, y, groups)):\n", " X_tr, X_va = X_proc.iloc[tr_idx], X_proc.iloc[va_idx]\n", " y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]\n", "\n", " scaler = StandardScaler()\n", " X_tr_s = scaler.fit_transform(X_tr)\n", " X_va_s = scaler.transform(X_va)\n", "\n", " model.fit(X_tr_s, np.log1p(y_tr))\n", " pred_va = np.maximum(0, np.expm1(model.predict(X_va_s)))\n", "\n", " mae = mean_absolute_error(y_va, pred_va)\n", " maes.append(mae)\n", "\n", " rows.append({\n", " \"model\": name,\n", " \"fold\": fold,\n", " \"MAE\": mae,\n", " \"n_val\": len(va_idx),\n", " })\n", "\n", " print(f\"{name:12s} | MAE = {np.mean(maes):.3f} ± {np.std(maes):.3f}\")\n", "\n", "cv_df = pd.DataFrame(rows)\n", "\n", "print(\"\\n=== CV SUMMARY ===\")\n", "display(\n", " cv_df.groupby(\"model\")[\"MAE\"]\n", " .agg([\"mean\", \"std\", \"min\", \"max\"])\n", " .sort_values(\"mean\")\n", ")\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "73a31e6e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelMAE_meanMAE_stdRMSE_meanR2_mean
4ExtraTrees3.0189190.6945726.2179440.187832
3RandomForest3.0520840.7154096.2434090.187735
5HGBR3.1053440.6737396.3449610.156316
2DecisionTree3.6819850.6681496.861043-0.017019
0Lasso4.3666011.70918210.584715-2.703761
1ElasticNet4.9182752.41475313.602627-15.883303
\n", "
" ], "text/plain": [ " model MAE_mean MAE_std RMSE_mean R2_mean\n", "4 ExtraTrees 3.018919 0.694572 6.217944 0.187832\n", "3 RandomForest 3.052084 0.715409 6.243409 0.187735\n", "5 HGBR 3.105344 0.673739 6.344961 0.156316\n", "2 DecisionTree 3.681985 0.668149 6.861043 -0.017019\n", "0 Lasso 4.366601 1.709182 10.584715 -2.703761\n", "1 ElasticNet 4.918275 2.414753 13.602627 -15.883303" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "=== Per-fold details (worst MAE first) ===\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MAERMSER2modelfoldn_val
88.97221337.201319-79.102177ElasticNet465
07.34289319.028144-5.532246Lasso166
55.3472379.838825-0.746450ElasticNet166
104.3474097.4070780.010163DecisionTree166
34.28889512.729106-8.378285Lasso465
124.1443389.678611-0.058584DecisionTree365
113.7729587.109340-0.039038DecisionTree265
163.7422086.7244520.070421RandomForest265
253.7212147.3764540.018331HGBR166
263.6843926.7364730.067094HGBR265
203.6771997.2638210.048081ExtraTrees166
213.6733336.6000860.104487ExtraTrees265
153.5579267.0908780.092870RandomForest166
13.5110976.3441300.172598Lasso265
143.5074515.6274150.165473DecisionTree565
\n", "
" ], "text/plain": [ " MAE RMSE R2 model fold n_val\n", "8 8.972213 37.201319 -79.102177 ElasticNet 4 65\n", "0 7.342893 19.028144 -5.532246 Lasso 1 66\n", "5 5.347237 9.838825 -0.746450 ElasticNet 1 66\n", "10 4.347409 7.407078 0.010163 DecisionTree 1 66\n", "3 4.288895 12.729106 -8.378285 Lasso 4 65\n", "12 4.144338 9.678611 -0.058584 DecisionTree 3 65\n", "11 3.772958 7.109340 -0.039038 DecisionTree 2 65\n", "16 3.742208 6.724452 0.070421 RandomForest 2 65\n", "25 3.721214 7.376454 0.018331 HGBR 1 66\n", "26 3.684392 6.736473 0.067094 HGBR 2 65\n", "20 3.677199 7.263821 0.048081 ExtraTrees 1 66\n", "21 3.673333 6.600086 0.104487 ExtraTrees 2 65\n", "15 3.557926 7.090878 0.092870 RandomForest 1 66\n", "1 3.511097 6.344130 0.172598 Lasso 2 65\n", "14 3.507451 5.627415 0.165473 DecisionTree 5 65" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# CELL 6 — GROUP K-FOLD CV: TRY MULTIPLE MODELS (FAIR COMPARISON)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.model_selection import GroupKFold\n", "from sklearn.base import clone\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", "from sklearn.linear_model import Lasso, ElasticNet\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n", "\n", "# =========================\n", "# 1) Prepare TRAIN data + groups\n", "# =========================\n", "X_tr_df = X_train.copy() # keep dataframe\n", "y_tr = y_train.astype(float).values\n", "g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n", "\n", "# =========================\n", "# 2) Metric helper\n", "# =========================\n", "def metrics(y_true, y_pred):\n", " y_pred = np.maximum(0, y_pred)\n", " return {\n", " \"MAE\": mean_absolute_error(y_true, y_pred),\n", " \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n", " \"R2\": r2_score(y_true, y_pred),\n", " }\n", "\n", "def cv_eval_model(name, model, X_df, y, groups, n_splits=5, use_log=True):\n", " gkf = GroupKFold(n_splits=n_splits)\n", " rows = []\n", "\n", " for fold, (tr_i, va_i) in enumerate(gkf.split(X_df, y, groups=groups), start=1):\n", " m = clone(model)\n", "\n", " Xtr = X_df.iloc[tr_i]\n", " Xva = X_df.iloc[va_i]\n", " ytr = y[tr_i]\n", " yva = y[va_i]\n", "\n", " if use_log:\n", " m.fit(Xtr, np.log1p(ytr))\n", " pred = np.expm1(m.predict(Xva))\n", " else:\n", " m.fit(Xtr, ytr)\n", " pred = m.predict(Xva)\n", "\n", " mm = metrics(yva, pred)\n", " mm.update({\"model\": name, \"fold\": fold, \"n_val\": len(va_i)})\n", " rows.append(mm)\n", "\n", " out = pd.DataFrame(rows)\n", " summ = out[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n", " return out, summ\n", "\n", "# =========================\n", "# 3) Define candidates\n", "# =========================\n", "candidates = []\n", "\n", "# Linear (need scale)\n", "candidates.append((\"Lasso\", Pipeline([\n", " (\"scaler\", StandardScaler()),\n", " (\"model\", Lasso(alpha=0.01, max_iter=5000, random_state=42))\n", "]), True))\n", "\n", "candidates.append((\"ElasticNet\", Pipeline([\n", " (\"scaler\", StandardScaler()),\n", " (\"model\", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=8000, random_state=42))\n", "]), True))\n", "\n", "# Tree/Ensemble (no scale needed)\n", "candidates.append((\"DecisionTree\", DecisionTreeRegressor(\n", " max_depth=8, min_samples_leaf=5, 
random_state=42\n", "), True))\n", "\n", "candidates.append((\"RandomForest\", RandomForestRegressor(\n", " n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n", "), True))\n", "\n", "candidates.append((\"ExtraTrees\", ExtraTreesRegressor(\n", " n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n", "), True))\n", "\n", "candidates.append((\"HGBR\", HistGradientBoostingRegressor(\n", " learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n", "), True))\n", "\n", "# =========================\n", "# 4) Run CV for all models\n", "# =========================\n", "all_fold = []\n", "rows_lb = []\n", "\n", "for name, model, use_log in candidates:\n", " fold_df, summ = cv_eval_model(name, model, X_tr_df, y_tr, g_tr, n_splits=5, use_log=use_log)\n", " all_fold.append(fold_df)\n", "\n", " rows_lb.append({\n", " \"model\": name,\n", " \"MAE_mean\": summ.loc[\"mean\", \"MAE\"],\n", " \"MAE_std\": summ.loc[\"std\", \"MAE\"],\n", " \"RMSE_mean\": summ.loc[\"mean\", \"RMSE\"],\n", " \"R2_mean\": summ.loc[\"mean\", \"R2\"],\n", " })\n", "\n", "cv_all = pd.concat(all_fold, ignore_index=True)\n", "leaderboard = pd.DataFrame(rows_lb).sort_values([\"MAE_mean\", \"RMSE_mean\"], ascending=True)\n", "\n", "print(\"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\")\n", "display(leaderboard)\n", "\n", "print(\"\\n=== Per-fold details (worst MAE first) ===\")\n", "display(cv_all.sort_values(\"MAE\", ascending=False).head(15))\n" ] }, { "cell_type": "code", "execution_count": 30, "id": "1d408c02", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\n", "\n", "[VAL (raw)]\n", "MAE : 1.968\n", "RMSE: 3.461\n", "R2 : 0.614\n", "\n", "[VAL (rounded)]\n", "MAE : 1.971\n", "RMSE: 3.502\n", "R2 : 0.604\n", "\n", "Worst 15 samples (by abs error):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
y_truey_predy_pred_roundabs_err
3432.016.8768301715.123170
029.016.2426191612.757381
1413.05.23938857.760612
651.07.99405886.994058
2814.07.21002776.789973
2912.05.70602066.293980
601.06.10830465.108304
611.05.94822164.948221
562.06.42655064.426550
1711.06.75804474.241956
5810.06.35421063.645790
106.02.35605623.643944
237.03.44220033.557800
202.05.50439463.504394
623.06.08793463.087934
\n", "
" ], "text/plain": [ " y_true y_pred y_pred_round abs_err\n", "34 32.0 16.876830 17 15.123170\n", "0 29.0 16.242619 16 12.757381\n", "14 13.0 5.239388 5 7.760612\n", "65 1.0 7.994058 8 6.994058\n", "28 14.0 7.210027 7 6.789973\n", "29 12.0 5.706020 6 6.293980\n", "60 1.0 6.108304 6 5.108304\n", "61 1.0 5.948221 6 4.948221\n", "56 2.0 6.426550 6 4.426550\n", "17 11.0 6.758044 7 4.241956\n", "58 10.0 6.354210 6 3.645790\n", "10 6.0 2.356056 2 3.643944\n", "23 7.0 3.442200 3 3.557800\n", "20 2.0 5.504394 6 3.504394\n", "62 3.0 6.087934 6 3.087934" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Saved:\n", " - Model : ./artifacts\\extratrees_log1p.joblib\n", " - Columns: ./artifacts\\X_proc_columns.joblib\n" ] } ], "source": [ "# CELL 7 — FINALIZE BEST MODEL (ExtraTrees) + EVAL ON HOLDOUT VAL + SAVE ARTIFACTS\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.ensemble import ExtraTreesRegressor\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "import joblib\n", "\n", "# =========================\n", "# 1) Train final ExtraTrees on FULL TRAIN\n", "# =========================\n", "final_model = ExtraTreesRegressor(\n", " n_estimators=800,\n", " min_samples_leaf=2,\n", " random_state=42,\n", " n_jobs=-1\n", ")\n", "\n", "final_model.fit(X_train.values, np.log1p(y_train.values))\n", "\n", "# =========================\n", "# 2) Predict on HOLDOUT VAL\n", "# =========================\n", "pred_val = np.expm1(final_model.predict(X_val.values))\n", "pred_val = np.maximum(0, pred_val)\n", "\n", "# optional: round to headcount integer\n", "pred_val_round = np.rint(pred_val).astype(int)\n", "\n", "# =========================\n", "# 3) Metrics (raw vs rounded)\n", "# =========================\n", "def print_metrics(tag, y_true, y_pred):\n", " mae = mean_absolute_error(y_true, y_pred)\n", " rmse = mean_squared_error(y_true, y_pred) ** 0.5\n", " r2 = r2_score(y_true, y_pred)\n", " print(f\"\\n[{tag}]\")\n", " print(f\"MAE : {mae:.3f}\")\n", " print(f\"RMSE: {rmse:.3f}\")\n", " print(f\"R2 : {r2:.3f}\")\n", " return {\"MAE\": mae, \"RMSE\": rmse, \"R2\": r2}\n", "\n", "print(\"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\")\n", "m_raw = print_metrics(\"VAL (raw)\", y_val.values, pred_val)\n", "m_int = print_metrics(\"VAL (rounded)\", y_val.values, pred_val_round)\n", "\n", "# =========================\n", "# 4) Quick error analysis\n", "# =========================\n", "err_df = pd.DataFrame({\n", " \"y_true\": y_val.values,\n", " \"y_pred\": pred_val,\n", " \"y_pred_round\": pred_val_round,\n", " \"abs_err\": np.abs(y_val.values - pred_val),\n", "})\n", "print(\"\\nWorst 15 samples (by abs error):\")\n", "display(err_df.sort_values(\"abs_err\", ascending=False).head(15))\n", "\n", "# =========================\n", "# 5) Save model + schema (columns)\n", "# =========================\n", "ARTIFACT_DIR = \"./artifacts\"\n", "import os\n", "os.makedirs(ARTIFACT_DIR, exist_ok=True)\n", "\n", "model_path = os.path.join(ARTIFACT_DIR, \"extratrees_log1p.joblib\")\n", "cols_path = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n", "\n", "joblib.dump(final_model, model_path)\n", "joblib.dump(list(X_train.columns), cols_path)\n", "\n", "print(\"\\nSaved:\")\n", "print(\" - Model :\", model_path)\n", "print(\" - Columns:\", cols_path)\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "9cff151d", "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "Preview:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ma_dia_diemso_luong_thuc_teso_luong_du_doan_rawso_luong_du_doan_roundabs_error
34579-132.016.8768301715.123170
0114-129.016.2426191612.757381
14121-313.05.23938857.760612
65227-11.07.99405886.994058
2855-114.07.21002776.789973
2955-112.05.70602066.293980
60236-11.06.10830465.108304
61236-11.05.94822164.948221
56236-12.06.42655064.426550
17121-411.06.75804474.241956
\n", "
" ], "text/plain": [ " ma_dia_diem so_luong_thuc_te so_luong_du_doan_raw \\\n", "34 579-1 32.0 16.876830 \n", "0 114-1 29.0 16.242619 \n", "14 121-3 13.0 5.239388 \n", "65 227-1 1.0 7.994058 \n", "28 55-1 14.0 7.210027 \n", "29 55-1 12.0 5.706020 \n", "60 236-1 1.0 6.108304 \n", "61 236-1 1.0 5.948221 \n", "56 236-1 2.0 6.426550 \n", "17 121-4 11.0 6.758044 \n", "\n", " so_luong_du_doan_round abs_error \n", "34 17 15.123170 \n", "0 16 12.757381 \n", "14 5 7.760612 \n", "65 8 6.994058 \n", "28 7 6.789973 \n", "29 6 6.293980 \n", "60 6 5.108304 \n", "61 6 4.948221 \n", "56 6 4.426550 \n", "17 7 4.241956 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Saved CSV to: ./outputs\\val_predictions_extratrees.csv\n", "Rows: 68\n" ] } ], "source": [ "# CELL 8 — EXPORT VAL PREDICTIONS TO CSV (FOR ANALYSIS / BUSINESS REVIEW)\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "\n", "# =========================\n", "# 1) Recompute predictions (safety, explicit)\n", "# =========================\n", "pred_val_raw = np.expm1(final_model.predict(X_val.values))\n", "pred_val_raw = np.maximum(0, pred_val_raw)\n", "\n", "pred_val_round = np.rint(pred_val_raw).astype(int)\n", "\n", "# =========================\n", "# 2) Build result DataFrame\n", "# =========================\n", "val_result = pd.DataFrame({\n", " \"ma_dia_diem\": df.loc[val_idx, \"ma_dia_diem\"].values,\n", " \"so_luong_thuc_te\": y_val.values,\n", " \"so_luong_du_doan_raw\": pred_val_raw,\n", " \"so_luong_du_doan_round\": pred_val_round,\n", " \"abs_error\": np.abs(y_val.values - pred_val_raw),\n", "})\n", "\n", "# (optional) sort by error to review bad cases first\n", "val_result = val_result.sort_values(\"abs_error\", ascending=False)\n", "\n", "print(\"Preview:\")\n", "display(val_result.head(10))\n", "\n", "# =========================\n", "# 3) Save to CSV\n", "# =========================\n", "OUTPUT_DIR = \"./outputs\"\n", "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "\n", "csv_path = os.path.join(OUTPUT_DIR, \"val_predictions_extratrees.csv\")\n", "val_result.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n", "\n", "print(f\"\\nSaved CSV to: {csv_path}\")\n", "print(f\"Rows: {len(val_result)}\")\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "8cb3cde1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== MODEL SAVED SUCCESSFULLY ===\n", "Model file : ./artifacts\\extratrees_staff_model.joblib\n", "Columns file : ./artifacts\\X_proc_columns.joblib\n", "Meta file : ./artifacts\\model_meta.joblib\n" ] } ], "source": [ "# CELL 8 — SAVE TRAINED MODEL (NO PREDICT YET)\n", "\n", "import os\n", "import joblib\n", "\n", "# =========================\n", "# 1) Create artifact directory\n", "# =========================\n", "ARTIFACT_DIR = \"./artifacts\"\n", "os.makedirs(ARTIFACT_DIR, exist_ok=True)\n", "\n", "# =========================\n", "# 2) Define paths\n", "# =========================\n", "MODEL_PATH = os.path.join(ARTIFACT_DIR, \"extratrees_staff_model.joblib\")\n", "COLUMNS_PATH = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n", "META_PATH = os.path.join(ARTIFACT_DIR, \"model_meta.joblib\")\n", "\n", "# =========================\n", "# 3) Save model\n", "# =========================\n", "joblib.dump(final_model, MODEL_PATH)\n", "\n", "# =========================\n", "# 4) Save feature schema (VERY IMPORTANT)\n", "# =========================\n", "joblib.dump(list(X_train.columns), 
COLUMNS_PATH)\n", "\n", "# =========================\n", "# 5) Save metadata (optional but professional)\n", "# =========================\n", "meta = {\n", " \"model_type\": \"ExtraTreesRegressor\",\n", " \"target\": \"so_luong\",\n", " \"target_transform\": \"log1p -> expm1\",\n", " \"train_size\": len(X_train),\n", " \"val_size\": len(X_val),\n", " \"features\": X_train.shape[1],\n", " \"note\": \"Predict staff headcount per shift\",\n", "}\n", "\n", "joblib.dump(meta, META_PATH)\n", "\n", "print(\"=== MODEL SAVED SUCCESSFULLY ===\")\n", "print(\"Model file :\", MODEL_PATH)\n", "print(\"Columns file :\", COLUMNS_PATH)\n", "print(\"Meta file :\", META_PATH)\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "0eab135b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks',\n", " 'num_trash_collection_tasks', 'num_monitoring_tasks',\n", " 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks',\n", " 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks',\n", " 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio',\n", " 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity',\n", " 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh',\n", " 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong',\n", " 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong',\n", " 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n", " 'num_medical_tasks_total', 'num_indoor_room_tasks', 'hour_start',\n", " 'hour_end', 'shift_length', 'is_cross_day', 'loai_ca_24/24',\n", " 'loai_ca_Ca chiều', 'loai_ca_Ca gãy', 'loai_ca_Ca sáng',\n", " 'loai_ca_Ca đêm', 'loai_ca_Hành chính', 'loai_ca_Part time',\n", " 'loai_ca_nan'],\n", " dtype='object')" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_val.columns" ] }, { "cell_type": "code", "execution_count": 34, "id": "1dd44caa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n", " 'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n", " 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n", " 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n", " 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n", " 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n", " 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n", " 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n", " 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n", " 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n", " 'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n", " 'num_medical_tasks_total', 'num_indoor_room_tasks',\n", " 'is_tasks_text_missing'],\n", " dtype='object')" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }