predict_caLamviec_nhansu/train.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e1667110",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded: final_2.xlsx | sheet: final\n",
      "Shape (raw): (401, 42)\n",
      "Shape (after dedup): (394, 42)\n",
      "\n",
      "=== TARGET SUMMARY (so_luong) ===\n",
      "count    394.000000\n",
      "mean       4.710660\n",
      "std        6.848602\n",
      "min        0.000000\n",
      "25%        1.000000\n",
      "50%        2.000000\n",
      "75%        5.000000\n",
      "max       64.000000\n",
      "Name: so_luong, dtype: float64\n",
      "Missing target: 0\n",
      "Negative target: 0\n",
      "Zero target: 3\n",
      "\n",
      "Sample rows:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ma_dia_diem</th>\n",
       "      <th>all_task_normal</th>\n",
       "      <th>all_task_dinhky</th>\n",
       "      <th>loai_ca</th>\n",
       "      <th>bat_dau</th>\n",
       "      <th>ket_thuc</th>\n",
       "      <th>tong_gio_lam</th>\n",
       "      <th>so_ca_cua_toa</th>\n",
       "      <th>so_luong</th>\n",
       "      <th>num_tasks</th>\n",
       "      <th>...</th>\n",
       "      <th>dien_tich_tham</th>\n",
       "      <th>doc_ham</th>\n",
       "      <th>vien_phan_quang</th>\n",
       "      <th>op_tuong</th>\n",
       "      <th>op_chan_tuong</th>\n",
       "      <th>ranh_thoat_nuoc</th>\n",
       "      <th>dien_tich_kinh</th>\n",
       "      <th>num_medical_tasks_total</th>\n",
       "      <th>num_indoor_room_tasks</th>\n",
       "      <th>is_tasks_text_missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>115-2</td>\n",
       "      <td>Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Part time</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>10:30:00</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Hành chính</td>\n",
       "      <td>06:30:00</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>7.5</td>\n",
       "      <td>6</td>\n",
       "      <td>24</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>101-1</td>\n",
       "      <td>Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...</td>\n",
       "      <td>Lau bảng biển, bình cứu hỏa , cây nước hành la...</td>\n",
       "      <td>Ca sáng</td>\n",
       "      <td>06:00:00</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>8.0</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>441</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>9176.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>25</td>\n",
       "      <td>894.0</td>\n",
       "      <td>112</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  ma_dia_diem                                    all_task_normal  \\\n",
       "0       115-2  Làm sạch toàn bộ phòng giao dịch tầng 1 (kể cả...   \n",
       "1       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "2       101-1  Kiểm tra nhân sự các vị trí Thay rác, đẩy khô,...   \n",
       "\n",
       "                                     all_task_dinhky     loai_ca   bat_dau  \\\n",
       "0                                                NaN   Part time  06:30:00   \n",
       "1  Lau bảng biển, bình cứu hỏa , cây nước hành la...  Hành chính  06:30:00   \n",
       "2  Lau bảng biển, bình cứu hỏa , cây nước hành la...     Ca sáng  06:00:00   \n",
       "\n",
       "   ket_thuc  tong_gio_lam  so_ca_cua_toa  so_luong  num_tasks  ...  \\\n",
       "0  10:30:00           4.0              1         1          7  ...   \n",
       "1  16:00:00           7.5              6        24        441  ...   \n",
       "2  14:00:00           8.0              6         3        441  ...   \n",
       "\n",
       "   dien_tich_tham  doc_ham  vien_phan_quang  op_tuong  op_chan_tuong  \\\n",
       "0             0.0        0                0       0.0            0.0   \n",
       "1             0.0       70                0    9176.0           89.0   \n",
       "2             0.0       70                0    9176.0           89.0   \n",
       "\n",
       "   ranh_thoat_nuoc  dien_tich_kinh  num_medical_tasks_total  \\\n",
       "0                0            20.0                        0   \n",
       "1               25           894.0                      112   \n",
       "2               25           894.0                      112   \n",
       "\n",
       "   num_indoor_room_tasks  is_tasks_text_missing  \n",
       "0                      1                      0  \n",
       "1                     39                      0  \n",
       "2                     39                      0  \n",
       "\n",
       "[3 rows x 42 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 1 — LOAD DATA & BASIC CLEAN\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "DATA_PATH = \"final_2.xlsx\"\n",
    "SHEET_NAME = \"final\"\n",
    "\n",
    "# 1. Load\n",
    "df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)\n",
    "print(f\"Loaded: {DATA_PATH} | sheet: {SHEET_NAME}\")\n",
    "print(\"Shape (raw):\", df.shape)\n",
    "\n",
    "# 2. Drop duplicate full rows\n",
    "df = df.drop_duplicates().reset_index(drop=True)\n",
    "print(\"Shape (after dedup):\", df.shape)\n",
    "\n",
    "# 3. Check target\n",
    "assert \"so_luong\" in df.columns, \"❌ Missing target so_luong\"\n",
    "\n",
    "print(\"\\n=== TARGET SUMMARY (so_luong) ===\")\n",
    "print(df[\"so_luong\"].describe())\n",
    "print(\"Missing target:\", df[\"so_luong\"].isna().sum())\n",
    "print(\"Negative target:\", (df[\"so_luong\"] < 0).sum())\n",
    "print(\"Zero target:\", (df[\"so_luong\"] == 0).sum())\n",
    "\n",
    "# 4. Peek data\n",
    "print(\"\\nSample rows:\")\n",
    "display(df.head(3))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5601efad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All columns:\n",
      " 0: ma_dia_diem\n",
      " 1: all_task_normal\n",
      " 2: all_task_dinhky\n",
      " 3: loai_ca\n",
      " 4: bat_dau\n",
      " 5: ket_thuc\n",
      " 6: tong_gio_lam\n",
      " 7: so_ca_cua_toa\n",
      " 8: so_luong\n",
      " 9: num_tasks\n",
      "10: num_cleaning_tasks\n",
      "11: num_trash_collection_tasks\n",
      "12: num_monitoring_tasks\n",
      "13: num_deep_cleaning_tasks\n",
      "14: num_support_tasks\n",
      "15: num_other_tasks\n",
      "16: num_wc_tasks\n",
      "17: num_hallway_tasks\n",
      "18: num_lobby_tasks\n",
      "19: num_outdoor_tasks\n",
      "20: num_elevator_tasks\n",
      "21: cleaning_ratio\n",
      "22: trash_collection_ratio\n",
      "23: monitoring_ratio\n",
      "24: area_diversity\n",
      "25: so_tang\n",
      "26: so_cua_thang_may\n",
      "27: dien_tich_ngoai_canh\n",
      "28: dien_tich_sanh\n",
      "29: dien_tich_hanh_lang\n",
      "30: dien_tich_wc\n",
      "31: dien_tich_phong\n",
      "32: dien_tich_tham\n",
      "33: doc_ham\n",
      "34: vien_phan_quang\n",
      "35: op_tuong\n",
      "36: op_chan_tuong\n",
      "37: ranh_thoat_nuoc\n",
      "38: dien_tich_kinh\n",
      "39: num_medical_tasks_total\n",
      "40: num_indoor_room_tasks\n",
      "41: is_tasks_text_missing\n",
      "\n",
      "Dropped columns:\n",
      " - ma_dia_diem\n",
      " - all_task_normal\n",
      " - all_task_dinhky\n",
      " - is_tasks_text_missing\n",
      "\n",
      "Shapes:\n",
      "X: (394, 37)\n",
      "y: (394,)\n",
      "\n",
      "Feature dtypes:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "int64      21\n",
       "float64    13\n",
       "object      3\n",
       "Name: count, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Missing values in X:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "loai_ca                       0\n",
       "bat_dau                       0\n",
       "ket_thuc                      0\n",
       "tong_gio_lam                  0\n",
       "so_ca_cua_toa                 0\n",
       "num_tasks                     0\n",
       "num_cleaning_tasks            0\n",
       "num_trash_collection_tasks    0\n",
       "num_monitoring_tasks          0\n",
       "num_deep_cleaning_tasks       0\n",
       "dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 2 — FEATURE SELECTION (STRICT)\n",
    "\n",
    "# 1. Xem toàn bộ cột\n",
    "print(\"All columns:\")\n",
    "for i, c in enumerate(df.columns):\n",
    "    print(f\"{i:2d}: {c}\")\n",
    "\n",
    "# 2. Xác định cột cần loại bỏ (THEO THỎA THUẬN)\n",
    "DROP_COLS = [\n",
    "    df.columns[0],   # ma_dia_diem\n",
    "    df.columns[1],   # all_task_normal\n",
    "    df.columns[2],   # all_task_dinhky\n",
    "    df.columns[-1],  # is_tasks_text_missing\n",
    "]\n",
    "\n",
    "print(\"\\nDropped columns:\")\n",
    "for c in DROP_COLS:\n",
    "    print(\" -\", c)\n",
    "\n",
    "# 3. Tạo X, y\n",
    "X = df.drop(columns=DROP_COLS + [\"so_luong\"])\n",
    "y = df[\"so_luong\"].astype(float)\n",
    "\n",
    "print(\"\\nShapes:\")\n",
    "print(\"X:\", X.shape)\n",
    "print(\"y:\", y.shape)\n",
    "\n",
    "# 4. Kiểm tra kiểu dữ liệu\n",
    "print(\"\\nFeature dtypes:\")\n",
    "display(X.dtypes.value_counts())\n",
    "\n",
    "# 5. Kiểm tra missing\n",
    "print(\"\\nMissing values in X:\")\n",
    "display(X.isna().sum().sort_values(ascending=False).head(10))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bb467e4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Categorical columns: ['loai_ca']\n",
      "\n",
      "After preprocess:\n",
      "X_proc shape: (394, 46)\n",
      "Any non-numeric dtypes? False\n",
      "\n",
      "Sample columns (first 30):\n",
      "['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks', 'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong', 'op_chan_tuong']\n"
     ]
    }
   ],
   "source": [
    "# CELL 3 — PREPROCESS (TIME + CATEGORICAL) WITHOUT JUNK\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# ---------- 1) Time parsing ----------\n",
    "def time_to_hour(x):\n",
    "    if pd.isna(x):\n",
    "        return np.nan\n",
    "\n",
    "    # datetime/time object\n",
    "    if hasattr(x, \"hour\"):\n",
    "        return float(x.hour) + float(getattr(x, \"minute\", 0))/60.0\n",
    "\n",
    "    s = str(x).strip()\n",
    "    # \"YYYY-MM-DD HH:MM:SS\"\n",
    "    if \" \" in s and \":\" in s:\n",
    "        s = s.split(\" \", 1)[1].strip()\n",
    "\n",
    "    # \"HH:MM\" or \"HH:MM:SS\"\n",
    "    if \":\" in s:\n",
    "        parts = s.split(\":\")\n",
    "        try:\n",
    "            h = float(parts[0])\n",
    "            m = float(parts[1]) if len(parts) > 1 else 0.0\n",
    "            return h + m/60.0\n",
    "        except:\n",
    "            return np.nan\n",
    "\n",
    "    # numeric fallback\n",
    "    try:\n",
    "        return float(s)\n",
    "    except:\n",
    "        return np.nan\n",
    "\n",
    "# Create new numeric time features (do NOT one-hot time)\n",
    "X_proc = X.copy()\n",
    "\n",
    "if \"bat_dau\" in X_proc.columns:\n",
    "    X_proc[\"hour_start\"] = X_proc[\"bat_dau\"].apply(time_to_hour)\n",
    "if \"ket_thuc\" in X_proc.columns:\n",
    "    X_proc[\"hour_end\"] = X_proc[\"ket_thuc\"].apply(time_to_hour)\n",
    "\n",
    "# shift_length + cross day\n",
    "if (\"hour_start\" in X_proc.columns) and (\"hour_end\" in X_proc.columns):\n",
    "    end_adj = X_proc[\"hour_end\"].copy()\n",
    "    cross = (X_proc[\"hour_start\"].notna()) & (X_proc[\"hour_end\"].notna()) & (X_proc[\"hour_end\"] < X_proc[\"hour_start\"])\n",
    "    end_adj[cross] = end_adj[cross] + 24.0\n",
    "\n",
    "    X_proc[\"shift_length\"] = (end_adj - X_proc[\"hour_start\"]).clip(lower=0)\n",
    "    X_proc[\"is_cross_day\"] = cross.astype(int)\n",
    "\n",
    "# Drop raw time cols to avoid junk\n",
    "for c in [\"bat_dau\", \"ket_thuc\"]:\n",
    "    if c in X_proc.columns:\n",
    "        X_proc = X_proc.drop(columns=[c])\n",
    "\n",
    "# ---------- 2) One-hot categorical ----------\n",
    "cat_cols = [c for c in X_proc.columns if X_proc[c].dtype == \"object\"]\n",
    "print(\"Categorical columns:\", cat_cols)\n",
    "\n",
    "X_proc = pd.get_dummies(X_proc, columns=cat_cols, dummy_na=True)\n",
    "\n",
    "# ---------- 3) Fill missing ----------\n",
    "X_proc = X_proc.replace([np.inf, -np.inf], np.nan).fillna(0)\n",
    "\n",
    "print(\"\\nAfter preprocess:\")\n",
    "print(\"X_proc shape:\", X_proc.shape)\n",
    "print(\"Any non-numeric dtypes?\", any(dt == \"object\" for dt in X_proc.dtypes))\n",
    "\n",
    "print(\"\\nSample columns (first 30):\")\n",
    "print(list(X_proc.columns[:30]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "3cd119b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shapes:\n",
      "Train: (326, 46) (326,)\n",
      "Val  : (68, 46) (68,)\n",
      "\n",
      "ElasticNet(alpha=0.01, l1_ratio=0.5)\n",
      " Train | MAE=2.754 | RMSE=5.200 | R2=0.458\n",
      " Val   | MAE=2.420 | RMSE=4.792 | R2=0.259\n",
      "\n",
      "DecisionTree(max_depth=8, min_samples_leaf=5)\n",
      " Train | MAE=2.187 | RMSE=5.313 | R2=0.434\n",
      " Val   | MAE=2.312 | RMSE=4.149 | R2=0.445\n",
      "\n",
      "RandomForest(n_estimators=600, min_samples_leaf=3)\n",
      " Train | MAE=1.894 | RMSE=4.820 | R2=0.535\n",
      " Val   | MAE=2.402 | RMSE=4.467 | R2=0.356\n",
      "\n",
      "ExtraTrees(n_estimators=800, min_samples_leaf=2)\n",
      " Train | MAE=1.095 | RMSE=3.320 | R2=0.779\n",
      " Val   | MAE=1.968 | RMSE=3.461 | R2=0.614\n",
      "\n",
      "HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\n",
      " Train | MAE=1.990 | RMSE=4.871 | R2=0.525\n",
      " Val   | MAE=2.406 | RMSE=4.599 | R2=0.318\n",
      "\n",
      "=== HOLDOUT VAL MAE summary (lower is better) ===\n",
      "ExtraTrees  : Val MAE = 1.968\n",
      "DecisionTree: Val MAE = 2.312\n",
      "RandomForest: Val MAE = 2.402\n",
      "HGBR        : Val MAE = 2.406\n",
      "ElasticNet  : Val MAE = 2.420\n"
     ]
    }
   ],
   "source": [
    "# CELL 4 (EXTENDED) — ADD TREE/ENSEMBLE MODELS TO HOLDOUT VAL\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.model_selection import GroupShuffleSplit\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "# ---------- 1) Group split (80% train, 20% val) ----------\n",
    "groups = df[\"ma_dia_diem\"].astype(str)\n",
    "\n",
    "gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)\n",
    "train_idx, val_idx = next(gss.split(X_proc, y, groups))\n",
    "\n",
    "X_train = X_proc.iloc[train_idx]\n",
    "y_train = y.iloc[train_idx]\n",
    "\n",
    "X_val = X_proc.iloc[val_idx]\n",
    "y_val = y.iloc[val_idx]\n",
    "\n",
    "print(\"Shapes:\")\n",
    "print(\"Train:\", X_train.shape, y_train.shape)\n",
    "print(\"Val  :\", X_val.shape, y_val.shape)\n",
    "\n",
    "# ---------- 2) Scale features (for linear models) ----------\n",
    "scaler = StandardScaler()\n",
    "X_train_s = scaler.fit_transform(X_train)\n",
    "X_val_s   = scaler.transform(X_val)\n",
    "\n",
    "# ---------- 2b) No-scale matrices (for tree models) ----------\n",
    "X_train_ns = X_train.values\n",
    "X_val_ns   = X_val.values\n",
    "\n",
    "# ---------- 3) Log-transform target ----------\n",
    "y_train_log = np.log1p(y_train)\n",
    "y_val_log   = np.log1p(y_val)\n",
    "\n",
    "# ---------- 4) Evaluation helper (support scale/no-scale) ----------\n",
    "def eval_reg_any(name, model, use_scaled_X=True):\n",
    "    Xtr = X_train_s if use_scaled_X else X_train_ns\n",
    "    Xva = X_val_s   if use_scaled_X else X_val_ns\n",
    "\n",
    "    model.fit(Xtr, y_train_log)\n",
    "\n",
    "    pred_train = np.maximum(0, np.expm1(model.predict(Xtr)))\n",
    "    pred_val   = np.maximum(0, np.expm1(model.predict(Xva)))\n",
    "\n",
    "    def _m(y_true, y_pred):\n",
    "        return (\n",
    "            mean_absolute_error(y_true, y_pred),\n",
    "            mean_squared_error(y_true, y_pred) ** 0.5,\n",
    "            r2_score(y_true, y_pred),\n",
    "        )\n",
    "\n",
    "    tr = _m(y_train, pred_train)\n",
    "    va = _m(y_val, pred_val)\n",
    "\n",
    "    print(f\"\\n{name}\")\n",
    "    print(f\" Train | MAE={tr[0]:.3f} | RMSE={tr[1]:.3f} | R2={tr[2]:.3f}\")\n",
    "    print(f\" Val   | MAE={va[0]:.3f} | RMSE={va[1]:.3f} | R2={va[2]:.3f}\")\n",
    "\n",
    "    return va[0]\n",
    "\n",
    "# ---------- 5) Train baselines + tree/ensemble ----------\n",
    "results = {}\n",
    "\n",
    "# Linear (scaled)\n",
    "# results[\"Linear\"] = eval_reg_any(\"LinearRegression\", LinearRegression(), use_scaled_X=True)\n",
    "# results[\"Ridge\"]  = eval_reg_any(\"Ridge(alpha=1.0)\", Ridge(alpha=1.0), use_scaled_X=True)\n",
    "# results[\"Lasso\"]  = eval_reg_any(\"Lasso(alpha=0.01)\", Lasso(alpha=0.01, max_iter=5000), use_scaled_X=True)\n",
    "results[\"ElasticNet\"] = eval_reg_any(\n",
    "    \"ElasticNet(alpha=0.01, l1_ratio=0.5)\",\n",
    "    ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
    "    use_scaled_X=True\n",
    ")\n",
    "\n",
    "# Tree / Ensemble (no scale)\n",
    "results[\"DecisionTree\"] = eval_reg_any(\n",
    "    \"DecisionTree(max_depth=8, min_samples_leaf=5)\",\n",
    "    DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n",
    "    use_scaled_X=False\n",
    ")\n",
    "\n",
    "results[\"RandomForest\"] = eval_reg_any(\n",
    "    \"RandomForest(n_estimators=600, min_samples_leaf=3)\",\n",
    "    RandomForestRegressor(\n",
    "        n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1\n",
    "    ),\n",
    "    use_scaled_X=False\n",
    ")\n",
    "\n",
    "results[\"ExtraTrees\"] = eval_reg_any(\n",
    "    \"ExtraTrees(n_estimators=800, min_samples_leaf=2)\",\n",
    "    ExtraTreesRegressor(\n",
    "        n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
    "    ),\n",
    "    use_scaled_X=False\n",
    ")\n",
    "\n",
    "results[\"HGBR\"] = eval_reg_any(\n",
    "    \"HistGradientBoosting(learning_rate=0.05, min_samples_leaf=20)\",\n",
    "    HistGradientBoostingRegressor(\n",
    "        learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
    "    ),\n",
    "    use_scaled_X=False\n",
    ")\n",
    "\n",
    "print(\"\\n=== HOLDOUT VAL MAE summary (lower is better) ===\")\n",
    "for k, v in sorted(results.items(), key=lambda x: x[1]):\n",
    "    print(f\"{k:12s}: Val MAE = {v:.3f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "106e557f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train for CV: (326, 46) (326,) | unique groups: 153\n",
      "\n",
      "=== ExtraTrees GroupKFold (TRAIN ONLY) ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MAE</th>\n",
       "      <th>RMSE</th>\n",
       "      <th>R2</th>\n",
       "      <th>fold</th>\n",
       "      <th>n_val</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.677199</td>\n",
       "      <td>7.263821</td>\n",
       "      <td>0.048081</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.673333</td>\n",
       "      <td>6.600086</td>\n",
       "      <td>0.104487</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.092924</td>\n",
       "      <td>8.986669</td>\n",
       "      <td>0.087366</td>\n",
       "      <td>3</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2.100641</td>\n",
       "      <td>3.499245</td>\n",
       "      <td>0.291277</td>\n",
       "      <td>4</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.550497</td>\n",
       "      <td>4.739900</td>\n",
       "      <td>0.407946</td>\n",
       "      <td>5</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        MAE      RMSE        R2  fold  n_val\n",
       "0  3.677199  7.263821  0.048081     1     66\n",
       "1  3.673333  6.600086  0.104487     2     65\n",
       "2  3.092924  8.986669  0.087366     3     65\n",
       "3  2.100641  3.499245  0.291277     4     65\n",
       "4  2.550497  4.739900  0.407946     5     65"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== CV SUMMARY ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MAE</th>\n",
       "      <th>RMSE</th>\n",
       "      <th>R2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.018919</td>\n",
       "      <td>6.217944</td>\n",
       "      <td>0.187832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.694572</td>\n",
       "      <td>2.149515</td>\n",
       "      <td>0.154694</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           MAE      RMSE        R2\n",
       "mean  3.018919  6.217944  0.187832\n",
       "std   0.694572  2.149515  0.154694"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Worst fold (highest MAE):\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MAE</th>\n",
       "      <th>RMSE</th>\n",
       "      <th>R2</th>\n",
       "      <th>fold</th>\n",
       "      <th>n_val</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.677199</td>\n",
       "      <td>7.263821</td>\n",
       "      <td>0.048081</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        MAE      RMSE        R2  fold  n_val\n",
       "0  3.677199  7.263821  0.048081     1     66"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 5 — GROUP K-FOLD CV (TRAIN ONLY) FOR EXTRA TREES (NO LEAKAGE by ma_dia_diem)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import GroupKFold\n",
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.base import clone\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "# =========================\n",
    "# 1) Prepare TRAIN data + groups\n",
    "# =========================\n",
    "X_tr = X_train.values          # no scaling for tree\n",
    "y_tr = y_train.values.astype(float)\n",
    "g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).values\n",
    "\n",
    "print(\"Train for CV:\", X_tr.shape, y_tr.shape, \"| unique groups:\", len(np.unique(g_tr)))\n",
    "\n",
    "# =========================\n",
    "# 2) Metric helper (evaluate on original scale)\n",
    "# =========================\n",
    "def metrics(y_true, y_pred):\n",
    "    y_pred = np.maximum(0, y_pred)\n",
    "    return {\n",
    "        \"MAE\":  mean_absolute_error(y_true, y_pred),\n",
    "        \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
    "        \"R2\":   r2_score(y_true, y_pred),\n",
    "    }\n",
    "\n",
    "# =========================\n",
    "# 3) ExtraTrees config (same as your holdout baseline)\n",
    "# =========================\n",
    "base_model = ExtraTreesRegressor(\n",
    "    n_estimators=800,\n",
    "    min_samples_leaf=2,\n",
    "    random_state=42,\n",
    "    n_jobs=-1\n",
    ")\n",
    "\n",
    "# =========================\n",
    "# 4) GroupKFold CV (fit log1p(y) -> expm1(pred))\n",
    "# =========================\n",
    "gkf = GroupKFold(n_splits=5)\n",
    "\n",
    "rows = []\n",
    "for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_tr, y_tr, groups=g_tr), start=1):\n",
    "    model = clone(base_model)\n",
    "\n",
    "    Xtr_f, Xva_f = X_tr[tr_idx], X_tr[va_idx]\n",
    "    ytr_f, yva_f = y_tr[tr_idx], y_tr[va_idx]\n",
    "\n",
    "    model.fit(Xtr_f, np.log1p(ytr_f))\n",
    "    pred_va = np.expm1(model.predict(Xva_f))\n",
    "    pred_va = np.maximum(0, pred_va)\n",
    "\n",
    "    m = metrics(yva_f, pred_va)\n",
    "    m[\"fold\"] = fold\n",
    "    m[\"n_val\"] = len(va_idx)\n",
    "    rows.append(m)\n",
    "\n",
    "cv_df = pd.DataFrame(rows)\n",
    "\n",
    "print(\"\\n=== ExtraTrees GroupKFold (TRAIN ONLY) ===\")\n",
    "display(cv_df)\n",
    "\n",
    "summary = cv_df[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
    "print(\"\\n=== CV SUMMARY ===\")\n",
    "display(summary)\n",
    "\n",
    "best_worst = cv_df.sort_values(\"MAE\", ascending=False)\n",
    "print(\"\\nWorst fold (highest MAE):\")\n",
    "display(best_worst.head(1))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ee66e389",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear       | MAE = 4.971 ± 2.813\n",
      "Lasso_0.01   | MAE = 7.110 ± 7.917\n",
      "ElasticNet_0.01 | MAE = 6.386 ± 6.240\n",
      "\n",
      "=== CV SUMMARY ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>model</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Linear</th>\n",
       "      <td>4.970868</td>\n",
       "      <td>3.145106</td>\n",
       "      <td>2.279954</td>\n",
       "      <td>10.321992</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ElasticNet_0.01</th>\n",
       "      <td>6.386372</td>\n",
       "      <td>6.976535</td>\n",
       "      <td>2.104239</td>\n",
       "      <td>18.765868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lasso_0.01</th>\n",
       "      <td>7.110267</td>\n",
       "      <td>8.851455</td>\n",
       "      <td>2.113510</td>\n",
       "      <td>22.876806</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     mean       std       min        max\n",
       "model                                                   \n",
       "Linear           4.970868  3.145106  2.279954  10.321992\n",
       "ElasticNet_0.01  6.386372  6.976535  2.104239  18.765868\n",
       "Lasso_0.01       7.110267  8.851455  2.113510  22.876806"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 5 — GROUP K-FOLD CONFIRMATION (TOP LINEAR MODELS)\n",
    "\n",
    "from sklearn.model_selection import GroupKFold\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LinearRegression, Lasso, ElasticNet\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Linear baselines; each one is fitted on log1p(target) with standardized features.\n",
    "models = {\n",
    "    \"Linear\": LinearRegression(),\n",
    "    \"Lasso_0.01\": Lasso(alpha=0.01, max_iter=5000),\n",
    "    \"ElasticNet_0.01\": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000),\n",
    "}\n",
    "\n",
    "# Rows sharing a ma_dia_diem value are kept on the same side of every split.\n",
    "groups = df[\"ma_dia_diem\"].astype(str)\n",
    "gkf = GroupKFold(n_splits=5)\n",
    "\n",
    "rows = []\n",
    "\n",
    "# Pad names to the longest key so the printed '|' separators line up.\n",
    "name_width = max(len(model_name) for model_name in models)\n",
    "\n",
    "for name, model in models.items():\n",
    "    maes = []\n",
    "    # Folds are numbered from 1, like the other GroupKFold loops in this notebook.\n",
    "    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_proc, y, groups), start=1):\n",
    "        X_tr, X_va = X_proc.iloc[tr_idx], X_proc.iloc[va_idx]\n",
    "        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]\n",
    "\n",
    "        # The scaler is fitted on the training fold only, then applied to validation.\n",
    "        scaler = StandardScaler()\n",
    "        X_tr_s = scaler.fit_transform(X_tr)\n",
    "        X_va_s = scaler.transform(X_va)\n",
    "\n",
    "        # Train in log space, map predictions back with expm1 and floor them at zero.\n",
    "        model.fit(X_tr_s, np.log1p(y_tr))\n",
    "        pred_va = np.maximum(0, np.expm1(model.predict(X_va_s)))\n",
    "\n",
    "        mae = mean_absolute_error(y_va, pred_va)\n",
    "        maes.append(mae)\n",
    "\n",
    "        rows.append({\n",
    "            \"model\": name,\n",
    "            \"fold\": fold,\n",
    "            \"MAE\": mae,\n",
    "            \"n_val\": len(va_idx),\n",
    "        })\n",
    "\n",
    "    # ddof=1 so this spread equals the pandas \"std\" column of the summary table below\n",
    "    # (np.std defaults to ddof=0, which made the two numbers disagree).\n",
    "    print(f\"{name:<{name_width}s} | MAE = {np.mean(maes):.3f} ± {np.std(maes, ddof=1):.3f}\")\n",
    "\n",
    "cv_df = pd.DataFrame(rows)\n",
    "\n",
    "print(\"\\n=== CV SUMMARY ===\")\n",
    "display(\n",
    "    cv_df.groupby(\"model\")[\"MAE\"]\n",
    "    .agg([\"mean\", \"std\", \"min\", \"max\"])\n",
    "    .sort_values(\"mean\")\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "73a31e6e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model</th>\n",
       "      <th>MAE_mean</th>\n",
       "      <th>MAE_std</th>\n",
       "      <th>RMSE_mean</th>\n",
       "      <th>R2_mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ExtraTrees</td>\n",
       "      <td>3.018919</td>\n",
       "      <td>0.694572</td>\n",
       "      <td>6.217944</td>\n",
       "      <td>0.187832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>RandomForest</td>\n",
       "      <td>3.052084</td>\n",
       "      <td>0.715409</td>\n",
       "      <td>6.243409</td>\n",
       "      <td>0.187735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>HGBR</td>\n",
       "      <td>3.105344</td>\n",
       "      <td>0.673739</td>\n",
       "      <td>6.344961</td>\n",
       "      <td>0.156316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DecisionTree</td>\n",
       "      <td>3.681985</td>\n",
       "      <td>0.668149</td>\n",
       "      <td>6.861043</td>\n",
       "      <td>-0.017019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Lasso</td>\n",
       "      <td>4.366601</td>\n",
       "      <td>1.709182</td>\n",
       "      <td>10.584715</td>\n",
       "      <td>-2.703761</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ElasticNet</td>\n",
       "      <td>4.918275</td>\n",
       "      <td>2.414753</td>\n",
       "      <td>13.602627</td>\n",
       "      <td>-15.883303</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          model  MAE_mean   MAE_std  RMSE_mean    R2_mean\n",
       "4    ExtraTrees  3.018919  0.694572   6.217944   0.187832\n",
       "3  RandomForest  3.052084  0.715409   6.243409   0.187735\n",
       "5          HGBR  3.105344  0.673739   6.344961   0.156316\n",
       "2  DecisionTree  3.681985  0.668149   6.861043  -0.017019\n",
       "0         Lasso  4.366601  1.709182  10.584715  -2.703761\n",
       "1    ElasticNet  4.918275  2.414753  13.602627 -15.883303"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== Per-fold details (worst MAE first) ===\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MAE</th>\n",
       "      <th>RMSE</th>\n",
       "      <th>R2</th>\n",
       "      <th>model</th>\n",
       "      <th>fold</th>\n",
       "      <th>n_val</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8.972213</td>\n",
       "      <td>37.201319</td>\n",
       "      <td>-79.102177</td>\n",
       "      <td>ElasticNet</td>\n",
       "      <td>4</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7.342893</td>\n",
       "      <td>19.028144</td>\n",
       "      <td>-5.532246</td>\n",
       "      <td>Lasso</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.347237</td>\n",
       "      <td>9.838825</td>\n",
       "      <td>-0.746450</td>\n",
       "      <td>ElasticNet</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.347409</td>\n",
       "      <td>7.407078</td>\n",
       "      <td>0.010163</td>\n",
       "      <td>DecisionTree</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.288895</td>\n",
       "      <td>12.729106</td>\n",
       "      <td>-8.378285</td>\n",
       "      <td>Lasso</td>\n",
       "      <td>4</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>4.144338</td>\n",
       "      <td>9.678611</td>\n",
       "      <td>-0.058584</td>\n",
       "      <td>DecisionTree</td>\n",
       "      <td>3</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>3.772958</td>\n",
       "      <td>7.109340</td>\n",
       "      <td>-0.039038</td>\n",
       "      <td>DecisionTree</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>3.742208</td>\n",
       "      <td>6.724452</td>\n",
       "      <td>0.070421</td>\n",
       "      <td>RandomForest</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>3.721214</td>\n",
       "      <td>7.376454</td>\n",
       "      <td>0.018331</td>\n",
       "      <td>HGBR</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>3.684392</td>\n",
       "      <td>6.736473</td>\n",
       "      <td>0.067094</td>\n",
       "      <td>HGBR</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>3.677199</td>\n",
       "      <td>7.263821</td>\n",
       "      <td>0.048081</td>\n",
       "      <td>ExtraTrees</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>3.673333</td>\n",
       "      <td>6.600086</td>\n",
       "      <td>0.104487</td>\n",
       "      <td>ExtraTrees</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>3.557926</td>\n",
       "      <td>7.090878</td>\n",
       "      <td>0.092870</td>\n",
       "      <td>RandomForest</td>\n",
       "      <td>1</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.511097</td>\n",
       "      <td>6.344130</td>\n",
       "      <td>0.172598</td>\n",
       "      <td>Lasso</td>\n",
       "      <td>2</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>3.507451</td>\n",
       "      <td>5.627415</td>\n",
       "      <td>0.165473</td>\n",
       "      <td>DecisionTree</td>\n",
       "      <td>5</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         MAE       RMSE         R2         model  fold  n_val\n",
       "8   8.972213  37.201319 -79.102177    ElasticNet     4     65\n",
       "0   7.342893  19.028144  -5.532246         Lasso     1     66\n",
       "5   5.347237   9.838825  -0.746450    ElasticNet     1     66\n",
       "10  4.347409   7.407078   0.010163  DecisionTree     1     66\n",
       "3   4.288895  12.729106  -8.378285         Lasso     4     65\n",
       "12  4.144338   9.678611  -0.058584  DecisionTree     3     65\n",
       "11  3.772958   7.109340  -0.039038  DecisionTree     2     65\n",
       "16  3.742208   6.724452   0.070421  RandomForest     2     65\n",
       "25  3.721214   7.376454   0.018331          HGBR     1     66\n",
       "26  3.684392   6.736473   0.067094          HGBR     2     65\n",
       "20  3.677199   7.263821   0.048081    ExtraTrees     1     66\n",
       "21  3.673333   6.600086   0.104487    ExtraTrees     2     65\n",
       "15  3.557926   7.090878   0.092870  RandomForest     1     66\n",
       "1   3.511097   6.344130   0.172598         Lasso     2     65\n",
       "14  3.507451   5.627415   0.165473  DecisionTree     5     65"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CELL 6 — GROUP K-FOLD CV: TRY MULTIPLE MODELS (FAIR COMPARISON)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import GroupKFold\n",
    "from sklearn.base import clone\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "\n",
    "from sklearn.linear_model import Lasso, ElasticNet\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor\n",
    "\n",
    "# =========================\n",
    "# 1) Prepare TRAIN data + groups\n",
    "# =========================\n",
    "# Features stay a (copied) DataFrame; target and group labels become NumPy arrays.\n",
    "X_tr_df = X_train.copy()\n",
    "g_tr = df.loc[train_idx, \"ma_dia_diem\"].astype(str).to_numpy()\n",
    "y_tr = y_train.astype(float).to_numpy()\n",
    "\n",
    "# =========================\n",
    "# 2) Metric helper\n",
    "# =========================\n",
    "def metrics(y_true, y_pred):\n",
    "    \"\"\"Return a dict with MAE, RMSE and R2, computed after flooring predictions at zero.\"\"\"\n",
    "    clipped = np.maximum(0, y_pred)\n",
    "    mae = mean_absolute_error(y_true, clipped)\n",
    "    rmse = mean_squared_error(y_true, clipped) ** 0.5\n",
    "    r2 = r2_score(y_true, clipped)\n",
    "    return {\"MAE\": mae, \"RMSE\": rmse, \"R2\": r2}\n",
    "\n",
    "def cv_eval_model(name, model, X_df, y, groups, n_splits=5, use_log=True):\n",
    "    \"\"\"Cross-validate one estimator with GroupKFold.\n",
    "\n",
    "    Args:\n",
    "        name: Label written to the model column of the per-fold table.\n",
    "        model: Unfitted estimator; a fresh clone is fitted in every fold.\n",
    "        X_df: Feature DataFrame; rows are selected positionally with .iloc.\n",
    "        y: Target values, array-like, positionally aligned with X_df.\n",
    "        groups: One group label per row, array-like, positionally aligned with X_df.\n",
    "        n_splits: Number of GroupKFold splits.\n",
    "        use_log: If True, fit on log1p(y) and map predictions back with expm1.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (out, summ). out has one row per fold with MAE, RMSE, R2, model,\n",
    "        fold and n_val. summ holds the mean and std of MAE, RMSE and R2 over folds.\n",
    "    \"\"\"\n",
    "    # Coerce to plain arrays so y[tr_i] is always positional; a pandas Series\n",
    "    # with an integer index may otherwise be looked up by label.\n",
    "    y = np.asarray(y)\n",
    "    groups = np.asarray(groups)\n",
    "\n",
    "    gkf = GroupKFold(n_splits=n_splits)\n",
    "    rows = []\n",
    "\n",
    "    for fold, (tr_i, va_i) in enumerate(gkf.split(X_df, y, groups=groups), start=1):\n",
    "        # Clone so that no fitted state carries over between folds.\n",
    "        m = clone(model)\n",
    "\n",
    "        Xtr = X_df.iloc[tr_i]\n",
    "        Xva = X_df.iloc[va_i]\n",
    "        ytr = y[tr_i]\n",
    "        yva = y[va_i]\n",
    "\n",
    "        if use_log:\n",
    "            m.fit(Xtr, np.log1p(ytr))\n",
    "            pred = np.expm1(m.predict(Xva))\n",
    "        else:\n",
    "            m.fit(Xtr, ytr)\n",
    "            pred = m.predict(Xva)\n",
    "\n",
    "        # metrics() floors predictions at zero before scoring.\n",
    "        mm = metrics(yva, pred)\n",
    "        mm.update({\"model\": name, \"fold\": fold, \"n_val\": len(va_i)})\n",
    "        rows.append(mm)\n",
    "\n",
    "    out = pd.DataFrame(rows)\n",
    "    summ = out[[\"MAE\", \"RMSE\", \"R2\"]].agg([\"mean\", \"std\"])\n",
    "    return out, summ\n",
    "\n",
    "# =========================\n",
    "# 3) Define candidates\n",
    "# =========================\n",
    "# Each entry is (name, estimator, use_log).\n",
    "candidates = [\n",
    "    # Linear models: wrapped in a pipeline with a StandardScaler.\n",
    "    (\n",
    "        \"Lasso\",\n",
    "        Pipeline([\n",
    "            (\"scaler\", StandardScaler()),\n",
    "            (\"model\", Lasso(alpha=0.01, max_iter=5000, random_state=42)),\n",
    "        ]),\n",
    "        True,\n",
    "    ),\n",
    "    (\n",
    "        \"ElasticNet\",\n",
    "        Pipeline([\n",
    "            (\"scaler\", StandardScaler()),\n",
    "            (\"model\", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=8000, random_state=42)),\n",
    "        ]),\n",
    "        True,\n",
    "    ),\n",
    "    # Tree and ensemble models: used without scaling.\n",
    "    (\n",
    "        \"DecisionTree\",\n",
    "        DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),\n",
    "        True,\n",
    "    ),\n",
    "    (\n",
    "        \"RandomForest\",\n",
    "        RandomForestRegressor(n_estimators=600, min_samples_leaf=3, random_state=42, n_jobs=-1),\n",
    "        True,\n",
    "    ),\n",
    "    (\n",
    "        \"ExtraTrees\",\n",
    "        ExtraTreesRegressor(n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1),\n",
    "        True,\n",
    "    ),\n",
    "    (\n",
    "        \"HGBR\",\n",
    "        HistGradientBoostingRegressor(\n",
    "            learning_rate=0.05, min_samples_leaf=20, max_leaf_nodes=31, random_state=42\n",
    "        ),\n",
    "        True,\n",
    "    ),\n",
    "]\n",
    "\n",
    "# =========================\n",
    "# 4) Run CV for all models\n",
    "# =========================\n",
    "all_fold = []  # per-fold metric tables, one per candidate\n",
    "rows_lb = []   # one leaderboard record per candidate\n",
    "\n",
    "for name, model, use_log in candidates:\n",
    "    fold_df, summ = cv_eval_model(\n",
    "        name, model, X_tr_df, y_tr, g_tr, n_splits=5, use_log=use_log\n",
    "    )\n",
    "    all_fold.append(fold_df)\n",
    "\n",
    "    # summ is indexed by statistic (mean/std) with one column per metric.\n",
    "    rows_lb.append({\n",
    "        \"model\": name,\n",
    "        \"MAE_mean\": summ.at[\"mean\", \"MAE\"],\n",
    "        \"MAE_std\": summ.at[\"std\", \"MAE\"],\n",
    "        \"RMSE_mean\": summ.at[\"mean\", \"RMSE\"],\n",
    "        \"R2_mean\": summ.at[\"mean\", \"R2\"],\n",
    "    })\n",
    "\n",
    "cv_all = pd.concat(all_fold, ignore_index=True)\n",
    "\n",
    "# Lowest mean MAE first; mean RMSE breaks ties.\n",
    "leaderboard = pd.DataFrame(rows_lb)\n",
    "leaderboard = leaderboard.sort_values(by=[\"MAE_mean\", \"RMSE_mean\"], ascending=True)\n",
    "\n",
    "print(\"=== GROUP K-FOLD CV LEADERBOARD (TRAIN ONLY) ===\")\n",
    "display(leaderboard)\n",
    "\n",
    "print(\"\\n=== Per-fold details (worst MAE first) ===\")\n",
    "worst_first = cv_all.sort_values(\"MAE\", ascending=False)\n",
    "display(worst_first.head(15))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1d408c02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\n",
      "\n",
      "[VAL (raw)]\n",
      "MAE : 1.968\n",
      "RMSE: 3.461\n",
      "R2  : 0.614\n",
      "\n",
      "[VAL (rounded)]\n",
      "MAE : 1.971\n",
      "RMSE: 3.502\n",
      "R2  : 0.604\n",
      "\n",
      "Worst 15 samples (by abs error):\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y_true</th>\n",
       "      <th>y_pred</th>\n",
       "      <th>y_pred_round</th>\n",
       "      <th>abs_err</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>32.0</td>\n",
       "      <td>16.876830</td>\n",
       "      <td>17</td>\n",
       "      <td>15.123170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29.0</td>\n",
       "      <td>16.242619</td>\n",
       "      <td>16</td>\n",
       "      <td>12.757381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>13.0</td>\n",
       "      <td>5.239388</td>\n",
       "      <td>5</td>\n",
       "      <td>7.760612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>1.0</td>\n",
       "      <td>7.994058</td>\n",
       "      <td>8</td>\n",
       "      <td>6.994058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>14.0</td>\n",
       "      <td>7.210027</td>\n",
       "      <td>7</td>\n",
       "      <td>6.789973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>12.0</td>\n",
       "      <td>5.706020</td>\n",
       "      <td>6</td>\n",
       "      <td>6.293980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>1.0</td>\n",
       "      <td>6.108304</td>\n",
       "      <td>6</td>\n",
       "      <td>5.108304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>1.0</td>\n",
       "      <td>5.948221</td>\n",
       "      <td>6</td>\n",
       "      <td>4.948221</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>2.0</td>\n",
       "      <td>6.426550</td>\n",
       "      <td>6</td>\n",
       "      <td>4.426550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>11.0</td>\n",
       "      <td>6.758044</td>\n",
       "      <td>7</td>\n",
       "      <td>4.241956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>10.0</td>\n",
       "      <td>6.354210</td>\n",
       "      <td>6</td>\n",
       "      <td>3.645790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>6.0</td>\n",
       "      <td>2.356056</td>\n",
       "      <td>2</td>\n",
       "      <td>3.643944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7.0</td>\n",
       "      <td>3.442200</td>\n",
       "      <td>3</td>\n",
       "      <td>3.557800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2.0</td>\n",
       "      <td>5.504394</td>\n",
       "      <td>6</td>\n",
       "      <td>3.504394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>3.0</td>\n",
       "      <td>6.087934</td>\n",
       "      <td>6</td>\n",
       "      <td>3.087934</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    y_true     y_pred  y_pred_round    abs_err\n",
       "34    32.0  16.876830            17  15.123170\n",
       "0     29.0  16.242619            16  12.757381\n",
       "14    13.0   5.239388             5   7.760612\n",
       "65     1.0   7.994058             8   6.994058\n",
       "28    14.0   7.210027             7   6.789973\n",
       "29    12.0   5.706020             6   6.293980\n",
       "60     1.0   6.108304             6   5.108304\n",
       "61     1.0   5.948221             6   4.948221\n",
       "56     2.0   6.426550             6   4.426550\n",
       "17    11.0   6.758044             7   4.241956\n",
       "58    10.0   6.354210             6   3.645790\n",
       "10     6.0   2.356056             2   3.643944\n",
       "23     7.0   3.442200             3   3.557800\n",
       "20     2.0   5.504394             6   3.504394\n",
       "62     3.0   6.087934             6   3.087934"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved:\n",
      " - Model  : ./artifacts\\extratrees_log1p.joblib\n",
      " - Columns: ./artifacts\\X_proc_columns.joblib\n"
     ]
    }
   ],
   "source": [
    "# CELL 7 — FINALIZE BEST MODEL (ExtraTrees) + EVAL ON HOLDOUT VAL + SAVE ARTIFACTS\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "import joblib\n",
    "\n",
    "# =========================\n",
    "# 1) Train final ExtraTrees on FULL TRAIN\n",
    "# =========================\n",
    "# Same hyper-parameters as the ExtraTrees candidate in the CV leaderboard cell.\n",
    "final_model = ExtraTreesRegressor(\n",
    "    n_estimators=800, min_samples_leaf=2, random_state=42, n_jobs=-1\n",
    ")\n",
    "\n",
    "# The model is trained on log1p(target), so predictions must go through expm1.\n",
    "final_model.fit(X_train.values, np.log1p(y_train.values))\n",
    "\n",
    "# =========================\n",
    "# 2) Predict on HOLDOUT VAL\n",
    "# =========================\n",
    "# Back-transform from log space and floor at zero in one expression.\n",
    "pred_val = np.maximum(0, np.expm1(final_model.predict(X_val.values)))\n",
    "\n",
    "# Integer version of the prediction (rounded to the nearest whole number).\n",
    "pred_val_round = np.rint(pred_val).astype(int)\n",
    "\n",
    "# =========================\n",
    "# 3) Metrics (raw vs rounded)\n",
    "# =========================\n",
    "def print_metrics(tag, y_true, y_pred):\n",
    "    \"\"\"Print MAE, RMSE and R2 under a [tag] header and return them as a dict.\"\"\"\n",
    "    scores = {\n",
    "        \"MAE\": mean_absolute_error(y_true, y_pred),\n",
    "        \"RMSE\": mean_squared_error(y_true, y_pred) ** 0.5,\n",
    "        \"R2\": r2_score(y_true, y_pred),\n",
    "    }\n",
    "    print(f\"\\n[{tag}]\")\n",
    "    print(f\"MAE : {scores['MAE']:.3f}\")\n",
    "    print(f\"RMSE: {scores['RMSE']:.3f}\")\n",
    "    print(f\"R2  : {scores['R2']:.3f}\")\n",
    "    return scores\n",
    "\n",
    "print(\"=== FINAL MODEL: ExtraTrees (fit on TRAIN, eval on HOLDOUT VAL) ===\")\n",
    "# Score the continuous predictions and their rounded counterparts separately.\n",
    "m_raw = print_metrics(\"VAL (raw)\", y_val.values, pred_val)\n",
    "m_int = print_metrics(\"VAL (rounded)\", y_val.values, pred_val_round)\n",
    "\n",
    "# =========================\n",
    "# 4) Quick error analysis\n",
    "# =========================\n",
    "err_df = pd.DataFrame({\n",
    "    \"y_true\": y_val.values,\n",
    "    \"y_pred\": pred_val,\n",
    "    \"y_pred_round\": pred_val_round,\n",
    "})\n",
    "# Absolute error is measured against the raw (unrounded) prediction.\n",
    "err_df[\"abs_err\"] = (err_df[\"y_true\"] - err_df[\"y_pred\"]).abs()\n",
    "\n",
    "print(\"\\nWorst 15 samples (by abs error):\")\n",
    "worst_samples = err_df.sort_values(\"abs_err\", ascending=False)\n",
    "display(worst_samples.head(15))\n",
    "\n",
    "# =========================\n",
    "# 5) Save model + schema (columns)\n",
    "# =========================\n",
    "import os\n",
    "\n",
    "ARTIFACT_DIR = \"./artifacts\"\n",
    "model_path = os.path.join(ARTIFACT_DIR, \"extratrees_log1p.joblib\")\n",
    "cols_path  = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
    "\n",
    "# Create the target folder if needed, then persist the fitted model\n",
    "# together with the training column order.\n",
    "os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
    "joblib.dump(final_model, model_path)\n",
    "joblib.dump(X_train.columns.tolist(), cols_path)\n",
    "\n",
    "print(\"\\nSaved:\")\n",
    "print(\" - Model  :\", model_path)\n",
    "print(\" - Columns:\", cols_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "9cff151d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Preview:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ma_dia_diem</th>\n",
       "      <th>so_luong_thuc_te</th>\n",
       "      <th>so_luong_du_doan_raw</th>\n",
       "      <th>so_luong_du_doan_round</th>\n",
       "      <th>abs_error</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>579-1</td>\n",
       "      <td>32.0</td>\n",
       "      <td>16.876830</td>\n",
       "      <td>17</td>\n",
       "      <td>15.123170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>114-1</td>\n",
       "      <td>29.0</td>\n",
       "      <td>16.242619</td>\n",
       "      <td>16</td>\n",
       "      <td>12.757381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>121-3</td>\n",
       "      <td>13.0</td>\n",
       "      <td>5.239388</td>\n",
       "      <td>5</td>\n",
       "      <td>7.760612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>227-1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>7.994058</td>\n",
       "      <td>8</td>\n",
       "      <td>6.994058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>55-1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>7.210027</td>\n",
       "      <td>7</td>\n",
       "      <td>6.789973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>55-1</td>\n",
       "      <td>12.0</td>\n",
       "      <td>5.706020</td>\n",
       "      <td>6</td>\n",
       "      <td>6.293980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>236-1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6.108304</td>\n",
       "      <td>6</td>\n",
       "      <td>5.108304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>236-1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.948221</td>\n",
       "      <td>6</td>\n",
       "      <td>4.948221</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>236-1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>6.426550</td>\n",
       "      <td>6</td>\n",
       "      <td>4.426550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>121-4</td>\n",
       "      <td>11.0</td>\n",
       "      <td>6.758044</td>\n",
       "      <td>7</td>\n",
       "      <td>4.241956</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ma_dia_diem  so_luong_thuc_te  so_luong_du_doan_raw  \\\n",
       "34       579-1              32.0             16.876830   \n",
       "0        114-1              29.0             16.242619   \n",
       "14       121-3              13.0              5.239388   \n",
       "65       227-1               1.0              7.994058   \n",
       "28        55-1              14.0              7.210027   \n",
       "29        55-1              12.0              5.706020   \n",
       "60       236-1               1.0              6.108304   \n",
       "61       236-1               1.0              5.948221   \n",
       "56       236-1               2.0              6.426550   \n",
       "17       121-4              11.0              6.758044   \n",
       "\n",
       "    so_luong_du_doan_round  abs_error  \n",
       "34                      17  15.123170  \n",
       "0                       16  12.757381  \n",
       "14                       5   7.760612  \n",
       "65                       8   6.994058  \n",
       "28                       7   6.789973  \n",
       "29                       6   6.293980  \n",
       "60                       6   5.108304  \n",
       "61                       6   4.948221  \n",
       "56                       6   4.426550  \n",
       "17                       7   4.241956  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved CSV to: ./outputs\\val_predictions_extratrees.csv\n",
      "Rows: 68\n"
     ]
    }
   ],
   "source": [
    "# CELL 8 — EXPORT VAL PREDICTIONS TO CSV (FOR ANALYSIS / BUSINESS REVIEW)\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# =========================\n",
    "# 1) Recompute predictions (safety, explicit)\n",
    "# =========================\n",
    "pred_val_raw = np.expm1(final_model.predict(X_val.values))\n",
    "pred_val_raw = np.maximum(0, pred_val_raw)\n",
    "\n",
    "pred_val_round = np.rint(pred_val_raw).astype(int)\n",
    "\n",
    "# =========================\n",
    "# 2) Build result DataFrame\n",
    "# =========================\n",
    "val_result = pd.DataFrame({\n",
    "    \"ma_dia_diem\": df.loc[val_idx, \"ma_dia_diem\"].values,\n",
    "    \"so_luong_thuc_te\": y_val.values,\n",
    "    \"so_luong_du_doan_raw\": pred_val_raw,\n",
    "    \"so_luong_du_doan_round\": pred_val_round,\n",
    "    \"abs_error\": np.abs(y_val.values - pred_val_raw),\n",
    "})\n",
    "\n",
    "# (optional) sort by error to review bad cases first\n",
    "val_result = val_result.sort_values(\"abs_error\", ascending=False)\n",
    "\n",
    "print(\"Preview:\")\n",
    "display(val_result.head(10))\n",
    "\n",
    "# =========================\n",
    "# 3) Save to CSV\n",
    "# =========================\n",
    "OUTPUT_DIR = \"./outputs\"\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "csv_path = os.path.join(OUTPUT_DIR, \"val_predictions_extratrees.csv\")\n",
    "val_result.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
    "\n",
    "print(f\"\\nSaved CSV to: {csv_path}\")\n",
    "print(f\"Rows: {len(val_result)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "8cb3cde1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== MODEL SAVED SUCCESSFULLY ===\n",
      "Model file   : ./artifacts\\extratrees_staff_model.joblib\n",
      "Columns file : ./artifacts\\X_proc_columns.joblib\n",
      "Meta file    : ./artifacts\\model_meta.joblib\n"
     ]
    }
   ],
   "source": [
    "# CELL 8 — SAVE TRAINED MODEL (NO PREDICT YET)\n",
    "\n",
    "import os\n",
    "import joblib\n",
    "\n",
    "# =========================\n",
    "# 1) Create artifact directory\n",
    "# =========================\n",
    "ARTIFACT_DIR = \"./artifacts\"\n",
    "os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
    "\n",
    "# =========================\n",
    "# 2) Define paths\n",
    "# =========================\n",
    "MODEL_PATH = os.path.join(ARTIFACT_DIR, \"extratrees_staff_model.joblib\")\n",
    "COLUMNS_PATH = os.path.join(ARTIFACT_DIR, \"X_proc_columns.joblib\")\n",
    "META_PATH = os.path.join(ARTIFACT_DIR, \"model_meta.joblib\")\n",
    "\n",
    "# =========================\n",
    "# 3) Save model\n",
    "# =========================\n",
    "joblib.dump(final_model, MODEL_PATH)\n",
    "\n",
    "# =========================\n",
    "# 4) Save feature schema (VERY IMPORTANT)\n",
    "# =========================\n",
    "joblib.dump(list(X_train.columns), COLUMNS_PATH)\n",
    "\n",
    "# =========================\n",
    "# 5) Save metadata (optional but professional)\n",
    "# =========================\n",
    "meta = {\n",
    "    \"model_type\": \"ExtraTreesRegressor\",\n",
    "    \"target\": \"so_luong\",\n",
    "    \"target_transform\": \"log1p -> expm1\",\n",
    "    \"train_size\": len(X_train),\n",
    "    \"val_size\": len(X_val),\n",
    "    \"features\": X_train.shape[1],\n",
    "    \"note\": \"Predict staff headcount per shift\",\n",
    "}\n",
    "\n",
    "joblib.dump(meta, META_PATH)\n",
    "\n",
    "print(\"=== MODEL SAVED SUCCESSFULLY ===\")\n",
    "print(\"Model file   :\", MODEL_PATH)\n",
    "print(\"Columns file :\", COLUMNS_PATH)\n",
    "print(\"Meta file    :\", META_PATH)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0eab135b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['tong_gio_lam', 'so_ca_cua_toa', 'num_tasks', 'num_cleaning_tasks',\n",
       "       'num_trash_collection_tasks', 'num_monitoring_tasks',\n",
       "       'num_deep_cleaning_tasks', 'num_support_tasks', 'num_other_tasks',\n",
       "       'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks',\n",
       "       'num_outdoor_tasks', 'num_elevator_tasks', 'cleaning_ratio',\n",
       "       'trash_collection_ratio', 'monitoring_ratio', 'area_diversity',\n",
       "       'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh', 'dien_tich_sanh',\n",
       "       'dien_tich_hanh_lang', 'dien_tich_wc', 'dien_tich_phong',\n",
       "       'dien_tich_tham', 'doc_ham', 'vien_phan_quang', 'op_tuong',\n",
       "       'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
       "       'num_medical_tasks_total', 'num_indoor_room_tasks', 'hour_start',\n",
       "       'hour_end', 'shift_length', 'is_cross_day', 'loai_ca_24/24',\n",
       "       'loai_ca_Ca chiều', 'loai_ca_Ca gãy', 'loai_ca_Ca sáng',\n",
       "       'loai_ca_Ca đêm', 'loai_ca_Hành chính', 'loai_ca_Part time',\n",
       "       'loai_ca_nan'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the feature columns of the validation matrix X_val.\n",
    "X_val.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "1dd44caa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ma_dia_diem', 'all_task_normal', 'all_task_dinhky', 'loai_ca',\n",
       "       'bat_dau', 'ket_thuc', 'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',\n",
       "       'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',\n",
       "       'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',\n",
       "       'num_other_tasks', 'num_wc_tasks', 'num_hallway_tasks',\n",
       "       'num_lobby_tasks', 'num_outdoor_tasks', 'num_elevator_tasks',\n",
       "       'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',\n",
       "       'area_diversity', 'so_tang', 'so_cua_thang_may', 'dien_tich_ngoai_canh',\n",
       "       'dien_tich_sanh', 'dien_tich_hanh_lang', 'dien_tich_wc',\n",
       "       'dien_tich_phong', 'dien_tich_tham', 'doc_ham', 'vien_phan_quang',\n",
       "       'op_tuong', 'op_chan_tuong', 'ranh_thoat_nuoc', 'dien_tich_kinh',\n",
       "       'num_medical_tasks_total', 'num_indoor_room_tasks',\n",
       "       'is_tasks_text_missing'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the columns of df, the frame that ma_dia_diem is read from when\n",
    "# the validation predictions are exported.\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7036167",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "d8dbd670",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "3ebac85b",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "22866fc4",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}