"""
MERGE 3 FILES: SHIFT + TASK + BUILDING FEATURES

Merge three Excel files (shift, task and building features) into a single
dataset used to predict the number of staff required per shift.

Created: January 5, 2026
"""

from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd

# Join key shared by all three datasets (building / location code).
_KEY = "ma_dia_diem"
# Target variable (headcount) that must be present in the shift dataset.
_TARGET = "so_luong"
# Name of the join key as it appears in the raw building file.
_BUILDING_KEY_SOURCE = "Mã địa điểm"
# Raw free-text task columns that are dropped before modelling.
_TASK_TEXT_COLUMNS = ["all_task_normal", "all_task_dinhky"]
# Raw building column names -> ASCII snake_case names.
_BUILDING_COLUMN_MAPPING = {
    "Loại hình": "loai_hinh",
    "Tên Tòa Tháp": "ten_toa_thap",
    "Mức độ Lưu lượng KH": "muc_do_luu_luong",
    "Số tầng": "so_tang",
    "Tổng số cửa thang máy": "so_cua_thang_may",
    "Diện tích ngoại cảnh Tòa tháp (m2)": "dien_tich_ngoai_canh",
    "Sàn Sảnh (m2)": "dien_tich_sanh",
    "Sàn Hành lang (m2)": "dien_tich_hanh_lang",
    "Sàn WC (m2)": "dien_tich_wc",
    "Sàn Phòng (m2)": "dien_tich_phong",
    "Thảm (m2)": "dien_tich_tham",
    "Dốc hầm (m)": "doc_ham",
    "Viền phản quang (m)": "vien_phan_quang",
    "Ốp tường (m2)": "op_tuong",
    "Ốp chân tường (m2)": "op_chan_tuong",
    "Rãnh thoát nước (m)": "ranh_thoat_nuoc",
    "Kính (m2)": "dien_tich_kinh",
}

_RULE = "=" * 80
_SECTION_RULE = "=" * 69


def _load_frame(source: Union[str, pd.DataFrame]) -> pd.DataFrame:
    """Return *source* itself if it is a DataFrame, else read it as Excel."""
    if isinstance(source, pd.DataFrame):
        return source
    return pd.read_excel(source)


def _source_label(source: Union[str, pd.DataFrame]) -> str:
    """Return a short printable label for a path or in-memory frame."""
    if isinstance(source, pd.DataFrame):
        return "<in-memory DataFrame>"
    return str(source)


def _require_columns(df: pd.DataFrame, required: List[str], label: str) -> None:
    """Raise ``KeyError`` naming every column of *required* absent from *df*."""
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{label} is missing required column(s): {missing}")


def _unique_per_key(df: pd.DataFrame, key: str, label: str) -> pd.DataFrame:
    """Keep the first row per *key* so a left join cannot multiply rows."""
    duplicate_mask = df.duplicated(subset=key, keep="first")
    n_duplicates = int(duplicate_mask.sum())
    if n_duplicates:
        print(
            f" ⚠️ {label}: dropped {n_duplicates} row(s) with a duplicated "
            f"'{key}' (kept first) so the join keeps one row per shift"
        )
        df = df.loc[~duplicate_mask]
    return df


def _normalize_building_columns(df_building: pd.DataFrame) -> pd.DataFrame:
    """Rename the raw building columns (key included) to snake_case names."""
    # Accept files whose key column has already been normalised.
    if _BUILDING_KEY_SOURCE in df_building.columns and _KEY not in df_building.columns:
        df_building = df_building.rename(columns={_BUILDING_KEY_SOURCE: _KEY})
    return df_building.rename(columns=_BUILDING_COLUMN_MAPPING)


def _left_join_features(
    base: pd.DataFrame,
    features: pd.DataFrame,
    key: str,
    suffix: str,
) -> Tuple[pd.DataFrame, List[str]]:
    """Left-join *features* onto *base* and report the joined column names.

    Feature columns whose name already exists in *base* are renamed with
    *suffix* before the merge, so the returned list always refers to the
    columns that came from *features* (never to a same-named base column).

    Returns:
        ``(merged, feature_columns)``.
    """
    taken = set(base.columns) - {key}
    renames = {col: f"{col}{suffix}" for col in features.columns if col in taken}
    features = features.rename(columns=renames)
    feature_columns = [col for col in features.columns if col != key]
    merged = base.merge(features, on=key, how="left", suffixes=("", suffix))
    return merged, feature_columns


def _report_missing_values(df_final: pd.DataFrame) -> None:
    """Print the (up to 20) columns with the most missing values."""
    print(f"\n ⚠️ MISSING VALUES BY COLUMN:")
    missing_summary = df_final.isna().sum()
    missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
    if len(missing_summary) == 0:
        print(f" ✅ No missing values!")
        return
    print(f" Found {len(missing_summary)} columns with missing values:")
    for col, count in missing_summary.head(20).items():
        pct = count / len(df_final) * 100
        print(f" - {col}: {count} ({pct:.1f}%)")


def _report_target(df_final: pd.DataFrame) -> None:
    """Print summary statistics of the target column."""
    target = df_final[_TARGET]
    print(f" 🎯 Target variable (so_luong):")
    print(f" - Count: {target.notna().sum()}")
    print(f" - Missing: {target.isna().sum()}")
    print(f" - Min: {target.min()}")
    print(f" - Mean: {target.mean():.2f}")
    print(f" - Median: {target.median():.0f}")
    print(f" - Max: {target.max()}")


def _report_coverage(
    df_shift: pd.DataFrame,
    df_task: pd.DataFrame,
    df_building: pd.DataFrame,
) -> None:
    """Print how many building codes each dataset covers and their overlap."""
    print(f"\n 📊 Feature coverage:")
    shift_buildings = set(df_shift[_KEY].unique())
    task_buildings = set(df_task[_KEY].unique())
    building_buildings = set(df_building[_KEY].unique())
    print(f" - Shifts: {len(shift_buildings)} buildings")
    print(f" - Tasks: {len(task_buildings)} buildings")
    print(f" - Building info: {len(building_buildings)} buildings")

    shift_with_task = shift_buildings & task_buildings
    shift_with_building = shift_buildings & building_buildings
    all_three = shift_with_task & building_buildings
    print(f"\n 🔗 Overlap analysis:")
    print(f" - Shifts ∩ Tasks: {len(shift_with_task)} buildings")
    print(f" - Shifts ∩ Building: {len(shift_with_building)} buildings")
    print(f" - All three: {len(all_three)} buildings")

    # Codes that appear in the shift data but in neither lookup table.
    shift_only = shift_buildings - task_buildings - building_buildings
    if shift_only:
        print(f"\n ⚠️ Buildings with shift only (no task/building data): {len(shift_only)}")
        print(f" Examples: {list(shift_only)[:10]}")


def merge_three_datasets(
    shift_file: Union[str, pd.DataFrame],
    task_file: Union[str, pd.DataFrame],
    building_file: Union[str, pd.DataFrame],
    output_excel: Optional[str] = None,
    output_csv: Optional[str] = None,
) -> pd.DataFrame:
    """Left-join task and building features onto the shift dataset.

    The shift dataset is the base table: every shift row is kept exactly
    once. Task and building features are looked up by ``ma_dia_diem``; if a
    lookup table contains several rows for the same code, only the first is
    used (a warning is printed) so the join cannot multiply shift rows.
    Feature columns whose name collides with an existing column receive the
    ``_task`` / ``_building`` suffix.

    The original notes quote 942x7 (shift), 302x28 (task) and 233x18
    (building) as the input shapes; these are not enforced here.

    Args:
        shift_file: Path to the shift Excel file, or an already-loaded
            DataFrame. Must contain ``ma_dia_diem`` and ``so_luong``.
        task_file: Path or DataFrame of task features keyed by
            ``ma_dia_diem``. The raw text columns ``all_task_normal`` and
            ``all_task_dinhky`` are dropped if present.
        building_file: Path or DataFrame of building features keyed by
            ``Mã địa điểm`` (or ``ma_dia_diem``).
        output_excel: Destination ``.xlsx`` path, or ``None`` to skip.
        output_csv: Destination CSV path (UTF-8 with BOM), or ``None`` to skip.

    Returns:
        The merged DataFrame, one row per input shift row.

    Raises:
        KeyError: If a dataset lacks its join key, or the shift dataset
            lacks the ``so_luong`` target column.
    """
    print(_RULE)
    print("🚀 MERGE 3 DATASETS: SHIFT + TASK + BUILDING FEATURES")
    print(_RULE)

    # ---- 1. Shift features (base dataset) -------------------------------
    print(f"\n📂 [1/3] Đọc file SHIFT features: {_source_label(shift_file)}")
    df_shift = _load_frame(shift_file)
    # Fail before any merging work if the essentials are missing.
    _require_columns(df_shift, [_KEY, _TARGET], "Shift dataset")
    print(f" ✅ Shape: {df_shift.shape}")
    print(f" ✅ Columns: {list(df_shift.columns)}")
    print(f" ✅ Unique buildings: {df_shift[_KEY].nunique()}")

    # ---- 2. Task features ------------------------------------------------
    print(f"\n📂 [2/3] Đọc file TASK features: {_source_label(task_file)}")
    df_task = _load_frame(task_file)
    _require_columns(df_task, [_KEY], "Task dataset")
    print(f" ✅ Shape: {df_task.shape}")
    print(f" ✅ Unique buildings: {df_task[_KEY].nunique()}")
    # The raw text columns are not needed for modelling.
    df_task = df_task.drop(columns=[c for c in _TASK_TEXT_COLUMNS if c in df_task.columns])
    print(f" ✅ Đã loại bỏ cột text, còn lại: {df_task.shape[1]} cột")
    print(f" ✅ Task feature columns: {list(df_task.columns)}")
    df_task = _unique_per_key(df_task, _KEY, "Task dataset")

    # ---- 3. Building features --------------------------------------------
    print(f"\n📂 [3/3] Đọc file BUILDING features: {_source_label(building_file)}")
    df_building = _load_frame(building_file)
    print(f" ✅ Shape: {df_building.shape}")
    df_building = _normalize_building_columns(df_building)
    _require_columns(df_building, [_KEY], "Building dataset")
    print(f" ✅ Unique buildings: {df_building[_KEY].nunique()}")
    print(f" ✅ Đã rename columns: {list(df_building.columns)}")
    df_building = _unique_per_key(df_building, _KEY, "Building dataset")

    # ---- 4. Shift + task -------------------------------------------------
    print(f"\n🔗 [MERGE 1/2] Merge SHIFT + TASK features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")
    df_merged, task_features = _left_join_features(df_shift, df_task, _KEY, "_task")
    print(f" ✅ Result shape: {df_merged.shape}")
    print(f" ✅ Số shift giữ nguyên: {len(df_merged)} (expected: {len(df_shift)})")
    missing_task_features = df_merged[task_features].isna().sum().sum()
    print(f" ⚠️ Missing values trong task features: {missing_task_features}")

    # ---- 5. (Shift + task) + building ------------------------------------
    print(f"\n🔗 [MERGE 2/2] Merge (SHIFT+TASK) + BUILDING features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")
    df_final, building_features = _left_join_features(
        df_merged, df_building, _KEY, "_building"
    )
    print(f" ✅ Result shape: {df_final.shape}")
    print(f" ✅ Số shift giữ nguyên: {len(df_final)} (expected: {len(df_shift)})")
    missing_building_features = df_final[building_features].isna().sum().sum()
    print(f" ⚠️ Missing values trong building features: {missing_building_features}")

    # ---- 6. Final statistics ---------------------------------------------
    # Counts are derived from the loaded data rather than hard-coded.
    n_shift_cols = df_shift.shape[1]
    print(f"\n📊 FINAL DATASET STATISTICS:")
    print(f" 📐 Shape: {df_final.shape}")
    print(f" 🏢 Unique buildings: {df_final[_KEY].nunique()}")
    print(f" 📋 Total columns: {len(df_final.columns)}")
    print(f"\n 📋 COLUMN BREAKDOWN:")
    print(f" - Shift features: {n_shift_cols} cols")
    print(f" - Task features: {len(task_features)} cols")
    print(f" - Building features: {len(building_features)} cols")
    print(f" - Total: {n_shift_cols + len(task_features) + len(building_features)} cols")
    _report_missing_values(df_final)

    # ---- 7. Data validation ----------------------------------------------
    print(f"\n✅ DATA VALIDATION:")
    _report_target(df_final)
    _report_coverage(df_shift, df_task, df_building)

    # ---- 8. Export -------------------------------------------------------
    print(f"\n💾 EXPORTING FILES...")
    created = []
    if output_excel is not None:
        print(f" [1/2] Exporting Excel: {output_excel}")
        df_final.to_excel(output_excel, index=False, engine='openpyxl')
        print(f" ✅ Done!")
        created.append(
            f"{output_excel} ({df_final.shape[0]} rows × {df_final.shape[1]} columns)"
        )
    else:
        print(f" [1/2] Excel export skipped (no output path given)")
    if output_csv is not None:
        print(f" [2/2] Exporting CSV: {output_csv}")
        # utf-8-sig writes a BOM along with the UTF-8 text.
        df_final.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f" ✅ Done!")
        created.append(f"{output_csv} (CSV backup)")
    else:
        print(f" [2/2] CSV export skipped (no output path given)")

    # ---- 9. Summary ------------------------------------------------------
    shift_features = [c for c in df_shift.columns if c not in (_KEY, _TARGET)]
    print("\n" + _RULE)
    print("✅ MERGE COMPLETED!")
    print(_RULE)
    if created:
        print(f"\n📁 FILES CREATED:")
        for position, description in enumerate(created, 1):
            print(f" {position}. {description}")
    print(f"\n📋 COLUMN STRUCTURE:")
    print(f" - ma_dia_diem (identifier)")
    print(
        f" - Shift features ({len(shift_features)}): "
        f"{', '.join(str(c) for c in shift_features)}"
    )
    print(f" - Task features ({len(task_features)}): num_tasks, cleaning_ratio, ...")
    print(f" - Building features ({len(building_features)}): so_tang, dien_tich_*, ...")
    print(f" - so_luong (TARGET)")
    print(f"\n🎯 READY FOR MACHINE LEARNING!")
    print(f" - Total samples: {len(df_final)}")
    print(f" - Total features: {df_final.shape[1] - 2} (excluding ma_dia_diem & target)")
    print(f" - Target variable: so_luong")

    return df_final


if __name__ == "__main__":
    # File paths
    shift_file = "shift_features_for_prediction.xlsx"
    task_file = "ket_qua_cong_viec_full_WITH_FEATURES.xlsx"
    building_file = "Du_Lieu_Toa_Nha_Aggregate.xlsx"
    output_excel = "COMPLETE_DATASET_FOR_PREDICTION.xlsx"
    output_csv = "COMPLETE_DATASET_FOR_PREDICTION.csv"

    # Run merge
    df_final = merge_three_datasets(
        shift_file,
        task_file,
        building_file,
        output_excel,
        output_csv,
    )

    # Display sample
    print(f"\n📊 SAMPLE DATA (first 5 rows, first 15 columns):")
    print(df_final.iloc[:5, :15].to_string())

    print(f"\n📊 COLUMN LIST:")
    for i, col in enumerate(df_final.columns, 1):
        print(f" {i:2d}. {col}")