271 lines
11 KiB
Python
271 lines
11 KiB
Python
"""
|
||
MERGE 3 FILES: SHIFT + TASK + BUILDING FEATURES
|
||
Gộp 3 file Excel thành 1 file tổng để predict số nhân sự
|
||
Created: January 5, 2026
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
def _load_shift_features(shift_file: str) -> pd.DataFrame:
    """Read the base shift-features workbook (one row per shift) and log its shape."""
    print(f"\n📂 [1/3] Đọc file SHIFT features: {shift_file}")
    df_shift = pd.read_excel(shift_file)

    print(f" ✅ Shape: {df_shift.shape}")
    print(f" ✅ Columns: {list(df_shift.columns)}")
    print(f" ✅ Unique buildings: {df_shift['ma_dia_diem'].nunique()}")
    return df_shift


def _load_task_features(task_file: str) -> pd.DataFrame:
    """Read the task-features workbook and drop raw free-text columns not used for modeling."""
    print(f"\n📂 [2/3] Đọc file TASK features: {task_file}")
    df_task = pd.read_excel(task_file)

    print(f" ✅ Shape: {df_task.shape}")
    print(f" ✅ Unique buildings: {df_task['ma_dia_diem'].nunique()}")

    # Drop the original free-text columns (not needed for modeling).
    cols_to_drop = ['all_task_normal', 'all_task_dinhky']
    df_task = df_task.drop(columns=[c for c in cols_to_drop if c in df_task.columns])

    print(f" ✅ Đã loại bỏ cột text, còn lại: {df_task.shape[1]} cột")
    print(f" ✅ Task feature columns: {list(df_task.columns)}")
    return df_task


def _load_building_features(building_file: str) -> pd.DataFrame:
    """Read the building workbook and normalize Vietnamese headers to snake_case names."""
    print(f"\n📂 [3/3] Đọc file BUILDING features: {building_file}")
    df_building = pd.read_excel(building_file)

    print(f" ✅ Shape: {df_building.shape}")
    print(f" ✅ Unique buildings: {df_building['Mã địa điểm'].nunique()}")

    # Rename the join-key column so it matches the other two files.
    df_building = df_building.rename(columns={'Mã địa điểm': 'ma_dia_diem'})

    # Normalize the remaining headers (strip diacritics / spaces) for easier use.
    column_mapping = {
        'Loại hình': 'loai_hinh',
        'Tên Tòa Tháp': 'ten_toa_thap',
        'Mức độ Lưu lượng KH': 'muc_do_luu_luong',
        'Số tầng': 'so_tang',
        'Tổng số cửa thang máy': 'so_cua_thang_may',
        'Diện tích ngoại cảnh Tòa tháp (m2)': 'dien_tich_ngoai_canh',
        'Sàn Sảnh (m2)': 'dien_tich_sanh',
        'Sàn Hành lang (m2)': 'dien_tich_hanh_lang',
        'Sàn WC (m2)': 'dien_tich_wc',
        'Sàn Phòng (m2)': 'dien_tich_phong',
        'Thảm (m2)': 'dien_tich_tham',
        'Dốc hầm (m)': 'doc_ham',
        'Viền phản quang (m)': 'vien_phan_quang',
        'Ốp tường (m2)': 'op_tuong',
        'Ốp chân tường (m2)': 'op_chan_tuong',
        'Rãnh thoát nước (m)': 'ranh_thoat_nuoc',
        'Kính (m2)': 'dien_tich_kinh'
    }
    df_building = df_building.rename(columns=column_mapping)

    print(f" ✅ Đã rename columns: {list(df_building.columns)}")
    return df_building


def _report_missing(df: pd.DataFrame, feature_cols: list, label: str) -> None:
    """Print the total NaN count across *feature_cols* (NaNs introduced by a LEFT join)."""
    missing = df[feature_cols].isna().sum().sum()
    print(f" ⚠️ Missing values trong {label}: {missing}")


def merge_three_datasets(
    shift_file: str,
    task_file: str,
    building_file: str,
    output_excel: str,
    output_csv: str
) -> pd.DataFrame:
    """
    Merge three Excel datasets into one modeling table.

    1. shift_features_for_prediction.xlsx (942 rows × 7 cols) — base dataset
    2. ket_qua_cong_viec_full_WITH_FEATURES.xlsx (302 rows × 28 cols)
    3. Du_Lieu_Toa_Nha_Aggregate.xlsx (233 rows × 18 cols)

    Join key: ma_dia_diem (building code)
    Join type: LEFT JOIN (keep all shifts)

    Expected output: 942 rows × (7 + 25 + 17) = 49 cols

    Parameters
    ----------
    shift_file, task_file, building_file : str
        Paths to the three input Excel workbooks.
    output_excel, output_csv : str
        Paths where the merged dataset is written (Excel + UTF-8-sig CSV).

    Returns
    -------
    pd.DataFrame
        The merged dataset (one row per shift, target column ``so_luong``).

    NOTE(review): both merges assume ``ma_dia_diem`` is unique in the task and
    building tables; a duplicated key there would silently duplicate shift
    rows. The "Số shift giữ nguyên" log lines expose this if it happens.
    """
    print("=" * 80)
    print("🚀 MERGE 3 DATASETS: SHIFT + TASK + BUILDING FEATURES")
    print("=" * 80)

    # ---- 1-3. Load the three source workbooks -------------------------------
    df_shift = _load_shift_features(shift_file)
    df_task = _load_task_features(task_file)
    df_building = _load_building_features(building_file)

    # ---- 4. MERGE dataset 1 (SHIFT) + dataset 2 (TASK) ----------------------
    print(f"\n🔗 [MERGE 1/2] Merge SHIFT + TASK features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")

    df_merged = df_shift.merge(
        df_task,
        on='ma_dia_diem',
        how='left',
        suffixes=('', '_task')
    )

    print(f" ✅ Result shape: {df_merged.shape}")
    print(f" ✅ Số shift giữ nguyên: {len(df_merged)} (expected: {len(df_shift)})")

    # NaNs here mean a shift's building had no task data.
    task_features = [col for col in df_task.columns if col != 'ma_dia_diem']
    _report_missing(df_merged, task_features, "task features")

    # ---- 5. MERGE result + dataset 3 (BUILDING) -----------------------------
    print(f"\n🔗 [MERGE 2/2] Merge (SHIFT+TASK) + BUILDING features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")

    df_final = df_merged.merge(
        df_building,
        on='ma_dia_diem',
        how='left',
        suffixes=('', '_building')
    )

    print(f" ✅ Result shape: {df_final.shape}")
    print(f" ✅ Số shift giữ nguyên: {len(df_final)} (expected: {len(df_shift)})")

    building_features = [col for col in df_building.columns if col != 'ma_dia_diem']
    _report_missing(df_final, building_features, "building features")

    # ---- 6. Final statistics ------------------------------------------------
    print(f"\n📊 FINAL DATASET STATISTICS:")
    print(f" 📐 Shape: {df_final.shape}")
    print(f" 🏢 Unique buildings: {df_final['ma_dia_diem'].nunique()}")
    print(f" 📋 Total columns: {len(df_final.columns)}")

    # FIX: derive the shift-column count from the loaded frame instead of the
    # hard-coded 7 so the breakdown stays correct if the shift file changes.
    n_shift_cols = len(df_shift.columns)
    print(f"\n 📋 COLUMN BREAKDOWN:")
    print(f" - Shift features: {n_shift_cols} cols")
    print(f" - Task features: {len(task_features)} cols")
    print(f" - Building features: {len(building_features)} cols")
    print(f" - Total: {n_shift_cols + len(task_features) + len(building_features)} cols")

    # Overall missing-value report (top 20 worst columns).
    print(f"\n ⚠️ MISSING VALUES BY COLUMN:")
    missing_summary = df_final.isna().sum()
    missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

    if len(missing_summary) > 0:
        print(f" Found {len(missing_summary)} columns with missing values:")
        for col, count in missing_summary.head(20).items():
            pct = count / len(df_final) * 100
            print(f" - {col}: {count} ({pct:.1f}%)")
    else:
        print(f" ✅ No missing values!")

    # ---- 7. Data validation -------------------------------------------------
    print(f"\n✅ DATA VALIDATION:")

    # Target-variable sanity check.
    print(f" 🎯 Target variable (so_luong):")
    print(f" - Count: {df_final['so_luong'].notna().sum()}")
    print(f" - Missing: {df_final['so_luong'].isna().sum()}")
    print(f" - Min: {df_final['so_luong'].min()}")
    print(f" - Mean: {df_final['so_luong'].mean():.2f}")
    print(f" - Median: {df_final['so_luong'].median():.0f}")
    print(f" - Max: {df_final['so_luong'].max()}")

    # How many buildings appear in each source.
    print(f"\n 📊 Feature coverage:")
    shift_buildings = set(df_shift['ma_dia_diem'].unique())
    task_buildings = set(df_task['ma_dia_diem'].unique())
    building_buildings = set(df_building['ma_dia_diem'].unique())

    print(f" - Shifts: {len(shift_buildings)} buildings")
    print(f" - Tasks: {len(task_buildings)} buildings")
    print(f" - Building info: {len(building_buildings)} buildings")

    # Overlap analysis between the three sources.
    shift_with_task = shift_buildings.intersection(task_buildings)
    shift_with_building = shift_buildings.intersection(building_buildings)
    all_three = shift_buildings.intersection(task_buildings).intersection(building_buildings)

    print(f"\n 🔗 Overlap analysis:")
    print(f" - Shifts ∩ Tasks: {len(shift_with_task)} buildings")
    print(f" - Shifts ∩ Building: {len(shift_with_building)} buildings")
    print(f" - All three: {len(all_three)} buildings")

    # Buildings that have shifts but neither task nor building data.
    shift_only = shift_buildings - task_buildings - building_buildings
    if len(shift_only) > 0:
        print(f"\n ⚠️ Buildings with shift only (no task/building data): {len(shift_only)}")
        print(f" Examples: {list(shift_only)[:10]}")

    # ---- 8. Export files ----------------------------------------------------
    print(f"\n💾 EXPORTING FILES...")

    print(f" [1/2] Exporting Excel: {output_excel}")
    df_final.to_excel(output_excel, index=False, engine='openpyxl')
    print(f" ✅ Done!")

    # utf-8-sig BOM keeps Vietnamese text readable when opened in Excel.
    print(f" [2/2] Exporting CSV: {output_csv}")
    df_final.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f" ✅ Done!")

    # ---- 9. Summary ---------------------------------------------------------
    print(f"\n" + "=" * 80)
    print("✅ MERGE COMPLETED!")
    print("=" * 80)

    print(f"\n📁 FILES CREATED:")
    print(f" 1. {output_excel} ({df_final.shape[0]} rows × {df_final.shape[1]} columns)")
    print(f" 2. {output_csv} (CSV backup)")

    print(f"\n📋 COLUMN STRUCTURE:")
    print(f" - ma_dia_diem (identifier)")
    print(f" - Shift features (6): loai_ca, bat_dau, ket_thuc, tong_gio_lam, so_ca_cua_toa")
    print(f" - Task features ({len(task_features)}): num_tasks, cleaning_ratio, ...")
    print(f" - Building features ({len(building_features)}): so_tang, dien_tich_*, ...")
    print(f" - so_luong (TARGET)")

    print(f"\n🎯 READY FOR MACHINE LEARNING!")
    print(f" - Total samples: {len(df_final)}")
    print(f" - Total features: {df_final.shape[1] - 2} (excluding ma_dia_diem & target)")
    print(f" - Target variable: so_luong")

    return df_final
|
||
|
||
|
||
if __name__ == "__main__":
    # Input workbooks and output targets, keyed by the merge function's
    # parameter names so they can be passed straight through as kwargs.
    input_paths = {
        "shift_file": "shift_features_for_prediction.xlsx",
        "task_file": "ket_qua_cong_viec_full_WITH_FEATURES.xlsx",
        "building_file": "Du_Lieu_Toa_Nha_Aggregate.xlsx",
    }
    output_paths = {
        "output_excel": "COMPLETE_DATASET_FOR_PREDICTION.xlsx",
        "output_csv": "COMPLETE_DATASET_FOR_PREDICTION.csv",
    }

    # Run the 3-way merge.
    df_final = merge_three_datasets(**input_paths, **output_paths)

    # Preview: first 5 rows / first 15 columns of the merged table.
    print(f"\n📊 SAMPLE DATA (first 5 rows, first 15 columns):")
    print(df_final.iloc[:5, :15].to_string())

    # Numbered list of every column in the final dataset.
    print(f"\n📊 COLUMN LIST:")
    for idx, column in enumerate(df_final.columns, 1):
        print(f" {idx:2d}. {column}")