predict_caLamviec_nhansu/merge_all_features.py

"""
MERGE 3 FILES: SHIFT + TASK + BUILDING FEATURES
Gộp 3 file Excel thành 1 file tổng để predict số nhân sự
Created: January 5, 2026
"""
import pandas as pd
import numpy as np
def merge_three_datasets(
    shift_file: str,
    task_file: str,
    building_file: str,
    output_excel: str,
    output_csv: str
):
    """
    Merge 3 datasets:
    1. shift_features_for_prediction.xlsx (942 rows × 7 cols)
    2. ket_qua_cong_viec_full_WITH_FEATURES.xlsx (302 rows × 28 cols)
    3. Du_Lieu_Toa_Nha_Aggregate.xlsx (233 rows × 18 cols)
    Join key: ma_dia_diem (Mã địa điểm)
    Join type: LEFT JOIN (keep all shifts)
    Expected output: 942 rows × (7 + 25 + 17) = 49 cols
    """
print("=" * 80)
print("🚀 MERGE 3 DATASETS: SHIFT + TASK + BUILDING FEATURES")
print("=" * 80)
# =====================================================================
# 1. ĐỌC FILE 1: SHIFT FEATURES (base dataset)
# =====================================================================
print(f"\n📂 [1/3] Đọc file SHIFT features: {shift_file}")
df_shift = pd.read_excel(shift_file)
print(f" ✅ Shape: {df_shift.shape}")
print(f" ✅ Columns: {list(df_shift.columns)}")
print(f" ✅ Unique buildings: {df_shift['ma_dia_diem'].nunique()}")
# =====================================================================
# 2. ĐỌC FILE 2: TASK FEATURES
# =====================================================================
print(f"\n📂 [2/3] Đọc file TASK features: {task_file}")
df_task = pd.read_excel(task_file)
print(f" ✅ Shape: {df_task.shape}")
print(f" ✅ Unique buildings: {df_task['ma_dia_diem'].nunique()}")
# Loại bỏ cột text gốc (không cần cho modeling)
cols_to_drop = ['all_task_normal', 'all_task_dinhky']
df_task = df_task.drop(columns=[c for c in cols_to_drop if c in df_task.columns])
print(f" ✅ Đã loại bỏ cột text, còn lại: {df_task.shape[1]} cột")
print(f" ✅ Task feature columns: {list(df_task.columns)}")

    # =====================================================================
    # 3. READ FILE 3: BUILDING FEATURES
    # =====================================================================
    print(f"\n📂 [3/3] Reading BUILDING features file: {building_file}")
    df_building = pd.read_excel(building_file)
    print(f" ✅ Shape: {df_building.shape}")
    print(f" ✅ Unique buildings: {df_building['Mã địa điểm'].nunique()}")

    # Rename the join-key column to match the other files
    df_building = df_building.rename(columns={'Mã địa điểm': 'ma_dia_diem'})

    # Rename the remaining columns for easier use (remove diacritics and spaces)
    column_mapping = {
        'Loại hình': 'loai_hinh',
        'Tên Tòa Tháp': 'ten_toa_thap',
        'Mức độ Lưu lượng KH': 'muc_do_luu_luong',
        'Số tầng': 'so_tang',
        'Tổng số cửa thang máy': 'so_cua_thang_may',
        'Diện tích ngoại cảnh Tòa tháp (m2)': 'dien_tich_ngoai_canh',
        'Sàn Sảnh (m2)': 'dien_tich_sanh',
        'Sàn Hành lang (m2)': 'dien_tich_hanh_lang',
        'Sàn WC (m2)': 'dien_tich_wc',
        'Sàn Phòng (m2)': 'dien_tich_phong',
        'Thảm (m2)': 'dien_tich_tham',
        'Dốc hầm (m)': 'doc_ham',
        'Viền phản quang (m)': 'vien_phan_quang',
        'Ốp tường (m2)': 'op_tuong',
        'Ốp chân tường (m2)': 'op_chan_tuong',
        'Rãnh thoát nước (m)': 'ranh_thoat_nuoc',
        'Kính (m2)': 'dien_tich_kinh'
    }
    df_building = df_building.rename(columns=column_mapping)
    print(f" ✅ Renamed columns: {list(df_building.columns)}")

    # =====================================================================
    # 4. MERGE DATASET 1 (SHIFT) + DATASET 2 (TASK)
    # =====================================================================
    print(f"\n🔗 [MERGE 1/2] Merging SHIFT + TASK features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")
    df_merged = df_shift.merge(
        df_task,
        on='ma_dia_diem',
        how='left',
        suffixes=('', '_task')
    )
    print(f" ✅ Result shape: {df_merged.shape}")
    print(f" ✅ Shift rows preserved: {len(df_merged)} (expected: {len(df_shift)})")

    # Check missing values after the merge
    task_features = [col for col in df_task.columns if col != 'ma_dia_diem']
    missing_task_features = df_merged[task_features].isna().sum().sum()
    print(f" ⚠️ Missing values in task features: {missing_task_features}")

    # =====================================================================
    # 5. MERGE RESULT + DATASET 3 (BUILDING)
    # =====================================================================
    print(f"\n🔗 [MERGE 2/2] Merging (SHIFT+TASK) + BUILDING features...")
    print(f" Join key: ma_dia_diem")
    print(f" Join type: LEFT (keep all shifts)")
    df_final = df_merged.merge(
        df_building,
        on='ma_dia_diem',
        how='left',
        suffixes=('', '_building')
    )
    print(f" ✅ Result shape: {df_final.shape}")
    print(f" ✅ Shift rows preserved: {len(df_final)} (expected: {len(df_shift)})")

    # Check missing values after the merge
    building_features = [col for col in df_building.columns if col != 'ma_dia_diem']
    missing_building_features = df_final[building_features].isna().sum().sum()
    print(f" ⚠️ Missing values in building features: {missing_building_features}")

    # =====================================================================
    # 6. FINAL STATISTICS
    # =====================================================================
    print(f"\n📊 FINAL DATASET STATISTICS:")
    print(f" 📐 Shape: {df_final.shape}")
    print(f" 🏢 Unique buildings: {df_final['ma_dia_diem'].nunique()}")
    print(f" 📋 Total columns: {len(df_final.columns)}")
    print(f"\n 📋 COLUMN BREAKDOWN:")
    print(f" - Shift features: 7 cols")
    print(f" - Task features: {len(task_features)} cols")
    print(f" - Building features: {len(building_features)} cols")
    print(f" - Total: {7 + len(task_features) + len(building_features)} cols")

    # Overall missing-value check
    print(f"\n ⚠️ MISSING VALUES BY COLUMN:")
    missing_summary = df_final.isna().sum()
    missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
    if len(missing_summary) > 0:
        print(f" Found {len(missing_summary)} columns with missing values:")
        for col, count in missing_summary.head(20).items():
            pct = count / len(df_final) * 100
            print(f" - {col}: {count} ({pct:.1f}%)")
    else:
        print(f" ✅ No missing values!")

    # =====================================================================
    # 7. DATA VALIDATION
    # =====================================================================
    print(f"\n✅ DATA VALIDATION:")

    # Check target variable
    print(f" 🎯 Target variable (so_luong):")
    print(f" - Count: {df_final['so_luong'].notna().sum()}")
    print(f" - Missing: {df_final['so_luong'].isna().sum()}")
    print(f" - Min: {df_final['so_luong'].min()}")
    print(f" - Mean: {df_final['so_luong'].mean():.2f}")
    print(f" - Median: {df_final['so_luong'].median():.0f}")
    print(f" - Max: {df_final['so_luong'].max()}")

    # Check feature coverage
    print(f"\n 📊 Feature coverage:")
    shift_buildings = set(df_shift['ma_dia_diem'].unique())
    task_buildings = set(df_task['ma_dia_diem'].unique())
    building_buildings = set(df_building['ma_dia_diem'].unique())
    print(f" - Shifts: {len(shift_buildings)} buildings")
    print(f" - Tasks: {len(task_buildings)} buildings")
    print(f" - Building info: {len(building_buildings)} buildings")

    # Overlap analysis
    shift_with_task = shift_buildings.intersection(task_buildings)
    shift_with_building = shift_buildings.intersection(building_buildings)
    all_three = shift_buildings.intersection(task_buildings).intersection(building_buildings)
    print(f"\n 🔗 Overlap analysis:")
    print(f" - Shifts ∩ Tasks: {len(shift_with_task)} buildings")
    print(f" - Shifts ∩ Building: {len(shift_with_building)} buildings")
    print(f" - All three: {len(all_three)} buildings")
    shift_only = shift_buildings - task_buildings - building_buildings
    if len(shift_only) > 0:
        print(f"\n ⚠️ Buildings with shift only (no task/building data): {len(shift_only)}")
        print(f" Examples: {list(shift_only)[:10]}")

    # =====================================================================
    # 8. EXPORT FILES
    # =====================================================================
    print(f"\n💾 EXPORTING FILES...")

    # Excel
    print(f" [1/2] Exporting Excel: {output_excel}")
    df_final.to_excel(output_excel, index=False, engine='openpyxl')
    print(f" ✅ Done!")

    # CSV
    print(f" [2/2] Exporting CSV: {output_csv}")
    df_final.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f" ✅ Done!")

    # =====================================================================
    # 9. SUMMARY
    # =====================================================================
    print(f"\n" + "=" * 80)
    print("✅ MERGE COMPLETED!")
    print("=" * 80)
    print(f"\n📁 FILES CREATED:")
    print(f" 1. {output_excel} ({df_final.shape[0]} rows × {df_final.shape[1]} columns)")
    print(f" 2. {output_csv} (CSV backup)")
    print(f"\n📋 COLUMN STRUCTURE:")
    print(f" - ma_dia_diem (identifier)")
    print(f" - Shift features (6): loai_ca, bat_dau, ket_thuc, tong_gio_lam, so_ca_cua_toa")
    print(f" - Task features ({len(task_features)}): num_tasks, cleaning_ratio, ...")
    print(f" - Building features ({len(building_features)}): so_tang, dien_tich_*, ...")
    print(f" - so_luong (TARGET)")
    print(f"\n🎯 READY FOR MACHINE LEARNING!")
    print(f" - Total samples: {len(df_final)}")
    print(f" - Total features: {df_final.shape[1] - 2} (excluding ma_dia_diem & target)")
    print(f" - Target variable: so_luong")
    return df_final
if __name__ == "__main__":
# File paths
shift_file = "shift_features_for_prediction.xlsx"
task_file = "ket_qua_cong_viec_full_WITH_FEATURES.xlsx"
building_file = "Du_Lieu_Toa_Nha_Aggregate.xlsx"
output_excel = "COMPLETE_DATASET_FOR_PREDICTION.xlsx"
output_csv = "COMPLETE_DATASET_FOR_PREDICTION.csv"
# Run merge
df_final = merge_three_datasets(
shift_file,
task_file,
building_file,
output_excel,
output_csv
)
# Display sample
print(f"\n📊 SAMPLE DATA (first 5 rows, first 15 columns):")
print(df_final.iloc[:5, :15].to_string())
print(f"\n📊 COLUMN LIST:")
for i, col in enumerate(df_final.columns, 1):
print(f" {i:2d}. {col}")