# predict_caLamviec_nhansu/extract_shift_features.py

"""
EXTRACTION: SHIFT FEATURES FROM JSON FOR STAFF PREDICTION
Trích xuất features ca làm việc để predict số nhân sự
Created: January 5, 2026
"""

import json
import os
from datetime import datetime
from typing import Dict, List

import pandas as pd

def parse_time_string(time_str):
    """Convert a time-of-day value to fractional hours.

    Two textual layouts are understood:

    * a clock value ``"HH:MM"`` / ``"HH:MM:SS"`` (e.g. ``"08:30:00"`` -> 8.5);
    * a full datetime such as ``"2025-01-01 22:00:00"`` (-> 22.0), for any
      year, not only 2024/2025.

    Seconds are ignored in both layouts.

    Args:
        time_str: The raw value; it is converted with ``str()`` before parsing.

    Returns:
        Hours as a float. Missing values (``None``/NaN), the number ``0`` and
        anything that cannot be parsed yield ``0.0``.
    """
    if pd.isna(time_str) or time_str == 0:
        return 0.0

    text = str(time_str)

    # Plain clock format first: "HH:MM[:SS]".
    parts = text.split(':')
    if len(parts) >= 2:
        try:
            return float(parts[0]) + float(parts[1]) / 60.0
        except ValueError:
            # Not a bare clock value (e.g. "2025-01-01 22:00:00" puts the date
            # in the first field); fall through to datetime parsing.
            pass

    # Datetime fallback. Strings without any digit are skipped so that
    # keywords pandas understands (such as "now" or "today") are never
    # turned into the current wall-clock time.
    if not any(ch.isdigit() for ch in text):
        return 0.0

    try:
        parsed = pd.to_datetime(text)
    except (ValueError, TypeError, OverflowError):
        return 0.0

    if pd.isna(parsed):
        return 0.0
    return parsed.hour + parsed.minute / 60.0

def _parse_total_hours(raw_value) -> float:
    """Convert a total-working-time value to hours.

    Understands ``"H:MM[:SS]"`` (e.g. ``"8:00:00"``) and
    ``"N day(s), H:MM[:SS]"`` (e.g. ``"7 days, 12:00:00"``). Seconds are
    ignored. Any value that does not match either layout yields ``0.0``.
    """
    text = str(raw_value)
    try:
        if 'day' in text:
            # Layout: "7 days, 12:00:00"
            parts = text.split(',')
            days = int(parts[0].split()[0])
            clock = parts[1].strip().split(':')
            return days * 24 + int(clock[0]) + int(clock[1]) / 60.0
        # Layout: "8:00:00"
        clock = text.split(':')
        return float(clock[0]) + float(clock[1]) / 60.0
    except (ValueError, IndexError):
        return 0.0


def extract_shift_features(json_file_path: str, output_excel_path: str) -> pd.DataFrame:
    """
    Extract shift features from a JSON file and write them to Excel and CSV.

    Output columns:
    - ma_dia_diem: building code (source column "Mã địa điểm")
    - loai_ca: shift type (source column "Loại ca")
    - bat_dau: shift start, kept as a string (source column "Bắt đầu")
    - ket_thuc: shift end, kept as a string (source column "Kết thúc")
    - tong_gio_lam: total working hours, rounded to 2 decimals
      (parsed from source column "Tổng giờ làm"; unparseable values become 0.0)
    - so_ca_cua_toa: how many rows of the file belong to the same building
    - so_luong: target variable, taken unchanged from source column "Số lượng"

    Progress and summary statistics are printed to stdout. Besides the Excel
    file, a CSV backup with the same base name and a ``.csv`` extension is
    written next to it.

    Args:
        json_file_path: Path to the input JSON file.
        output_excel_path: Path of the Excel file to create.

    Returns:
        The extracted features as a DataFrame.

    Raises:
        KeyError: If a required source column is missing from the JSON data.
    """

    print("=" * 80)
    print("🚀 TRÍCH XUẤT SHIFT FEATURES TỪ JSON")
    print("=" * 80)

    # 1. Read the JSON file
    print(f"\n📂 Đọc file: {json_file_path}")
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"✅ Đọc thành công: {len(data)} records")

    # 2. Convert to a DataFrame
    df = pd.DataFrame(data)

    print(f"\n📊 Cấu trúc dữ liệu gốc:")
    print(f"   - Số dòng: {len(df)}")
    print(f"   - Số cột: {len(df.columns)}")
    print(f"   - Các cột: {list(df.columns)}")

    # Fail early, with a message naming the problem, instead of a bare
    # KeyError from the middle of the extraction loop.
    required_columns = [
        'Mã địa điểm', 'Loại ca', 'Bắt đầu', 'Kết thúc', 'Tổng giờ làm', 'Số lượng',
    ]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Missing required columns in {json_file_path}: {missing_columns}"
        )

    # 3. Count the shifts (rows) of each building
    print(f"\n🔢 Đếm số ca của mỗi tòa nhà...")
    shift_counts = df['Mã địa điểm'].value_counts().to_dict()

    unique_buildings = len(shift_counts)
    total_shifts = sum(shift_counts.values())
    avg_shifts = total_shifts / unique_buildings if unique_buildings > 0 else 0

    print(f"   - Số tòa nhà unique: {unique_buildings}")
    print(f"   - Tổng số ca: {total_shifts}")
    print(f"   - Trung bình ca/tòa: {avg_shifts:.2f}")

    # 4. Build one feature record per source row
    print(f"\n🔧 Trích xuất features...")

    # Walking the columns in lockstep avoids DataFrame.iterrows(), which
    # builds a Series per row and may change value dtypes along the way.
    features_data = [
        {
            'ma_dia_diem': building,
            'loai_ca': shift_type,
            'bat_dau': str(start),
            'ket_thuc': str(end),
            'tong_gio_lam': round(_parse_total_hours(total), 2),
            # value_counts() skips missing codes, hence the 0 default
            'so_ca_cua_toa': shift_counts.get(building, 0),
            'so_luong': headcount,
        }
        for building, shift_type, start, end, total, headcount in zip(
            df['Mã địa điểm'],
            df['Loại ca'],
            df['Bắt đầu'],
            df['Kết thúc'],
            df['Tổng giờ làm'],
            df['Số lượng'],
        )
    ]

    # 5. Create the features DataFrame (explicit columns keep the layout
    #    stable even when there are no rows)
    features_df = pd.DataFrame(
        features_data,
        columns=[
            'ma_dia_diem', 'loai_ca', 'bat_dau', 'ket_thuc',
            'tong_gio_lam', 'so_ca_cua_toa', 'so_luong',
        ],
    )

    print(f"✅ Trích xuất thành công {len(features_df)} shifts")

    # 6. Statistics
    print(f"\n📈 THỐNG KÊ FEATURES:")
    print(f"\n   🏢 Tòa nhà:")
    print(f"      - Số tòa unique: {features_df['ma_dia_diem'].nunique()}")

    print(f"\n   🕐 Loại ca:")
    for loai_ca, count in features_df['loai_ca'].value_counts().items():
        print(f"      - {loai_ca}: {count} ca ({count/len(features_df)*100:.1f}%)")

    print(f"\n   ⏱️ Tổng giờ làm:")
    print(f"      - Min: {features_df['tong_gio_lam'].min():.2f} giờ")
    print(f"      - Mean: {features_df['tong_gio_lam'].mean():.2f} giờ")
    print(f"      - Max: {features_df['tong_gio_lam'].max():.2f} giờ")

    print(f"\n   📊 Số ca của tòa:")
    print(f"      - Min: {features_df['so_ca_cua_toa'].min()} ca")
    print(f"      - Mean: {features_df['so_ca_cua_toa'].mean():.2f} ca")
    print(f"      - Max: {features_df['so_ca_cua_toa'].max()} ca")

    print(f"\n   👥 Số lượng nhân sự (TARGET):")
    print(f"      - Min: {features_df['so_luong'].min()} người")
    print(f"      - Mean: {features_df['so_luong'].mean():.2f} người")
    print(f"      - Median: {features_df['so_luong'].median():.0f} người")
    print(f"      - Max: {features_df['so_luong'].max()} người")

    # Top 5 shifts (rows) with the largest headcount
    print(f"\n   🏆 TOP 5 CA CÓ NHIỀU NHÂN SỰ NHẤT:")
    top5 = features_df.nlargest(5, 'so_luong')[['ma_dia_diem', 'loai_ca', 'tong_gio_lam', 'so_luong']]
    for _, top_row in top5.iterrows():
        print(f"      - {top_row['ma_dia_diem']}: {top_row['loai_ca']} ({top_row['tong_gio_lam']:.1f}h) → {top_row['so_luong']} người")

    # 7. Export to Excel
    print(f"\n💾 Xuất file Excel...")
    features_df.to_excel(output_excel_path, index=False, engine='openpyxl')

    print(f"✅ Đã tạo file: {output_excel_path}")
    print(f"   - Số dòng: {len(features_df)}")
    print(f"   - Số cột: {len(features_df.columns)}")

    # 8. CSV backup. Swap only the final extension: a plain
    #    str.replace('.xlsx', '.csv') leaves paths such as "out.xlsm"
    #    unchanged, so the CSV would overwrite the Excel file just written,
    #    and it would also rewrite ".xlsx" inside directory names.
    csv_path = os.path.splitext(output_excel_path)[0] + '.csv'
    features_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"✅ Đã tạo CSV backup: {csv_path}")

    # 9. Summary statistics
    print(f"\n📊 SUMMARY STATISTICS:")
    print(features_df.describe())

    print(f"\n" + "=" * 80)
    print("✅ HOÀN THÀNH!")
    print("=" * 80)

    return features_df


if __name__ == "__main__":
    # Input and output locations (relative to the current working directory).
    json_file = "Link LLV 2025.json"
    output_excel = "shift_features_for_prediction.xlsx"
    # The extractor writes a CSV backup beside the Excel file; derive its name
    # here instead of repeating a literal that could drift from output_excel.
    output_csv = os.path.splitext(output_excel)[0] + ".csv"

    # Run the extraction
    df = extract_shift_features(json_file, output_excel)

    # Preview of the first 10 extracted rows
    print(f"\n📋 SAMPLE DATA (10 dòng đầu):")
    print(df.head(10).to_string())

    # Files produced by the run
    print(f"\n🎯 CÁC FILE ĐÃ TẠO:")
    print(f"   1. {output_excel} - File Excel chính (để mở và xem)")
    print(f"   2. {output_csv} - File CSV backup")

    # Column legend for the generated files
    print(f"\n🎯 CẤU TRÚC FILE:")
    print(f"   - ma_dia_diem: Mã tòa nhà")
    print(f"   - loai_ca: Loại ca (Hành chính, Ca sáng, Ca chiều, ...)")
    print(f"   - bat_dau: Giờ bắt đầu ca")
    print(f"   - ket_thuc: Giờ kết thúc ca")
    print(f"   - tong_gio_lam: Tổng số giờ làm việc")
    print(f"   - so_ca_cua_toa: Số ca của tòa nhà (feature)")
    print(f"   - so_luong: Số nhân sự (TARGET VARIABLE)")

    print(f"\n✨ READY FOR MODELING!")