# preditc_nhansu_phase2/create_final_dataset.py

"""
TẠO DATASET CUỐI CÙNG VỚI FEATURES TỪ all_task
- Sửa delimiter từ ';' sang '/'
- Loại bỏ features có tỷ lệ non-zero < 40%
- Tạo dataset cuối từ buoc2_with_area_v3.csv
"""

import pandas as pd
import re
from typing import Dict, List


# =========================================================
# HELPERS
# =========================================================
def split_tasks(tasks_text: str) -> List[str]:
    """Split a raw task string into individual, normalized task names.

    The input is coerced with ``str()``, lower-cased and cut on every ``/``.
    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are discarded.
    """
    normalized = str(tasks_text).lower()
    cleaned: List[str] = []
    for piece in normalized.split("/"):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    return cleaned


def count_tasks_with_keywords(tasks: List[str], keywords: List[str]) -> int:
    """Return how many tasks contain at least one keyword as a substring.

    A task is counted once, no matter how many keywords it matches.
    """
    return sum(
        1
        for task in tasks
        if any(keyword in task for keyword in keywords)
    )


def count_tasks_without_keywords(tasks: List[str], all_keywords: List[str]) -> int:
    """Return how many tasks contain none of the keywords as a substring."""
    unmatched = [
        task
        for task in tasks
        if all(keyword not in task for keyword in all_keywords)
    ]
    return len(unmatched)


# =========================================================
# FEATURE EXTRACTION
# =========================================================
def extract_task_features(tasks_text: str) -> Dict[str, float]:
    """
    Extract task-count, area-coverage and ratio features from one ``all_task`` value.

    The text is split into tasks with ``split_tasks`` and every task is matched
    against several keyword lists by substring (``count_tasks_with_keywords``).
    A single task can count towards more than one category.

    Args:
        tasks_text: Raw task description. Values for which ``pd.isna`` is true
            and strings that are blank after stripping mean "no tasks".

    Returns:
        A dict that always has the same 15 keys in the same order:
        six task counts (``num_tasks`` plus five task types), five area
        counts, three ratios (rounded to 4 decimals) and ``area_diversity``
        (number of area categories with at least one matching task).
        For missing/blank input every value is zero.

    NOTE: the original comments describe this feature set as limited to
    features whose non-zero rate is >= 40%; that selection is not computed
    here, the list of features is simply fixed.
    """

    # Missing or blank input: return the full zero-valued schema. All 15 keys
    # must be present here too, otherwise rows built from this dict end up with
    # NaN in the columns that are missing.
    if pd.isna(tasks_text) or str(tasks_text).strip() == "":
        return {
            "num_tasks": 0,
            "num_cleaning_tasks": 0,
            "num_trash_collection_tasks": 0,
            "num_monitoring_tasks": 0,
            "num_deep_cleaning_tasks": 0,
            "num_support_tasks": 0,
            "num_wc_tasks": 0,
            "num_hallway_tasks": 0,
            "num_elevator_tasks": 0,
            "num_patient_room_tasks": 0,
            "num_office_tasks": 0,
            "cleaning_ratio": 0.0,
            "trash_collection_ratio": 0.0,
            "monitoring_ratio": 0.0,
            "area_diversity": 0,
        }

    tasks = split_tasks(tasks_text)
    num_tasks = len(tasks)

    # -----------------------------
    # TASK TYPE COUNTS
    # -----------------------------
    # Each list mixes accented Vietnamese spellings with unaccented variants.
    cleaning_keywords = [
        "vệ sinh", "lau", "chùi", "quét", "hút", "ve sinh",
        "đẩy khô", "lau ẩm", "làm sạch", "lau bụi", "lau kính", "lau sàn", "hút bụi",
        "day kho", "lam sach"
    ]
    trash_keywords = [
        "thu gom rác", "thay rác", "vận chuyển rác", "tua rác", "đổ rác",
        "thu rác", "gom rác", "quét rác nổi", "trực rác", "rác nổi", "rac",
        "van chuyen rac", "thu gom rac", "thay rac"
    ]
    monitoring_keywords = [
        "trực", "trực phát sinh", "trực ps", "kiểm tra", "check",
        "giám sát", "theo dõi", "tuần tra", "tuan tra", "truc", "phat sinh"
    ]
    deep_cleaning_keywords = [
        "cọ rửa", "cọ bồn cầu", "cọ", "gạt kính", "đánh sàn",
        "đánh chân tường", "chà tường", "cọ gương", "cọ lavabo", "rửa",
        "co rua", "co bon cau", "gat kinh", "danh san"
    ]
    support_keywords = [
        "giao ca", "bàn giao", "bàn giao ca", "chụp ảnh", "nhận ca",
        "vsdc", "vệ sinh dụng cụ", "chuẩn bị dụng cụ", "chuẩn bị nước", "chuẩn bị", "giao ban",
        "ve sinh dung cu", "chuan bi", "nhan ca"
    ]

    num_cleaning_tasks = count_tasks_with_keywords(tasks, cleaning_keywords)
    num_trash_collection_tasks = count_tasks_with_keywords(tasks, trash_keywords)
    num_monitoring_tasks = count_tasks_with_keywords(tasks, monitoring_keywords)
    num_deep_cleaning_tasks = count_tasks_with_keywords(tasks, deep_cleaning_keywords)
    num_support_tasks = count_tasks_with_keywords(tasks, support_keywords)

    # -----------------------------
    # AREA COVERAGE
    # -----------------------------
    wc_keywords = [
        "wc", "toilet", "nhà vệ sinh", "restroom", "phòng vệ sinh",
        "bồn cầu", "lavabo", "tiểu nam", "bồn tiểu", "ve sinh",
        "nha ve sinh", "bon cau", "tieu nam"
    ]
    hallway_keywords = [
        "hành lang", "corridor", "lối đi", "hall", "hl", "hanh lang",
        "hang lang", "loi di"
    ]
    elevator_keywords = [
        "thang máy", "elevator", "lift", "cầu thang", "thang bộ",
        "tay vịn", "tam cấp", "cau thang", "thang may", "thang bo", "tam cap"
    ]

    # Patient room - KEPT (likely > 40% for medical facilities)
    patient_room_keywords = [
        "phòng bệnh", "giường bệnh", "phòng vip", "phòng bệnh nhân",
        "pb", "phòng bv", "phong benh", "giuong benh", "benh nhan"
    ]

    # Office - KEPT
    office_keywords = [
        "phòng nhân viên", "phòng giám đốc", "phòng họp", "phòng hành chính",
        "văn phòng", "phòng ban", "phòng giao ban", "hội trường",
        "phòng kế toán", "van phong", "phong hop", "phong nhan vien"
    ]

    num_wc_tasks = count_tasks_with_keywords(tasks, wc_keywords)
    num_hallway_tasks = count_tasks_with_keywords(tasks, hallway_keywords)
    num_elevator_tasks = count_tasks_with_keywords(tasks, elevator_keywords)
    num_patient_room_tasks = count_tasks_with_keywords(tasks, patient_room_keywords)
    num_office_tasks = count_tasks_with_keywords(tasks, office_keywords)

    # -----------------------------
    # RATIOS & DIVERSITY
    # -----------------------------
    # num_tasks can still be 0 here (e.g. the text holds only delimiters),
    # so every ratio is guarded against division by zero.
    cleaning_ratio = num_cleaning_tasks / num_tasks if num_tasks > 0 else 0.0
    trash_collection_ratio = num_trash_collection_tasks / num_tasks if num_tasks > 0 else 0.0
    monitoring_ratio = num_monitoring_tasks / num_tasks if num_tasks > 0 else 0.0

    # Number of distinct area categories touched by at least one task.
    area_counts = [
        num_wc_tasks, num_hallway_tasks, num_elevator_tasks,
        num_patient_room_tasks, num_office_tasks
    ]
    area_diversity = sum(1 for c in area_counts if c > 0)

    return {
        # TASK COUNTS (6)
        "num_tasks": num_tasks,
        "num_cleaning_tasks": num_cleaning_tasks,
        "num_trash_collection_tasks": num_trash_collection_tasks,
        "num_monitoring_tasks": num_monitoring_tasks,
        "num_deep_cleaning_tasks": num_deep_cleaning_tasks,
        "num_support_tasks": num_support_tasks,

        # AREA (5)
        "num_wc_tasks": num_wc_tasks,
        "num_hallway_tasks": num_hallway_tasks,
        "num_elevator_tasks": num_elevator_tasks,
        "num_patient_room_tasks": num_patient_room_tasks,
        "num_office_tasks": num_office_tasks,

        # RATIOS & DIVERSITY (4)
        "cleaning_ratio": round(cleaning_ratio, 4),
        "trash_collection_ratio": round(trash_collection_ratio, 4),
        "monitoring_ratio": round(monitoring_ratio, 4),
        "area_diversity": area_diversity,
    }


# =========================================================
# MAIN SCRIPT
# =========================================================
if __name__ == "__main__":
    # Script entry point: read the source CSV, compute task features for every
    # row, write the combined table, then print summary statistics.
    src_path = r"c:\Users\tainl\Documents\testHM\data\data-hoanmy\buoc2_with_area_v3.csv"
    dst_path = r"c:\Users\tainl\Documents\testHM\data\data-hoanmy\final_dataset.csv"

    banner = "=" * 100

    print(banner)
    print("TẠO DATASET CUỐI CÙNG VỚI TASK FEATURES")
    print(banner)

    print("\n📂 Đọc file: buoc2_with_area_v3.csv")
    df = pd.read_csv(src_path, encoding='utf-8-sig')
    total_rows = len(df)
    print(f"✅ Đọc thành công {total_rows} records")

    print("\n⚙️  Đang trích xuất features (delimiter: /)...")

    # Columns copied unchanged from the input into the output.
    source_cols = [
        'ma_toa_nha', 'Khu_vuc_lam_sach', 'gio',
        'So_nhan_su', 'Ten_ca', 'dien_tich_m2',
    ]

    records = []
    for idx, row in df.iterrows():
        processed = idx + 1
        # Progress message every 100 rows.
        if processed % 100 == 0:
            print(f"   Đã xử lý {processed}/{total_rows} records...")

        task_feats = extract_task_features(row['all_task'])

        # Source columns first, then the derived features.
        record = {col: row[col] for col in source_cols}
        record.update(task_feats)
        records.append(record)

    df_final = pd.DataFrame(records)
    n_final = len(df_final)

    print(f"\n✅ HOÀN THÀNH! Shape: {df_final.shape}")
    print(f"   - Records: {n_final}")
    print(f"   - Columns: {len(df_final.columns)} (6 gốc + 15 features)")

    # Persist the result.
    print("\n💾 Lưu file: final_dataset.csv")
    df_final.to_csv(dst_path, index=False, encoding='utf-8-sig')

    # Per-feature statistics: mean, min, max, and count/share of positive values.
    print("\n" + banner)
    print("📊 THỐNG KÊ FEATURES (CHỈ GIỮ FEATURES CÓ NON-ZERO >= 40%):")
    print(banner + "\n")

    feature_cols = [
        'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',
        'num_monitoring_tasks', 'num_deep_cleaning_tasks', 'num_support_tasks',
        'num_wc_tasks', 'num_hallway_tasks', 'num_elevator_tasks',
        'num_patient_room_tasks', 'num_office_tasks',
        'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio', 'area_diversity'
    ]

    print(f"{'Feature':<35} {'Mean':<10} {'Min':<6} {'Max':<6} {'Non-Zero':<15}")
    print("-" * 100)

    for col in feature_cols:
        series = df_final[col]
        mean_val = series.mean()
        min_val = series.min()
        max_val = series.max()
        non_zero = (series > 0).sum()
        non_zero_pct = non_zero * 100 / n_final

        print(f"{col:<35} {mean_val:<10.2f} {min_val:<6.0f} {max_val:<6.0f} {non_zero:<6} ({non_zero_pct:>5.1f}%)")

    # Preview of the first rows for a handful of columns.
    print("\n" + banner)
    print("📝 MẪU DỮ LIỆU (5 records đầu):")
    print(banner + "\n")

    preview_cols = ['ma_toa_nha', 'dien_tich_m2', 'num_tasks', 'num_cleaning_tasks',
                    'num_wc_tasks', 'cleaning_ratio', 'area_diversity']
    print(df_final[preview_cols].head(5).to_string(index=False))

    # Correlation of every feature with the area column, ranked by magnitude.
    print("\n" + banner)
    print("🔗 CORRELATION VỚI DIỆN TÍCH (Top 10):")
    print(banner + "\n")

    area = df_final['dien_tich_m2']
    correlations = {col: df_final[col].corr(area) for col in feature_cols}

    ranked = sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
    for feat, corr in ranked[:10]:
        print(f"   {feat:<35} {corr:>7.4f}")

    print("\n" + banner)
    print("✅ DONE! Dataset cuối đã sẵn sàng cho modeling")
    print(banner)