# predict_caLamviec_nhansu/extract_25_features.py

"""
EXTRACTION FUNCTION: 25 KEYWORD FEATURES TỪ TASKS TEXT
Dựa trên phân tích 30,917 công việc từ 302 tòa nhà
Updated: January 5, 2026
"""

import pandas as pd
import re
from typing import Dict, List

def extract_25_keyword_features(tasks_text: str) -> Dict[str, float]:
    """
    Extract 25 keyword-based features from one building's task text.

    The text is lower-cased, normalised to Unicode NFC and split into
    individual tasks on ``;``, ``|`` or newlines. Every task is then tested
    against a set of keyword lists using plain substring matching. A task is
    counted at most once per list, but the same task may be counted by
    several lists (e.g. 'thay rác' matches both the trash list and the
    maintenance keyword 'thay').

    Args:
        tasks_text: All tasks of a building joined into a single string
            (all_task_normal + all_task_dinhky). NaN/None or a blank string
            is treated as missing data.

    Returns:
        Dict with 25 features in a fixed key order:
        - 9 task-type counts (``num_tasks`` ... ``num_other_tasks``),
        - 10 area counts (``num_wc_tasks`` ... ``num_technical_room_tasks``),
        - 4 ratios (rounded to 4 decimals), ``area_diversity`` (0-10) and
          ``task_complexity_score`` (0.0-10.0, rounded to 2 decimals).
        Counts are ints; ratios and the complexity score are floats.
        For missing input the result of ``_get_empty_features()`` is returned.
    """
    # Function-scope import: the file header does not import unicodedata.
    import unicodedata

    # Missing values -> all-zero feature dict
    if pd.isna(tasks_text) or str(tasks_text).strip() == '':
        return _get_empty_features()

    def nfc(keywords: List[str]) -> List[str]:
        """Return the keywords in NFC form so they compare equal to the NFC text."""
        return [unicodedata.normalize('NFC', kw) for kw in keywords]

    # Lower-case, then normalise to NFC. Vietnamese text may arrive in
    # decomposed form (base letter + combining marks); without normalisation
    # such text never matches the keywords below even though it renders the same.
    tasks_text = unicodedata.normalize('NFC', str(tasks_text).lower())

    # Split into tasks on the delimiters ; | or newline, dropping empty pieces
    tasks = re.split(r'[;|\n]+', tasks_text)
    tasks = [t.strip() for t in tasks if t.strip()]

    def count_matching(keywords: List[str]) -> int:
        """Number of tasks that contain at least one of the keywords."""
        return _count_tasks_with_keywords(tasks, keywords)

    # NOTE(review): matching is by substring, so short keywords such as
    # 'hl', 'pb', 'pk', 'cọ', 'thay' or 'trực' also match inside longer
    # words/phrases. Kept as-is to preserve the existing feature values.

    # =================================================================
    # GROUP 1: TASK COUNTS BY TYPE OF WORK (9 features)
    # =================================================================

    # 1. Total number of tasks
    num_tasks = len(tasks)

    # 2. Routine cleaning (55.9% of the data)
    cleaning_keywords = nfc([
        'vệ sinh', 'tvs', 'tổng vệ sinh', 'lau', 'chùi', 'quét', 'hút',
        'đẩy khô', 'lau ẩm', 'đẩy ẩm', 'làm sạch', 'lau bụi', 'quét bụi',
        'lau kính', 'lau sàn', 'quét sàn', 'hút bụi', 'lau nền'
    ])
    num_cleaning_tasks = count_matching(cleaning_keywords)

    # 3. Trash collection / replacement (7.9% of the data) - NEW
    trash_keywords = nfc([
        'thu gom rác', 'thay rác', 'vận chuyển rác', 'tua rác', 'đổ rác',
        'thu rác', 'gom rác', 'chuyển rác', 'bỏ rác', 'đẩy rác',
        'quét rác nổi', 'trực rác', 'rác nổi'
    ])
    num_trash_collection_tasks = count_matching(trash_keywords)

    # 4. On-duty / inspection of incidents (16.1% of the data) - NEW
    monitoring_keywords = nfc([
        'trực', 'trực phát sinh', 'trực lại', 'trực ps', 'trực tua',
        'kiểm tra', 'check', 'giám sát', 'theo dõi', 'tuần tra'
    ])
    num_monitoring_tasks = count_matching(monitoring_keywords)

    # 5. MEDICAL room turnover cleaning (0.4% but distinctive) - NEW
    room_cleaning_keywords = nfc([
        'dọn mổ', 'dọn đẻ', 'dọn can thiệp', 'ra viện', 'dọn phòng',
        'bệnh nhân ra viện', 'dọn khi bệnh nhân', 'dọn phòng bệnh'
    ])
    num_room_cleaning_tasks = count_matching(room_cleaning_keywords)

    # 6. Deep cleaning (4.5% of the data) - NEW
    deep_cleaning_keywords = nfc([
        'cọ rửa', 'cọ bồn cầu', 'cọ', 'gạt kính', 'gạt', 'đánh sàn',
        'đánh chân tường', 'đánh cọ', 'đánh vết bẩn', 'chà tường',
        'đánh dép', 'cọ gương', 'cọ lavabo', 'cọ thùng rác'
    ])
    num_deep_cleaning_tasks = count_matching(deep_cleaning_keywords)

    # 7. Maintenance / repair (0.6% of the data)
    maintenance_keywords = nfc([
        'bảo dưỡng', 'sửa chữa', 'bảo trì', 'thay thế', 'sửa',
        'thay', 'kiểm định', 'bảo dưỡng máy'
    ])
    num_maintenance_tasks = count_matching(maintenance_keywords)

    # 8. Support work (5.8% of the data) - NEW
    support_keywords = nfc([
        'giao ca', 'bàn giao', 'bàn giao ca', 'chụp ảnh', 'nhận ca',
        'vsdc', 'vệ sinh dụng cụ', 'chuẩn bị dụng cụ', 'vệ sinh xe đồ',
        'chuẩn bị nước', 'chuẩn bị', 'giao ban'
    ])
    num_support_tasks = count_matching(support_keywords)

    # 9. Other work: tasks matching none of the seven task-type lists above.
    # Counted directly rather than as "total - sum of categories", because
    # the categories overlap. Area keywords (group 2) are not considered here.
    all_keywords = (cleaning_keywords + trash_keywords + monitoring_keywords +
                    room_cleaning_keywords + deep_cleaning_keywords +
                    maintenance_keywords + support_keywords)
    num_other_tasks = _count_tasks_without_keywords(tasks, all_keywords)

    # =================================================================
    # GROUP 2: AREA COVERAGE (10 features)
    # =================================================================

    # 10. WC / restrooms (20.4% of the data)
    wc_keywords = nfc([
        'wc', 'toilet', 'nhà vệ sinh', 'restroom', 'phòng vệ sinh',
        'bồn cầu', 'lavabo', 'tiểu nam', 'bồn tiểu', 'wc công cộng',
        'wc nhân viên', 'nhà wc'
    ])
    num_wc_tasks = count_matching(wc_keywords)

    # 11. Hallways (13.7% of the data)
    hallway_keywords = nfc([
        'hành lang', 'corridor', 'lối đi', 'hall', 'hành lang tầng',
        'hl', 'hanh lang'
    ])
    num_hallway_tasks = count_matching(hallway_keywords)

    # 12. Lobbies (7.6% of the data)
    lobby_keywords = nfc([
        'sảnh', 'lobby', 'tiền sảnh', 'sảnh đỏ', 'sảnh chính',
        'tiền sảnh', 'sảnh tầng', 'sanh'
    ])
    num_lobby_tasks = count_matching(lobby_keywords)

    # 13. MEDICAL patient rooms (1.5% of the data) - NEW
    patient_room_keywords = nfc([
        'phòng bệnh', 'giường bệnh', 'phòng víp', 'phòng vip',
        'phòng bệnh nhân', 'pb', 'phòng bv'
    ])
    num_patient_room_tasks = count_matching(patient_room_keywords)

    # 14. MEDICAL consultation rooms (0.3% of the data) - NEW
    clinic_room_keywords = nfc([
        'phòng khám', 'khoa khám', 'phòng nội', 'phòng sản',
        'phòng khám bệnh', 'khu khám', 'pk'
    ])
    num_clinic_room_tasks = count_matching(clinic_room_keywords)

    # 15. MEDICAL operating rooms (0.4% of the data) - NEW
    surgery_room_keywords = nfc([
        'phòng mổ', 'hậu phẫu', 'phòng phẫu thuật', 'khu mổ',
        'phòng pt', 'ngoài phòng mổ', 'trong phòng mổ'
    ])
    num_surgery_room_tasks = count_matching(surgery_room_keywords)

    # 16. Outdoor areas (4.3% of the data)
    outdoor_keywords = nfc([
        'ngoại cảnh', 'sân', 'vỉa hè', 'khuôn viên', 'cổng',
        'outdoor', 'bãi xe', 'tầng hầm', 'sân sau', 'sân trước'
    ])
    num_outdoor_tasks = count_matching(outdoor_keywords)

    # 17. Elevators / staircases (10.6% of the data)
    elevator_keywords = nfc([
        'thang máy', 'elevator', 'lift', 'cầu thang', 'bậc tam cấp',
        'thang bộ', 'cầu thang bộ', 'tay vịn', 'tam cấp'
    ])
    num_elevator_tasks = count_matching(elevator_keywords)

    # 18. Staff / administrative rooms (4.4% of the data) - NEW
    office_keywords = nfc([
        'phòng nhân viên', 'phòng giám đốc', 'phòng họp', 'phòng hành chính',
        'văn phòng', 'phòng gd', 'phòng pgd', 'phòng ban', 'phòng giao ban',
        'phòng bác sĩ', 'phòng trưởng khoa', 'hội trường', 'phòng kế toán'
    ])
    num_office_tasks = count_matching(office_keywords)

    # 19. MEDICAL technical rooms (0.2% of the data) - NEW
    technical_room_keywords = nfc([
        'phòng xét nghiệm', 'phòng chụp', 'xq', 'siêu âm', 'kho dược',
        'phòng xn', 'labo', 'phòng thí nghiệm', 'phòng kỹ thuật',
        'phòng điện tim', 'phòng nội soi', 'phòng cấp cứu', 'phòng hồi sức'
    ])
    num_technical_room_tasks = count_matching(technical_room_keywords)

    # =================================================================
    # GROUP 3: RATIOS & COMPLEXITY (6 features)
    # =================================================================

    def ratio(count: int) -> float:
        """Share of all tasks; 0.0 when there are no tasks (avoids division by zero)."""
        return count / num_tasks if num_tasks > 0 else 0.0

    # 20-23. Share of routine cleaning, trash collection, monitoring and
    # medical room-turnover tasks among all tasks
    cleaning_ratio = ratio(num_cleaning_tasks)
    trash_collection_ratio = ratio(num_trash_collection_tasks)
    monitoring_ratio = ratio(num_monitoring_tasks)
    room_cleaning_ratio = ratio(num_room_cleaning_tasks)

    # 24. Area diversity (0-10): how many of the 10 area types occur at all
    area_counts = [
        num_wc_tasks, num_hallway_tasks, num_lobby_tasks,
        num_patient_room_tasks, num_clinic_room_tasks, num_surgery_room_tasks,
        num_outdoor_tasks, num_elevator_tasks, num_office_tasks, num_technical_room_tasks
    ]
    area_diversity = sum(1 for count in area_counts if count > 0)

    # 25. Complexity score (0.0 - 10.0) - NEW
    # Sum of four linear, capped components:
    # - text length:      chars / 2000,            capped at 3.0 (reached at 6000 chars)
    # - number of tasks:  num_tasks / 20,          capped at 3.0 (reached at 60 tasks)
    # - technical tasks:  matching tasks / 2,      capped at 2.0 (reached at 4 tasks)
    # - area diversity:   area_diversity / 5,      capped at 2.0 (reached at 10 areas)
    technical_keywords = nfc([
        'bms', 'hvac', 'camera', 'access control', 'máy phát', 'máy móc',
        'hệ thống', 'thiết bị', 'bảo dưỡng máy', 'sửa máy', 'kiểm tra máy',
        'xét nghiệm', 'chụp chiếu', 'điện tim', 'nội soi', 'phẫu thuật'
    ])
    # Despite the name, this is the number of TASKS mentioning a technical keyword
    num_technical_keywords = count_matching(technical_keywords)

    text_length = len(tasks_text)
    length_score = min(3.0, text_length / 2000)
    tasks_score = min(3.0, num_tasks / 20)
    technical_score = min(2.0, num_technical_keywords / 2)
    diversity_score = min(2.0, area_diversity / 5)

    task_complexity_score = round(length_score + tasks_score + technical_score + diversity_score, 2)

    # =================================================================
    # RESULT: DICT WITH 25 FEATURES (key order defines the column order)
    # =================================================================

    return {
        # GROUP 1: Task counts (9 features)
        'num_tasks': num_tasks,
        'num_cleaning_tasks': num_cleaning_tasks,
        'num_trash_collection_tasks': num_trash_collection_tasks,
        'num_monitoring_tasks': num_monitoring_tasks,
        'num_room_cleaning_tasks': num_room_cleaning_tasks,
        'num_deep_cleaning_tasks': num_deep_cleaning_tasks,
        'num_maintenance_tasks': num_maintenance_tasks,
        'num_support_tasks': num_support_tasks,
        'num_other_tasks': num_other_tasks,

        # GROUP 2: Area coverage (10 features)
        'num_wc_tasks': num_wc_tasks,
        'num_hallway_tasks': num_hallway_tasks,
        'num_lobby_tasks': num_lobby_tasks,
        'num_patient_room_tasks': num_patient_room_tasks,
        'num_clinic_room_tasks': num_clinic_room_tasks,
        'num_surgery_room_tasks': num_surgery_room_tasks,
        'num_outdoor_tasks': num_outdoor_tasks,
        'num_elevator_tasks': num_elevator_tasks,
        'num_office_tasks': num_office_tasks,
        'num_technical_room_tasks': num_technical_room_tasks,

        # GROUP 3: Ratios & complexity (6 features)
        'cleaning_ratio': round(cleaning_ratio, 4),
        'trash_collection_ratio': round(trash_collection_ratio, 4),
        'monitoring_ratio': round(monitoring_ratio, 4),
        'room_cleaning_ratio': round(room_cleaning_ratio, 4),
        'area_diversity': area_diversity,
        'task_complexity_score': task_complexity_score
    }


def _count_tasks_with_keywords(tasks: List[str], keywords: List[str]) -> int:
    """Count the tasks that contain at least one keyword.

    Each task is lower-cased before the substring test; the keywords are
    used as given, so they are expected to be lower-case already.
    """
    lowered_tasks = (task.lower() for task in tasks)
    # Summing booleans yields the number of tasks with a hit (int, 0 if none)
    return sum(
        any(kw in text for kw in keywords)
        for text in lowered_tasks
    )


def _count_tasks_without_keywords(tasks: List[str], all_keywords: List[str]) -> int:
    """Count the tasks that contain NONE of the given keywords.

    Each task is lower-cased before the substring test; the keywords are
    used as given.
    """
    unmatched = 0
    for text in (task.lower() for task in tasks):
        # A task is "other" only when every keyword is absent from it
        if all(kw not in text for kw in all_keywords):
            unmatched += 1
    return unmatched


def _get_empty_features() -> Dict[str, float]:
    """Return a fresh dict with all 25 features set to zero (used for missing data).

    Key order matches the dict returned by the extraction function: task
    counts, area counts, then ratios/complexity. Counts and
    ``area_diversity`` are int 0; ratios and the complexity score are 0.0.
    """
    # GROUP 1: task counts
    task_count_names = (
        'num_tasks',
        'num_cleaning_tasks',
        'num_trash_collection_tasks',
        'num_monitoring_tasks',
        'num_room_cleaning_tasks',
        'num_deep_cleaning_tasks',
        'num_maintenance_tasks',
        'num_support_tasks',
        'num_other_tasks',
    )
    # GROUP 2: area counts
    area_count_names = (
        'num_wc_tasks',
        'num_hallway_tasks',
        'num_lobby_tasks',
        'num_patient_room_tasks',
        'num_clinic_room_tasks',
        'num_surgery_room_tasks',
        'num_outdoor_tasks',
        'num_elevator_tasks',
        'num_office_tasks',
        'num_technical_room_tasks',
    )

    features = dict.fromkeys(task_count_names + area_count_names, 0)

    # GROUP 3: ratios & complexity (appended last to keep the key order)
    features.update(
        cleaning_ratio=0.0,
        trash_collection_ratio=0.0,
        monitoring_ratio=0.0,
        room_cleaning_ratio=0.0,
        area_diversity=0,
        task_complexity_score=0.0,
    )
    return features


# =================================================================
# MAIN: ÁP DỤNG CHO TẤT CẢ TÒA NHÀ
# =================================================================

if __name__ == '__main__':
    # Script entry point: read the per-building task export, extract the 25
    # keyword features for every building, print summary statistics and save
    # the result as a CSV file.
    rule = "=" * 100

    def show_banner(title, blank_before=True):
        """Print a title framed by two rule lines, optionally preceded by a blank line."""
        print("\n" + rule if blank_before else rule)
        print(title)
        print(rule)

    show_banner("TRÍCH XUẤT 25 KEYWORD FEATURES TỪ TASKS TEXT", blank_before=False)

    # Read the input workbook
    print("\n📂 Đọc file ket_qua_cong_viec_full.xlsx...")
    df = pd.read_excel('ket_qua_cong_viec_full.xlsx')
    total_buildings = len(df)
    print(f"   ✅ Đọc thành công {total_buildings} tòa nhà")

    # Merge regular and periodic tasks into one text column
    print("\n🔗 Gộp all_task_normal + all_task_dinhky...")
    normal_tasks = df['all_task_normal'].fillna('')
    periodic_tasks = df['all_task_dinhky'].fillna('')
    df['all_tasks_combined'] = normal_tasks + ' ; ' + periodic_tasks

    # Run the extraction for every building
    print("\n⚙️  Trích xuất 25 features cho từng tòa...")
    records = []
    for idx, row in df.iterrows():
        processed = idx + 1
        # Progress message every 50 buildings
        if processed % 50 == 0:
            print(f"   Đang xử lý... {processed}/{total_buildings} tòa")

        record = extract_25_keyword_features(row['all_tasks_combined'])
        record['ma_dia_diem'] = row['ma_dia_diem']
        records.append(record)

    print(f"   ✅ Hoàn thành {total_buildings} tòa")

    # Build the feature table with ma_dia_diem as the first column
    print("\n📊 Tạo DataFrame với 25 features...")
    df_features = pd.DataFrame(records)
    feature_cols = [name for name in df_features.columns if name != 'ma_dia_diem']
    df_features = df_features[['ma_dia_diem', *feature_cols]]

    # Descriptive statistics per feature group: (heading, columns, decimals)
    show_banner("📈 THỐNG KÊ 25 FEATURES")
    stat_groups = [
        (
            "\n🔹 NHÓM 1: TASK COUNTS (9 features)",
            [
                'num_tasks', 'num_cleaning_tasks', 'num_trash_collection_tasks',
                'num_monitoring_tasks', 'num_room_cleaning_tasks', 'num_deep_cleaning_tasks',
                'num_maintenance_tasks', 'num_support_tasks', 'num_other_tasks',
            ],
            2,
        ),
        (
            "\n🔹 NHÓM 2: AREA COVERAGE (10 features)",
            [
                'num_wc_tasks', 'num_hallway_tasks', 'num_lobby_tasks',
                'num_patient_room_tasks', 'num_clinic_room_tasks', 'num_surgery_room_tasks',
                'num_outdoor_tasks', 'num_elevator_tasks', 'num_office_tasks',
                'num_technical_room_tasks',
            ],
            2,
        ),
        (
            "\n🔹 NHÓM 3: RATIOS & COMPLEXITY (6 features)",
            [
                'cleaning_ratio', 'trash_collection_ratio', 'monitoring_ratio',
                'room_cleaning_ratio', 'area_diversity', 'task_complexity_score',
            ],
            4,
        ),
    ]
    for heading, columns, decimals in stat_groups:
        print(heading)
        print(df_features[columns].describe().round(decimals))

    # Top 10 buildings by number of tasks
    show_banner("🏆 TOP 10 TÒA CÓ NHIỀU CÔNG VIỆC NHẤT")
    busiest = df_features.nlargest(10, 'num_tasks')
    busiest = busiest[
        ['ma_dia_diem', 'num_tasks', 'cleaning_ratio', 'area_diversity', 'task_complexity_score']
    ]
    print(busiest.to_string(index=False))

    # Top 10 buildings by complexity score
    show_banner("🎯 TOP 10 TÒA PHỨC TẠP NHẤT (task_complexity_score)")
    most_complex = df_features.nlargest(10, 'task_complexity_score')
    most_complex = most_complex[
        ['ma_dia_diem', 'num_tasks', 'task_complexity_score', 'area_diversity']
    ]
    print(most_complex.to_string(index=False))

    # Save the features
    output_file = 'features_25_keywords.csv'
    print(f"\n💾 Lưu features vào file: {output_file}")
    df_features.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"   ✅ Đã lưu {len(df_features)} tòa với 25 features")

    # Show the first 5 buildings as a sample
    show_banner("📋 MẪU DỮ LIỆU (5 tòa đầu tiên)")
    print(df_features.head().to_string(index=False))

    # Closing summary and next steps
    show_banner("✅ HOÀN THÀNH!")
    closing_lines = [
        "\n📊 Tổng kết:",
        f"   - Số tòa nhà: {len(df_features)}",
        # Feature count excludes the ma_dia_diem identifier column
        f"   - Số features: {len(df_features.columns) - 1}",
        f"   - File output: {output_file}",
        "\n🎯 Bước tiếp theo:",
        f"   1. Kiểm tra file {output_file}",
        "   2. Phân tích correlation giữa các features",
        "   3. Visualize distribution",
        "   4. Tiếp tục với TF-IDF features (10 features)",
        "   5. Join với building features (18 features)",
        "   → TỔNG: 25 + 10 + 18 = 53 features",
    ]
    for line in closing_lines:
        print(line)