# predict_caLamviec_nhansu/predict.py

import re
from typing import Dict, List, Optional

import pandas as pd


# =========================================================
# 1) HELPERS
# =========================================================
# One or more consecutive delimiters (";", "|" or newline) act as a single separator.
_TASK_SPLIT_RE = re.compile(r"[;|\n]+")

def _split_tasks(tasks_text: str) -> List[str]:
    """Split a task description into lowercase, trimmed task strings.

    The input is converted with ``str()`` and lowercased, then cut on ``;``,
    ``|`` or newline. Pieces that are empty after trimming whitespace are
    dropped.
    """
    lowered = str(tasks_text).lower()
    trimmed = (piece.strip() for piece in _TASK_SPLIT_RE.split(lowered))
    return [piece for piece in trimmed if piece]

def _count_tasks_with_keywords(tasks: List[str], keywords: List[str]) -> int:
    """Count the tasks that contain at least one keyword as a substring.

    Each task contributes at most 1, no matter how many keywords it matches.
    """
    return sum(
        1
        for task in tasks
        if any(keyword in task for keyword in keywords)
    )

def _count_tasks_without_keywords(tasks: List[str], all_keywords: List[str]) -> int:
    """Count the tasks that contain none of the given keywords as a substring."""
    unmatched = [
        task
        for task in tasks
        if all(keyword not in task for keyword in all_keywords)
    ]
    return len(unmatched)

def _get_empty_features() -> Dict[str, float]:
    """Return a fresh feature dict with every feature set to zero.

    Used when there is no task text to analyse. Counts and ``area_diversity``
    are integer ``0``; the three ratio features are float ``0.0``.
    """
    # Task counts (7)
    task_count_names = (
        "num_tasks",
        "num_cleaning_tasks",
        "num_trash_collection_tasks",
        "num_monitoring_tasks",
        "num_deep_cleaning_tasks",
        "num_support_tasks",
        "num_other_tasks",
    )
    # Area counts, reduced + aggregated (7)
    area_count_names = (
        "num_wc_tasks",
        "num_hallway_tasks",
        "num_lobby_tasks",
        "num_outdoor_tasks",
        "num_elevator_tasks",
        "num_medical_tasks_total",
        "num_indoor_room_tasks",
    )
    # Ratios (3); together with area_diversity these form the last group of 4.
    ratio_names = (
        "cleaning_ratio",
        "trash_collection_ratio",
        "monitoring_ratio",
    )

    features: Dict[str, float] = dict.fromkeys(task_count_names + area_count_names, 0)
    features.update(dict.fromkeys(ratio_names, 0.0))
    features["area_diversity"] = 0
    return features


# =========================================================
# 2) MAIN: 2 TEXT INPUTS -> FEATURES
# =========================================================
def _coerce_task_text(value: object) -> str:
    """Return ``value`` as trimmed text, mapping missing values to ``""``.

    ``None`` and any scalar that ``pd.isna`` reports as missing (for example
    float NaN or ``pd.NA``) become the empty string, so a missing cell never
    turns into a literal task such as ``"nan"`` or ``"<NA>"``. Any other
    non-string value is converted with ``str()``.
    """
    if isinstance(value, str):
        return value.strip()
    # pd.isna() only gives a single yes/no answer for scalars; for array-likes
    # it returns an array, so check is_scalar() first.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def extract_keyword_features_reduced_from_two_texts(
    task_normal: Optional[str],
    task_dinhky: Optional[str],
) -> Dict[str, float]:
    """Build the reduced keyword feature set from two task description texts.

    The two texts are joined as ``task_normal + " ; " + task_dinhky`` and split
    into individual tasks by ``_split_tasks``. Every task is then matched, by
    substring, against fixed keyword lists. Each category is counted
    independently, so a single task may contribute to several categories.

    Args:
        task_normal: Text describing the regular tasks. Missing values
            (``None``, NaN, ``pd.NA``) are treated as empty text.
        task_dinhky: Text describing the periodic ("dinh ky") tasks. Missing
            values are treated the same way.

    Returns:
        A dict with the same keys as ``_get_empty_features``: seven task-type
        counts, seven area counts, three ratios (rounded to 4 decimals) and
        ``area_diversity``. All values are zero when no task is found.
    """
    tn = _coerce_task_text(task_normal)
    td = _coerce_task_text(task_dinhky)

    # The ";" joiner keeps the two sources as separate tasks. When both texts
    # are empty the split yields no tasks, which is the only "empty" case.
    tasks = _split_tasks(tn + " ; " + td)
    num_tasks = len(tasks)
    if num_tasks == 0:
        return _get_empty_features()

    # -----------------------------
    # GROUP 1: TASK COUNTS (reduced)
    # -----------------------------
    cleaning_keywords = [
        "vệ sinh", "lau", "chùi", "quét", "hút",
        "đẩy khô", "lau ẩm", "làm sạch", "lau bụi", "lau kính", "lau sàn", "hút bụi"
    ]
    trash_keywords = [
        "thu gom rác", "thay rác", "vận chuyển rác", "tua rác", "đổ rác",
        "thu rác", "gom rác", "quét rác nổi", "trực rác", "rác nổi"
    ]
    monitoring_keywords = [
        "trực", "trực phát sinh", "trực ps", "kiểm tra", "check",
        "giám sát", "theo dõi", "tuần tra"
    ]
    deep_cleaning_keywords = [
        "cọ rửa", "cọ bồn cầu", "cọ", "gạt kính", "đánh sàn",
        "đánh chân tường", "chà tường", "cọ gương", "cọ lavabo"
    ]
    support_keywords = [
        "giao ca", "bàn giao", "bàn giao ca", "chụp ảnh", "nhận ca",
        "vsdc", "vệ sinh dụng cụ", "chuẩn bị dụng cụ", "chuẩn bị nước", "chuẩn bị", "giao ban"
    ]

    num_cleaning_tasks = _count_tasks_with_keywords(tasks, cleaning_keywords)
    num_trash_collection_tasks = _count_tasks_with_keywords(tasks, trash_keywords)
    num_monitoring_tasks = _count_tasks_with_keywords(tasks, monitoring_keywords)
    num_deep_cleaning_tasks = _count_tasks_with_keywords(tasks, deep_cleaning_keywords)
    num_support_tasks = _count_tasks_with_keywords(tasks, support_keywords)

    # "Other" = tasks that match none of the five task-type keyword lists.
    all_keywords_for_other = (
        cleaning_keywords + trash_keywords + monitoring_keywords + deep_cleaning_keywords + support_keywords
    )
    num_other_tasks = _count_tasks_without_keywords(tasks, all_keywords_for_other)

    # -----------------------------
    # GROUP 2: AREA COVERAGE (reduced + aggregated)
    # -----------------------------
    wc_keywords = [
        "wc", "toilet", "nhà vệ sinh", "restroom", "phòng vệ sinh",
        "bồn cầu", "lavabo", "tiểu nam", "bồn tiểu"
    ]
    hallway_keywords = ["hành lang", "corridor", "lối đi", "hall", "hl", "hanh lang"]
    lobby_keywords = ["sảnh", "lobby", "tiền sảnh", "sảnh chính", "sanh"]
    outdoor_keywords = ["ngoại cảnh", "sân", "vỉa hè", "khuôn viên", "cổng", "bãi xe", "tầng hầm"]
    elevator_keywords = ["thang máy", "elevator", "lift", "cầu thang", "thang bộ", "tay vịn", "tam cấp"]

    patient_room_keywords = ["phòng bệnh", "giường bệnh", "phòng vip", "phòng bệnh nhân", "pb", "phòng bv"]
    clinic_room_keywords = ["phòng khám", "khoa khám", "phòng khám bệnh", "khu khám", "pk"]
    surgery_room_keywords = ["phòng mổ", "hậu phẫu", "phòng phẫu thuật", "khu mổ", "phòng pt"]
    technical_room_keywords = [
        "phòng xét nghiệm", "phòng chụp", "xq", "siêu âm", "kho dược",
        "phòng xn", "labo", "phòng thí nghiệm", "nội soi", "cấp cứu", "hồi sức"
    ]

    office_keywords = [
        "phòng nhân viên", "phòng giám đốc", "phòng họp", "phòng hành chính",
        "văn phòng", "phòng ban", "phòng giao ban", "hội trường", "phòng kế toán"
    ]

    num_wc_tasks = _count_tasks_with_keywords(tasks, wc_keywords)
    num_hallway_tasks = _count_tasks_with_keywords(tasks, hallway_keywords)
    num_lobby_tasks = _count_tasks_with_keywords(tasks, lobby_keywords)
    num_outdoor_tasks = _count_tasks_with_keywords(tasks, outdoor_keywords)
    num_elevator_tasks = _count_tasks_with_keywords(tasks, elevator_keywords)

    # The medical total is the sum of four independent counts, so a task that
    # matches several medical keyword lists is counted once per list.
    num_medical_tasks_total = sum(
        _count_tasks_with_keywords(tasks, medical_keywords)
        for medical_keywords in (
            patient_room_keywords,
            clinic_room_keywords,
            surgery_room_keywords,
            technical_room_keywords,
        )
    )

    # The "indoor room" feature is driven by the office keyword list.
    num_indoor_room_tasks = _count_tasks_with_keywords(tasks, office_keywords)

    # -----------------------------
    # GROUP 3: RATIOS & DIVERSITY (reduced)
    # -----------------------------
    # num_tasks is guaranteed to be > 0 here because of the early return above.
    cleaning_ratio = num_cleaning_tasks / num_tasks
    trash_collection_ratio = num_trash_collection_tasks / num_tasks
    monitoring_ratio = num_monitoring_tasks / num_tasks

    # Diversity = number of area groups that have at least one matching task.
    area_counts = [
        num_wc_tasks, num_hallway_tasks, num_lobby_tasks, num_outdoor_tasks, num_elevator_tasks,
        num_medical_tasks_total, num_indoor_room_tasks
    ]
    area_diversity = sum(1 for c in area_counts if c > 0)

    return {
        # TASK COUNTS (7)
        "num_tasks": num_tasks,
        "num_cleaning_tasks": num_cleaning_tasks,
        "num_trash_collection_tasks": num_trash_collection_tasks,
        "num_monitoring_tasks": num_monitoring_tasks,
        "num_deep_cleaning_tasks": num_deep_cleaning_tasks,
        "num_support_tasks": num_support_tasks,
        "num_other_tasks": num_other_tasks,

        # AREA (reduced + aggregated) (7)
        "num_wc_tasks": num_wc_tasks,
        "num_hallway_tasks": num_hallway_tasks,
        "num_lobby_tasks": num_lobby_tasks,
        "num_outdoor_tasks": num_outdoor_tasks,
        "num_elevator_tasks": num_elevator_tasks,
        "num_medical_tasks_total": num_medical_tasks_total,
        "num_indoor_room_tasks": num_indoor_room_tasks,

        # RATIOS & DIVERSITY (4)
        "cleaning_ratio": round(cleaning_ratio, 4),
        "trash_collection_ratio": round(trash_collection_ratio, 4),
        "monitoring_ratio": round(monitoring_ratio, 4),
        "area_diversity": area_diversity,
    }