# Listing metadata (from the file viewer): 251 lines, 9.9 KiB, Python
"""
Extraction function: reduced keyword features from tasks text.

Based on an analysis of 30,917 tasks from 302 buildings.

Updated: January 2026 (reduced version for small dataset)
"""
|
|
|
|
import pandas as pd
|
|
import re
|
|
from typing import Dict, List
|
|
|
|
|
|
# =========================================================
# 1) HELPERS
# =========================================================
|
|
def _split_tasks(tasks_text: str) -> List[str]:
|
|
"""Tách tasks bằng delimiter: ; | hoặc xuống dòng"""
|
|
tasks_text = str(tasks_text).lower()
|
|
tasks = re.split(r"[;|\n]+", tasks_text)
|
|
return [t.strip() for t in tasks if t.strip()]
|
|
|
|
|
|
def _count_tasks_with_keywords(tasks: List[str], keywords: List[str]) -> int:
|
|
"""Đếm số tasks chứa ít nhất 1 keyword"""
|
|
count = 0
|
|
for task in tasks:
|
|
if any(k in task for k in keywords):
|
|
count += 1
|
|
return count
|
|
|
|
|
|
def _count_tasks_without_keywords(tasks: List[str], all_keywords: List[str]) -> int:
|
|
"""Đếm số tasks KHÔNG chứa bất kỳ keyword nào"""
|
|
count = 0
|
|
for task in tasks:
|
|
if not any(k in task for k in all_keywords):
|
|
count += 1
|
|
return count
|
|
|
|
|
|
def _get_empty_features() -> Dict[str, float]:
|
|
"""Trả về dict với tất cả features = 0 (cho missing data)"""
|
|
return {
|
|
# TASK COUNTS (7)
|
|
"num_tasks": 0,
|
|
"num_cleaning_tasks": 0,
|
|
"num_trash_collection_tasks": 0,
|
|
"num_monitoring_tasks": 0,
|
|
"num_deep_cleaning_tasks": 0,
|
|
"num_support_tasks": 0,
|
|
"num_other_tasks": 0,
|
|
|
|
# AREA (reduced + aggregated) (7)
|
|
"num_wc_tasks": 0,
|
|
"num_hallway_tasks": 0,
|
|
"num_lobby_tasks": 0,
|
|
"num_outdoor_tasks": 0,
|
|
"num_elevator_tasks": 0,
|
|
"num_medical_tasks_total": 0,
|
|
"num_indoor_room_tasks": 0,
|
|
|
|
# RATIOS & DIVERSITY (4)
|
|
"cleaning_ratio": 0.0,
|
|
"trash_collection_ratio": 0.0,
|
|
"monitoring_ratio": 0.0,
|
|
"area_diversity": 0,
|
|
}
|
|
|
|
|
|
# =========================================================
# 2) MAIN EXTRACTION (NEW FEATURES)
# =========================================================
|
|
def extract_keyword_features_reduced(tasks_text: str) -> Dict[str, float]:
    """Extract the reduced, aggregated keyword-based features from task text.

    The text is split into individual tasks; each feature counts how many
    tasks contain at least one keyword of a group (substring match on the
    lower-cased task).

    Args:
        tasks_text: String containing all tasks
            (all_task_normal + all_task_dinhky).

    Returns:
        Dict with 18 features: 7 task-type counts, 7 area counts, 3 ratios
        (rounded to 4 decimals) and ``area_diversity``. When the input is
        missing or blank, the all-zero dict from ``_get_empty_features()``.
    """
    # Missing or blank input: every feature is zero.
    if pd.isna(tasks_text) or not str(tasks_text).strip():
        return _get_empty_features()

    tasks = _split_tasks(tasks_text)
    num_tasks = len(tasks)

    # -----------------------------
    # GROUP 1: TASK COUNTS (reduced)
    # -----------------------------
    # Feature name -> keywords; insertion order drives the output key order.
    task_type_keywords: Dict[str, List[str]] = {
        "num_cleaning_tasks": [
            "vệ sinh", "lau", "chùi", "quét", "hút",
            "đẩy khô", "lau ẩm", "làm sạch", "lau bụi", "lau kính", "lau sàn", "hút bụi",
        ],
        "num_trash_collection_tasks": [
            "thu gom rác", "thay rác", "vận chuyển rác", "tua rác", "đổ rác",
            "thu rác", "gom rác", "quét rác nổi", "trực rác", "rác nổi",
        ],
        "num_monitoring_tasks": [
            "trực", "trực phát sinh", "trực ps", "kiểm tra", "check",
            "giám sát", "theo dõi", "tuần tra",
        ],
        "num_deep_cleaning_tasks": [
            "cọ rửa", "cọ bồn cầu", "cọ", "gạt kính", "đánh sàn",
            "đánh chân tường", "chà tường", "cọ gương", "cọ lavabo",
        ],
        "num_support_tasks": [
            "giao ca", "bàn giao", "bàn giao ca", "chụp ảnh", "nhận ca",
            "vsdc", "vệ sinh dụng cụ", "chuẩn bị dụng cụ", "chuẩn bị nước", "chuẩn bị", "giao ban",
        ],
    }
    type_counts = {
        feature: _count_tasks_with_keywords(tasks, group)
        for feature, group in task_type_keywords.items()
    }

    # "Other" = tasks that match none of the task-type keywords above.
    every_type_keyword = [
        kw for group in task_type_keywords.values() for kw in group
    ]
    num_other_tasks = _count_tasks_without_keywords(tasks, every_type_keyword)

    # -----------------------------
    # GROUP 2: AREA COVERAGE (reduced + aggregated)
    # -----------------------------
    area_keywords: Dict[str, List[str]] = {
        "num_wc_tasks": [
            "wc", "toilet", "nhà vệ sinh", "restroom", "phòng vệ sinh",
            "bồn cầu", "lavabo", "tiểu nam", "bồn tiểu",
        ],
        "num_hallway_tasks": ["hành lang", "corridor", "lối đi", "hall", "hl", "hanh lang"],
        "num_lobby_tasks": ["sảnh", "lobby", "tiền sảnh", "sảnh chính", "sanh"],
        "num_outdoor_tasks": ["ngoại cảnh", "sân", "vỉa hè", "khuôn viên", "cổng", "bãi xe", "tầng hầm"],
        "num_elevator_tasks": ["thang máy", "elevator", "lift", "cầu thang", "thang bộ", "tay vịn", "tam cấp"],
    }

    # Medical sub-groups are counted separately but only their sum is output.
    medical_keyword_groups: List[List[str]] = [
        # patient rooms
        ["phòng bệnh", "giường bệnh", "phòng vip", "phòng bệnh nhân", "pb", "phòng bv"],
        # clinic rooms
        ["phòng khám", "khoa khám", "phòng khám bệnh", "khu khám", "pk"],
        # surgery rooms
        ["phòng mổ", "hậu phẫu", "phòng phẫu thuật", "khu mổ", "phòng pt"],
        # technical rooms
        [
            "phòng xét nghiệm", "phòng chụp", "xq", "siêu âm", "kho dược",
            "phòng xn", "labo", "phòng thí nghiệm", "nội soi", "cấp cứu", "hồi sức",
        ],
    ]

    office_keywords = [
        "phòng nhân viên", "phòng giám đốc", "phòng họp", "phòng hành chính",
        "văn phòng", "phòng ban", "phòng giao ban", "hội trường", "phòng kế toán",
    ]

    area_counts = {
        feature: _count_tasks_with_keywords(tasks, group)
        for feature, group in area_keywords.items()
    }
    # NOTE(review): a task matching several medical sub-groups is counted once
    # per sub-group, so this total can exceed the number of distinct medical
    # tasks - confirm this is intended.
    area_counts["num_medical_tasks_total"] = sum(
        _count_tasks_with_keywords(tasks, group)
        for group in medical_keyword_groups
    )
    area_counts["num_indoor_room_tasks"] = _count_tasks_with_keywords(tasks, office_keywords)

    # -----------------------------
    # GROUP 3: RATIOS & DIVERSITY (reduced)
    # -----------------------------
    def _share(count: int) -> float:
        """Return ``count / num_tasks`` rounded to 4 decimals (0.0 if no tasks)."""
        if num_tasks > 0:
            return round(count / num_tasks, 4)
        return 0.0

    # Number of area groups (7 in total) touched by at least one task.
    area_diversity = sum(1 for hits in area_counts.values() if hits > 0)

    return {
        # TASK COUNTS (7)
        "num_tasks": num_tasks,
        **type_counts,
        "num_other_tasks": num_other_tasks,
        # AREA (reduced + aggregated) (7)
        **area_counts,
        # RATIOS & DIVERSITY (4)
        "cleaning_ratio": _share(type_counts["num_cleaning_tasks"]),
        "trash_collection_ratio": _share(type_counts["num_trash_collection_tasks"]),
        "monitoring_ratio": _share(type_counts["num_monitoring_tasks"]),
        "area_diversity": area_diversity,
    }
|
|
|
|
|
|
# =========================================================
# 3) MAIN (APPLY TO EXCEL)
# =========================================================
|
|
if __name__ == "__main__":
    # Script entry point: read the Excel export, derive the keyword features
    # for every row, and write the result to CSV and XLSX.
    banner = "=" * 100
    print(banner)
    print("TRÍCH XUẤT REDUCED KEYWORD FEATURES TỪ TASKS TEXT")
    print(banner)

    input_file = "ket_qua_cong_viec_full.xlsx"
    output_csv = "features_keywords_reduced.csv"
    output_xlsx = "features_keywords_reduced.xlsx"

    print(f"\n📂 Đọc file {input_file} ...")
    df = pd.read_excel(input_file)
    print(f"✅ Đọc thành công {len(df)} rows")

    # Join the regular and the periodic task columns with a ';' delimiter so
    # the extractor sees them as one task list; missing cells become "".
    print("\n🔗 Gộp all_task_normal + all_task_dinhky ...")
    normal_tasks = df["all_task_normal"].fillna("")
    periodic_tasks = df["all_task_dinhky"].fillna("")
    df["all_tasks_combined"] = normal_tasks + " ; " + periodic_tasks

    print("\n⚙️ Trích xuất features ...")
    features_list = []
    for idx, row in df.iterrows():
        processed = idx + 1
        # Progress line every 50 rows.
        if processed % 50 == 0:
            print(f" Đang xử lý... {processed}/{len(df)}")
        feats = extract_keyword_features_reduced(row["all_tasks_combined"])
        # Carry the location code along; None when the column is absent.
        feats["ma_dia_diem"] = row.get("ma_dia_diem")
        features_list.append(feats)

    df_features = pd.DataFrame(features_list)

    # Move ma_dia_diem to the first column.
    other_cols = [col for col in df_features.columns if col != "ma_dia_diem"]
    df_features = df_features[["ma_dia_diem", *other_cols]]

    print("\n✅ DONE. Shape:", df_features.shape)

    print(f"\n💾 Save CSV: {output_csv}")
    df_features.to_csv(output_csv, index=False, encoding="utf-8-sig")

    print(f"💾 Save XLSX: {output_xlsx}")
    df_features.to_excel(output_xlsx, index=False, engine="openpyxl")

    print("\n📋 Sample:")
    print(df_features.head(5).to_string(index=False))
|