# search_suggest/extract_work_content.py

import json
import os
import re
import unicodedata

import pandas as pd

# Step 1: read the Excel sheet and collect the location codes
excel_file = "Link LLV 2025.xlsx"
sheet_name = "Bản sao của LLV Form mới"

# Read the code column as text.  Left to type inference, a numeric column
# that also contains blank cells is loaded as float, and str() of such a
# value ("123.0") can never be the prefix of a JSON file name.
df = pd.read_excel(excel_file, sheet_name=sheet_name, dtype={"Mã địa điểm": str})

# Keep only the rows whose review status is "Hoàn thành".
filtered_df = df[df["Bên AI review File"] == "Hoàn thành"]

# Drop blank codes (they would otherwise be looked up as the prefix "nan")
# and repeated codes (their files would be read and counted once per row).
# drop_duplicates() keeps the first occurrence, so sheet order is preserved.
ma_dia_diem_list = filtered_df["Mã địa điểm"].dropna().drop_duplicates().tolist()

print(f"Tìm thấy {len(ma_dia_diem_list)} mã địa điểm có trạng thái 'Hoàn thành'")
print(f"Danh sách mã địa điểm: {ma_dia_diem_list[:10]}...")  # first 10 codes only

# Step 2: read the JSON files and extract every "Nội dung công việc" value
success_folder = "success"
all_work_contents = set()  # a set, so repeated descriptions collapse automatically

# Building blocks for the cleanup patterns.  Non-ASCII characters are written
# as escapes so the patterns stay precomposed however this file is saved:
# U+1EA7 / U+00E2 are the vowels of "tầng" / "tâng", U+2013 is the en dash.
_FLOOR_WORD = r't[\u1ea7\u00e2]ng'
_RANGE_SEP = r'\s*[-\u2013>]+\s*'

# (compiled pattern, replacement) pairs, applied in this order by
# clean_work_content().  Range patterns must run before the single-token
# patterns, otherwise the single-token pass consumes their first half and the
# range pattern never sees a complete range.
_CLEANUP_STEPS = (
    # "T1-T3", "T1->3", "T1 – T3": anchored with \b so that an identifier
    # merely containing "T<digits>" (e.g. "VT1-3") is left alone.
    (re.compile(r'\bT\d+' + _RANGE_SEP + r'T?\d+\b', re.IGNORECASE), ' '),
    # Single floor tokens: "T1", "t12".
    (re.compile(r'\bT\d+', re.IGNORECASE), ''),
    # "tầng 1-3", "tâng 2 -> 5".
    (re.compile(_FLOOR_WORD + r'\s*\d+' + _RANGE_SEP + r'\d+\b', re.IGNORECASE), ' '),
    # "tầng 2", "tâng3".
    (re.compile(_FLOOR_WORD + r'\s*\d+', re.IGNORECASE), ''),
    # The bare word "tầng" / "tâng".
    (re.compile(r'\b' + _FLOOR_WORD + r'\b', re.IGNORECASE), ''),
    # Stand-alone numbers.
    (re.compile(r'\b\d+\b'), ''),
    # Leftover separators ("-", "->", "--", en dash) become a single space.
    (re.compile(r'[-\u2013>]+\s*[-\u2013>]*'), ' '),
    # Parentheses left empty by the removals above.
    (re.compile(r'\(\s*\)'), ''),
    # Collapse every whitespace run to one space.
    (re.compile(r'\s+'), ' '),
)


def clean_work_content(text):
    """Remove floor references and leftover noise from a work description.

    The following are removed, case-insensitively where letters are involved:
    - "T" + number tokens and ranges of them (T1, T1-T3, T1->T3)
    - "tầng" / "tâng" followed by a number or a number range
    - the bare word "tầng" / "tâng"
    - stand-alone numbers
    - separator characters (-, en dash, >) and empty parentheses

    Whitespace runs are then collapsed to single spaces, and spaces, commas
    and full stops are stripped from BOTH ends of the result.

    Args:
        text: The raw description.  Falsy values (None, "") are returned
            unchanged.

    Returns:
        The cleaned string in NFC form; it may be empty when nothing but
        floor references was present.

    Raises:
        TypeError: If ``text`` is truthy but not a ``str``.
    """
    if not text:
        return text

    # Decomposed (NFD) input spells "ầ" as three code points, which the floor
    # patterns cannot match; composing first also lets NFC and NFD spellings
    # of the same description deduplicate to one entry.
    text = unicodedata.normalize('NFC', text)

    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)

    # Only plain spaces remain as whitespace after the last step, so a single
    # strip covers both the padding and the stray punctuation.
    return text.strip(',. ')

# Read every JSON file that belongs to a selected location code
processed_files = 0
not_found_files = []

# Keys of each position entry that hold a list of task records
# ("Công việc thường" and "Công việc định kỳ"); both are handled identically.
task_group_keys = ('Công việc thường', 'Công việc định kỳ')

# List the folder once instead of once per location code.  The listing is
# skipped when there are no codes, so a missing folder is only an error when
# the folder is actually needed.
json_filenames = (
    [name for name in os.listdir(success_folder) if name.endswith('.json')]
    if ma_dia_diem_list
    else []
)

for ma_dia_diem in ma_dia_diem_list:
    # Several files may share one code, so match on the file-name prefix.
    # NOTE(review): a plain prefix test means code "12" also picks up
    # "123*.json" - confirm the file naming convention and tighten if needed.
    code_prefix = str(ma_dia_diem)
    matching_files = [name for name in json_filenames if name.startswith(code_prefix)]

    if not matching_files:
        not_found_files.append(ma_dia_diem)
        continue

    for filename in matching_files:
        json_path = os.path.join(success_folder, filename)

        # Best effort per file: any read, parse or structure problem is
        # reported and the file is left uncounted, but the run continues.
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Work descriptions live under Chi_tiet_vi_tri -> <position> ->
            # <task group> -> [task records] -> "Nội dung công việc".
            if 'Chi_tiet_vi_tri' in data:
                for vi_tri_data in data['Chi_tiet_vi_tri'].values():
                    for group_key in task_group_keys:
                        if group_key not in vi_tri_data:
                            continue
                        for cong_viec in vi_tri_data[group_key]:
                            if 'Nội dung công việc' not in cong_viec:
                                continue
                            content = cong_viec['Nội dung công việc']
                            # A non-string value (number, null, list) cannot
                            # be cleaned; skip it instead of letting it abort
                            # the remaining records of this file.
                            if not isinstance(content, str):
                                continue
                            cleaned_content = clean_work_content(content)
                            if cleaned_content:  # keep only non-empty results
                                all_work_contents.add(cleaned_content)

            processed_files += 1

        except Exception as e:
            print(f"Lỗi khi đọc file {filename}: {e}")

print(f"\nĐã xử lý {processed_files} file JSON")
print(f"Không tìm thấy file cho {len(not_found_files)} mã địa điểm")

# Step 3: save the results
# Sorted so that both output files are stable between runs.
sorted_contents = sorted(all_work_contents)

# Plain-text output, one description per line.
output_file = "unique_work_contents.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(content + '\n' for content in sorted_contents)

print(f"\nĐã tìm thấy {len(all_work_contents)} nội dung công việc duy nhất")
print(f"Kết quả đã được lưu vào file: {output_file}")

# Show the first 20 cleaned descriptions as a sample
print("\n=== Một số ví dụ nội dung công việc đã xử lý ===")
for i, content in enumerate(sorted_contents[:20], 1):
    print(f"{i}. {content}")

# Also save as JSON for easier downstream processing
output_json = "unique_work_contents.json"
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(sorted_contents, f, ensure_ascii=False, indent=2)

print(f"\nKết quả cũng đã được lưu vào file JSON: {output_json}")