preditc_nhansu_phase2/workload_converter/workload_converter_script.py

96 lines
2.6 KiB
Python

import math
import os
import re

import pandas as pd
# Opt in to pandas' future behavior of not silently downcasting object-dtype
# results (relevant to the fillna() call made on object columns below).
pd.set_option('future.no_silent_downcasting', True)
# Directory scanned for source .xlsx workbooks (relative to the working directory).
INPUT_DIR = "input_files"
# Directory that receives one generated .json file per workbook.
OUTPUT_DIR = "output_files"
def clean_and_convert(value):
    """Tidy a cell value and convert numeric text to a number.

    Non-string values are returned unchanged. For strings, every run of
    newlines, carriage returns, backslashes and forward slashes is replaced
    by a single space, whitespace is collapsed, and the result is stripped.
    If the cleaned text is a finite number it is returned as ``int`` (whole
    values) or ``float``; otherwise the cleaned string is returned.

    Args:
        value: Any cell or header value.

    Returns:
        The untouched non-string value, an ``int``/``float`` parsed from the
        cleaned text, or the cleaned string itself.
    """
    if not isinstance(value, str):
        return value

    value = re.sub(r'[\n\r\\/]+', ' ', value)
    value = re.sub(r'\s+', ' ', value).strip()

    # Parse integer text directly: routing it through float() first silently
    # corrupts integers larger than 2**53 (e.g. long numeric codes).
    try:
        return int(value)
    except ValueError:
        pass

    try:
        num = float(value)
    except ValueError:
        return value

    # float() also accepts "nan", "inf", "infinity" and overflowing literals
    # such as "1e999". Keep that text as-is instead of injecting NaN/inf
    # into the data, where NaN would read as a missing value.
    if not math.isfinite(num):
        return value
    if num.is_integer():
        return int(num)
    return num
def deduplicate_columns(df):
    """Rename repeated column labels so every label in ``df`` is unique.

    The first occurrence of a label is kept as-is. Each later occurrence is
    renamed to ``"<label>.<n>"`` using the smallest ``n >= 1`` that is not
    already taken, so a generated name can never collide with a label that
    is already in use (for example an existing ``"a.1"`` column).

    Args:
        df: DataFrame whose ``columns`` are rewritten in place.

    Returns:
        The same ``df`` object, with unique column labels.
    """
    next_suffix = {}   # label -> last numeric suffix handed out for it
    taken = set()      # every label already assigned to an output column
    unique_cols = []

    for col in df.columns.tolist():
        name = col
        if name in taken:
            n = next_suffix.get(col, 0)
            # Skip suffixes that would reproduce an already-assigned label.
            while name in taken:
                n += 1
                name = f"{col}.{n}"
            next_suffix[col] = n
        taken.add(name)
        unique_cols.append(name)

    df.columns = unique_cols
    return df
def process_batch():
    """Convert every Excel workbook in ``INPUT_DIR`` to JSON in ``OUTPUT_DIR``.

    If ``INPUT_DIR`` does not exist it is created and the function returns so
    the user can drop files into it. Otherwise each ``.xlsx`` file (extension
    matched case-insensitively, Excel ``~$`` lock files and non-file entries
    skipped) is processed in sorted name order:

    * the sheet "Khối lượng công việc chi tiết" is read with the third sheet
      row as the header, and the first two rows below it are dropped;
    * header labels and all cells are passed through ``clean_and_convert``;
    * duplicate column labels are made unique with ``deduplicate_columns``;
    * missing cells are replaced by 0;
    * the result is written as ``<workbook name>.json`` (list of records,
      non-ASCII characters kept unescaped).

    A failure on one workbook is printed and does not stop the batch.

    Returns:
        None. Progress and errors are reported on standard output.
    """
    print(f"BATCH CONVERT: {INPUT_DIR} -> {OUTPUT_DIR}")

    if not os.path.exists(INPUT_DIR):
        os.makedirs(INPUT_DIR)
        print(f"Da tao thu muc '{INPUT_DIR}'. Hay copy file Excel vao do va chay lai!")
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Sorted for a reproducible processing order; the extension check is
    # case-insensitive so "REPORT.XLSX" is picked up, and directories that
    # merely end in ".xlsx" are ignored.
    files = sorted(
        f for f in os.listdir(INPUT_DIR)
        if f.lower().endswith('.xlsx')
        and not f.startswith('~$')
        and os.path.isfile(os.path.join(INPUT_DIR, f))
    )

    if not files:
        print(f"Thu muc '{INPUT_DIR}' dang trong.")
        return

    print(f"Tim thay {len(files)} file. Bat dau xu ly...")

    for filename in files:
        input_path = os.path.join(INPUT_DIR, filename)
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(OUTPUT_DIR, f"{base_name}.json")

        try:
            print(f"Dang xu ly: {filename} ...", end=" ")

            # dtype=object keeps raw cell values so clean_and_convert decides
            # the final type of each cell.
            df = pd.read_excel(
                input_path,
                sheet_name="Khối lượng công việc chi tiết",
                header=2,
                dtype=object,
                engine='openpyxl'
            )

            # Drop the two rows directly below the header row.
            df = df.iloc[2:].reset_index(drop=True)

            # Cleaning can make distinct headers equal, so deduplicate after it.
            df.columns = [str(clean_and_convert(col)) for col in df.columns]
            df = deduplicate_columns(df)

            df = df.map(clean_and_convert)
            df = df.fillna(0)

            df.to_json(output_path, orient='records', force_ascii=False, indent=4)
            print("Xong")

        except ValueError as ve:
            # A ValueError mentioning "Worksheet" is taken to mean the
            # expected sheet is missing from the workbook.
            if "Worksheet" in str(ve):
                print("Loi: Khong co sheet 'Khối lượng công việc chi tiết'")
            else:
                print(f"Loi: {ve}")
        except Exception as e:
            # Batch boundary: report the failure and continue with the next file.
            print(f"Loi: {e}")

    print("-" * 50)
    print(f"Hoan tat! Kiem tra ket qua tai: {os.path.abspath(OUTPUT_DIR)}")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_batch()