"""Batch-convert one sheet of every Excel workbook in a folder to JSON records."""

import math
import os
import re

import pandas as pd

try:
    pd.set_option('future.no_silent_downcasting', True)
except KeyError:
    # pandas raises OptionError (a KeyError subclass) for an option it does
    # not know; the script can still run without this opt-in.
    pass

INPUT_DIR = "input_files"
OUTPUT_DIR = "output_files"
SHEET_NAME = "Khối lượng công việc chi tiết"

# Compiled once: clean_and_convert runs for every cell of every workbook.
_SEPARATOR_RUN = re.compile(r'[\n\r\\/]+')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_and_convert(value):
    """Normalise a cell value and convert numeric-looking text to a number.

    Non-string values are returned unchanged. For strings, runs of line
    breaks, backslashes and forward slashes become a single space, whitespace
    is collapsed, and the ends are stripped. The cleaned text is then parsed:

    * integer text becomes an ``int`` (parsed directly, so large integers
      keep every digit instead of being rounded through ``float``);
    * other finite numeric text becomes an ``int`` when it has no fractional
      part (``"3.0"``, ``"1e3"``), otherwise a ``float``;
    * everything else, including text that parses to NaN or infinity
      (``"nan"``, ``"inf"``, ``"1e400"``), is returned as the cleaned string.

    Args:
        value: Any cell value.

    Returns:
        The cleaned string, an ``int``, a ``float``, or ``value`` itself when
        it is not a string.
    """
    if not isinstance(value, str):
        return value

    text = _SEPARATOR_RUN.sub(' ', value)
    text = _WHITESPACE_RUN.sub(' ', text).strip()

    try:
        return int(text)
    except ValueError:
        pass

    try:
        number = float(text)
    except ValueError:
        return text

    if not math.isfinite(number):
        # Keep words such as "nan"/"inf" as text; as floats they would be
        # treated as missing data by the later fillna step.
        return text
    return int(number) if number.is_integer() else number


def deduplicate_columns(df):
    """Make the column labels of ``df`` unique by appending ``.N`` suffixes.

    The first occurrence of a label is kept; later occurrences become
    ``label.1``, ``label.2``, ... A suffix that would clash with a label
    already in use is skipped, so the result is always unique (for example
    ``["a", "a", "a.1"]`` becomes ``["a", "a.1", "a.1.1"]``).

    Args:
        df: DataFrame whose columns are renamed in place.

    Returns:
        The same DataFrame object, with unique column labels.
    """
    taken = set()
    last_suffix = {}
    unique_labels = []

    for label in df.columns.tolist():
        candidate = label
        if candidate in taken:
            suffix = last_suffix.get(label, 0)
            while candidate in taken:
                suffix += 1
                candidate = f"{label}.{suffix}"
            last_suffix[label] = suffix
        taken.add(candidate)
        unique_labels.append(candidate)

    df.columns = unique_labels
    return df


def _convert_file(input_path, output_path, sheet_name):
    """Read one sheet of a workbook, clean it, and write it as JSON records."""
    df = pd.read_excel(
        input_path,
        sheet_name=sheet_name,
        header=2,  # column names are taken from the third row of the sheet
        dtype=object,
        engine='openpyxl',
    )
    # Drop the first two rows below the header.
    # NOTE(review): presumably sub-header/unit rows of the template - confirm.
    df = df.iloc[2:].reset_index(drop=True)

    df.columns = [str(clean_and_convert(col)) for col in df.columns]
    df = deduplicate_columns(df)

    # DataFrame.map is the newer name of applymap; look it up on the class so
    # a column that happens to be called "map" cannot shadow the method.
    elementwise = getattr(pd.DataFrame, "map", None)
    if elementwise is None:
        elementwise = pd.DataFrame.applymap
    df = elementwise(df, clean_and_convert)

    df = df.fillna(0)
    df.to_json(output_path, orient='records', force_ascii=False, indent=4)


def process_batch(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR, sheet_name=SHEET_NAME):
    """Convert every ``.xlsx`` workbook in ``input_dir`` to a JSON file.

    If ``input_dir`` does not exist it is created and the function returns so
    the user can copy workbooks into it. Otherwise each workbook (Excel lock
    files starting with ``~$`` are ignored, the extension check is
    case-insensitive) is converted to ``<output_dir>/<workbook name>.json``.
    A failure in one workbook is reported and does not stop the batch.

    Args:
        input_dir: Folder that holds the workbooks.
        output_dir: Folder that receives the JSON files; created if missing.
        sheet_name: Name of the worksheet to convert in every workbook.
    """
    print(f"BATCH CONVERT: {input_dir} -> {output_dir}")

    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        print(f"Da tao thu muc '{input_dir}'. Hay copy file Excel vao do va chay lai!")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith('.xlsx')
        and not name.startswith('~$')
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not files:
        print(f"Thu muc '{input_dir}' dang trong.")
        return

    print(f"Tim thay {len(files)} file. Bat dau xu ly...")

    for filename in files:
        input_path = os.path.join(input_dir, filename)
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(output_dir, f"{base_name}.json")

        print(f"Dang xu ly: {filename} ...", end=" ")
        try:
            _convert_file(input_path, output_path, sheet_name)
        except ValueError as ve:
            # A missing sheet is reported by the reader as a ValueError whose
            # message mentions "Worksheet".
            if "Worksheet" in str(ve):
                print(f"Loi: Khong co sheet '{sheet_name}'")
            else:
                print(f"Loi: {ve}")
        except Exception as e:
            # Per-file boundary: report the problem and continue the batch.
            print(f"Loi: {e}")
        else:
            print("Xong")

    print("-" * 50)
    print(f"Hoan tat! \nKiem tra ket qua tai: {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    process_batch()