f

2025-12-29 09:11:39 +07:00 · 2025-12-29 09:11:39 +07:00 · 2d896bb6af
commit 2d896bb6af
8 changed files with 2134 additions and 0 deletions
--- a/pycache/predict_staff.cpython-312.pyc
+++ b/pycache/predict_staff.cpython-312.pyc
--- a/pycache/predict_staff.cpython-313.pyc
+++ b/pycache/predict_staff.cpython-313.pyc
--- a/calamviec.xlsx
+++ b/calamviec.xlsx
--- a/demo_predict_staff.ipynb
+++ b/demo_predict_staff.ipynb
--- a/namoi.csv
+++ b/namoi.csv
@ -0,0 +1,9 @@
+Mã địa điểm,Loại hình,Tổng Giờ hoạt động của khách hàng mỗi tuần,Lưu lượng KH hoạt động ngày tại Tòa tháp,Diện tích ngoại cảnh Tòa tháp (m2),Số tòa tháp,Số tầng nổi,Số tầng hầm,Tầng hầm (m2),Sàn Sảnh (m2),Sàn Hành lang (m2),Sàn WC (m2),Sàn Phòng (m2) ,Thảm (m2),Dốc hầm (m) ,Viền phản quang (m),Ốp tường (m2),Ốp chân tường (m2),Rãnh thoát nước (m),Kính (m2),Thang máy,Thang bộ
+559-1,3,168,3,400,2,59,2,2480,492,5637,214,15,0,5,350,822,0,3,56,268,128
+618-1,3,168,1,2750,1,22,4,6800,9223,2384,194,177,6,6,850,684,1151,11,701,113,44
+283-1,3,168,2,4610,1,15,1,2200,1890,1490,382,62,59,400,0,0,100,275,770,61,30
+337-1,3,54,2,3000,3,60,6,4000,900,7080,2585,40,4,4,0,3520,1060,200,1170,20,12
+55-1,3,54,3,6800,1,24,2,4800,1217,541,671,30,2226,5,450,0,2240,650,25,148,56
+33-1,3,54,1,900,1,19,3,2000,2000,1235,330,0,154,60,0,940,0,0,2190,105,38
+213-1,3,168,2,2000,2,50,3,2650,8350,7890,150,700,8,6,0,0,0,0,900,248,100
+286-1,3,54,3,100,1,18,2,1800,150,480,540,15,690,0,0,900,450,0,720,90,44
--- a/predict_staff.py
+++ b/predict_staff.py
@ -0,0 +1,346 @@
+"""
+Hệ thống dự đoán số lượng nhân sự theo ca làm việc và đặc điểm tòa nhà
+Author: ML Expert
+Date: December 29, 2025
+"""
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.linear_model import Ridge
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
+import re
+import warnings
+warnings.filterwarnings('ignore')
+
+class StaffPredictor:
+    """Predict the head count needed for a work shift at a building.
+
+    Intended call order: ``load_data`` -> ``prepare_features`` ->
+    ``train_model`` -> ``predict_staff`` / ``show_feature_importance``.
+    """
+
+    # Shift strings are matched against these patterns in order. Compiled once
+    # at class level instead of on every call. IGNORECASE lets '6H00-14H00'
+    # parse the same way as '6h00-14h00'.
+    _SHIFT_PATTERNS = (
+        # '14h00:22h00' or '14:00-22:00' (no whitespace around the separator)
+        re.compile(r'(\d+)[h:](\d+)[-:](\d+)[h:](\d+)', re.IGNORECASE),
+        # '6h30 - 14h30' or '6:30 - 14:30' (optional whitespace around '-')
+        re.compile(r'(\d+)[h:](\d+)\s*-\s*(\d+)[h:](\d+)', re.IGNORECASE),
+    )
+
+    # Returned when a shift string cannot be parsed.
+    # NOTE(review): the duration is 8 although 08:00-17:00 spans 9 hours;
+    # presumably a one-hour break is excluded -- confirm with the data owner.
+    _DEFAULT_SHIFT = {
+        'start_hour': 8,
+        'start_minute': 0,
+        'end_hour': 17,
+        'end_minute': 0,
+        'shift_duration': 8,
+        'is_morning': 1,
+        'is_afternoon': 0,
+        'is_evening': 0,
+    }
+
+    def __init__(self):
+        self.model = None              # fitted regressor, set by train_model
+        self.scaler = StandardScaler()
+        self.building_data = None      # DataFrame set by load_data
+        self.shift_data = None         # DataFrame set by load_data
+        self.feature_columns = None    # column order used for fit and predict
+
+    def load_data(self, building_file='namoi.csv', shift_file='calamviec.xlsx'):
+        """Read the building CSV and the shift Excel file.
+
+        Parameters
+        ----------
+        building_file : str
+            Path to the CSV with one row per building.
+        shift_file : str
+            Path to the Excel workbook with one row per (building, shift).
+
+        Returns
+        -------
+        tuple
+            ``(building_data, shift_data)`` as pandas DataFrames; both are also
+            stored on the instance.
+        """
+        print("📂 Đang đọc dữ liệu...")
+
+        self.building_data = pd.read_csv(building_file, encoding='utf-8')
+        print(f"✅ Đã đọc {len(self.building_data)} tòa nhà")
+
+        self.shift_data = pd.read_excel(shift_file)
+        print(f"✅ Đã đọc {len(self.shift_data)} records ca làm việc")
+
+        return self.building_data, self.shift_data
+
+    def extract_shift_features(self, shift_str):
+        """Parse a shift string such as '6h30-14h30' into numeric features.
+
+        Parameters
+        ----------
+        shift_str : object
+            Shift description; it is converted with ``str()`` before parsing.
+
+        Returns
+        -------
+        dict
+            Keys ``start_hour``, ``start_minute``, ``end_hour``,
+            ``end_minute``, ``shift_duration`` (hours) and the 0/1 flags
+            ``is_morning`` (start 6-11), ``is_afternoon`` (start 12-17),
+            ``is_evening`` (start 18-23). A copy of the default 08:00-17:00
+            shift is returned when no pattern matches.
+        """
+        text = str(shift_str).strip()
+
+        for pattern in self._SHIFT_PATTERNS:
+            match = pattern.search(text)
+            if match is None:
+                continue
+
+            start_hour, start_min, end_hour, end_min = map(int, match.groups())
+            start_time = start_hour + start_min / 60
+            end_time = end_hour + end_min / 60
+
+            # A shift that ends "before" it starts crosses midnight.
+            if end_time < start_time:
+                end_time += 24
+
+            return {
+                'start_hour': start_hour,
+                'start_minute': start_min,
+                'end_hour': end_hour,
+                'end_minute': end_min,
+                'shift_duration': end_time - start_time,
+                'is_morning': 1 if 6 <= start_hour < 12 else 0,
+                'is_afternoon': 1 if 12 <= start_hour < 18 else 0,
+                'is_evening': 1 if 18 <= start_hour < 24 else 0,
+            }
+
+        # Hand out a copy so callers cannot mutate the shared default.
+        return dict(self._DEFAULT_SHIFT)
+
+    def prepare_features(self):
+        """Join shift rows with building rows and build the training matrix.
+
+        Returns
+        -------
+        tuple
+            ``(X, y, df)``: the feature frame (missing values replaced by 0),
+            the ``'Number'`` target column, and the full merged frame.
+
+        Raises
+        ------
+        RuntimeError
+            If ``load_data`` has not been called yet.
+        """
+        if self.building_data is None or self.shift_data is None:
+            raise RuntimeError("Chưa có dữ liệu, hãy gọi load_data() trước")
+
+        print("\n🔧 Chuẩn bị features...")
+
+        # Left join keeps every shift row even if its building is unknown;
+        # such rows get 0 for all building features via fillna below.
+        df = pd.merge(self.shift_data, self.building_data,
+                      on='Mã địa điểm', how='left')
+
+        # Reuse df's index so the column-wise concat below aligns row by row.
+        parsed = df['Ca'].apply(self.extract_shift_features)
+        shift_features_df = pd.DataFrame(parsed.tolist(), index=df.index)
+
+        # Number of distinct shifts defined for each building.
+        shifts_per_building = (
+            self.shift_data.groupby('Mã địa điểm')['Ca'].nunique().to_dict()
+        )
+        df['total_shifts_per_building'] = df['Mã địa điểm'].map(shifts_per_building)
+
+        df = pd.concat([df, shift_features_df], axis=1)
+
+        feature_cols = [
+            # Building features
+            'Tổng Giờ hoạt động của khách hàng mỗi tuần',
+            'Lưu lượng KH hoạt động ngày tại Tòa tháp',
+            'Diện tích ngoại cảnh Tòa tháp (m2)',
+            'Số tòa tháp',
+            'Số tầng nổi',
+            'Số tầng hầm',
+            'Tầng hầm (m2)',
+            'Sàn Sảnh (m2)',
+            'Sàn Hành lang (m2)',
+            'Sàn WC (m2)',
+            'Sàn Phòng (m2) ',  # trailing space matches the CSV header
+            'Thảm (m2)',
+            'Kính (m2)',
+            'Thang máy',
+            'Thang bộ',
+            'total_shifts_per_building',
+            # Shift features
+            'start_hour',
+            'shift_duration',
+            'is_morning',
+            'is_afternoon',
+            'is_evening',
+        ]
+
+        X = df[feature_cols].fillna(0)
+        y = df['Number']
+
+        self.feature_columns = feature_cols
+
+        print(f"✅ Đã chuẩn bị {len(feature_cols)} features")
+        print(f"📊 Dataset shape: {X.shape}")
+
+        return X, y, df
+
+    def train_model(self, X, y, model_type='random_forest'):
+        """Fit a regressor on an 80/20 split and report its metrics.
+
+        Parameters
+        ----------
+        X : pandas.DataFrame
+            Feature matrix from ``prepare_features``.
+        y : pandas.Series
+            Target head count.
+        model_type : str
+            ``'random_forest'`` or ``'gradient_boosting'``; any other value
+            selects Ridge regression.
+
+        Returns
+        -------
+        dict
+            ``train_rmse``, ``test_rmse``, ``train_mae``, ``test_mae``,
+            ``train_r2``, ``test_r2`` and ``cv_rmse`` (NaN when the training
+            split is too small to cross-validate).
+        """
+        print(f"\n🤖 Training model: {model_type}...")
+
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42
+        )
+
+        # Fit the scaler on the training split only, then reuse it everywhere.
+        X_train_scaled = self.scaler.fit_transform(X_train)
+        X_test_scaled = self.scaler.transform(X_test)
+
+        if model_type == 'random_forest':
+            self.model = RandomForestRegressor(
+                n_estimators=100,
+                max_depth=10,
+                min_samples_split=2,
+                min_samples_leaf=1,
+                random_state=42
+            )
+        elif model_type == 'gradient_boosting':
+            self.model = GradientBoostingRegressor(
+                n_estimators=100,
+                learning_rate=0.1,
+                max_depth=5,
+                random_state=42
+            )
+        else:  # ridge
+            self.model = Ridge(alpha=1.0)
+
+        self.model.fit(X_train_scaled, y_train)
+
+        y_pred_train = self.model.predict(X_train_scaled)
+        y_pred_test = self.model.predict(X_test_scaled)
+
+        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+        train_mae = mean_absolute_error(y_train, y_pred_train)
+        test_mae = mean_absolute_error(y_test, y_pred_test)
+        train_r2 = r2_score(y_train, y_pred_train)
+        test_r2 = r2_score(y_test, y_pred_test)
+
+        print(f"\n📈 Kết quả đánh giá:")
+        print(f"Train RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}, R²: {train_r2:.3f}")
+        print(f"Test RMSE:  {test_rmse:.2f}, MAE: {test_mae:.2f}, R²: {test_r2:.3f}")
+
+        # Small dataset: use at most 3 folds. K-fold needs at least 2 splits,
+        # so a one-row training set skips cross-validation instead of raising.
+        n_folds = min(3, len(X_train))
+        if n_folds >= 2:
+            cv_scores = cross_val_score(self.model, X_train_scaled, y_train,
+                                        cv=n_folds,
+                                        scoring='neg_mean_squared_error')
+            cv_rmse = np.sqrt(-cv_scores.mean())
+            print(f"Cross-validation RMSE: {cv_rmse:.2f} (+/- {np.sqrt(-cv_scores).std():.2f})")
+        else:
+            cv_rmse = float('nan')
+            print("Cross-validation RMSE: n/a (cần ít nhất 2 mẫu train)")
+
+        return {
+            'train_rmse': train_rmse,
+            'test_rmse': test_rmse,
+            'train_mae': train_mae,
+            'test_mae': test_mae,
+            'train_r2': train_r2,
+            'test_r2': test_r2,
+            'cv_rmse': cv_rmse
+        }
+
+    def predict_staff(self, building_code, shift_time):
+        """Predict the head count for one building and one shift.
+
+        Parameters
+        ----------
+        building_code : str
+            Location code (e.g. '559-1').
+        shift_time : str
+            Shift description (e.g. '6h00:14h00').
+
+        Returns
+        -------
+        int
+            Predicted number of staff, rounded and never below 1.
+
+        Raises
+        ------
+        RuntimeError
+            If no model has been trained yet.
+        ValueError
+            If ``building_code`` is not present in the building data.
+        """
+        if self.model is None or self.feature_columns is None:
+            raise RuntimeError("Model chưa được train, hãy gọi train_model() trước")
+
+        building_info = self.building_data[
+            self.building_data['Mã địa điểm'] == building_code
+        ]
+
+        if len(building_info) == 0:
+            raise ValueError(f"Không tìm thấy tòa nhà với mã: {building_code}")
+
+        shift_features = self.extract_shift_features(shift_time)
+
+        # Number of distinct shifts recorded for this building.
+        same_building = self.shift_data['Mã địa điểm'] == building_code
+        total_shifts = self.shift_data[same_building]['Ca'].nunique()
+
+        input_data = {}
+        for col in self.feature_columns:
+            if col == 'total_shifts_per_building':
+                value = total_shifts
+            elif col in building_info.columns:
+                value = building_info[col].values[0]
+            elif col in shift_features:
+                value = shift_features[col]
+            else:
+                value = 0
+            # Training replaced missing values with 0; do the same here so
+            # the model never receives NaN.
+            input_data[col] = 0 if pd.isna(value) else value
+
+        # Pin the column order to the one the scaler and model were fitted on.
+        X_input = pd.DataFrame([input_data], columns=self.feature_columns)
+        X_input_scaled = self.scaler.transform(X_input)
+
+        prediction = self.model.predict(X_input_scaled)[0]
+
+        # Round to whole people; a shift always needs at least one person.
+        return max(1, int(round(float(prediction))))
+
+    def show_feature_importance(self):
+        """Print the ten most important features of the fitted model.
+
+        Prints a notice instead when the model exposes no
+        ``feature_importances_`` attribute (e.g. Ridge, or no model yet).
+        """
+        if not hasattr(self.model, 'feature_importances_'):
+            print("\n⚠️  Model không hỗ trợ feature importance")
+            return
+
+        print("\n📊 Feature Importance (Top 10):")
+        importances = pd.DataFrame({
+            'feature': self.feature_columns,
+            'importance': self.model.feature_importances_
+        }).sort_values('importance', ascending=False)
+
+        for row in importances.head(10).itertuples(index=False):
+            print(f"  {row.feature}: {row.importance:.4f}")
+
+
+def main():
+    """Run the end-to-end demo: load data, train, and print sample predictions.
+
+    Returns
+    -------
+    StaffPredictor
+        The trained predictor, so callers can issue further predictions.
+    """
+
+    print("="*60)
+    print("🏢 HỆ THỐNG DỰ ĐOÁN SỐ LƯỢNG NHÂN SỰ THEO CA LÀM VIỆC")
+    print("="*60)
+
+    predictor = StaffPredictor()
+
+    building_df, shift_df = predictor.load_data()
+
+    # Codes are passed through str() so the join also works when the column
+    # is read as a non-string dtype.
+    building_codes = ', '.join(map(str, building_df['Mã địa điểm'].unique()))
+
+    print("\n📋 Thông tin dữ liệu:")
+    print(f"  - Số tòa nhà: {len(building_df)}")
+    print(f"  - Số records ca làm việc: {len(shift_df)}")
+    print(f"  - Các tòa nhà: {building_codes}")
+
+    # Only X and y are needed here; the merged frame is discarded.
+    X, y, _ = predictor.prepare_features()
+
+    predictor.train_model(X, y, model_type='random_forest')
+
+    predictor.show_feature_importance()
+
+    print("\n" + "="*60)
+    print("🎯 DEMO DỰ ĐOÁN")
+    print("="*60)
+
+    # Sample (building code, shift) pairs to predict.
+    test_cases = [
+        ('559-1', '6h00:14h00'),
+        ('618-1', '14h00:22h00'),
+        ('283-1', '6h00:14h00'),
+    ]
+
+    for building_code, shift_time in test_cases:
+        try:
+            predicted_staff = predictor.predict_staff(building_code, shift_time)
+        except ValueError as err:
+            # predict_staff raises ValueError for a building code missing from
+            # the data; report it and continue with the remaining cases.
+            print(f"\n⚠️  {err}")
+            continue
+        print(f"\n🏢 Tòa nhà: {building_code}")
+        print(f"⏰ Ca làm việc: {shift_time}")
+        print(f"👥 Số nhân sự dự đoán: {predicted_staff} người")
+
+    print("\n" + "="*60)
+    print("✅ Hoàn thành!")
+    print("="*60)
+
+    return predictor
+
+
+if __name__ == "__main__":
+    predictor = main()
+    
+    # The returned predictor can be used for further predictions, e.g.:
+    # predictor.predict_staff('559-1', '14h00:22h00')
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,7 @@
+pandas>=1.3.0
+numpy>=1.21.0
+scikit-learn>=1.0.0
+openpyxl>=3.0.0
+matplotlib>=3.4.0
+seaborn>=0.11.0
+jupyter>=1.0.0
--- a/test_full_system.py
+++ b/test_full_system.py
@ -0,0 +1,150 @@
+"""
+Script demo đầy đủ - Test các chức năng của hệ thống
+"""
+
+from predict_staff import StaffPredictor
+import pandas as pd
+
+def test_all_features():
+    """Exercise the whole pipeline and export the predictions.
+
+    Loads the data, trains a random forest, predicts every building against a
+    fixed list of shifts, prints summary statistics, compares the first ten
+    real shift rows with their predictions, and writes two CSV files.
+
+    Returns
+    -------
+    tuple
+        ``(predictor, pred_df)``: the trained ``StaffPredictor`` and a
+        DataFrame with one row per successful prediction.
+    """
+
+    print("="*70)
+    print("🧪 KIỂM TRA TOÀN BỘ HỆ THỐNG")
+    print("="*70)
+
+    # 1. Create the predictor and load the data
+    print("\n1️⃣ KHỞI TẠO VÀ LOAD DỮ LIỆU")
+    print("-"*70)
+    predictor = StaffPredictor()
+    building_df, shift_df = predictor.load_data()
+
+    # 2. Prepare features (the merged frame is not needed here)
+    print("\n2️⃣ CHUẨN BỊ FEATURES")
+    print("-"*70)
+    X, y, _ = predictor.prepare_features()
+
+    # 3. Train the model
+    print("\n3️⃣ TRAIN MODEL")
+    print("-"*70)
+    predictor.train_model(X, y, model_type='random_forest')
+
+    # 4. Feature importance
+    print("\n4️⃣ FEATURE IMPORTANCE")
+    print("-"*70)
+    predictor.show_feature_importance()
+
+    # 5. Predict every building against every common shift
+    print("\n5️⃣ DỰ ĐOÁN CHO TẤT CẢ TÒA NHÀ")
+    print("-"*70)
+
+    available_buildings = building_df['Mã địa điểm'].tolist()
+    common_shifts = [
+        '6h00:14h00',
+        '14h00:22h00', 
+        '6h30-14h30',
+        '13h00-21h00',
+        '07:00-17:00',
+        '8:00-17:00'
+    ]
+
+    all_predictions = []
+
+    for building in available_buildings:
+        print(f"\n🏢 Tòa nhà: {building}")
+        for shift in common_shifts:
+            try:
+                pred = predictor.predict_staff(building, shift)
+                print(f"  ⏰ {shift:20s} → 👥 {pred:2d} người")
+                all_predictions.append({
+                    'Mã địa điểm': building,
+                    'Ca làm việc': shift,
+                    'Số nhân sự dự đoán': pred
+                })
+            except Exception as e:
+                print(f"  ⏰ {shift:20s} → ❌ Lỗi: {e}")
+
+    # 6. Summary report
+    print("\n6️⃣ BÁO CÁO TỔNG HỢP")
+    print("-"*70)
+
+    # Name the columns explicitly so the frame keeps its schema even when no
+    # prediction succeeded.
+    pred_df = pd.DataFrame(
+        all_predictions,
+        columns=['Mã địa điểm', 'Ca làm việc', 'Số nhân sự dự đoán']
+    )
+
+    if pred_df.empty:
+        print("\n⚠️  Không có dự đoán nào thành công, bỏ qua thống kê")
+    else:
+        # Statistics per building
+        print("\n📊 Tổng nhân sự cần thiết cho mỗi tòa (tất cả các ca):")
+        building_stats = pred_df.groupby('Mã địa điểm')['Số nhân sự dự đoán'].agg([
+            ('Tổng', 'sum'),
+            ('Trung bình', 'mean'),
+            ('Min', 'min'),
+            ('Max', 'max')
+        ]).round(1)
+        print(building_stats)
+
+        # Statistics per shift
+        print("\n📊 Tổng nhân sự cần thiết cho mỗi ca (tất cả các tòa):")
+        shift_stats = pred_df.groupby('Ca làm việc')['Số nhân sự dự đoán'].agg([
+            ('Tổng', 'sum'),
+            ('Trung bình', 'mean'),
+            ('Min', 'min'),
+            ('Max', 'max')
+        ]).round(1)
+        print(shift_stats)
+
+    # 7. Compare with real data
+    print("\n7️⃣ SO SÁNH VỚI DỮ LIỆU THỰC TẾ")
+    print("-"*70)
+
+    # First ten rows of the real shift data
+    sample_real = shift_df.head(10)
+
+    print(f"\n{'Mã địa điểm':15s} {'Ca làm việc':25s} {'Thực tế':10s} {'Dự đoán':10s} {'Chênh lệch':12s}")
+    print("-"*75)
+
+    for _, row in sample_real.iterrows():
+        building = row['Mã địa điểm']
+        shift = row['Ca']
+        real_staff = row['Number']
+
+        try:
+            pred_staff = predictor.predict_staff(building, shift)
+            diff = pred_staff - real_staff
+            diff_pct = (diff / real_staff * 100) if real_staff > 0 else 0
+
+            print(f"{building:15s} {shift:25s} {real_staff:10d} {pred_staff:10d} "
+                  f"{diff:+5d} ({diff_pct:+.1f}%)")
+        except Exception as e:
+            # The failure may come from the 's'/'d' format specs above (e.g. a
+            # missing cell read as float NaN), so format through str() here to
+            # make sure the handler itself cannot raise.
+            print(f"{str(building):15s} {str(shift):25s} {str(real_staff):>10s} "
+                  f"{'ERROR':10s} ({e})")
+
+    # 8. Export results
+    print("\n8️⃣ XUẤT KẾT QUẢ")
+    print("-"*70)
+
+    output_file = 'ket_qua_du_doan_day_du.csv'
+    pred_df.to_csv(output_file, index=False, encoding='utf-8-sig')
+    print(f"✅ Đã xuất {len(pred_df)} dự đoán ra file: {output_file}")
+
+    # Building x shift matrix; skipped when there is nothing to pivot.
+    if not pred_df.empty:
+        pivot_df = pred_df.pivot_table(
+            index='Mã địa điểm',
+            columns='Ca làm việc',
+            values='Số nhân sự dự đoán',
+            aggfunc='mean'
+        ).round(0)
+
+        pivot_file = 'bang_nhan_su_theo_toa_ca.csv'
+        pivot_df.to_csv(pivot_file, encoding='utf-8-sig')
+        print(f"✅ Đã xuất ma trận pivot ra file: {pivot_file}")
+
+    print("\n" + "="*70)
+    print("✅ HOÀN THÀNH KIỂM TRA TOÀN BỘ HỆ THỐNG!")
+    print("="*70)
+
+    return predictor, pred_df
+
+
+if __name__ == "__main__":
+    # Run the full check and keep the results available for interactive use.
+    predictor, predictions = test_all_features()
+    
+    # Print usage hints for the two returned objects.
+    print("\n💡 Bạn có thể sử dụng predictor để dự đoán:")
+    print("   predictor.predict_staff('559-1', '14h00:22h00')")
+    print("\n💡 Xem kết quả dự đoán:")
+    print("   predictions.head()")