""" Example: Using TextFeatureExtractor for Staff Prediction ========================================================= This script demonstrates how to use extract_text_features.py to extract TF-IDF+SVD features and train a prediction model. Run this script to see a complete example workflow. """ import pandas as pd import numpy as np from extract_text_features import TextFeatureExtractor, extract_features_from_dataframe from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeRegressor from sklearn.metrics import r2_score, mean_absolute_error import pickle import os def example_1_basic_usage(): """Example 1: Basic text feature extraction""" print("=" * 80) print("EXAMPLE 1: BASIC USAGE") print("=" * 80) # Sample Vietnamese task texts texts = [ "Kiểm tra hệ thống điện tòa nhà A định kỳ", "Bảo trì thang máy tầng 5 và kiểm tra an toàn", "Sửa chữa điều hòa phòng họp B tầng 3", "Vệ sinh kính và kiểm tra hệ thống chiếu sáng", "Bảo trì máy phát điện dự phòng" ] print(f"\nInput: {len(texts)} task descriptions") print(f"Sample: '{texts[0]}'") # Initialize extractor extractor = TextFeatureExtractor( max_features=20, # Small for demo n_components=5 # Small for demo ) # Fit and transform features = extractor.fit_transform(texts) print(f"\nOutput shape: {features.shape}") print(f"Feature names: {extractor.get_feature_names()}") print(f"\nFirst sample features:") print(features[0]) # Show summary print(f"\nExtractor summary:") for key, value in extractor.get_summary().items(): print(f" {key}: {value}") print("\n✅ Example 1 complete!\n") def example_2_dataframe_extraction(): """Example 2: Extract features from DataFrame""" print("=" * 80) print("EXAMPLE 2: DATAFRAME EXTRACTION") print("=" * 80) # Create sample DataFrame df = pd.DataFrame({ 'ma_dia_diem': ['A01', 'A02', 'A03', 'A04', 'A05'], 'all_task_normal': [ 'Kiểm tra điện', 'Bảo trì thang máy', 'Sửa điều hòa', 'Vệ sinh kính', 'Bảo trì máy phát' ], 'all_task_dinhky': [ 'Định kỳ hàng tháng', 'Định kỳ hàng tuần', 'Khẩn cấp', 'Hàng ngày', 'Định kỳ quý' ], 'so_luong': [5, 3, 2, 8, 4] }) print(f"\nInput DataFrame shape: {df.shape}") print(df) # Extract features text_features_df, extractor = extract_features_from_dataframe( df, text_columns=['all_task_normal', 'all_task_dinhky'], fit=True ) print(f"\nExtracted features shape: {text_features_df.shape}") print(f"\nSample features:") print(text_features_df.head()) print("\n✅ Example 2 complete!\n") def example_3_save_and_load(): """Example 3: Save and load extractor""" print("=" * 80) print("EXAMPLE 3: SAVE AND LOAD") print("=" * 80) # Training data train_texts = [ "Kiểm tra hệ thống điện", "Bảo trì thang máy", "Sửa chữa điều hòa" ] # Fit extractor print("\n1. Training extractor...") extractor = TextFeatureExtractor(max_features=10, n_components=3) train_features = extractor.fit_transform(train_texts) print(f" Train features shape: {train_features.shape}") # Save save_path = 'example_extractor.pkl' extractor.save(save_path) print(f" Saved to: {save_path}") # Load print("\n2. Loading extractor...") loaded_extractor = TextFeatureExtractor.load(save_path) # Use loaded extractor on new data print("\n3. 

def example_4_full_pipeline():
    """Example 4: Complete ML pipeline with text features"""
    print("=" * 80)
    print("EXAMPLE 4: FULL ML PIPELINE")
    print("=" * 80)

    # Create sample dataset
    np.random.seed(42)
    n_samples = 100

    tasks_pool = [
        "Kiểm tra hệ thống điện",
        "Bảo trì thang máy",
        "Sửa chữa điều hòa",
        "Vệ sinh kính",
        "Bảo trì máy phát điện",
        "Kiểm tra an toàn",
        "Sửa chữa ống nước",
        "Bảo trì hệ thống PCCC"
    ]

    df = pd.DataFrame({
        'all_task_normal': [np.random.choice(tasks_pool) for _ in range(n_samples)],
        'all_task_dinhky': [np.random.choice(['Hàng ngày', 'Hàng tuần', 'Hàng tháng', 'Quý'])
                            for _ in range(n_samples)],
        'dien_tich': np.random.uniform(100, 500, n_samples),
        'so_tang': np.random.randint(5, 30, n_samples),
        'so_luong': np.random.randint(1, 10, n_samples)
    })

    print(f"\n📊 Dataset: {df.shape}")
    print(f"   Target (so_luong): mean={df['so_luong'].mean():.2f}, std={df['so_luong'].std():.2f}")

    # === TRAINING PHASE ===
    print("\n1️⃣ TRAINING PHASE")
    print("-" * 80)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"\n   Train: {len(train_df)}, Test: {len(test_df)}")

    # Extract text features
    print("\n   Extracting text features...")
    text_features_train, extractor = extract_features_from_dataframe(
        train_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True
    )

    # Prepare numeric features
    numeric_cols = ['dien_tich', 'so_tang']
    X_numeric_train = train_df[numeric_cols].reset_index(drop=True)

    # Combine features (numeric columns first, then text SVD components)
    X_train = pd.concat([X_numeric_train, text_features_train], axis=1)
    y_train = train_df['so_luong'].values

    print(f"\n   Combined features: {X_train.shape}")
    print(f"   - Numeric: {len(numeric_cols)}")
    print(f"   - Text SVD: {text_features_train.shape[1]}")

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train model
    print("\n   Training model...")
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate on training set
    y_train_pred = model.predict(X_train_scaled)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    print("\n   Training metrics:")
    print(f"     R²:  {train_r2:.4f}")
    print(f"     MAE: {train_mae:.4f}")

    # === INFERENCE PHASE ===
    print("\n2️⃣ INFERENCE PHASE")
    print("-" * 80)

    # Extract text features (transform only, no fitting!)
    print("\n   Extracting text features (transform only)...")
    text_features_test, _ = extract_features_from_dataframe(
        test_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        extractor=extractor,
        fit=False  # Important! Refitting on test data would leak information.
    )

    # Prepare numeric features
    X_numeric_test = test_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_test = pd.concat([X_numeric_test, text_features_test], axis=1)
    y_test = test_df['so_luong'].values

    # Scale features with the scaler fitted on the training set
    X_test_scaled = scaler.transform(X_test)

    # Predict
    print("\n   Making predictions...")
    y_test_pred = model.predict(X_test_scaled)

    # Evaluate
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    print("\n   Test metrics:")
    print(f"     R²:  {test_r2:.4f}")
    print(f"     MAE: {test_mae:.4f}")

    # Show sample predictions
    print("\n   Sample predictions:")
    results_df = pd.DataFrame({
        'actual': y_test[:5],
        'predicted': y_test_pred[:5],
        'error': y_test[:5] - y_test_pred[:5]
    })
    print(results_df.to_string(index=False))

    # === FEATURE IMPORTANCE ===
    print("\n3️⃣ FEATURE IMPORTANCE")
    print("-" * 80)

    importances = model.feature_importances_
    feature_names = X_train.columns.tolist()

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("\n   Top 10 important features:")
    print(importance_df.head(10).to_string(index=False))

    # Aggregate by feature type (numeric columns come first in X_train)
    n_numeric = len(numeric_cols)
    numeric_importance = importances[:n_numeric].sum()
    text_importance = importances[n_numeric:].sum()
    total = numeric_importance + text_importance

    print("\n   Feature type contribution:")
    print(f"     Numeric features: {numeric_importance:.4f} ({numeric_importance / total * 100:.1f}%)")
    print(f"     Text features:    {text_importance:.4f} ({text_importance / total * 100:.1f}%)")

    print("\n✅ Example 4 complete!\n")
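
# --- Sketch: persisting the full pipeline from Example 4 ---------------------
# Example 4 leaves three fitted artifacts (extractor, scaler, model) in memory;
# all three are needed at inference time. A minimal sketch of one way to bundle
# them, assuming plain pickle is acceptable for your deployment; the helper
# names and the default path 'staff_pipeline.pkl' are hypothetical.
def save_pipeline(extractor, scaler, model, path='staff_pipeline.pkl'):
    """Pickle all fitted artifacts together so inference stays consistent."""
    with open(path, 'wb') as f:
        pickle.dump({'extractor': extractor, 'scaler': scaler, 'model': model}, f)


def load_pipeline(path='staff_pipeline.pkl'):
    """Restore the artifact dict written by save_pipeline()."""
    with open(path, 'rb') as f:
        return pickle.load(f)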

def example_5_top_tfidf_terms():
    """Example 5: Analyze top TF-IDF terms"""
    print("=" * 80)
    print("EXAMPLE 5: TOP TF-IDF TERMS ANALYSIS")
    print("=" * 80)

    # Sample task texts
    texts = [
        "Kiểm tra hệ thống điện tòa nhà",
        "Bảo trì thang máy và kiểm tra an toàn",
        "Sửa chữa hệ thống điều hòa không khí",
        "Kiểm tra và vệ sinh kính tòa nhà",
        "Bảo trì máy phát điện dự phòng",
        "Kiểm tra hệ thống PCCC định kỳ",
        "Sửa chữa ống nước và hệ thống cấp thoát",
        "Bảo trì hệ thống thang máy tòa nhà"
    ]

    print(f"\nInput: {len(texts)} task descriptions")

    # Fit extractor
    extractor = TextFeatureExtractor(max_features=50, n_components=10)
    extractor.fit(texts)

    # Get top TF-IDF features
    print("\n📋 Top 20 TF-IDF terms (by document frequency):")
    top_features = extractor.get_top_tfidf_features(top_n=20)
    print(top_features.to_string(index=False))

    # Get summary
    summary = extractor.get_summary()
    print("\n📊 Summary:")
    print(f"   Actual TF-IDF features: {summary['actual_tfidf_features']}")
    print(f"   SVD components: {summary['n_components']}")
    print(f"   Explained variance: {summary['explained_variance'] * 100:.2f}%")

    print("\n✅ Example 5 complete!\n")


def main():
    """Run all examples"""
    print("\n" + "=" * 80)
    print("TEXT FEATURE EXTRACTION - EXAMPLES")
    print("=" * 80 + "\n")

    try:
        example_1_basic_usage()
        example_2_dataframe_extraction()
        example_3_save_and_load()
        example_4_full_pipeline()
        example_5_top_tfidf_terms()

        print("\n" + "=" * 80)
        print("✅ ALL EXAMPLES COMPLETED SUCCESSFULLY!")
        print("=" * 80 + "\n")

        print("Next steps:")
        print("  1. Try with your own dataset: FINAL_DATASET_WITH_TEXT_BACKUP_20260105_213507.xlsx")
        print("  2. Adjust hyperparameters: max_features, n_components")
        print("  3. Integrate into your ML pipeline")
        print("  4. Save the extractor for production use")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()
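
# --- Appendix sketch: choosing n_components (not executed by main()) ---------
# The "Next steps" above suggest tuning max_features and n_components. A
# minimal sketch of one way to compare candidate settings via the explained
# variance that get_summary() reports (see Example 5); the helper name and the
# candidate values are hypothetical, and each candidate must stay below the
# number of TF-IDF features actually produced.
def sweep_n_components(texts, candidates=(5, 10, 20)):
    """Fit one extractor per candidate and report its explained variance."""
    for n in candidates:
        extractor = TextFeatureExtractor(max_features=50, n_components=n)
        extractor.fit(texts)
        variance = extractor.get_summary()['explained_variance']
        print(f"n_components={n}: explained variance {variance * 100:.2f}%")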