"""
|
||
Example: Using TextFeatureExtractor for Staff Prediction
|
||
=========================================================
|
||
|
||
This script demonstrates how to use extract_text_features.py
|
||
to extract TF-IDF+SVD features and train a prediction model.
|
||
|
||
Run this script to see a complete example workflow.
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from extract_text_features import TextFeatureExtractor, extract_features_from_dataframe
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.preprocessing import StandardScaler
|
||
from sklearn.tree import DecisionTreeRegressor
|
||
from sklearn.metrics import r2_score, mean_absolute_error
|
||
import pickle
|
||
import os
|
||
|
||
|
||
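# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not part of extract_text_features.py):
# TextFeatureExtractor is assumed here to wrap sklearn's TfidfVectorizer
# followed by TruncatedSVD (classic LSA). The hypothetical helper below shows
# that raw pipeline so the examples are easier to follow; the real class may
# differ in tokenization, defaults, and naming.
# ---------------------------------------------------------------------------
def reference_tfidf_svd(texts, max_features=20, n_components=5):
    """Hedged reference implementation: TF-IDF -> SVD feature matrix."""
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(max_features=max_features)
    X = tfidf.fit_transform(texts)  # sparse matrix, shape (n_docs, n_terms)
    # TruncatedSVD needs n_components strictly below the number of terms
    svd = TruncatedSVD(n_components=min(n_components, X.shape[1] - 1), random_state=42)
    return svd.fit_transform(X)     # dense matrix, shape (n_docs, n_components)

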
def example_1_basic_usage():
    """Example 1: Basic text feature extraction."""
    print("=" * 80)
    print("EXAMPLE 1: BASIC USAGE")
    print("=" * 80)

    # Sample Vietnamese task texts (facility-maintenance work orders)
    texts = [
        "Kiểm tra hệ thống điện tòa nhà A định kỳ",      # periodic electrical inspection, building A
        "Bảo trì thang máy tầng 5 và kiểm tra an toàn",  # 5th-floor elevator maintenance + safety check
        "Sửa chữa điều hòa phòng họp B tầng 3",          # A/C repair, meeting room B, 3rd floor
        "Vệ sinh kính và kiểm tra hệ thống chiếu sáng",  # window cleaning + lighting inspection
        "Bảo trì máy phát điện dự phòng",                # backup generator maintenance
    ]

    print(f"\nInput: {len(texts)} task descriptions")
    print(f"Sample: '{texts[0]}'")

    # Initialize extractor
    extractor = TextFeatureExtractor(
        max_features=20,  # small for demo
        n_components=5,   # small for demo
    )

    # Fit and transform
    features = extractor.fit_transform(texts)

    print(f"\nOutput shape: {features.shape}")
    print(f"Feature names: {extractor.get_feature_names()}")
    print("\nFirst sample features:")
    print(features[0])

    # Show summary
    print("\nExtractor summary:")
    for key, value in extractor.get_summary().items():
        print(f"  {key}: {value}")

    print("\n✅ Example 1 complete!\n")


def example_2_dataframe_extraction():
    """Example 2: Extract features from a DataFrame."""
    print("=" * 80)
    print("EXAMPLE 2: DATAFRAME EXTRACTION")
    print("=" * 80)

    # Create a sample DataFrame (Vietnamese column names:
    # ma_dia_diem = location code, so_luong = staff count)
    df = pd.DataFrame({
        'ma_dia_diem': ['A01', 'A02', 'A03', 'A04', 'A05'],
        'all_task_normal': [       # regular tasks
            'Kiểm tra điện',       # electrical inspection
            'Bảo trì thang máy',   # elevator maintenance
            'Sửa điều hòa',        # A/C repair
            'Vệ sinh kính',        # window cleaning
            'Bảo trì máy phát',    # generator maintenance
        ],
        'all_task_dinhky': [       # recurring (dinh ky = periodic) tasks
            'Định kỳ hàng tháng',  # monthly
            'Định kỳ hàng tuần',   # weekly
            'Khẩn cấp',            # urgent
            'Hàng ngày',           # daily
            'Định kỳ quý',         # quarterly
        ],
        'so_luong': [5, 3, 2, 8, 4],
    })

    print(f"\nInput DataFrame shape: {df.shape}")
    print(df)

    # Extract features
    text_features_df, extractor = extract_features_from_dataframe(
        df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True,
    )

    print(f"\nExtracted features shape: {text_features_df.shape}")
    print("\nSample features:")
    print(text_features_df.head())

    print("\n✅ Example 2 complete!\n")


def example_3_save_and_load():
    """Example 3: Save and load the extractor."""
    print("=" * 80)
    print("EXAMPLE 3: SAVE AND LOAD")
    print("=" * 80)

    # Training data
    train_texts = [
        "Kiểm tra hệ thống điện",  # electrical system inspection
        "Bảo trì thang máy",       # elevator maintenance
        "Sửa chữa điều hòa",       # A/C repair
    ]

    # Fit extractor
    print("\n1. Training extractor...")
    extractor = TextFeatureExtractor(max_features=10, n_components=3)
    train_features = extractor.fit_transform(train_texts)
    print(f"   Train features shape: {train_features.shape}")

    # Save
    save_path = 'example_extractor.pkl'
    extractor.save(save_path)
    print(f"   Saved to: {save_path}")

    # Load
    print("\n2. Loading extractor...")
    loaded_extractor = TextFeatureExtractor.load(save_path)

    # Use the loaded extractor on new data
    print("\n3. Using loaded extractor on new data...")
    new_texts = ["Vệ sinh kính tầng 5", "Kiểm tra máy phát điện"]
    new_features = loaded_extractor.transform(new_texts)
    print(f"   New features shape: {new_features.shape}")
    print(f"   Features:\n{new_features}")

    # Cleanup
    if os.path.exists(save_path):
        os.remove(save_path)
        print(f"\n   Cleaned up: {save_path}")

    print("\n✅ Example 3 complete!\n")


def example_4_full_pipeline():
    """Example 4: Complete ML pipeline with text features."""
    print("=" * 80)
    print("EXAMPLE 4: FULL ML PIPELINE")
    print("=" * 80)

    # Create a sample dataset
    np.random.seed(42)
    n_samples = 100

    tasks_pool = [
        "Kiểm tra hệ thống điện",  # electrical inspection
        "Bảo trì thang máy",       # elevator maintenance
        "Sửa chữa điều hòa",       # A/C repair
        "Vệ sinh kính",            # window cleaning
        "Bảo trì máy phát điện",   # generator maintenance
        "Kiểm tra an toàn",        # safety check
        "Sửa chữa ống nước",       # plumbing repair
        "Bảo trì hệ thống PCCC",   # fire-protection (PCCC) maintenance
    ]

    df = pd.DataFrame({
        'all_task_normal': [np.random.choice(tasks_pool) for _ in range(n_samples)],
        # daily / weekly / monthly / quarterly
        'all_task_dinhky': [np.random.choice(['Hàng ngày', 'Hàng tuần', 'Hàng tháng', 'Quý'])
                            for _ in range(n_samples)],
        'dien_tich': np.random.uniform(100, 500, n_samples),  # floor area
        'so_tang': np.random.randint(5, 30, n_samples),       # number of floors
        'so_luong': np.random.randint(1, 10, n_samples),      # staff count (target)
    })

    print(f"\n📊 Dataset: {df.shape}")
    print(f"   Target (so_luong): mean={df['so_luong'].mean():.2f}, std={df['so_luong'].std():.2f}")

    # === TRAINING PHASE ===
    print("\n1️⃣ TRAINING PHASE")
    print("-" * 80)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"\n   Train: {len(train_df)}, Test: {len(test_df)}")

    # Extract text features (fit on training data only)
    print("\n   Extracting text features...")
    text_features_train, extractor = extract_features_from_dataframe(
        train_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True,
    )

    # Prepare numeric features
    numeric_cols = ['dien_tich', 'so_tang']
    X_numeric_train = train_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_train = pd.concat([X_numeric_train, text_features_train], axis=1)
    y_train = train_df['so_luong'].values

    print(f"\n   Combined features: {X_train.shape}")
    print(f"   - Numeric: {len(numeric_cols)}")
    print(f"   - Text SVD: {text_features_train.shape[1]}")

    # Scale features (trees are scale-invariant, so this mainly keeps the
    # pipeline uniform in case the model is later swapped for a linear one)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train model
    print("\n   Training model...")
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate on the training set
    y_train_pred = model.predict(X_train_scaled)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)

    print("\n   Training metrics:")
    print(f"      R²: {train_r2:.4f}")
    print(f"      MAE: {train_mae:.4f}")

    # === INFERENCE PHASE ===
    print("\n2️⃣ INFERENCE PHASE")
    print("-" * 80)

    # Extract text features (transform only -- refitting on test data
    # would leak test vocabulary into the features)
    print("\n   Extracting text features (transform only)...")
    text_features_test, _ = extract_features_from_dataframe(
        test_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        extractor=extractor,
        fit=False,  # important!
    )

    # Prepare numeric features
    X_numeric_test = test_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_test = pd.concat([X_numeric_test, text_features_test], axis=1)
    y_test = test_df['so_luong'].values

    # Scale features with the scaler fitted on the training data
    X_test_scaled = scaler.transform(X_test)

    # Predict
    print("\n   Making predictions...")
    y_test_pred = model.predict(X_test_scaled)

    # Evaluate
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n   Test metrics:")
    print(f"      R²: {test_r2:.4f}")
    print(f"      MAE: {test_mae:.4f}")

    # Show sample predictions
    print("\n   Sample predictions:")
    results_df = pd.DataFrame({
        'actual': y_test[:5],
        'predicted': y_test_pred[:5],
        'error': y_test[:5] - y_test_pred[:5],
    })
    print(results_df.to_string(index=False))

    # === FEATURE IMPORTANCE ===
    print("\n3️⃣ FEATURE IMPORTANCE")
    print("-" * 80)

    importances = model.feature_importances_
    feature_names = X_train.columns.tolist()

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances,
    }).sort_values('importance', ascending=False)

    print("\n   Top 10 important features:")
    print(importance_df.head(10).to_string(index=False))

    # Aggregate by feature type (numeric columns come first in X_train)
    n_numeric = len(numeric_cols)
    numeric_importance = importances[:n_numeric].sum()
    text_importance = importances[n_numeric:].sum()

    total = numeric_importance + text_importance
    print("\n   Feature type contribution:")
    print(f"      Numeric features: {numeric_importance:.4f} ({numeric_importance / total * 100:.1f}%)")
    print(f"      Text features:    {text_importance:.4f} ({text_importance / total * 100:.1f}%)")

    print("\n✅ Example 4 complete!\n")


def example_5_top_tfidf_terms():
    """Example 5: Analyze top TF-IDF terms."""
    print("=" * 80)
    print("EXAMPLE 5: TOP TF-IDF TERMS ANALYSIS")
    print("=" * 80)

    # Sample task texts
    texts = [
        "Kiểm tra hệ thống điện tòa nhà",           # building electrical inspection
        "Bảo trì thang máy và kiểm tra an toàn",    # elevator maintenance + safety check
        "Sửa chữa hệ thống điều hòa không khí",     # air-conditioning system repair
        "Kiểm tra và vệ sinh kính tòa nhà",         # window inspection and cleaning
        "Bảo trì máy phát điện dự phòng",           # backup generator maintenance
        "Kiểm tra hệ thống PCCC định kỳ",           # periodic fire-protection inspection
        "Sửa chữa ống nước và hệ thống cấp thoát",  # plumbing and drainage repair
        "Bảo trì hệ thống thang máy tòa nhà",       # building elevator system maintenance
    ]

    print(f"\nInput: {len(texts)} task descriptions")

    # Fit extractor
    extractor = TextFeatureExtractor(max_features=50, n_components=10)
    extractor.fit(texts)

    # Get top TF-IDF features
    print("\n📋 Top 20 TF-IDF terms (by document frequency):")
    top_features = extractor.get_top_tfidf_features(top_n=20)
    print(top_features.to_string(index=False))

    # Get summary
    summary = extractor.get_summary()
    print("\n📊 Summary:")
    print(f"   Actual TF-IDF features: {summary['actual_tfidf_features']}")
    print(f"   SVD components: {summary['n_components']}")
    print(f"   Explained variance: {summary['explained_variance'] * 100:.2f}%")

    print("\n✅ Example 5 complete!\n")


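# Hedged aside (hypothetical helper, not an extract_text_features.py API):
# when the explained variance reported in Example 5 looks low, a component
# count can be chosen from the cumulative explained-variance curve of the
# same TF-IDF -> SVD pipeline sketched near the top of this file.
def pick_n_components(texts, target=0.90, max_features=50):
    """Smallest SVD component count whose cumulative explained variance >= target."""
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    X = TfidfVectorizer(max_features=max_features).fit_transform(texts)
    svd = TruncatedSVD(n_components=X.shape[1] - 1, random_state=42)
    svd.fit(X)
    cumulative = np.cumsum(svd.explained_variance_ratio_)
    # first index where the target is reached, +1 to turn it into a count;
    # clamped in case the target is never reached
    return int(min(np.searchsorted(cumulative, target) + 1, len(cumulative)))

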
def main():
    """Run all examples."""
    print("\n" + "=" * 80)
    print("TEXT FEATURE EXTRACTION - EXAMPLES")
    print("=" * 80 + "\n")

    try:
        example_1_basic_usage()
        example_2_dataframe_extraction()
        example_3_save_and_load()
        example_4_full_pipeline()
        example_5_top_tfidf_terms()

        print("\n" + "=" * 80)
        print("✅ ALL EXAMPLES COMPLETED SUCCESSFULLY!")
        print("=" * 80 + "\n")

        print("Next steps:")
        print("  1. Try with your own dataset: FINAL_DATASET_WITH_TEXT_BACKUP_20260105_213507.xlsx")
        print("  2. Adjust hyperparameters: max_features, n_components")
        print("  3. Integrate into your ML pipeline")
        print("  4. Save the extractor for production use")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()