predict_caLamviec_nhansu/example_text_features.py

"""
Example: Using TextFeatureExtractor for Staff Prediction
=========================================================
This script demonstrates how to use extract_text_features.py
to extract TF-IDF+SVD features and train a prediction model.
Run this script to see a complete example workflow.
"""
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

from extract_text_features import TextFeatureExtractor, extract_features_from_dataframe


def example_1_basic_usage():
    """Example 1: Basic text feature extraction"""
    print("=" * 80)
    print("EXAMPLE 1: BASIC USAGE")
    print("=" * 80)

    # Sample Vietnamese task texts
    texts = [
        "Kiểm tra hệ thống điện tòa nhà A định kỳ",
        "Bảo trì thang máy tầng 5 và kiểm tra an toàn",
        "Sửa chữa điều hòa phòng họp B tầng 3",
        "Vệ sinh kính và kiểm tra hệ thống chiếu sáng",
        "Bảo trì máy phát điện dự phòng"
    ]
    print(f"\nInput: {len(texts)} task descriptions")
    print(f"Sample: '{texts[0]}'")

    # Initialize extractor
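    # What the two knobs below control (assumed from the TF-IDF+SVD design
    # named in the module docstring): max_features caps the TF-IDF vocabulary
    # size, and n_components is the number of SVD dimensions kept, so each
    # text becomes a vector with n_components entries.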
    extractor = TextFeatureExtractor(
        max_features=20,  # Small for demo
        n_components=5    # Small for demo
    )

    # Fit and transform
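    # fit_transform() learns the vocabulary and the SVD projection from these
    # texts and returns the reduced matrix: one row per input text.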
    features = extractor.fit_transform(texts)
    print(f"\nOutput shape: {features.shape}")
    print(f"Feature names: {extractor.get_feature_names()}")
    print(f"\nFirst sample features:")
    print(features[0])

    # Show summary
    print(f"\nExtractor summary:")
    for key, value in extractor.get_summary().items():
        print(f" {key}: {value}")
    print("\n✅ Example 1 complete!\n")


def example_2_dataframe_extraction():
    """Example 2: Extract features from DataFrame"""
    print("=" * 80)
    print("EXAMPLE 2: DATAFRAME EXTRACTION")
    print("=" * 80)

    # Create sample DataFrame
    df = pd.DataFrame({
        'ma_dia_diem': ['A01', 'A02', 'A03', 'A04', 'A05'],
        'all_task_normal': [
            'Kiểm tra điện',
            'Bảo trì thang máy',
            'Sửa điều hòa',
            'Vệ sinh kính',
            'Bảo trì máy phát'
        ],
        'all_task_dinhky': [
            'Định kỳ hàng tháng',
            'Định kỳ hàng tuần',
            'Khẩn cấp',
            'Hàng ngày',
            'Định kỳ quý'
        ],
        'so_luong': [5, 3, 2, 8, 4]
    })
    print(f"\nInput DataFrame shape: {df.shape}")
    print(df)

    # Extract features
    text_features_df, extractor = extract_features_from_dataframe(
        df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True
    )
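    # The fitted extractor is returned alongside the features so the same
    # vocabulary/SVD space can be reused on new rows with fit=False
    # (Example 4 does exactly this at inference time).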
print(f"\nExtracted features shape: {text_features_df.shape}")
print(f"\nSample features:")
print(text_features_df.head())
print("\n✅ Example 2 complete!\n")
def example_3_save_and_load():
    """Example 3: Save and load extractor"""
    print("=" * 80)
    print("EXAMPLE 3: SAVE AND LOAD")
    print("=" * 80)

    # Training data
    train_texts = [
        "Kiểm tra hệ thống điện",
        "Bảo trì thang máy",
        "Sửa chữa điều hòa"
    ]

    # Fit extractor
    print("\n1. Training extractor...")
    extractor = TextFeatureExtractor(max_features=10, n_components=3)
    train_features = extractor.fit_transform(train_texts)
    print(f" Train features shape: {train_features.shape}")

    # Save
    save_path = 'example_extractor.pkl'
    extractor.save(save_path)
    print(f" Saved to: {save_path}")

    # Load
    print("\n2. Loading extractor...")
    loaded_extractor = TextFeatureExtractor.load(save_path)

    # Use loaded extractor on new data
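    # transform() (as opposed to fit_transform()) reuses the vocabulary and
    # SVD basis learned during fit; refitting on the new texts would produce
    # an incompatible feature space.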
print("\n3. Using loaded extractor on new data...")
new_texts = ["Vệ sinh kính tầng 5", "Kiểm tra máy phát điện"]
new_features = loaded_extractor.transform(new_texts)
print(f" New features shape: {new_features.shape}")
print(f" Features:\n{new_features}")
# Cleanup
if os.path.exists(save_path):
os.remove(save_path)
print(f"\n Cleaned up: {save_path}")
print("\n✅ Example 3 complete!\n")
def example_4_full_pipeline():
    """Example 4: Complete ML pipeline with text features"""
    print("=" * 80)
    print("EXAMPLE 4: FULL ML PIPELINE")
    print("=" * 80)

    # Create sample dataset
    np.random.seed(42)
    n_samples = 100
    tasks_pool = [
        "Kiểm tra hệ thống điện",
        "Bảo trì thang máy",
        "Sửa chữa điều hòa",
        "Vệ sinh kính",
        "Bảo trì máy phát điện",
        "Kiểm tra an toàn",
        "Sửa chữa ống nước",
        "Bảo trì hệ thống PCCC"
    ]
    df = pd.DataFrame({
        'all_task_normal': [np.random.choice(tasks_pool) for _ in range(n_samples)],
        'all_task_dinhky': [np.random.choice(['Hàng ngày', 'Hàng tuần', 'Hàng tháng', 'Quý'])
                            for _ in range(n_samples)],
        'dien_tich': np.random.uniform(100, 500, n_samples),
        'so_tang': np.random.randint(5, 30, n_samples),
        'so_luong': np.random.randint(1, 10, n_samples)
    })
    print(f"\n📊 Dataset: {df.shape}")
    print(f" Target (so_luong): mean={df['so_luong'].mean():.2f}, std={df['so_luong'].std():.2f}")

    # === TRAINING PHASE ===
    print("\n1️⃣ TRAINING PHASE")
    print("-" * 80)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"\n Train: {len(train_df)}, Test: {len(test_df)}")

    # Extract text features
    print("\n Extracting text features...")
    text_features_train, extractor = extract_features_from_dataframe(
        train_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True
    )

    # Prepare numeric features
    numeric_cols = ['dien_tich', 'so_tang']
    X_numeric_train = train_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_train = pd.concat([X_numeric_train, text_features_train], axis=1)
    y_train = train_df['so_luong'].values
    print(f"\n Combined features: {X_train.shape}")
    print(f" - Numeric: {len(numeric_cols)}")
    print(f" - Text SVD: {text_features_train.shape[1]}")

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
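    # The scaler's column means/stds are learned on the training split only;
    # the same fitted scaler is applied to the test split further down, so
    # both splits live on an identical scale.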
    # Train model
    print("\n Training model...")
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate on training set
    y_train_pred = model.predict(X_train_scaled)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    print(f"\n Training metrics:")
    print(f" R²: {train_r2:.4f}")
    print(f" MAE: {train_mae:.4f}")

    # === INFERENCE PHASE ===
    print("\n2️⃣ INFERENCE PHASE")
    print("-" * 80)

    # Extract text features (transform only, no fitting!)
    print("\n Extracting text features (transform only)...")
    text_features_test, _ = extract_features_from_dataframe(
        test_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        extractor=extractor,
        fit=False  # Important!
    )
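    # With fit=False the extractor only transforms: test texts are projected
    # into the vocabulary/SVD space learned on the training split, keeping
    # train and test features aligned and avoiding leakage of test data into
    # the fit.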
    # Prepare numeric features
    X_numeric_test = test_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_test = pd.concat([X_numeric_test, text_features_test], axis=1)
    y_test = test_df['so_luong'].values

    # Scale features
    X_test_scaled = scaler.transform(X_test)

    # Predict
    print("\n Making predictions...")
    y_test_pred = model.predict(X_test_scaled)

    # Evaluate
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    print(f"\n Test metrics:")
    print(f" R²: {test_r2:.4f}")
    print(f" MAE: {test_mae:.4f}")

    # Show sample predictions
    print(f"\n Sample predictions:")
    results_df = pd.DataFrame({
        'actual': y_test[:5],
        'predicted': y_test_pred[:5],
        'error': y_test[:5] - y_test_pred[:5]
    })
    print(results_df.to_string(index=False))

    # === FEATURE IMPORTANCE ===
    print("\n3️⃣ FEATURE IMPORTANCE")
    print("-" * 80)

    importances = model.feature_importances_
    feature_names = X_train.columns.tolist()
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    print("\n Top 10 important features:")
    print(importance_df.head(10).to_string(index=False))

    # Aggregate by feature type
    n_numeric = len(numeric_cols)
    text_importance = importances[n_numeric:].sum()
    numeric_importance = importances[:n_numeric].sum()
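    # This slicing relies on X_train having been built as [numeric | text] by
    # the pd.concat above, so the first len(numeric_cols) importances belong
    # to the numeric columns and the rest to the SVD text features.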
print(f"\n Feature type contribution:")
print(f" Numeric features: {numeric_importance:.4f} ({numeric_importance/(text_importance+numeric_importance)*100:.1f}%)")
print(f" Text features: {text_importance:.4f} ({text_importance/(text_importance+numeric_importance)*100:.1f}%)")
print("\n✅ Example 4 complete!\n")
def example_5_top_tfidf_terms():
    """Example 5: Analyze top TF-IDF terms"""
    print("=" * 80)
    print("EXAMPLE 5: TOP TF-IDF TERMS ANALYSIS")
    print("=" * 80)

    # Sample task texts
    texts = [
        "Kiểm tra hệ thống điện tòa nhà",
        "Bảo trì thang máy và kiểm tra an toàn",
        "Sửa chữa hệ thống điều hòa không khí",
        "Kiểm tra và vệ sinh kính tòa nhà",
        "Bảo trì máy phát điện dự phòng",
        "Kiểm tra hệ thống PCCC định kỳ",
        "Sửa chữa ống nước và hệ thống cấp thoát",
        "Bảo trì hệ thống thang máy tòa nhà"
    ]
    print(f"\nInput: {len(texts)} task descriptions")

    # Fit extractor
    extractor = TextFeatureExtractor(max_features=50, n_components=10)
    extractor.fit(texts)

    # Get top TF-IDF features
    print("\n📋 Top 20 TF-IDF terms (by document frequency):")
    top_features = extractor.get_top_tfidf_features(top_n=20)
    print(top_features.to_string(index=False))

    # Get summary
    summary = extractor.get_summary()
    print(f"\n📊 Summary:")
    print(f" Actual TF-IDF features: {summary['actual_tfidf_features']}")
    print(f" SVD components: {summary['n_components']}")
    print(f" Explained variance: {summary['explained_variance']*100:.2f}%")
    print("\n✅ Example 5 complete!\n")


def main():
    """Run all examples"""
    print("\n" + "=" * 80)
    print("TEXT FEATURE EXTRACTION - EXAMPLES")
    print("=" * 80 + "\n")
    try:
        example_1_basic_usage()
        example_2_dataframe_extraction()
        example_3_save_and_load()
        example_4_full_pipeline()
        example_5_top_tfidf_terms()
        print("\n" + "=" * 80)
        print("✅ ALL EXAMPLES COMPLETED SUCCESSFULLY!")
        print("=" * 80 + "\n")
        print("Next steps:")
        print(" 1. Try with your own dataset: FINAL_DATASET_WITH_TEXT_BACKUP_20260105_213507.xlsx")
        print(" 2. Adjust hyperparameters: max_features, n_components")
        print(" 3. Integrate into your ML pipeline")
        print(" 4. Save extractor for production use")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()