"""
|
||
Example: Using TextFeatureExtractor for Staff Prediction
|
||
=========================================================
|
||
|
||
This script demonstrates how to use extract_text_features.py
|
||
to extract TF-IDF+SVD features and train a prediction model.
|
||
|
||
Run this script to see a complete example workflow.
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from extract_text_features import TextFeatureExtractor, extract_features_from_dataframe
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.preprocessing import StandardScaler
|
||
from sklearn.tree import DecisionTreeRegressor
|
||
from sklearn.metrics import r2_score, mean_absolute_error
|
||
import pickle
|
||
import os
|
||
|
||
|
||
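# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not part of extract_text_features.py):
# TextFeatureExtractor is assumed here to wrap sklearn's TfidfVectorizer
# followed by TruncatedSVD (classic LSA). The hypothetical helper below shows
# that raw pipeline so the examples are easier to follow; the real class may
# differ in tokenization, defaults, and naming.
# ---------------------------------------------------------------------------
def reference_tfidf_svd(texts, max_features=20, n_components=5):
    """Hedged reference implementation: TF-IDF -> SVD feature matrix."""
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(max_features=max_features)
    X = tfidf.fit_transform(texts)  # sparse matrix, shape (n_docs, n_terms)
    # TruncatedSVD needs n_components strictly below the number of terms
    svd = TruncatedSVD(n_components=min(n_components, X.shape[1] - 1), random_state=42)
    return svd.fit_transform(X)     # dense matrix, shape (n_docs, n_components)

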
def example_1_basic_usage():
    """Example 1: Basic text feature extraction."""
    print("=" * 80)
    print("EXAMPLE 1: BASIC USAGE")
    print("=" * 80)

    # Sample Vietnamese task texts (facility-maintenance work orders)
    texts = [
        "Kiểm tra hệ thống điện tòa nhà A định kỳ",      # periodic electrical inspection, building A
        "Bảo trì thang máy tầng 5 và kiểm tra an toàn",  # 5th-floor elevator maintenance + safety check
        "Sửa chữa điều hòa phòng họp B tầng 3",          # A/C repair, meeting room B, 3rd floor
        "Vệ sinh kính và kiểm tra hệ thống chiếu sáng",  # window cleaning + lighting inspection
        "Bảo trì máy phát điện dự phòng",                # backup generator maintenance
    ]

    print(f"\nInput: {len(texts)} task descriptions")
    print(f"Sample: '{texts[0]}'")

    # Initialize extractor
    extractor = TextFeatureExtractor(
        max_features=20,  # small for demo
        n_components=5,   # small for demo
    )

    # Fit and transform
    features = extractor.fit_transform(texts)

    print(f"\nOutput shape: {features.shape}")
    print(f"Feature names: {extractor.get_feature_names()}")
    print("\nFirst sample features:")
    print(features[0])

    # Show summary
    print("\nExtractor summary:")
    for key, value in extractor.get_summary().items():
        print(f"  {key}: {value}")

    print("\n✅ Example 1 complete!\n")


def example_2_dataframe_extraction():
    """Example 2: Extract features from a DataFrame."""
    print("=" * 80)
    print("EXAMPLE 2: DATAFRAME EXTRACTION")
    print("=" * 80)

    # Create a sample DataFrame (Vietnamese column names:
    # ma_dia_diem = location code, so_luong = staff count)
    df = pd.DataFrame({
        'ma_dia_diem': ['A01', 'A02', 'A03', 'A04', 'A05'],
        'all_task_normal': [       # regular tasks
            'Kiểm tra điện',       # electrical inspection
            'Bảo trì thang máy',   # elevator maintenance
            'Sửa điều hòa',        # A/C repair
            'Vệ sinh kính',        # window cleaning
            'Bảo trì máy phát',    # generator maintenance
        ],
        'all_task_dinhky': [       # recurring (dinh ky = periodic) tasks
            'Định kỳ hàng tháng',  # monthly
            'Định kỳ hàng tuần',   # weekly
            'Khẩn cấp',            # urgent
            'Hàng ngày',           # daily
            'Định kỳ quý',         # quarterly
        ],
        'so_luong': [5, 3, 2, 8, 4],
    })

    print(f"\nInput DataFrame shape: {df.shape}")
    print(df)

    # Extract features
    text_features_df, extractor = extract_features_from_dataframe(
        df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True,
    )

    print(f"\nExtracted features shape: {text_features_df.shape}")
    print("\nSample features:")
    print(text_features_df.head())

    print("\n✅ Example 2 complete!\n")


def example_3_save_and_load():
    """Example 3: Save and load the extractor."""
    print("=" * 80)
    print("EXAMPLE 3: SAVE AND LOAD")
    print("=" * 80)

    # Training data
    train_texts = [
        "Kiểm tra hệ thống điện",  # electrical system inspection
        "Bảo trì thang máy",       # elevator maintenance
        "Sửa chữa điều hòa",       # A/C repair
    ]

    # Fit extractor
    print("\n1. Training extractor...")
    extractor = TextFeatureExtractor(max_features=10, n_components=3)
    train_features = extractor.fit_transform(train_texts)
    print(f"   Train features shape: {train_features.shape}")

    # Save
    save_path = 'example_extractor.pkl'
    extractor.save(save_path)
    print(f"   Saved to: {save_path}")

    # Load
    print("\n2. Loading extractor...")
    loaded_extractor = TextFeatureExtractor.load(save_path)

    # Use the loaded extractor on new data
    print("\n3. Using loaded extractor on new data...")
    new_texts = ["Vệ sinh kính tầng 5", "Kiểm tra máy phát điện"]
    new_features = loaded_extractor.transform(new_texts)
    print(f"   New features shape: {new_features.shape}")
    print(f"   Features:\n{new_features}")

    # Cleanup
    if os.path.exists(save_path):
        os.remove(save_path)
        print(f"\n   Cleaned up: {save_path}")

    print("\n✅ Example 3 complete!\n")


def example_4_full_pipeline():
    """Example 4: Complete ML pipeline with text features."""
    print("=" * 80)
    print("EXAMPLE 4: FULL ML PIPELINE")
    print("=" * 80)

    # Create a sample dataset
    np.random.seed(42)
    n_samples = 100

    tasks_pool = [
        "Kiểm tra hệ thống điện",  # electrical inspection
        "Bảo trì thang máy",       # elevator maintenance
        "Sửa chữa điều hòa",       # A/C repair
        "Vệ sinh kính",            # window cleaning
        "Bảo trì máy phát điện",   # generator maintenance
        "Kiểm tra an toàn",        # safety check
        "Sửa chữa ống nước",       # plumbing repair
        "Bảo trì hệ thống PCCC",   # fire-protection (PCCC) maintenance
    ]

    df = pd.DataFrame({
        'all_task_normal': [np.random.choice(tasks_pool) for _ in range(n_samples)],
        # daily / weekly / monthly / quarterly
        'all_task_dinhky': [np.random.choice(['Hàng ngày', 'Hàng tuần', 'Hàng tháng', 'Quý'])
                            for _ in range(n_samples)],
        'dien_tich': np.random.uniform(100, 500, n_samples),  # floor area
        'so_tang': np.random.randint(5, 30, n_samples),       # number of floors
        'so_luong': np.random.randint(1, 10, n_samples),      # staff count (target)
    })

    print(f"\n📊 Dataset: {df.shape}")
    print(f"   Target (so_luong): mean={df['so_luong'].mean():.2f}, std={df['so_luong'].std():.2f}")

    # === TRAINING PHASE ===
    print("\n1️⃣ TRAINING PHASE")
    print("-" * 80)

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"\n   Train: {len(train_df)}, Test: {len(test_df)}")

    # Extract text features (fit on training data only)
    print("\n   Extracting text features...")
    text_features_train, extractor = extract_features_from_dataframe(
        train_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        fit=True,
    )

    # Prepare numeric features
    numeric_cols = ['dien_tich', 'so_tang']
    X_numeric_train = train_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_train = pd.concat([X_numeric_train, text_features_train], axis=1)
    y_train = train_df['so_luong'].values

    print(f"\n   Combined features: {X_train.shape}")
    print(f"   - Numeric: {len(numeric_cols)}")
    print(f"   - Text SVD: {text_features_train.shape[1]}")

    # Scale features (trees are scale-invariant, so this mainly keeps the
    # pipeline uniform in case the model is later swapped for a linear one)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train model
    print("\n   Training model...")
    model = DecisionTreeRegressor(max_depth=5, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate on the training set
    y_train_pred = model.predict(X_train_scaled)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)

    print("\n   Training metrics:")
    print(f"      R²: {train_r2:.4f}")
    print(f"      MAE: {train_mae:.4f}")

    # === INFERENCE PHASE ===
    print("\n2️⃣ INFERENCE PHASE")
    print("-" * 80)

    # Extract text features (transform only -- refitting on test data
    # would leak test vocabulary into the features)
    print("\n   Extracting text features (transform only)...")
    text_features_test, _ = extract_features_from_dataframe(
        test_df,
        text_columns=['all_task_normal', 'all_task_dinhky'],
        extractor=extractor,
        fit=False,  # important!
    )

    # Prepare numeric features
    X_numeric_test = test_df[numeric_cols].reset_index(drop=True)

    # Combine features
    X_test = pd.concat([X_numeric_test, text_features_test], axis=1)
    y_test = test_df['so_luong'].values

    # Scale features with the scaler fitted on the training data
    X_test_scaled = scaler.transform(X_test)

    # Predict
    print("\n   Making predictions...")
    y_test_pred = model.predict(X_test_scaled)

    # Evaluate
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n   Test metrics:")
    print(f"      R²: {test_r2:.4f}")
    print(f"      MAE: {test_mae:.4f}")

    # Show sample predictions
    print("\n   Sample predictions:")
    results_df = pd.DataFrame({
        'actual': y_test[:5],
        'predicted': y_test_pred[:5],
        'error': y_test[:5] - y_test_pred[:5],
    })
    print(results_df.to_string(index=False))

    # === FEATURE IMPORTANCE ===
    print("\n3️⃣ FEATURE IMPORTANCE")
    print("-" * 80)

    importances = model.feature_importances_
    feature_names = X_train.columns.tolist()

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances,
    }).sort_values('importance', ascending=False)

    print("\n   Top 10 important features:")
    print(importance_df.head(10).to_string(index=False))

    # Aggregate by feature type (numeric columns come first in X_train)
    n_numeric = len(numeric_cols)
    numeric_importance = importances[:n_numeric].sum()
    text_importance = importances[n_numeric:].sum()

    total = numeric_importance + text_importance
    print("\n   Feature type contribution:")
    print(f"      Numeric features: {numeric_importance:.4f} ({numeric_importance / total * 100:.1f}%)")
    print(f"      Text features:    {text_importance:.4f} ({text_importance / total * 100:.1f}%)")

    print("\n✅ Example 4 complete!\n")


def example_5_top_tfidf_terms():
    """Example 5: Analyze top TF-IDF terms."""
    print("=" * 80)
    print("EXAMPLE 5: TOP TF-IDF TERMS ANALYSIS")
    print("=" * 80)

    # Sample task texts
    texts = [
        "Kiểm tra hệ thống điện tòa nhà",           # building electrical inspection
        "Bảo trì thang máy và kiểm tra an toàn",    # elevator maintenance + safety check
        "Sửa chữa hệ thống điều hòa không khí",     # air-conditioning system repair
        "Kiểm tra và vệ sinh kính tòa nhà",         # window inspection and cleaning
        "Bảo trì máy phát điện dự phòng",           # backup generator maintenance
        "Kiểm tra hệ thống PCCC định kỳ",           # periodic fire-protection inspection
        "Sửa chữa ống nước và hệ thống cấp thoát",  # plumbing and drainage repair
        "Bảo trì hệ thống thang máy tòa nhà",       # building elevator system maintenance
    ]

    print(f"\nInput: {len(texts)} task descriptions")

    # Fit extractor
    extractor = TextFeatureExtractor(max_features=50, n_components=10)
    extractor.fit(texts)

    # Get top TF-IDF features
    print("\n📋 Top 20 TF-IDF terms (by document frequency):")
    top_features = extractor.get_top_tfidf_features(top_n=20)
    print(top_features.to_string(index=False))

    # Get summary
    summary = extractor.get_summary()
    print("\n📊 Summary:")
    print(f"   Actual TF-IDF features: {summary['actual_tfidf_features']}")
    print(f"   SVD components: {summary['n_components']}")
    print(f"   Explained variance: {summary['explained_variance'] * 100:.2f}%")

    print("\n✅ Example 5 complete!\n")


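# Hedged aside (hypothetical helper, not an extract_text_features.py API):
# when the explained variance reported in Example 5 looks low, a component
# count can be chosen from the cumulative explained-variance curve of the
# same TF-IDF -> SVD pipeline sketched near the top of this file.
def pick_n_components(texts, target=0.90, max_features=50):
    """Smallest SVD component count whose cumulative explained variance >= target."""
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    X = TfidfVectorizer(max_features=max_features).fit_transform(texts)
    svd = TruncatedSVD(n_components=X.shape[1] - 1, random_state=42)
    svd.fit(X)
    cumulative = np.cumsum(svd.explained_variance_ratio_)
    # first index where the target is reached, +1 to turn it into a count;
    # clamped in case the target is never reached
    return int(min(np.searchsorted(cumulative, target) + 1, len(cumulative)))

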
def main():
    """Run all examples."""
    print("\n" + "=" * 80)
    print("TEXT FEATURE EXTRACTION - EXAMPLES")
    print("=" * 80 + "\n")

    try:
        example_1_basic_usage()
        example_2_dataframe_extraction()
        example_3_save_and_load()
        example_4_full_pipeline()
        example_5_top_tfidf_terms()

        print("\n" + "=" * 80)
        print("✅ ALL EXAMPLES COMPLETED SUCCESSFULLY!")
        print("=" * 80 + "\n")

        print("Next steps:")
        print("  1. Try with your own dataset: FINAL_DATASET_WITH_TEXT_BACKUP_20260105_213507.xlsx")
        print("  2. Adjust hyperparameters: max_features, n_components")
        print("  3. Integrate into your ML pipeline")
        print("  4. Save the extractor for production use")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()