""" Text Feature Extraction Pipeline for Staff Prediction Model ============================================================= This script extracts TF-IDF and SVD features from Vietnamese task descriptions. Can be used for both training (fit_transform) and inference (transform). Usage: ------ Training mode: python extract_text_features.py --mode train --input data.xlsx --output features.csv Inference mode: python extract_text_features.py --mode predict --input new_data.xlsx --output predictions.csv As a module: from extract_text_features import TextFeatureExtractor extractor = TextFeatureExtractor() features = extractor.fit_transform(texts) Author: ML Team Date: 2026-01-06 """ import pandas as pd import numpy as np import re import pickle import argparse import os from typing import List, Tuple, Optional, Union from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import TruncatedSVD class TextFeatureExtractor: """ Extract TF-IDF and SVD features from Vietnamese text. Attributes: max_features (int): Maximum number of TF-IDF features n_components (int): Number of SVD components ngram_range (tuple): N-gram range for TF-IDF min_df (int): Minimum document frequency max_df (float): Maximum document frequency tfidf (TfidfVectorizer): Fitted TF-IDF vectorizer svd (TruncatedSVD): Fitted SVD model is_fitted (bool): Whether the extractor has been fitted """ def __init__( self, max_features: int = 200, n_components: int = 50, ngram_range: Tuple[int, int] = (1, 2), min_df: int = 2, max_df: float = 0.95, random_state: int = 42 ): """ Initialize the TextFeatureExtractor. Args: max_features: Maximum number of TF-IDF features (default: 200) n_components: Number of SVD components (default: 50) ngram_range: N-gram range for TF-IDF (default: (1, 2)) min_df: Minimum document frequency (default: 2) max_df: Maximum document frequency (default: 0.95) random_state: Random seed for reproducibility (default: 42) """ self.max_features = max_features self.n_components = n_components self.ngram_range = ngram_range self.min_df = min_df self.max_df = max_df self.random_state = random_state # Initialize models self.tfidf = TfidfVectorizer( max_features=max_features, ngram_range=ngram_range, min_df=min_df, max_df=max_df, sublinear_tf=True, strip_accents=None ) self.svd = TruncatedSVD( n_components=n_components, random_state=random_state ) self.is_fitted = False @staticmethod def preprocess_text(text: Union[str, float, None]) -> str: """ Preprocess Vietnamese text. Args: text: Input text (can be str, float, or None) Returns: Cleaned text string """ if pd.isna(text) or text is None or str(text).strip() == '': return '' text = str(text).lower() # Keep Vietnamese characters, numbers, spaces text = re.sub( r'[^a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ0-9\s]', ' ', text ) # Remove multiple spaces text = re.sub(r'\s+', ' ', text).strip() return text def preprocess_texts(self, texts: List[str]) -> List[str]: """ Preprocess a list of texts. Args: texts: List of text strings Returns: List of cleaned text strings """ return [self.preprocess_text(text) for text in texts] def combine_task_columns( self, task_normal: pd.Series, task_dinhky: pd.Series ) -> List[str]: """ Combine two task columns into one. 
    def combine_task_columns(
        self,
        task_normal: pd.Series,
        task_dinhky: pd.Series,
    ) -> List[str]:
        """
        Combine two task columns into one.

        Args:
            task_normal: Series of normal task descriptions
            task_dinhky: Series of scheduled task descriptions

        Returns:
            List of combined and cleaned texts
        """
        # Preprocess both columns
        normal_clean = task_normal.apply(self.preprocess_text)
        dinhky_clean = task_dinhky.apply(self.preprocess_text)

        # Combine
        combined = (normal_clean + ' ' + dinhky_clean).str.strip()
        return combined.tolist()

    def fit(self, texts: List[str]) -> 'TextFeatureExtractor':
        """
        Fit the TF-IDF and SVD models on training texts.

        Args:
            texts: List of text strings

        Returns:
            self
        """
        print(f"Fitting TF-IDF on {len(texts)} documents...")

        # Preprocess
        texts_clean = self.preprocess_texts(texts)

        # Fit TF-IDF
        tfidf_matrix = self.tfidf.fit_transform(texts_clean)
        print(f"  TF-IDF shape: {tfidf_matrix.shape}")
        sparsity = 1.0 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])
        print(f"  Sparsity: {sparsity * 100:.2f}%")

        # Cache per-term document frequency from the training matrix so
        # get_top_tfidf_features() can report it later
        self.doc_freq_ = np.asarray((tfidf_matrix > 0).sum(axis=0)).ravel()

        # Fit SVD
        print(f"\nFitting SVD ({self.n_components} components)...")
        self.svd.fit(tfidf_matrix)
        print(f"  Explained variance: {self.svd.explained_variance_ratio_.sum() * 100:.2f}%")

        self.is_fitted = True
        print("\n✅ Fitting complete!")
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Transform texts to SVD features.

        Args:
            texts: List of text strings

        Returns:
            Array of SVD features (n_samples, n_components)
        """
        if not self.is_fitted:
            raise ValueError("Extractor must be fitted before transform. Call fit() first.")

        # Preprocess
        texts_clean = self.preprocess_texts(texts)

        # TF-IDF transform
        tfidf_matrix = self.tfidf.transform(texts_clean)

        # SVD transform
        return self.svd.transform(tfidf_matrix)

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Fit and transform in one step.

        Args:
            texts: List of text strings

        Returns:
            Array of SVD features (n_samples, n_components)
        """
        self.fit(texts)
        return self.transform(texts)

    def get_feature_names(self) -> List[str]:
        """
        Get feature names for the SVD components.

        Returns:
            List of feature names
        """
        return [f'text_svd_{i + 1}' for i in range(self.n_components)]

    def get_top_tfidf_features(self, top_n: int = 20) -> pd.DataFrame:
        """
        Get top TF-IDF features by document frequency in the training corpus.

        Args:
            top_n: Number of top features to return

        Returns:
            DataFrame with feature names and document frequencies
        """
        if not self.is_fitted:
            raise ValueError("Extractor must be fitted first.")

        # Rank features by the document frequencies cached during fit()
        feature_names = self.tfidf.get_feature_names_out()
        top_features = pd.DataFrame({
            'feature': feature_names,
            'doc_frequency': self.doc_freq_,
        }).sort_values('doc_frequency', ascending=False).head(top_n)

        return top_features

    def save(self, filepath: str):
        """
        Save the fitted extractor to disk.

        Args:
            filepath: Path to save the extractor (should end with .pkl)
        """
        if not self.is_fitted:
            raise ValueError("Extractor must be fitted before saving.")

        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
        print(f"✅ Saved extractor to: {filepath}")

    @staticmethod
    def load(filepath: str) -> 'TextFeatureExtractor':
        """
        Load a fitted extractor from disk.

        Args:
            filepath: Path to the saved extractor

        Returns:
            Loaded TextFeatureExtractor
        """
        with open(filepath, 'rb') as f:
            extractor = pickle.load(f)
        print(f"✅ Loaded extractor from: {filepath}")
        return extractor
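    # Persistence sketch (illustrative; the path and `new_texts` are
    # placeholders): save/load round-trips the fitted vocabulary and SVD
    # basis, so inference reproduces the training-time feature space.
    #
    #   extractor.save('text_feature_extractor.pkl')
    #   same = TextFeatureExtractor.load('text_feature_extractor.pkl')
    #   features = same.transform(new_texts)  # no refitting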
    def get_summary(self) -> dict:
        """
        Get summary statistics of the extractor.

        Returns:
            Dictionary with summary information
        """
        if not self.is_fitted:
            return {'status': 'not_fitted'}

        return {
            'status': 'fitted',
            'max_features': self.max_features,
            'n_components': self.n_components,
            'ngram_range': self.ngram_range,
            'min_df': self.min_df,
            'max_df': self.max_df,
            'actual_tfidf_features': len(self.tfidf.get_feature_names_out()),
            'explained_variance': float(self.svd.explained_variance_ratio_.sum()),
            'random_state': self.random_state,
        }
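# Minimal end-to-end sketch (illustrative; the toy corpus is made up).
# Note that on a corpus this small the defaults would fail, because
# TruncatedSVD requires n_components to be smaller than the number of
# TF-IDF features, so the sketch shrinks both:
#
#   texts = [
#       'kiểm tra hệ thống điện',
#       'bảo trì máy bơm định kỳ',
#       'kiểm tra máy bơm',
#       'vệ sinh hệ thống điện định kỳ',
#   ]
#   extractor = TextFeatureExtractor(max_features=20, n_components=2, min_df=1)
#   X = extractor.fit_transform(texts)  # X.shape == (4, 2)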
def extract_features_from_dataframe(
    df: pd.DataFrame,
    text_columns: Optional[List[str]] = None,
    extractor: Optional[TextFeatureExtractor] = None,
    fit: bool = True,
) -> Tuple[pd.DataFrame, TextFeatureExtractor]:
    """
    Extract text features from a DataFrame.

    Args:
        df: Input DataFrame
        text_columns: List of text column names to combine
            (default: ['all_task_normal', 'all_task_dinhky'])
        extractor: Pre-fitted extractor (optional, for inference)
        fit: Whether to fit the extractor (True for training, False for inference)

    Returns:
        Tuple of (features_df, extractor)
    """
    # Avoid a mutable default argument
    if text_columns is None:
        text_columns = ['all_task_normal', 'all_task_dinhky']

    print("=" * 80)
    print("TEXT FEATURE EXTRACTION")
    print("=" * 80)

    # Combine text columns
    if len(text_columns) == 1:
        texts = df[text_columns[0]].tolist()
    else:
        print(f"\nCombining {len(text_columns)} text columns...")
        texts = []
        for _, row in df.iterrows():
            combined = ' '.join(
                str(row[col]) if pd.notna(row[col]) else ''
                for col in text_columns
            )
            texts.append(combined)

    print(f"Total documents: {len(texts)}")

    # Initialize or use existing extractor
    if extractor is None:
        print("\nInitializing new TextFeatureExtractor...")
        extractor = TextFeatureExtractor()

    # Extract features
    if fit:
        print("\nMode: TRAINING (fit_transform)")
        features = extractor.fit_transform(texts)
    else:
        print("\nMode: INFERENCE (transform)")
        features = extractor.transform(texts)

    # Create DataFrame
    feature_names = extractor.get_feature_names()
    features_df = pd.DataFrame(features, columns=feature_names)

    print("\n✅ Extraction complete!")
    print(f"  Output shape: {features_df.shape}")
    print(f"  Feature names: {feature_names[:5]}... (showing first 5)")

    # Summary
    summary = extractor.get_summary()
    print("\n📊 Extractor Summary:")
    for key, value in summary.items():
        print(f"  {key}: {value}")

    return features_df, extractor


def main():
    """Command-line interface for text feature extraction."""
    parser = argparse.ArgumentParser(
        description='Extract TF-IDF and SVD features from Vietnamese task descriptions'
    )
    parser.add_argument(
        '--mode', type=str, choices=['train', 'predict'], required=True,
        help='Mode: train (fit and save) or predict (load and transform)'
    )
    parser.add_argument(
        '--input', type=str, required=True,
        help='Input file path (Excel or CSV)'
    )
    parser.add_argument(
        '--output', type=str, required=True,
        help='Output file path for features (CSV)'
    )
    parser.add_argument(
        '--text-columns', type=str, nargs='+',
        default=['all_task_normal', 'all_task_dinhky'],
        help='Text column names to combine (default: all_task_normal all_task_dinhky)'
    )
    parser.add_argument(
        '--extractor-path', type=str, default='text_feature_extractor.pkl',
        help='Path to save/load the extractor (default: text_feature_extractor.pkl)'
    )
    parser.add_argument(
        '--max-features', type=int, default=200,
        help='Maximum TF-IDF features (default: 200)'
    )
    parser.add_argument(
        '--n-components', type=int, default=50,
        help='Number of SVD components (default: 50)'
    )
    args = parser.parse_args()

    # Load data
    print(f"\n📂 Loading data from: {args.input}")
    if args.input.endswith('.xlsx'):
        df = pd.read_excel(args.input)
    elif args.input.endswith('.csv'):
        df = pd.read_csv(args.input)
    else:
        raise ValueError("Input file must be .xlsx or .csv")
    print(f"  Shape: {df.shape}")

    # Check columns
    missing_cols = [col for col in args.text_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in input data: {missing_cols}")

    # Extract features
    if args.mode == 'train':
        # Training mode: fit and save
        features_df, extractor = extract_features_from_dataframe(
            df,
            text_columns=args.text_columns,
            extractor=TextFeatureExtractor(
                max_features=args.max_features,
                n_components=args.n_components,
            ),
            fit=True,
        )
        # Save extractor
        extractor.save(args.extractor_path)
    else:  # predict mode
        # Inference mode: load and transform
        if not os.path.exists(args.extractor_path):
            raise FileNotFoundError(f"Extractor not found: {args.extractor_path}")
        extractor = TextFeatureExtractor.load(args.extractor_path)
        features_df, _ = extract_features_from_dataframe(
            df,
            text_columns=args.text_columns,
            extractor=extractor,
            fit=False,
        )

    # Save features
    features_df.to_csv(args.output, index=False)
    print(f"\n✅ Saved features to: {args.output}")

    # Show top TF-IDF features (training mode only)
    if args.mode == 'train':
        print("\n📋 Top 20 TF-IDF features:")
        top_features = extractor.get_top_tfidf_features(top_n=20)
        print(top_features.to_string(index=False))

    print("\n" + "=" * 80)
    print("✅ COMPLETE!")
    print("=" * 80)


if __name__ == '__main__':
    main()
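# Example CLI round trip (illustrative; file paths are placeholders).
# The predict run must point --extractor-path at the pickle written by
# the train run so both use the same vocabulary and SVD basis:
#
#   python extract_text_features.py --mode train \
#       --input data.xlsx --output train_features.csv \
#       --extractor-path text_feature_extractor.pkl
#
#   python extract_text_features.py --mode predict \
#       --input new_data.xlsx --output new_features.csv \
#       --extractor-path text_feature_extractor.pkl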