Source code for autox.autox_competition.feature_engineer.fe_cross

import itertools
import warnings
import lightgbm as lgb
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')


[docs]class FeatureCross: """**synthetic feature formed by multiplying (crossing) two features.** """ def __init__(self, importance_type='split'): self.importance_type = importance_type self.shapely_flag = importance_type == 'shapley_value'
[docs] def fit(self, X, y, objective, category_cols, top_k=10, used_cols=[]): ''' :param X: {array-like, sparse matrix} of shape (n_samples, n_features). Training vector, where n_samples is the number of samples and n_features is the number of features. :param y: array-like of shape (n_samples,). Target vector relative to X. :param objective: str, objective equal to 'binary' or 'regression'. :param category_cols: list, column names of categorical features. :param top_k: int, keep the top_k importance cross features, default top_k = 10. :param used_cols: list, columns will be used for training model, default top_k = 10. ''' self.category_cols = category_cols if len(used_cols) > 0: self.used_cols = used_cols else: self.used_cols = list(X.columns) self.used_cols = [x for x in list(X.describe().columns) if x in self.used_cols] self.top_k = top_k assert (objective in ['binary', 'regression']) params = {'objective': objective, 'boosting': 'gbdt', 'learning_rate': 0.01, 'num_leaves': 2 ** 3, 'bagging_fraction': 0.95, 'bagging_freq': 1, 'bagging_seed': 66, 'feature_fraction': 0.7, 'feature_fraction_seed': 66, 'max_depth': -1 } N_round = 100 trn_data = lgb.Dataset(X[self.used_cols], label=y, categorical_feature=category_cols) self.clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data], verbose_eval=False) self.feature_importances = pd.DataFrame() self.feature_importances['feature'] = X[self.used_cols].columns if self.shapely_flag: self.feature_importances['imp'] = np.abs( self.clf.predict(X[self.used_cols], pred_contrib=True) ).sum(axis=0)[:len(self.used_cols)] else: self.feature_importances['imp'] = self.clf.feature_importance(importance_type=self.importance_type) self.feature_importances = self.feature_importances.sort_values(by="imp", ascending=False) self.feature_importances.index = range(len(self.feature_importances)) self.top_k_features = [x for x in self.feature_importances['feature'] if x in category_cols][:top_k] self.cross_features = [] for item in list(itertools.permutations(self.top_k_features, 2)): f1 = item[0] f2 = item[1] if f1 in category_cols and f2 in category_cols: self.cross_features.append([f1, f2])
[docs] def transform(self, X): ''' :param X: {array-like, sparse matrix} of shape (n_samples, n_features). Training vector, where n_samples is the number of samples and n_features is the number of features. :return: dataframe, cross features. ''' result = pd.DataFrame() for [f1, f2] in self.cross_features: result[f'{f1}_cross_{f2}'] = X[f1].astype(str) + '__' + X[f2].astype(str) return result
def fit_transform(self, X, y, objective, category_cols, top_k=10, used_cols=[]): self.fit(X, y, objective, category_cols=category_cols, used_cols=used_cols, top_k=top_k) return self.transform(X)