Source code for autox.autox_competition.feature_engineer.fe_count

from tqdm import tqdm
from autox.autox_competition.process_data import Feature_type_recognition
from autox.autox_competition.CONST import FEATURE_TYPE

class FeatureCount:
    """**Convert categorical features into the number of occurrences.**

    ``fit`` decides which columns (or ordered column pairs) are counted and
    stores them in ``self.ops``; ``transform`` produces one ``COUNT_*`` column
    per stored op.
    """

    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_cols = []
        self.select_all = None
        self.max_num = None
        # Each op is a list of column names: ``[col]`` or ``[col_1, col_2]``.
        self.ops = []

    def fit(self, df, degree=1, target=None, df_feature_type=None,
            silence_cols=None, select_all=True, max_num=None):
        """Select the columns / column pairs to be counted.

        :param df: dataframe, train_test.
        :param degree: int, degree equal to 1 or 2. With 2, every ordered pair
            of selected columns is counted in addition to the single columns.
        :param target: str, target column.
        :param df_feature_type: dict, {col: type of col}. Detected from ``df``
            with ``Feature_type_recognition`` when None.
        :param silence_cols: list of columns that must never be counted.
            None (the default) means no column is excluded.
        :param select_all: bool, keep every categorical column when True;
            otherwise filter the candidates (see below).
        :param max_num: stored on the instance; not used by this method.
        """
        assert degree == 1 or degree == 2, "degree must be 1 or 2"

        self.target = target
        self.df_feature_type = df_feature_type
        # A fresh list per call: a shared ``[]`` default would be stored on
        # every instance that relies on the default.
        self.silence_cols = [] if silence_cols is None else silence_cols
        self.select_all = select_all
        self.max_num = max_num

        if self.df_feature_type is None:
            self.df_feature_type = Feature_type_recognition().fit(df)

        # Rebuild the op list from scratch so that calling fit() again does
        # not pile duplicates on top of the previous result.
        self.ops = [
            [feature]
            for feature, feature_type in self.df_feature_type.items()
            if feature_type == FEATURE_TYPE['cat']
            and feature not in self.silence_cols
        ]

        if not self.select_all:
            if self.target is not None:
                # Train a model to select the columns (not implemented yet).
                pass
            else:
                # Statistical filter: drop columns whose number of distinct
                # values exceeds 20% of the number of rows.
                max_distinct = df.shape[0] * 0.2
                self.ops = [
                    op for op in self.ops
                    if df.drop_duplicates(op).shape[0] <= max_distinct
                ]

        if degree == 2:
            singles = self.ops
            # NOTE: both [a, b] and [b, a] are generated; this is kept so the
            # set of output column names stays unchanged for existing callers.
            pairs = [
                col_1 + col_2
                for col_1 in singles
                for col_2 in singles
                if col_1 != col_2
            ]
            self.ops = pairs + singles

    def get_ops(self):
        """Return the list of ops (lists of column names) to be counted."""
        return self.ops

    def set_keys(self, ops):
        """Replace the list of ops (lists of column names) to be counted."""
        self.ops = ops

    def transform(self, df):
        """Compute the count features for every stored op.

        The input frame is left untouched and the result carries the same
        index as ``df``, so it can be concatenated back column-wise.

        :param df: dataframe, train_test.
        :return: dataframe, count features (one ``COUNT_*`` column per op).
        """
        # Empty-column frame that shares df's index; features are added to it
        # instead of being written into (and later dropped from) df.
        result = df[[]].copy()

        for op in tqdm(self.ops):
            if len(op) == 1:
                col = op[0]
                # Non-null count of the key inside its own group, i.e. the
                # number of rows sharing this value.
                result[f'COUNT_{col}'] = df.groupby(col)[col].transform('count')
            else:
                col_1, col_2 = op
                # Group size broadcast back to the rows; unlike a merge this
                # keeps the original index.
                result[f'COUNT_{col_1}__{col_2}'] = (
                    df.groupby([col_1, col_2])[col_1].transform('size')
                )

        return result

    def fit_transform(self, df, target=None, df_feature_type=None,
                      silence_cols=None, select_all=True, max_num=None,
                      degree=1):
        """Run :meth:`fit` on ``df`` and return :meth:`transform` of ``df``.

        :param df: dataframe, train_test.
        :param target: str, target column.
        :param df_feature_type: dict, {col: type of col}.
        :param silence_cols: list of columns that must never be counted.
        :param select_all: bool, keep every categorical column when True.
        :param max_num: forwarded to ``fit``.
        :param degree: int, 1 or 2, forwarded to ``fit``.
        :return: dataframe, count features.
        """
        self.fit(df, degree=degree, target=target,
                 df_feature_type=df_feature_type, silence_cols=silence_cols,
                 select_all=select_all, max_num=max_num)
        return self.transform(df)