Source code for autox.autox_competition.feature_engineer.fe_cumsum

import pandas as pd
from autox.autox_competition.CONST import FEATURE_TYPE
from autox.autox_competition.process_data import Feature_type_recognition
from tqdm import tqdm
import numpy as np
from autox.autox_competition.util import log

class FeatureCumsum:
    """Generate grouped cumulative-sum features.

    For every selected grouping column, the running (cumulative) sum of each
    selected numeric column is computed within each group. The pairing of
    grouping columns to numeric columns is stored in ``self.ops`` as
    ``{group_col: [agg_col, ...]}`` and can be inspected or overridden with
    :meth:`get_ops` / :meth:`set_ops`.
    """

    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_group_cols = []
        self.silence_agg_cols = []
        self.select_all = None
        self.max_num = None
        # Maps each grouping column to the list of columns to accumulate.
        self.ops = {}

    def fit(self, df, target=None, df_feature_type=None, silence_group_cols=None,
            silence_agg_cols=None, select_all=True, max_num=None):
        """Decide which (group column, numeric column) pairs to build.

        Args:
            df: Input DataFrame.
            target: Target column name. When set together with
                ``select_all=False`` no filtering is performed (the
                model-based selection is not implemented).
            df_feature_type: Mapping ``column -> feature type``. When ``None``
                it is obtained from ``Feature_type_recognition().fit(df)``.
            silence_group_cols: Columns that must not be used for grouping.
                ``None`` is treated as an empty list.
            silence_agg_cols: Columns that must not be accumulated.
                ``None`` is treated as an empty list.
            select_all: When ``False`` and ``target`` is ``None``, grouping
                columns are filtered by their number of distinct values.
            max_num: Stored on the instance; not used by this class.
        """
        self.target = target
        self.df_feature_type = df_feature_type
        # None (the default of fit_transform) must behave like "nothing silenced";
        # membership tests against None would raise TypeError.
        self.silence_group_cols = silence_group_cols if silence_group_cols is not None else []
        self.silence_agg_cols = silence_agg_cols if silence_agg_cols is not None else []
        self.select_all = select_all
        self.max_num = max_num
        # Start from a clean slate so a refit does not keep stale entries.
        self.ops = {}

        if self.df_feature_type is None:
            self.df_feature_type = Feature_type_recognition().fit(df)

        # Numeric columns eligible for accumulation, in declaration order.
        numeric_cols = [
            col for col, kind in self.df_feature_type.items()
            if col not in self.silence_agg_cols and kind == FEATURE_TYPE['num']
        ]

        distinct_counts = {}
        for group_col, kind in self.df_feature_type.items():
            if kind != FEATURE_TYPE['cat'] or group_col in self.silence_group_cols:
                continue
            n_distinct = df[group_col].nunique()
            # A column with one distinct value per row gives single-row groups,
            # so its cumulative sum would just repeat the input values.
            if n_distinct == df.shape[0]:
                continue
            distinct_counts[group_col] = n_distinct
            self.ops[group_col] = [col for col in numeric_cols if col != group_col]

        if not self.select_all:
            if self.target is not None:
                # Train a model to select group columns (not implemented).
                pass
            else:
                # Filter by simple statistics: drop grouping columns whose
                # distinct count exceeds 20% of the rows or is below 5.
                n_rows = df.shape[0]
                for group_col, n_distinct in distinct_counts.items():
                    if n_distinct > n_rows * 0.2 or n_distinct < 5:
                        del self.ops[group_col]

    def get_ops(self):
        """Return the ``{group_col: [agg_col, ...]}`` mapping."""
        return self.ops

    def set_ops(self, ops):
        """Replace the ``{group_col: [agg_col, ...]}`` mapping."""
        self.ops = ops

    def transform(self, df):
        """Build the cumulative-sum features described by ``self.ops``.

        Args:
            df: Input DataFrame containing the grouping and numeric columns.

        Returns:
            A new DataFrame with one column per (group column, numeric column)
            pair, named ``{group}__{agg}__cumsum``. Tuple keys are joined with
            ``__``. Rows are in the same order as ``df``; the index is reset.
            Columns containing ``+inf`` are removed.
        """
        features = {}
        for group_col in tqdm(self.ops.keys()):
            agg_cols = self.ops[group_col]
            if not agg_cols:
                continue
            if isinstance(group_col, tuple):
                # pandas treats a tuple as a single key; a list groups by
                # several columns.
                by = list(group_col)
                prefix = "__".join(group_col)
            else:
                by = group_col
                prefix = group_col
            grouped = df.groupby(by)
            for agg_col in agg_cols:
                features[f'{prefix}__{agg_col}__cumsum'] = grouped[agg_col].cumsum().values

        result = pd.DataFrame(features)
        # Only drop the columns that actually contain inf (previously every
        # column was listed, leaving an empty result).
        del_cols = [col for col in result.columns if (result[col] == np.inf).any()]
        result.drop(del_cols, axis=1, inplace=True)
        log(f"this cols with inf data, del them: {del_cols}")
        return result

    def fit_transform(self, df, target=None, df_feature_type=None, silence_group_cols=None,
                      silence_agg_cols=None, select_all=True, max_num=None):
        """Run :meth:`fit` with the given arguments, then :meth:`transform` on ``df``."""
        self.fit(df, target=target, df_feature_type=df_feature_type,
                 silence_group_cols=silence_group_cols,
                 silence_agg_cols=silence_agg_cols,
                 select_all=select_all, max_num=max_num)
        return self.transform(df)