Source code for autox.autox

from autox.autox_competition.feature_engineer import FeatureCount
from autox.autox_competition.feature_engineer.fe_stat import FeatureStat
from autox.autox_competition.feature_engineer import FeatureRank
from autox.autox_competition.feature_engineer import FeatureNlp
from autox.autox_competition.feature_engineer.fe_time import FeatureTime
from autox.autox_competition.feature_engineer import FeatureCumsum
from autox.autox_competition.feature_engineer import FeatureShift
from autox.autox_competition.feature_engineer import FeatureDiff
from autox.autox_competition.feature_engineer.fe_one2M import FeatureOne2M
from autox.autox_competition.feature_engineer import fe_ima2vec
from autox.autox_competition.file_io import read_data_from_path
from autox.autox_competition.models import CrossLgbRegression, CrossXgbRegression
from autox.autox_competition.models.classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
from autox.autox_competition.process_data import feature_combination, train_test_divide, clip_label
from autox.autox_competition.process_data import feature_filter, auto_encoder
from autox.autox_competition.process_data.feature_type_recognition import Feature_type_recognition
from autox.autox_competition.util import log
from autox.autox_competition.feature_engineer import FeatureShiftTS, FeatureRollingStatTS, FeatureExpWeightedMean
from autox.autox_competition.models.regressor_ts import LgbRegressionTs, XgbRegressionTs

[docs]class AutoX(): """AutoX主函数描述""" def __init__(self, target, train_name, test_name, path, time_series=False, ts_unit=None, time_col=None, metric='rmse', feature_type = {}, relations = [], id = [], task_type = 'regression', Debug = False, image_info={}, target_map={}): self.Debug = Debug self.info_ = {} self.info_['id'] = id self.info_['task_type'] = task_type self.info_['target'] = target self.info_['feature_type'] = feature_type self.info_['relations'] = relations self.info_['train_name'] = train_name self.info_['test_name'] = test_name self.info_['metric'] = metric self.info_['time_series'] = time_series self.info_['ts_unit'] = ts_unit self.info_['time_col'] = time_col self.info_['image_info'] = image_info self.info_['target_map'] = target_map self.dfs_ = read_data_from_path(path, train_name=train_name, target=target, target_map=target_map) if image_info: assert('image_path' in image_info.keys()) assert('image_col' in image_info.keys()) assert('filename_extension' in image_info.keys()) if time_series: assert(ts_unit is not None) assert(time_col is not None) if Debug: log("Debug mode, sample data") self.dfs_[train_name] = self.dfs_[train_name].sample(5000) self.info_['max_target'] = self.dfs_[train_name][target].max() self.info_['min_target'] = self.dfs_[train_name][target].min() if feature_type == {}: for table_name in self.dfs_.keys(): df = self.dfs_[table_name] feature_type_recognition = Feature_type_recognition() feature_type = feature_type_recognition.fit(df) self.info_['feature_type'][table_name] = feature_type self.join_simple_tables() self.concat_train_test() self.dfs_['FE_all'] = None self.sub = None # 识别任务类型 if self.dfs_[self.info_['train_name']][self.info_['target']].nunique() == 2: self.info_['task_type'] = 'binary' else: self.info_['task_type'] = 'regression' def join_simple_tables(self): simple_relations = [x for x in self.info_['relations'] if x['type'] == '1-1' and x['related_to_main_table'] == 'true'] for relation in simple_relations: left_table_name = relation['left_entity'] right_table_name = relation['right_entity'] left_on = relation['left_on'] right_on = relation['right_on'] if right_table_name in [self.info_['train_name'], self.info_['test_name']]: left_table_name, right_table_name = right_table_name, left_table_name left_on, right_on = right_on, left_on skip_name = right_on merge_table_name = right_table_name merge_table = self.dfs_[merge_table_name].copy() # rename merge_table.columns = [x if x in skip_name else merge_table_name + '__' + x for x in merge_table.columns] self.dfs_[left_table_name] = self.dfs_[left_table_name].merge(merge_table, left_on=left_on, right_on=right_on, how='left') if left_on != right_on: self.dfs_[left_table_name].drop(right_on, axis=1, inplace=True) del merge_table for key_ in self.info_['feature_type'][merge_table_name]: if key_ not in skip_name: self.info_['feature_type'][left_table_name][merge_table_name + '__' + key_] = self.info_['feature_type'][merge_table_name][key_] def concat_train_test(self): self.info_['shape_of_train'] = len(self.dfs_[self.info_['train_name']]) self.info_['shape_of_test'] = len(self.dfs_[self.info_['test_name']]) self.dfs_['train_test'] = self.dfs_[self.info_['train_name']].append(self.dfs_[self.info_['test_name']]) self.dfs_['train_test'].index = range(len(self.dfs_['train_test'])) feature_type_train_test = {} for col in self.dfs_['train_test'].columns: if col in self.info_['feature_type'][self.info_['train_name']]: feature_type_train_test[col] = self.info_['feature_type'][self.info_['train_name']][col] else: feature_type_train_test[col] = self.info_['feature_type'][self.info_['test_name']][col] self.info_['feature_type']['train_test'] = feature_type_train_test def split_train_test(self): self.dfs_['FE_train'] = self.dfs_['FE_all'][:self.info_['shape_of_train']] self.dfs_['FE_test'] = self.dfs_['FE_all'][self.info_['shape_of_train']:] def get_submit(self): self.topk_feas = self.get_top_features(return_df = False) # 模型训练 log("start training xgboost model") if self.info_['task_type'] == 'regression': self.model_xgb = CrossXgbRegression(metric=self.info_['metric']) self.model_xgb.fit(self.train[self.used_features], self.train[self.info_['target']], tuning=False, Debug=self.Debug) elif self.info_['task_type'] == 'binary': self.model_xgb = CrossXgbBiClassifier() self.model_xgb.fit(self.train[self.used_features], self.train[self.info_['target']], tuning=False, Debug=self.Debug) # 模型预测 predict_lgb = self.model_lgb.predict(self.test[self.used_features]) predict_xgb = self.model_xgb.predict(self.test[self.used_features]) # predict_tabnet = model_tabnet.predict(test[used_features]) predict = (predict_xgb + predict_lgb) / 2 # 预测结果后处理 min_ = self.info_['min_target'] max_ = self.info_['max_target'] predict = clip_label(predict, min_, max_) # 获得结果 sub = self.test[self.info_['id']] sub[self.info_['target']] = predict sub.index = range(len(sub)) return sub def get_top_features(self, topk = 50, return_df = True): id_ = self.info_['id'] target = self.info_['target'] # 特征工程 log("start feature engineer") df = self.dfs_['train_test'] feature_type = self.info_['feature_type']['train_test'] # 1-M拼表特征 # one2M拼表特征 log("feature engineer: one2M") featureOne2M = FeatureOne2M() featureOne2M.fit(self.info_['relations'], self.info_['train_name'], self.info_['feature_type']) log(f"featureOne2M ops: {featureOne2M.get_ops()}") if len(featureOne2M.get_ops()) != 0: self.dfs_['FE_One2M'] = featureOne2M.transform(df, self.dfs_) else: self.dfs_['FE_One2M'] = None log("ignore featureOne2M") # 时间特征 log("feature engineer: time") featureTime = FeatureTime() featureTime.fit(df, df_feature_type=feature_type, silence_cols=id_ + [target]) log(f"featureTime ops: {featureTime.get_ops()}") self.dfs_['FE_time'] = featureTime.transform(df) # cumsum特征 log("feature engineer: Cumsum") featureCumsum = FeatureCumsum() featureCumsum.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target], silence_agg_cols=id_ + [target], select_all=False) fe_cumsum_cnt = 0 for key_ in featureCumsum.get_ops().keys(): fe_cumsum_cnt += len(featureCumsum.get_ops()[key_]) if fe_cumsum_cnt < 30: self.dfs_['FE_cumsum'] = featureCumsum.transform(df) log(f"featureCumsum ops: {featureCumsum.get_ops()}") else: self.dfs_['FE_cumsum'] = None log("ignore featureCumsum") # shift特征 log("feature engineer: Shift") featureShift = FeatureShift() featureShift.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target], silence_agg_cols=id_ + [target], select_all=False) fe_shift_cnt = 0 for key_ in featureShift.get_ops().keys(): fe_shift_cnt += len(featureShift.get_ops()[key_]) if fe_shift_cnt < 30: self.dfs_['FE_shift'] = featureShift.transform(df) log(f"featureShift ops: {featureShift.get_ops()}") else: self.dfs_['FE_shift'] = None log("ignore featureShift") # diff特征 log("feature engineer: Diff") featureDiff = FeatureDiff() featureDiff.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target], silence_agg_cols=id_ + [target], select_all=False) fe_diff_cnt = 0 for key_ in featureDiff.get_ops().keys(): fe_diff_cnt += len(featureDiff.get_ops()[key_]) if fe_diff_cnt < 30: self.dfs_['FE_diff'] = featureDiff.transform(df) log(f"featureDiff ops: {featureDiff.get_ops()}") else: self.dfs_['FE_diff'] = None log("ignore featureDiff") # 统计特征 log("feature engineer: Stat") featureStat = FeatureStat() featureStat.fit(df, df_feature_type=feature_type, silence_group_cols= id_ + [target], silence_agg_cols= id_ + [target], select_all=False) fe_stat_cnt = 0 for key_ in featureStat.get_ops().keys(): fe_stat_cnt += len(featureStat.get_ops()[key_]) if fe_stat_cnt < 1500: self.dfs_['FE_stat'] = featureStat.transform(df) log(f"featureStat ops: {featureStat.get_ops()}") else: self.dfs_['FE_stat'] = None log("ignore featureStat") # nlp特征 log("feature engineer: NLP") featureNlp = FeatureNlp() featureNlp.fit(df, target, df_feature_type=feature_type, silence_cols=id_, select_all=False) self.dfs_['FE_nlp'] = featureNlp.transform(df) log(f"featureNlp ops: {featureNlp.get_ops()}") # count特征 log("feature engineer: Count") # degree自动调整 featureCount = FeatureCount() featureCount.fit(df, degree=2, df_feature_type=feature_type, silence_cols= id_ + [target], select_all=False) if len(featureCount.get_ops()) > 500: featureCount = FeatureCount() featureCount.fit(df, degree=1, df_feature_type=feature_type, silence_cols=id_ + [target], select_all=False) self.dfs_['FE_count'] = featureCount.transform(df) log(f"featureCount ops: {featureCount.get_ops()}") # rank特征 log("feature engineer: Rank") featureRank = FeatureRank() featureRank.fit(df, df_feature_type=feature_type, select_all=False) fe_rank_cnt = 0 for key_ in featureRank.get_ops().keys(): fe_rank_cnt += len(featureRank.get_ops()[key_]) if fe_rank_cnt < 500: self.dfs_['FE_rank'] = featureRank.transform(df) log(f"featureRank ops: {featureRank.get_ops()}") else: self.dfs_['FE_rank'] = None log("ignore featureRank") # image特征 if self.info_['image_info']: self.dfs_['FE_image'] = fe_ima2vec(df, self.info_['image_info']['image_path'], self.info_['image_info']['image_col'], self.info_['image_info']['filename_extension']) else: self.dfs_['FE_image'] = None log("ignore image feature") # auto_encoder df = auto_encoder(df, feature_type, id_) # 特征合并 log("feature combination") df_list = [df, self.dfs_['FE_nlp'], self.dfs_['FE_count'], self.dfs_['FE_stat'], self.dfs_['FE_rank'], self.dfs_['FE_shift'], self.dfs_['FE_diff'], self.dfs_['FE_cumsum'], self.dfs_['FE_One2M'], self.dfs_['FE_image']] self.dfs_['FE_all'] = feature_combination(df_list) # # 内存优化 # self.dfs_['FE_all'] = reduce_mem_usage(self.dfs_['FE_all']) # train和test数据切分 train_length = self.info_['shape_of_train'] self.train, self.test = train_test_divide(self.dfs_['FE_all'], train_length) log(f"shape of FE_all: {self.dfs_['FE_all'].shape}, shape of train: {self.train.shape}, shape of test: {self.test.shape}") # 特征过滤 log("feature filter") self.used_features = feature_filter(self.train, self.test, id_, target) log(f"used_features: {self.used_features}") # 模型训练 log("start training lightgbm model") if self.info_['task_type'] == 'regression': self.model_lgb = CrossLgbRegression(metric=self.info_['metric']) self.model_lgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug) elif self.info_['task_type'] == 'binary': self.model_lgb = CrossLgbBiClassifier() self.model_lgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug) # 特征重要性 fimp = self.model_lgb.feature_importances_ log("feature importance") log(fimp) topk_feas = [x for x in list(fimp['feature']) if x not in df.columns][:topk] if return_df: return topk_feas, self.train[id_ + topk_feas], self.test[id_ + topk_feas] else: return topk_feas def get_submit_ts(self): self.topk_feas = self.get_top_features_ts(return_df=False) # 模型训练 log("start training xgboost model") if self.info_['task_type'] == 'regression': self.model_xgb = XgbRegressionTs() self.model_xgb.fit(self.train, self.test, self.used_features, self.info_['target'], self.info_['time_col'], self.info_['ts_unit']) # 模型预测 predict_lgb = self.model_lgb.predict(self.test, self.used_features) predict_xgb = self.model_xgb.predict(self.test, self.used_features) # predict_tabnet = model_tabnet.predict(test[used_features]) predict = (predict_xgb + predict_lgb) / 2 # 预测结果后处理 min_ = self.info_['min_target'] max_ = self.info_['max_target'] predict = clip_label(predict, min_, max_) # 获得结果 sub = self.test[self.info_['id'] + [self.info_['time_col']]] sub[self.info_['target']] = predict sub.index = range(len(sub)) return sub def get_top_features_ts(self, topk = 50, return_df = True): id_ = self.info_['id'] target = self.info_['target'] # 特征工程 log("start feature engineer") df = self.dfs_['train_test'] feature_type = self.info_['feature_type']['train_test'] # 1-M拼表特征 # one2M拼表特征 log("feature engineer: one2M") featureOne2M = FeatureOne2M() featureOne2M.fit(self.info_['relations'], self.info_['train_name'], self.info_['feature_type']) log(f"featureOne2M ops: {featureOne2M.get_ops()}") if len(featureOne2M.get_ops()) != 0: self.dfs_['FE_One2M'] = featureOne2M.transform(df, self.dfs_) else: self.dfs_['FE_One2M'] = None log("ignore featureOne2M") # 时间特征 log("feature engineer: time") featureTime = FeatureTime() featureTime.fit(df, df_feature_type=feature_type, silence_cols=id_ + [target]) log(f"featureTime ops: {featureTime.get_ops()}") self.dfs_['FE_time'] = featureTime.transform(df) # lag_ts特征 log("feature engineer: ShiftTS") featureShiftTS = FeatureShiftTS() featureShiftTS.fit(df, id_, target, feature_type, self.info_['time_col'], self.info_['ts_unit']) log(f"featureShiftTS ops: {featureShiftTS.get_ops()}") log(f"featureShiftTS lags: {featureShiftTS.get_lags()}") self.dfs_['FE_shift_ts'] = featureShiftTS.transform(df) # rolling_stat_ts特征 log("feature engineer: RollingStatTS") featureRollingStatTS = FeatureRollingStatTS() featureRollingStatTS.fit(df, id_, target, feature_type, self.info_['time_col'], self.info_['ts_unit']) log(f"featureRollingStatTS ops: {featureRollingStatTS.get_ops()}") log(f"featureRollingStatTS windows: {featureRollingStatTS.get_windows()}") self.dfs_['FE_rollingStat_ts'] = featureRollingStatTS.transform(df) # exp_weighted_mean_ts特征 log("feature engineer: ExpWeightedMean") featureExpWeightedMean = FeatureExpWeightedMean() featureExpWeightedMean.fit(df, id_, target, feature_type, self.info_['time_col'], self.info_['ts_unit']) log(f"featureExpWeightedMean ops: {featureExpWeightedMean.get_ops()}") log(f"featureExpWeightedMean lags: {featureExpWeightedMean.get_lags()}") self.dfs_['FE_ewm'] = featureExpWeightedMean.transform(df) # label_encoder df = auto_encoder(df, feature_type, id_) # 特征合并 log("feature combination") df_list = [df, self.dfs_['FE_One2M'], self.dfs_['FE_time'], self.dfs_['FE_shift_ts'], self.dfs_['FE_rollingStat_ts'], self.dfs_['FE_ewm']] self.dfs_['FE_all'] = feature_combination(df_list) # # 内存优化 # self.dfs_['FE_all'] = reduce_mem_usage(self.dfs_['FE_all']) # train和test数据切分 train_length = self.info_['shape_of_train'] self.train, self.test = train_test_divide(self.dfs_['FE_all'], train_length) log(f"shape of FE_all: {self.dfs_['FE_all'].shape}, shape of train: {self.train.shape}, shape of test: {self.test.shape}") # 特征过滤 log("feature filter") self.used_features = feature_filter(self.train, self.test, id_, target, time_series=True) log(f"used_features: {self.used_features}") # 模型训练 log("start training lightgbm model") if self.info_['task_type'] == 'regression': self.model_lgb = LgbRegressionTs() self.model_lgb.fit(self.train, self.test, self.used_features, target, self.info_['time_col'], self.info_['ts_unit']) # 特征重要性 fimp = self.model_lgb.feature_importances_ log("feature importance") log(fimp) topk_feas = [x for x in list(fimp['feature']) if x not in df.columns][:topk] if return_df: return topk_feas, self.train[id_ + topk_feas], self.test[id_ + topk_feas] else: return topk_feas