Source code for miceforest.ImputationKernel

from .ImputedData import ImputedData
from .MeanMatchScheme import MeanMatchScheme
from .utils import (
    _t_dat,
    _t_var_list,
    _t_var_dict,
    _t_var_sub,
    _t_random_state,
    _assert_dataset_equivalent,
    _draw_random_int32,
    _interpret_ds,
    _subset_data,
    ensure_rng,
    hash_int32,
    stratified_categorical_folds,
    stratified_continuous_folds,
    stratified_subset,
)
from .compat import pd_DataFrame, pd_Series
from .default_lightgbm_parameters import default_parameters, make_default_tuning_space
from .logger import Logger
import numpy as np
from numpy.random import RandomState
from warnings import warn
from lightgbm import train, Dataset, cv, log_evaluation, early_stopping, Booster
from lightgbm.basic import _ConfigAliases
from io import BytesIO
import blosc
import dill
from copy import copy
from typing import Union, List, Dict, Any, Optional

_DEFAULT_DATA_SUBSET = 1.0


class ImputationKernel(ImputedData):
    """
    Creates a kernel dataset. This dataset can perform MICE on itself,
    and impute new data from models obtained during MICE.

    Parameters
    ----------
    data : np.ndarray or pandas DataFrame

        .. code-block:: text

            The data to be imputed.

    datasets : int, default=1

        .. code-block:: text

            The number of datasets to initialize and impute.

    variable_schema : None or list or dict, default=None

        .. code-block:: text

            Specifies the feature - target relationships used to train models.
            This parameter also controls which models are built. Models can be
            built even if a variable contains no missing values, or is not
            being imputed (train_nonmissing must be set to True).

                - If None, all columns will be used as features in the
                  training of each model.
                - If list, all columns in data are used to impute the
                  variables in the list.
                - If dict, the values will be used to impute the keys. Can be
                  either column indices or names (if data is a pd.DataFrame).

            No models will be trained for variables not specified by
            variable_schema (either by None, a list, or in dict keys).

    imputation_order : str, list[str], list[int], default="ascending"

        .. code-block:: text

            The order in which the imputations should occur. If a string from
            the items below, all variables specified by variable_schema with
            missing data are imputed:

                ascending: variables are imputed from least to most missing
                descending: most to least missing
                roman: from left to right in the dataset
                arabic: from right to left in the dataset.

            If a list is provided:

                - the variables will be imputed in that order.
                - only variables with missing values should be included in the list.
                - must be a subset of variables specified by variable_schema.

            If a variable with missing values is in variable_schema, but not
            in imputation_order, then models to impute that variable will be
            trained, but the actual values will not be imputed. See examples
            for details.

    train_nonmissing : boolean

        .. code-block:: text

            Should models be trained for variables with no missing values?
            Useful if you expect you will need to impute new data which will
            have missing values, but the training data is fully observed.
            If True, parameters are interpreted like so:

                - models are run for all variables specified by variable_schema
                - if variable_schema is None, models are run for all variables
                - each iteration, models built for fully observed variables are
                  always trained after the models trained during mice.
                - imputation_order does not have any effect on fully observed
                  variable model training.

            WARNING: Setting this to True without specifying a variable schema
            will build models for all variables in the dataset, whether they
            have missing values or not. This may or may not be what you want.

    data_subset : None or int or float or dict

        .. code-block:: text

            Subsets the data used in each iteration, which can save a
            significant amount of time. This can also help with memory
            consumption, as the candidate data must be copied to make a
            feature dataset for lightgbm.

            The number of rows used for each variable is
            (# rows in raw data) - (# missing variable values) for each
            variable. data_subset takes a random sample of this.

            If float, must be 0.0 < data_subset <= 1.0. Interpreted as a
            percentage of available candidates.
            If int, must be data_subset >= 0. Interpreted as the number of
            candidates. If 0, no subsetting is done.
            If dict, keys must be variable names, and values must follow the
            two above rules.

            It is recommended to carefully select this value for each variable
            if dealing with very large data that barely fits into memory.

    mean_match_scheme : miceforest.MeanMatchScheme, default=None

        .. code-block:: text

            An instance of the miceforest.MeanMatchScheme class.

            If None is passed, a sensible default scheme is used. There are
            multiple helpful schemes that can be accessed from
            miceforest.builtin_mean_match_schemes, or you can build your own.

            A description of the defaults:

            - mean_match_default (default, if mean_match_scheme is None)

                This scheme has medium speed and accuracy for most data.

                Categorical:
                    If mmc = 0, the class with the highest probability is chosen.
                    If mmc > 0, get N nearest neighbors from class probabilities.
                        Select 1 at random.
                Numeric:
                    If mmc = 0, the predicted value is used.
                    If mmc > 0, obtain the mmc closest candidate predictions and
                        collect the associated real candidate values. Choose 1 randomly.

            - mean_match_shap

                This scheme is the most accurate, but takes the longest.
                It works the same as mean_match_default, except all nearest
                neighbor searches are performed on the shap values of the
                predictions, instead of the predictions themselves.

            - mean_match_scheme_fast_cat

                This scheme is faster for categorical variables, but may be
                less accurate as well.

                Categorical:
                    If mmc = 0, the class with the highest probability is chosen.
                    If mmc > 0, return class based on random draw weighted by
                        class probability for each sample.
                Numeric or binary:
                    If mmc = 0, the predicted value is used.
                    If mmc > 0, obtain the mmc closest candidate predictions and
                        collect the associated real candidate values. Choose 1 randomly.

    categorical_feature : str or list, default="auto"

        .. code-block:: text

            The categorical features in the dataset. This handling depends on
            the class of impute_data:

                pandas DataFrame:
                    - "auto": categorical information is inferred from any
                      columns with datatype category or object.
                    - list of column names (or indices): Useful if all
                      categorical columns have already been cast to numeric
                      encodings of some type, otherwise you should just use
                      "auto". Will throw an error if a list is provided AND
                      categorical dtypes exist in data. If a list is provided,
                      values in the columns must be consecutive integers
                      starting at 0, as required by lightgbm.

                numpy ndarray:
                    - "auto": no categorical information is stored.
                    - list of column indices: Specified columns are treated as
                      categorical. Column values must be consecutive integers
                      starting at 0, as required by lightgbm.

    initialization : str

        .. code-block:: text

            "random" - missing values will be filled in randomly from existing values.
            "empty" - lightgbm will start MICE without initial imputation.

    save_all_iterations : boolean, optional (default=True)

        .. code-block:: text

            Save all the imputation values from all iterations, or just the
            latest. Saving all iterations allows for additional plotting, but
            may take more memory.

    save_models : int

        .. code-block:: text

            Which models should be saved:
                = 0: no models are saved. Cannot get feature importance or
                    impute new data.
                = 1: only the last model iteration is saved. Can only get
                    feature importance of the last iteration. New data is
                    imputed using the last model for all specified iterations.
                    This is only an issue if data is heavily Missing At Random.
                = 2: all model iterations are saved. Can get feature importance
                    for any iteration. When imputing new data, each iteration
                    is imputed using the model obtained at that iteration in
                    mice. This allows for imputations that most closely
                    resemble those that would have been obtained in mice.

    copy_data : boolean (default = True)

        .. code-block:: text

            Should a copy of the data be made? If False, the original dataset
            is referenced directly and will be altered in place. If a copy is
            created, it is saved in self.working_data. There are different
            ways in which the dataset can be altered:

            1) complete_data() will fill in missing values
            2) To save space, mice() references and manipulates
               self.working_data directly. If self.working_data is a reference
               to the original dataset, the original dataset will undergo
               these manipulations during the mice process. At the end of the
               mice process, missing values will be set back to np.NaN where
               they were originally missing.

    save_loggers : boolean (default = False)

        .. code-block:: text

            A logger is created each time mice() or impute_new_data() is
            called. If True, the loggers are stored in a list
            ImputationKernel.loggers. If you wish to start saving logs, call
            ImputationKernel.start_logging(). If you wish to stop saving logs,
            call ImputationKernel.stop_logging().

    random_state : None, int, or numpy.random.RandomState

        .. code-block:: text

            The random_state ensures script reproducibility. It only ensures
            reproducible results if the same script is called multiple times.
            It does not guarantee reproducible results at the record level if
            a record is imputed multiple different times. If reproducible
            record-results are desired, a seed must be passed for each record
            in the random_seed_array parameter.
    """
    def __init__(
        self,
        data: _t_dat,
        datasets: int = 1,
        variable_schema: Union[_t_var_list, _t_var_dict, None] = None,
        imputation_order: Union[str, _t_var_list] = "ascending",
        train_nonmissing: bool = False,
        mean_match_scheme: Optional[MeanMatchScheme] = None,
        data_subset: Union[int, float, _t_var_sub, None] = None,
        categorical_feature: Union[str, _t_var_list] = "auto",
        initialization: str = "random",
        save_all_iterations: bool = True,
        save_models: int = 1,
        copy_data: bool = True,
        save_loggers: bool = False,
        random_state: _t_random_state = None,
    ):
        super().__init__(
            impute_data=data,
            datasets=datasets,
            variable_schema=variable_schema,
            imputation_order=imputation_order,
            train_nonmissing=train_nonmissing,
            categorical_feature=categorical_feature,
            save_all_iterations=save_all_iterations,
            copy_data=copy_data,
        )

        self.initialization = initialization
        self.train_nonmissing = train_nonmissing
        self.save_models = save_models
        self.save_loggers = save_loggers
        self.loggers: List[Logger] = []
        self.models: Dict[Any, Booster] = {}
        self.candidate_preds: Dict[Any, np.ndarray] = {}
        self.optimal_parameters: Dict[Any, Any] = {
            ds: {var: {} for var in self.variable_training_order}
            for ds in range(datasets)
        }
        self.optimal_parameter_losses: Dict[Any, Any] = {
            ds: {var: np.Inf for var in self.variable_training_order}
            for ds in range(datasets)
        }

        # Format data_subset and available_candidates
        available_candidates = {
            v: (self.data_shape[0] - self.na_counts[v])
            for v in self.variable_training_order
        }
        data_subset = _DEFAULT_DATA_SUBSET if data_subset is None else data_subset
        if not isinstance(data_subset, dict):
            data_subset = {v: data_subset for v in self.variable_training_order}
        if set(data_subset) != set(self.variable_training_order):
            # Change variable names to indices
            for v in list(data_subset):
                data_subset[self._get_var_ind_from_scalar(v)] = data_subset.pop(v)
            ds_supplement = {
                v: _DEFAULT_DATA_SUBSET
                for v in self.variable_training_order
                if v not in data_subset.keys()
            }
            data_subset.update(ds_supplement)
        for v, ds in data_subset.items():
            assert v in self.variable_training_order, (
                f"Variable {self._get_var_name_from_scalar(v)} will not have a model trained "
                + "but it is in data_subset."
            )
            data_subset[v] = _interpret_ds(data_subset[v], available_candidates[v])

        self.available_candidates = available_candidates
        self.data_subset = data_subset

        # Get mean matching function
        if mean_match_scheme is None:
            from .builtin_mean_match_schemes import mean_match_default

            self.mean_match_scheme = mean_match_default.copy()
        else:
            assert isinstance(mean_match_scheme, MeanMatchScheme)
            self.mean_match_scheme = mean_match_scheme.copy()

        # Format and run through mean match candidate checks.
        self.mean_match_scheme._format_mean_match_candidates(data, available_candidates)

        # Ensure mmc and mms make sense:
        # mmc <= mms <= available candidates for each var
        for v in self.imputation_order:
            mmc = self.mean_match_scheme.mean_match_candidates[v]
            assert mmc <= data_subset[v], f"{v} mean_match_candidates > data_subset"
            assert (
                data_subset[v] <= available_candidates[v]
            ), f"{v} data_subset > available candidates"

        # Make sure all pandas categorical levels are used.
        rare_levels = []
        for cat in self.categorical_variables:
            cat_name = self._get_var_name_from_scalar(cat)
            cat_dat = self._get_nonmissing_values(cat)
            cat_levels, cat_count = np.unique(cat_dat, return_counts=True)
            cat_dtype = cat_dat.dtype
            if cat_dtype.name == "category":
                levels_in_data = set(cat_levels)
                levels_in_catdt = set(cat_dtype.categories)
                levels_not_in_data = levels_in_catdt - levels_in_data
                assert (
                    len(levels_not_in_data) == 0
                ), f"{cat_name} has unused categories: {','.join(levels_not_in_data)}"

            if any(cat_count / cat_count.sum() < 0.002):
                rare_levels.append(cat_name)

        if len(rare_levels) > 0:
            warn(
                f"[{','.join(rare_levels)}] have very rare categories, it is a good "
                "idea to group these, or set the min_data_in_leaf parameter to prevent "
                "lightgbm from outputting 0.0 probabilities."
            )

        # Manage randomness
        self._completely_random_kernel = random_state is None
        self._random_state = ensure_rng(random_state)

        # Set initial imputations (iteration 0).
        self._initialize_dataset(
            self, random_state=self._random_state, random_seed_array=None
        )
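# --- Editor's illustrative sketch (not part of the class): creating a kernel ---
# A minimal construction example. The DataFrame and column names below are
# hypothetical; any pandas DataFrame or numpy ndarray with np.nan marking the
# missing values works the same way.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df.loc[df.sample(frac=0.2, random_state=0).index, "x1"] = np.nan  # add missingness

kernel = mf.ImputationKernel(
    df,
    datasets=4,               # four independently imputed datasets
    data_subset=0.75,         # each model trains on 75% of available candidates
    save_all_iterations=True,
    random_state=1991,
)
# --- end sketch ---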
def __repr__(self): summary_string = f'\n{" " * 14}Class: ImputationKernel\n{self._ids_info()}' return summary_string # def _mm_type_handling(self, mm, available_candidates) -> int: # if isinstance(mm, float): # # assert (mm > 0.0) and ( # mm <= 1.0 # ), "mean_matching must be < 0.0 and >= 1.0 if a float" # # ret = int(mm * available_candidates) # # elif isinstance(mm, int): # # assert mm >= 0, "mean_matching must be above 0 if an int is passed." # ret = mm # # else: # # raise ValueError( # "mean_match_candidates type not recognized. " # + "Any supplied values must be a 0.0 < float <= 1.0 or int >= 1" # ) # # return ret def _initialize_random_seed_array(self, random_seed_array, expected_shape): """ Formats and takes the first hash of the random_seed_array. """ # Format random_seed_array if it was passed. if random_seed_array is not None: if self._completely_random_kernel: warn( """ This kernel is completely random (no random_state was provided on initialization). Values imputed using ThisKernel.impute_new_data() will be deterministic, however the kernel itself is non-reproducible. """ ) assert isinstance(random_seed_array, np.ndarray) assert ( random_seed_array.dtype == "int32" ), "random_seed_array must be a np.ndarray of type int32" assert ( random_seed_array.shape[0] == expected_shape ), "random_seed_array must be the same length as data." random_seed_array = hash_int32(random_seed_array) else: random_seed_array = None return random_seed_array def _iter_pairs(self, new_iterations): """ Returns the absolute and relative iterations that are going to be run for a given function call. """ current_iters = self.iteration_count() iter_pairs = [(current_iters + i + 1, i + 1) for i in range(new_iterations)] return iter_pairs def _initialize_dataset(self, imputed_data, random_state, random_seed_array): """ Sets initial imputation values for iteration 0. If "random", draw values from the kernel at random. If "empty", keep the values missing, since missing values can be handled natively by lightgbm. """ assert not imputed_data.initialized, "dataset has already been initialized" if self.initialization == "random": for var in imputed_data.imputation_order: kernel_nonmissing_ind = self._get_nonmissing_indx(var) candidate_values = _subset_data( self.working_data, kernel_nonmissing_ind, var, return_1d=True ) n_candidates = kernel_nonmissing_ind.shape[0] missing_ind = imputed_data.na_where[var] for ds in range(imputed_data.dataset_count()): # Initialize using the random_state if no record seeds were passed. if random_seed_array is None: imputed_data[ds, var, 0] = random_state.choice( candidate_values, size=imputed_data.na_counts[var], replace=True, ) else: assert ( len(random_seed_array) == imputed_data.data_shape[0] ), "The random_seed_array did not match the number of rows being imputed." selection_ind = random_seed_array[missing_ind] % n_candidates init_imps = candidate_values[selection_ind] imputed_data[ds, var, 0] = np.array(init_imps) random_seed_array[missing_ind] = hash_int32( random_seed_array[missing_ind] ) elif self.initialization == "empty": for var in imputed_data.imputation_order: for ds in range(imputed_data.dataset_count()): # Saves space, since np.nan will be broadcast. imputed_data[ds, var, 0] = np.array([np.nan]) else: raise ValueError("initialization parameter not recognized.") imputed_data.initialized = True def _reconcile_parameters(self, defaults, user_supplied): """ Checks in user_supplied for aliases of each parameter in defaults. 
Combines the dicts once the aliases have been reconciled. """ params = defaults.copy() for par, val in defaults.items(): alias_names = _ConfigAliases.get(par) user_supplied_aliases = [ i for i in alias_names if i in list(user_supplied) and i != par ] if len(user_supplied_aliases) == 0: continue elif len(user_supplied_aliases) == 1: params[par] = user_supplied.pop(user_supplied_aliases[0]) else: raise ValueError( f"Supplied 2 aliases for the same parameter: {user_supplied_aliases}" ) params.update(user_supplied) return params def _format_variable_parameters( self, variable_parameters: Optional[Dict] ) -> Dict[int, Any]: """ Unpacking will expect an empty dict at a minimum. This function collects parameters if they were provided, and returns empty dicts if they weren't. """ if variable_parameters is None: vsp: Dict[int, Any] = {var: {} for var in self.variable_training_order} else: for variable in list(variable_parameters): variable_parameters[ self._get_var_ind_from_scalar(variable) ] = variable_parameters.pop(variable) vsp_vars = set(variable_parameters) assert vsp_vars.issubset( self.variable_training_order ), "Some variable_parameters are not associated with models being trained." vsp = { var: variable_parameters[var] if var in vsp_vars else {} for var in self.variable_training_order } return vsp def _get_lgb_params(self, var, vsp, random_state, **kwlgb): """ Builds the parameters for a lightgbm model. Infers objective based on datatype of the response variable, assigns a random seed, finds aliases in the user supplied parameters, and returns a final dict. Parameters ---------- var: int The variable to be modeled vsp: dict Variable specific parameters. These are supplied by the user. random_state: np.random.RandomState The random state to use (used to set the seed). kwlgb: dict Any additional parameters that should take presidence over the defaults or user supplied. """ seed = _draw_random_int32(random_state, size=1)[0] if var in self.categorical_variables: n_c = self.category_counts[var] if n_c > 2: obj = {"objective": "multiclass", "num_class": n_c} else: obj = {"objective": "binary"} else: obj = {"objective": "regression"} default_lgb_params = {**default_parameters, **obj, "seed": seed} # Priority is [variable specific] > [global in kwargs] > [defaults] params = self._reconcile_parameters(default_lgb_params, kwlgb) params = self._reconcile_parameters(params, vsp) return params def _get_random_sample(self, parameters, random_state): """ Searches through a parameter set and selects a random number between the values in any provided tuple of length 2. """ parameters = parameters.copy() for p, v in parameters.items(): if hasattr(v, "__iter__"): if isinstance(v, list): parameters[p] = random_state.choice(v) elif isinstance(v, tuple): parameters[p] = random_state.uniform(v[0], v[1], size=1)[0] else: pass parameters = self._make_params_digestible(parameters) return parameters def _make_params_digestible(self, params): """ Cursory checks to force parameters to be digestible """ int_params = [ "num_leaves", "min_data_in_leaf", "num_threads", "max_depth", "num_iterations", "bagging_freq", "max_drop", "min_data_per_group", "max_cat_to_onehot", ] params = { key: int(val) if key in int_params else val for key, val in params.items() } return params def _get_oof_performance( self, parameters, folds, train_pointer, categorical_feature ): """ Performance is gathered from built-in lightgbm.cv out of fold metric. Optimal number of iterations is also obtained. 
""" num_iterations = parameters.pop("num_iterations") lgbcv = cv( params=parameters, train_set=train_pointer, folds=folds, num_boost_round=num_iterations, categorical_feature=categorical_feature, return_cvbooster=True, callbacks=[ early_stopping(stopping_rounds=10, verbose=False), log_evaluation(period=0), ], ) best_iteration = lgbcv["cvbooster"].best_iteration loss_metric_key = list(lgbcv)[0] loss = np.min(lgbcv[loss_metric_key]) return loss, best_iteration def _get_nonmissing_values(self, variable): """ Returns the non-missing values of a column. """ var_indx = self._get_var_ind_from_scalar(variable) nonmissing_index = self._get_nonmissing_indx(variable) candidate_values = _subset_data( self.working_data, row_ind=nonmissing_index, col_ind=var_indx ) return candidate_values def _get_candidate_subset(self, variable, subset_count, random_seed): """ Returns a reproducible subset index of the non-missing values for a given variable. """ var_indx = self._get_var_ind_from_scalar(variable) nonmissing_index = self._get_nonmissing_indx(var_indx) # Get the subset indices if subset_count < len(nonmissing_index): candidate_values = _subset_data( self.working_data, row_ind=nonmissing_index, col_ind=var_indx ) candidates = candidate_values.shape[0] groups = max(10, int(candidates / 1000)) ss = stratified_subset( y=candidate_values, size=subset_count, groups=groups, cat=var_indx in self.categorical_variables, seed=random_seed, ) candidate_subset = nonmissing_index[ss] else: candidate_subset = nonmissing_index return candidate_subset def _make_label(self, variable, subset_count, random_seed): """ Returns a reproducible subset of the non-missing values of a variable. """ var_indx = self._get_var_ind_from_scalar(variable) candidate_subset = self._get_candidate_subset( var_indx, subset_count, random_seed ) label = _subset_data( self.working_data, row_ind=candidate_subset, col_ind=var_indx ) return label def _make_features_label(self, variable, subset_count, random_seed): """ Makes a reproducible set of features and target needed to train a lightgbm model. """ var_indx = self._get_var_ind_from_scalar(variable) candidate_subset = self._get_candidate_subset( var_indx, subset_count, random_seed ) xvars = self.variable_schema[var_indx] ret_cols = sorted(xvars + [var_indx]) features = _subset_data( self.working_data, row_ind=candidate_subset, col_ind=ret_cols ) if self.original_data_class == "pd_DataFrame": y_name = self._get_var_name_from_scalar(var_indx) label = features.pop(y_name) elif self.original_data_class == "np_ndarray": y_col = ret_cols.index(var_indx) label = features[:, y_col].copy() features = np.delete(features, y_col, axis=1) x_cat = [xvars.index(var) for var in self.categorical_variables if var in xvars] return features, label, x_cat
    def append(self, imputation_kernel):
        """
        Combine two imputation kernels together.
        For compatibility, the following attributes of each must be equal:

            - working_data
            - iteration_count
            - categorical_feature
            - mean_match_scheme
            - variable_schema
            - imputation_order
            - save_models
            - save_all_iterations

        Only cursory checks are done to ensure working_data is equal.
        Appending a kernel with different working_data could ruin this kernel.

        Parameters
        ----------
        imputation_kernel: ImputationKernel
            The kernel to merge.
        """
        _assert_dataset_equivalent(self.working_data, imputation_kernel.working_data)
        assert self.iteration_count() == imputation_kernel.iteration_count()
        assert self.variable_schema == imputation_kernel.variable_schema
        assert self.imputation_order == imputation_kernel.imputation_order
        assert self.variable_training_order == imputation_kernel.variable_training_order
        assert self.categorical_feature == imputation_kernel.categorical_feature
        assert self.save_models == imputation_kernel.save_models
        assert self.save_all_iterations == imputation_kernel.save_all_iterations
        assert (
            self.mean_match_scheme.objective_pred_dtypes
            == imputation_kernel.mean_match_scheme.objective_pred_dtypes
        )
        assert (
            self.mean_match_scheme.objective_pred_funcs
            == imputation_kernel.mean_match_scheme.objective_pred_funcs
        )
        assert (
            self.mean_match_scheme.objective_args
            == imputation_kernel.mean_match_scheme.objective_args
        )
        assert (
            self.mean_match_scheme.mean_match_candidates
            == imputation_kernel.mean_match_scheme.mean_match_candidates
        )

        current_datasets = self.dataset_count()
        new_datasets = imputation_kernel.dataset_count()

        for key, model in imputation_kernel.models.items():
            new_ds_indx = key[0] + current_datasets
            insert_key = new_ds_indx, key[1], key[2]
            self.models[insert_key] = model

        for key, cp in imputation_kernel.candidate_preds.items():
            new_ds_indx = key[0] + current_datasets
            insert_key = new_ds_indx, key[1], key[2]
            self.candidate_preds[insert_key] = cp

        for key, iv in imputation_kernel.imputation_values.items():
            new_ds_indx = key[0] + current_datasets
            self[new_ds_indx, key[1], key[2]] = iv

        # Combine dicts
        for ds in range(new_datasets):
            insert_index = current_datasets + ds
            self.optimal_parameters[
                insert_index
            ] = imputation_kernel.optimal_parameters[ds]
            self.optimal_parameter_losses[
                insert_index
            ] = imputation_kernel.optimal_parameter_losses[ds]

        # Append iterations
        self.iterations = np.append(
            self.iterations, imputation_kernel.iterations, axis=0
        )
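# --- Editor's illustrative sketch (not part of the class): merging kernels ---
# append() is typically used to combine kernels that were run separately (for
# example, in parallel processes) on the same data. Everything below is a
# hypothetical example.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(1)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df.loc[df.sample(frac=0.2, random_state=1).index, "x1"] = np.nan

kernel_a = mf.ImputationKernel(df, datasets=2, random_state=10)
kernel_b = mf.ImputationKernel(df, datasets=2, random_state=20)
kernel_a.mice(2)
kernel_b.mice(2)

# kernel_a now holds 4 datasets: its own 2 plus the 2 taken from kernel_b.
kernel_a.append(kernel_b)
# --- end sketch ---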
    def compile_candidate_preds(self):
        """
        Candidate predictions can be pre-generated before imputing new data.
        This can save a substantial amount of time, especially if save_models == 1.
        """
        compile_objectives = (
            self.mean_match_scheme.get_objectives_requiring_candidate_preds()
        )

        for key, model in self.models.items():
            already_compiled = key in self.candidate_preds.keys()
            objective = model.params["objective"]
            if objective in compile_objectives and not already_compiled:
                var = key[1]
                candidate_features, _, _ = self._make_features_label(
                    variable=var,
                    subset_count=self.data_subset[var],
                    random_seed=model.params["seed"],
                )
                self.candidate_preds[key] = self.mean_match_scheme.model_predict(
                    model, candidate_features
                )

            else:
                continue
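# --- Editor's illustrative sketch (not part of the class): compiling preds ---
# Pre-compiling candidate predictions once avoids recomputing them every time
# impute_new_data() is called. The kernel and new_df below are hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(2)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df.loc[df.sample(frac=0.2, random_state=2).index, "x1"] = np.nan

kernel = mf.ImputationKernel(df, datasets=1, random_state=3)
kernel.mice(2)
kernel.compile_candidate_preds()            # predictions are stored on the kernel

new_df = df.sample(50, random_state=4).reset_index(drop=True)
imputed = kernel.impute_new_data(new_df)    # reuses the compiled predictions
# --- end sketch ---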
    def delete_candidate_preds(self):
        """
        Deletes the pre-computed candidate predictions.
        """
        self.candidate_preds = {}
    def fit(self, X, y, **fit_params):
        """
        Method for fitting a kernel when used in a sklearn pipeline.
        Should not be called by the user directly.
        """
        assert self.dataset_count() == 1, (
            "miceforest kernel should be initialized with datasets=1 if "
            + "being used in a sklearn pipeline."
        )
        _assert_dataset_equivalent(self.working_data, X), (
            "The dataset passed to fit() was not the same as the "
            "dataset passed to ImputationKernel()"
        )
        self.mice(**fit_params)
        return self
    def get_model(
        self, dataset: int, variable: Union[str, int], iteration: Optional[int] = None
    ):
        """
        Return the model for a specific dataset, variable, iteration.

        Parameters
        ----------
        dataset: int
            The dataset to return the model for.
        variable: str or int
            The variable that was imputed.
        iteration: int
            The model iteration to return. Keep in mind if save_models == 1,
            the model was not saved. If None is provided, the latest model
            is returned.

        Returns: lightgbm.Booster
            The model used to impute this specific variable, iteration.
        """
        var_indx = self._get_var_ind_from_scalar(variable)
        itrn = (
            self.iteration_count(datasets=dataset, variables=var_indx)
            if iteration is None
            else iteration
        )
        try:
            return self.models[dataset, var_indx, itrn]
        except Exception:
            raise ValueError("Could not find model.")
    def get_raw_prediction(
        self,
        variable: Union[int, str],
        imp_dataset: int = 0,
        imp_iteration: Optional[int] = None,
        model_dataset: Optional[int] = None,
        model_iteration: Optional[int] = None,
        dtype: Union[str, np.dtype, None] = None,
    ):
        """
        Get the raw model output for a specific variable.

        The data is pulled from the imp_dataset dataset, at the imp_iteration
        iteration. The model is pulled from the model_dataset dataset, at the
        model_iteration iteration.

        So, for example, it is possible to get predictions using the imputed
        values for dataset 3, at iteration 2, using the model obtained from
        dataset 10, at iteration 6. This is assuming desired iterations and
        models have been saved.

        Parameters
        ----------
        variable: int or str
            The variable to get the raw predictions for.
            Can be an index or variable name.
        imp_dataset: int
            The imputation dataset to use when creating the feature dataset.
        imp_iteration: int
            The iteration from which to draw the imputation values when
            creating the feature dataset. If None, the latest iteration is used.
        model_dataset: int
            The dataset from which to pull the trained model for this variable.
            If None, it is selected to be the same as imp_dataset.
        model_iteration: int
            The iteration from which to pull the trained model for this variable.
            If None, it is selected to be the same as imp_iteration.
        dtype: str, np.dtype
            The datatype to cast the raw prediction as.
            Passed to MeanMatchScheme.model_predict().

        Returns
        -------
        np.ndarray of raw predictions.
        """
        var_indx = self._get_var_ind_from_scalar(variable)
        predictor_variables = self.variable_schema[var_indx]

        # Get the latest imputation iteration if imp_iteration was not specified
        if imp_iteration is None:
            imp_iteration = self.iteration_count(
                datasets=imp_dataset, variables=var_indx
            )

        # If model dataset / iteration wasn't specified, assume it is from the same
        # dataset / iteration we are pulling the imputation values from
        model_iteration = imp_iteration if model_iteration is None else model_iteration
        model_dataset = imp_dataset if model_dataset is None else model_dataset

        # Get our internal dataset ready
        self.complete_data(dataset=imp_dataset, iteration=imp_iteration, inplace=True)

        features = _subset_data(self.working_data, col_ind=predictor_variables)
        model = self.get_model(model_dataset, var_indx, iteration=model_iteration)
        preds = self.mean_match_scheme.model_predict(model, features, dtype=dtype)

        return preds
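# --- Editor's illustrative sketch (not part of the class): raw predictions ---
# Pull the raw (pre mean-matching) model output for one variable. The kernel,
# data, and column names below are hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(5)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df.loc[df.sample(frac=0.2, random_state=5).index, "x1"] = np.nan

kernel = mf.ImputationKernel(df, datasets=2, random_state=6)
kernel.mice(3)

# Raw predictions for every row of the working data, using dataset 1 at its
# latest iteration (the model from the same dataset / iteration is used).
raw = kernel.get_raw_prediction("x1", imp_dataset=1)
print(raw.shape)
# --- end sketch ---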
[docs] def mice( self, iterations=2, verbose=False, variable_parameters=None, compile_candidates=False, **kwlgb, ): """ Perform mice on a given dataset. Multiple Imputation by Chained Equations (MICE) is an iterative method which fills in (imputes) missing data points in a dataset by modeling each column using the other columns, and then inferring the missing data. For more information on MICE, and missing data in general, see Stef van Buuren's excellent online book: https://stefvanbuuren.name/fimd/ch-introduction.html For detailed usage information, see this project's README on the github repository: https://github.com/AnotherSamWilson/miceforest Parameters ---------- iterations: int The number of iterations to run. verbose: bool Should information about the process be printed? variable_parameters: None or dict Model parameters can be specified by variable here. Keys should be variable names or indices, and values should be a dict of parameter which should apply to that variable only. compile_candidates: bool Candidate predictions can be stored as they are created while performing mice. This prevents kernel.compile_candidate_preds() from having to be called separately, and can save a significant amount of time if compiled candidate predictions are desired. kwlgb: Additional arguments to pass to lightgbm. Applied to all models. """ __MICE_TIMED_EVENTS = ["prepare_xy", "training", "predict", "mean_matching"] iter_pairs = self._iter_pairs(iterations) # Delete models and candidate_preds if we shouldn't be saving every iteration if self.save_models < 2: self.models = {} self.candidate_preds = {} logger = Logger( name=f"mice {str(iter_pairs[0][0])}-{str(iter_pairs[-1][0])}", verbose=verbose, ) vsp = self._format_variable_parameters(variable_parameters) for ds in range(self.dataset_count()): logger.log("Dataset " + str(ds)) # set self.working_data to the most current iteration. self.complete_data(dataset=ds, inplace=True) last_iteration = False for iter_abs, iter_rel in iter_pairs: logger.log(str(iter_abs) + " ", end="") if iter_rel == iterations: last_iteration = True save_model = self.save_models == 2 or ( last_iteration and self.save_models == 1 ) for variable in self.variable_training_order: var_name = self._get_var_name_from_scalar(variable) logger.log(" | " + var_name, end="") predictor_variables = self.variable_schema[variable] data_subset = self.data_subset[variable] nawhere = self.na_where[variable] log_context = { "dataset": ds, "variable_name": var_name, "iteration": iter_abs, } # Define the lightgbm parameters lgbpars = self._get_lgb_params( variable, vsp[variable], self._random_state, **kwlgb ) objective = lgbpars["objective"] # These are necessary for building model in mice. logger.set_start_time() ( candidate_features, candidate_values, feature_cat_index, ) = self._make_features_label( variable=variable, subset_count=data_subset, random_seed=lgbpars["seed"], ) if ( self.original_data_class == "pd_DataFrame" or len(feature_cat_index) == 0 ): feature_cat_index = "auto" # lightgbm requires integers for label. Categories won't work. 
if candidate_values.dtype.name == "category": candidate_values = candidate_values.cat.codes num_iterations = lgbpars.pop("num_iterations") train_pointer = Dataset( data=candidate_features, label=candidate_values, categorical_feature=feature_cat_index, ) logger.record_time(timed_event="prepare_xy", **log_context) logger.set_start_time() current_model = train( params=lgbpars, train_set=train_pointer, num_boost_round=num_iterations, categorical_feature=feature_cat_index, ) logger.record_time(timed_event="training", **log_context) if save_model: self.models[ds, variable, iter_abs] = current_model # Only perform mean matching and insertion # if variable is being imputed. if variable in self.imputation_order: mean_match_args = self.mean_match_scheme.get_mean_match_args( objective ) # Start creating kwargs for mean matching function mm_kwargs = {} if "lgb_booster" in mean_match_args: mm_kwargs["lgb_booster"] = current_model if {"bachelor_preds", "bachelor_features"}.intersection( mean_match_args ): logger.set_start_time() bachelor_features = _subset_data( self.working_data, row_ind=nawhere, col_ind=predictor_variables, ) logger.record_time(timed_event="prepare_xy", **log_context) if "bachelor_features" in mean_match_args: mm_kwargs["bachelor_features"] = bachelor_features if "bachelor_preds" in mean_match_args: logger.set_start_time() bachelor_preds = self.mean_match_scheme.model_predict( current_model, bachelor_features ) logger.record_time(timed_event="predict", **log_context) mm_kwargs["bachelor_preds"] = bachelor_preds if "candidate_values" in mean_match_args: mm_kwargs["candidate_values"] = candidate_values if "candidate_features" in mean_match_args: mm_kwargs["candidate_features"] = candidate_features # Calculate the candidate predictions if # the mean matching function calls for it if "candidate_preds" in mean_match_args: logger.set_start_time() candidate_preds = self.mean_match_scheme.model_predict( current_model, candidate_features ) logger.record_time(timed_event="predict", **log_context) mm_kwargs["candidate_preds"] = candidate_preds if compile_candidates and save_model: self.candidate_preds[ ds, variable, iter_abs ] = candidate_preds if "random_state" in mean_match_args: mm_kwargs["random_state"] = self._random_state # Hashed seeds are only to ensure record reproducibility # for impute_new_data(). if "hashed_seeds" in mean_match_args: mm_kwargs["hashed_seeds"] = None logger.set_start_time() imp_values = self.mean_match_scheme._mean_match( variable, objective, **mm_kwargs ) logger.record_time(timed_event="mean_matching", **log_context) assert imp_values.shape == ( self.na_counts[variable], ), f"{variable} mean matching returned malformed array" # Updates our working data and saves the imputations. self._insert_new_data( dataset=ds, variable_index=variable, new_data=imp_values ) self.iterations[ ds, self.variable_training_order.index(variable) ] += 1 logger.log("\n", end="") self._ampute_original_data() if self.save_loggers: self.loggers.append(logger)
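# --- Editor's illustrative sketch (not part of the class): running mice() ---
# Shows a global lightgbm parameter (passed through **kwlgb, using the
# n_estimators alias for num_iterations) plus a variable-specific override.
# The data and column names below are hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(7)
df = pd.DataFrame(
    {
        "age": rng.normal(40, 10, size=300),
        "income": rng.normal(50_000, 10_000, size=300),
        "score": rng.normal(size=300),
    }
)
for col in ["age", "income"]:
    df.loc[df.sample(frac=0.15, random_state=8).index, col] = np.nan

kernel = mf.ImputationKernel(df, datasets=3, random_state=9)
kernel.mice(
    iterations=3,
    verbose=False,
    n_estimators=100,                                     # applied to every model
    variable_parameters={"income": {"num_leaves": 15}},   # only for 'income'
)
completed = kernel.complete_data(dataset=0)
# --- end sketch ---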
    def transform(self, X, y=None):
        """
        Method for calling a kernel when used in a sklearn pipeline.
        Should not be called by the user directly.
        """
        new_dat = self.impute_new_data(X, datasets=[0])
        return new_dat.complete_data(dataset=0, inplace=False)
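# --- Editor's illustrative sketch (not part of the class): sklearn pipeline ---
# fit() above runs mice() on the training data and transform() imputes via
# impute_new_data(), so the kernel can sit inside a Pipeline. datasets must be 1.
# Assumes scikit-learn is installed; the data below is hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

rng = np.random.default_rng(10)
X = pd.DataFrame({"x1": rng.normal(size=300), "x2": rng.normal(size=300)})
y = X["x1"] * 2.0 + rng.normal(size=300)
X.loc[X.sample(frac=0.2, random_state=11).index, "x1"] = np.nan

pipe = Pipeline(
    [
        ("impute", mf.ImputationKernel(X, datasets=1, random_state=12)),
        ("model", LinearRegression()),
    ]
)
pipe.fit(X, y, impute__iterations=2)   # routed to ImputationKernel.fit -> mice()
preds = pipe.predict(X)                # predict() imputes X first via transform()
# --- end sketch ---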
[docs] def tune_parameters( self, dataset: int, variables: Union[List[int], List[str], None] = None, variable_parameters: Optional[Dict[Any, Any]] = None, parameter_sampling_method: str = "random", nfold: int = 10, optimization_steps: int = 5, random_state: _t_random_state = None, verbose: bool = False, **kwbounds, ): """ Perform hyperparameter tuning on models at the current iteration. .. code-block:: text A few notes: - Underlying models will now be gradient boosted trees by default (or any other boosting type compatible with lightgbm.cv). - The parameters are tuned on the data that would currently be returned by complete_data(dataset). It is usually a good idea to run at least 1 iteration of mice with the default parameters to get a more accurate idea of the real optimal parameters, since Missing At Random (MAR) data imputations tend to converge over time. - num_iterations is treated as the maximum number of boosting rounds to run in lightgbm.cv. It is NEVER optimized. The num_iterations that is returned is the best_iteration returned by lightgbm.cv. num_iterations can be passed to limit the boosting rounds, but the returned value will always be obtained from best_iteration. - lightgbm parameters are chosen in the following order of priority: 1) Anything specified in variable_parameters 2) Parameters specified globally in **kwbounds 3) Default tuning space (miceforest.default_lightgbm_parameters.make_default_tuning_space) 4) Default parameters (miceforest.default_lightgbm_parameters.default_parameters) - See examples for a detailed run-through. See https://github.com/AnotherSamWilson/miceforest#Tuning-Parameters for even more detailed examples. Parameters ---------- dataset: int (required) .. code-block:: text The dataset to run parameter tuning on. Tuning parameters on 1 dataset usually results in acceptable parameters for all datasets. However, tuning results are still stored seperately for each dataset. variables: None or list .. code-block:: text - If None, default hyper-parameter spaces are selected based on kernel data, and all variables with missing values are tuned. - If list, must either be indexes or variable names corresponding to the variables that are to be tuned. variable_parameters: None or dict .. code-block:: text Defines the tuning space. Dict keys must be variable names or indices, and a subset of the variables parameter. Values must be a dict with lightgbm parameter names as keys, and values that abide by the following rules: scalar: If a single value is passed, that parameter will be used to build the model, and will not be tuned. tuple: If a tuple is passed, it must have length = 2 and will be interpreted as the bounds to search within for that parameter. list: If a list is passed, values will be randomly selected from the list. NOTE: This is only possible with method = 'random'. example: If you wish to tune the imputation model for the 4th variable with specific bounds and parameters, you could pass: variable_parameters = { 4: { 'learning_rate: 0.01', 'min_sum_hessian_in_leaf: (0.1, 10), 'extra_trees': [True, False] } } All models for variable 4 will have a learning_rate = 0.01. The process will randomly search within the bounds (0.1, 10) for min_sum_hessian_in_leaf, and extra_trees will be randomly selected from the list. Also note, the variable name for the 4th column could also be passed instead of the integer 4. All other variables will be tuned with the default search space, unless **kwbounds are passed. parameter_sampling_method: str .. 
code-block:: text If 'random', parameters are randomly selected. Other methods will be added in future releases. nfold: int .. code-block:: text The number of folds to perform cross validation with. More folds takes longer, but Gives a more accurate distribution of the error metric. optimization_steps: .. code-block:: text How many steps to run the process for. random_state: int or np.random.RandomState or None (default=None) .. code-block:: text The random state of the process. Ensures reproduceability. If None, the random state of the kernel is used. Beware, this permanently alters the random state of the kernel and ensures non-reproduceable results, unless the entire process up to this point is re-run. kwbounds: .. code-block:: text Any additional arguments that you want to apply globally to every variable. For example, if you want to limit the number of iterations, you could pass num_iterations = x to this functions, and it would apply globally. Custom bounds can also be passed. Returns ------- 2 dicts: optimal_parameters, optimal_parameter_losses - optimal_parameters: dict A dict of the optimal parameters found for each variable. This can be passed directly to the variable_parameters parameter in mice() .. code-block:: text {variable: {parameter_name: parameter_value}} - optimal_parameter_losses: dict The average out of fold cv loss obtained directly from lightgbm.cv() associated with the optimal parameter set. .. code-block:: text {variable: loss} """ if random_state is None: random_state = self._random_state else: random_state = ensure_rng(random_state) if variables is None: variables = self.imputation_order else: variables = self._get_var_ind_from_list(variables) self.complete_data(dataset, inplace=True) logger = Logger( name=f"tune: {optimization_steps}", verbose=verbose, ) vsp = self._format_variable_parameters(variable_parameters) variable_parameter_space = {} for var in variables: default_tuning_space = make_default_tuning_space( self.category_counts[var] if var in self.categorical_variables else 1, int((self.data_shape[0] - len(self.na_where[var])) / 10), ) variable_parameter_space[var] = self._get_lgb_params( var=var, vsp={**kwbounds, **vsp[var]}, random_state=random_state, **default_tuning_space, ) if parameter_sampling_method == "random": for var, parameter_space in variable_parameter_space.items(): logger.log(self._get_var_name_from_scalar(var) + " | ", end="") ( candidate_features, candidate_values, feature_cat_index, ) = self._make_features_label( variable=var, subset_count=self.data_subset[var], random_seed=_draw_random_int32( random_state=self._random_state, size=1 )[0], ) # lightgbm requires integers for label. Categories won't work. if candidate_values.dtype.name == "category": candidate_values = candidate_values.cat.codes is_categorical = var in self.categorical_variables for step in range(optimization_steps): logger.log(str(step), end="") # Make multiple attempts to learn something. non_learners = 0 learning_attempts = 10 while non_learners < learning_attempts: # Pointer and folds need to be re-initialized after every run. 
train_pointer = Dataset( data=candidate_features, label=candidate_values, categorical_feature=feature_cat_index, free_raw_data=False, ) if is_categorical: folds = stratified_categorical_folds( candidate_values, nfold ) else: folds = stratified_continuous_folds(candidate_values, nfold) sampling_point = self._get_random_sample( parameters=parameter_space, random_state=random_state ) try: loss, best_iteration = self._get_oof_performance( parameters=sampling_point.copy(), folds=folds, train_pointer=train_pointer, categorical_feature=feature_cat_index, ) except: loss, best_iteration = np.Inf, 0 if best_iteration > 1: break else: non_learners += 1 if loss < self.optimal_parameter_losses[dataset][var]: del sampling_point["seed"] sampling_point["num_iterations"] = best_iteration self.optimal_parameters[dataset][var] = sampling_point self.optimal_parameter_losses[dataset][var] = loss logger.log(" - ", end="") logger.log("\n", end="") self._ampute_original_data() return ( self.optimal_parameters[dataset], self.optimal_parameter_losses[dataset], )
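# --- Editor's illustrative sketch (not part of the class): tuning parameters ---
# The dict returned by tune_parameters() can be fed straight back into mice()
# through variable_parameters. The data below is hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(13)
df = pd.DataFrame({"x1": rng.normal(size=500), "x2": rng.normal(size=500)})
df.loc[df.sample(frac=0.2, random_state=14).index, "x1"] = np.nan

kernel = mf.ImputationKernel(df, datasets=1, random_state=15)
kernel.mice(1)  # a burn-in iteration before tuning is usually a good idea

optimal_parameters, losses = kernel.tune_parameters(
    dataset=0,
    optimization_steps=3,
    random_state=16,
)
kernel.mice(2, variable_parameters=optimal_parameters)
# --- end sketch ---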
[docs] def impute_new_data( self, new_data: _t_dat, datasets: Optional[List[int]] = None, iterations: Optional[int] = None, save_all_iterations: bool = True, copy_data: bool = True, random_state: _t_random_state = None, random_seed_array: Optional[np.ndarray] = None, verbose: bool = False, ) -> ImputedData: """ Impute a new dataset Uses the models obtained while running MICE to impute new data, without fitting new models. Pulls mean matching candidates from the original data. save_models must be > 0. If save_models == 1, the last model obtained in mice is used for every iteration. If save_models > 1, the model obtained at each iteration is used to impute the new data for that iteration. If specified iterations is greater than the number of iterations run so far using mice, the last model is used for each additional iteration. Type checking is not done. It is up to the user to ensure that the kernel data matches the new data being imputed. Parameters ---------- new_data: pandas DataFrame or numpy ndarray The new data to impute datasets: int or List[int] (default = None) The datasets from the kernel to use to impute the new data. If None, all datasets from the kernel are used. iterations: int The number of iterations to run. If None, the same number of iterations run so far in mice is used. save_all_iterations: bool Should the imputation values of all iterations be archived? If False, only the latest imputation values are saved. copy_data: boolean Should the dataset be referenced directly? This will cause the dataset to be altered in place. If a copy is created, it is saved in self.working_data. There are different ways in which the dataset can be altered: 1) complete_data() will fill in missing values 2) mice() references and manipulates self.working_data directly. random_state: int or np.random.RandomState or None (default=None) The random state of the process. Ensures reproducibility. If None, the random state of the kernel is used. Beware, this permanently alters the random state of the kernel and ensures non-reproduceable results, unless the entire process up to this point is re-run. random_seed_array: None or np.ndarray (int32) .. code-block:: text Record-level seeds. Ensures deterministic imputations at the record level. random_seed_array causes deterministic imputations for each record no matter what dataset each record is imputed with, assuming the same number of iterations and datasets are used. If random_seed_array os passed, random_state must also be passed. Record-level imputations are deterministic if the following conditions are met: 1) The associated seed is the same. 2) The same kernel is used. 3) The same number of iterations are run. 4) The same number of datasets are run. Notes: a) This will slightly slow down the imputation process, because random number generation in numpy can no longer be vectorized. If you don't have a specific need for deterministic imputations at the record level, it is better to keep this parameter as None. b) Using this parameter may change the global numpy seed by calling np.random.seed(). c) Internally, these seeds are hashed each time they are used, in order to obtain different results for each dataset / iteration. verbose: boolean Should information about the process be printed? 
Returns ------- miceforest.ImputedData """ datasets = list(range(self.dataset_count())) if datasets is None else datasets kernel_iterations = self.iteration_count() iterations = kernel_iterations if iterations is None else iterations iter_pairs = self._iter_pairs(iterations) __IND_TIMED_EVENTS = ["prepare_xy", "predict", "mean_matching"] logger = Logger( name=f"ind {str(iter_pairs[0][1])}-{str(iter_pairs[-1][1])}", verbose=verbose, ) if isinstance(self.working_data, pd_DataFrame): assert isinstance(new_data, pd_DataFrame) assert set(self.working_data.columns) == set( new_data.columns ), "Different columns from original dataset." assert all( [ self.working_data[col].dtype == new_data[col].dtype for col in self.working_data.columns ] ), "Column types are not the same as the original data. Check categorical columns." if self.save_models < 1: raise ValueError("No models were saved.") imputed_data = ImputedData( impute_data=new_data, datasets=len(datasets), variable_schema=self.variable_schema.copy(), imputation_order=self.variable_training_order.copy(), train_nonmissing=False, categorical_feature=self.categorical_feature, save_all_iterations=save_all_iterations, copy_data=copy_data, ) ### Manage Randomness. if random_state is None: assert ( random_seed_array is None ), "random_state is also required when using random_seed_array" random_state = self._random_state else: random_state = ensure_rng(random_state) # use_seed_array = random_seed_array is not None random_seed_array = self._initialize_random_seed_array( random_seed_array=random_seed_array, expected_shape=imputed_data.data_shape[0], ) self._initialize_dataset( imputed_data, random_state=random_state, random_seed_array=random_seed_array ) for ds_kern in datasets: logger.log("Dataset " + str(ds_kern)) self.complete_data(dataset=ds_kern, inplace=True) ds_new = datasets.index(ds_kern) imputed_data.complete_data(dataset=ds_new, inplace=True) for iter_abs, iter_rel in iter_pairs: logger.log(str(iter_rel) + " ", end="") # Determine which model iteration to grab if self.save_models == 1 or iter_abs > kernel_iterations: iter_model = kernel_iterations else: iter_model = iter_abs for var in imputed_data.imputation_order: var_name = self._get_var_name_from_scalar(var) logger.log(" | " + var_name, end="") log_context = { "dataset": ds_kern, "variable_name": var_name, "iteration": iter_rel, } nawhere = imputed_data.na_where[var] predictor_variables = self.variable_schema[var] # Select our model. 
current_model = self.get_model( variable=var, dataset=ds_kern, iteration=iter_model ) objective = current_model.params["objective"] model_seed = current_model.params["seed"] # Start building mean matching kwargs mean_match_args = self.mean_match_scheme.get_mean_match_args( objective ) mm_kwargs = {} if "lgb_booster" in mean_match_args: mm_kwargs["lgb_booster"] = current_model # Procure bachelor information if {"bachelor_preds", "bachelor_features"}.intersection( mean_match_args ): logger.set_start_time() bachelor_features = _subset_data( imputed_data.working_data, row_ind=imputed_data.na_where[var], col_ind=predictor_variables, ) logger.record_time(timed_event="prepare_xy", **log_context) if "bachelor_features" in mean_match_args: mm_kwargs["bachelor_features"] = bachelor_features if "bachelor_preds" in mean_match_args: logger.set_start_time() bachelor_preds = self.mean_match_scheme.model_predict( current_model, bachelor_features ) logger.record_time(timed_event="predict", **log_context) mm_kwargs["bachelor_preds"] = bachelor_preds # Procure candidate information if { "candidate_values", "candidate_features", "candidate_preds", }.intersection(mean_match_args): # Need to return candidate features if we need to calculate # candidate_preds or candidate_features is needed by mean matching function calculate_candidate_preds = ( ds_kern, var, iter_model, ) not in self.candidate_preds.keys() and "candidate_preds" in mean_match_args return_features = ( "candidate_features" in mean_match_args ) or calculate_candidate_preds # Set up like this so we only have to subset once logger.set_start_time() if return_features: ( candidate_features, candidate_values, _, ) = self._make_features_label( variable=var, subset_count=self.data_subset[var], random_seed=model_seed, ) else: candidate_values = self._make_label( variable=var, subset_count=self.data_subset[var], random_seed=model_seed, ) logger.record_time(timed_event="prepare_xy", **log_context) if "candidate_values" in mean_match_args: # lightgbm requires integers for label. Categories won't work. if candidate_values.dtype.name == "category": candidate_values = candidate_values.cat.codes mm_kwargs["candidate_values"] = candidate_values if "candidate_features" in mean_match_args: mm_kwargs["candidate_features"] = candidate_features # Calculate the candidate predictions if # the mean matching function calls for it if "candidate_preds" in mean_match_args: if calculate_candidate_preds: logger.set_start_time() candidate_preds = self.mean_match_scheme.model_predict( current_model, candidate_features ) logger.record_time(timed_event="predict", **log_context) else: candidate_preds = self.candidate_preds[ ds_kern, var, iter_model ] mm_kwargs["candidate_preds"] = candidate_preds if "random_state" in mean_match_args: mm_kwargs["random_state"] = random_state if "hashed_seeds" in mean_match_args: if isinstance(random_seed_array, np.ndarray): seeds = random_seed_array[nawhere] rehash_seeds = True else: seeds = None rehash_seeds = False mm_kwargs["hashed_seeds"] = seeds else: rehash_seeds = False logger.set_start_time() imp_values = self.mean_match_scheme._mean_match( var, objective, **mm_kwargs ) logger.record_time(timed_event="mean_matching", **log_context) imputed_data._insert_new_data( dataset=ds_new, variable_index=var, new_data=imp_values ) # Refresh our seeds. 
if rehash_seeds: assert isinstance(random_seed_array, np.ndarray) random_seed_array[nawhere] = hash_int32(seeds) imputed_data.iterations[ ds_new, imputed_data.imputation_order.index(var) ] += 1 logger.log("\n", end="") imputed_data._ampute_original_data() if self.save_loggers: self.loggers.append(logger) return imputed_data
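# --- Editor's illustrative sketch (not part of the class): imputing new data ---
# random_seed_array gives record-level reproducibility; it must be an int32
# array the same length as the new data, and random_state must also be passed.
# Everything below is hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(17)
df = pd.DataFrame({"x1": rng.normal(size=300), "x2": rng.normal(size=300)})
df.loc[df.sample(frac=0.2, random_state=18).index, "x1"] = np.nan

kernel = mf.ImputationKernel(df, datasets=2, random_state=19)
kernel.mice(2)

new_df = df.sample(100, random_state=20).reset_index(drop=True)
record_seeds = np.arange(100, dtype="int32")

new_imputed = kernel.impute_new_data(
    new_df,
    random_state=21,
    random_seed_array=record_seeds,
)
new_completed = new_imputed.complete_data(dataset=0)
# --- end sketch ---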
    def start_logging(self):
        """
        Start saving loggers to self.loggers
        """
        self.save_loggers = True
    def stop_logging(self):
        """
        Stop saving loggers to self.loggers
        """
        self.save_loggers = False
    def save_kernel(
        self, filepath, clevel=None, cname=None, n_threads=None, copy_while_saving=True
    ):
        """
        Compresses and saves the kernel to a file.

        Parameters
        ----------
        filepath: str
            The file to save to.
        clevel: int
            The compression level, sent to clevel argument in blosc.compress()
        cname: str
            The compression algorithm used.
            Sent to cname argument in blosc.compress.
            If None is specified, the default is lz4hc.
        n_threads: int
            The number of threads to use for compression.
            By default, all threads are used.
        copy_while_saving: boolean
            Should the kernel be copied while saving? Copying is safer,
            but may take more memory.
        """
        clevel = 9 if clevel is None else clevel
        cname = "lz4hc" if cname is None else cname
        n_threads = blosc.detect_number_of_cores() if n_threads is None else n_threads

        if copy_while_saving:
            kernel = copy(self)
        else:
            kernel = self

        # convert working data to parquet bytes object
        if kernel.original_data_class == "pd_DataFrame":
            working_data_bytes = BytesIO()
            kernel.working_data.to_parquet(working_data_bytes)
            kernel.working_data = working_data_bytes

        blosc.set_nthreads(n_threads)

        with open(filepath, "wb") as f:
            dill.dump(
                blosc.compress(
                    dill.dumps(kernel),
                    clevel=clevel,
                    typesize=8,
                    shuffle=blosc.NOSHUFFLE,
                    cname=cname,
                ),
                f,
            )
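# --- Editor's illustrative sketch (not part of the class): saving a kernel ---
# save_kernel() writes a blosc-compressed dill pickle. The filepath and the
# compression settings below are arbitrary examples; "zstd" is one of the
# compressors blosc supports.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(22)
df = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
df.loc[df.sample(frac=0.2, random_state=23).index, "x1"] = np.nan

kernel = mf.ImputationKernel(df, datasets=1, random_state=24)
kernel.mice(2)
kernel.save_kernel("kernel.mf", clevel=5, cname="zstd")

# Re-loading: miceforest ships a matching loader; its exact location can vary
# by version (commonly mf.utils.load_kernel or mf.load_kernel), so check your
# installed version before relying on either path:
# kernel2 = mf.utils.load_kernel("kernel.mf")
# --- end sketch ---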
    def get_feature_importance(self, dataset, iteration=None) -> np.ndarray:
        """
        Return a matrix of feature importance. The cells represent the
        normalized feature importance of the columns to impute the rows.
        This is calculated internally by lightgbm.Booster.feature_importance().

        Parameters
        ----------
        dataset: int
            The dataset to get the feature importance for.
        iteration: int
            The iteration to return the feature importance for.
            Right now, the model must be saved to return importance.

        Returns
        -------
        np.ndarray of importance values. Rows are imputed variables, and
        columns are predictor variables.
        """
        if iteration is None:
            iteration = self.iteration_count(datasets=dataset)

        importance_matrix = np.full(
            shape=(len(self.imputation_order), len(self.predictor_vars)),
            fill_value=np.NaN,
        )

        for ivar in self.imputation_order:
            importance_dict = dict(
                zip(
                    self.variable_schema[ivar],
                    self.get_model(dataset, ivar, iteration).feature_importance(),
                )
            )
            for pvar in importance_dict:
                importance_matrix[
                    np.sort(self.imputation_order).tolist().index(ivar),
                    np.sort(self.predictor_vars).tolist().index(pvar),
                ] = importance_dict[pvar]

        return importance_matrix
    def plot_feature_importance(
        self, dataset, normalize: bool = True, iteration: Optional[int] = None, **kw_plot
    ):
        """
        Plot the feature importance. See get_feature_importance()
        for more details.

        Parameters
        ----------
        dataset: int
            The dataset to plot the feature importance for.
        iteration: int
            The iteration to plot the feature importance of.
        normalize: bool
            Should the values be normalized from 0-1?
            If False, values are raw from Booster.feature_importance()
        kw_plot
            Additional arguments sent to sns.heatmap()
        """
        # Move this to .compat at some point.
        try:
            from seaborn import heatmap
        except ImportError:
            raise ImportError("seaborn must be installed to plot importance")

        importance_matrix = self.get_feature_importance(
            dataset=dataset, iteration=iteration
        )
        if normalize:
            importance_matrix = (
                importance_matrix / np.nansum(importance_matrix, 1).reshape(-1, 1)
            ).round(2)

        imputed_var_names = [
            self._get_var_name_from_scalar(int(i))
            for i in np.sort(self.imputation_order)
        ]
        predictor_var_names = [
            self._get_var_name_from_scalar(int(i)) for i in np.sort(self.predictor_vars)
        ]

        params = {
            **{
                "cmap": "coolwarm",
                "annot": True,
                "fmt": ".2f",
                "xticklabels": predictor_var_names,
                "yticklabels": imputed_var_names,
                "annot_kws": {"size": 16},
            },
            **kw_plot,
        }

        print(heatmap(importance_matrix, **params))
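# --- Editor's illustrative sketch (not part of the class): feature importance ---
# get_feature_importance() returns an (imputed variables x predictor variables)
# matrix; plot_feature_importance() draws it with seaborn, which must be
# installed. The data and column names below are hypothetical.
import numpy as np
import pandas as pd
import miceforest as mf

rng = np.random.default_rng(25)
df = pd.DataFrame(
    {
        "x1": rng.normal(size=300),
        "x2": rng.normal(size=300),
        "x3": rng.normal(size=300),
    }
)
for col in ["x1", "x2"]:
    df.loc[df.sample(frac=0.2, random_state=26).index, col] = np.nan

kernel = mf.ImputationKernel(df, datasets=1, random_state=27)
kernel.mice(2)

importance = kernel.get_feature_importance(dataset=0)        # np.ndarray
kernel.plot_feature_importance(dataset=0, annot_kws={"size": 10})
# --- end sketch ---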