Source code for posydon.active_learning.psy_cris.regress

"""The PSY-CRIS regression module."""


__authors__ = [
    "Kyle Akira Rocha <kylerocha2024@u.northwestern.edu>",
    "Scott Coughlin <scottcoughlin2014@u.northwestern.edu>",
]


import collections
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from collections import OrderedDict

# -------- regressors --------
from scipy.interpolate import LinearNDInterpolator
from scipy.spatial import QhullError  # scipy.spatial.qhull is private/removed in newer SciPy
from scipy.interpolate import Rbf

import sklearn.gaussian_process as gp
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# -----------------------------


LinearNDInterpolator_names = [
    "linear",
    "lin",
    "linearndinterpolator",
    "linear nd interpolator",
]
RBF_names = ["rbf", "radialbasisfunction", "radial basis function"]
GaussianProcessRegressor_names = ["gp", "gpr", "gaussianprocessregressor"]


def makehash():
    """Manage nested dictionaries."""
    return collections.defaultdict(makehash)
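# A minimal sketch of how `makehash` behaves: arbitrarily deep keys can be
# assigned without first creating the intermediate dictionaries (illustrative
# values only):
#
#     d = makehash()
#     d["RBF"]["class_1"]["output_1"] = "trained object"
#     d["RBF"]["class_1"]["output_1"]  # -> "trained object"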
class Regressor:
    """Perform regression/interpolation with different regression algorithms.

    Regression algorithms are trained by class and by output column in the
    data set and stored as instance variables in nested dictionaries. This
    class includes a 'cross validation' method that trains with the holdout
    method but calculates differences instead of a single accuracy.
    """

    def __init__(self, TableData_object):
        """Initialize the Regressor instance.

        Parameters
        ----------
        TableData_object : instance of <class, TableData>
            An instance of the TableData class.
        """
        self._TableData_ = TableData_object

        holder = self._TableData_.get_regr_data(what_data="full")
        self.input_dict = holder[0]          # _regr_inputs_
        self.output_dict = holder[1]         # _regr_outputs_
        self.regr_dfs_per_class = holder[2]  # _regr_dfs_per_class_

        max_apc_vals = []
        apc_dfs_per_class = self.regr_dfs_per_class.copy()
        for key, val in apc_dfs_per_class.items():
            non_APC_cols = [i for i in val.columns if "APC" not in i]
            if len(non_APC_cols) == len(val.columns):
                continue  # no APC cols to work on
            apc_dfs_per_class[key] = val.drop(columns=non_APC_cols)
            abs_max_val = np.nanmax(abs(apc_dfs_per_class[key].to_numpy()))
            max_apc_vals.append(abs_max_val)
        if len(max_apc_vals) != 0:
            self.abs_max_APC = np.max(max_apc_vals)
        else:
            self.abs_max_APC = None

        self._undefined_p_change_val_ = self._TableData_._return_data_(
            "undefined_p_change_val")

        self._regressors_ = makehash()
        self._cv_regressors_ = makehash()
        self._log_history_ = makehash()
        self._cv_log_history = makehash()

        self.__train_cross_val = False
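    # The APC bookkeeping in __init__ reduces to a nanmax over the absolute
    # values of every column whose name contains "APC". A toy sketch with
    # hypothetical data (not executed here):
    #
    #     df = pd.DataFrame({"APC_x": [0.1, -2.5, np.nan], "other": [1, 2, 3]})
    #     apc = df.drop(columns=[c for c in df.columns if "APC" not in c])
    #     np.nanmax(abs(apc.to_numpy()))  # -> 2.5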
    def train_everything(self, regressor_names, verbose=False):
        """Train all classes and columns with the specified list of regressors.

        Parameters
        ----------
        regressor_names : list
            List of strings specifying all the regressors to train.
        verbose : bool, optional
            Print useful information.

        Returns
        -------
        None
        """
        for regr_name in regressor_names:
            if verbose:
                print("Regressor: {0}".format(regr_name))
            class_keys = list(self.regr_dfs_per_class.keys())
            for class_name in class_keys:
                self.train(regr_name, [class_name], None, verbose=verbose)
        if verbose:
            print("\nDone Regressor train_everything.")
        return None
    def train(self, regressor_name, class_keys, col_keys, di=None,
              verbose=False):
        """Train a regression algorithm.

        Implemented regressors:
            LinearNDInterpolator ('linear', ...)
            Radial Basis Function ('rbf', ...)
            GaussianProcessRegressor ('gp', ...)

        >>> rg = Regressor(TableData_object)
        >>> rg.train('linear', ['class_1'], None,
        ...          di=np.arange(0, Ndatapoints, 5), verbose=True)

        Trained regressor objects are uniquely defined by the algorithm used
        to train, the data set used to train (grouped by class), and finally
        the output column (there could be more than one). This motivates the
        data structure for storing the regressor objects as follows:
            Algorithm -> Class -> Output Column -> Object

        Here is a more realistic example of what it could look like:
            {"RBF": {"class_1": {"output_1": <scipy.interpolate.Rbf object>}}}

        Parameters
        ----------
        regressor_name : string
            Name of regressor to train.
        class_keys : list
            List of class(es) to train on.
        col_keys : list or None
            For a given class, what columns to train on.
            If None, it trains on all columns in one class.
        di : array, optional
            Array indices of data used to train (training on a subset).
            If None (default), train on the whole data set.
        verbose : bool, optional
            Print statements with more information while training.

        Returns
        -------
        None

        Note: You can train multiple classes at once as long as they have
        the same columns specified in col_keys.
        """
        regressor_key = self.get_regressor_name_to_key(regressor_name)

        if col_keys is None:
            first_class_data = self.regr_dfs_per_class[class_keys[0]]
            if isinstance(first_class_data, pd.DataFrame):
                col_keys = np.array(first_class_data.keys())
                if verbose:
                    print("\t Training on all {0} columns in '{1}'...".format(
                        len(col_keys), class_keys[0]))
            else:
                if verbose:
                    print("No regression data for {0}.".format(class_keys[0]))
                return

        if regressor_key == "LinearNDInterpolator":
            regr_holder = self.fit_linear_ND_interpolator(
                class_keys, col_keys, data_interval=di, verbose=verbose)
        elif regressor_key == "RBF":
            regr_holder = self.fit_rbf_interpolator(
                class_keys, col_keys, data_interval=di, verbose=verbose)
        elif regressor_key == "GaussianProcessRegressor":
            regr_holder = self.fit_gaussian_process_regressor(
                class_keys, col_keys, data_interval=di, verbose=verbose)
        else:
            print("No trainers with name {0}".format(regressor_name))
            return

        for class_key, class_dict in regr_holder.items():
            for col_key, interpolated_obj in class_dict.items():
                if verbose:
                    print("\tdict loc: {0}, {1}, {2},".format(
                        regressor_key, class_key, col_key))
                if self.__train_cross_val:
                    self._cv_regressors_[regressor_key][class_key][
                        col_key] = interpolated_obj
                else:
                    self._regressors_[regressor_key][class_key][
                        col_key] = interpolated_obj

        if verbose:
            print("\tEXIT TRAIN\n")
        return None
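    # After training, regressor objects live in the nested mapping
    # Algorithm -> Class -> Output Column. A hedged access sketch (class and
    # column names are placeholders):
    #
    #     rg.train("rbf", ["class_1"], ["output_1"])
    #     interp = rg._regressors_["RBF"]["class_1"]["output_1"]
    #     interp(0.5, 1.2)  # evaluate the trained Rbf at one 2D point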
    def _get_cleaned_regression_data_(self, training_x, training_y,
                                      class_key, col_key):
        """Check for NaNs and user-specified `undefined_p_change_val`.

        Given a set of training data, the output is checked for NaNs and the
        user-specified undefined_p_change_val. All such instances are removed
        before training. Returns the new training input and output data:
        training_x, training_y.

        Parameters
        ----------
        training_x : ndarray
            Input data to clean.
        training_y : array
            Output data to clean.
        class_key : str
            Which class is being cleaned.
        col_key : str
            Which column is being cleaned.

        Returns
        -------
        training_x : ndarray
            Cleaned input data free of undefined values.
        training_y : array
            Cleaned output data free of undefined values.
        """
        if np.sum(np.isnan(training_y)) > 0:
            where_undef = np.where(np.isnan(training_y))[0]
            where_def = np.where(~np.isnan(training_y))[0]
            need_to_clean = True
        elif self._undefined_p_change_val_ in training_y:
            where_undef = np.where(
                self._undefined_p_change_val_ == training_y)[0]
            where_def = np.where(
                self._undefined_p_change_val_ != training_y)[0]
            need_to_clean = True
        else:
            need_to_clean = False

        if need_to_clean:
            print("Not training on {0} value(s) in {1}, {2}.".format(
                len(where_undef), class_key, col_key))
            training_x = training_x[where_def]
            training_y = training_y[where_def]
        return training_x, training_y
    def fit_linear_ND_interpolator(self, class_keys, col_keys,
                                   data_interval=None, verbose=False):
        """Fit a linear ND interpolator.

        Implementation from scipy.interpolate.LinearNDInterpolator
        (https://docs.scipy.org/doc/scipy/reference/interpolate.html).

        Parameters
        ----------
        class_keys : list
            List of classes to train on.
        col_keys : list
            List of columns in the class to train on. If multiple classes are
            given, it is assumed they all contain the supplied columns.
        data_interval : array, optional
            Array indices of data used to train (training on a subset).
            If None (default), train on the whole data set.
        verbose : bool, optional
            Print statements with more information while training.

        Returns
        -------
        regressor_holder : dict
            Ordered by class specific data and then by column.
            Nested dictionary maps to a trained LinearNDInterpolator object.
        """
        if verbose:
            print("--- Fit LinearNDInterpolator ---")
        start_time = time.time()

        regressor_holder = OrderedDict()
        for class_key in class_keys:
            this_class_dict = OrderedDict()  # will hold columns

            # extract the output data associated with class_key
            which_class_data = self.regr_dfs_per_class[class_key]

            for col_key in col_keys:
                if data_interval is None:
                    training_x = self.input_dict[class_key].to_numpy(float)
                    training_y = which_class_data[col_key].to_numpy(float)
                else:
                    di = np.array(data_interval)
                    training_x = self.input_dict[class_key].to_numpy(float)[di]
                    training_y = which_class_data[col_key].to_numpy(float)[di]

                # if any undefined_p_change_val in regression data, remove it
                training_x, training_y = self._get_cleaned_regression_data_(
                    training_x, training_y, class_key, col_key)

                if verbose:
                    print("%s: %s - %.0f training points"
                          % (class_key, col_key, len(training_x)))

                try:
                    line = LinearNDInterpolator(training_x, training_y)
                except QhullError as err:
                    if verbose:
                        print("Error: {}".format(err))
                        print("Skipping LinearNDInterpolator training")
                    line = None

                this_class_dict[col_key] = line
            regressor_holder[class_key] = this_class_dict

        if verbose:
            print("--- Done in {0:.2f} seconds. ---".format(
                time.time() - start_time))
        return regressor_holder
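    # Standalone sketch of the underlying scipy call (independent of this
    # class; illustrative values only):
    #
    #     from scipy.interpolate import LinearNDInterpolator
    #     x = np.random.rand(20, 2)     # 20 points in 2D
    #     y = x[:, 0] + x[:, 1]         # toy output values
    #     f = LinearNDInterpolator(x, y)
    #     f([[0.5, 0.5]])               # -> array([1.]) inside the convex
    #                                   #    hull, nan outside of it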
    def fit_rbf_interpolator(self, class_keys, col_keys,
                             data_interval=None, verbose=False):
        """Fit a radial basis function (RBF) interpolator.

        Implementation from scipy.interpolate.Rbf
        (https://docs.scipy.org/doc/scipy/reference/interpolate.html).

        Parameters
        ----------
        class_keys : list
            List of classes to train on.
        col_keys : list
            List of columns in the class to train on. If multiple classes are
            given, it is assumed they all contain the supplied columns.
        data_interval : array, optional
            Array indices of data used to train (training on a subset).
            If None (default), train on the whole data set.
        verbose : bool, optional
            Print statements with more information while training.

        Returns
        -------
        regressor_holder : dict
            Ordered by class specific data and then by column.
            Nested dictionary maps to a trained RBF object.
        """
        if verbose:
            print("--- Fit RBF ---")
        start_time = time.time()

        regressor_holder = OrderedDict()
        for class_key in class_keys:
            this_class_dict = OrderedDict()  # will hold columns

            # extract the output data associated with class_key
            which_class_data = self.regr_dfs_per_class[class_key]

            for col_key in col_keys:
                if data_interval is None:
                    training_x = self.input_dict[class_key].to_numpy(float)
                    training_y = which_class_data[col_key].to_numpy(float)
                else:
                    di = np.array(data_interval)
                    training_x = self.input_dict[class_key].to_numpy(float)[di]
                    training_y = which_class_data[col_key].to_numpy(float)[di]

                # if any undefined_p_change_val in regression data, remove it
                training_x, training_y = self._get_cleaned_regression_data_(
                    training_x, training_y, class_key, col_key)

                argList = []
                for col in range(len(training_x[0])):
                    argList.append(training_x.T[col])
                argList.append(training_y)

                if verbose:
                    print("%s: %s - %.0f training points"
                          % (class_key, col_key, len(training_x)))

                if len(training_x) <= 1:
                    print("Skipping training... not enough points for Rbf")
                    # Rbf will fail when training with one point,
                    # so we put None here.
                    line = None
                else:
                    line = Rbf(*argList)

                this_class_dict[col_key] = line
            regressor_holder[class_key] = this_class_dict

        if verbose:
            print("--- Done in {0:.2f} seconds. ---".format(
                time.time() - start_time))
        return regressor_holder
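    # The argList unpacking above matches scipy's Rbf signature, which takes
    # one array per input axis followed by the output array. Sketch with toy
    # values:
    #
    #     from scipy.interpolate import Rbf
    #     x = np.random.rand(20, 2)
    #     y = x[:, 0] - x[:, 1]
    #     f = Rbf(x.T[0], x.T[1], y)    # i.e. Rbf(*argList)
    #     f(0.3, 0.7)                   # defined everywhere, unlike the
    #                                   # linear ND interpolator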
    def fit_gaussian_process_regressor(self, class_keys, col_keys,
                                       data_interval=None, verbose=False):
        """Fit a Gaussian process regressor.

        Implementation from sklearn.gaussian_process
        (https://scikit-learn.org/stable/modules/gaussian_process.html).

        Parameters
        ----------
        class_keys : list
            List of classes to train on.
        col_keys : list
            List of columns in the class to train on. If multiple classes are
            given, it is assumed they all contain the supplied columns.
        data_interval : array, optional
            Array indices of data used to train (training on a subset).
            If None (default), train on the whole data set.
        verbose : bool, optional
            Print statements with more information while training.

        Returns
        -------
        regressor_holder : dict
            Ordered by class specific data and then by column.
            Nested dictionary maps to a trained GaussianProcessRegressor
            object.
        """
        if verbose:
            print("--- Fit GaussianProcessRegressor ---")
        start_time = time.time()

        n_restarts = 3
        regressor_holder = OrderedDict()

        for class_key in class_keys:
            this_class_dict = OrderedDict()  # will hold columns

            # extract the output data associated with class_key
            which_class_data = self.regr_dfs_per_class[class_key]

            for col_key in col_keys:
                if data_interval is None:
                    training_x = self.input_dict[class_key].to_numpy(float)
                    training_y = which_class_data[col_key].to_numpy(float)
                else:
                    di = np.array(data_interval)
                    training_x = self.input_dict[class_key].to_numpy(float)[di]
                    training_y = which_class_data[col_key].to_numpy(float)[di]

                # if any undefined_p_change_val in regression data, remove it
                training_x, training_y = self._get_cleaned_regression_data_(
                    training_x, training_y, class_key, col_key)

                if verbose:
                    print("%s: %s - %.0f training points"
                          % (class_key, col_key, len(training_x)))

                num_dim = len(training_x[0])
                starting_loc = [1 for i in range(num_dim)]
                axis_ranges = [(1e-3, 1e3) for i in range(num_dim)]
                # kernel = C(1e3, (1e2, 5e4)) * RBF(
                #     [10, 500, 300.], [(1e0, 1e3), (1e0, 1e3), (1e-1, 5e3)])
                kernel = gp.kernels.RBF(starting_loc, axis_ranges)

                gpr = gp.GaussianProcessRegressor(
                    kernel=kernel, n_restarts_optimizer=n_restarts)

                if verbose:
                    # helpful for kernel things
                    print(" PRE-fit params:\n{0}".format(
                        gpr.kernel.get_params()))

                gpr.fit(training_x, training_y)

                if verbose:
                    print("POST-fit params:\n{0}".format(
                        gpr.kernel_.get_params()))

                this_class_dict[col_key] = gpr
            regressor_holder[class_key] = this_class_dict

        if verbose:
            print("--- Done in {0:.2f} seconds. ---".format(
                time.time() - start_time))
        return regressor_holder
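    # Standalone sketch of the sklearn calls used above, with an anisotropic
    # RBF kernel (one length scale per input dimension; toy values):
    #
    #     import sklearn.gaussian_process as gp
    #     kernel = gp.kernels.RBF([1.0, 1.0], [(1e-3, 1e3), (1e-3, 1e3)])
    #     gpr = gp.GaussianProcessRegressor(kernel=kernel,
    #                                       n_restarts_optimizer=3)
    #     gpr.fit(np.random.rand(20, 2), np.random.rand(20))
    #     mean, std = gpr.predict(np.array([[0.5, 0.5]]), return_std=True)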
    def get_predictions(self, regressor_names, class_keys, col_keys,
                        test_input, return_std=False):
        """Get predictions from trained regressors for a set of inputs.

        Parameters
        ----------
        regressor_names : list
            List of regressor algorithm names to use to predict.
        class_keys : list
            List of classes to get predictions for.
        col_keys : list
            List of columns to get predictions for.
        test_input : ndarray
            Array of input points for which predictions will be found.
        return_std : bool, optional
            If True, return the standard deviation when using
            GaussianProcessRegressor.

        Returns
        -------
        predictions : dict
            Dictionary ordered by algorithm, class, and output column mapping
            to an array of predictions for the test input points.
        """
        predictions = OrderedDict()
        for regr_name in regressor_names:
            regr_key = self.get_regressor_name_to_key(regr_name)
            this_class_dict = OrderedDict()
            for class_key in class_keys:
                these_cols_dict = OrderedDict()
                for col_key in col_keys:
                    # will return None for failed Rbf, otherwise ndarray
                    pred_vals = self._predict(regr_key, class_key, col_key,
                                              test_input,
                                              return_std=return_std)
                    these_cols_dict[col_key] = pred_vals
                this_class_dict[class_key] = these_cols_dict
            predictions[regr_key] = this_class_dict
        return predictions
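    # Hedged usage sketch: the returned dict is keyed by the standardized
    # algorithm key, then class, then column (names are placeholders):
    #
    #     preds = rg.get_predictions(["rbf"], ["class_1"], ["output_1"],
    #                                np.array([[0.5, 1.2]]))
    #     preds["RBF"]["class_1"]["output_1"]  # -> ndarray of predictions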
    def _predict(self, regressor_name, class_key, col_key, test_input,
                 return_std=False):
        """Evaluate the trained regressor at test_input and return predictions.

        If using GaussianProcessRegressor, the std is optionally returned.
        """
        if isinstance(test_input, list):
            test_input = np.array(test_input)
        if test_input.ndim == 1:
            test_input = np.array([test_input])
        if len(test_input) == 0:  # given bad data
            return None

        sigma = None  # default

        # if empty
        if not bool(self._regressors_) and not bool(self._cv_regressors_):
            raise Exception("\n\nNo trained interpolators exist.")

        regressor_key = self.get_regressor_name_to_key(regressor_name)

        if self.__train_cross_val:
            interpolators = self._cv_regressors_[regressor_key]
        else:
            interpolators = self._regressors_[regressor_key]
        interp = interpolators[class_key][col_key]

        if regressor_key == "RBF":
            # When Rbf training fails for small classes, interpolator is None
            if interp is None:
                return None
            argList = []
            for col in range(len(test_input[0])):
                argList.append(test_input.T[col])
            pred = interp(*argList)
        elif regressor_key == "GaussianProcessRegressor":
            if return_std:
                pred, sigma = interp.predict(test_input, return_std=True)
            else:
                pred = interp.predict(test_input)
        elif regressor_key == "LinearNDInterpolator":
            pred = interp(test_input)
        else:
            print("Name not recognized: {0}".format(regressor_name))
            return None

        if return_std:
            return np.array(pred), np.array(sigma)
        else:
            return np.array(pred)
    def get_regressor_name_to_key(self, name):
        """Return the standard key (str) of a regressor."""
        if name.lower() in LinearNDInterpolator_names:
            key = "LinearNDInterpolator"
        elif name.lower() in RBF_names:
            key = "RBF"
        elif name.lower() in GaussianProcessRegressor_names:
            key = "GaussianProcessRegressor"
        else:
            print("No regressor with name '%s'." % name)
            return None
        return key
    def show_structure(self):
        """Show (print) the structure of the regression data."""
        for outer_key, outer_val in self.regr_dfs_per_class.items():
            print("CLASS: {0}".format(outer_key))
            if isinstance(outer_val, pd.DataFrame):
                print("\tCOLS:")
                for mid_key in outer_val.columns:
                    print("\t" + mid_key)
            print("")
        return None
    def get_cross_val_data(self, class_key, col_key, alpha):
        """Randomly sample the data set and separate training and test data.

        Parameters
        ----------
        class_key : str, class_dtype(int or other)
            Class key specifying the class to get data from.
        col_key : str
            Column key specifying the output column to get data from.
        alpha : float
            Fraction of the data set to use for training.
            (0.05 = 5% of the data set)

        Returns
        -------
        cross_val_test_input_data : ndarray
            Input data used to test after training on a subset.
        cross_val_test_output_data : ndarray
            Output data used to test after training on a subset.
        train_rnd_int_vals : array
            Indices of the original data that were used as training points.
        """
        if alpha > 1 or alpha <= 0:
            raise ValueError("Alpha must be in the range (0,1].")

        num_points = int(len(self.input_dict[class_key]) * alpha)

        # draw unique random indices for the training subset
        rnd_int_vals = []
        rnd_int_set = set()
        ct = 0
        while len(rnd_int_vals) < num_points and ct < 1e7:
            rnd_int = int(np.random.random()
                          * len(self.input_dict[class_key]))
            if rnd_int not in rnd_int_set:
                rnd_int_vals.append(rnd_int)
                rnd_int_set.add(rnd_int)
            ct += 1

        train_rnd_int_vals = np.array(sorted(rnd_int_vals))

        # Random training data
        # cross_val_train_input_data = (self.input_dict[class_key].
        #                               to_numpy(float))[train_rnd_int_vals, :]
        # cross_val_train_class_data = (
        #     self.regr_dfs_per_class[class_key][col_key].to_numpy(float)
        # )[train_rnd_int_vals]

        test_int_vals = []
        for i in range(len(self.input_dict[class_key])):
            if i not in train_rnd_int_vals:
                test_int_vals.append(i)

        # The remainder, which will be used to test the fits
        cross_val_test_input_data = (self.input_dict[class_key].
                                     to_numpy(float))[test_int_vals, :]
        cross_val_test_output_data = (
            self.regr_dfs_per_class[class_key][col_key].to_numpy(float)
        )[test_int_vals]

        return (cross_val_test_input_data, cross_val_test_output_data,
                train_rnd_int_vals)
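    # The holdout split above is equivalent (up to the iteration cap) to a
    # numpy sketch like the following, sampling a fraction alpha of the
    # indices for training without replacement:
    #
    #     n = len(self.input_dict[class_key])
    #     train_idx = np.sort(np.random.choice(n, int(n * alpha),
    #                                          replace=False))
    #     test_idx = np.setdiff1d(np.arange(n), train_idx)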
    def cross_validate(self, regressor_name, class_key, col_key, alpha,
                       verbose=False):
        """Our method of cross validation for regression.

        Train on a subset of the data and predict values for the rest.
        Then calculate the difference between the true and predicted values.

        Parameters
        ----------
        regressor_name : str
            Regressor name to use for analysis.
        class_key : str
            Class key to take differences in.
        col_key : str
            Column key to take differences in.
        alpha : float
            Fraction of the data set used to find differences.
        verbose : bool, optional
            Print useful information.

        Returns
        -------
        percent_diffs : array
            Percent difference.
        diffs : array
            Absolute difference.
        """
        (cross_val_test_input, cross_val_test_output,
         train_data_indices) = self.get_cross_val_data(
            class_key, col_key, alpha)

        if verbose:
            print("alpha: %f, num_training_points %.0f"
                  % (alpha, len(train_data_indices)))

        regressor_key = self.get_regressor_name_to_key(regressor_name)

        # Train the regressor
        start_time = time.time()
        try:
            self.__train_cross_val = True
            if regressor_key == "LinearNDInterpolator":
                # if linear - also train an RBF to use where linear
                # predicts nan
                self.train(regressor_key, [class_key], [col_key],
                           di=train_data_indices, verbose=verbose)
                self.train("RBF", [class_key], [col_key],
                           di=train_data_indices, verbose=verbose)
            else:
                self.train(regressor_key, [class_key], [col_key],
                           di=train_data_indices, verbose=verbose)
            time_to_train = time.time() - start_time  # kept for reference

            # Make predictions
            if regressor_key == "LinearNDInterpolator":
                predicted_values_linear = self._predict(
                    regressor_key, class_key, col_key, cross_val_test_input)
                predicted_values_rbf = self._predict(
                    "RBF", class_key, col_key, cross_val_test_input)

                where_nan = np.where(np.isnan(predicted_values_linear))[0]
                if len(where_nan) > 0:
                    print("{0}: {1} nan points out of {2}. Used rbf instead.".
                          format(regressor_key, len(where_nan),
                                 len(predicted_values_linear)))
                    predicted_values_linear[where_nan] = \
                        predicted_values_rbf[where_nan]
                predicted_values = predicted_values_linear
            else:
                predicted_values = self._predict(
                    regressor_key, class_key, col_key, cross_val_test_input)
        except Exception:
            self.__train_cross_val = False
            print("FAILED DURING CROSS VAL PREDICT")
            raise
        self.__train_cross_val = False

        # Calculate the differences
        diffs = predicted_values - cross_val_test_output
        where_zero = np.where(cross_val_test_output == 0)[0]      # 1d array
        where_not_zero = np.where(cross_val_test_output != 0)[0]  # 1d array
        if len(where_zero) > 0:
            percent_diffs = (diffs[where_not_zero]
                             / cross_val_test_output[where_not_zero]) * 100
            print("{0} output(s) with value zero. Omitting for percent "
                  "change calculation.".format(len(where_zero)))
        else:
            percent_diffs = (diffs / cross_val_test_output) * 100

        return percent_diffs, diffs
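    # For each held-out point i, the differences above are simply
    #
    #     diffs[i]         = predicted[i] - true[i]
    #     percent_diffs[i] = 100 * diffs[i] / true[i]
    #
    # with points where true[i] == 0 omitted from the percent calculation.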
    def get_max_APC_val(self, regressor_name, class_key, args):
        """Return the maximum interpolated average percent change for a class.

        For a given class and regression method, return the maximum
        interpolated average percent change (APC) value across all APC
        columns in the class-sorted data set. Helper method for constructing
        target distributions for the Sampler.

        Parameters
        ----------
        regressor_name : str
            Name of regression algorithm to use.
        class_key : str
            Class key to use for data.
        args : array
            Locations at which the APC value is to be predicted.

        Returns
        -------
        max_APC : float
            Maximum average percent change (APC) value.
        which_col_max : int
            Index of the column that had the maximum APC.
        """
        regr_column_names = self.regr_dfs_per_class[class_key].keys()
        # columns with average percent change data
        good_col_keys = [i for i in regr_column_names if "APC" in i]

        # No APC data for this class
        if not good_col_keys:
            return 0, None

        regr_key = self.get_regressor_name_to_key(regressor_name)
        predictions = self.get_predictions([regr_key], [class_key],
                                           good_col_keys, args)
        dict_with_APC_data = predictions[regr_key][class_key]

        max_APC_vals = [i[0] for i in dict_with_APC_data.values()]
        max_APC = np.max(max_APC_vals)
        which_col_max = list(
            dict_with_APC_data.keys())[np.argmax(max_APC_vals)]

        return max_APC, which_col_max
    def mult_diffs(self, regressor_name, class_key, col_keys, alpha, cutoff,
                   verbose=False):
        """Perform multiple calls to cross_validate.

        Parameters
        ----------
        regressor_name : str
            Name of regression algorithm to use.
        class_key : str, class_dtype(int or other)
            Name of class data to use.
        col_keys : str
            Column keys to cross validate on.
        alpha : float
            Fraction of the data set to cross validate on.
        cutoff : float
            Sets the cutoff percentage at which to calculate the fraction
            of the data set above or below.
        verbose : bool, optional
            Print useful diagnostic information.

        Returns
        -------
        p_diffs_holder : ndarray
            Percent differences per column.
        attr_holder : ndarray
            Contains the percentage of points outside the cutoff, the mean,
            and the standard deviation of the percent difference
            calculations.
        """
        # col_keys = self.regr_dfs_per_class[class_key].keys()
        if verbose:
            print("MULT DIFFS:", regressor_name, col_keys)

        p_diffs_holder = []
        for col_key in col_keys:
            p_diffs, diffs = self.cross_validate(
                regressor_name, class_key, col_key, alpha, verbose=verbose)
            where_not_nan = np.where(np.invert(np.isnan(p_diffs)))[0]
            p_diffs_holder.append(p_diffs[where_not_nan])

        attr_holder = []
        for p_diff in p_diffs_holder:
            holder = []
            outside_cutoff = abs(p_diff) >= cutoff * 100
            num_outside = np.sum(outside_cutoff)
            holder.append(num_outside / len(p_diff) * 100)  # percent outside
            holder.append(np.mean(p_diff))                  # mean
            holder.append(np.std(p_diff))                   # standard dev.
            attr_holder.append(holder)

        return np.array(p_diffs_holder), np.array(attr_holder)
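    # For each column, attr_holder stores [percent outside cutoff, mean, std]
    # of the percent differences. A numeric sketch with cutoff = 0.05, i.e. a
    # 5% threshold:
    #
    #     p_diff = np.array([1.0, -7.0, 3.0])
    #     np.sum(abs(p_diff) >= 0.05 * 100) / len(p_diff) * 100  # -> 33.3...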
    def plot_regr_data(self, class_name):
        """Plot all regression data from the chosen class.

        Parameters
        ----------
        class_name : str
            Specify which class data will be plotted.

        Returns
        -------
        matplotlib figure
            Plots with all regression data for the given class.
        """
        data_out = self.regr_dfs_per_class[class_name]
        data_in = self.input_dict[class_name]

        if not isinstance(data_out, pd.DataFrame):
            print("Output for class '{0}': {1} \nNo valid data to plot.".
                  format(class_name, str(data_out)))
            return None

        # note they are still data frames until this point
        key_in = np.array(data_in.columns)
        key_out = np.array(data_out.columns)

        num_x_axis = len(data_in.keys())
        num_y_axis = len(data_out.keys())

        # inches per subplot - these ratios can be changed
        fig_x_ratio = 4 + 1 / 3
        fig_y_ratio = 3 + 1 / 3

        fig, subs = plt.subplots(
            nrows=num_y_axis, ncols=num_x_axis, dpi=100,
            figsize=(fig_x_ratio * num_x_axis, fig_y_ratio * num_y_axis))

        # so that the indexing below works
        if num_y_axis == 1:
            subs = np.array([subs])

        print("Plotting all regression data from class '{0}'. "
              "This could take some time...".format(class_name))

        for i in range(num_x_axis):
            for k in range(num_y_axis):
                data_x = np.array(data_in[key_in[i]]).astype(float)
                data_y = np.array(data_out[key_out[k]]).astype(float)
                subs[k, i].plot(data_x, data_y, ".")
                subs[k, i].set_xlabel(key_in[i])
                subs[k, i].set_ylabel(key_out[k])

        fig.tight_layout()
        return fig
    def get_rnd_test_inputs(self, class_name, N, other_rng={}, verbose=False):
        """Produce randomly sampled 'test' inputs inside the input domain.

        Input data is separated by class.

        Parameters
        ----------
        class_name : str
            Class name to specify which input data you want to look at.
        N : int
            Number of test inputs to return.
        other_rng : dict, optional
            Change the range of random sampling in the desired axis. By
            default, the sampling is done in the range of the training data.
            The axis is specified with an integer key and the value is a
            list specifying the range: {1: [min, max]}.
        verbose : bool, optional
            Print diagnostic information. (default False)

        Returns
        -------
        rnd_test_points : ndarray
            Test points randomly sampled in the range of the training data
            in each axis, unless otherwise specified in 'other_rng'.
            Has the same shape as the input data from TableData.
        """
        num_axis = len(self.input_dict[class_name].values[0])

        # find the max and min in each axis
        a_max = []
        a_min = []
        for i in range(num_axis):
            a_max.append(max(self.input_dict[class_name].values.T[i]))
            a_min.append(min(self.input_dict[class_name].values.T[i]))

        # sample N points between the max & min in each axis
        axis_rnd_points = []
        for i in range(num_axis):
            if i in other_rng:
                b_min, b_max = other_rng[i]
            else:
                b_min, b_max = a_min[i], a_max[i]
            if verbose:
                print("{0} - min: {1}, max: {2}".format(i, b_min, b_max))

            r = np.random.uniform(low=b_min, high=b_max, size=N)
            # this reshape is necessary to concatenate
            axis_rnd_points.append(r[:, np.newaxis])

        # now put the random points back together with the same shape
        # as the input data
        rnd_test_points = np.concatenate(axis_rnd_points, axis=1)
        return rnd_test_points
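    # Hedged usage sketch (placeholder class name): sample 100 test points in
    # the range of the training data, overriding axis 0 to [0, 1]:
    #
    #     pts = rg.get_rnd_test_inputs("class_1", 100, other_rng={0: [0, 1]})
    #     pts.shape  # -> (100, number of input dimensions)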