# Source code for src.analysis.theory_simulation.calc_normal_splits

"""

A module to calculate the results for the general stump predictor in Subsection
4.2 (Theorem 4.1) of the paper for the dynamic environment of x.

Replacing the bootstrap procedure by a subsampling scheme, we can here calculate
upper bounds for the Variance and the Bias of stump predictors seen in
subsection 4.2 and following the framework developed by :cite:`Buhlmann2002`.

"""
import json
import pickle
import numpy as np
from scipy.stats import norm

from bld.project_paths import project_paths_join as ppj


def bias_normal_splits(c_value, a_value, gamma):
    """Calculate the squared bias for stump predictors as defined in the
    paper in Theorem 4.1.

    Parameters
    ----------
    c_value: int, float
        The gridpoint to be considered.
    a_value: float
        The subsampling fraction.
    gamma: float
        The rate of convergence of the estimator.

    Returns the squared bias.
    """
    # Squared difference between the normal CDF evaluated at the subsampled
    # gridpoint (c * a ** gamma) and at the original gridpoint c.
    bias = (norm.cdf(c_value * a_value ** gamma) - norm.cdf(c_value)) ** 2
    return bias
def variance_normal_splits(c_value, a_value, gamma):
    """Calculate the variance for stump predictors as defined in the paper
    in Theorem 4.1.

    Parameters
    ----------
    c_value: int, float
        The gridpoint to be considered.
    a_value: float
        The subsampling fraction.
    gamma: float
        The rate of convergence of the estimator.

    Returns the variance.
    """
    # Bernoulli-type variance p * (1 - p) of the indicator at the subsampled
    # gridpoint, scaled by the subsampling fraction a.
    cdf_value = norm.cdf(c_value * a_value ** gamma)
    variance = a_value * cdf_value * (1 - cdf_value)
    return variance
def calculate_normal_splits(settings):
    """Calculate the Bias and the Variance for the case of subagging based on
    the calculation settings defined in *settings*.

    settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the calculation set-up that is specific to the
        stump predictor simulation.

    Returns the calculated values as a dictionary.
    """
    output = {}

    # Create the range of c values that we iterate over for each subsampling
    # fraction *a_value* and save it to the output dictionary for plotting.
    c_range = np.linspace(
        settings['c_min'], settings['c_max'], num=settings['c_gridpoints']
    )
    output['c_range'] = c_range

    # Create the array with the a_value range. The formation is chosen this
    # way for plotting reasons.
    # Note that the first a is always one as a reference (unbagged), hence it
    # is not a fraction. This way we can easily adjust the
    # *normal_splits_settings.json* to plot other fractions; the remaining
    # entries are given as [numerator, denominator] pairs.
    a_range = np.array([
        settings['a_array']['first_a'],
        settings['a_array']['second_a'][0] / settings['a_array']['second_a'][1],
        settings['a_array']['third_a'][0] / settings['a_array']['third_a'][1],
        settings['a_array']['fourth_a'][0] / settings['a_array']['fourth_a'][1],
    ])

    # Loop over the subsampling fractions. The calculations follow the
    # derivations in the paper; norm.cdf is vectorized, so the helper
    # functions evaluate the whole c grid at once instead of point by point.
    for i_a, a_value in enumerate(a_range):
        bias_array = bias_normal_splits(c_range, a_value, settings['gamma'])
        var_array = variance_normal_splits(c_range, a_value, settings['gamma'])
        mse_array = np.add(bias_array, var_array)

        # Save the results keyed by the iteration number, since the plotting
        # part follows the same logic.
        output[i_a] = {}
        output[i_a]['bias'] = bias_array
        output[i_a]['variance'] = var_array
        output[i_a]['mse'] = mse_array
    return output
if __name__ == '__main__':
    # Load the calculation settings, run the computation and pickle the
    # resulting dictionary for the plotting step.
    with open(ppj("IN_MODEL_SPECS", "normal_splits_settings.json")) as f:
        NORMAL_SPLITS_SETTINGS_IMPORTED = json.load(f)

    CALCULATE_NORMAL_SPLITS = calculate_normal_splits(
        NORMAL_SPLITS_SETTINGS_IMPORTED)

    with open(ppj("OUT_ANALYSIS_THEORY",
                  "output_normal_splits.pickle"), "wb") as out_file:
        pickle.dump(CALCULATE_NORMAL_SPLITS, out_file)