# Source code for src.analysis.theory_simulation.calc_finite_sample

"""

A module to calculate the results for the introductory example in subsection 3.2
of the paper without the dynamic environment of x.

Without choosing a dynamic environment for x, the estimator developed by
:cite:`Buhlmann2002` and illustrated in our paper stabilizes by the (weak)
Law of Large Numbers. We simulate this here for a range of sample sizes for a
given mean and variance, assuming that Y follows a Gaussian distribution.

"""
import json
import pickle
import numpy as np

from bld.project_paths import project_paths_join as ppj


def indicator(x_value, y_bar):
    """An indicator function that returns 1 if the threshold *y_bar* is
    smaller than or equal to the x value *x_value*, and 0 otherwise.

    Parameters
    ----------
    x_value: int, float
        The value of x to be considered.
    y_bar: int, float
        The value of y_bar to be considered, i.e. the threshold.

    Returns
    -------
    int
        1 if ``y_bar <= x_value``, else 0.

    """
    # The boolean comparison already is the 0/1 indicator; casting to
    # int replaces the manual if/else branch of the original.
    return int(y_bar <= x_value)
def bagged_indicator(x_value, sample, b_iterations=50, random_state=None):
    """The bagged indicator function as described in subsection 3.2.

    Parameters
    ----------
    x_value: int, float
        The value of x to be considered.
    sample: numpy array of shape = [sample_size]
        The sample on which we bootstrap the mean.
    b_iterations: int, optional (Default=50)
        The number of bootstrap iterations to construct the predictor.
    random_state: numpy.random.RandomState, optional (Default=None)
        Source of randomness for the bootstrap draws.  If None, the
        global numpy random state is used (the original behavior).
        Passing an explicit RandomState makes the bootstrap
        reproducible, consistent with the seeded draws used in
        ``simulate_finite_sample``.

    Returns
    -------
    float
        The value of the bagged predictor, i.e. the mean of the
        bootstrap predictions.

    """
    # Fall back to the global state for backward compatibility.
    rng = np.random if random_state is None else random_state

    predictions = np.full(b_iterations, np.nan)
    for i in range(b_iterations):
        # Draw a new bootstrap sample of the same size, with replacement.
        bootstrap_sample = rng.choice(sample, size=sample.size, replace=True)
        # Predict with the indicator evaluated at the bootstrap mean.
        predictions[i] = indicator(x_value, bootstrap_sample.mean())
    return predictions.mean()
def simulate_finite_sample(settings):
    """Perform the simulation of the MSE for the bagged and unbagged
    predictor for a range of sample sizes, which are specified by the
    *settings* dictionary.

    The procedure is described in greater detail in the Appendix Part
    B.2 of the paper.

    Parameters
    ----------
    settings: Dictionary as described in :ref:`model_specs`
        The dictionary that defines the simulation set-up for the
        finite sample case.  Keys read here: 'x_min', 'x_max',
        'x_gridpoints', 'n_list', 'n_repeat', 'random_seed', 'mu',
        'sigma', 'b_iterations'.

    Returns
    -------
    dict
        Under 'x_range' the grid of x values.  For each sample size n
        in ``settings['n_list']`` a sub-dictionary with the simulated
        MSPE arrays over the x grid under the keys 'mse_bagging' and
        'mse_unbagged'.

    """
    # Dictionary to collect the final results.
    output = {}

    # Grid of x values we want to consider; saved to the output as we
    # want to plot the results later.
    x_range = np.linspace(
        settings['x_min'], settings['x_max'], settings['x_gridpoints']
    )
    output['x_range'] = x_range

    # Iterate over the list of sample sizes.
    for sample_size in settings['n_list']:
        # Arrays to save the results for the given sample size.
        mse_array_bagging = np.full(settings['x_gridpoints'], np.nan)
        mse_array_unbagged = np.full(settings['x_gridpoints'], np.nan)

        # Iterate over the range of x values.
        for i_x, x_value in enumerate(x_range):
            # Arrays to save the squared errors of each repetition.
            y_se_bagged = np.full(settings['n_repeat'], np.nan)
            y_se_unbagged = np.full(settings['n_repeat'], np.nan)

            # Set the random state s.t. for each grid point we draw the
            # same sequence.  A larger explanation why we define
            # RandomStates can be found in the documentation.
            random_state = np.random.RandomState(settings['random_seed'])

            # The true prediction for the given x.
            true_prediction = indicator(x_value, settings['mu'])

            # Simulate the expected MSPE for the given x.
            for i_repeat in range(settings['n_repeat']):
                # Draw a new sample and make a prediction with and
                # without bagging.
                y_sample = random_state.normal(
                    settings['mu'], settings['sigma'], size=sample_size
                )
                prediction_unbagged = indicator(x_value, y_sample.mean())
                prediction_bagged = bagged_indicator(
                    x_value, y_sample,
                    b_iterations=settings['b_iterations']
                )

                # Squared error for the given repetition.
                y_se_bagged[i_repeat] = (
                    (true_prediction - prediction_bagged) ** 2
                )
                y_se_unbagged[i_repeat] = (
                    (true_prediction - prediction_unbagged) ** 2
                )

            # MSPE for bagging and the normal predictor: .mean() is the
            # direct form of sum / n_repeat used originally.
            mse_array_bagging[i_x] = y_se_bagged.mean()
            mse_array_unbagged[i_x] = y_se_unbagged.mean()

        # Save the results of the given sample size.
        output[sample_size] = {
            'mse_bagging': mse_array_bagging,
            'mse_unbagged': mse_array_unbagged,
        }
    return output
if __name__ == '__main__':
    # Read the simulation set-up for the finite sample case from the
    # model-specs JSON file.
    with open(ppj("IN_MODEL_SPECS", "finite_sample_settings.json")) as f:
        FINITE_SAMPLE_SETTINGS_IMPORTED = json.load(f)

    # Run the simulation with the imported settings.
    SIMULATE_FINITE_SAMPLE = simulate_finite_sample(
        FINITE_SAMPLE_SETTINGS_IMPORTED
    )

    # Persist the result dictionary for the downstream plotting step.
    with open(
        ppj("OUT_ANALYSIS_THEORY", "output_finite_sample.pickle"), "wb"
    ) as out_file:
        pickle.dump(SIMULATE_FINITE_SAMPLE, out_file)