Source code for src.analysis.theory_simulation.calc_finite_sample
"""
A module to calculate the results for the introductory example in subsection 3.2
of the paper without the dynamic environment of x.
Without choosing a dynamic environment for x, the estimator developed by
:cite:`Buhlmann2002` and illustrated in our paper stabilizes by the (weak)
Law of Large Numbers. We simulate this here for a range of sample sizes for a
given mean and variance, assuming that Y follows a Gaussian distribution.
"""
import json
import pickle
import numpy as np
from bld.project_paths import project_paths_join as ppj
[docs]def indicator(x_value, y_bar):
"""
A indicator function that returns 1 if the threshold *y_bar* is smaller
or equal the x value *x_value*.
Parameters
----------
x_value: int, float
The value of x to be considered.
y_bar: int, float
The value of y_bar to be considered, i.e. the threshold.
"""
if y_bar <= x_value:
out = 1
else:
out = 0
return out
[docs]def bagged_indicator(x_value, sample, b_iterations=50):
"""
The bagged indicator function as described in subsection 3.2.
Parameters
----------
x_value: int, float
The value of x to be considered.
sample: numpy array of shape = [sample_size]
The sample on which we bootstrap the mean.
b_iterations: int, optional (Default=50)
The number of bootstrap iterations to construct the predictor.
Returns the value of the bagged predictor.
"""
predictions = np.ones(b_iterations) * np.nan
for i in range(b_iterations):
# Draw a new bootstrap sample.
bootstrap_sample = (
np.random.choice(sample, size=(sample.size,), replace=True)
)
# Calculate the bootstrap prediction.
y_bootstrap = bootstrap_sample.mean()
predictions[i] = indicator(x_value, y_bootstrap)
return predictions.mean()
[docs]def simulate_finite_sample(settings):
"""
Performs the simulation of the MSE for the bagged and unbagged predictor
for a range of sample sizes, which are specified by the *settings*
dictionary. The procedure is described in greater detail in the Appendix
Part B.2 of the paper.
Parameters
----------
settings: Dictionary as described in :ref:`model_specs`
The dictionary that defines the simulation set-up for the finite sample
case.
"""
# Create a dictionary to save the finale results.
output = {}
# Create array with x values we want to consider.
x_range = np.linspace(
settings['x_min'],
settings['x_max'],
settings['x_gridpoints']
)
# Save x_range to dictionary as we want to plot the results later.
output['x_range'] = x_range
# Iterate over the list of sample sizes.
for sample_size in settings['n_list']:
# Create Arrays to save the results for the given sample size.
mse_array_bagging = np.ones(settings['x_gridpoints']) * np.nan
mse_array_unbagged = np.ones(settings['x_gridpoints']) * np.nan
# Iterate over the range of x values.
for i_x, x_value in enumerate(x_range):
# Create Arrays to save the simulated results.
y_se_bagged = np.ones(settings['n_repeat']) * np.nan
y_se_unbagged = np.ones(settings['n_repeat']) * np.nan
# Set random state s.t. for each grid point we draw the same
# sequence. A larger explanation why we define RandomStates
# can be found in the documentation.
random_state = np.random.RandomState(settings['random_seed'])
# Calculate the true prediction for the given x.
true_prediction = indicator(x_value, settings['mu'])
# Simulate the Expected MSPE for given x.
for i_repeat in range(settings['n_repeat']):
# Draw a new sample and make a prediction for bagging and
# without bagging.
y_sample = (
random_state.normal(
settings['mu'],
settings['sigma'],
size=sample_size
)
)
# Make a prediction with the unbagged predictor.
prediction_unbagged = indicator(x_value, y_sample.mean())
# Make a prediction with the bagged predictor.
prediction_bagged = (
bagged_indicator(
x_value,
y_sample,
b_iterations=settings['b_iterations']
)
)
# Calculate the Squared Error for the given repetition.
y_se_bagged[i_repeat] = (
(true_prediction - prediction_bagged) ** 2
)
y_se_unbagged[i_repeat] = (
(true_prediction - prediction_unbagged) ** 2
)
# Calculate the MSPE for bagging and the normal predictor.
mse_array_bagging[i_x] = (
y_se_bagged.sum(axis=0) / settings['n_repeat']
)
mse_array_unbagged[i_x] = (
y_se_unbagged.sum(axis=0) / settings['n_repeat']
)
# Save the results of the given sample size.
output[sample_size] = {}
output[sample_size]['mse_bagging'] = mse_array_bagging
output[sample_size]['mse_unbagged'] = mse_array_unbagged
return output
if __name__ == '__main__':
with open(ppj("IN_MODEL_SPECS", "finite_sample_settings.json")) as f:
FINITE_SAMPLE_SETTINGS_IMPORTED = json.load(f)
SIMULATE_FINITE_SAMPLE = (
simulate_finite_sample(FINITE_SAMPLE_SETTINGS_IMPORTED)
)
with open(ppj("OUT_ANALYSIS_THEORY", "output_finite_sample.pickle"), "wb") as out_file:
pickle.dump(SIMULATE_FINITE_SAMPLE, out_file)