Source code for src.analysis.main_simulation.calc_simulation_tree_depth
"""
This module simulates how variations in the model complexity, governed by the
tree depth, affect the Bagging Algorithm.
For this we use the ``MonteCarloSimulation`` class described in :ref:`model_code`
in the *simulate_tree_depth()* function, which returns the results as a tuple of
two arrays. When the module is run as a script, these arrays are stored in a
dictionary and written to disk.
The intuition of the simulation setup from :ref:`model_code` and
:ref:`design_choice` also carries over to this module.
"""
import sys
import json
import pickle
import numpy as np
from src.model_code.montecarlosimulation import MonteCarloSimulation
from bld.project_paths import project_paths_join as ppj
def simulate_tree_depth(general_settings, tree_depth_settings, model):
    """
    Simulate variations in tree depth and their effect on the MSPE
    decomposition for the Bagging Algorithm and for the unbagged Tree.

    Parameters
    ----------
    general_settings: Dictionary as described in :ref:`model_specs`
        The dictionary is shared across various simulations and defines the
        overall simulation set-up. The keys read here are 'n_repeat', 'noise',
        'n_test_train', 'random_seeds', 'bagging_ratio' and 'b_iterations'.

    tree_depth_settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the simulation set-up that is specific to the
        tree depth simulation. The keys read here are 'min_split', 'max_split'
        and 'steps_split'.

    model: String that defines the data generating process to be considered.
        The options are 'friedman', 'linear' and 'indicator'. It is usually
        passed as the first system argument.

    Returns
    -------
    A tuple of the simulation results:
        - tuple[0]: numpy array of shape = [min_split_array.size, 4], where
          *min_split_array* is the array of minimal split values we want
          to consider. This is defined by keys in *tree_depth_settings*.
          The array consists of the MSPE decompositions for each of those
          minimal split values for the Bagging Algorithm.
        - tuple[1]: numpy array of shape = [min_split_array.size, 4], where
          *min_split_array* is the array of minimal split values we want
          to consider. This is defined by keys in *tree_depth_settings*.
          The array consists of the MSPE decompositions for each of those
          minimal split values for the **unbagged** Tree.

    """
    # MSE + Variance + Bias + Error = 4
    size_mse_decomp = 4

    # Create the grid of minimal split values.
    # We add the step size to the maximal value so that the maximal value
    # itself is included, and reverse the grid with [::-1] as we want to go
    # from high to low values.
    # NOTE(review): if ('max_split' - 'min_split') is not a multiple of
    # 'steps_split', the largest grid value exceeds 'max_split'.
    step_size = tree_depth_settings["steps_split"]
    min_split_array = np.arange(
        tree_depth_settings['min_split'],
        tree_depth_settings['max_split'] + step_size,
        step_size
    )[::-1]

    # Arrays that hold the MSE, Bias, Variance + Noise for each split
    # specification. They are pre-filled with NaN so that rows which were
    # never written are easy to spot.
    output_shape = (min_split_array.size, size_mse_decomp)
    output_array_bagging = np.full(output_shape, np.nan)
    output_array_tree = np.full(output_shape, np.nan)

    # Create a MonteCarloSimulation instance that defines the attributes for
    # the data generating process and will be constant for the tree and
    # bagging simulation.
    simulation_basis = MonteCarloSimulation(
        n_repeat=general_settings['n_repeat'],
        noise=general_settings['noise'],
        data_process=model,
        n_test_train=general_settings['n_test_train'],
        random_seeds=general_settings['random_seeds']
    )

    # We simulate the MSE for Bagging and Trees for the different splits, while
    # keeping the data generating process constant. For each split the Bagging
    # simulation is run first and the Tree simulation second.
    for index, split in enumerate(min_split_array):
        output_array_bagging[index, :] = simulation_basis.calc_mse(
            ratio=general_settings['bagging_ratio'],
            bootstrap=True,
            min_split_tree=split,
            b_iterations=general_settings["b_iterations"]
        )

        # Note: Subagging(bootstrap=False) with ratio = 1 -> Tree
        # NOTE(review): the ratio is taken from 'bagging_ratio', so this is
        # only the unbagged Tree if that setting equals 1 - confirm against
        # the model specs.
        output_array_tree[index, :] = simulation_basis.calc_mse(
            ratio=general_settings['bagging_ratio'],
            bootstrap=False,
            min_split_tree=split,
            b_iterations=general_settings["b_iterations"]
        )

    return output_array_bagging, output_array_tree
if __name__ == '__main__':
    # The data generating process is selected by the first command line
    # argument.
    DGP_MODEL = sys.argv[1]

    # Load the settings shared across simulations and the settings that are
    # specific to the tree depth simulation.
    with open(ppj("IN_MODEL_SPECS", "general_settings.json")) as general_file:
        GENERAL_SETTINGS_IMPORTED = json.load(general_file)
    with open(ppj("IN_MODEL_SPECS", "tree_depth_settings.json")) as depth_file:
        TREE_DEPTH_SETTINGS_IMPORTED = json.load(depth_file)

    OUTPUT_SIMULATION = simulate_tree_depth(
        GENERAL_SETTINGS_IMPORTED, TREE_DEPTH_SETTINGS_IMPORTED, DGP_MODEL
    )

    # Store the two result arrays under descriptive keys before pickling.
    SIMULATION_TREE_DEPTH = {
        'bagging': OUTPUT_SIMULATION[0],
        'trees': OUTPUT_SIMULATION[1],
    }

    OUTPUT_PATH = ppj(
        "OUT_ANALYSIS_MAIN", "output_tree_depth_{}.pickle".format(DGP_MODEL)
    )
    with open(OUTPUT_PATH, "wb") as out_file:
        pickle.dump(SIMULATION_TREE_DEPTH, out_file)

    print('Done with the {} model for the tree depth simulation'.format(DGP_MODEL))