# Source code for src.analysis.theory_simulation.calc_normal_splits

"""

A module to calculate the results for the general stump predictor in Subsection
4.2 (Theorem 4.1) of the paper for the dynamic environment of x.

Replacing the bootstrap procedure with a subsampling scheme, we calculate
upper bounds for the variance and the bias of the stump predictors from
Subsection 4.2, following the framework developed by :cite:`Buhlmann2002`.

"""
import json
import pickle
import numpy as np
from scipy.stats import norm

from bld.project_paths import project_paths_join as ppj


def bias_normal_splits(c_value, a_value, gamma):
    """
    Calculate the squared bias for stump predictors as defined in the paper in
    Theorem 4.1.

    The returned value is ``(Phi(c * a ** gamma) - Phi(c)) ** 2``, where
    ``Phi`` denotes the standard normal CDF, ``c`` is *c_value* and ``a`` is
    *a_value*.

    Parameters
    ----------

    c_value: int, float
        The gridpoint to be considered.

    a_value: float
        The subsampling fraction.

    gamma: float
        The rate of convergence of the estimator.

    Returns the squared bias.
    """
    # CDF evaluated at the gridpoint rescaled by the subsampling fraction.
    subsampled_cdf = norm.cdf(c_value * a_value ** gamma)
    # CDF evaluated at the unscaled gridpoint (the reference value).
    reference_cdf = norm.cdf(c_value)
    return (subsampled_cdf - reference_cdf) ** 2


def variance_normal_splits(c_value, a_value, gamma):
    """
    Calculate the variance for stump predictors as defined in the paper in
    Theorem 4.1.

    The returned value is ``a * Phi(c * a ** gamma) * (1 - Phi(c * a ** gamma))``,
    where ``Phi`` denotes the standard normal CDF, ``c`` is *c_value* and
    ``a`` is *a_value*.

    Parameters
    ----------

    c_value: int, float
        The gridpoint to be considered.

    a_value: float
        The subsampling fraction.

    gamma: float
        The rate of convergence of the estimator.

    Returns the variance.
    """
    # The same CDF term enters the product twice, so evaluate it only once.
    subsampled_cdf = norm.cdf(c_value * a_value ** gamma)
    return a_value * subsampled_cdf * (1 - subsampled_cdf)


def calculate_normal_splits(settings):
    """
    Calculate the Bias and the Variance for the case of subagging based on the
    calculation settings defined in *settings*.

    settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the calculation set-up that is specific to the
        stump predictor simulation. The keys read here are ``c_min``,
        ``c_max``, ``c_gridpoints``, ``gamma`` and ``a_array`` (with the
        entries ``first_a``, ``second_a``, ``third_a`` and ``fourth_a``).

    Returns the calculated values as a dictionary. The key ``'c_range'`` holds
    the grid of c values; the integer keys ``0`` to ``3`` (position of the
    subsampling fraction) each hold a dictionary with the arrays ``'bias'``,
    ``'variance'`` and ``'mse'`` evaluated on that grid.

    """
    gamma = settings['gamma']
    output = {}

    # Create the grid of c values that is evaluated for each subsampling
    # fraction and save it to the output dictionary for plotting.
    c_range = np.linspace(
        settings['c_min'],
        settings['c_max'],
        num=settings['c_gridpoints'],
    )
    output['c_range'] = c_range

    # Create the array of subsampling fractions. The first entry is taken as
    # is (by convention it is one, the unbagged reference), whereas the
    # remaining entries are stored as [numerator, denominator] pairs. This
    # way the fractions can easily be adjusted in
    # *normal_splits_settings.json* for plotting.
    a_settings = settings['a_array']
    a_range = np.array(
        [a_settings['first_a']] +
        [
            a_settings[key][0] / a_settings[key][1]
            for key in ('second_a', 'third_a', 'fourth_a')
        ]
    )

    # Loop over the subsampling fractions.
    for i_a, a_value in enumerate(a_range):
        # Both helpers only use arithmetic and ``norm.cdf``, which operate
        # elementwise on arrays, so the whole grid is evaluated in one call.
        bias_array = bias_normal_splits(c_range, a_value, gamma)
        var_array = variance_normal_splits(c_range, a_value, gamma)

        # Save the results to the dictionary. Note that we use the iteration
        # number as the key, since we follow a similar logic in the plotting
        # part.
        output[i_a] = {
            'bias': bias_array,
            'variance': var_array,
            'mse': np.add(bias_array, var_array),
        }
    return output


if __name__ == '__main__':
    # Read the calculation set-up from the model specification file.
    SETTINGS_PATH = ppj("IN_MODEL_SPECS", "normal_splits_settings.json")
    with open(SETTINGS_PATH) as settings_file:
        NORMAL_SPLITS_SETTINGS_IMPORTED = json.load(settings_file)

    # Evaluate bias, variance and MSE for the imported set-up.
    CALCULATE_NORMAL_SPLITS = calculate_normal_splits(
        NORMAL_SPLITS_SETTINGS_IMPORTED
    )

    # Store the results as a pickle file.
    OUTPUT_PATH = ppj("OUT_ANALYSIS_THEORY", "output_normal_splits.pickle")
    with open(OUTPUT_PATH, "wb") as pickle_file:
        pickle.dump(CALCULATE_NORMAL_SPLITS, pickle_file)