Source code for src.analysis.theory_simulation.calc_toy_example

"""

A module to calculate the results for the introductory example in Subsection
3.2 of the paper for the dynamic environment of x.

Given the choice of the appropriate environment of x, the estimator does not
stabilize even asymptotically and we can illustrate the effects of bagging on
it.

"""
import pickle
import json
import numpy as np
from scipy.stats import norm
import scipy.integrate as integrate

from bld.project_paths import project_paths_join as ppj


def convolution_cdf_df(c_value):
    """
    Calculate the convolution as defined by :cite:`Buhlmann2002` and as
    used in the introductory example of our paper for the c.d.f. of the
    standard normal distribution and the standard normal density at the
    gridpoint *c_value* on the real number line.

    The value is the integral over the whole real line of
    ``norm.cdf(c_value - y) * norm.pdf(y)`` with respect to *y*, evaluated
    numerically with ``scipy.integrate.quad``.

    Parameters
    ----------
    c_value: float, int
        The gridpoint to be considered.

    Returns
    -------
    float
        The numerical value of the integral. The error estimate that
        ``quad`` reports alongside it is discarded.

    """
    def integrand(y):
        # Standard normal c.d.f. shifted by the gridpoint, weighted by the
        # standard normal density.
        return norm.cdf(c_value - y) * norm.pdf(y)

    # ``quad`` returns the integral first and its error estimate second;
    # only the integral is of interest here.
    convolution = integrate.quad(integrand, -np.inf, np.inf)[0]
    return convolution


def convolution_cdf_squared_df(c_value):
    """
    Calculate the convolution as defined by :cite:`Buhlmann2002` and as
    used in the introductory example of our paper for the squared c.d.f. of
    the standard normal distribution and the standard normal density at the
    gridpoint *c_value* on the real number line.

    The value is the integral over the whole real line of
    ``norm.cdf(c_value - y) ** 2 * norm.pdf(y)`` with respect to *y*,
    evaluated numerically with ``scipy.integrate.quad``.

    Parameters
    ----------
    c_value: float, int
        The gridpoint to be considered.

    Returns
    -------
    float
        The numerical value of the integral. The error estimate that
        ``quad`` reports alongside it is discarded.

    """
    def integrand(y):
        # Squared, shifted standard normal c.d.f. weighted by the standard
        # normal density.
        return norm.cdf(c_value - y) ** 2 * norm.pdf(y)

    # ``quad`` returns the integral first and its error estimate second;
    # only the integral is of interest here.
    convolution = integrate.quad(integrand, -np.inf, np.inf)[0]
    return convolution


def calculate_bias_bagged(c_value):
    """ Calculate the squared bias for the bagged predictor given the grid point
    *c_value*.

    The squared bias is the squared difference between
    ``convolution_cdf_df(c_value)`` and the standard normal c.d.f. evaluated
    at *c_value*.

    Parameters
    ----------
    c_value: float, int
        The gridpoint to be considered.

    Returns
    -------
    float
        The squared bias of the bagged predictor at *c_value*.

    """
    deviation = convolution_cdf_df(c_value) - norm.cdf(c_value)
    bias_bagged = deviation ** 2
    return bias_bagged


def calculate_var_bagged(c_value):
    """ Calculate the variance for the bagged predictor given the grid point
    *c_value*.

    The variance is ``convolution_cdf_squared_df(c_value)`` minus the square
    of ``convolution_cdf_df(c_value)``.

    Parameters
    ----------
    c_value: float, int
        The gridpoint to be considered.

    Returns
    -------
    float
        The variance of the bagged predictor at *c_value*.

    """
    second_moment = convolution_cdf_squared_df(c_value)
    first_moment = convolution_cdf_df(c_value)
    var_bagged = second_moment - first_moment ** 2

    return var_bagged


def calculate_var_unbagged(c_value):
    """ Calculate the variance for the unbagged predictor given the grid point
    *c_value*.

    With ``p = norm.cdf(c_value)`` the variance is ``p * (1 - p)``.

    Parameters
    ----------
    c_value: float, int
        The gridpoint to be considered.

    Returns
    -------
    float
        The variance of the unbagged predictor at *c_value*.

    """
    # Evaluate the standard normal c.d.f. once and reuse it for both factors.
    cdf_at_c = norm.cdf(c_value)
    var_unbagged = cdf_at_c * (1 - cdf_at_c)
    return var_unbagged


def calculate_toy_example(settings):
    """
    Calculate the Bias and the Variance for the case of bagged and unbagged
    predictor based on the calculation settings defined in *settings*.

    Parameters
    ----------
    settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the calculation set-up that is specific to the
        introductory simulation. The keys ``'c_min'``, ``'c_max'`` and
        ``'c_gridpoints'`` are read to build the grid of *c_value* values.

    Returns
    -------
    dict
        The calculated values with the entries

        * ``'c_range'``: the grid of *c_value* values,
        * ``'bagged'``: dictionary with the arrays ``'variance'`` and
          ``'bias'`` (squared bias) of the bagged predictor,
        * ``'unbagged'``: dictionary with the arrays ``'variance'`` and
          ``'bias'`` of the unbagged predictor.

    """
    n_gridpoints = settings['c_gridpoints']

    # Create grid with *c_value* values that we want to consider.
    c_range = np.linspace(
        settings['c_min'], settings['c_max'], num=n_gridpoints
    )

    # Create the arrays that will be used to save the results. They start
    # out as NaN so that an entry that was never filled is easy to spot.
    bagged_var = np.full(n_gridpoints, np.nan)
    unbagged_var = np.full(n_gridpoints, np.nan)
    bagged_bias = np.full(n_gridpoints, np.nan)

    # Loop over *c_value* values that we want to consider and save the results.
    # Note that the unbagged predictor is unbiased.
    for i_c, c_value in enumerate(c_range):
        bagged_var[i_c] = calculate_var_bagged(c_value)
        unbagged_var[i_c] = calculate_var_unbagged(c_value)
        bagged_bias[i_c] = calculate_bias_bagged(c_value)

    # We save all results to the dictionary *output*.
    output = {
        'c_range': c_range,
        'bagged': {
            'variance': bagged_var,
            'bias': bagged_bias,
        },
        'unbagged': {
            'variance': unbagged_var,
            # For plotting reasons we also save squared bias of the unbagged
            # predictor which is zero as it is unbiased as shown in the paper.
            'bias': np.zeros(n_gridpoints),
        },
    }

    return output


if __name__ == '__main__':
    # Read the calculation settings of the toy example from the model specs.
    SETTINGS_PATH = ppj("IN_MODEL_SPECS", "toy_example_settings.json")
    with open(SETTINGS_PATH) as settings_file:
        TOY_EXAMPLE_SETTINGS_IMPORTED = json.load(settings_file)

    # Run the calculation for the imported settings.
    CALCULATE_TOY_EXAMPLE = calculate_toy_example(
        TOY_EXAMPLE_SETTINGS_IMPORTED
    )

    # Store the results as a pickle file for later use.
    OUTPUT_PATH = ppj("OUT_ANALYSIS_THEORY", "output_toy_example.pickle")
    with open(OUTPUT_PATH, "wb") as out_file:
        pickle.dump(CALCULATE_TOY_EXAMPLE, out_file)