Source code for src.analysis.real_data_simulation.calc_boston

"""
This module simulates the MSPE for bagging and subagging for the Boston Housing
data set. The simulation set-up is the following:

i. For each simulation iteration follow this procedure
    (a) Randomly divide the data set into a training and a test set
    (b) Fit the predictor (Tree, Bagging, Subagging) to the training set
    (c) Using this new predictor make a prediction into the current test set
        and save the predicted values
    (d) Compute the average prediction error of the current test set and save
        the value
ii. Compute the MSPE as the mean of average prediction errors of each iteration

For this we use the ``BaggingTree`` Class described in :ref:`model_code` in the
*simulate_bagging()* and *simulate_subagging()* functions and write the
results as a dictionary.

"""

import pickle
import json
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from src.model_code.baggingtree import BaggingTree
from bld.project_paths import project_paths_join as ppj


[docs]def split_fit_predict_bagging(
        x_matrix,
        y_vector,
        ratio_test,
        random_state,
        bagging_object):
    """
    A  function that splits the data consisting of *x_matrix* and *y_vector*
    into a new test and training sample, fits the Bagging Algorithm on the
    training sample and makes a prediction on the test sample.
    Eventually the MSPE is computed.

    Parameters
    ----------
    x_matrix: numpy-array with shape = [n_size, n_features] (Default: None)
        The covariant matrix *x_matrix* with the sample size n_size and
        n_features of covariants.

    y_vector: numpy-array with shape = [n_size] (Default: None)
        The vector of the dependent variable *y_vector* with the sample size
        n_size.

    ratio_test: float
        The ratio of the data used for the test sample.

    random_state: Numpy RandomState container
        The RandomState instance to perform the train/test split.

    bagging_object: Instance of the class *BaggingTrees*
        The bagging instance used to fit the algorithm to the newly splitted
        data.

    Returns the MSPE for one iteration.
    """
    # Split the sample into train and test using the RandomState instance.
    x_matrix_train, x_matrix_test, y_vector_train, y_vector_test = (
        train_test_split(
            x_matrix,
            y_vector,
            test_size=ratio_test,
            random_state=random_state
        )
    )

    # Fit the Model and make a prediction on the test sample.
    fitted_model = bagging_object.fit(x_matrix_train, y_vector_train)
    predictions = fitted_model.predict(x_matrix_test)

    # Calculate the MSPE.
    y_mse = np.mean((y_vector_test - predictions) ** 2)
    return y_mse


[docs]def simulate_bagging(x_matrix, y_vector, general_settings, boston_settings):
    """
    A  function that simulates the MSPE for the Bagging Algorithm using the
    Boston Housing data.

    Parameters
    ----------
    x_matrix: numpy-array with shape = [n_size, n_features] (Default: None)
        The covariant matrix *x_matrix* with the sample size n_size and
        n_features of covariants.

    y_vector: numpy-array with shape = [n_size] (Default: None)
        The vector of the dependent variable *y_vector* with the sample size
        n_size

    general_settings: Dictionary as described in :ref:`model_specs`
        The dictionary is shared across various simulations and defines the
        overall simulation set-up.

    boston_settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the simulation set-up that is specific to the
        boston simulation.

    Returns the simulated MSPE.

    """
    # Define a RandomState for the train_test_split.
    random_state_split = (
        np.random.RandomState(boston_settings['random_seed_split'])
    )

    # Create an array that will save the simulation results.
    mse_not_expected = np.ones(general_settings['n_repeat']) * np.nan

    # Train the bagged Regression Tree.
    # The random seed can be fixed here.
    bagging_instance = (
        BaggingTree(
            random_seed=boston_settings['random_seed_fit'],
            ratio=general_settings['bagging_ratio'],
            bootstrap=True,
            b_iterations=general_settings['b_iterations'],
            min_split_tree=general_settings['min_split_tree']
        )
    )

    for i_n_repeat in range(general_settings['n_repeat']):
        mse_not_expected[i_n_repeat] = (
            split_fit_predict_bagging(
                x_matrix,
                y_vector,
                ratio_test=boston_settings['ratio_test'],
                random_state=random_state_split,
                bagging_object=bagging_instance
            )
        )

    # Average over all simulation results to get the MSPE.
    mse_sim = np.mean(mse_not_expected)
    return mse_sim


[docs]def simulate_subagging(
        x_matrix,
        y_vector,
        general_settings,
        subagging_settings,
        boston_settings):
    """
    A  function that simulates the MSPE for the Subagging Algorithm over a
    range of subsampling fractions.

    Parameters
    ----------
    x_matrix: numpy-array with shape = [n_size, n_features] (Default: None)
        The covariant matrix *x_matrix* with the sample size n_size and
        n_features of covariants.

    y_vector: numpy-array with shape = [n_size] (Default: None)
        The vector of the dependent variable *y_vector* with the sample size
        n_size

    general_settings: Dictionary as described in :ref:`model_specs`
        The dictionary is shared across various simulations and defines the
        overall simulation set-up.

    subagging_settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the simulation set-up that is specific to
        subagging simulations.

    boston_settings: Dictionary as described in :ref:`model_specs`
        The dictionary defines the simulation set-up that is specific to the
        boston simulation.

    Returns a numpy array with the simulated MSPE for each subsampling fraction.

    """
    # Create an array that will save the MSPE for all ratios.
    mse_subagging = np.ones(subagging_settings['n_ratios']) * np.nan

    # Create the range of ratios we want to consider.
    ratio_range = (
        np.linspace(
            subagging_settings['min_ratio'],
            subagging_settings['max_ratio'],
            subagging_settings['n_ratios']
        )
    )

    # Loop over the different ratios we consider and simulate the MSPE for each ratio.
    for i_a, a_value in enumerate(ratio_range):
        # Define a RandomState for the train_test_split
        # Note:Inside the loop as we want to have a smooth plot -> same
        # samples.
        random_state_split = (
            np.random.RandomState(
                boston_settings['random_seed_split']
            )
        )

        # Train the bagged Regression Tree.
        # The random seed can be fixed here.
        bagging_instance = (
            BaggingTree(
                random_seed=boston_settings['random_seed_fit'],
                ratio=a_value,
                bootstrap=False,
                b_iterations=general_settings['b_iterations'],
                min_split_tree=general_settings['min_split_tree']
            )
        )
        # Create an array that will save the simulation results for one ratio.
        mse_not_expected = np.ones(general_settings['n_repeat']) * np.nan

        for i_n_repeat in range(general_settings['n_repeat']):
            mse_not_expected[i_n_repeat] = (
                split_fit_predict_bagging(
                    x_matrix,
                    y_vector,
                    ratio_test=boston_settings['ratio_test'],
                    random_state=random_state_split,
                    bagging_object=bagging_instance
                )
            )
        # Average over all simulation results for a specific ratio to get the MSPE.
        mse_subagging[i_a] = np.mean(mse_not_expected)
    return mse_subagging


if __name__ == '__main__':
    with open(ppj("IN_MODEL_SPECS", "general_settings.json")) as f:
        GENERAL_SETTINGS_IMPORTED = json.load(f)

    with open(ppj("IN_MODEL_SPECS", "boston_settings.json")) as f:
        BOSTON_SETTINGS_IMPORTED = json.load(f)

    with open(ppj("IN_MODEL_SPECS", "subagging_settings.json")) as f:
        SUBAGGING_SETTINGS_IMPORTED = json.load(f)

    BOSTON_FULL = load_boston()
    BOSTON_X_MATRIX = BOSTON_FULL['data']
    BOSTON_Y_VECTOR = BOSTON_FULL['target']

    MSE_BAGGING = (
        simulate_bagging(
            BOSTON_X_MATRIX,
            BOSTON_Y_VECTOR,
            GENERAL_SETTINGS_IMPORTED,
            BOSTON_SETTINGS_IMPORTED
        )
    )
    MSE_SUBAGGING = (
        simulate_subagging(
            BOSTON_X_MATRIX,
            BOSTON_Y_VECTOR,
            GENERAL_SETTINGS_IMPORTED,
            SUBAGGING_SETTINGS_IMPORTED,
            BOSTON_SETTINGS_IMPORTED
        )
    )

    SIMULATION_BOSTON = {}
    SIMULATION_BOSTON['mse_bagging'] = MSE_BAGGING
    SIMULATION_BOSTON['mse_subagging'] = MSE_SUBAGGING

    with open(ppj("OUT_ANALYSIS_REAL_DATA", "output_boston.pickle"), "wb") as out_file:
        pickle.dump(SIMULATION_BOSTON, out_file)
    print('Done with the real data simulation')