# Source code for preprocessing

'''This module provides data processing and preparation tools.

    - Scaling/centering data
    - Generating Test/Train splits
    - Generating Cross validation Folds (In Progress)
    - Generating Bootstrap resamples (In Progress)

'''

import numpy as np
from collections import namedtuple


def scale_and_center(features):
    '''Center and scale each column of the feature matrix, in place, so it
    has zero mean and (where possible) unit variance.

    Columns with zero standard deviation are only centered, which avoids
    dividing by zero; they come out as all-zero columns.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix of explanatory variables. Must be 2-D and of a
        floating-point dtype (the result is written back into it).

    Returns
    -------
    features : numpy.ndarray
        The same array, scaled and centered. An empty matrix (zero rows)
        is returned unchanged; the previous version returned None here,
        an inconsistent return type.
    '''
    if features.shape[0] == 0:
        return features
    column_means = features.mean(axis=0)
    column_std_devs = features.std(axis=0)
    # Use a divisor of 1 for constant columns so they are centered only.
    safe_std_devs = np.where(column_std_devs == 0, 1.0, column_std_devs)
    # Broadcasting subtracts/divides per column; assigning through [:]
    # keeps the in-place mutation contract of the original.
    features[:] = (features - column_means) / safe_std_devs
    return features
def train_test_split(features, output, split_proportion=0.25):
    '''Split the data into random training and testing sets.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        The given response variables.
    split_proportion : float, optional
        The proportion of data used for training. Default is 0.25 (25%).
        Must lie between 0 and 1.

    Returns
    -------
    split_values : namedtuple
        Stores the following values:
        ['sample_size', 'train_size', 'test_size', 'train_rows',
        'test_rows', 'train_features', 'test_features', 'train_output',
        'test_output']

    Notes
    -----
    Uses numpy's global random state; seed with ``np.random.seed`` for
    reproducible splits. Row selection technique from [2]_.

    References
    ----------
    .. [2] https://stackoverflow.com/a/14262743
    '''
    sample_size = features.shape[0]
    # Round up so a nonzero proportion always yields at least one
    # training row.
    train_size = np.ceil(sample_size * split_proportion).astype(int)
    train_rows = np.random.choice(sample_size, train_size, replace=False)
    train_features = features[train_rows]
    train_output = output[train_rows]
    test_size = sample_size - train_size
    # Test rows are exactly the rows not chosen for training.
    test_rows = np.setdiff1d(np.arange(sample_size), train_rows)
    test_features = features[test_rows]
    test_output = output[test_rows]
    split_information = ['sample_size', 'train_size', 'test_size',
                         'train_rows', 'test_rows', 'train_features',
                         'test_features', 'train_output', 'test_output']
    TrainTestSplit = namedtuple('TrainTestSplit', split_information)
    split_values = TrainTestSplit(sample_size, train_size, test_size,
                                  train_rows, test_rows, train_features,
                                  test_features, train_output, test_output)
    return split_values
def cross_validation_folds_idx(row_count, fold_count):
    '''Partition the (training) dataset row indices into random folds.

    Parameters
    ----------
    row_count : int
        Sample size of training data we form folds from.
    fold_count : int
        The number of folds to produce; cannot exceed row_count.

    Returns
    -------
    folds : numpy.ndarray
        Each row stores the indices for one fold, with column count equal
        to the fold size (``row_count // fold_count``). When row_count is
        not a multiple of fold_count, the remainder rows are left out of
        every fold.

    Raises
    ------
    AssertionError
        If more folds are requested than there are observations.
    '''
    assert fold_count <= row_count, \
        "There cannot be more folds than the sample size."
    rows_per_fold = row_count // fold_count
    # Indices that have not been assigned to a fold yet; shrinks each pass.
    row_indices = np.arange(row_count)
    # np.intp (platform index type) instead of np.int8: int8 silently
    # wraps for any index above 127, corrupting the folds.
    folds = np.zeros((fold_count, rows_per_fold), dtype=np.intp)
    for fold in range(fold_count):
        fold_rows = np.random.choice(row_indices, size=rows_per_fold,
                                     replace=False)
        row_indices = np.setdiff1d(row_indices, fold_rows)
        folds[fold, :] = fold_rows
    return folds