# Source code for preprocessing

'''This module provides data processing and preparation tools.

    - Scaling/centering data
    - Generating Test/Train splits
    - Generating Cross validation Folds (In Progress)
    - Generating Bootstrap resamples (In Progress)

'''

import numpy as np
from collections import namedtuple


def scale_and_center(features):
    '''Center and scale each column of the feature matrix, in place, so it
    has zero mean and (where possible) unit variance.

    Columns with zero standard deviation are only centered, which avoids
    dividing by zero; they come out as all-zero columns.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix of explanatory variables. Must be 2-D and of a
        floating-point dtype (the result is written back into it).

    Returns
    -------
    features : numpy.ndarray
        The same array, scaled and centered. An empty matrix (zero rows)
        is returned unchanged; the previous version returned None here,
        an inconsistent return type.
    '''
    if features.shape[0] == 0:
        return features
    column_means = features.mean(axis=0)
    column_std_devs = features.std(axis=0)
    # Use a divisor of 1 for constant columns so they are centered only.
    safe_std_devs = np.where(column_std_devs == 0, 1.0, column_std_devs)
    # Broadcasting subtracts/divides per column; assigning through [:]
    # keeps the in-place mutation contract of the original.
    features[:] = (features - column_means) / safe_std_devs
    return features
def train_test_split(features, output, split_proportion=0.25):
    '''Split the data into random training and testing sets.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        The given response variables.
    split_proportion : float, optional
        The proportion of data used for training. Default is 0.25 (25%).
        Must lie between 0 and 1.

    Returns
    -------
    split_values : namedtuple
        Stores the following values:
        ['sample_size', 'train_size', 'test_size', 'train_rows',
        'test_rows', 'train_features', 'test_features', 'train_output',
        'test_output']

    Notes
    -----
    Uses numpy's global random state; seed with ``np.random.seed`` for
    reproducible splits. Row selection technique from [2]_.

    References
    ----------
    .. [2] https://stackoverflow.com/a/14262743
    '''
    sample_size = features.shape[0]
    # Round up so a nonzero proportion always yields at least one
    # training row.
    train_size = np.ceil(sample_size * split_proportion).astype(int)
    train_rows = np.random.choice(sample_size, train_size, replace=False)
    train_features = features[train_rows]
    train_output = output[train_rows]
    test_size = sample_size - train_size
    # Test rows are exactly the rows not chosen for training.
    test_rows = np.setdiff1d(np.arange(sample_size), train_rows)
    test_features = features[test_rows]
    test_output = output[test_rows]
    split_information = ['sample_size', 'train_size', 'test_size',
                         'train_rows', 'test_rows', 'train_features',
                         'test_features', 'train_output', 'test_output']
    TrainTestSplit = namedtuple('TrainTestSplit', split_information)
    split_values = TrainTestSplit(sample_size, train_size, test_size,
                                  train_rows, test_rows, train_features,
                                  test_features, train_output, test_output)
    return split_values
def cross_validation_folds_idx(row_count, fold_count):
    '''Partition the (training) dataset row indices into random folds.

    Parameters
    ----------
    row_count : int
        Sample size of training data we form folds from.
    fold_count : int
        The number of folds to produce; cannot exceed row_count.

    Returns
    -------
    folds : numpy.ndarray
        Each row stores the indices for one fold, with column count equal
        to the fold size (``row_count // fold_count``). When row_count is
        not a multiple of fold_count, the remainder rows are left out of
        every fold.

    Raises
    ------
    AssertionError
        If more folds are requested than there are observations.
    '''
    assert fold_count <= row_count, \
        "There cannot be more folds than the sample size."
    rows_per_fold = row_count // fold_count
    # Indices that have not been assigned to a fold yet; shrinks each pass.
    row_indices = np.arange(row_count)
    # np.intp (platform index type) instead of np.int8: int8 silently
    # wraps for any index above 127, corrupting the folds.
    folds = np.zeros((fold_count, rows_per_fold), dtype=np.intp)
    for fold in range(fold_count):
        fold_rows = np.random.choice(row_indices, size=rows_per_fold,
                                     replace=False)
        row_indices = np.setdiff1d(row_indices, fold_rows)
        folds[fold, :] = fold_rows
    return folds