# Source code for regression

'''
This module builds a base class for regression problems, such as least squares
or k-nearest neighbors regressions. The preprocessing (if applicable)
is done at this class level.

'''

import numpy as np
from src.helperfunctions.preprocessing import train_test_split
from src.helperfunctions.preprocessing import scale_and_center


class Regression:
    """
    Base class for regression algorithms.

    On construction the supplied data is partitioned into a train and a
    test subset via ``train_test_split`` and, when requested, each subset
    is standardized independently.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        Labels of data corresponding to the feature matrix.
    split_proportion : float
        Proportion of data to use for training; between 0 and 1.
        Defaults to 0.75.
    standardized : bool
        Whether to center/scale the data (train and test are processed
        separately). True by default.

    Attributes
    ----------
    sample_size : int
        The sample size of all given data (train and test).
    train_size : int
        The sample size of the training data.
    test_size : int
        The sample size of the test data.
    train_rows : numpy.ndarray
        The list of indices for the train split.
    test_rows : numpy.ndarray
        The list of indices for the test split.
    train_features : numpy.ndarray
        The train design matrix.
    test_features : numpy.ndarray
        The test design matrix.
    train_output : numpy.ndarray
        The train output data.
    test_output : numpy.ndarray
        The test output data.
    dimension : int
        The number of dimensions of the data, i.e. columns of the design
        matrix. Does not include the output.
    standardized : bool
        True when the features were standardized during construction,
        False otherwise.
    """

    def __init__(self, features, output, split_proportion=0.75,
                 standardized=True):
        split = train_test_split(features, output, split_proportion)

        # Mirror the fields of the split result onto this instance, in the
        # same order the split exposes them.
        copied_fields = (
            "sample_size",
            "train_size",
            "test_size",
            "train_rows",
            "test_rows",
            "train_features",
            "test_features",
            "train_output",
            "test_output",
        )
        for field_name in copied_fields:
            setattr(self, field_name, getattr(split, field_name))

        # Column count of the (train) design matrix.
        self.dimension = self.train_features.shape[1]

        # The flag is recorded only after standardization has completed.
        if standardized:
            self.standardize()
        self.standardized = bool(standardized)

    def standardize(self):
        """
        Scale and center the train and test features independently.

        Each design matrix is passed through ``scale_and_center`` on its
        own, so that every feature (column of observations) has 0 mean
        and unit variance within its own split. The stored matrices are
        replaced by the transformed ones.
        """
        for matrix_name in ("train_features", "test_features"):
            rescaled = scale_and_center(getattr(self, matrix_name))
            setattr(self, matrix_name, rescaled)