# Source code for classification

'''This module defines a base class for classification problems, such as
logistic regression or k-nearest neighbors classification.
Any preprocessing (train/test splitting and optional standardization) is done
at this class level.

'''

import numpy as np
from numpy.lib.shape_base import split
from src.helperfunctions.preprocessing import train_test_split, scale_and_center
from src.helperfunctions.evaluation_metrics import evaluate_accuracy
from src.helperfunctions.exceptions import ClassInstantiationChecks

class Classification:
    """
    A class used to represent a classification algorithm.

    Parameters
    -----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        Labels of data corresponding to feature matrix.
    split_proportion : float
        Proportion of data to use for training; between 0 and 1.
    number_labels : int
        The number of labels present in the data.
    standardized : bool
        Whether to center/scale the data (train/test done separately).
        True by default.

    Attributes
    -----------
    number_labels : int
        The number of labels present in existing and future data.
    sample_size : int
        The sample size of all given data (train and test).
    train_size : int
        The sample size of the training data.
    test_size : int
        The sample size of the test data.
    train_rows : numpy.ndarray
        The list of indices for the train split.
    test_rows : numpy.ndarray
        The list of indices for the test split.
    train_features : numpy.ndarray
        The train design matrix.
    test_features : numpy.ndarray
        The test design matrix.
    train_output : numpy.ndarray
        The train output data.
    test_output : numpy.ndarray
        The test output data.
    dimension : int
        The number of dimensions of the data, or columns of design matrix.
        Does not include output.
    """

    def __init__(self, features, output, split_proportion=0.75,
                 number_labels=None, standardized=True):
        """
        Validate the inputs, split the data into train/test parts, and
        optionally standardize both parts.

        See the class docstring for a description of each parameter.
        """
        # Default procedure is to assume all labels appear in output.
        # If labels are missing in data, specify number_labels manually.
        if number_labels is None:
            self.number_labels = len(np.unique(output))
        else:
            self.number_labels = number_labels

        # The checker is instantiated only for its validation side effects;
        # the instance itself is discarded.
        # NOTE(review): it receives the caller-supplied number_labels (which
        # may be None), not the inferred self.number_labels -- confirm this
        # is what ClassInstantiationChecks expects.
        ClassInstantiationChecks(features, output, split_proportion,
                                 number_labels, standardized)

        # Copy every field of the split result onto the instance so that
        # subclasses can read them directly from self.
        partition = train_test_split(features, output, split_proportion)
        self.sample_size = partition.sample_size
        self.train_size = partition.train_size
        self.test_size = partition.test_size
        self.train_rows = partition.train_rows
        self.test_rows = partition.test_rows
        self.train_features = partition.train_features
        self.test_features = partition.test_features
        self.train_output = partition.train_output
        self.test_output = partition.test_output

        # Number of columns of the (train) design matrix.
        self.dimension = self.train_features.shape[1]

        if standardized:
            self.standardize()

    def standardize(self):
        """
        Separately scale/center the train and test data so each feature
        (column of observations) has 0 mean and unit variance.

        Replaces ``train_features`` and ``test_features`` with the results of
        ``scale_and_center``; each split is passed to the helper on its own.
        """
        self.train_features = scale_and_center(self.train_features)
        self.test_features = scale_and_center(self.test_features)