Source code for knn_classify

'''This module builds a class for k-nearest neighbor classification.
'''

import numpy as np
from scipy.stats import mode
from src.classification.classification import Classification
from src.helperfunctions.evaluation_metrics import evaluate_accuracy, confusion_matrix, evaluate_regression_error

class KNNClassify(Classification):
    '''A k-nearest neighbor classifier, with optional regression support.

    Only non-inherited attributes are listed below.

    Parameters
    -----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        Labels of data corresponding to feature matrix.
    split_proportion : float
        Proportion of data to use for training; between 0 and 1.
    number_labels : int
        The number of labels present in the data.
    standardized : bool
        Whether to center/scale the data (train/test done separately).
        True by default.
    k : int
        The number of neighbors to use in the algorithm. Must be at least 1.
    classify : bool
        Whether this instance is used for classification or regression.
        True by default. Instances with ``classify == False`` are used for a
        KNNRegression class.

    Attributes
    ----------
    k : int
        The number of neighbors to use in the algorithm.
    test_predictions : numpy.ndarray
        The labels predicted for the given test data (for classification).
    test_accuracy : float
        The accuracy of the classifier evaluated on test data
        (for classification).
    test_confusion : numpy.ndarray
        A confusion matrix of the classifier evaluated on test data
        (for classification).
    test_predictions_reg : numpy.ndarray
        The predicted output on test data (for regression).
    test_error : float
        The test MSE of model fit using training data (for regression).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.

    See Also
    ---------
    knnregression.KNNRegression : Class for a k-nearest neighbor regression model.
    '''

    def __init__(self, features, output, split_proportion=0.75, number_labels=None,
                 standardized=True, k=3, classify=True):
        # Fail early: with k < 1 the neighbor set is empty and every
        # prediction would be meaningless.
        if k < 1:
            raise ValueError('k must be at least 1, got {!r}'.format(k))

        super().__init__(features, output, split_proportion, number_labels, standardized)
        self.k = k

        if classify:
            self.test_predictions = self.predict_class()
            self.test_accuracy = evaluate_accuracy(self.test_predictions, self.test_output)
            self.test_confusion = confusion_matrix(self.number_labels,
                                                   self.test_predictions,
                                                   self.test_output)
        else:
            # predict_value() falls back to the held-out test features when
            # called without arguments.
            self.test_predictions_reg = self.predict_value()
            self.test_error = evaluate_regression_error(self.test_predictions_reg,
                                                        self.test_output)

    def k_neighbors_idx(self, current_location):
        '''Find row indices (in the training data) of the k closest neighbors
        to a given data point.

        Parameters
        -----------
        current_location : numpy.ndarray
            Point we would like to classify, using its neighbors.

        Returns
        --------
        k_nearest_idx : numpy.ndarray
            The k indices of the training observations closest to the
            current point, ordered from nearest to farthest.

        Notes
        ------
        Distances are Euclidean and are computed for all training rows at
        once through numpy broadcasting (see [1]_) rather than with a manual
        nested 'for-loop' procedure.

        References
        ------------
        .. [1] https://sparrow.dev/pairwise-distance-in-numpy/
        '''
        # Flatten so the point broadcasts against every training row.
        point = np.ravel(current_location)
        distances = np.linalg.norm(self.train_features - point, axis=-1)
        k_nearest_idx = np.argsort(distances, axis=0)[:self.k]
        return k_nearest_idx

    def classify_point(self, current_location):
        '''Classify a new datapoint based on its k neighbors.

        Parameters
        -----------
        current_location : numpy.ndarray
            Point we would like to classify, using its neighbors.

        Returns
        --------
        label_mode : scalar
            The predicted label (mode of labels of the k-nearest neighbors).

        Notes
        ------
        When several labels are tied for most common, the smallest label is
        chosen.

        See Also
        ---------
        estimate_point : Find average output value among neighbors instead of
            most common label (for regression).
        '''
        k_nearest_idx = self.k_neighbors_idx(current_location)
        nearest_k_labels = np.ravel(self.train_output[k_nearest_idx])
        # np.unique returns the labels sorted ascending and argmax returns the
        # first maximum, so ties resolve to the smallest label. This always
        # yields a scalar, independent of the installed SciPy version.
        labels, counts = np.unique(nearest_k_labels, return_counts=True)
        label_mode = labels[np.argmax(counts)]
        return label_mode

    def estimate_point(self, current_location):
        '''Estimate (for a regression context) the output at a new datapoint
        based on its k neighbors.

        Parameters
        -----------
        current_location : numpy.ndarray
            Point whose output we would like to estimate, using its neighbors.

        Returns
        --------
        output_estimate : float
            The predicted output value at the current location (mean of the
            outputs of the k-nearest neighbors).

        See Also
        ---------
        classify_point : Find most common label among neighbors instead of
            average output value (for classification).
        '''
        k_nearest_idx = self.k_neighbors_idx(current_location)
        output_estimate = np.mean(self.train_output[k_nearest_idx])
        return output_estimate

    def predict_class(self, test_features=None):
        '''Classify many new datapoints based on their k neighbors.

        Parameters
        -----------
        test_features : numpy.ndarray, optional
            Points we would like to classify, one per row. Defaults to the
            held-out test features of this instance.

        Returns
        --------
        test_labels : numpy.ndarray
            The predicted labels for each test datapoint.

        See Also
        ---------
        predict_value : Predict output value instead of label (for regression).
        '''
        if test_features is None:
            test_features = self.test_features
        # int64 rather than int8, so label values above 127 are stored
        # without overflow.
        test_labels = np.array([self.classify_point(point) for point in test_features],
                               dtype=np.int64)
        return test_labels

    def predict_value(self, test_features=None, k=None):
        '''Estimate the output of many new datapoints based on their k neighbors.

        Parameters
        -----------
        test_features : numpy.ndarray, optional
            Points whose output we would like to estimate, one per row.
            Defaults to the held-out test features of this instance.
        k : int, optional
            Accepted for backward compatibility only and not used; the number
            of neighbors is always taken from ``self.k``.

        Returns
        --------
        test_estimates : numpy.ndarray
            The predicted output for each test datapoint.

        See Also
        ---------
        predict_class : Predict label instead of output value (for classification).
        '''
        if test_features is None:
            test_features = self.test_features
        test_estimates = np.array([self.estimate_point(point) for point in test_features],
                                  dtype=float)
        return test_estimates