Source code for logisticreg

'''This module builds a class for logistic regression problems.
We compute the solution by directly maximizing the log-likelihood.
To do, we implement the Newton-Raphson method, following the treatment
in the [1]_.

References
-----------
Hastie, T., Tibshirani, R., & Friedman, J. (2001). The Elements of Statistical Learning. Springer New York Inc..
'''

import numpy as np
from numpy.linalg import inv
from scipy.optimize import minimize
from scipy.optimize.optimize import _minimize_bfgs
from scipy.special import expit
from src.classification.classification import Classification
from src.helperfunctions.evaluation_metrics import evaluate_accuracy, confusion_matrix
from src.helperfunctions.preprocessing import scale_and_center


[docs]class Logistic(Classification): ''' A class used to represent a logistic regression classifier. We only list non-inherited attributes. We use an optimizer from scipy to manually compute the maximum likelihood estimator. Parameters ----------- features : numpy.ndarray Design matrix of explanatory variables. output : numpy.ndarray Labels of data corresponding to feature matrix. split_proportion : float Proportion of data to use for training; between 0 and 1. threshold : float The minimum probability needed to classify a datapoint as a 1. number_labels : int The number of labels present in the data. standardized : bool Whether to center/scale the data (train/test done separately). True by default. This isn't strictly necessary, but it may help the Newton-Raphson algorithm converge. [2]_ Attributes ---------- tolerance : float One of two possible stopping criterion for the Newton-Raphson algorithm. It sets the required change in L_2 norm between successive coefficients. The algorithm will terminate no matter if the max_steps is reached. max_steps : int The maximum number of steps permitted for the Newton-Raphson algorithm. The algorithm terminates earlier if the tolerance is achieved, however, coefficients : numpy.ndarray The coefficients in the logistic regression model. threshold : float The minimum probability needed to classify a datapoint as a 1. train_probs : numpy.ndarray The predicted probabilities each training observation has label 1. train_predictions : numpy.ndarray The classified labels for the training data. test_probs : numpy.ndarray The predicted probabilities each test observation has label 1. test_predictions : numpy.ndarray The labels predicted for the given test data. train_accuracy : float The accuracy of the classifier evaluated on training data. test_accuracy : float The accuracy of the classifier evaluated on test data. train_confusion : float The accuracy of the classifier evaluated on training data. test_confusion : numpy.ndarray A confusion matrix of the classifier evaluated on test data. References ------------ .. [2] https://stats.stackexchange.com/a/113027s ''' def __init__(self, features, output, split_proportion=0.75, threshold=0.5, number_labels=None, standardized=True, tolerance = 0.001, max_steps = 100): if standardized: self.features = scale_and_center(features) # Add column for intercept self.features = np.append(np.ones((features.shape[0], 1)), features, axis=1) super().__init__(self.features, output, split_proportion, standardized=False) self.tolerance = tolerance self.max_steps = max_steps self.steps = None self.coefficients = self.fit() self.threshold = threshold self.train_probs = Logistic.probability_estimate(self.train_features, self.coefficients) self.train_predictions = np.array([1 if self.train_probs[i] > self.threshold else 0 for \ i in range(self.train_size)]) self.test_probs = Logistic.probability_estimate(self.test_features, self.coefficients) self.test_predictions = np.array([1 if self.test_probs[i] > self.threshold else 0 for \ i in range(self.test_size)]) self.train_accuracy = evaluate_accuracy(self.train_predictions, self.train_output) self.test_accuracy = evaluate_accuracy(self.test_predictions, self.test_output) self.train_confusion = confusion_matrix(self.number_labels, self.train_predictions, self.train_output) self.test_confusion = confusion_matrix(self.number_labels, self.test_predictions, self.test_output)
[docs] @staticmethod def probability_estimate(features, coefficients): '''Compute P(y = 1|x, beta) = 1/(1+exp(-beta^Tx)). It can be used for both training and for prediction. Parameters ------------- features : numpy.ndarray A design matrix of observations to condition on. coefficients : numpy.ndarray A vector of possible coefficients Returns ------- estimate : float The estimated probability of an label of 1 for each observation, conditional on the data and coefficient. ''' if features.ndim > 1 and (features.shape[1] != coefficients.shape[0]): raise TypeError("Must pass in coefficients and features of the same dimension") if coefficients.ndim > 1: raise TypeError("Coefficient argument must be a 1D Numpy array.") if features.ndim == 1: n = 1 d = features.shape[0] else: n = features.shape[0] dot_prods = np.broadcast_to(features @ coefficients, n) estimates = np.zeros(n) for i in range(n): if dot_prods[i] > 20: estimates[i] = 1 elif dot_prods[i] < -20: estimates[i] = 0 else: estimates[i] = 1 / (1 + np.exp(-dot_prods[i])) return estimates
[docs] @staticmethod def weighted_matrix(probabilities): '''Compute weight matrix for the weighted least squares problem used in the Newton-Raphson algorithm of solving logistic regression. Parameters ------------- probabilities : numpy.ndarray An array of probabilities from which we compute the weight matrix. Returns ------- wt_matrix : numpy.ndarray Diagonal matrix with ith entries p(1-p), where p = P(y = 1|X = x_i, beta). See Also --------- Logistic.newton_raphson_update : Implement a single step of the Newton-Raphson algorithm. ''' wt_matrix_one = np.diag(probabilities) wt_matrix_two = np.diag(1 - probabilities) # Since diagonal, matrix multiplication is same as entry-wise multiplication wt_matrix = wt_matrix_one @ wt_matrix_two return wt_matrix
[docs] @staticmethod def newton_raphson_update(features, beta_old, output): '''Compute a single step of the Newton-Raphson algorithm to compute the coefficient maximizing the likelihood for the logistic model. This algorithm is a root finder, i.e., finds a solution to f(x)=0 for some function f. In this case, we set f to be the derivative of the log likelihood (i.e., the score function), to find the MLE. Parameters ----------- beta_old : numpy.ndarray The coefficients we would like to update features : numpy.ndarray A design matrix of explanatory variables output : numpy.ndarray The labels corresponding to our observed features Returns -------- beta_new : numpy.ndarray The updated coefficients. ''' X = features probabilities = Logistic.probability_estimate(X, beta_old) W = Logistic.weighted_matrix(probabilities) # Weight matrix y = output probabilities = Logistic.probability_estimate(features, beta_old) beta_new = beta_old + np.linalg.inv(X.T @ W @ X) @ X.T @ (y - probabilities) return beta_new
[docs] def fit(self): ''' Calculate the coefficient estimate on the training data. Returns -------- beta_new : numpy.ndarray The estimated coefficients for logistic regression. ''' beta_init = np.zeros(self.dimension) # Proportion of 1s in the data ratio = np.sum(self.train_output)/self.train_size # A recommended initial starting point for the intercept beta_init[0] = np.log(ratio/(1-ratio)) # Guarantee we always make the first step coeff_change = np.Inf # Keep track of the steps taken steps = 1 while coeff_change > self.tolerance and steps < self.max_steps: beta_new = Logistic.newton_raphson_update(self.train_features, beta_init, self.train_output) coeff_change = np.linalg.norm(beta_new - beta_init) beta_init = beta_new steps += 1 self.steps = steps self.final_coeff_change = coeff_change return beta_new