Source code for logisticreg

'''This module builds a class for logistic regression problems.
We compute the solution by directly maximizing the log-likelihood.
To do, we implement the Newton-Raphson method, following the treatment
in the [1]_.

References
-----------
Hastie, T., Tibshirani, R., & Friedman, J. (2001). The Elements of Statistical Learning. Springer New York Inc..
'''

import numpy as np
from numpy.linalg import inv
from scipy.optimize import minimize
from scipy.optimize.optimize import _minimize_bfgs
from scipy.special import expit
from src.classification.classification import Classification
from src.helperfunctions.evaluation_metrics import evaluate_accuracy, confusion_matrix
from src.helperfunctions.preprocessing import scale_and_center


[docs]class Logistic(Classification):
    '''
    A class used to represent a logistic regression classifier.
    We only list non-inherited attributes. We use an optimizer
    from scipy to manually compute the maximum likelihood estimator.

    Parameters
    -----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        Labels of data corresponding to feature matrix.
    split_proportion : float
        Proportion of data to use for training; between 0 and 1.
    threshold : float
        The minimum probability needed to classify a datapoint as a 1.
    number_labels : int
        The number of labels present in the data.
    standardized : bool
        Whether to center/scale the data (train/test done separately).
        True by default. This isn't strictly necessary, but it may help
        the Newton-Raphson algorithm converge. [2]_

    Attributes
    ----------
    tolerance : float
        One of two possible stopping criterion for the Newton-Raphson algorithm.
        It sets the required change in L_2 norm between successive coefficients.
        The algorithm will terminate no matter if the max_steps is reached.
    max_steps : int
        The maximum number of steps permitted for the Newton-Raphson algorithm.
        The algorithm terminates earlier if the tolerance is achieved, however,
    coefficients : numpy.ndarray
        The coefficients in the logistic regression model.
    threshold : float
        The minimum probability needed to classify a datapoint as a 1.
    train_probs : numpy.ndarray
        The predicted probabilities each training observation has label 1.
    train_predictions : numpy.ndarray
        The classified labels for the training data.
    test_probs : numpy.ndarray
        The predicted probabilities each test observation has label 1.
    test_predictions : numpy.ndarray
        The labels predicted for the given test data.
    train_accuracy : float
        The accuracy of the classifier evaluated on training data.
    test_accuracy : float
        The accuracy of the classifier evaluated on test data.
    train_confusion : float
        The accuracy of the classifier evaluated on training data.
    test_confusion : numpy.ndarray
        A confusion matrix of the classifier evaluated on test data.

    References
    ------------
    .. [2] https://stats.stackexchange.com/a/113027s
    '''

    def __init__(self, features, output, split_proportion=0.75, threshold=0.5, 
                 number_labels=None, standardized=True, tolerance = 0.001,
                 max_steps = 100):
        if standardized:
            self.features = scale_and_center(features)

        # Add column for intercept
        self.features = np.append(np.ones((features.shape[0], 1)),
                                  features,
                                  axis=1)

        super().__init__(self.features, output, split_proportion, 
                         standardized=False)
        self.tolerance = tolerance
        self.max_steps = max_steps
        self.steps = None
        self.coefficients = self.fit()
        self.threshold = threshold
        self.train_probs = Logistic.probability_estimate(self.train_features, 
                                            self.coefficients)
        self.train_predictions = np.array([1 if self.train_probs[i] > self.threshold else 0 for \
            i in range(self.train_size)])

        self.test_probs = Logistic.probability_estimate(self.test_features, 
                                           self.coefficients)
        self.test_predictions = np.array([1 if self.test_probs[i] > self.threshold else 0 for \
            i in range(self.test_size)])
        self.train_accuracy = evaluate_accuracy(self.train_predictions, 
                                                self.train_output)
        self.test_accuracy = evaluate_accuracy(self.test_predictions, 
                                               self.test_output)
        self.train_confusion = confusion_matrix(self.number_labels, 
                                                self.train_predictions,
                                                self.train_output)
        self.test_confusion = confusion_matrix(self.number_labels, 
                                               self.test_predictions, 
                                               self.test_output)
[docs]    @staticmethod
    def probability_estimate(features, coefficients):
        '''Compute P(y = 1|x, beta) = 1/(1+exp(-beta^Tx)).
        It can be used for both training and for prediction.
        
        Parameters
        -------------
        features : numpy.ndarray
            A design matrix of observations to condition on.
        coefficients : numpy.ndarray
            A vector of possible coefficients

        Returns
        -------
        estimate : float
            The estimated probability of an label of 1 for each observation,
            conditional on the data and coefficient.
        '''
        if features.ndim > 1 and (features.shape[1] != coefficients.shape[0]):
            raise TypeError("Must pass in coefficients and features of the same dimension")
            
        if coefficients.ndim > 1:
            raise TypeError("Coefficient argument must be a 1D Numpy array.")

        if features.ndim == 1:
            n = 1
            d = features.shape[0]
        else:
            n = features.shape[0]

        dot_prods = np.broadcast_to(features @ coefficients, n)
        estimates = np.zeros(n)

        for i in range(n):
            if dot_prods[i] > 20:
                estimates[i] = 1
            elif dot_prods[i] < -20:
                estimates[i] = 0
            else:
                estimates[i] = 1 / (1 + np.exp(-dot_prods[i]))
        return estimates

[docs]    @staticmethod
    def weighted_matrix(probabilities):
        '''Compute weight matrix for the weighted least squares problem
        used in the Newton-Raphson algorithm of solving logistic regression.
        
        Parameters
        -------------
        probabilities : numpy.ndarray
            An array of probabilities from which we compute
            the weight matrix.

        Returns
        -------
        wt_matrix : numpy.ndarray
            Diagonal matrix with ith entries p(1-p), where
            p = P(y = 1|X = x_i, beta).

        See Also
        ---------
        Logistic.newton_raphson_update : Implement a single step of the Newton-Raphson algorithm.
        '''

        wt_matrix_one = np.diag(probabilities)
       
        wt_matrix_two = np.diag(1 - probabilities)

        # Since diagonal, matrix multiplication is same as entry-wise multiplication 
        wt_matrix = wt_matrix_one @ wt_matrix_two  
       
        return wt_matrix
    
[docs]    @staticmethod
    def newton_raphson_update(features, beta_old, output):
        '''Compute a single step of the Newton-Raphson algorithm to compute
        the coefficient maximizing the likelihood for the logistic model.

        This algorithm is a root finder, i.e., finds a solution to f(x)=0
        for some function f. In this case, we set f to be the derivative of the
        log likelihood (i.e., the score function), to find the MLE. 

        Parameters
        -----------
        beta_old : numpy.ndarray
            The coefficients we would like to update
        features : numpy.ndarray
            A design matrix of explanatory variables
        output : numpy.ndarray
            The labels corresponding to our observed features

        Returns
        --------
        beta_new : numpy.ndarray
            The updated coefficients.
        '''
        X = features
        probabilities = Logistic.probability_estimate(X, beta_old)
        W = Logistic.weighted_matrix(probabilities)  # Weight matrix
        y = output
        probabilities = Logistic.probability_estimate(features, beta_old)
        beta_new = beta_old + np.linalg.inv(X.T @ W @ X) @ X.T @ (y - probabilities)

        return beta_new

[docs]    def fit(self):
        '''
        Calculate the coefficient estimate on the training data.

        Returns
        --------
        beta_new : numpy.ndarray
            The estimated coefficients for logistic regression.
        '''

        beta_init = np.zeros(self.dimension)

        # Proportion of 1s in the data
        ratio = np.sum(self.train_output)/self.train_size

        # A recommended initial starting point for the intercept
        beta_init[0] = np.log(ratio/(1-ratio))

        # Guarantee we always make the first step
        coeff_change = np.Inf 

        # Keep track of the steps taken
        steps = 1

        while coeff_change > self.tolerance and steps < self.max_steps:
            beta_new = Logistic.newton_raphson_update(self.train_features, 
                                                      beta_init,
                                                      self.train_output)
            coeff_change = np.linalg.norm(beta_new - beta_init)
            beta_init = beta_new
            steps += 1
        self.steps = steps
        self.final_coeff_change = coeff_change
        return beta_new