Source code for poissonreg

'''This module builds a class for Poisson regression problems.
We compute the solution by directly maximizing the log-likelihood.
We use an existing software implementation to globally maximize the 
likelihood function: BFGS, available in 
scipy.optimize.minimize(method = 'BFGS)

'''

import numpy as np
from src.regression.regression import Regression
from src.helperfunctions.evaluation_metrics import evaluate_regression_error
from scipy.optimize import minimize
from scipy.optimize.optimize import _minimize_bfgs
from src.helperfunctions.preprocessing import scale_and_center

[docs]class Poisson(Regression):
    '''
    A class used to represent a Poisson regression model.

    Parameters
    -----------
    features : numpy.ndarray
        Design matrix of explanatory variables.
    output : numpy.ndarray
        Count data output corresponding to feature matrix.
    split_proportion : float
        Proportion of data to use for training; between 0 and 1.
    standardized : bool
        Whether to center/scale the data (train/test done separately).
        True by default.

    Attributes
    ----------
    coefficients : numpy.ndarray
        The coefficients in the Poisson regression model.
    train_predictions : numpy.ndarray
        The predicted output values for the training data.
    test_predictions : numpy.ndarray
        The predicted output values for the test data.
    train_error : float
        The error of model on training data (default is MSE).
    train_error : float
        The error of model on test data (default is MSE).

    '''
    def __init__(self, features, output, split_proportion=0.75,
                 standardized=True):
        if standardized:
            self.features = scale_and_center(features)

        # Add column for intercept
        self.features = np.append(np.ones((features.shape[0], 1)),
                                  features,
                                  axis=1)

        super().__init__(self.features, output, split_proportion, 
                         standardized=False)
        self.coefficients = self.fit()
        self.train_predictions = Poisson.predict(self.train_features, 
                                                self.coefficients)
        self.test_predictions = Poisson.predict(self.test_features, 
                                               self.coefficients)
        self.train_error = evaluate_regression_error(self.train_predictions, 
                                                     self.train_output)
        self.test_error = evaluate_regression_error(self.test_predictions, 
                                                    self.test_output)
    
[docs]    @staticmethod
    def loglikelihood(features, counts, coefficient):
        '''Compute empirical log likelihood for each coefficient.

        Parameters
        ----------
        features : numpy.ndarray
            Design matrix of explanatory variables
        counts : numpy.ndarray
            The given response values (integers).
        coefficients : numpy.ndarray
            Vector of coefficients for Poisson regression.
        
        Returns
        -------
        loglikelihood : float
            The value of the log likelihood with given data and coefficient.
        '''

        coeff_matrix = np.tile(coefficient, (features.shape[0], 1))
        dot_prods = np.sum(features * coeff_matrix, axis=1)
        exp_dot_prods = np.exp(dot_prods)
        summands = np.multiply(dot_prods, counts) - exp_dot_prods
        loglikelihood = np.sum(summands)

        return loglikelihood

[docs]    @staticmethod
    def mle_finder(features, counts):
        '''
        Find the MLE for a Poisson regression model; return the coefficient.

        Parameters
        ----------
        features : numpy.ndarray
            Design matrix of explanatory variables
        counts : numpy.ndarray
            The given output counts

        Returns
        -------
        mle : float
            The coefficient that solves the Poisson regression problem for 
            the given data.

        Notes
        -----
        We use the BFGS algorithm as implemented in scipy to maximize
        the log-likelihood.
        This requires negating the log likelihood as we use minimization.

        See Also
        --------
        scipy.optimize.minimize
        '''

        def negative_log_likelihood(coefficient):
            ''' The negative log likelihood function that we minimize.
            '''
            value = -Poisson.loglikelihood(features, counts, 
                                            coefficient)
            return value
        dimension = np.ones(features.shape[1], dtype = np.int8)
        optimum = _minimize_bfgs(negative_log_likelihood, dimension)
        mle = optimum.x  # Obtain argmin
        return mle

[docs]    def fit(self):
        '''
        Calculate the coefficient estimate on the training data.
        '''

        coefficients = Poisson.mle_finder(self.train_features, 
                                           self.train_output)

        return coefficients
    
[docs]    @staticmethod
    def predict(features, coefficients):
        '''Compute estimated means of Poisson regression.

        Parameters
        ----------
        features : numpy.ndarray
            Design matrix of explanatory variables.
        coefficients : numpy.ndarray
            Vector of coefficients for Poisson regression solution.

        Returns
        -------
        exp_dot_prods : numpy.ndarray
            Predicted mean of Poisson distribution

        Notes
        ------
        In a Poisson model, we assume Y is Poisson, and that
        log E[Y|x] = beta^T x. 

        Here we return our estimate of E[Y|x] for a test data point x. 
        This quantity reflects the mean (and variance of the) count we 
        might expect of the response, conditional on the observed features.

        '''

        coeff_matrix = np.tile(coefficients, (features.shape[0], 1))
        dot_prods = np.sum(features * coeff_matrix, axis=1)
        exp_dot_prods = np.exp(dot_prods)
        
        return exp_dot_prods