Source code for poissonreg

'''This module builds a class for Poisson regression problems.
We compute the solution by directly maximizing the log-likelihood.
We use an existing software implementation to globally maximize the 
likelihood function: BFGS, available in 
scipy.optimize.minimize(method = 'BFGS)

'''

import numpy as np
from src.regression.regression import Regression
from src.helperfunctions.evaluation_metrics import evaluate_regression_error
from scipy.optimize import minimize
from scipy.optimize.optimize import _minimize_bfgs
from src.helperfunctions.preprocessing import scale_and_center

[docs]class Poisson(Regression): ''' A class used to represent a Poisson regression model. Parameters ----------- features : numpy.ndarray Design matrix of explanatory variables. output : numpy.ndarray Count data output corresponding to feature matrix. split_proportion : float Proportion of data to use for training; between 0 and 1. standardized : bool Whether to center/scale the data (train/test done separately). True by default. Attributes ---------- coefficients : numpy.ndarray The coefficients in the Poisson regression model. train_predictions : numpy.ndarray The predicted output values for the training data. test_predictions : numpy.ndarray The predicted output values for the test data. train_error : float The error of model on training data (default is MSE). train_error : float The error of model on test data (default is MSE). ''' def __init__(self, features, output, split_proportion=0.75, standardized=True): if standardized: self.features = scale_and_center(features) # Add column for intercept self.features = np.append(np.ones((features.shape[0], 1)), features, axis=1) super().__init__(self.features, output, split_proportion, standardized=False) self.coefficients = self.fit() self.train_predictions = Poisson.predict(self.train_features, self.coefficients) self.test_predictions = Poisson.predict(self.test_features, self.coefficients) self.train_error = evaluate_regression_error(self.train_predictions, self.train_output) self.test_error = evaluate_regression_error(self.test_predictions, self.test_output)
[docs] @staticmethod def loglikelihood(features, counts, coefficient): '''Compute empirical log likelihood for each coefficient. Parameters ---------- features : numpy.ndarray Design matrix of explanatory variables counts : numpy.ndarray The given response values (integers). coefficients : numpy.ndarray Vector of coefficients for Poisson regression. Returns ------- loglikelihood : float The value of the log likelihood with given data and coefficient. ''' coeff_matrix = np.tile(coefficient, (features.shape[0], 1)) dot_prods = np.sum(features * coeff_matrix, axis=1) exp_dot_prods = np.exp(dot_prods) summands = np.multiply(dot_prods, counts) - exp_dot_prods loglikelihood = np.sum(summands) return loglikelihood
[docs] @staticmethod def mle_finder(features, counts): ''' Find the MLE for a Poisson regression model; return the coefficient. Parameters ---------- features : numpy.ndarray Design matrix of explanatory variables counts : numpy.ndarray The given output counts Returns ------- mle : float The coefficient that solves the Poisson regression problem for the given data. Notes ----- We use the BFGS algorithm as implemented in scipy to maximize the log-likelihood. This requires negating the log likelihood as we use minimization. See Also -------- scipy.optimize.minimize ''' def negative_log_likelihood(coefficient): ''' The negative log likelihood function that we minimize. ''' value = -Poisson.loglikelihood(features, counts, coefficient) return value dimension = np.ones(features.shape[1], dtype = np.int8) optimum = _minimize_bfgs(negative_log_likelihood, dimension) mle = optimum.x # Obtain argmin return mle
[docs] def fit(self): ''' Calculate the coefficient estimate on the training data. ''' coefficients = Poisson.mle_finder(self.train_features, self.train_output) return coefficients
[docs] @staticmethod def predict(features, coefficients): '''Compute estimated means of Poisson regression. Parameters ---------- features : numpy.ndarray Design matrix of explanatory variables. coefficients : numpy.ndarray Vector of coefficients for Poisson regression solution. Returns ------- exp_dot_prods : numpy.ndarray Predicted mean of Poisson distribution Notes ------ In a Poisson model, we assume Y is Poisson, and that log E[Y|x] = beta^T x. Here we return our estimate of E[Y|x] for a test data point x. This quantity reflects the mean (and variance of the) count we might expect of the response, conditional on the observed features. ''' coeff_matrix = np.tile(coefficients, (features.shape[0], 1)) dot_prods = np.sum(features * coeff_matrix, axis=1) exp_dot_prods = np.exp(dot_prods) return exp_dot_prods