Source code for evaluation_metrics

'''This module provides functionality to evaluate the 
performance of a machine learning model.
Each method compares the predicted data to true data.

'''

import numpy as np
from src.helperfunctions import norms


[docs]def evaluate_accuracy(predicted_output, true_output):
    """Calculate the proportion of labels a classifier correctly predicts.

    Parameters
    ----------
    predicted_output : numpy.ndarray
        The predictions made by the classifier.
    true_output : int
        The true labels.

    Returns
    -------
    accuracy : float
        Proportion correctly predicted, between 0 and 1 (inclusive).
    """
    if predicted_output is None:
        return
        
    if predicted_output.ndim >= 2 or true_output.ndim >= 2:
        raise TypeError("Must pass in 1D Numpy array")
    
    if predicted_output.shape[0] != true_output.shape[0]:
        raise TypeError("Predicted and true output must have same size", \
            'Dimensions of predictions: ', predicted_output.shape,\
            'Dimensions of true output: ', true_output.shape)
    
    number_predictions = predicted_output.shape[0]

    if number_predictions == 0:
        return "None made"
        
    correct_predictions = np.count_nonzero(predicted_output == true_output)
    accuracy = correct_predictions / number_predictions
    return accuracy


[docs]def confusion_matrix(number_labels, predicted_output, true_output):
    """Calculate contingency table of predicted labels and true labels.

    If there are L labels, then for 0 <= i, j <= L - 1, the (i, j) entry 
    contains the number of times we predicted j when the true class is i.

    Parameters
    ----------
    number_labels : int
        The number of possible labels in the data.
    predicted_output : numpy.ndarray
        The predictions made by the classifier.
    true_output : int
        The true labels .

    Returns
    -------
    confusion_matrix : numpy.ndarray
        Square [confusion] matrix of size number_labels by number_labels.
    
    Notes
    ------
    The true output and predicted output row vectors are stacked over 
    each other. This 2 by sample_size numpy array is stored as output_combined.
    To identify how often we predict j when the truth is i, we count 
    the number of times the column vector [i, j] appears in output_combined.
    I consulted [1]_ to figure out the lattermost step.

    References
    -----------
    .. [1] https://stackoverflow.com/a/40382459
    """
    if predicted_output is None:
        return
        
    confusion_matrix = np.zeros(shape=(number_labels, number_labels), 
                                dtype=np.int8)
    output_combined = np.stack((true_output, predicted_output), axis=1)  
    for row_index in range(number_labels):  #
        for col_index in range(number_labels):  # j
            count = (output_combined == (row_index, col_index)).\
                all(axis=1).sum()
            confusion_matrix[row_index, col_index] = count

    return confusion_matrix


[docs]def evaluate_regression_error(predicted_output, true_output, 
                              norm=norms.euclidean_2):
    """Calculate the error with respect to a norm of regression output.

    Parameters
    ----------
    predicted_output : numpy.ndarray
        The predictions made by the classifier.
    true_output : int
        The true response values.
    norm : func
        The choice of norm to use to measure error. 
        Default is the Euclidean L_2 norm.

    Returns
    -------
    error : float
        Measurement of error of regression model. (squared norm)
    """

    error = norm(predicted_output - true_output)**2
    return error