# Source code for clustering

'''
This module builds a base class for clustering problems, such as k-means. Any preprocessing
(if applicable) is done at this class level.

'''

import numpy as np
from src.helperfunctions.preprocessing import train_test_split, scale_and_center


class Clustering:
    """
    A base class used to represent a clustering algorithm.

    Parameters
    -----------
    features : numpy.ndarray
        Design matrix of explanatory variables, with one row per
        observation and one column per feature.
    standardized : bool
        Whether to center/scale the data. True by default.

    Attributes
    -----------
    features : numpy.ndarray
        The design matrix; replaced by its centered/scaled version when
        ``standardized`` is True.
    sample_size : int
        The number of observations (rows of the design matrix).
    dimension : int
        The number of dimensions of the data, or columns of design matrix.
        Does not include output.
    """

    def __init__(self, features, standardized=True):
        self.features = features
        self.sample_size = features.shape[0]
        self.dimension = features.shape[1]

        if standardized:
            self.standardize()

    def standardize(self):
        """
        Scale/center the design matrix in place so each feature
        (column of observations) has 0 mean and unit variance.

        The work is delegated to ``scale_and_center``; the result
        replaces ``self.features``.
        """
        # Only ``self.features`` exists on this class: there is no
        # train/test split here, so there is nothing else to scale.
        self.features = scale_and_center(self.features)