Source code for clustering

'''
This module builds a a base class for clustering problems, such as k-means. The preprocessing (if applicable)
is done at this class level.

'''

import numpy as np
from src.helperfunctions.preprocessing import train_test_split, scale_and_center


[docs]class Clustering: """ A class used to represent a clustering algorithm. Parameters ----------- features : numpy.ndarray Design matrix of explanatory variables. standardized : bool Whether to center/scale the data. True by default. Attributes ----------- sample_size : int The sample size of all given data (train and test). dimension : int The number of dimensions of the data, or columns of design matrix. Does not include output. """ def __init__(self, features, standardized=True): self.features = features self.sample_size = features.shape[0] self.dimension = self.features.shape[1] if standardized: self.standardize()
[docs] def standardize(self): ''' Separately scale/center the train and test data so each feature (column of observations) has 0 mean and unit variance. ''' self.features= scale_and_center(self.train_features) self.test_features = scale_and_center(self.test_features)