# Source code for proglearn.voters

"""
Main Author: Will LeVine 
Corresponding Email: levinewill@icloud.com
"""
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.validation import (
    check_X_y,
    check_array,
    check_is_fitted,
)
from sklearn.utils.multiclass import check_classification_targets
from .base import BaseClassificationVoter


class TreeClassificationVoter(BaseClassificationVoter):
    """
    A class used to vote on data transformed under a tree, which inherits
    from the BaseClassificationVoter class in base.py.

    Parameters
    ----------
    kappa : float, default=np.inf
        Coefficient for finite sample correction.
        If set to the default, no finite sample correction is performed.

    classes : list, default=None
        List of all possible output label values. ``None`` is treated as
        an empty list (no externally-declared label set).

    Attributes
    ----------
    missing_label_indices_ : list
        A (potentially empty) list of label values that exist in the
        ``classes`` parameter but are missing in the latest ``fit``
        function call.

    uniform_posterior_ : ndarray of shape (n_classes,)
        The uniform posterior returned for inputs whose leaf was never
        observed during ``fit``.
    """

    def __init__(self, kappa=np.inf, classes=None):
        # `classes=None` replaces the former mutable default `classes=[]`
        # (shared-mutable-default pitfall); behavior is identical.
        self.kappa = kappa
        self.classes = np.asarray(classes if classes is not None else [])

    def fit(self, X, y):
        """
        Fits transformed data X given corresponding class labels y.

        Parameters
        ----------
        X : array of shape [n_samples]
            The transformed input data: one leaf identifier per sample.

        y : array of shape [n_samples]
            The class labels.

        Returns
        -------
        self : TreeClassificationVoter
            The object itself.
        """
        check_classification_targets(y)

        # Hoisted: np.unique(y) was previously recomputed inside both loops.
        fit_classes = np.unique(y)
        num_fit_classes = len(fit_classes)

        # Record the indices of declared classes absent from this fit call,
        # so predict_proba can pad zero-probability columns for them.
        self.missing_label_indices_ = []
        if self.classes.size != 0 and num_fit_classes < len(self.classes):
            self.missing_label_indices_ = [
                idx
                for idx, label in enumerate(self.classes)
                if label not in fit_classes
            ]

        # Fallback posterior for leaves not seen during training.
        self.uniform_posterior_ = np.ones(num_fit_classes) / num_fit_classes

        # Empirical class distribution per leaf, with optional finite
        # sample correction.
        self.leaf_to_posterior_ = {}
        for leaf_id in np.unique(X):
            idxs_in_leaf = np.where(X == leaf_id)[0]
            class_counts = [
                np.count_nonzero(y[idxs_in_leaf] == y_val)
                for y_val in fit_classes
            ]
            # nan_to_num guards the (theoretical) 0/0 case of an empty count.
            posteriors = np.nan_to_num(
                np.array(class_counts) / np.sum(class_counts)
            )
            posteriors = self._finite_sample_correction(
                posteriors, len(idxs_in_leaf), self.kappa
            )
            self.leaf_to_posterior_[leaf_id] = posteriors

        return self

    def predict_proba(self, X):
        """
        Returns the posterior probabilities of each class for data X.

        Parameters
        ----------
        X : array of shape [n_samples]
            The transformed input data (leaf identifiers).

        Returns
        -------
        y_proba_hat : ndarray of shape [n_samples, n_classes]
            Posteriors per example.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        check_is_fitted(self)

        # Dict lookup replaces the former `x in list(keys())` linear scan
        # (O(1) instead of O(n_leaves) per example); unknown leaves fall
        # back to the uniform posterior.
        votes_per_example = np.array(
            [
                self.leaf_to_posterior_.get(x, self.uniform_posterior_)
                for x in X
            ]
        )

        # Insert all-zero columns for declared classes unseen during fit,
        # so output columns align with `self.classes`.
        for i in self.missing_label_indices_:
            new_col = np.zeros(votes_per_example.shape[0])
            votes_per_example = np.insert(votes_per_example, i, new_col, axis=1)

        return votes_per_example

    def predict(self, X):
        """
        Returns the predicted class labels for data X.

        Parameters
        ----------
        X : array of shape [n_samples]
            The transformed input data (leaf identifiers).

        Returns
        -------
        y_hat : ndarray of shape [n_samples]
            Predicted class label per example.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]

    def _finite_sample_correction(self, posteriors, num_points_in_partition, kappa):
        """
        Encourage posteriors to approach uniform when there is little data
        in a leaf, through a finite sample correction.

        Parameters
        ----------
        posteriors : ndarray of shape (n_classes,)
            Posterior of each class for a single leaf. Modified in place.

        num_points_in_partition : int
            Number of samples in this particular leaf.

        kappa : float
            Coefficient for finite sample correction. With the default
            ``np.inf``, the correction constant is 0 and zero posteriors
            are left untouched.

        Returns
        -------
        posteriors : ndarray of shape (n_classes,)
            Corrected, re-normalized posterior for the leaf.
        """
        correction_constant = 1 / (kappa * num_points_in_partition)

        # Replace exact zeros with a small mass, then re-normalize.
        zero_posterior_idxs = np.where(posteriors == 0)[0]
        posteriors[zero_posterior_idxs] = correction_constant

        posteriors /= np.sum(posteriors)

        return posteriors
class KNNClassificationVoter(BaseClassificationVoter):
    """
    A class used to vote on data under any transformer outputting data
    in continuous Euclidean space, which inherits from the
    BaseClassificationVoter class in base.py.

    Parameters
    ----------
    k : int, default=None
        Integer indicating number of neighbors to use for each prediction
        during fitting and voting. If ``None``, ``k`` is chosen as
        ``int(log2(n_samples))`` at fit time.

    kwargs : dictionary, default=None
        Contains all keyword arguments for the underlying KNN.
        ``None`` is treated as an empty dictionary.

    classes : list, default=None
        List of all possible output label values. ``None`` is treated as
        an empty list (no externally-declared label set).

    Attributes
    ----------
    missing_label_indices_ : list
        A (potentially empty) list of label values that exist in the
        ``classes`` parameter but are missing in the latest ``fit``
        function call.

    knn_ : sklearn.neighbors.KNeighborsClassifier
        The internal sklearn instance of KNN classifier.
    """

    def __init__(self, k=None, kwargs=None, classes=None):
        # `kwargs=None` / `classes=None` replace the former mutable
        # defaults `{}` / `[]` (shared-mutable-default pitfall);
        # behavior is identical.
        self.k = k
        self.kwargs = kwargs if kwargs is not None else {}
        self.classes = np.asarray(classes if classes is not None else [])

    def fit(self, X, y):
        """
        Fits data X given class labels y.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The transformed data that will be trained on.

        y : array of shape [n_samples]
            The label for class membership of the given data.

        Returns
        -------
        self : KNNClassificationVoter
            The object itself.
        """
        X, y = check_X_y(X, y)

        # Default neighborhood size scales logarithmically with data size.
        k = int(np.log2(len(X))) if self.k is None else self.k

        # n_neighbors must be passed by keyword: positional arguments to
        # sklearn estimators are deprecated/removed in modern scikit-learn.
        self.knn_ = KNeighborsClassifier(n_neighbors=k, **self.kwargs)
        self.knn_.fit(X, y)

        # Hoisted: np.unique(y) was previously recomputed inside the loop.
        fit_classes = np.unique(y)

        # Record the indices of declared classes absent from this fit call,
        # so predict_proba can pad zero-probability columns for them.
        self.missing_label_indices_ = []
        if self.classes.size != 0 and len(fit_classes) < len(self.classes):
            self.missing_label_indices_ = [
                idx
                for idx, label in enumerate(self.classes)
                if label not in fit_classes
            ]

        return self

    def predict_proba(self, X):
        """
        Returns the posterior probabilities of each class for data X.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The transformed input data.

        Returns
        -------
        y_proba_hat : ndarray of shape [n_samples, n_classes]
            Posteriors per example.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        check_is_fitted(self)
        X = check_array(X)

        votes_per_example = self.knn_.predict_proba(X)

        # Insert all-zero columns for declared classes unseen during fit,
        # so output columns align with `self.classes`.
        for i in self.missing_label_indices_:
            new_col = np.zeros(votes_per_example.shape[0])
            votes_per_example = np.insert(votes_per_example, i, new_col, axis=1)

        return votes_per_example

    def predict(self, X):
        """
        Returns the predicted class labels for data X.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The transformed input data.

        Returns
        -------
        y_hat : ndarray of shape [n_samples]
            Predicted class label per example.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        return self.classes[np.argmax(self.predict_proba(X), axis=1)]