Source code for hfs.hill_climbing

"""
Hill Climbing Feature Selectors.
"""
import math

import numpy as np
from scipy import sparse
from sklearn.utils.validation import check_X_y

from hfs.eagerHierarchicalFeatureSelector import EagerHierarchicalFeatureSelector
from hfs.helpers import compute_aggregated_values, get_leaves, normalize_score
from hfs.metrics import cosine_similarity


[docs]class HillClimbingSelector(EagerHierarchicalFeatureSelector): """Base class for hill climbing feature selection methods proposed by Wang. The feature selection methods are intended for hierarchical data. Therefore, this class inherits from the EagerHierarchicalFeatureSelector. """
[docs] def __init__( self, hierarchy: np.ndarray = None, alpha: float = 0.99, dataset_type: str = "binary", ): """Initializes a HillClimbingSelector. Parameters ---------- hierarchy : np.ndarray The hierarchy graph as an adjacency matrix. alpha: float A hyperparameter needed for the hill climbing methods. The default value is 0.99. dataset_type: string, either "binary" or "numerical" A value indicating if the input dataset contains binary or numerical data. Default is "binary". """ super().__init__(hierarchy) self.alpha = alpha self.dataset_type = dataset_type
[docs] def fit(self, X, y, columns=None): """Fitting function that sets self.representatives\_. Calls the function performing feature selection algorithm. The number of columns in X and the number of nodes in the hierarchy are expected to be the same and each column should be mapped to exactly one node in the hierarchy with the columns parameter. After fitting self.representatives\_ includes the names of all nodes from the hierarchy that are left after feature selection. The features are selected comparing different sets of features with a fitness function. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. y : array-like, shape (n_samples,) The target values. An array of int. columns: list or None, length n_features The mapping from the hierarchy graph's nodes to the columns in X. A list of ints. If this parameter is None the columns in X and the corresponding nodes in the hierarchy are expected to be in the same order. Returns ------- self : object Returns self. """ # Input validation X, y = check_X_y(X, y, accept_sparse=True) super().fit(X, y, columns) if sparse.issparse(X): X = X.tocsr() # Feature Selection Algorithm self.y_ = y self._num_rows = X.shape[0] self.representatives_ = self._hill_climb(X) return self
def _hill_climb(self, X): """Performs the feature selection. This methods needs to be implemented by the subclasses. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. """ raise NotImplementedError def _calculate_scores(self, X): """Calculate scores for each value in X. To calculate the scores the values X are summed up with the feature's children's values. If the dataset is of the type "numerical" the sums are normalized. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. Returns ------- score_matrix : numpy.ndarray, shape (n_samples, n_features) The scores calculated for each value in X. """ score_matrix = compute_aggregated_values( X.copy(), self._hierarchy, self._columns ) if self.dataset_type == "numerical": normalized_matrix = np.zeros_like(score_matrix, dtype=float) for row_index in range(self._num_rows): for column_index in range(self.n_features_): score = score_matrix[row_index, column_index] normalized_matrix[row_index, column_index] = normalize_score( score, max(score_matrix[row_index, :]) ) score_matrix = normalized_matrix return score_matrix def _compare( self, sample_i: int, sample_j: int, feature_set: list[int], ): """Compare to samples from the dataset. This method needs to be implemented by the subclasses. It should return a score that can be used for comparison. Parameters ---------- sample_i : int A row index for a sample from the dataset. sample_j : int A row index for another sample from the dataset. feature_set: list A list of nodes that are in the feature set that is currently being evaluated. """ raise NotImplementedError def _comparison_matrix(self, feature_set: list[int]): """Creates matrix to compare the individual samples from the dataset. Parameters ---------- feature_set : list A list of nodes that are in the feature set that is currently being evaluated. Returns ------- distances : numpy.ndarray, shape (num_rows, num_rows) An matrix with distances or similarities to compare all rows from the dataset with eachother. """ distances = np.zeros((self._num_rows, self._num_rows), dtype=float) for row_i in range(self._num_rows): for row_j in range(self._num_rows): if distances[row_i, row_j] == 0: distance = self._compare(row_i, row_j, feature_set) distances[row_i, row_j] = distance distances[row_j, row_i] = distance return distances def _fitness_function(self, comparison_matrix: np.ndarray) -> float: raise NotImplementedError
[docs]class TopDownSelector(HillClimbingSelector): """Hill climbing top down feature selection method. This feature selection method was proposed by Wang et al. in 2002. The features are selected by going through the feature graph from top to bottom, replacing parent nodes with their children and evaluating the resulting feature set with a fitness function. The method is intended for hierarchical data. Therefore, it inherits from the EagerHierarchicalFeatureSelector. """
[docs] def __init__( self, hierarchy: np.ndarray = None, alpha: float = 0.99, dataset_type: str = "binary", ): """Initializes a TopDownSelector. Parameters ---------- hierarchy : np.ndarray The hierarchy graph as an adjacency matrix. alpha: float A hyperparameter needed for the hill climbing methods. The default value is 0.99. dataset_type: string, either "binary" or "numerical" A value indicating if the input dataset contains binary or numerical data. Default is "binary". """ super().__init__(hierarchy, alpha=alpha, dataset_type=dataset_type)
[docs] def fit(self, X, y, columns=None): """Fitting function that sets self.representatives\_. Calls the function performing feature selection algorithm. The number of columns in X and the number of nodes in the hierarchy are expected to be the same and each column should be mapped to exactly one node in the hierarchy with the columns parameter. After fitting self.representatives\_ includes the names of all nodes from the hierarchy that are left after feature selection. The features are selected by going through the feature graph from top to bottom, replacing parent nodes with their children and evaluating the resulting feature set with a fitness function. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. y : array-like, shape (n_samples,) The target values. An array of int. columns: list or None, length n_features The mapping from the hierarchy graph's nodes to the columns in X. A list of ints. If this parameter is None the columns in X and the corresponding nodes in the hierarchy are expected to be in the same order. Returns ------- self : object Returns self. """ return super().fit(X, y, columns)
def _hill_climb(self, X) -> list[int]: """Performs the feature selection. The features are selected by going through the feature graph from top to bottom, replacing parent nodes with their children and evaluating the resulting feature set with a fitness function. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. Returns ------- optimal_feature_set : list A list of nodes (int or str) that represent the optimal features set. """ self._score_matrix = self._calculate_scores(X) # Start with nodes on first level after virtual root node optimal_feature_set = set(self._hierarchy.successors("ROOT")) fitness = 0 best_fitness = 0 best_feature_set = None while True: for node in optimal_feature_set: children = list(self._hierarchy.successors(node)) if children: # Replace the current node with its children and # evaluate the resulting feature set using the # fitness function. temporary_feature_set = optimal_feature_set.copy() temporary_feature_set.remove(node) temporary_feature_set.update(children) distances = self._comparison_matrix(temporary_feature_set) temporary_fitness = self._fitness_function(distances) if (temporary_fitness) > best_fitness: best_fitness = temporary_fitness best_feature_set = temporary_feature_set if best_fitness > fitness: optimal_feature_set = best_feature_set fitness = best_fitness else: break return list(optimal_feature_set) def _compare( self, sample_i: int, sample_j: int, feature_set: list[int], ): return self._calculate_distance(sample_i, sample_j, feature_set) def _calculate_distance( self, sample_i: int, sample_j: int, feature_set: list[int], ): distance = 0 for column in feature_set: column_index = self._column_index(column) difference = ( self._score_matrix[sample_i, column_index] - self._score_matrix[sample_j, column_index] ) distance += math.pow(difference, 2) return math.sqrt(distance) def _fitness_function(self, comparison_matrix: np.ndarray) -> float: result = 0 row_indices = range(self._num_rows) for row_index in row_indices: same_class = [ sample for sample in row_indices if self.y_[sample] == self.y_[row_index] ] other_class = [x for x in row_indices if x not in same_class] nominator = sum([comparison_matrix[x, row_index] for x in other_class]) denominator = 1 + self.alpha * sum( [comparison_matrix[sample, row_index] for sample in same_class] ) result += nominator / denominator return result
[docs]class BottomUpSelector(HillClimbingSelector): """Hill climbing bottom up feature selection method. This feature selection method was proposed by Wang et al. in 2003. The features are selected by going through the feature graph from bottom to top, replacing child nodes with their parent node and evaluating the resulting feature set with a fitness function. The method is intended for hierarchical data. Therefore, it inherits from the EagerHierarchicalFeatureSelector. """
[docs] def __init__( self, hierarchy: np.ndarray = None, alpha: float = 0.01, k: int = 5, dataset_type: str = "binary", ): """Initializes a BottomUpSelector. Parameters ---------- hierarchy : np.ndarray The hierarchy graph as an adjacency matrix. For this feature selection method to work as intended the graph needs to be a tree. alpha: float A hyperparameter needed for the feature selection. In the paper by Wang et al. this parameter is called beta. The default value is 0.01. k : int A hyperparameter needed to determine the k nearest neighbors during the feature selection algorithm. The default value is 5. dataset_type: string, either "binary" or "numerical" A value indicating if the input dataset contains binary or numerical data. Default is "binary". """ super().__init__(hierarchy, alpha=alpha, dataset_type=dataset_type) self.k = k
[docs] def fit(self, X, y, columns=None): """Fitting function that sets self.representatives\_. Calls the function performing feature selection algorithm. The number of columns in X and the number of nodes in the hierarchy are expected to be the same and each column should be mapped to exactly one node in the hierarchy with the columns parameter. After fitting self.representatives\_ includes the names of all nodes from the hierarchy that are left after feature selection. The features are selected by going through the feature graph from bottom to top, replacing child nodes with their parent node and evaluating the resulting feature set with a fitness function. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. y : array-like, shape (n_samples,) The target values. An array of int. columns: list or None, length n_features The mapping from the hierarchy graph's nodes to the columns in X. A list of ints. If this parameter is None the columns in X and the corresponding nodes in the hierarchy are expected to be in the same order. Returns ------- self : object Returns self. """ return super().fit(X, y, columns)
def _hill_climb(self, X) -> list[int]: """Performs the feature selection. The features are selected by going through the feature graph from bottom to top, replacing child nodes with their parent node and evaluating the resulting feature set with a fitness function. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The training input samples. Returns ------- optimal_feature_set : list A list of nodes (int or str) that represent the optimal features set. """ self._score_matrix = self._calculate_scores(X) # Start with the leaves. current_feature_set = get_leaves(self._hierarchy) if current_feature_set == ["ROOT"] or current_feature_set == []: return [] current_fitness = self._fitness_function( self._comparison_matrix(current_feature_set) ) unvisited = set(current_feature_set) while unvisited: temporary_feature_set = current_feature_set.copy() node = unvisited.pop() parent = list(self._hierarchy.predecessors(node))[ 0 ] # This does not work with a DAG. if parent != "ROOT": # Replace the current node and its siblings with their # parent node. temporary_feature_set.append(parent) children = list(self._hierarchy.successors(parent)) updated_feature_set = [ node for node in temporary_feature_set if node not in children ] temporary_fitness = self._fitness_function( self._comparison_matrix(updated_feature_set) ) if temporary_fitness < current_fitness: current_feature_set = temporary_feature_set current_fitness = temporary_fitness unvisited = set(current_feature_set) return current_feature_set def _compare( self, sample_i: int, sample_j: int, feature_set: list[int], ): return self._calculate_similarity(sample_i, sample_j, feature_set) def _calculate_similarity( self, sample_i: int, sample_j: int, feature_set: list[int] ): if "ROOT" in feature_set: feature_set = feature_set.remove("ROOT") row_i = self._score_matrix[sample_i, feature_set] row_j = self._score_matrix[sample_j, feature_set] return cosine_similarity(row_i.flatten(), row_j.flatten()) def _fitness_function(self, comparison_matrix: np.ndarray) -> float: # number_of_leaf_nodes is the alpha value from paper. number_of_leaf_nodes = len(get_leaves(self._hierarchy)) if number_of_leaf_nodes == 0: number_of_leaf_nodes = 1 threshold_index = self._num_rows - self.k - 1 k_nearest_neigbors = [ list( np.argpartition(comparison_matrix[row, :], threshold_index)[ threshold_index: ] ) for row in range(self._num_rows) ] count = 0 for row in range(self._num_rows): for neighbor in k_nearest_neigbors[row]: if self.y_[neighbor] == self.y_[row] and neighbor != row: count += 1 result = count * ( 1 + self.alpha * (number_of_leaf_nodes - self.n_features_) / number_of_leaf_nodes ) return result