Source code for hfs.hill_climbing

"""
Hill Climbing Feature Selectors.
"""
import math

import numpy as np
from scipy import sparse
from sklearn.utils.validation import check_X_y

from hfs.eagerHierarchicalFeatureSelector import EagerHierarchicalFeatureSelector
from hfs.helpers import compute_aggregated_values, get_leaves, normalize_score
from hfs.metrics import cosine_similarity


[docs]class HillClimbingSelector(EagerHierarchicalFeatureSelector):
    """Base class for hill climbing feature selection methods proposed by Wang.

    The feature selection methods are intended for hierarchical data.
    Therefore, this class inherits from the EagerHierarchicalFeatureSelector.
    """

[docs]    def __init__(
        self,
        hierarchy: np.ndarray = None,
        alpha: float = 0.99,
        dataset_type: str = "binary",
    ):
        """Initializes a HillClimbingSelector.

        Parameters
        ----------
        hierarchy : np.ndarray
                    The hierarchy graph as an adjacency matrix.
        alpha: float
                A hyperparameter needed for the hill climbing methods.
                The default value is 0.99.
        dataset_type: string, either "binary" or "numerical"
                A value indicating if the input dataset contains binary or
                numerical data. Default is "binary".
        """
        super().__init__(hierarchy)
        self.alpha = alpha
        self.dataset_type = dataset_type

[docs]    def fit(self, X, y, columns=None):
        """Fitting function that sets self.representatives\_.

        Calls the function performing feature selection algorithm.
        The number of columns in X and the number of nodes in the hierarchy
        are expected to be the same and each column should be mapped to
        exactly one node in the hierarchy with the columns parameter.
        After fitting self.representatives\_ includes the names of all
        nodes from the hierarchy that are left after feature selection.
        The features are selected comparing different sets of features
        with a fitness function.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        columns: list or None, length n_features
            The mapping from the hierarchy graph's nodes to the columns in X.
            A list of ints. If this parameter is None the columns in X and
            the corresponding nodes in the hierarchy are expected to be in the
            same order.

        Returns
        -------
        self : object
            Returns self.
        """
        # Input validation
        X, y = check_X_y(X, y, accept_sparse=True)

        super().fit(X, y, columns)
        if sparse.issparse(X):
            X = X.tocsr()

        # Feature Selection Algorithm
        self.y_ = y
        self._num_rows = X.shape[0]
        self.representatives_ = self._hill_climb(X)

        return self

    def _hill_climb(self, X):
        """Performs the feature selection.

        This methods needs to be implemented by the subclasses.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        """
        raise NotImplementedError

    def _calculate_scores(self, X):
        """Calculate scores for each value in X.

        To calculate the scores the values X are summed up with
        the feature's children's values. If the dataset is of the
        type "numerical" the sums are normalized.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        score_matrix : numpy.ndarray, shape (n_samples, n_features)
                    The scores calculated for each value in X.
        """
        score_matrix = compute_aggregated_values(
            X.copy(), self._hierarchy, self._columns
        )

        if self.dataset_type == "numerical":
            normalized_matrix = np.zeros_like(score_matrix, dtype=float)
            for row_index in range(self._num_rows):
                for column_index in range(self.n_features_):
                    score = score_matrix[row_index, column_index]
                    normalized_matrix[row_index, column_index] = normalize_score(
                        score, max(score_matrix[row_index, :])
                    )
            score_matrix = normalized_matrix
        return score_matrix

    def _compare(
        self,
        sample_i: int,
        sample_j: int,
        feature_set: list[int],
    ):
        """Compare to samples from the dataset.

        This method needs to be implemented by the subclasses.
        It should return a score that can be used for comparison.

        Parameters
        ----------
        sample_i : int
            A row index for a sample from the dataset.
        sample_j : int
            A row index for another sample from the dataset.
        feature_set: list
                    A list of nodes that are in the feature set that is
                    currently being evaluated.
        """
        raise NotImplementedError

    def _comparison_matrix(self, feature_set: list[int]):
        """Creates matrix to compare the individual samples from the dataset.

        Parameters
        ----------
        feature_set : list
                A list of nodes that are in the feature set that is
                currently being evaluated.

        Returns
        -------
        distances : numpy.ndarray, shape (num_rows, num_rows)
                    An matrix with distances or similarities to compare all
                    rows from the dataset with eachother.
        """
        distances = np.zeros((self._num_rows, self._num_rows), dtype=float)
        for row_i in range(self._num_rows):
            for row_j in range(self._num_rows):
                if distances[row_i, row_j] == 0:
                    distance = self._compare(row_i, row_j, feature_set)
                    distances[row_i, row_j] = distance
                    distances[row_j, row_i] = distance
        return distances

    def _fitness_function(self, comparison_matrix: np.ndarray) -> float:
        raise NotImplementedError


[docs]class TopDownSelector(HillClimbingSelector):
    """Hill climbing top down feature selection method.

    This feature selection method was proposed by Wang et al. in 2002.
    The features are selected by going through the feature graph from
    top to bottom, replacing parent nodes with their children and
    evaluating the resulting feature set with a fitness function.
    The method is intended for hierarchical data. Therefore, it inherits
    from the EagerHierarchicalFeatureSelector.
    """

[docs]    def __init__(
        self,
        hierarchy: np.ndarray = None,
        alpha: float = 0.99,
        dataset_type: str = "binary",
    ):
        """Initializes a TopDownSelector.

        Parameters
        ----------
        hierarchy : np.ndarray
                    The hierarchy graph as an adjacency matrix.
        alpha: float
                A hyperparameter needed for the hill climbing methods.
                The default value is 0.99.
        dataset_type: string, either "binary" or "numerical"
                A value indicating if the input dataset contains binary or
                numerical data. Default is "binary".
        """
        super().__init__(hierarchy, alpha=alpha, dataset_type=dataset_type)

[docs]    def fit(self, X, y, columns=None):
        """Fitting function that sets self.representatives\_.

        Calls the function performing feature selection algorithm.
        The number of columns in X and the number of nodes in the hierarchy
        are expected to be the same and each column should be mapped to
        exactly one node in the hierarchy with the columns parameter.
        After fitting self.representatives\_ includes the names of all
        nodes from the hierarchy that are left after feature selection.
        The features are selected by going through the feature graph from
        top to bottom, replacing parent nodes with their children and
        evaluating the resulting feature set with a fitness function.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        columns: list or None, length n_features
            The mapping from the hierarchy graph's nodes to the columns in X.
            A list of ints. If this parameter is None the columns in X and
            the corresponding nodes in the hierarchy are expected to be in the
            same order.

        Returns
        -------
        self : object
            Returns self.
        """
        return super().fit(X, y, columns)

    def _hill_climb(self, X) -> list[int]:
        """Performs the feature selection.

        The features are selected by going through the feature graph from
        top to bottom, replacing parent nodes with their children and
        evaluating the resulting feature set with a fitness function.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        optimal_feature_set : list
                    A list of nodes (int or str) that represent the
                    optimal features set.
        """
        self._score_matrix = self._calculate_scores(X)

        # Start with nodes on first level after virtual root node
        optimal_feature_set = set(self._hierarchy.successors("ROOT"))
        fitness = 0
        best_fitness = 0
        best_feature_set = None

        while True:
            for node in optimal_feature_set:
                children = list(self._hierarchy.successors(node))
                if children:
                    # Replace the current node with its children and
                    # evaluate the resulting feature set using the
                    # fitness function.
                    temporary_feature_set = optimal_feature_set.copy()
                    temporary_feature_set.remove(node)
                    temporary_feature_set.update(children)
                    distances = self._comparison_matrix(temporary_feature_set)
                    temporary_fitness = self._fitness_function(distances)
                    if (temporary_fitness) > best_fitness:
                        best_fitness = temporary_fitness
                        best_feature_set = temporary_feature_set

            if best_fitness > fitness:
                optimal_feature_set = best_feature_set
                fitness = best_fitness
            else:
                break
        return list(optimal_feature_set)

    def _compare(
        self,
        sample_i: int,
        sample_j: int,
        feature_set: list[int],
    ):
        return self._calculate_distance(sample_i, sample_j, feature_set)

    def _calculate_distance(
        self,
        sample_i: int,
        sample_j: int,
        feature_set: list[int],
    ):
        distance = 0
        for column in feature_set:
            column_index = self._column_index(column)
            difference = (
                self._score_matrix[sample_i, column_index]
                - self._score_matrix[sample_j, column_index]
            )
            distance += math.pow(difference, 2)
        return math.sqrt(distance)

    def _fitness_function(self, comparison_matrix: np.ndarray) -> float:
        result = 0
        row_indices = range(self._num_rows)
        for row_index in row_indices:
            same_class = [
                sample
                for sample in row_indices
                if self.y_[sample] == self.y_[row_index]
            ]
            other_class = [x for x in row_indices if x not in same_class]

            nominator = sum([comparison_matrix[x, row_index] for x in other_class])
            denominator = 1 + self.alpha * sum(
                [comparison_matrix[sample, row_index] for sample in same_class]
            )
            result += nominator / denominator

        return result


[docs]class BottomUpSelector(HillClimbingSelector):
    """Hill climbing bottom up feature selection method.

    This feature selection method was proposed by Wang et al. in 2003.
    The features are selected by going through the feature graph from
    bottom to top, replacing child nodes with their parent node and
    evaluating the resulting feature set with a fitness function.
    The method is intended for hierarchical data. Therefore, it inherits
    from the EagerHierarchicalFeatureSelector.
    """

[docs]    def __init__(
        self,
        hierarchy: np.ndarray = None,
        alpha: float = 0.01,
        k: int = 5,
        dataset_type: str = "binary",
    ):
        """Initializes a BottomUpSelector.

        Parameters
        ----------
        hierarchy : np.ndarray
                    The hierarchy graph as an adjacency matrix. For this
                    feature selection method to work as intended the graph
                    needs to be a tree.
        alpha: float
                A hyperparameter needed for the feature selection. In
                the paper by Wang et al. this parameter is called beta.
                The default value is 0.01.
        k : int
                A hyperparameter needed to determine the k nearest
                neighbors during the feature selection algorithm.
                The default value is 5.
        dataset_type: string, either "binary" or "numerical"
                A value indicating if the input dataset contains binary or
                numerical data. Default is "binary".
        """
        super().__init__(hierarchy, alpha=alpha, dataset_type=dataset_type)
        self.k = k

[docs]    def fit(self, X, y, columns=None):
        """Fitting function that sets self.representatives\_.

        Calls the function performing feature selection algorithm.
        The number of columns in X and the number of nodes in the hierarchy
        are expected to be the same and each column should be mapped to
        exactly one node in the hierarchy with the columns parameter.
        After fitting self.representatives\_ includes the names of all
        nodes from the hierarchy that are left after feature selection.
        The features are selected by going through the feature graph from
        bottom to top, replacing child nodes with their parent node and
        evaluating the resulting feature set with a fitness function.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        columns: list or None, length n_features
            The mapping from the hierarchy graph's nodes to the columns in X.
            A list of ints. If this parameter is None the columns in X and
            the corresponding nodes in the hierarchy are expected to be in the
            same order.

        Returns
        -------
        self : object
            Returns self.
        """
        return super().fit(X, y, columns)

    def _hill_climb(self, X) -> list[int]:
        """Performs the feature selection.

        The features are selected by going through the feature graph from
        bottom to top, replacing child nodes with their parent node and
        evaluating the resulting feature set with a fitness function.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        optimal_feature_set : list
                    A list of nodes (int or str) that represent the
                    optimal features set.
        """
        self._score_matrix = self._calculate_scores(X)

        # Start with the leaves.
        current_feature_set = get_leaves(self._hierarchy)
        if current_feature_set == ["ROOT"] or current_feature_set == []:
            return []
        current_fitness = self._fitness_function(
            self._comparison_matrix(current_feature_set)
        )

        unvisited = set(current_feature_set)

        while unvisited:
            temporary_feature_set = current_feature_set.copy()
            node = unvisited.pop()
            parent = list(self._hierarchy.predecessors(node))[
                0
            ]  # This does not work with a DAG.
            if parent != "ROOT":
                # Replace the current node and its siblings with their
                # parent node.
                temporary_feature_set.append(parent)
                children = list(self._hierarchy.successors(parent))
                updated_feature_set = [
                    node for node in temporary_feature_set if node not in children
                ]
                temporary_fitness = self._fitness_function(
                    self._comparison_matrix(updated_feature_set)
                )
                if temporary_fitness < current_fitness:
                    current_feature_set = temporary_feature_set
                    current_fitness = temporary_fitness
                    unvisited = set(current_feature_set)

        return current_feature_set

    def _compare(
        self,
        sample_i: int,
        sample_j: int,
        feature_set: list[int],
    ):
        return self._calculate_similarity(sample_i, sample_j, feature_set)

    def _calculate_similarity(
        self, sample_i: int, sample_j: int, feature_set: list[int]
    ):
        if "ROOT" in feature_set:
            feature_set = feature_set.remove("ROOT")
        row_i = self._score_matrix[sample_i, feature_set]
        row_j = self._score_matrix[sample_j, feature_set]
        return cosine_similarity(row_i.flatten(), row_j.flatten())

    def _fitness_function(self, comparison_matrix: np.ndarray) -> float:
        # number_of_leaf_nodes is the alpha value from paper.
        number_of_leaf_nodes = len(get_leaves(self._hierarchy))
        if number_of_leaf_nodes == 0:
            number_of_leaf_nodes = 1

        threshold_index = self._num_rows - self.k - 1
        k_nearest_neigbors = [
            list(
                np.argpartition(comparison_matrix[row, :], threshold_index)[
                    threshold_index:
                ]
            )
            for row in range(self._num_rows)
        ]

        count = 0
        for row in range(self._num_rows):
            for neighbor in k_nearest_neigbors[row]:
                if self.y_[neighbor] == self.y_[row] and neighbor != row:
                    count += 1

        result = count * (
            1
            + self.alpha
            * (number_of_leaf_nodes - self.n_features_)
            / number_of_leaf_nodes
        )
        return result