Source code for hfs.shsel

"""
SHSEL Feature Selector.
"""
import statistics

import numpy as np
from scipy import sparse
from sklearn.utils.validation import check_X_y

from .eagerHierarchicalFeatureSelector import EagerHierarchicalFeatureSelector
from .helpers import compute_aggregated_values, get_leaves, get_paths
from .metrics import information_gain, pearson_correlation


[docs]class SHSELSelector(EagerHierarchicalFeatureSelector):
    """SHSEL feature selection method for hierarchical features.

    This feature selection method was proposed by Ristoski and Paulheim
    in 2014. The features are selected by removing features with
    parents that have a similar relevance and removing features with
    lower than average information gain for each path from leaf to
    root.
    This Selector also implements the hierarchical feature
    engineering (HFE) extension proposed by Oudah and Henschel in
    2018.
    """

[docs]    def __init__(
        self,
        hierarchy: np.ndarray = None,
        relevance_metric: str = "IG",
        similarity_threshold=0.99,
        use_hfe_extension=False,
        preprocess_numerical_data=False,
    ):
        """Initializes a SHSELSelector.

        Parameters
        ----------
        hierarchy : np.ndarray
                    The hierarchy graph as an adjacency matrix.
        relevance_metric : str
                    The relevance metric to use in the initial selection
                    stage of the algorithm. The options ore "IG" for
                    information gain and "Correlation". Default is IG.
        similarity_threshold : float
                    The similarity threshold to use in the initial selection
                    stage of the algorithm. This can be a number between
                    0 an 1. Default is 0.99.
        use_hfe_extension : bool
                    If True the HFE algorithm proposed by Oudah and Henschel is
                    used. Set relevance_metric to "Correlation" when using this
                    extension. Default is False.
        preprocess_numerical_data : False
                    If True the data is preprocessed by adding up the child values.
                    This method is used in the HFE extension algorithm which
                    expects numerical data. If binary data is used it is
                    recommended to set this parameter to False. Default is False.

        """
        super().__init__(hierarchy)
        self.relevance_metric = relevance_metric
        self.similarity_threshold = similarity_threshold
        self.use_hfe_extension = use_hfe_extension
        self.preprocess_numerical_data = preprocess_numerical_data

[docs]    def fit(self, X, y, columns=None):
        """Fitting function that sets self.representatives\_.

        The number of columns in X and the number of nodes in the hierarchy
        are expected to be the same and each column should be mapped to
        exactly one node in the hierarchy with the columns parameter.
        After fitting self.representatives\_ includes the names of all
        nodes from the hierarchy that are left after feature selection.
        The features are selected by removing features with
        parents that have a similar relevance and removing features with
        lower than average information gain for each path from leaf to
        root.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        columns: list or None, length n_features
            The mapping from the hierarchy graph's nodes to the columns in X.
            A list of ints. If this parameter is None the columns in X and
            the corresponding nodes in the hierarchy are expected to be in the
            same order.

        Returns
        -------
        self : object
            Returns self.
        """
        # Input validation
        X, y = check_X_y(X, y, accept_sparse=True)
        if sparse.issparse(X):
            X = X.tocsr()
        super().fit(X, y, columns)

        # Feature Selection Algorithm
        self._calculate_relevance(X, y)
        self._fit(X)

        self.is_fitted_ = True
        return self

    def _fit(self, X):
        """The feature selection algorithm."""
        if self.preprocess_numerical_data:
            X = self._preprocess(X)
        paths = get_paths(self._hierarchy, reverse=True)
        self._inital_selection(paths, X)
        self._pruning(paths)
        if self.use_hfe_extension:
            self._leaf_filtering()

    def _inital_selection(self, paths, X):
        """First part of the feature selection algorithm."""
        remove_nodes = set()

        for path in paths:
            # If the relevance is to similar to the parents relevance
            # the child is removed
            for index, node in enumerate(path):
                parent_node = path[index + 1]
                if parent_node == "ROOT":
                    break
                if self.relevance_metric == "IG":
                    similarity = 1 - abs(
                        self._relevance_values[parent_node]
                        - self._relevance_values[node]
                    )
                else:
                    similarity = pearson_correlation(
                        X[:, self._columns.index(parent_node)],
                        X[:, self._columns.index(node)],
                    )
                if similarity >= self.similarity_threshold:
                    remove_nodes.add(node)

        self.representatives_ = [
            feature for feature in self._columns if feature not in remove_nodes
        ]

    def _select_leaves(self):
        """Select leaves of incomplete paths (part of HFE extension)"""
        leaves = [
            leaf
            for leaf in get_leaves(self._hierarchy)
            if leaf in self.representatives_
        ]

        paths = get_paths(self._hierarchy)
        max_path_len = max([len(path) for path in paths])
        selected_leaves = []
        for leaf in leaves:
            for path in paths:
                if leaf in path and len(path) != max_path_len:
                    selected_leaves.append(leaf)
        return selected_leaves

    def _pruning(self, paths):
        """Second part of the feature selection algorithm"""
        paths = get_paths(self._hierarchy, reverse=True)
        updated_representatives = []

        for path in paths:
            path.remove("ROOT")
            average_relevance = statistics.mean(
                [
                    self._relevance_values[node]
                    for node in path
                    if node in self.representatives_
                ]
            )
            for node in path:
                if node in self.representatives_ and round(
                    self._relevance_values[node], 6
                ) >= round(average_relevance, 6):
                    updated_representatives.append(node)

        self.representatives_ = updated_representatives

    def _calculate_relevance(self, X, y):
        values = information_gain(X, y)
        self._relevance_values = dict(zip(self._columns, values))

    def _preprocess(self, X):
        """Preprocess numerical data by summing up child values.

        This is part of the HFE extension and only makes sense for
        numerial data and not for binary data.
        """
        return compute_aggregated_values("ROOT", X, self._hierarchy, self._columns)

    def _leaf_filtering(self):
        """Filtering representatives by removing leaves with low relevance.

        This is part of the HFE extension proposed by Oudah and Henschel.
        """
        average_ig = statistics.mean(
            [self._relevance_values[node] for node in self.representatives_]
        )

        leaves = self._select_leaves()

        remove_nodes = [
            leaf
            for leaf in leaves
            if self._relevance_values[leaf] < average_ig
            or self._relevance_values[leaf] == 0
        ]
        updated_representatives = [
            node for node in self.representatives_ if node not in remove_nodes
        ]
        self.representatives_ = updated_representatives