"""
SHSEL Feature Selector.
"""
import statistics
import numpy as np
from scipy import sparse
from sklearn.utils.validation import check_X_y
from .eagerHierarchicalFeatureSelector import EagerHierarchicalFeatureSelector
from .helpers import compute_aggregated_values, get_leaves, get_paths
from .metrics import information_gain, pearson_correlation
[docs]class SHSELSelector(EagerHierarchicalFeatureSelector):
"""SHSEL feature selection method for hierarchical features.
This feature selection method was proposed by Ristoski and Paulheim
in 2014. The features are selected by removing features with
parents that have a similar relevance and removing features with
lower than average information gain for each path from leaf to
root.
This Selector also implements the hierarchical feature
engineering (HFE) extension proposed by Oudah and Henschel in
2018.
"""
[docs] def __init__(
self,
hierarchy: np.ndarray = None,
relevance_metric: str = "IG",
similarity_threshold=0.99,
use_hfe_extension=False,
preprocess_numerical_data=False,
):
"""Initializes a SHSELSelector.
Parameters
----------
hierarchy : np.ndarray
The hierarchy graph as an adjacency matrix.
relevance_metric : str
The relevance metric to use in the initial selection
stage of the algorithm. The options ore "IG" for
information gain and "Correlation". Default is IG.
similarity_threshold : float
The similarity threshold to use in the initial selection
stage of the algorithm. This can be a number between
0 an 1. Default is 0.99.
use_hfe_extension : bool
If True the HFE algorithm proposed by Oudah and Henschel is
used. Set relevance_metric to "Correlation" when using this
extension. Default is False.
preprocess_numerical_data : False
If True the data is preprocessed by adding up the child values.
This method is used in the HFE extension algorithm which
expects numerical data. If binary data is used it is
recommended to set this parameter to False. Default is False.
"""
super().__init__(hierarchy)
self.relevance_metric = relevance_metric
self.similarity_threshold = similarity_threshold
self.use_hfe_extension = use_hfe_extension
self.preprocess_numerical_data = preprocess_numerical_data
[docs] def fit(self, X, y, columns=None):
"""Fitting function that sets self.representatives\_.
The number of columns in X and the number of nodes in the hierarchy
are expected to be the same and each column should be mapped to
exactly one node in the hierarchy with the columns parameter.
After fitting self.representatives\_ includes the names of all
nodes from the hierarchy that are left after feature selection.
The features are selected by removing features with
parents that have a similar relevance and removing features with
lower than average information gain for each path from leaf to
root.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values. An array of int.
columns: list or None, length n_features
The mapping from the hierarchy graph's nodes to the columns in X.
A list of ints. If this parameter is None the columns in X and
the corresponding nodes in the hierarchy are expected to be in the
same order.
Returns
-------
self : object
Returns self.
"""
# Input validation
X, y = check_X_y(X, y, accept_sparse=True)
if sparse.issparse(X):
X = X.tocsr()
super().fit(X, y, columns)
# Feature Selection Algorithm
self._calculate_relevance(X, y)
self._fit(X)
self.is_fitted_ = True
return self
def _fit(self, X):
"""The feature selection algorithm."""
if self.preprocess_numerical_data:
X = self._preprocess(X)
paths = get_paths(self._hierarchy, reverse=True)
self._inital_selection(paths, X)
self._pruning(paths)
if self.use_hfe_extension:
self._leaf_filtering()
def _inital_selection(self, paths, X):
"""First part of the feature selection algorithm."""
remove_nodes = set()
for path in paths:
# If the relevance is to similar to the parents relevance
# the child is removed
for index, node in enumerate(path):
parent_node = path[index + 1]
if parent_node == "ROOT":
break
if self.relevance_metric == "IG":
similarity = 1 - abs(
self._relevance_values[parent_node]
- self._relevance_values[node]
)
else:
similarity = pearson_correlation(
X[:, self._columns.index(parent_node)],
X[:, self._columns.index(node)],
)
if similarity >= self.similarity_threshold:
remove_nodes.add(node)
self.representatives_ = [
feature for feature in self._columns if feature not in remove_nodes
]
def _select_leaves(self):
"""Select leaves of incomplete paths (part of HFE extension)"""
leaves = [
leaf
for leaf in get_leaves(self._hierarchy)
if leaf in self.representatives_
]
paths = get_paths(self._hierarchy)
max_path_len = max([len(path) for path in paths])
selected_leaves = []
for leaf in leaves:
for path in paths:
if leaf in path and len(path) != max_path_len:
selected_leaves.append(leaf)
return selected_leaves
def _pruning(self, paths):
"""Second part of the feature selection algorithm"""
paths = get_paths(self._hierarchy, reverse=True)
updated_representatives = []
for path in paths:
path.remove("ROOT")
average_relevance = statistics.mean(
[
self._relevance_values[node]
for node in path
if node in self.representatives_
]
)
for node in path:
if node in self.representatives_ and round(
self._relevance_values[node], 6
) >= round(average_relevance, 6):
updated_representatives.append(node)
self.representatives_ = updated_representatives
def _calculate_relevance(self, X, y):
values = information_gain(X, y)
self._relevance_values = dict(zip(self._columns, values))
def _preprocess(self, X):
"""Preprocess numerical data by summing up child values.
This is part of the HFE extension and only makes sense for
numerial data and not for binary data.
"""
return compute_aggregated_values("ROOT", X, self._hierarchy, self._columns)
def _leaf_filtering(self):
"""Filtering representatives by removing leaves with low relevance.
This is part of the HFE extension proposed by Oudah and Henschel.
"""
average_ig = statistics.mean(
[self._relevance_values[node] for node in self.representatives_]
)
leaves = self._select_leaves()
remove_nodes = [
leaf
for leaf in leaves
if self._relevance_values[leaf] < average_ig
or self._relevance_values[leaf] == 0
]
updated_representatives = [
node for node in self.representatives_ if node not in remove_nodes
]
self.representatives_ = updated_representatives