"""
TSEL Feature Selector.
"""
import numpy as np
from networkx.algorithms.dag import descendants
from sklearn.utils.validation import check_X_y
from hfs.eagerHierarchicalFeatureSelector import EagerHierarchicalFeatureSelector
from hfs.helpers import get_paths
from hfs.metrics import lift
[docs]class TSELSelector(EagerHierarchicalFeatureSelector):
"""A tree-based feature selection method for hierarchical features.
This hierarchical feature selection methods was proposed by Jeong and
Myaeng in 2013. The features are selected by choosing the most
representative nodes from each path and filtering these nodes further
by removing parents with children that were also selected.
"""
[docs] def __init__(
self, hierarchy: np.ndarray = None, use_original_implementation: bool = True
):
"""Initializes a TSELSelector.
Parameters
----------
hierarchy : np.ndarray
The hierarchy graph as an adjacency matrix. The
feature selection method is intended for a hierarchy
graph that has a tree structure.
use_original_implementation: bool
Should the original implementation from the
paper be used. If False, a slightly different
interpretation of the algorithm is used. Default
is True.
"""
super().__init__(hierarchy)
self.use_original_implementation = use_original_implementation
[docs] def fit(self, X, y, columns=None):
"""Fitting function that sets self.representatives\_.
The number of columns in X and the number of nodes in the hierarchy
are expected to be the same and each column should be mapped to
exactly one node in the hierarchy with the columns parameter.
After fitting self.representatives\_ includes the names of all
nodes from the hierarchy that are left after feature selection.
The features are selected by choosing the most
representative nodes from each path and filtering these nodes further
by removing parents with children that were also selected.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values. An array of int.
columns: list or None, length n_features
The mapping from the hierarchy graph's nodes to the columns in X.
A list of ints. If this parameter is None the columns in X and
the corresponding nodes in the hierarchy are expected to be in the
same order.
Returns
-------
self : object
Returns self.
"""
X, y = check_X_y(X, y, accept_sparse=True)
super().fit(X, y, columns)
# Feature Selection Algorithm
paths = get_paths(self._hierarchy)
lift_values = lift(X, y)
self._node_to_lift = {
column_name: lift_values[index]
for index, column_name in enumerate(self._columns)
}
self.representatives_ = self._find_representatives(paths)
self.is_fitted_ = True
return self
def _find_representatives(self, paths):
""" "Finds a representative node for each path.
This is the first stage of the feature selection algorithm.
In this stage two different implementation can be used.
This is determined by the self.use_original_implementation
parameter.
Parameters
----------
paths : list
The paths for which the representative nodes should
be found. This is a list of lists of node names.
Returns
-------
list : A list of node names. This are the features chosen
by the feature selection algorithm.
"""
representatives = set()
for path in paths:
path.remove("ROOT")
max_node = (
self._select_from_path1(path)
if self.use_original_implementation
else self._select_from_path2(path)
)
representatives.add(max_node)
return self._filter_representatives(representatives)
def _select_from_path1(self, path: list[str]):
"""Finds the prepresentative node for a path.
This is the implementation used in paper by Jeong and Myaeng.
Parameters
----------
paths : list
The paths for which the representative nodes should
be found. This is a list of lists of node names.
Returns
-------
node : int
The node selected as the representative for the given
path.
"""
for index, node in enumerate(path):
if index == len(path) - 1:
return node
elif self._node_to_lift[node] >= self._node_to_lift[path[index + 1]]:
return node
def _select_from_path2(self, path: list[str]):
"""Finds the prepresentative node for a path.
This is a different interpretation of the algorithm form the
paper by Jeong and Myaeng. If multiple nodes are the maximum
the node closest to the root is returned
Parameters
----------
paths : list
The paths for which the representative nodes should
be found. This is a list of lists of node names.
Returns
-------
node : int
The node selected as the representative for the given
path.
"""
max_node = max(path, key=lambda x: self._node_to_lift[x])
return max_node
def _filter_representatives(self, representatives: list[str]):
"""Filters the representative nodes selected in the previous stage.
Parameters
----------
representatives : list
The list of previously selected nodes.
Returns
-------
representatives : list
The list of filtered representatives.
"""
updated_representatives = []
for node in representatives:
selected_decendents = [
descendent
for descendent in descendants(self._hierarchy, node)
if descendent in representatives
]
if not selected_decendents:
updated_representatives.append(node)
return updated_representatives