Source code for frlearn.trees.data_descriptors

"""Decision tree data descriptors"""
from __future__ import annotations

from typing import Callable

from sklearn.ensemble import IsolationForest

from frlearn.base import DataDescriptor


class EIF(DataDescriptor):
    """
    Wrapper for the Extended Isolation Forest (EIF) data descriptor [1]_.
    Requires the eif library, which is not automatically installed.
    Expresses the effort required to isolate a query instance
    from the target data by separating instances with random hyperplanes.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class
        and returns such an integer.
        If the size of the target class is a smaller number,
        that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    eif_params
        Additional keyword parameters will be passed on as-is
        to eif's iForest constructor.

    preprocessors : iterable = ()
        Preprocessors to apply.

    Raises
    ------
    ImportError
        If the eif library cannot be imported at initialisation.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.
    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [2]_.

    References
    ----------

    .. [1] `Hariri S, Carrasco Kind M, Brunner RJ (2021).
       Extended Isolation Forest.
       IEEE Transactions on Knowledge and Data Engineering, vol 33, no 4, pp 1479–1489.
       doi: 10.1109/TKDE.2019.2947676
       <https://ieeexplore.ieee.org/document/8888179>`_
    .. [2] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422. IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
    """

    def __init__(
        self,
        psi: int | Callable[[int], int] = 256,
        t: int = 100,
        random_state: int = 0,
        preprocessors=(),
        **eif_params
    ):
        super().__init__(preprocessors=preprocessors)
        # Fail early, with a clear message, when the optional dependency
        # is missing; the module itself is only used in `_construct`.
        try:
            import eif  # noqa: F401
        except ImportError:
            raise ImportError('EIF data descriptor requires the eif library.') from None
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.eif_params = eif_params

    def _construct(self, X):
        """Build a model holding an eif forest constructed from `X`."""
        import eif

        model = super()._construct(X)
        n = X.shape[0]
        # `psi` may be a function of the target class size; evaluate it
        # before capping, since `min` cannot compare a function to an int.
        psi = self.psi(n) if callable(self.psi) else self.psi
        model.psi = min(psi, n)
        model.t = self.t
        model.random_state = self.random_state
        model.forest = eif.iForest(
            X,
            ntrees=model.t,
            sample_size=model.psi,
            seed=model.random_state,
            # extension level is set to (number of columns of X) - 1
            ExtensionLevel=X.shape[1] - 1,
            **self.eif_params
        )
        return model

    class Model(DataDescriptor.Model):
        """Constructed model wrapping the eif forest."""

        psi: int
        t: int
        random_state: int
        forest: ...

        def _query(self, X):
            # convert anomaly scores to normality scores
            return 1 - self.forest.compute_paths(X_in=X)
class IF(DataDescriptor):
    """
    Wrapper for the Isolation Forest (IF) data descriptor [1]_
    implemented in scikit-learn.
    Expresses the effort required to isolate a query instance
    from the target data by random splits on attribute values.

    Parameters
    ----------
    psi : int or (int -> int) = 256
        Sub-sampling size. Number of training instances to use for each random tree.
        Should be either a positive integer,
        or a function that takes the size of the target class
        and returns such an integer.
        If the size of the target class is a smaller number,
        that will be used instead.

    t : int = 100
        Number of random trees.

    random_state : int = 0
        Random state to use.

    preprocessors : iterable = ()
        Preprocessors to apply.

    sklearn_params
        Additional keyword parameters will be passed on as-is
        to scikit-learn's IsolationForest constructor.

    Notes
    -----
    Scores are the complement of the anomaly scores in [1]_.
    `psi` and `t` are two hyperparameters that can potentially be tuned,
    but the default values should be good enough [1]_.

    References
    ----------

    .. [1] `Liu FT, Ting KM, Zhou Z-H (2008).
       Isolation Forest.
       ICDM 2008: Proceedings of the Eighth IEEE International Conference on Data Mining, pp 413–422. IEEE.
       doi: 10.1109/ICDM.2008.17
       <https://ieeexplore.ieee.org/document/4781136>`_
    """

    def __init__(
        self,
        psi: int | Callable[[int], int] = 256,
        t: int = 100,
        random_state: int = 0,
        preprocessors=(),
        **sklearn_params,
    ):
        super().__init__(preprocessors=preprocessors)
        self.psi = psi
        self.t = t
        self.random_state = random_state
        self.sklearn_params = sklearn_params

    def _construct(self, X):
        """Build a model holding an IsolationForest fitted on `X`."""
        model = super()._construct(X)
        n = X.shape[0]
        # `psi` may be a function of the target class size; evaluate it
        # before capping, since `min` cannot compare a function to an int.
        psi = self.psi(n) if callable(self.psi) else self.psi
        model.psi = min(psi, n)
        model.t = self.t
        model.random_state = self.random_state
        model.forest = IsolationForest(
            max_samples=model.psi,
            n_estimators=model.t,
            random_state=self.random_state,
            **self.sklearn_params
        ).fit(X)
        return model

    class Model(DataDescriptor.Model):
        """Constructed model wrapping the fitted scikit-learn forest."""

        psi: int
        t: int
        random_state: int
        forest: IsolationForest

        def _query(self, X):
            # map from [-1, 0] to [0, 1]
            return 1 + self.forest.score_samples(X)