Benchmarking different CF explanation methods
In this notebook, we compare the runtimes of different model-agnostic counterfactual (CF) explanation methods. Currently, we support three model-agnostic explanation methods:

1. Random sampling
2. Genetic algorithm
3. Querying a KD tree
[1]:
import numpy as np
import timeit
import random
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import dice_ml
from dice_ml.utils import helpers # helper functions
from dice_ml import Dice
[2]:
%load_ext autoreload
%autoreload 2
Loading dataset
We use the “adult” income dataset from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/adult). For demonstration purposes, we transform the data as described in the `dice_ml.utils.helpers` module.
[3]:
# Load the preprocessed "adult" income data through the dice_ml helper.
# NOTE: the helper reads the raw file from archive.ics.uci.edu, so this cell needs network access.
dataset = helpers.load_adult_income_dataset()
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 dataset = helpers.load_adult_income_dataset()
File /mnt/c/Users/amshar/code/dice/dice_ml/utils/helpers.py:25, in load_adult_income_dataset(only_train)
19 def load_adult_income_dataset(only_train=True):
20 """Loads adult income dataset from https://archive.ics.uci.edu/ml/datasets/Adult and prepares
21 the data for data analysis based on https://rpubs.com/H_Zhu/235617
22
23 :return adult_data: returns preprocessed adult income dataset.
24 """
---> 25 raw_data = np.genfromtxt('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
26 delimiter=', ', dtype=str, invalid_raise=False)
28 # column names from "https://archive.ics.uci.edu/ml/datasets/Adult"
29 column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation',
30 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
31 'income']
File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/npyio.py:1934, in genfromtxt(fname, dtype, comments, delimiter, skip_header, skip_footer, converters, missing_values, filling_values, usecols, names, excludelist, deletechars, replace_space, autostrip, case_sensitive, defaultfmt, unpack, usemask, loose, invalid_raise, max_rows, encoding, ndmin, like)
1932 fname = os_fspath(fname)
1933 if isinstance(fname, str):
-> 1934 fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
1935 fid_ctx = contextlib.closing(fid)
1936 else:
File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/_datasource.py:193, in open(path, mode, destpath, encoding, newline)
156 """
157 Open `path` with `mode` and return the file object.
158
(...)
189
190 """
192 ds = DataSource(destpath)
--> 193 return ds.open(path, mode, encoding=encoding, newline=newline)
File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/_datasource.py:533, in DataSource.open(self, path, mode, encoding, newline)
530 return _file_openers[ext](found, mode=mode,
531 encoding=encoding, newline=newline)
532 else:
--> 533 raise FileNotFoundError(f"{path} not found.")
FileNotFoundError: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data not found.
[4]:
# Preview the first rows to check which columns were loaded.
dataset.head()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 dataset.head()
NameError: name 'dataset' is not defined
[5]:
# Wrap the dataframe in DiCE's data interface: 'age' and 'hours_per_week' are declared
# as continuous features and 'income' as the outcome column.
d = dice_ml.Data(dataframe=dataset,
                 continuous_features=['age', 'hours_per_week'], outcome_name='income')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [5], in <cell line: 1>()
----> 1 d = dice_ml.Data(dataframe=dataset,
2 continuous_features=['age', 'hours_per_week'], outcome_name='income')
NameError: name 'dataset' is not defined
Training the ML model
Currently, the genetic algorithm and KD tree methods work with scikit-learn models. Support for TensorFlow 1 & 2 and PyTorch will be implemented soon.
[6]:
target = dataset["income"]

# Split data into train and test: hold out 20% of the rows, stratified on the outcome.
datasetX = dataset.drop("income", axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

# Continuous columns are scaled; every other feature column is treated as categorical.
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(
    steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' keeps prediction from failing on categories unseen during fit.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# The forest is seeded (like the split above) so that every run benchmarks the same model;
# an unseeded forest would differ on each run.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [6], in <cell line: 1>()
----> 1 target = dataset["income"]
3 # Split data into train and test
4 datasetX = dataset.drop("income", axis=1)
NameError: name 'dataset' is not defined
[7]:
# Wrap the fitted pipeline in DiCE's model interface, using the sklearn backend.
m = dice_ml.Model(model=model, backend="sklearn")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 m = dice_ml.Model(model=model, backend="sklearn")
NameError: name 'model' is not defined
Initialize counterfactual generation methods
We now initialize all three counterfactual generation methods.
[8]:
# Explainer that uses the random-sampling method.
exp_random = Dice(d, m, method="random")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [8], in <cell line: 1>()
----> 1 exp_random = Dice(d, m, method="random")
NameError: name 'd' is not defined
[9]:
# Explainer that uses the genetic-algorithm method.
exp_genetic = Dice(d, m, method="genetic")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [9], in <cell line: 1>()
----> 1 exp_genetic = Dice(d, m, method="genetic")
NameError: name 'd' is not defined
[10]:
# Explainer that uses the KD-tree querying method.
exp_KD = Dice(d, m, method="kdtree")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [10], in <cell line: 1>()
----> 1 exp_KD = Dice(d, m, method="kdtree")
NameError: name 'd' is not defined
[11]:
# Use three training rows (positions 4-6) as query instances.
# Take an explicit copy: the benchmark cell overwrites these rows with .loc, and writing
# through a bare slice of x_train triggers SettingWithCopyWarning and risks altering x_train.
query_instances = x_train.iloc[4:7].copy()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [11], in <cell line: 1>()
----> 1 query_instances = x_train[4:7]
NameError: name 'x_train' is not defined
[12]:
# Display the selected query instances.
query_instances
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [12], in <cell line: 1>()
----> 1 query_instances
NameError: name 'query_instances' is not defined
Generate Counterfactuals
We now generate counterfactuals of `desired_class=0` using all three methods and check the runtime. You can modify the number of loops (`num_loops`) and the number of diverse counterfactuals to generate (`k`).
[13]:
num_loops = 2  # number of times the benchmark loop below is repeated
k = 2  # passed as total_CFs to each generate_counterfactuals call
[14]:
# Seed both RNGs used for query generation so every run benchmarks the same randomized queries.
random.seed(0)
np.random.seed(0)

# Rebind to a copy before overwriting the rows below: query_instances was sliced out of
# x_train, so writing through it could raise SettingWithCopyWarning or touch the training data.
query_instances = query_instances.copy()

# Accumulated wall-clock seconds per method, summed over all loops.
elapsed_random = 0
elapsed_kd = 0
elapsed_genetic = 0

for _ in range(num_loops):
    # Overwrite every feature of every query row with a freshly sampled value.
    for q in query_instances:
        if q in d.categorical_feature_names:
            # Sample uniformly from the distinct categories of this column.
            # (Previously np.unique was applied to the already-sampled value, which only
            # wrapped each value in a 1-element array.)
            categories = dataset[q].unique()
            query_instances.loc[:, q] = \
                [random.choice(categories) for _ in query_instances.index]
        else:
            # Continuous feature: uniform draw between the observed min and max.
            low, high = dataset[q].min(), dataset[q].max()
            query_instances.loc[:, q] = \
                [np.random.uniform(low, high) for _ in query_instances.index]

    # Time each method on the same set of randomized queries.
    start_time = timeit.default_timer()
    dice_exp_random = exp_random.generate_counterfactuals(query_instances, total_CFs=k,
                                                          desired_class=0, verbose=False)
    elapsed_random += timeit.default_timer() - start_time

    start_time = timeit.default_timer()
    dice_exp = exp_genetic.generate_counterfactuals(query_instances, total_CFs=k, desired_class=0,
                                                    yloss_type="hinge_loss", verbose=False)
    elapsed_genetic += timeit.default_timer() - start_time

    start_time = timeit.default_timer()
    dice_kd = exp_KD.generate_counterfactuals(query_instances, total_CFs=k, desired_class=0,
                                              verbose=False)
    elapsed_kd += timeit.default_timer() - start_time

# Report each total as whole minutes and (truncated) seconds.
m_random, s_random = divmod(elapsed_random, 60)
print('For Independent random sampling of features: Total time taken to generate %d' % num_loops,
      'sets of %d' % k, 'counterfactuals each: %02d' % m_random, 'min %02d' % s_random, 'sec')

m_kd, s_kd = divmod(elapsed_kd, 60)
print('For querying from a KD tree: Total time taken to generate %d' % num_loops,
      'sets of %d' % k, 'counterfactuals each: %02d' % m_kd, 'min %02d' % s_kd, 'sec')

m_genetic, s_genetic = divmod(elapsed_genetic, 60)
print('For genetic algorithm: Total time taken to generate %d' % num_loops,
      'sets of %d' % k, 'counterfactuals each: %02d' % m_genetic, 'min %02d' % s_genetic, 'sec')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [14], in <cell line: 5>()
3 elapsed_genetic = 0
5 for _ in range(num_loops):
----> 6 for q in query_instances:
7 if q in d.categorical_feature_names:
8 query_instances.loc[:, q] = \
9 [np.unique(random.choice(dataset[q].values)) for _ in query_instances.index]
NameError: name 'query_instances' is not defined