Benchmarking different CF explanation methods

In this notebook, we compare the runtimes of different model-agnostic explanation methods. Currently, we support three model-agnostic explanation methods: (1) random sampling, (2) a genetic algorithm, and (3) querying a KD tree.

[1]:
import numpy as np
import timeit
import random

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import dice_ml
from dice_ml.utils import helpers  # helper functions
from dice_ml import Dice
[2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell runs, so edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Loading dataset

We use the “adult” income dataset from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/adult). For demonstration purposes, we transform the data as described in the dice_ml.utils.helpers module. The helper downloads the raw data from the UCI archive, so the next cell requires network access.

[3]:
# Load and preprocess the adult income data. The helper reads adult.data from the
# UCI archive URL with np.genfromtxt, so this cell fails without network access.
dataset = helpers.load_adult_income_dataset()
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 dataset = helpers.load_adult_income_dataset()

File /mnt/c/Users/amshar/code/dice/dice_ml/utils/helpers.py:25, in load_adult_income_dataset(only_train)
     19 def load_adult_income_dataset(only_train=True):
     20     """Loads adult income dataset from https://archive.ics.uci.edu/ml/datasets/Adult and prepares
     21        the data for data analysis based on https://rpubs.com/H_Zhu/235617
     22
     23     :return adult_data: returns preprocessed adult income dataset.
     24     """
---> 25     raw_data = np.genfromtxt('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
     26                              delimiter=', ', dtype=str, invalid_raise=False)
     28     #  column names from "https://archive.ics.uci.edu/ml/datasets/Adult"
     29     column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation',
     30                     'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
     31                     'income']

File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/npyio.py:1934, in genfromtxt(fname, dtype, comments, delimiter, skip_header, skip_footer, converters, missing_values, filling_values, usecols, names, excludelist, deletechars, replace_space, autostrip, case_sensitive, defaultfmt, unpack, usemask, loose, invalid_raise, max_rows, encoding, ndmin, like)
   1932     fname = os_fspath(fname)
   1933 if isinstance(fname, str):
-> 1934     fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
   1935     fid_ctx = contextlib.closing(fid)
   1936 else:

File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/_datasource.py:193, in open(path, mode, destpath, encoding, newline)
    156 """
    157 Open `path` with `mode` and return the file object.
    158
   (...)
    189
    190 """
    192 ds = DataSource(destpath)
--> 193 return ds.open(path, mode, encoding=encoding, newline=newline)

File ~/python-envs/v3.8dowhy/lib/python3.8/site-packages/numpy/lib/_datasource.py:533, in DataSource.open(self, path, mode, encoding, newline)
    530     return _file_openers[ext](found, mode=mode,
    531                               encoding=encoding, newline=newline)
    532 else:
--> 533     raise FileNotFoundError(f"{path} not found.")

FileNotFoundError: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data not found.
[4]:
# Preview the first rows of the loaded dataset.
dataset.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 dataset.head()

NameError: name 'dataset' is not defined
[5]:
# Describe the dataset to DiCE: which columns are continuous and which column
# holds the outcome. 'age' and 'hours_per_week' are the continuous features here.
d = dice_ml.Data(
    dataframe=dataset,
    continuous_features=['age', 'hours_per_week'],
    outcome_name='income',
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [5], in <cell line: 1>()
----> 1 d = dice_ml.Data(dataframe=dataset,
      2                  continuous_features=['age', 'hours_per_week'], outcome_name='income')

NameError: name 'dataset' is not defined

Training the ML model

Currently, the genetic algorithm and KD tree methods work with scikit-learn models. Support for TensorFlow 1 & 2 and PyTorch will be implemented soon.

[6]:
# Separate the outcome column from the feature columns.
target = dataset["income"]
datasetX = dataset.drop("income", axis=1)

# Split data into train and test; stratify on the outcome and fix the seed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=target)

# Every column that is not listed as numerical is treated as categorical.
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(
    steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' keeps prediction from failing on categories that were
# not present in the training split.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# The forest is seeded (matching the split's random_state) so that the trained
# model, and therefore the benchmark below, is reproducible across runs.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [6], in <cell line: 1>()
----> 1 target = dataset["income"]
      3 # Split data into train and test
      4 datasetX = dataset.drop("income", axis=1)

NameError: name 'dataset' is not defined
[7]:
# Wrap the fitted scikit-learn pipeline so DiCE can query it; the backend argument
# names the framework the model was built with.
m = dice_ml.Model(model=model, backend="sklearn")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 m = dice_ml.Model(model=model, backend="sklearn")

NameError: name 'model' is not defined

Initialize counterfactual generation methods

We now initialize all three counterfactual generation methods.

[8]:
# Explainer that uses the random-sampling method.
exp_random = Dice(d, m, method="random")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [8], in <cell line: 1>()
----> 1 exp_random = Dice(d, m, method="random")

NameError: name 'd' is not defined
[9]:
# Explainer that uses the genetic-algorithm method.
exp_genetic = Dice(d, m, method="genetic")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [9], in <cell line: 1>()
----> 1 exp_genetic = Dice(d, m, method="genetic")

NameError: name 'd' is not defined
[10]:
# Explainer that uses the KD-tree querying method.
exp_KD = Dice(d, m, method="kdtree")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [10], in <cell line: 1>()
----> 1 exp_KD = Dice(d, m, method="kdtree")

NameError: name 'd' is not defined
[11]:
# Use training rows at positions 4-6 as the query instances. The explicit copy
# matters: the benchmark cell overwrites these columns with random values, and
# without it those writes would target a slice of x_train.
query_instances = x_train.iloc[4:7].copy()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [11], in <cell line: 1>()
----> 1 query_instances = x_train[4:7]

NameError: name 'x_train' is not defined
[12]:
# Display the selected query instances.
query_instances
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [12], in <cell line: 1>()
----> 1 query_instances

NameError: name 'query_instances' is not defined

Generate Counterfactuals

We now generate counterfactuals with desired_class=0 using all three methods and compare their runtimes. You can modify the number of loops (num_loops) and the number of diverse counterfactuals to generate (k).

[13]:
# Number of times the whole benchmark (all three methods) is repeated.
num_loops = 2
# Number of diverse counterfactuals to generate; passed to DiCE as total_CFs.
k = 2
[14]:
def _timed_call(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    start_time = timeit.default_timer()
    result = func(*args, **kwargs)
    return result, timeit.default_timer() - start_time


def _report_runtime(label, elapsed):
    """Print the accumulated runtime of one method as whole minutes and seconds."""
    minutes, seconds = divmod(elapsed, 60)
    print('For %s: Total time taken to generate %d' % (label, num_loops),
          'sets of %d' % k, 'counterfactuals each: %02d' % minutes, 'min %02d' % seconds, 'sec')


# Seed both generators used for sampling so the random query instances are repeatable.
random.seed(0)
np.random.seed(0)

# Accumulated wall-clock time per method, in seconds.
elapsed_random = 0
elapsed_kd = 0
elapsed_genetic = 0

# Work on an explicit copy: the loop below overwrites every column, and the query
# instances may have been created as a slice of x_train.
query_instances = query_instances.copy()
n_queries = len(query_instances)

for _ in range(num_loops):
    # Replace every feature of the query instances with freshly sampled values so
    # that each repetition benchmarks a different set of inputs.
    for q in list(query_instances.columns):
        if q in d.categorical_feature_names:
            # Draw one existing category per row. The previous code wrapped a single
            # sampled value in np.unique, producing a one-element array per row;
            # here a plain value is drawn from the distinct categories instead.
            categories = dataset[q].unique().tolist()
            query_instances[q] = [random.choice(categories) for _ in range(n_queries)]
        else:
            # Continuous feature: sample uniformly within the observed range.
            # Column assignment replaces the column outright, so the sampled floats
            # are not forced into an integer column.
            low, high = dataset[q].min(), dataset[q].max()
            query_instances[q] = np.random.uniform(low, high, size=n_queries)

    dice_exp_random, elapsed = _timed_call(
        exp_random.generate_counterfactuals, query_instances, total_CFs=k,
        desired_class=0, verbose=False)
    elapsed_random += elapsed

    dice_exp, elapsed = _timed_call(
        exp_genetic.generate_counterfactuals, query_instances, total_CFs=k,
        desired_class=0, yloss_type="hinge_loss", verbose=False)
    elapsed_genetic += elapsed

    dice_kd, elapsed = _timed_call(
        exp_KD.generate_counterfactuals, query_instances, total_CFs=k,
        desired_class=0, verbose=False)
    elapsed_kd += elapsed

_report_runtime('Independent random sampling of features', elapsed_random)
_report_runtime('querying from a KD tree', elapsed_kd)
_report_runtime('genetic algorithm', elapsed_genetic)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [14], in <cell line: 5>()
      3 elapsed_genetic = 0
      5 for _ in range(num_loops):
----> 6     for q in query_instances:
      7         if q in d.categorical_feature_names:
      8             query_instances.loc[:, q] = \
      9                 [np.unique(random.choice(dataset[q].values)) for _ in query_instances.index]

NameError: name 'query_instances' is not defined