In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from vflow import Vset, init_args, dict_to_df
from vflow.pipeline import build_graph
from functools import partial
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.utils
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.inspection import permutation_importance
In [5]:
# load data
data_dir = "./data/enhancer/"
X_train = np.asarray(pd.read_csv(data_dir + "01_X_train.csv", error_bad_lines=False).iloc[:, 1:])
X_test = np.asarray(pd.read_csv(data_dir + "02_X_test.csv", error_bad_lines=False).iloc[:, 1:])
y_train = np.asarray(pd.read_csv(data_dir + "03_y_train.csv", error_bad_lines=False).iloc[:, 1])
y_test = np.asarray(pd.read_csv(data_dir + "04_y_test.csv", error_bad_lines=False).iloc[:, 1])

# initialize data
np.random.seed(14)
X_train, X_test, y_train, y_test = init_args((X_train, X_test, y_train, y_test),
                                             names=['X_train', 'X_test', 'y_train', 'y_test'])

# subsample
subsampling_fns = [partial(sklearn.utils.resample, n_samples=1000, random_state=i) for i in range(3)]

subsampling_set = Vset(name='subsampling', modules=subsampling_fns)
X_trains, y_trains = subsampling_set(X_train, y_train)

modeling_set = Vset(name='modeling',
                         modules=[RandomForestClassifier(n_estimators=50, max_depth=5), MLPClassifier()],
                         module_keys=["RF", "MLP"])

# model
modeling_set.fit(X_trains, y_trains)
preds = modeling_set.predict(X_test)

# hard metrics
hard_metrics_set = Vset(name='hard_metrics', modules=[accuracy_score, balanced_accuracy_score],
                             module_keys=["Acc", "Bal_Acc"])
hard_metrics = hard_metrics_set.evaluate(preds, y_test)

# permutation importance
feature_importance_set = Vset(name='feature_importance', modules=[permutation_importance])
importances = feature_importance_set.evaluate(modeling_set.out, X_test, y_test)

G = build_graph(importances, draw=True)
plt.show()
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-747273350103> in <module>
     32 # permutation importance
     33 feature_importance_set = Vset(name='feature_importance', modules=[permutation_importance])
---> 34 importances = feature_importance_set.evaluate(modeling_set.out, X_test, y_test)
     35 
     36 G = build_graph(importances, draw=True)

~/Desktop/binyugroup/veridical-flow/vflow/vset.py in evaluate(self, *args, **kwargs)
    180         '''Combines dicts before calling _apply_func
    181         '''
--> 182         return self._apply_func(None, *args)
    183 
    184     def __call__(self, *args, n_out: int = None, keys: list = [], **kwargs):

~/Desktop/binyugroup/veridical-flow/vflow/vset.py in _apply_func(self, out_dict, *args)
     92         apply_func_cached = self._memory.cache(_apply_func_cached)
     93         data_dict, out_dict = apply_func_cached(
---> 94             out_dict, self._async, self._lazy, *args
     95         )
     96         if PREV_KEY in data_dict:

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/Desktop/binyugroup/veridical-flow/vflow/vset.py in _apply_func_cached(out_dict, is_async, lazy, *args)
    250         args = async_args
    251     data_dict = combine_dicts(*args)
--> 252     out_dict = apply_modules(out_dict, data_dict, lazy)
    253 
    254     if is_async:

~/Desktop/binyugroup/veridical-flow/vflow/convert.py in apply_modules(modules, data_dict, lazy)
    270                         if isinstance(data, VfuncPromise):
    271                             data_list[i] = data()
--> 272                     out_dict[combined_key] = func(*data_list)
    273 
    274     return out_dict

~/Desktop/binyugroup/veridical-flow/vflow/vfunc.py in __call__(self, *args, **kwargs)
     35         '''This should decide what to call
     36         '''
---> 37         return self.fit(*args, **kwargs)
     38 
     39 @ray.remote

~/Desktop/binyugroup/veridical-flow/vflow/vfunc.py in fit(self, *args, **kwargs)
     24             return self.module.fit(*args, **kwargs)
     25         else:
---> 26             return self.module(*args, **kwargs)
     27 
     28     @abstractmethod

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/inspection/_permutation_importance.py in permutation_importance(estimator, X, y, scoring, n_repeats, n_jobs, random_state, sample_weight)
    149     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
    150         estimator, X, y, sample_weight, col_idx, random_seed, n_repeats, scorer
--> 151     ) for col_idx in range(X.shape[1]))
    152 
    153     importances = baseline_score - np.array(scores)

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/inspection/_permutation_importance.py in _calculate_permutation_scores(estimator, X, y, sample_weight, col_idx, random_state, n_repeats, scorer)
     40             X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
     41         feature_score = _weights_scorer(
---> 42             scorer, estimator, X_permuted, y, sample_weight
     43         )
     44         scores[n_round] = feature_score

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/inspection/_permutation_importance.py in _weights_scorer(scorer, estimator, X, y, sample_weight)
     14     if sample_weight is not None:
     15         return scorer(estimator, X, y, sample_weight)
---> 16     return scorer(estimator, X, y)
     17 
     18 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/metrics/_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
    395 def _passthrough_scorer(estimator, *args, **kwargs):
    396     """Function that wraps estimator.score"""
--> 397     return estimator.score(*args, **kwargs)
    398 
    399 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/base.py in score(self, X, y, sample_weight)
    498         """
    499         from .metrics import accuracy_score
--> 500         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    501 
    502     def _more_tags(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in predict(self, X)
    628             The predicted classes.
    629         """
--> 630         proba = self.predict_proba(X)
    631 
    632         if self.n_outputs_ == 1:

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X)
    685             delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
    686                                             lock)
--> 687             for e in self.estimators_)
    688 
    689         for proba in all_proba:

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _accumulate_prediction(predict, X, out, lock)
    465     complains that it cannot pickle it when placed there.
    466     """
--> 467     prediction = predict(X, check_input=False)
    468     with lock:
    469         if len(out) == 1:

~/anaconda3/envs/vflow/lib/python3.6/site-packages/sklearn/tree/_classes.py in predict_proba(self, X, check_input)
    934         check_is_fitted(self)
    935         X = self._validate_X_predict(X, check_input)
--> 936         proba = self.tree_.predict(X)
    937 
    938         if self.n_outputs_ == 1:

KeyboardInterrupt: 
In [11]:
df = dict_to_df(hard_metrics)
df
Out[11]:
init-subsampling init-subsampling subsampling init-modeling modeling init-hard_metrics hard_metrics out
0 X_test X_train subsampling_0 y_train RF y_test Acc 0.926097
1 X_test X_train subsampling_1 y_train RF y_test Acc 0.924557
2 X_test X_train subsampling_2 y_train RF y_test Acc 0.924044
3 X_test X_train subsampling_0 y_train MLP y_test Acc 0.907365
4 X_test X_train subsampling_1 y_train MLP y_test Acc 0.907878
5 X_test X_train subsampling_2 y_train MLP y_test Acc 0.901463
6 X_test X_train subsampling_0 y_train RF y_test Bal_Acc 0.826515
7 X_test X_train subsampling_1 y_train RF y_test Bal_Acc 0.794172
8 X_test X_train subsampling_2 y_train RF y_test Bal_Acc 0.807060
9 X_test X_train subsampling_0 y_train MLP y_test Bal_Acc 0.700783
10 X_test X_train subsampling_1 y_train MLP y_test Bal_Acc 0.705016
11 X_test X_train subsampling_2 y_train MLP y_test Bal_Acc 0.683108