Module vflow.vset
Set of modules to be parallelized over in a pipeline. Function arguments are each a list
Expand source code
"""Set of modules to be parallelized over in a pipeline.
Function arguments are each a list
"""
PREV_KEY = '__prev__'
FILTER_PREV_KEY = '__filter_prev__'
import joblib
import numpy as np
import ray
from mlflow.tracking import MlflowClient
from vflow.convert import *
from vflow.subkey import Subkey
from vflow.vfunc import Vfunc, AsyncModule, VfuncPromise, _remote_fun
class Vset:

    def __init__(self, name: str, modules, module_keys: list = None,
                 is_async: bool = False, output_matching: bool = False,
                 lazy: bool = False, cache_dir: str = None,
                 tracking_dir: str = None):
        """Create a named set of modules to be applied over pipeline data.

        Parameters
        ----------
        name: str
            Name of this Vset.
        modules: list or dict
            List or dictionary of functions (modules) to associate with this
            Vset. A dict is stored as given; the first element of each of its
            keys is passed as the first argument to the `Vfunc` /
            `AsyncModule` wrapper. A list is keyed using `module_keys`.
        module_keys: list (optional)
            List of names corresponding to each module. Only used when
            `modules` is a list; defaults to `'{name}_{i}'`.
        is_async: bool (optional)
            If True, `modules` are computed asynchronously. Also switched on
            automatically when any of the modules is an `AsyncModule`.
        output_matching: bool (optional)
            If True, then output keys from this Vset will be matched when used
            in other Vsets
        lazy: bool (optional)
            If True, then modules are evaluated lazily, i.e. outputs are
            `vflow.vfunc.VfuncPromise`
        cache_dir: str (optional)
            If provided, do caching and use `cache_dir` as the data store for
            `joblib.Memory`.
        tracking_dir: str (optional)
            If provided, use the `mlflow.tracking` api to log outputs as
            metrics with params determined by input keys.

        Raises
        ------
        TypeError
            If `modules` is neither a list nor a dict.

        .. todo:: include prev and next and change functions to include that.
        """
        self.name = name
        self._fitted = False
        self.out = None  # outputs
        self._async = is_async
        self._output_matching = output_matching
        self._lazy = lazy
        self._cache_dir = cache_dir
        self._tracking_dir = tracking_dir
        self._memory = joblib.Memory(self._cache_dir)

        if self._tracking_dir is not None:
            self._mlflow = MlflowClient(tracking_uri=self._tracking_dir)
            experiment = self._mlflow.get_experiment_by_name(name=self.name)
            if experiment is None:
                self._exp_id = self._mlflow.create_experiment(name=self.name)
            else:
                self._exp_id = experiment.experiment_id
        else:
            self._mlflow = None

        # Collect the module objects themselves. Iterating a dict directly
        # would yield its keys, so AsyncModule values would go undetected.
        if isinstance(modules, dict):
            module_objs = list(modules.values())
        elif isinstance(modules, list):
            module_objs = modules
        else:
            raise TypeError(
                f'modules must be a list or dict, got {type(modules).__name__}'
            )

        # check if any of the modules are AsyncModules
        # if so, we'll make them all AsyncModules later on
        if not self._async and any(isinstance(mod, AsyncModule) for mod in module_objs):
            self._async = True

        if isinstance(modules, dict):
            self.modules = modules
        else:
            if module_keys is not None:
                assert type(module_keys) is list, 'modules passed as list but module_names is not a list'
                assert len(modules) == len(
                    module_keys), 'modules list and module_names list do not have the same length'
                # TODO: how best to handle tuple subkeys?
                module_keys = [(self.__create_subkey(k),) for k in module_keys]
            else:
                module_keys = [(self.__create_subkey(f'{name}_{i}'),) for i in range(len(modules))]
            # module keys are singleton tuples of Subkeys
            self.modules = dict(zip(module_keys, modules))

        # if needed, wrap the modules in the Vfunc or AsyncModule class
        # (iterate over a snapshot since values are replaced in place)
        for k, v in list(self.modules.items()):
            if self._async:
                if not isinstance(v, AsyncModule):
                    self.modules[k] = AsyncModule(k[0], v)
            elif not isinstance(v, Vfunc):
                self.modules[k] = Vfunc(k[0], v)

    def _apply_func(self, out_dict: dict = None, *args):
        """Apply functions in out_dict to combined args dict

        Optionally logs output Subkeys and values as params and metrics using
        `mlflow.tracking` if this Vset has a `_tracking_dir`.

        Parameters
        ----------
        *args: dict
            Takes multiple dicts and combines them into one.
            Then runs modules on each item in combined dict.
        out_dict: dict (optional), default None
            The dictionary to pass to the matching function. If None, defaults
            to a deep copy of self.modules.

        Returns
        -------
        out_dict: dict
            Dictionary with items being determined by functions in module set.
            Functions and input dictionaries are currently matched using a
            cartesian matching format. The entry under `PREV_KEY` holds this
            Vset followed by the `PREV_KEY` entries of the inputs.

        Examples
        --------
        >>> modules, data = {LR : logistic}, {train_1 : [X1,y1], train2 : [X2,y2]}
        {(train_1, LR) : fitted logistic, (train_2, LR) : fitted logistic}
        """
        if out_dict is None:
            out_dict = deepcopy(self.modules)

        # route the computation through joblib so results can be cached
        apply_func_cached = self._memory.cache(_apply_func_cached)
        out_dict = apply_func_cached(out_dict, self._async, self._lazy, *args)

        # record provenance: this Vset plus whatever produced each input
        prev = tuple()
        for arg in args:
            if PREV_KEY in arg:
                prev += (arg[PREV_KEY],)
        out_dict[PREV_KEY] = (self,) + prev

        if self._mlflow is not None:
            self._log_to_mlflow(out_dict)
        return out_dict

    def _log_to_mlflow(self, out_dict: dict):
        """Log the Subkeys of `out_dict` as params and its values as metrics.

        One mlflow run is used per distinct combination of input Subkey values
        (all Subkeys of an output key except the last one).
        """
        run_dict = {}
        # log subkeys as params and value as metric
        for k, v in out_dict.items():
            if k == PREV_KEY:
                continue
            origins = np.array([subk.origin for subk in k])
            # ignore init origins and the last origin (this Vset)
            param_idx = [
                i for i in range(len(k[:-1])) if origins[i] != 'init'
            ]
            # get or create mlflow run
            run_dict_key = tuple([subk.value for subk in k[:-1]])
            if run_dict_key in run_dict:
                run_id = run_dict[run_dict_key]
            else:
                run = self._mlflow.create_run(self._exp_id)
                run_id = run.info.run_id
                run_dict[run_dict_key] = run_id
                # log params once per run: every output sharing this run has
                # the same input Subkeys, hence the same params
                for idx in param_idx:
                    subkey = k[idx]
                    param_name = subkey.origin
                    # check if the origin occurs multiple times; if so,
                    # suffix the name with its occurrence index
                    if np.sum(origins == param_name) > 1:
                        occurrence = np.sum(origins[:idx] == param_name)
                        param_name = param_name + str(occurrence)
                    self._mlflow.log_param(
                        run_id, param_name, subkey.value
                    )
            self._mlflow.log_metric(run_id, k[-1].value, v)

    def fit(self, *args):
        """Fits to args using `_apply_func`

        Stores the result in `self.out`, records the inputs' provenance on
        this Vset, marks it as fitted and returns `self`.
        """
        out_dict = {k: v.fit for k, v in self.modules.items()}
        self.out = self._apply_func(out_dict, *args)
        # drop the leading entry (this Vset itself)
        prev = self.out[PREV_KEY][1:]
        if hasattr(self, FILTER_PREV_KEY):
            prev = getattr(self, FILTER_PREV_KEY) + prev
        setattr(self, PREV_KEY, prev)
        self._fitted = True
        return self

    def fit_transform(self, *args):
        """Fits to args and transforms only the first arg.
        """
        return self.fit(*args).transform(args[0])

    def transform(self, *args):
        """Transforms args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the transform method.')
        # only fitted outputs exposing `transform` take part
        out_dict = {
            k: v.transform for k, v in self.out.items() if hasattr(v, 'transform')
        }
        return self._apply_func(out_dict, *args)

    def predict(self, *args):
        """Predicts args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the predict method.')
        # only fitted outputs exposing `predict` take part
        pred_dict = {
            k: v.predict for k, v in self.out.items() if hasattr(v, 'predict')
        }
        return self._apply_func(pred_dict, *args)

    def predict_proba(self, *args):
        """Calls predict_proba on args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the predict_proba method.')
        # only fitted outputs exposing `predict_proba` take part
        pred_dict = {
            k: v.predict_proba for k, v in self.out.items() if hasattr(v, 'predict_proba')
        }
        return self._apply_func(pred_dict, *args)

    def evaluate(self, *args):
        """Combines dicts before calling `_apply_func`
        """
        return self._apply_func(None, *args)

    def __call__(self, *args, n_out: int = None, keys=None, **kwargs):
        """Call args using `_apply_func`, optionally separating
        output dictionary into `n_out` dictionaries with `keys`

        `n_out` defaults to the number of positional args. When it is 1 the
        single output dictionary is returned directly.
        """
        if keys is None:
            keys = []
        if n_out is None:
            n_out = len(args)
        out_dict = self._apply_func(None, *args)
        if n_out == 1:
            return out_dict
        out_dicts = sep_dicts(out_dict, n_out=n_out, keys=keys)
        # add back prev
        prev = out_dict[PREV_KEY]
        for i in range(n_out):
            if n_out == len(args):
                # one output per input: pair this Vset with the i-th input's prev
                out_dicts[i][PREV_KEY] = (prev[0],) + (prev[i + 1],)
            else:
                out_dicts[i][PREV_KEY] = prev
        return out_dicts

    def __getitem__(self, i):
        """Accesses ith item in the module set
        """
        return self.modules[i]

    def __contains__(self, key):
        """Returns true if modules is a dict and key is one of its keys
        """
        if isinstance(self.modules, dict):
            return key in self.modules.keys()
        return False

    def keys(self):
        """Returns Vset module keys
        """
        if isinstance(self.modules, dict):
            return self.modules.keys()
        return {}.keys()

    def __len__(self):
        return len(self.modules)

    def __str__(self):
        return 'Vset(' + self.name + ')'

    def __create_subkey(self, value):
        """Helper function to construct `Subkey` with
        this Vset determining origin and output_matching
        """
        return Subkey(value, self.name, self._output_matching)
def _apply_func_cached(out_dict: dict, is_async: bool, lazy: bool, *args):
"""
Params
------
*args: dict
Takes multiple dicts and combines them into one.
Then runs modules on each item in combined dict.
out_dict: dict
The dictionary to pass to the matching function.
is_async: bool
If True, outputs are computed asynchronously.
lazy: bool
If True, outputs are evaluated lazily, i.e. outputs are `VfuncPromise`.
Returns
-------
out_dict: dict
Dictionary with items being determined by functions in module set.
Functions and input dictionaries are currently matched using cartesian matching format.
"""
for in_dict in args:
if not isinstance(in_dict, dict):
raise Exception('Need to run init_args before calling module_set!')
data_dict = combine_dicts(*args)
out_dict = apply_modules(out_dict, data_dict, lazy)
if is_async and not lazy:
out_keys = list(out_dict.keys())
out_vals = ray.get(list(out_dict.values()))
out_dict = dict(zip(out_keys, out_vals))
return out_dict
Classes
class Vset (name: str, modules, module_keys: list = None, is_async: bool = False, output_matching: bool = False, lazy: bool = False, cache_dir: str = None, tracking_dir: str = None)
-
Parameters
name
:str
- Name of this Vset.
modules
:list or dict
- List or dictionary of functions (modules) that we want to associate with this Vset
module_keys
:list (optional)
- List of names corresponding to each module
is_async
:bool (optional)
- If True,
modules
are computed asynchronously
output_matching
:bool (optional)
- If True, then output keys from this Vset will be matched when used in other Vsets
lazy
:bool (optional)
- If True, then modules are evaluated lazily, i.e. outputs are
vflow.vfunc.VfuncPromise
cache_dir
:str (optional)
- If provided, do caching and use
cache_dir
as the data store for
joblib.Memory.
tracking_dir
:str (optional)
- If provided, use the
mlflow.tracking
api to log outputs as metrics with params determined by input keys.
TODO
include prev and next and change functions to include that.
Expand source code
class Vset:

    def __init__(self, name: str, modules, module_keys: list = None,
                 is_async: bool = False, output_matching: bool = False,
                 lazy: bool = False, cache_dir: str = None,
                 tracking_dir: str = None):
        """Create a named set of modules to be applied over pipeline data.

        Parameters
        ----------
        name: str
            Name of this Vset.
        modules: list or dict
            List or dictionary of functions (modules) to associate with this
            Vset. A dict is stored as given; the first element of each of its
            keys is passed as the first argument to the `Vfunc` /
            `AsyncModule` wrapper. A list is keyed using `module_keys`.
        module_keys: list (optional)
            List of names corresponding to each module. Only used when
            `modules` is a list; defaults to `'{name}_{i}'`.
        is_async: bool (optional)
            If True, `modules` are computed asynchronously. Also switched on
            automatically when any of the modules is an `AsyncModule`.
        output_matching: bool (optional)
            If True, then output keys from this Vset will be matched when used
            in other Vsets
        lazy: bool (optional)
            If True, then modules are evaluated lazily, i.e. outputs are
            `vflow.vfunc.VfuncPromise`
        cache_dir: str (optional)
            If provided, do caching and use `cache_dir` as the data store for
            `joblib.Memory`.
        tracking_dir: str (optional)
            If provided, use the `mlflow.tracking` api to log outputs as
            metrics with params determined by input keys.

        Raises
        ------
        TypeError
            If `modules` is neither a list nor a dict.

        .. todo:: include prev and next and change functions to include that.
        """
        self.name = name
        self._fitted = False
        self.out = None  # outputs
        self._async = is_async
        self._output_matching = output_matching
        self._lazy = lazy
        self._cache_dir = cache_dir
        self._tracking_dir = tracking_dir
        self._memory = joblib.Memory(self._cache_dir)

        if self._tracking_dir is not None:
            self._mlflow = MlflowClient(tracking_uri=self._tracking_dir)
            experiment = self._mlflow.get_experiment_by_name(name=self.name)
            if experiment is None:
                self._exp_id = self._mlflow.create_experiment(name=self.name)
            else:
                self._exp_id = experiment.experiment_id
        else:
            self._mlflow = None

        # Collect the module objects themselves. Iterating a dict directly
        # would yield its keys, so AsyncModule values would go undetected.
        if isinstance(modules, dict):
            module_objs = list(modules.values())
        elif isinstance(modules, list):
            module_objs = modules
        else:
            raise TypeError(
                f'modules must be a list or dict, got {type(modules).__name__}'
            )

        # check if any of the modules are AsyncModules
        # if so, we'll make them all AsyncModules later on
        if not self._async and any(isinstance(mod, AsyncModule) for mod in module_objs):
            self._async = True

        if isinstance(modules, dict):
            self.modules = modules
        else:
            if module_keys is not None:
                assert type(module_keys) is list, 'modules passed as list but module_names is not a list'
                assert len(modules) == len(
                    module_keys), 'modules list and module_names list do not have the same length'
                # TODO: how best to handle tuple subkeys?
                module_keys = [(self.__create_subkey(k),) for k in module_keys]
            else:
                module_keys = [(self.__create_subkey(f'{name}_{i}'),) for i in range(len(modules))]
            # module keys are singleton tuples of Subkeys
            self.modules = dict(zip(module_keys, modules))

        # if needed, wrap the modules in the Vfunc or AsyncModule class
        # (iterate over a snapshot since values are replaced in place)
        for k, v in list(self.modules.items()):
            if self._async:
                if not isinstance(v, AsyncModule):
                    self.modules[k] = AsyncModule(k[0], v)
            elif not isinstance(v, Vfunc):
                self.modules[k] = Vfunc(k[0], v)

    def _apply_func(self, out_dict: dict = None, *args):
        """Apply functions in out_dict to combined args dict

        Optionally logs output Subkeys and values as params and metrics using
        `mlflow.tracking` if this Vset has a `_tracking_dir`.

        Parameters
        ----------
        *args: dict
            Takes multiple dicts and combines them into one.
            Then runs modules on each item in combined dict.
        out_dict: dict (optional), default None
            The dictionary to pass to the matching function. If None, defaults
            to a deep copy of self.modules.

        Returns
        -------
        out_dict: dict
            Dictionary with items being determined by functions in module set.
            Functions and input dictionaries are currently matched using a
            cartesian matching format. The entry under `PREV_KEY` holds this
            Vset followed by the `PREV_KEY` entries of the inputs.

        Examples
        --------
        >>> modules, data = {LR : logistic}, {train_1 : [X1,y1], train2 : [X2,y2]}
        {(train_1, LR) : fitted logistic, (train_2, LR) : fitted logistic}
        """
        if out_dict is None:
            out_dict = deepcopy(self.modules)

        # route the computation through joblib so results can be cached
        apply_func_cached = self._memory.cache(_apply_func_cached)
        out_dict = apply_func_cached(out_dict, self._async, self._lazy, *args)

        # record provenance: this Vset plus whatever produced each input
        prev = tuple()
        for arg in args:
            if PREV_KEY in arg:
                prev += (arg[PREV_KEY],)
        out_dict[PREV_KEY] = (self,) + prev

        if self._mlflow is not None:
            self._log_to_mlflow(out_dict)
        return out_dict

    def _log_to_mlflow(self, out_dict: dict):
        """Log the Subkeys of `out_dict` as params and its values as metrics.

        One mlflow run is used per distinct combination of input Subkey values
        (all Subkeys of an output key except the last one).
        """
        run_dict = {}
        # log subkeys as params and value as metric
        for k, v in out_dict.items():
            if k == PREV_KEY:
                continue
            origins = np.array([subk.origin for subk in k])
            # ignore init origins and the last origin (this Vset)
            param_idx = [
                i for i in range(len(k[:-1])) if origins[i] != 'init'
            ]
            # get or create mlflow run
            run_dict_key = tuple([subk.value for subk in k[:-1]])
            if run_dict_key in run_dict:
                run_id = run_dict[run_dict_key]
            else:
                run = self._mlflow.create_run(self._exp_id)
                run_id = run.info.run_id
                run_dict[run_dict_key] = run_id
                # log params once per run: every output sharing this run has
                # the same input Subkeys, hence the same params
                for idx in param_idx:
                    subkey = k[idx]
                    param_name = subkey.origin
                    # check if the origin occurs multiple times; if so,
                    # suffix the name with its occurrence index
                    if np.sum(origins == param_name) > 1:
                        occurrence = np.sum(origins[:idx] == param_name)
                        param_name = param_name + str(occurrence)
                    self._mlflow.log_param(
                        run_id, param_name, subkey.value
                    )
            self._mlflow.log_metric(run_id, k[-1].value, v)

    def fit(self, *args):
        """Fits to args using `_apply_func`

        Stores the result in `self.out`, records the inputs' provenance on
        this Vset, marks it as fitted and returns `self`.
        """
        out_dict = {k: v.fit for k, v in self.modules.items()}
        self.out = self._apply_func(out_dict, *args)
        # drop the leading entry (this Vset itself)
        prev = self.out[PREV_KEY][1:]
        if hasattr(self, FILTER_PREV_KEY):
            prev = getattr(self, FILTER_PREV_KEY) + prev
        setattr(self, PREV_KEY, prev)
        self._fitted = True
        return self

    def fit_transform(self, *args):
        """Fits to args and transforms only the first arg.
        """
        return self.fit(*args).transform(args[0])

    def transform(self, *args):
        """Transforms args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the transform method.')
        # only fitted outputs exposing `transform` take part
        out_dict = {
            k: v.transform for k, v in self.out.items() if hasattr(v, 'transform')
        }
        return self._apply_func(out_dict, *args)

    def predict(self, *args):
        """Predicts args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the predict method.')
        # only fitted outputs exposing `predict` take part
        pred_dict = {
            k: v.predict for k, v in self.out.items() if hasattr(v, 'predict')
        }
        return self._apply_func(pred_dict, *args)

    def predict_proba(self, *args):
        """Calls predict_proba on args using `_apply_func`

        Raises
        ------
        AttributeError
            If this Vset has not been fitted.
        """
        if not self._fitted:
            raise AttributeError('Please fit the Vset object before calling the predict_proba method.')
        # only fitted outputs exposing `predict_proba` take part
        pred_dict = {
            k: v.predict_proba for k, v in self.out.items() if hasattr(v, 'predict_proba')
        }
        return self._apply_func(pred_dict, *args)

    def evaluate(self, *args):
        """Combines dicts before calling `_apply_func`
        """
        return self._apply_func(None, *args)

    def __call__(self, *args, n_out: int = None, keys=None, **kwargs):
        """Call args using `_apply_func`, optionally separating
        output dictionary into `n_out` dictionaries with `keys`

        `n_out` defaults to the number of positional args. When it is 1 the
        single output dictionary is returned directly.
        """
        if keys is None:
            keys = []
        if n_out is None:
            n_out = len(args)
        out_dict = self._apply_func(None, *args)
        if n_out == 1:
            return out_dict
        out_dicts = sep_dicts(out_dict, n_out=n_out, keys=keys)
        # add back prev
        prev = out_dict[PREV_KEY]
        for i in range(n_out):
            if n_out == len(args):
                # one output per input: pair this Vset with the i-th input's prev
                out_dicts[i][PREV_KEY] = (prev[0],) + (prev[i + 1],)
            else:
                out_dicts[i][PREV_KEY] = prev
        return out_dicts

    def __getitem__(self, i):
        """Accesses ith item in the module set
        """
        return self.modules[i]

    def __contains__(self, key):
        """Returns true if modules is a dict and key is one of its keys
        """
        if isinstance(self.modules, dict):
            return key in self.modules.keys()
        return False

    def keys(self):
        """Returns Vset module keys
        """
        if isinstance(self.modules, dict):
            return self.modules.keys()
        return {}.keys()

    def __len__(self):
        return len(self.modules)

    def __str__(self):
        return 'Vset(' + self.name + ')'

    def __create_subkey(self, value):
        """Helper function to construct `Subkey` with
        this Vset determining origin and output_matching
        """
        return Subkey(value, self.name, self._output_matching)
Methods
def evaluate(self, *args)
-
Combines dicts before calling
_apply_func
Expand source code
def evaluate(self, *args):
    """Run this Vset's own modules on the combined input dicts.

    Delegates to `_apply_func` with no explicit function dictionary, and
    returns whatever `_apply_func` returns.
    """
    result = self._apply_func(None, *args)
    return result
def fit(self, *args)
-
Fits to args using
_apply_func
Expand source code
def fit(self, *args):
    """Fit every module to `args` via `_apply_func`.

    The result is stored in `self.out`. The provenance of the inputs (the
    `PREV_KEY` entry of the output without its first element, prefixed by the
    `FILTER_PREV_KEY` attribute when this Vset has one) is stored on the
    instance under `PREV_KEY`. Marks the Vset as fitted and returns `self`.
    """
    fit_funcs = {key: module.fit for key, module in self.modules.items()}
    self.out = self._apply_func(fit_funcs, *args)

    # skip the first entry of the recorded chain
    upstream = self.out[PREV_KEY][1:]
    if hasattr(self, FILTER_PREV_KEY):
        upstream = getattr(self, FILTER_PREV_KEY) + upstream
    setattr(self, PREV_KEY, upstream)

    self._fitted = True
    return self
def fit_transform(self, *args)
-
Fits to args and transforms only the first arg.
Expand source code
def fit_transform(self, *args):
    """Fit to all of `args`, then transform only the first one.

    Returns the result of `transform` on `args[0]`.
    """
    fitted = self.fit(*args)
    return fitted.transform(args[0])
def keys(self)
-
Returns Vset module keys
Expand source code
def keys(self):
    """Return a view of the module keys of this Vset.

    Yields an empty key view when `self.modules` is not a dict.
    """
    modules = self.modules
    if not isinstance(modules, dict):
        return {}.keys()
    return modules.keys()
def predict(self, *args)
-
Predicts args using
_apply_func
Expand source code
def predict(self, *args):
    """Call `predict` of each fitted output on `args` via `_apply_func`.

    Entries of `self.out` without a `predict` attribute are skipped.
    Raises AttributeError when the Vset has not been fitted.
    """
    if not self._fitted:
        raise AttributeError('Please fit the Vset object before calling the predict method.')
    predictors = {
        key: fitted.predict
        for key, fitted in self.out.items()
        if hasattr(fitted, 'predict')
    }
    return self._apply_func(predictors, *args)
def predict_proba(self, *args)
-
Calls predict_proba on args using
_apply_func
Expand source code
def predict_proba(self, *args):
    """Call `predict_proba` of each fitted output on `args` via `_apply_func`.

    Entries of `self.out` without a `predict_proba` attribute are skipped.
    Raises AttributeError when the Vset has not been fitted.
    """
    if not self._fitted:
        raise AttributeError('Please fit the Vset object before calling the predict_proba method.')
    predictors = {
        key: fitted.predict_proba
        for key, fitted in self.out.items()
        if hasattr(fitted, 'predict_proba')
    }
    return self._apply_func(predictors, *args)
def transform(self, *args)
-
Transforms args using
_apply_func
Expand source code
def transform(self, *args):
    """Call `transform` of each fitted output on `args` via `_apply_func`.

    Entries of `self.out` without a `transform` attribute are skipped.
    Raises AttributeError when the Vset has not been fitted.
    """
    if not self._fitted:
        raise AttributeError('Please fit the Vset object before calling the transform method.')
    transformers = {
        key: fitted.transform
        for key, fitted in self.out.items()
        if hasattr(fitted, 'transform')
    }
    return self._apply_func(transformers, *args)