Module vflow.helpers

User-facing helper functions included at import vflow

Expand source code
"""User-facing helper functions included at import vflow
"""
from functools import partial
from itertools import product
from typing import Union

from vflow.convert import dict_to_df
from vflow.vfunc import Vfunc
from vflow.vset import Vset, PREV_KEY, FILTER_PREV_KEY


def build_vset(name: str, obj, param_dict=None, *args, reps: int = 1,
               is_async: bool = False, output_matching: bool = False,
               lazy: bool = False, cache_dir: str = None, verbose: bool = True,
               tracking_dir: str = None, **kwargs) -> Vset:
    """Builds a Vset by currying callable obj with all combinations of parameters in param_dict.

    Parameters
    ----------
    name: str
        a name for the output Vset
    obj: callable
        a callable to use as the base for Vfuncs in the output Vset
    param_dict: dict[str, list]
        keys are obj kwarg names and values in the dict are lists of params to try
    *args
        additional fixed arguments to pass to obj
    reps: int (optional)
        the number of times to repeat the obj in the output Vset's modules for
        each combination of params in param_dict
    is_async: bool (optional)
        if True, modules are computed asynchronously
    output_matching: bool (optional)
        if True, then output keys from Vset will be matched when used
        in other Vsets
    cache_dir: str (optional)
        if provided, do caching and use cache_dir as the data store for
        joblib.Memory
    verbose : bool (optional)
        if True, modules are named with param_dict items as tuples of str("param_name=param_val")
    tracking_dir: str (optional)
        if provided, use the mlflow.tracking api to log outputs as metrics
        with params determined by input keys
    **kwargs
        additional fixed keyword arguments to pass to obj

    Returns
    -------
    new_vset : Vset
    """
    if param_dict is None:
        param_dict = {}
    assert callable(obj), 'obj must be callable'

    vfuncs = []
    vkeys = []

    # TODO: better way to check this?
    # check if obj is a class
    instantiate = isinstance(obj, type)

    param_names = list(param_dict.keys())
    param_lists = list(param_dict.values())
    kwargs_tuples = product(*param_lists)
    for tup in kwargs_tuples:
        kwargs_dict = {}
        vkey_tup = ()
        for param_name, param_val in zip(param_names, tup):
            kwargs_dict[param_name] = param_val
            vkey_tup += (f'{param_name}={param_val}', )
        # add additional fixed kwargs to kwargs_dict
        for k, v in kwargs.items():
            kwargs_dict[k] = v
        for i in range(reps):
            # add module key to vkeys
            if reps > 1:
                vkeys.append((f'rep={i}', ) + vkey_tup)
            else:
                vkeys.append(vkey_tup)
            if instantiate:
                # instantiate obj
                vfuncs.append(Vfunc(module=obj(*args, **kwargs_dict), name=str(vkey_tup)))
            else:
                # use partial to wrap obj
                vfuncs.append(Vfunc(module=partial(obj, *args, **kwargs_dict), name=str(vkey_tup)))
    if not verbose or (len(param_dict) == 0 and reps == 1):
        vkeys = None
    return Vset(name, vfuncs, is_async=is_async, module_keys=vkeys,
                output_matching=output_matching, lazy=lazy,
                cache_dir=cache_dir, tracking_dir=tracking_dir)


def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1,
                          bigger_is_better: bool = True, filter_on=None,
                          group: bool = False) -> Union[Vset, list]:
    """Returns a new Vset by filtering `vset.modules` based on values in filter_dict.

    Parameters
    ----------
    metric_dict: dict
        output from a Vset, typically with metrics or other numeric values to use when
        filtering `vset.modules`
    vset: Vset
        a Vsets
    *vsets: Vset
        zero or more additional Vsets
    n_keep: int (optional)
        number of entries to keep from `vset.modules`
    bigger_is_better: bool (optional)
        if True, then the top `n_keep` largest values are retained
    filter_on: list[str] (optional)
        if there are multiple metrics in `metric_dict`, you can specify a subset
        to consider
    group: bool (optional)
        if True, average metrics after grouping values in `metric_dict` by the
        input Vset names

    Returns
    -------
    *new_vset : Vset
        Copies of the input Vsets but with Vfuncs filtered based on metrics
    """
    if filter_on is None:
        filter_on = []
    df = dict_to_df(metric_dict)
    vsets = [vset, *vsets]
    vset_names = []
    for vset in vsets:
        if vset.name not in df.columns:
            raise ValueError(f'{vset.name} should be one of the columns of dict_to_df(metric_dict)')
        vset_names.append(vset.name)
    if len(filter_on) > 0:
        filter_col = list(metric_dict.keys())[0][-1].origin
        df = df[df[filter_col].isin(filter_on)]
    if group:
        df = df.groupby(by=vset_names, as_index=False).mean()
    if bigger_is_better:
        df = df.sort_values(by='out', ascending=False)
    else:
        df = df.sort_values(by='out')
    df = df.iloc[0:n_keep]
    for i, vset in enumerate(vsets):
        vfuncs = vset.modules
        vfunc_filter = [str(name) for name in df[vset.name].to_numpy()]
        new_vfuncs = {k: v for k, v in vfuncs.items() if str(v.name) in vfunc_filter}
        new_vset = Vset('filtered_' + vset.name, new_vfuncs, is_async=vset._async,
                        output_matching=vset._output_matching, lazy=vset._lazy,
                        cache_dir=vset._cache_dir, tracking_dir=vset._tracking_dir)
        setattr(new_vset, FILTER_PREV_KEY, (metric_dict[PREV_KEY], vset,))
        setattr(new_vset, PREV_KEY, getattr(new_vset, FILTER_PREV_KEY))
        vsets[i] = new_vset
    if len(vsets) == 1:
        return vsets[0]
    else:
        return vsets

Functions

def build_vset(name: str, obj, param_dict=None, *args, reps: int = 1, is_async: bool = False, output_matching: bool = False, lazy: bool = False, cache_dir: str = None, verbose: bool = True, tracking_dir: str = None, **kwargs) ‑> Vset

Builds a Vset by currying callable obj with all combinations of parameters in param_dict.

Parameters

name : str
a name for the output Vset
obj : callable
a callable to use as the base for Vfuncs in the output Vset
param_dict : dict[str, list]
keys are obj kwarg names and values in the dict are lists of params to try
*args
additional fixed arguments to pass to obj
reps : int (optional)
the number of times to repeat the obj in the output Vset's modules for each combination of params in param_dict
is_async : bool (optional)
if True, modules are computed asynchronously
output_matching : bool (optional)
if True, then output keys from Vset will be matched when used in other Vsets
cache_dir : str (optional)
if provided, do caching and use cache_dir as the data store for joblib.Memory
verbose : bool (optional)
if True, modules are named with param_dict items as tuples of str("param_name=param_val")
tracking_dir : str (optional)
if provided, use the mlflow.tracking api to log outputs as metrics with params determined by input keys
**kwargs
additional fixed keyword arguments to pass to obj

Returns

new_vset : Vset
 
Expand source code
def build_vset(name: str, obj, param_dict=None, *args, reps: int = 1,
               is_async: bool = False, output_matching: bool = False,
               lazy: bool = False, cache_dir: str = None, verbose: bool = True,
               tracking_dir: str = None, **kwargs) -> Vset:
    """Builds a Vset by currying callable obj with all combinations of parameters in param_dict.

    Parameters
    ----------
    name: str
        a name for the output Vset
    obj: callable
        a callable to use as the base for Vfuncs in the output Vset
    param_dict: dict[str, list]
        keys are obj kwarg names and values in the dict are lists of params to try
    *args
        additional fixed arguments to pass to obj
    reps: int (optional)
        the number of times to repeat the obj in the output Vset's modules for
        each combination of params in param_dict
    is_async: bool (optional)
        if True, modules are computed asynchronously
    output_matching: bool (optional)
        if True, then output keys from Vset will be matched when used
        in other Vsets
    cache_dir: str (optional)
        if provided, do caching and use cache_dir as the data store for
        joblib.Memory
    verbose : bool (optional)
        if True, modules are named with param_dict items as tuples of str("param_name=param_val")
    tracking_dir: str (optional)
        if provided, use the mlflow.tracking api to log outputs as metrics
        with params determined by input keys
    **kwargs
        additional fixed keyword arguments to pass to obj

    Returns
    -------
    new_vset : Vset
    """
    if param_dict is None:
        param_dict = {}
    assert callable(obj), 'obj must be callable'

    vfuncs = []
    vkeys = []

    # TODO: better way to check this?
    # check if obj is a class
    instantiate = isinstance(obj, type)

    param_names = list(param_dict.keys())
    param_lists = list(param_dict.values())
    kwargs_tuples = product(*param_lists)
    for tup in kwargs_tuples:
        kwargs_dict = {}
        vkey_tup = ()
        for param_name, param_val in zip(param_names, tup):
            kwargs_dict[param_name] = param_val
            vkey_tup += (f'{param_name}={param_val}', )
        # add additional fixed kwargs to kwargs_dict
        for k, v in kwargs.items():
            kwargs_dict[k] = v
        for i in range(reps):
            # add module key to vkeys
            if reps > 1:
                vkeys.append((f'rep={i}', ) + vkey_tup)
            else:
                vkeys.append(vkey_tup)
            if instantiate:
                # instantiate obj
                vfuncs.append(Vfunc(module=obj(*args, **kwargs_dict), name=str(vkey_tup)))
            else:
                # use partial to wrap obj
                vfuncs.append(Vfunc(module=partial(obj, *args, **kwargs_dict), name=str(vkey_tup)))
    if not verbose or (len(param_dict) == 0 and reps == 1):
        vkeys = None
    return Vset(name, vfuncs, is_async=is_async, module_keys=vkeys,
                output_matching=output_matching, lazy=lazy,
                cache_dir=cache_dir, tracking_dir=tracking_dir)
def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1, bigger_is_better: bool = True, filter_on=None, group: bool = False) ‑> Union[Vset, list]

Returns a new Vset by filtering vset.modules based on values in filter_dict.

Parameters

metric_dict : dict
output from a Vset, typically with metrics or other numeric values to use when filtering vset.modules
vset : Vset
a Vsets
*vsets : Vset
zero or more additional Vsets
n_keep : int (optional)
number of entries to keep from vset.modules
bigger_is_better : bool (optional)
if True, then the top n_keep largest values are retained
filter_on : list[str] (optional)
if there are multiple metrics in metric_dict, you can specify a subset to consider
group : bool (optional)
if True, average metrics after grouping values in metric_dict by the input Vset names

Returns

*new_vset : Vset
Copies of the input Vsets but with Vfuncs filtered based on metrics
Expand source code
def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1,
                          bigger_is_better: bool = True, filter_on=None,
                          group: bool = False) -> Union[Vset, list]:
    """Returns a new Vset by filtering `vset.modules` based on values in filter_dict.

    Parameters
    ----------
    metric_dict: dict
        output from a Vset, typically with metrics or other numeric values to use when
        filtering `vset.modules`
    vset: Vset
        a Vsets
    *vsets: Vset
        zero or more additional Vsets
    n_keep: int (optional)
        number of entries to keep from `vset.modules`
    bigger_is_better: bool (optional)
        if True, then the top `n_keep` largest values are retained
    filter_on: list[str] (optional)
        if there are multiple metrics in `metric_dict`, you can specify a subset
        to consider
    group: bool (optional)
        if True, average metrics after grouping values in `metric_dict` by the
        input Vset names

    Returns
    -------
    *new_vset : Vset
        Copies of the input Vsets but with Vfuncs filtered based on metrics
    """
    if filter_on is None:
        filter_on = []
    df = dict_to_df(metric_dict)
    vsets = [vset, *vsets]
    vset_names = []
    for vset in vsets:
        if vset.name not in df.columns:
            raise ValueError(f'{vset.name} should be one of the columns of dict_to_df(metric_dict)')
        vset_names.append(vset.name)
    if len(filter_on) > 0:
        filter_col = list(metric_dict.keys())[0][-1].origin
        df = df[df[filter_col].isin(filter_on)]
    if group:
        df = df.groupby(by=vset_names, as_index=False).mean()
    if bigger_is_better:
        df = df.sort_values(by='out', ascending=False)
    else:
        df = df.sort_values(by='out')
    df = df.iloc[0:n_keep]
    for i, vset in enumerate(vsets):
        vfuncs = vset.modules
        vfunc_filter = [str(name) for name in df[vset.name].to_numpy()]
        new_vfuncs = {k: v for k, v in vfuncs.items() if str(v.name) in vfunc_filter}
        new_vset = Vset('filtered_' + vset.name, new_vfuncs, is_async=vset._async,
                        output_matching=vset._output_matching, lazy=vset._lazy,
                        cache_dir=vset._cache_dir, tracking_dir=vset._tracking_dir)
        setattr(new_vset, FILTER_PREV_KEY, (metric_dict[PREV_KEY], vset,))
        setattr(new_vset, PREV_KEY, getattr(new_vset, FILTER_PREV_KEY))
        vsets[i] = new_vset
    if len(vsets) == 1:
        return vsets[0]
    else:
        return vsets