# Source code for mlshell.producers.metric

"""
The :mod:`mlshell.producers.metric` module contains an example `Metric` class
to create an empty metric object and a `MetricProducer` class to fill it.

:class:`mlshell.Metric` provides a unified interface to work with the
underlying scorer. It is intended to be used in :class:`mlshell.Workflow`. To
support new metric formats there is no need to edit the `Workflow` class, just
adapt the `Metric` interface logic.

:class:`mlshell.MetricProducer` specifies methods to make a metric from a
custom function. The current implementation inherits
:func:`sklearn.metrics.make_scorer` logic.

"""


import pycnfg
import numpy as np
import pandas as pd
import sklearn
import tabulate

# Public names exported on star-import of this module.
__all__ = ['Metric', 'MetricProducer']


class Metric(object):
    """Unified pipeline interface.

    Implements an interface to access an arbitrary scorer.
    Interface: pprint and all attributes/methods of the underlying scorer.

    Attributes
    ----------
    scorer: callable, optional (default=None)
        Underlying scorer.
    oid : str, optional (default=None)
        Instance identifier.
    score_func: callable, optional (default=None)
        Scorer score function, return scalar value.
    score_func_vector: callable, optional (default=None)
        Scorer vectorized score function, return vector of values for all
        samples.
    greater_is_better : bool, optional (default=True)
        Whether `score_func` is a score function (default), meaning high
        is good, or a loss function, meaning low is good. In the latter
        case, the scorer object should sign-flip the outcome of the
        `score_func`.
    needs_proba : bool, optional (default=False)
        Whether `score_func` requires predict_proba to get probability
        estimates out of a classifier.
    needs_threshold : bool, optional (default=False)
        Whether `score_func` takes a continuous decision certainty.
        This only works for classification using estimators that
        have either a decision_function or predict_proba method.
    needs_custom_kw_args : bool, optional (default=False)
        If True, before score evaluation extract scorer kwargs from pipeline
        'pass_custom' step (if existed).

    Notes
    -----
    Extended :term:`sklearn:scorer` object:

    * Additional ``needs_custom_kw_args`` kwarg.
      Allows to optimize custom scorer kwargs as hyper-parameters.
    * Additional ``score_func_vector`` kwarg.
      Allows to evaluate a vectorized score for more detailed analysis.

    """

    def __init__(self, scorer=None, oid=None, score_func=None,
                 score_func_vector=None, greater_is_better=True,
                 needs_proba=False, needs_threshold=False,
                 needs_custom_kw_args=False):
        self.scorer = scorer
        self.score_func = score_func
        self.score_func_vector = score_func_vector
        self.oid = oid
        # Flags.
        self.greater_is_better = greater_is_better
        self.needs_proba = needs_proba
        self.needs_threshold = needs_threshold
        self.needs_custom_kw_args = needs_custom_kw_args

    def __call__(self, estimator, *args, **kwargs):
        """Redirect call to scorer object.

        If ``needs_custom_kw_args`` is set, the scorer kwargs are first
        refreshed from the estimator's 'pass_custom' step.

        """
        if self.needs_custom_kw_args:
            self._set_custom_kwargs(estimator)
        return self.scorer(estimator, *args, **kwargs)

    def __getattr__(self, name):
        """Redirect unknown attributes and methods to the scorer object.

        Called only when regular attribute lookup fails.

        Raises
        ------
        AttributeError
            If `name` is a dunder name, if the instance has no ``scorer``
            set yet, or if the scorer itself has no such attribute.

        """
        # Dunder probes (``__deepcopy__`` etc. from copy/pickle machinery)
        # must not be answered on behalf of the scorer. The ``__dict__``
        # check prevents infinite recursion on a half-initialized instance
        # where ``self.scorer`` would re-enter ``__getattr__``.
        is_dunder = name.startswith('__') and name.endswith('__')
        if is_dunder or 'scorer' not in self.__dict__:
            raise AttributeError(
                "'{}' object has no attribute '{}'".format(
                    type(self).__name__, name))
        # Return the real attribute (not a wrapper), so that `hasattr`
        # is truthful and non-callable attributes keep their value.
        return getattr(self.scorer, name)

    def __getstate__(self):
        # Allow pickle.
        return self.__dict__

    def __setstate__(self, d):
        # Allow unpickle.
        self.__dict__ = d

    @property
    def kw_args(self):
        """dict: Additional kwargs passed to `score_func`."""
        # Unchanged if no `pass_custom` step in pipeline.
        return self.scorer._kwargs

    def pprint(self, score):
        """Pretty print metric result.

        Parameters
        ----------
        score : any object
            `score_func` output.

        Returns
        -------
        score : str
            Input converted to string.

        """
        # `score_func` may be None or a str key, neither has `__name__`.
        func_name = getattr(self.score_func, '__name__', None)
        if func_name == 'confusion_matrix':
            labels = self.scorer._kwargs.get('labels', None)  # classes
            score = tabulate.tabulate(
                pd.DataFrame(data=score, columns=labels, index=labels),
                headers='keys', tablefmt='psql'
            ).replace('\n', '\n    ')
        elif func_name == 'classification_report':
            if isinstance(score, dict):
                score = tabulate.tabulate(
                    pd.DataFrame(score), headers='keys', tablefmt='psql'
                ).replace('\n', '\n    ')
            else:
                score = score.replace('\n', '\n    ')
        elif isinstance(score, np.ndarray):
            score = np.array2string(score, prefix='    ')
        return str(score)

    def _set_custom_kwargs(self, estimator):
        """Update scorer kwargs from the estimator 'pass_custom' step.

        Does nothing if `estimator` has no ``steps`` attribute or no step
        named 'pass_custom'. Kwargs are looked up under this metric's `oid`.

        """
        if not hasattr(estimator, 'steps'):
            return
        for step in estimator.steps:
            if step[0] == 'pass_custom':
                custom = step[1].kw_args.get(self.oid, {})
                # `kw_args` is the scorer's own dict, updated in place.
                self.kw_args.update(custom)


class MetricProducer(pycnfg.Producer):
    """Factory to produce metric.

    Interface: make.

    Parameters
    ----------
    objects : dict
        Dictionary with resulted objects from previous executed producers:
        {'section_id__config__id', object,}.
    oid : str
        Unique identifier of produced object.
    path_id : str, optional (default='path__default')
        Project path identifier in `objects`.
    logger_id : str, optional (default='logger__default')
        Logger identifier in `objects`.

    Attributes
    ----------
    objects : dict
        Dictionary with resulted objects from previous executed producers:
        {'section_id__config__id', object,}.
    oid : str
        Unique identifier of produced object.
    logger : :class:`logging.Logger`
        Logger.
    project_path: str
        Absolute path to project dir.

    """
    _required_parameters = ['objects', 'oid', 'path_id', 'logger_id']

    def __init__(self, objects, oid, path_id='path__default',
                 logger_id='logger__default'):
        pycnfg.Producer.__init__(self, objects, oid, path_id=path_id,
                                 logger_id=logger_id)

    @classmethod
    def make(cls, scorer, score_func, score_func_vector=None,
             needs_custom_kw_args=False, **kwargs):
        """Make scorer from metric callable.

        Parameters
        ----------
        scorer : :class:`mlshell.Metric`
            Scorer object, will be updated.
        score_func : callback or str
            Custom function or key from :data:`sklearn.metrics.SCORERS` .
        score_func_vector: callback, optional (default=None)
            Vectorized `score_func` returning vector of values for all samples.
            Mainly for result visualization purpose.
        needs_custom_kw_args : bool, optional (default=False)
            If True, before score evaluation extract scorer kwargs from
            pipeline 'pass_custom' step (if existed).
        **kwargs : dict
            Additional kwargs to pass in :func:`sklearn.metrics.make_scorer`
            (if ``score_func`` is not str).

        Returns
        -------
        scorer : :class:`mlshell.Metric`
            The same `scorer` object, updated in place.

        Notes
        -----
        Extended :func:`sklearn.metrics.make_scorer` in compliance with
        :class:`mlshell.Metric` .

        """
        # Importing the top-level package does not by itself guarantee that
        # the `sklearn.metrics` subpackage is loaded, so import it explicitly.
        import sklearn.metrics

        # Convert to scorer.
        if isinstance(score_func, str):
            # built_in = sklearn.metrics.SCORERS.keys().
            # Ignore kwargs, built-in `str` metrics has hard-coded kwargs.
            scorer.scorer = sklearn.metrics.get_scorer(score_func)
        else:
            # Non-scalar output metric also possible.
            scorer.scorer = sklearn.metrics.make_scorer(score_func, **kwargs)
        scorer.score_func = score_func
        scorer.score_func_vector = score_func_vector
        scorer.needs_custom_kw_args = needs_custom_kw_args
        # Flags are derived from the resulting sklearn scorer object.
        scorer.greater_is_better = scorer.scorer._sign > 0
        # NOTE(review): relies on private scikit-learn classes; verify that
        # they exist in the installed scikit-learn version.
        scorer.needs_proba =\
            isinstance(scorer.scorer, sklearn.metrics._scorer._ProbaScorer)
        scorer.needs_threshold =\
            isinstance(scorer.scorer, sklearn.metrics._scorer._ThresholdScorer)
        return scorer


# Nothing to run when executed as a script; the guard body is a no-op.
if __name__ == '__main__':
    pass