Source code for greykite.framework.utils.result_summary

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen
"""Functions to summarize the output of
`~greykite.framework.pipeline.pipeline.forecast_pipeline`.
"""

import re
import warnings

import numpy as np
import pandas as pd
from scipy.stats import rankdata

from greykite.common.constants import FRACTION_OUTSIDE_TOLERANCE
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.common.python_utils import assert_equal
from greykite.framework.constants import CV_REPORT_METRICS_ALL
from greykite.framework.pipeline.utils import get_score_func_with_aggregation


def get_ranks_and_splits(
        grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        combine_splits=True,
        decimals=None,
        warn_metric=True):
    """Extracts CV results from ``grid_search`` for the specified score function.

    Returns the correct ranks on the test set and a tuple of the scores
    across splits, for both test set and train set (if available).

    Notes
    -----
    While ``cv_results`` contains keys with the ranks, these ranks are inverted
    if lower values are better and the ``scoring`` function was initialized with
    ``greater_is_better=True`` to report metrics with their original sign.

    This function always returns the correct ranks, accounting for metric direction.

    Parameters
    ----------
    grid_search : `~sklearn.model_selection.RandomizedSearchCV`
        Grid search output (fitted RandomizedSearchCV object).
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function to get the ranks for.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    greater_is_better : `bool` or None, default False
        True if ``score_func`` is a score function, meaning higher is better,
        and False if it is a loss function, meaning lower is better.
        Must be provided if ``score_func`` is a callable (custom function).
        Ignored if ``score_func`` is a string, because the direction is known.

        Used in this function to rank values in the proper direction.
        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    combine_splits : `bool`, default True
        Whether to report split scores as a tuple in a single column.
        If True, a single column is returned for all the splits of a given
        metric and train/test set. For example, "split_train_score" would
        contain the values (split1_train_score, split2_train_score,
        split3_train_score) as a tuple.
        If False, they are reported in their original columns.
    decimals : `int` or None, default None
        Number of decimal places to round to.
        If decimals is negative, it specifies the number of positions
        to the left of the decimal point.
        If None, does not round.
    warn_metric : `bool`, default True
        Whether to issue a warning if the requested metric is not found
        in the CV results.

    Returns
    -------
    ranks_and_splits : `dict`
        Ranks and split scores. Dictionary with the following keys:

            ``"short_name"`` : `str`
                Canonical short name for the ``score_func``.
            ``"ranks"`` : `numpy.array`
                Ranks of the test scores for the ``score_func``, where 1 is the best.
            ``"split_train"`` : `list` [`list` [`float`]]
                Train split scores. Outer list corresponds to the parameter setting;
                inner list contains the scores for that parameter setting
                across all splits.
            ``"split_test"`` : `list` [`list` [`float`]]
                Test split scores. Outer list corresponds to the parameter setting;
                inner list contains the scores for that parameter setting
                across all splits.
    """
    cv_results = grid_search.cv_results_
    _, greater_is_better, short_name = get_score_func_with_aggregation(
        score_func=score_func,  # string or callable
        greater_is_better=greater_is_better,
        # Dummy value, doesn't matter because we ignore the returned `score_func`
        relative_error_tolerance=0.01)

    # Warns if the metric is not available
    if f"mean_test_{short_name}" not in cv_results:
        if warn_metric:
            warnings.warn(f"Metric '{short_name}' is not available in the CV results.")
        return {
            "short_name": short_name,
            "ranks": None,
            "split_train": None,
            "split_test": None}

    # Computes the ranks, using the same tiebreaking method as in sklearn.
    scores = cv_results[f"mean_test_{short_name}"].copy()
    if greater_is_better:
        scores *= -1  # `rankdata` ranks lowest values first
    ranks = np.asarray(rankdata(scores, method='min'), dtype=np.int32)

    # Computes split score columns.
    train_scores = None
    test_scores = None

    def round_as_list(split_scores, decimals=None):
        """Rounds ``split_scores`` to the specified ``decimals``
        and returns the result as a list.

        Parameters
        ----------
        split_scores : `numpy.array`
            Split scores.
        decimals : `int` or None, default None
            Number of decimal places to round to.
            If decimals is negative, it specifies the number of positions
            to the left of the decimal point.
            If None, does not round.

        Returns
        -------
        split_scores_list : `list` [`float`]
            ``split_scores``, rounded according to ``decimals``
            and returned as a list.
        """
        if decimals is not None:
            split_scores = split_scores.round(decimals)
        return split_scores.tolist()

    if combine_splits:
        # Each sublist contains the scores for split i
        # across all parameter settings.
        test_scores = [
            round_as_list(
                cv_results[f"split{i}_test_{short_name}"],
                decimals=decimals)
            for i in range(grid_search.n_splits_)]
        # Makes each sublist contain the scores for a particular
        # parameter setting across all splits.
        test_scores = list(zip(*test_scores))
        # Train scores
        if grid_search.return_train_score:
            train_scores = [
                round_as_list(
                    cv_results[f"split{i}_train_{short_name}"],
                    decimals=decimals)
                for i in range(grid_search.n_splits_)]
            train_scores = list(zip(*train_scores))

    ranks_and_splits = {
        "short_name": short_name,
        "ranks": ranks,
        "split_train": train_scores,
        "split_test": test_scores}
    return ranks_and_splits

def summarize_grid_search_results(
        grid_search,
        only_changing_params=True,
        combine_splits=True,
        decimals=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        cv_report_metrics=CV_REPORT_METRICS_ALL,
        column_order=None):
    """Summarizes CV results for each grid search parameter combination.

    While ``grid_search.cv_results_`` could be imported into a `pandas.DataFrame`
    without this function, the following conveniences are provided:

        - returns the correct ranks based on each metric's greater_is_better direction
        - summarizes the hyperparameter space, only showing the parameters that change
        - combines split scores into a tuple to save table width
        - rounds the values to the specified decimals
        - orders columns by type (test score, train score, metric, etc.)

    Parameters
    ----------
    grid_search : `~sklearn.model_selection.RandomizedSearchCV`
        Grid search output (fitted RandomizedSearchCV object).
    only_changing_params : `bool`, default True
        If True, only show parameters with multiple values in the hyperparameter_grid.
    combine_splits : `bool`, default True
        Whether to report split scores as a tuple in a single column.

        - If True, adds a column with the test split scores for each requested metric,
          and a column with the train split scores if those are available.
          For example, "split_train_score" would contain the values
          (split1_train_score, split2_train_score, split3_train_score) as a tuple.
        - If False, this summary column is not added.

        The original split columns are available either way.
    decimals : `int` or None, default None
        Number of decimal places to round to.
        If decimals is negative, it specifies the number of positions
        to the left of the decimal point.
        If None, does not round.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select the optimal model in CV.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

        Used in this function to fix the ``"rank_test_score"`` column
        if ``score_func_greater_is_better=False``.
        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    score_func_greater_is_better : `bool`, default False
        True if ``score_func`` is a score function, meaning higher is better,
        and False if it is a loss function, meaning lower is better.
        Must be provided if ``score_func`` is a callable (custom function).
        Ignored if ``score_func`` is a string, because the direction is known.

        Used in this function to fix the ``"rank_test_score"`` column
        if ``score_func_greater_is_better=False``.
        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    cv_report_metrics : `~greykite.framework.constants.CV_REPORT_METRICS_ALL`, or `list` [`str`], or None, default `~greykite.framework.constants.CV_REPORT_METRICS_ALL`  # noqa: E501
        Additional metrics to show in the summary, besides the one specified by
        ``score_func``. If a metric is specified but not available, a warning is given.

        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`,
        or a subset of the computed metrics to show.

        If a list of strings, valid strings are
        `~greykite.common.evaluation.EvaluationMetricEnum` member names
        and `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    column_order : `list` [`str`] or None, default None
        How to order the columns: a list of regexes applied to column names,
        in greedy fashion. Column names matching the first item are placed first.
        Among the remaining columns, those matching the second item are placed next, etc.
        Use ".*" as the last element to select all remaining columns, if desired.
        If None, uses the default ordering::

            column_order = ["rank_test", "mean_test", "split_test", "mean_train", "params",
                            "param", "split_train", "time", ".*"]

    Notes
    -----
    Metrics are named in ``grid_search.cv_results_`` according to the ``scoring``
    parameter passed to `~sklearn.model_selection.RandomizedSearchCV`.

    ``"score"`` is the default used by sklearn for single metric evaluation.

    If a dictionary is provided to ``scoring``, as is the case through templates,
    then the metrics are named by its keys, and the metric used for selection
    is defined by ``refit``. The keys are derived from ``score_func`` and
    ``cv_report_metrics`` in `~greykite.framework.pipeline.utils.get_scoring_and_refit`.

        - The key for a callable ``score_func`` is
          `~greykite.common.constants.CUSTOM_SCORE_FUNC_NAME`.
        - The key for an ``EvaluationMetricEnum`` member name is the short name
          from ``.get_metric_name()``.
        - The key for `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE` is
          `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE_NAME`.

    Returns
    -------
    cv_results : `pandas.DataFrame`
        A summary of cross-validation results in tabular format.
        Each row corresponds to a set of parameters used in the grid search.
        The columns have the following format, where ``{name}`` is the canonical
        short name for the metric.

            ``"rank_test_{name}"`` : `int`
                The params ranked by ``mean_test_{name}`` (1 is best).
            ``"mean_test_{name}"`` : `float`
                Average test score.
            ``"split_test_{name}"`` : `list` [`float`]
                Test score on each split. [split 0, split 1, ...]
            ``"std_test_{name}"`` : `float`
                Standard deviation of test scores.
            ``"mean_train_{name}"`` : `float`
                Average train score.
            ``"split_train_{name}"`` : `list` [`float`]
                Train score on each split. [split 0, split 1, ...]
            ``"std_train_{name}"`` : `float`
                Standard deviation of train scores.
            ``"mean_fit_time"`` : `float`
                Average time to fit each CV split (in seconds).
            ``"std_fit_time"`` : `float`
                Std of time to fit each CV split (in seconds).
            ``"mean_score_time"`` : `float`
                Average time to score each CV split (in seconds).
            ``"std_score_time"`` : `float`
                Std of time to score each CV split (in seconds).
            ``"params"`` : `dict`
                The parameters used. If ``only_changing_params=True``, only shows
                the parameters whose values are not identical across all
                parameter combinations.
            ``"param_{pipeline__param__name}"`` : Any
                The value of pipeline parameter ``pipeline__param__name`` for each row.
    """
    if column_order is None:
        column_order = ["rank_test", "mean_test", "split_test", "mean_train", "params",
                        "param", "split_train", "time", ".*"]
    cv_results = grid_search.cv_results_.copy()

    # Overwrites the params
    selected_params = []
    if only_changing_params:
        # Removes keys that don't vary
        keep_params = set()
        seen_params = {}
        for params in cv_results['params']:
            for k, v in params.items():
                if k in seen_params:
                    try:
                        assert_equal(v, seen_params[k])
                    except AssertionError:
                        # the values are different
                        keep_params.add(k)
                else:
                    seen_params[k] = v
        for params in cv_results['params']:
            explore_params = [(k, v) for k, v in params.items() if k in keep_params]
            selected_params.append(explore_params)
        cv_results['params'] = selected_params

    # Overwrites the ranks and computes combined split score columns
    # for the requested metrics.
    metric_list = [(score_func, score_func_greater_is_better, True)]
    if cv_report_metrics == CV_REPORT_METRICS_ALL:
        cv_report_metrics = EvaluationMetricEnum.__dict__["_member_names_"].copy()
        # Computes `FRACTION_OUTSIDE_TOLERANCE` if `relative_error_tolerance` is specified
        cv_report_metrics.append(FRACTION_OUTSIDE_TOLERANCE)
        metric_list += [(metric, None, False) for metric in cv_report_metrics]
    elif cv_report_metrics is not None:
        # greater_is_better is derived from the metric name
        metric_list += [(metric, None, True) for metric in cv_report_metrics]

    keep_metrics = set()
    for metric, greater_is_better, warn_metric in metric_list:
        ranks_and_splits = get_ranks_and_splits(
            grid_search=grid_search,
            score_func=metric,
            greater_is_better=greater_is_better,
            combine_splits=combine_splits,
            decimals=decimals,
            warn_metric=warn_metric)
        short_name = ranks_and_splits["short_name"]
        if ranks_and_splits["ranks"] is not None:
            cv_results[f"rank_test_{short_name}"] = ranks_and_splits["ranks"]
        if ranks_and_splits["split_train"] is not None:
            cv_results[f"split_train_{short_name}"] = ranks_and_splits["split_train"]
        if ranks_and_splits["split_test"] is not None:
            cv_results[f"split_test_{short_name}"] = ranks_and_splits["split_test"]
        keep_metrics.add(short_name)

    # Creates DataFrame and orders the columns.
    # Dictionary keys are unordered, but appear to follow insertion order.
    cv_results_df = pd.DataFrame(cv_results)
    available_cols = list(cv_results_df.columns)

    # Removes metrics not selected
    all_metrics = set(col.replace("mean_test_", "") for col in cv_results.keys()
                      if re.search("mean_test_", col))
    remove_metrics = all_metrics - keep_metrics
    remove_regex = "|".join(remove_metrics)
    if remove_regex:
        available_cols = [col for col in available_cols if not re.search(remove_regex, col)]

    # Orders the columns
    ordered_cols = []
    for regex in column_order:
        selected_cols = [col for col in available_cols
                         if col not in ordered_cols and re.search(regex, col)]
        ordered_cols += selected_cols
    cv_results_df = cv_results_df[ordered_cols]

    if decimals is not None:
        cv_results_df = cv_results_df.round(decimals)
    return cv_results_df