Source code for greykite.framework.benchmark.benchmark_class

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Sayan Patra
"""Class for benchmarking model templates."""

from typing import Dict
from typing import List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS
from tqdm.autonotebook import tqdm

from greykite.common.constants import ACTUAL_COL
from greykite.common.constants import PREDICTED_COL
from greykite.common.constants import PREDICTED_LOWER_COL
from greykite.common.constants import PREDICTED_UPPER_COL
from greykite.common.constants import TIME_COL
from greykite.common.constants import VALUE_COL
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.common.evaluation import add_finite_filter_to_scorer
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.python_utils import get_pattern_cols
from greykite.common.viz.timeseries_plotting import plot_multivariate
from greykite.common.viz.timeseries_plotting import plot_multivariate_grouped
from greykite.framework.benchmark.benchmark_class_helper import forecast_pipeline_rolling_evaluation
from greykite.framework.constants import FORECAST_STEP_COL
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.forecaster import Forecaster
from greykite.sklearn.cross_validation import RollingTimeSeriesSplit


class BenchmarkForecastConfig:
    """Class for benchmarking multiple ForecastConfig on a rolling window basis.

    Attributes
    ----------
    df : `pandas.DataFrame`
        Timeseries data to forecast.
        Contains columns [`time_col`, `value_col`], and optional regressor columns.
        Regressor columns should include future values for prediction.
    configs : `Dict` [`str`, `ForecastConfig`]
        Dictionary of model configurations.
        A model configuration is a ``ForecastConfig``.
        See :class:`~greykite.framework.templates.autogen.forecast_config.ForecastConfig`
        for details on valid ``ForecastConfig``.
        Validity of the ``configs`` for benchmarking is checked via the ``validate`` method.
    tscv : `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit`
        Cross-validation object that determines the rolling window evaluation.
        See :class:`~greykite.sklearn.cross_validation.RollingTimeSeriesSplit` for details.
        The ``forecast_horizon`` and ``periods_between_train_test`` parameters of ``configs``
        are matched against those of ``tscv``. A ValueError is raised if there is a mismatch.
    forecaster : `~greykite.framework.templates.forecaster.Forecaster`
        Forecaster used to create the forecasts.
    is_run : `bool`, default False
        Indicates whether the ``run`` method has been executed.
        After executing ``run``, this indicator is set to True.
        Some class methods, such as ``extract_forecasts``, require ``is_run`` to be True.
    result : `dict`
        Stores the benchmarking results. Has the same keys as ``configs``.
    forecasts : `pandas.DataFrame`, default None
        Merged DataFrame of forecasts, upper and lower confidence intervals for all
        input ``configs``. Also stores the train end date and forecast step for each prediction.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            configs: Dict[str, ForecastConfig],
            tscv: RollingTimeSeriesSplit,
            forecaster: Forecaster = Forecaster()):
        self.df = df
        self.configs = configs
        self.tscv = tscv
        self.forecaster = forecaster
        self.is_run = False

        # output
        self.result = dict.fromkeys(configs.keys())
        self.forecasts = None
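
    # --- Illustrative usage sketch (editor's addition, kept as comments) ---
    # A minimal construction example. The column names, model template string, and the
    # `RollingTimeSeriesSplit` arguments below are assumptions for illustration only;
    # see the class docstring and the referenced classes for the authoritative API.
    #
    #   df = pd.DataFrame({
    #       "ts": pd.date_range("2020-01-01", periods=365, freq="D"),
    #       "y": np.arange(365)})
    #   configs = {
    #       "default_silverkite": ForecastConfig(
    #           model_template="SILVERKITE",      # assumed template name
    #           forecast_horizon=7,
    #           coverage=0.95,
    #           metadata_param=MetadataParam(time_col="ts", value_col="y"))}  # assumed import
    #   tscv = RollingTimeSeriesSplit(forecast_horizon=7, max_splits=3)       # assumed arguments
    #   bm = BenchmarkForecastConfig(df=df, configs=configs, tscv=tscv)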

    def validate(self):
        """Validates the inputs to the class for the method ``run``.

        Raises a ValueError if there is a mismatch between the following parameters
        of ``configs`` and ``tscv``:

        - ``forecast_horizon``
        - ``periods_between_train_test``

        Raises a ValueError if the ``configs`` do not all have the same ``coverage`` parameter.
        """
        coverage_list = []
        for config_name, config in self.configs.items():
            # Checks forecast_horizon
            if config.forecast_horizon != self.tscv.forecast_horizon:
                raise ValueError(
                    f"{config_name}'s 'forecast_horizon' ({config.forecast_horizon}) does "
                    f"not match that of 'tscv' ({self.tscv.forecast_horizon}).")

            # Checks periods_between_train_test
            if config.evaluation_period_param.periods_between_train_test != self.tscv.periods_between_train_test:
                raise ValueError(
                    f"{config_name}'s 'periods_between_train_test' "
                    f"({config.evaluation_period_param.periods_between_train_test}) "
                    f"does not match that of 'tscv' ({self.tscv.periods_between_train_test}).")

            coverage_list.append(config.coverage)

            # Computes pipeline parameters
            pipeline_params = self.forecaster.apply_forecast_config(
                df=self.df,
                config=config)
            self.result[config_name] = dict(pipeline_params=pipeline_params)

        # Checks that all coverages are the same
        if coverage_list[1:] != coverage_list[:-1]:
            raise ValueError("All forecast configs must have the same coverage.")

    def run(self):
        """Runs every config and stores the output of the
        :func:`~greykite.framework.pipeline.pipeline.forecast_pipeline`.

        This function runs only if the ``configs`` and ``tscv`` are jointly valid.

        Returns
        -------
        self : Returns self. Stores the pipeline output of every config in ``self.result``.
        """
        self.validate()
        with tqdm(self.result.items(), ncols=800, leave=True) as progress_bar:
            for (config_name, config) in progress_bar:
                # The description is displayed to the left of the progress bar
                progress_bar.set_description(f"Benchmarking '{config_name}' ")

                rolling_evaluation = forecast_pipeline_rolling_evaluation(
                    pipeline_params=config["pipeline_params"],
                    tscv=self.tscv)
                config["rolling_evaluation"] = rolling_evaluation
        self.is_run = True
        return self
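
    # Illustrative call sequence (editor's sketch, following the construction example above):
    #   bm.run()   # validates the configs against tscv, then benchmarks each config
    #   bm.result["default_silverkite"]["rolling_evaluation"]  # per-split pipeline output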

    def extract_forecasts(self):
        """Extracts forecasts, upper and lower confidence intervals for each individual config.

        This is saved as a `pandas.DataFrame` named ``rolling_forecast_df`` within the
        corresponding config of ``self.result``.
        e.g. if the config key is "silverkite", then the forecasts are stored in
        ``self.result["silverkite"]["rolling_forecast_df"]``.

        This method also constructs a merged DataFrame of forecasts, upper and lower
        confidence intervals for all input ``configs``.
        """
        if not self.is_run:
            raise ValueError("Please execute the 'run' method to create forecasts.")

        merged_df = pd.DataFrame()
        for config_name, config in self.result.items():
            rolling_evaluation = config["rolling_evaluation"]
            rolling_forecast_df = pd.DataFrame()
            for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
                forecast = split_value["pipeline_result"].forecast
                # Subsets forecast_horizon rows from the end of the forecast dataframe
                forecast_df = forecast.df.iloc[-forecast.forecast_horizon:]
                forecast_df.insert(0, "train_end_date", forecast.train_end_date)
                forecast_df.insert(1, FORECAST_STEP_COL, np.arange(forecast.forecast_horizon) + 1)
                forecast_df.insert(2, "split_num", num)
                rolling_forecast_df = pd.concat([rolling_forecast_df, forecast_df], axis=0)
            rolling_forecast_df = rolling_forecast_df.reset_index(drop=True)
            self.result[config_name]["rolling_forecast_df"] = rolling_forecast_df

            # Merges the forecasts of individual configs
            # Augments prediction columns with the config name
            pred_cols = [PREDICTED_COL]
            if PREDICTED_LOWER_COL in rolling_forecast_df.columns:
                pred_cols.append(PREDICTED_LOWER_COL)
            if PREDICTED_UPPER_COL in rolling_forecast_df.columns:
                pred_cols.append(PREDICTED_UPPER_COL)
            mapper = {col: f"{config_name}_{col}" for col in pred_cols}
            if merged_df.empty:
                temp_df = rolling_forecast_df.rename(columns=mapper)
            else:
                temp_df = rolling_forecast_df[pred_cols].rename(columns=mapper)
            merged_df = pd.concat([merged_df, temp_df], axis=1)

        self.forecasts = merged_df.reset_index(drop=True)
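
    # Illustrative usage (editor's sketch; "default_silverkite" is the hypothetical config key above):
    #   bm.extract_forecasts()
    #   bm.result["default_silverkite"]["rolling_forecast_df"]  # rolling forecasts of one config
    #   bm.forecasts  # merged forecasts of all configs, with train_end_date, forecast_step, split_num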

    def plot_forecasts_by_step(
            self,
            forecast_step: int,
            config_names: List = None,
            xlabel: str = TIME_COL,
            ylabel: str = VALUE_COL,
            title: str = None,
            showlegend: bool = True):
        """Returns a ``forecast_step``-ahead rolling forecast plot.

        The plot consists of one line for each valid ``config_names``.
        If available, the corresponding actual values are also plotted.

        For a more customizable plot, see
        :func:`~greykite.common.viz.timeseries_plotting.plot_multivariate`.

        Parameters
        ----------
        forecast_step : `int`
            Which forecast step to plot. A forecast step is an integer between 1 and the
            forecast horizon, inclusive, indicating the number of periods from the train
            end date to the prediction date (# steps ahead).
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.
        xlabel : `str` or None, default TIME_COL
            x-axis label.
        ylabel : `str` or None, default VALUE_COL
            y-axis label.
        title : `str` or None, default None
            Plot title. If None, the default is based on ``forecast_step``.
        showlegend : `bool`, default True
            Whether to show the legend.

        Returns
        -------
        fig : `plotly.graph_objects.Figure`
            Interactive plotly graph.
            Plots multiple column(s) in ``self.forecasts`` against ``TIME_COL``.

            See `~greykite.common.viz.timeseries_plotting.plot_forecast_vs_actual` return value
            for how to plot the figure and add customization.
        """
        if self.forecasts is None:
            self.extract_forecasts()

        if forecast_step > self.tscv.forecast_horizon:
            raise ValueError(
                f"`forecast_step` ({forecast_step}) must be less than or equal to "
                f"the forecast horizon ({self.tscv.forecast_horizon}).")

        config_names = self.get_valid_config_names(config_names)
        y_cols = [TIME_COL, ACTUAL_COL] + \
            [f"{config_name}_{PREDICTED_COL}" for config_name in config_names]

        df = self.forecasts[self.forecasts[FORECAST_STEP_COL] == forecast_step]
        df = df[y_cols]

        if title is None:
            title = f"{forecast_step}-step ahead rolling forecasts"
        fig = plot_multivariate(
            df=df,
            x_col=TIME_COL,
            y_col_style_dict="plotly",
            xlabel=xlabel,
            ylabel=ylabel,
            title=title,
            showlegend=showlegend)
        return fig
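
    # Illustrative usage (editor's sketch): plot the 1-step-ahead forecast of every config.
    #   fig = bm.plot_forecasts_by_step(forecast_step=1)
    #   fig.show()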

    def plot_forecasts_by_config(
            self,
            config_name: str,
            colors: List = DEFAULT_PLOTLY_COLORS,
            xlabel: str = TIME_COL,
            ylabel: str = VALUE_COL,
            title: str = None,
            showlegend: bool = True):
        """Returns a rolling plot of the forecasts by ``config_name`` against ``TIME_COL``.

        The plot consists of one line for each available split. Some lines may overlap if
        the test periods of the corresponding splits intersect, hence every line is given
        a different color. If available, the corresponding actual values are also plotted.

        For a more customizable plot, see
        :func:`~greykite.common.viz.timeseries_plotting.plot_multivariate_grouped`.

        Parameters
        ----------
        config_name : `str`
            Which config result to plot.
            The name must match the name of one of the input ``configs``.
        colors : [`str`, `List` [`str`]], default ``DEFAULT_PLOTLY_COLORS``
            Which colors to use to build the color palette.
            This can be a list of RGB colors or a `str` from ``PLOTLY_SCALES``.
            To use a single color for all lines, pass a `List` with a single color.
        xlabel : `str` or None, default TIME_COL
            x-axis label.
        ylabel : `str` or None, default VALUE_COL
            y-axis label.
        title : `str` or None, default None
            Plot title. If None, the default is based on ``config_name``.
        showlegend : `bool`, default True
            Whether to show the legend.

        Returns
        -------
        fig : `plotly.graph_objects.Figure`
            Interactive plotly graph.
            Plots multiple column(s) in ``self.forecasts`` against ``TIME_COL``.
        """
        if self.forecasts is None:
            self.extract_forecasts()

        config_name = self.get_valid_config_names([config_name])[0]

        if title is None:
            title = f"Rolling forecast for {config_name}"
        fig = plot_multivariate_grouped(
            df=self.forecasts,
            x_col=TIME_COL,
            y_col_style_dict={
                ACTUAL_COL: {
                    "line": {
                        "width": 1,
                        "dash": "solid"
                    }
                }
            },
            grouping_x_col="split_num",
            grouping_x_col_values=None,
            grouping_y_col_style_dict={
                f"{config_name}_{PREDICTED_COL}": {
                    "name": "split",
                    "line": {
                        "width": 1,
                        "dash": "solid"
                    }
                }
            },
            colors=colors,
            xlabel=xlabel,
            ylabel=ylabel,
            title=title,
            showlegend=showlegend)
        return fig

    def get_evaluation_metrics(
            self,
            metric_dict: Dict,
            config_names: List = None):
        """Returns rolling train and test evaluation metric values.

        Parameters
        ----------
        metric_dict : `dict` [`str`, `callable`]
            Evaluation metrics to compute.

            - key: evaluation metric name, used to create the column name in the output.
            - value: metric function to apply to the forecast df in each split to generate
              the column value. Signature (y_true: `str`, y_pred: `str`) -> transformed value: `float`.

            For example::

                metric_dict = {
                    "median_residual": lambda y_true, y_pred: np.median(y_true - y_pred),
                    "mean_squared_error": lambda y_true, y_pred: np.mean((y_true - y_pred)**2)
                }

            Some predefined functions are available in `~greykite.common.evaluation`. For example::

                metric_dict = {
                    "correlation": lambda y_true, y_pred: correlation(y_true, y_pred),
                    "RMSE": lambda y_true, y_pred: root_mean_squared_error(y_true, y_pred),
                    "Q_95": lambda y_true, y_pred: quantile_loss(y_true, y_pred, q=0.95)
                }

            As shorthand, it is sufficient to provide the corresponding ``EvaluationMetricEnum``
            member. These are auto-expanded into the appropriate function.
            So the following is equivalent::

                metric_dict = {
                    "correlation": EvaluationMetricEnum.Correlation,
                    "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
                    "Q_95": EvaluationMetricEnum.Quantile95
                }

        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.

        Returns
        -------
        evaluation_metrics_df : `pandas.DataFrame`
            A DataFrame containing splitwise train and test evaluation metrics for
            ``metric_dict`` and ``config_names``.

            For example, let's assume::

                metric_dict = {
                    "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
                    "Q_95": EvaluationMetricEnum.Quantile95
                }

                config_names = ["default_prophet", "custom_silverkite"]

            These are valid ``config_names`` and there are 2 splits for each. Then::

                evaluation_metrics_df =

                config_name        split_num  train_RMSE  test_RMSE  train_Q_95  test_Q_95
                default_prophet        0          *           *          *           *
                default_prophet        1          *           *          *           *
                custom_silverkite      0          *           *          *           *
                custom_silverkite      1          *           *          *           *

            where * represents computed values.
        """
        if not self.is_run:
            raise ValueError("Please execute the 'run' method before computing evaluation metrics.")

        metric_dict = self.autocomplete_metric_dict(
            metric_dict=metric_dict,
            enum_class=EvaluationMetricEnum)

        config_names = self.get_valid_config_names(config_names=config_names)

        evaluation_metrics_df = pd.DataFrame()
        for config_name in config_names:
            rolling_evaluation = self.result[config_name]["rolling_evaluation"]
            for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
                forecast = split_value["pipeline_result"].forecast
                split_metrics = {
                    "config_name": config_name,
                    "split_num": num}
                # Updates train metrics
                df_train = forecast.df_train
                split_metrics.update({
                    f"train_{metric_name}": metric_func(
                        df_train[forecast.actual_col],
                        df_train[forecast.predicted_col]
                    ) for metric_name, metric_func in metric_dict.items()
                })
                # Updates test metrics
                df_test = forecast.df_test
                if df_test.shape[0] > 0 and forecast.test_na_count < df_test.shape[0]:
                    split_metrics.update({
                        f"test_{metric_name}": metric_func(
                            df_test[forecast.actual_col],
                            df_test[forecast.predicted_col]
                        ) for metric_name, metric_func in metric_dict.items()
                    })
                else:
                    split_metrics.update({
                        f"test_{metric_name}": np.nan
                        for metric_name, metric_func in metric_dict.items()
                    })

                split_metrics_df = pd.DataFrame(split_metrics, index=[num])
                evaluation_metrics_df = pd.concat([evaluation_metrics_df, split_metrics_df])

        # Resets the index and fills missing values (e.g. when correlation is not defined) with np.nan
        evaluation_metrics_df = evaluation_metrics_df.reset_index(drop=True).fillna(value=np.nan)

        temp_df = evaluation_metrics_df.copy()
        # Rearranges columns so that the train and test errors of a config are side by side
        evaluation_metrics_df = pd.DataFrame()
        evaluation_metrics_df["config_name"] = temp_df["config_name"]
        evaluation_metrics_df["split_num"] = temp_df["split_num"]
        for metric_name in metric_dict.keys():
            evaluation_metrics_df[f"train_{metric_name}"] = temp_df[f"train_{metric_name}"]
            evaluation_metrics_df[f"test_{metric_name}"] = temp_df[f"test_{metric_name}"]

        return evaluation_metrics_df

    def plot_evaluation_metrics(
            self,
            metric_dict: Dict,
            config_names: List = None,
            xlabel: str = None,
            ylabel: str = "Metric value",
            title: str = None,
            showlegend: bool = True):
        """Returns a barplot of the train and test values of ``metric_dict`` for ``config_names``.

        The values of a metric for all ``config_names`` are plotted as a grouped bar.
        Train and test values of a metric are plotted side by side for easy comparison.

        Parameters
        ----------
        metric_dict : `dict` [`str`, `callable`]
            Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
            For the best visualization, keep the number of metrics <= 2.
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.
        xlabel : `str` or None, default None
            x-axis label.
        ylabel : `str` or None, default "Metric value"
            y-axis label.
        title : `str` or None, default None
            Plot title.
        showlegend : `bool`, default True
            Whether to show the legend.

        Returns
        -------
        fig : `plotly.graph_objects.Figure`
            Interactive plotly bar plot.
        """
        evaluation_metrics_df = self.get_evaluation_metrics(
            metric_dict=metric_dict,
            config_names=config_names)

        # Groups by config name and averages the metrics across splits
        evaluation_metrics_df = (evaluation_metrics_df
                                 .drop(columns=["split_num"])
                                 .groupby("config_name")
                                 .mean()
                                 .dropna(how="all"))

        # Rearranges columns so that the train and test errors of a config are side by side
        plot_df = pd.DataFrame()
        for metric_name in metric_dict.keys():
            plot_df[f"train_{metric_name}"] = evaluation_metrics_df[f"train_{metric_name}"]
            plot_df[f"test_{metric_name}"] = evaluation_metrics_df[f"test_{metric_name}"]

        if title is None:
            title = "Average evaluation metric across rolling windows"
        data = []
        # Each row (index) is a config. Adds each row to the bar plot.
        for index in plot_df.index:
            data.append(
                go.Bar(
                    name=index,
                    x=plot_df.columns,
                    y=plot_df.loc[index].values
                )
            )
        layout = go.Layout(
            xaxis=dict(title=xlabel),
            yaxis=dict(title=ylabel),
            title=title,
            title_x=0.5,
            showlegend=showlegend,
            barmode="group",
        )
        fig = go.Figure(data=data, layout=layout)
        return fig

    def get_grouping_evaluation_metrics(
            self,
            metric_dict: Dict,
            config_names: List = None,
            which: str = "train",
            groupby_time_feature: str = None,
            groupby_sliding_window_size: int = None,
            groupby_custom_column: pd.Series = None):
        """Returns splitwise rolling evaluation metric values.

        These values are grouped by the grouping method chosen by ``groupby_time_feature``,
        ``groupby_sliding_window_size`` and ``groupby_custom_column``.

        See `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
        for details on the grouping methods.

        Parameters
        ----------
        metric_dict : `dict` [`str`, `callable`]
            Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.
        which : `str`
            "train" or "test". Which dataset to evaluate.
        groupby_time_feature : `str` or None, default None
            If provided, groups by a column generated by
            `~greykite.common.features.timeseries_features.build_time_features_df`.
            See that function for valid values.
        groupby_sliding_window_size : `int` or None, default None
            If provided, sequentially partitions data into groups of size
            ``groupby_sliding_window_size``.
        groupby_custom_column : `pandas.Series` or None, default None
            If provided, groups by this column value. Should be same length as the DataFrame.

        Returns
        -------
        grouped_evaluation_df : `pandas.DataFrame`
            A DataFrame containing splitwise train and test evaluation metrics for
            ``metric_dict`` and ``config_names``.
            The evaluation metrics are grouped by the grouping method.
        """
        if not self.is_run:
            raise ValueError("Please execute the 'run' method before computing "
                             "grouped evaluation metrics.")

        metric_dict = self.autocomplete_metric_dict(
            metric_dict=metric_dict,
            enum_class=EvaluationMetricEnum)

        config_names = self.get_valid_config_names(config_names=config_names)

        grouped_evaluation_df = pd.DataFrame()
        for config_name in config_names:
            rolling_evaluation = self.result[config_name]["rolling_evaluation"]
            for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
                forecast = split_value["pipeline_result"].forecast
                split_evaluation_df = pd.DataFrame()
                for metric_name, metric_func in metric_dict.items():
                    grouped_df = forecast.get_grouping_evaluation(
                        score_func=metric_func,
                        score_func_name=metric_name,
                        which=which,
                        groupby_time_feature=groupby_time_feature,
                        groupby_sliding_window_size=groupby_sliding_window_size,
                        groupby_custom_column=groupby_custom_column)
                    # Adds grouped_df to split_evaluation_df, handling the case where split_evaluation_df is empty.
                    # If the actual values are missing, grouped_df.shape[0] might be 0.
                    if grouped_df.shape[0] > 0:
                        if split_evaluation_df.empty:
                            split_evaluation_df = grouped_df
                        else:
                            groupby_col = split_evaluation_df.columns[0]
                            split_evaluation_df = pd.merge(split_evaluation_df, grouped_df, on=groupby_col)
                    else:
                        # This column name is the same as that obtained from
                        # `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
                        split_evaluation_df[f"{which} {metric_name}"] = np.nan
                split_evaluation_df.insert(0, "config_name", config_name)
                split_evaluation_df.insert(1, "split_num", num)
                grouped_evaluation_df = pd.concat([grouped_evaluation_df, split_evaluation_df])
        grouped_evaluation_df = grouped_evaluation_df.reset_index(drop=True)

        return grouped_evaluation_df
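
    # Illustrative usage (editor's sketch): test error grouped by day of week.
    # "dow" is assumed to be one of the features produced by `build_time_features_df`.
    #   grouped_df = bm.get_grouping_evaluation_metrics(
    #       metric_dict={"MAE": EvaluationMetricEnum.MeanAbsoluteError},
    #       which="test",
    #       groupby_time_feature="dow")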

    def plot_grouping_evaluation_metrics(
            self,
            metric_dict: Dict,
            config_names: List = None,
            which: str = "train",
            groupby_time_feature: str = None,
            groupby_sliding_window_size: int = None,
            groupby_custom_column: pd.Series = None,
            xlabel=None,
            ylabel="Metric value",
            title=None,
            showlegend=True):
        """Returns a line plot of the grouped evaluation values of ``metric_dict`` for ``config_names``.

        These values are grouped by the grouping method chosen by ``groupby_time_feature``,
        ``groupby_sliding_window_size`` and ``groupby_custom_column``.

        See `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
        for details on the grouping methods.

        Parameters
        ----------
        metric_dict : `dict` [`str`, `callable`]
            Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
            For the best visualization, keep the number of metrics <= 2.
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.
        which : `str`
            "train" or "test". Which dataset to evaluate.
        groupby_time_feature : `str` or None, optional
            If provided, groups by a column generated by
            `~greykite.common.features.timeseries_features.build_time_features_df`.
            See that function for valid values.
        groupby_sliding_window_size : `int` or None, optional
            If provided, sequentially partitions data into groups of size
            ``groupby_sliding_window_size``.
        groupby_custom_column : `pandas.Series` or None, optional
            If provided, groups by this column value. Should be same length as the DataFrame.
        xlabel : `str` or None, default None
            x-axis label. If None, the label is determined by the groupby column name.
        ylabel : `str` or None, default "Metric value"
            y-axis label.
        title : `str` or None, default None
            Plot title. If None, the default is based on ``which`` and the groupby column.
        showlegend : `bool`, default True
            Whether to show the legend.

        Returns
        -------
        fig : `plotly.graph_objects.Figure`
            Interactive plotly graph.
        """
        grouped_evaluation_df = self.get_grouping_evaluation_metrics(
            metric_dict=metric_dict,
            config_names=config_names,
            which=which,
            groupby_time_feature=groupby_time_feature,
            groupby_sliding_window_size=groupby_sliding_window_size,
            groupby_custom_column=groupby_custom_column)

        # Figures out the groupby_col name by process of elimination
        cols = [col for col in grouped_evaluation_df.columns if col not in ["config_name", "split_num"]]
        groupby_col = get_pattern_cols(cols, pos_pattern=".*", neg_pattern=which)[0]

        plot_df = (grouped_evaluation_df
                   .drop(columns=["split_num"])             # Drops redundant column
                   .groupby(["config_name", groupby_col])
                   .mean()                                  # Averages values across splits
                   .dropna(how="all")                       # Drops rows with all NA values
                   .unstack(level=0)                        # Moves config_name from multiindex rows to multiindex columns
                   .sort_index(axis=1)                      # Sorts on groupby_col to plot groups in logical order
                   )

        # Flattens and renames the multiindex columns
        cols = [groupby_col] + ["_".join(v) for v in plot_df.columns]
        plot_df = pd.DataFrame(plot_df.to_records())
        plot_df.columns = cols

        if xlabel is None:
            xlabel = groupby_col
        if title is None:
            title = f"{which} performance by {xlabel} across rolling windows"
        fig = plot_multivariate(
            df=plot_df,
            x_col=groupby_col,
            y_col_style_dict="plotly",
            xlabel=xlabel,
            ylabel=ylabel,
            title=title,
            showlegend=showlegend)
        return fig

    def get_runtimes(self, config_names: List = None):
        """Returns the splitwise rolling runtime in seconds for ``config_names``.

        Parameters
        ----------
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.

        Returns
        -------
        runtimes_df : `pandas.DataFrame`
            A DataFrame containing the splitwise runtime in seconds for ``config_names``.

            For example, let's assume::

                config_names = ["default_prophet", "custom_silverkite"]

            These are valid ``config_names`` and there are 2 splits for each. Then::

                runtimes_df =

                config_name        split_num  runtime_sec
                default_prophet        0          *
                default_prophet        1          *
                custom_silverkite      0          *
                custom_silverkite      1          *

            where * represents computed values.
        """
        if not self.is_run:
            raise ValueError("Please execute the 'run' method to obtain runtimes.")

        config_names = self.get_valid_config_names(config_names=config_names)

        runtimes_df = pd.DataFrame()
        for config_name in config_names:
            rolling_evaluation = self.result[config_name]["rolling_evaluation"]
            for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
                split_runtime_df = pd.DataFrame({
                    "config_name": config_name,
                    "split_num": num,
                    "runtime_sec": split_value["runtime_sec"]
                }, index=[num])
                runtimes_df = pd.concat([runtimes_df, split_runtime_df])

        return runtimes_df.reset_index(drop=True)

    def plot_runtimes(
            self,
            config_names: List = None,
            xlabel: str = None,
            ylabel: str = "Mean runtime in seconds",
            title: str = "Average runtime across rolling windows",
            showlegend: bool = True):
        """Returns a barplot of the runtimes of ``config_names``.

        Parameters
        ----------
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.
        xlabel : `str` or None, default None
            x-axis label.
        ylabel : `str` or None, default "Mean runtime in seconds"
            y-axis label.
        title : `str` or None, default "Average runtime across rolling windows"
            Plot title.
        showlegend : `bool`, default True
            Whether to show the legend.

        Returns
        -------
        fig : `plotly.graph_objects.Figure`
            Interactive plotly bar plot.
        """
        runtimes_df = self.get_runtimes(config_names=config_names)
        plot_df = runtimes_df.drop(columns=["split_num"]).groupby("config_name").mean()

        data = [go.Bar(x=plot_df.index, y=plot_df["runtime_sec"], name="Runtime")]
        layout = go.Layout(
            xaxis=dict(title=xlabel),
            yaxis=dict(title=ylabel),
            title=title,
            title_x=0.5,
            showlegend=showlegend,
        )
        fig = go.Figure(data=data, layout=layout)
        return fig

    def get_valid_config_names(self, config_names: List = None):
        """Validates ``config_names`` against the keys of ``configs``.
        Raises a ValueError in case of a mismatch.

        Parameters
        ----------
        config_names : `list` [`str`], default None
            Which config results to plot. A list of config names.
            If None, uses all the available config keys.

        Returns
        -------
        config_names : `list`
            List of valid config names.
        """
        available_config_names = list(self.configs.keys())
        if config_names is None:
            config_names = available_config_names
        else:
            missing_config_names = set(config_names) - set(available_config_names)
            if len(missing_config_names) > 0:
                raise ValueError(f"The following config keys are missing: {missing_config_names}.")

        return config_names

    @staticmethod
    def autocomplete_metric_dict(metric_dict, enum_class):
        """Sweeps through ``metric_dict``, converting members of ``enum_class`` to
        their corresponding evaluation function.

        For example::

            metric_dict = {
                "correlation": EvaluationMetricEnum.Correlation,
                "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
                "Q_95": EvaluationMetricEnum.Quantile95,
                "custom_metric": custom_function
            }

        is converted to::

            metric_dict = {
                "correlation": correlation(y_true, y_pred),
                "RMSE": root_mean_squared_error(y_true, y_pred),
                "Q_95": quantile_loss_q(y_true, y_pred, q=0.95),
                "custom_metric": custom_function
            }

        Parameters
        ----------
        metric_dict : `dict` [`str`, `callable`]
            Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
        enum_class : Enum
            The enum class that the ``metric_dict`` elements might be members of.
            It must have a method ``get_metric_func``.

        Returns
        -------
        updated_metric_dict : `dict`
            Autocompleted metric dict.
        """
        updated_metric_dict = {}
        for metric_name, metric_value in metric_dict.items():
            if isinstance(metric_value, enum_class):
                updated_metric_dict[metric_name] = metric_value.get_metric_func()
            else:
                if not callable(metric_value):
                    raise ValueError(f"Value of '{metric_name}' should be a callable "
                                     f"or a member of {enum_class}.")
                updated_metric_dict[metric_name] = add_finite_filter_to_scorer(metric_value)

        return updated_metric_dict
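
    # Illustrative behavior (editor's sketch): enum members are expanded to their metric
    # functions, while plain callables are wrapped with a finite-value filter.
    #   BenchmarkForecastConfig.autocomplete_metric_dict(
    #       metric_dict={
    #           "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
    #           "median_residual": lambda y_true, y_pred: np.median(y_true - y_pred)},
    #       enum_class=EvaluationMetricEnum)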

    def save(self):
        log_message("Benchmark save is not implemented yet.", LoggingLevelEnum.WARNING)

    def summary(self):
        log_message("Benchmark summary is not implemented yet.", LoggingLevelEnum.WARNING)