# BSD 2-CLAUSE LICENSE
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Sayan Patra
"""Class for benchmarking model templates."""
from typing import Dict
from typing import List
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS
from tqdm.autonotebook import tqdm
from greykite.common.constants import ACTUAL_COL
from greykite.common.constants import PREDICTED_COL
from greykite.common.constants import PREDICTED_LOWER_COL
from greykite.common.constants import PREDICTED_UPPER_COL
from greykite.common.constants import TIME_COL
from greykite.common.constants import VALUE_COL
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.common.evaluation import add_finite_filter_to_scorer
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.python_utils import get_pattern_cols
from greykite.common.viz.timeseries_plotting import plot_multivariate
from greykite.common.viz.timeseries_plotting import plot_multivariate_grouped
from greykite.framework.benchmark.benchmark_class_helper import forecast_pipeline_rolling_evaluation
from greykite.framework.constants import FORECAST_STEP_COL
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.forecaster import Forecaster
from greykite.sklearn.cross_validation import RollingTimeSeriesSplit
class BenchmarkForecastConfig:
"""Class for benchmarking multiple ForecastConfig on a rolling window basis.
Attributes
----------
df : `pandas.DataFrame`
Timeseries data to forecast.
Contains columns [`time_col`, `value_col`], and optional regressor columns.
Regressor columns should include future values for prediction.
configs : `Dict` [`str`, `ForecastConfig`]
Dictionary of model configurations.
A model configuration is a ``ForecastConfig``.
See :class:`~greykite.framework.templates.autogen.forecast_config.ForecastConfig` for details on
valid ``ForecastConfig``.
Validity of the ``configs`` for benchmarking is checked via the ``validate`` method.
tscv : `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit`
Cross-validation object that determines the rolling window evaluation.
See :class:`~greykite.sklearn.cross_validation.RollingTimeSeriesSplit` for details.
The ``forecast_horizon`` and ``periods_between_train_test`` parameters of ``configs`` are
matched against that of ``tscv``. A ValueError is raised if there is a mismatch.
forecaster : `~greykite.framework.templates.forecaster.Forecaster`
Forecaster used to create the forecasts.
is_run : bool, default False
Indicator of whether the `run` method is executed.
After executing `run`, this indicator is set to True.
        Some class methods, such as ``get_forecast``, require ``is_run`` to be True
to be executed.
result : `dict`
Stores the benchmarking results. Has the same keys as ``configs``.
forecasts : `pandas.DataFrame`, default None
Merged DataFrame of forecasts, upper and lower confidence interval for all
input ``configs``. Also stores train end date and forecast step for each prediction.
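
    A minimal usage sketch (assumes ``df`` holds the input timeseries; the template
    name and split parameters below are placeholders, not recommended defaults)::

        from greykite.framework.templates.autogen.forecast_config import ForecastConfig
        from greykite.sklearn.cross_validation import RollingTimeSeriesSplit

        # Rolling window evaluation with up to 3 splits, each forecasting 7 periods ahead
        tscv = RollingTimeSeriesSplit(forecast_horizon=7, max_splits=3)
        configs = {
            "silverkite_default": ForecastConfig(
                model_template="SILVERKITE",
                forecast_horizon=7)
        }
        bm = BenchmarkForecastConfig(df=df, configs=configs, tscv=tscv)
        bm.run()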
"""
def __init__(
self,
df: pd.DataFrame,
configs: Dict[str, ForecastConfig],
tscv: RollingTimeSeriesSplit,
forecaster: Forecaster = Forecaster()):
self.df = df
self.configs = configs
self.tscv = tscv
self.forecaster = forecaster
self.is_run = False
# output
self.result = dict.fromkeys(configs.keys())
self.forecasts = None
    def validate(self):
"""Validates the inputs to the class for the method ``run``.
Raises a ValueError if there is a mismatch between the following parameters
of ``configs`` and ``tscv``:
- ``forecast_horizon``
- ``periods_between_train_test``
        Raises a ValueError if the ``configs`` do not all have the same ``coverage`` parameter.
"""
coverage_list = []
for config_name, config in self.configs.items():
# Checks forecast_horizon
if config.forecast_horizon != self.tscv.forecast_horizon:
raise ValueError(f"{config_name}'s 'forecast_horizon' ({config.forecast_horizon}) does "
f"not match that of 'tscv' ({self.tscv.forecast_horizon}).")
# Checks periods_between_train_test
if config.evaluation_period_param.periods_between_train_test != self.tscv.periods_between_train_test:
raise ValueError(f"{config_name}'s 'periods_between_train_test' ({config.evaluation_period_param.periods_between_train_test}) "
f"does not match that of 'tscv' ({self.tscv.periods_between_train_test}).")
coverage_list.append(config.coverage)
# Computes pipeline parameters
pipeline_params = self.forecaster.apply_forecast_config(
df=self.df,
config=config)
self.result[config_name] = dict(pipeline_params=pipeline_params)
        # Checks that all coverages are the same
        if coverage_list[1:] != coverage_list[:-1]:
            raise ValueError("All forecast configs must have the same coverage.")
    def run(self):
"""Runs every config and stores the output of the
:func:`~greykite.framework.pipeline.pipeline.forecast_pipeline`.
This function runs only if the ``configs`` and ``tscv`` are jointly valid.
Returns
-------
self : Returns self. Stores pipeline output of every config in ``self.result``.
"""
self.validate()
with tqdm(self.result.items(), ncols=800, leave=True) as progress_bar:
for (config_name, config) in progress_bar:
# Description will be displayed on the left of progress bar
progress_bar.set_description(f"Benchmarking '{config_name}' ")
rolling_evaluation = forecast_pipeline_rolling_evaluation(
pipeline_params=config["pipeline_params"],
tscv=self.tscv)
config["rolling_evaluation"] = rolling_evaluation
self.is_run = True
    def plot_forecasts_by_step(
self,
forecast_step: int,
config_names: List = None,
xlabel: str = TIME_COL,
ylabel: str = VALUE_COL,
title: str = None,
showlegend: bool = True):
"""Returns a ``forecast_step`` ahead rolling forecast plot.
        The plot consists of one line for each valid ``config_names``.
If available, the corresponding actual values are also plotted.
For a more customizable plot, see
:func:`~greykite.common.viz.timeseries_plotting.plot_multivariate`
Parameters
----------
forecast_step : `int`
Which forecast step to plot. A forecast step is an integer between 1 and the
forecast horizon, inclusive, indicating the number of periods from train end date
to the prediction date (# steps ahead).
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
xlabel : `str` or None, default TIME_COL
x-axis label.
ylabel : `str` or None, default VALUE_COL
y-axis label.
title : `str` or None, default None
Plot title. If None, default is based on ``forecast_step``.
showlegend : `bool`, default True
Whether to show the legend.
Returns
-------
fig : `plotly.graph_objects.Figure`
Interactive plotly graph.
Plots multiple column(s) in ``self.forecasts`` against ``TIME_COL``.
See `~greykite.common.viz.timeseries_plotting.plot_forecast_vs_actual`
return value for how to plot the figure and add customization.
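
        For example, a sketch of a typical call (``bm`` denotes an instance of this
        class on which ``run`` has completed)::

            fig = bm.plot_forecasts_by_step(forecast_step=3)
            fig.show()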
"""
if self.forecasts is None:
self.extract_forecasts()
if forecast_step > self.tscv.forecast_horizon:
raise ValueError(f"`forecast_step` ({forecast_step}) must be less than or equal to "
f"forecast horizon ({self.tscv.forecast_horizon}).")
config_names = self.get_valid_config_names(config_names)
y_cols = [TIME_COL, ACTUAL_COL] + \
[f"{config_name}_{PREDICTED_COL}" for config_name in config_names]
df = self.forecasts[self.forecasts[FORECAST_STEP_COL] == forecast_step]
df = df[y_cols]
if title is None:
title = f"{forecast_step}-step ahead rolling forecasts"
fig = plot_multivariate(
df=df,
x_col=TIME_COL,
y_col_style_dict="plotly",
xlabel=xlabel,
ylabel=ylabel,
title=title,
showlegend=showlegend)
return fig
    def plot_forecasts_by_config(
self,
config_name: str,
colors: List = DEFAULT_PLOTLY_COLORS,
xlabel: str = TIME_COL,
ylabel: str = VALUE_COL,
title: str = None,
showlegend: bool = True):
"""Returns a rolling plot of the forecasts by ``config_name`` against ``TIME_COL``.
        The plot consists of one line for each available split. Some lines may overlap if the
        test periods of the corresponding splits intersect, hence every line is given a different color.
If available, the corresponding actual values are also plotted.
For a more customizable plot, see
:func:`~greykite.common.viz.timeseries_plotting.plot_multivariate_grouped`
Parameters
----------
config_name : `str`
Which config result to plot.
The name must match the name of one of the input ``configs``.
colors : [`str`, `List` [`str`]], default ``DEFAULT_PLOTLY_COLORS``
Which colors to use to build the color palette.
This can be a list of RGB colors or a `str` from ``PLOTLY_SCALES``.
To use a single color for all lines, pass a `List` with a single color.
xlabel : `str` or None, default TIME_COL
x-axis label.
ylabel : `str` or None, default VALUE_COL
y-axis label.
title : `str` or None, default None
Plot title. If None, default is based on ``config_name``.
showlegend : `bool`, default True
Whether to show the legend.
Returns
-------
fig : `plotly.graph_objects.Figure`
Interactive plotly graph.
Plots multiple column(s) in ``self.forecasts`` against ``TIME_COL``.
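
        For example, a sketch of a typical call (``bm`` is an instance of this class on
        which ``run`` has completed; "silverkite_default" stands for one of the input
        config keys)::

            fig = bm.plot_forecasts_by_config(config_name="silverkite_default")
            fig.show()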
"""
if self.forecasts is None:
self.extract_forecasts()
config_name = self.get_valid_config_names([config_name])[0]
if title is None:
title = f"Rolling forecast for {config_name}"
fig = plot_multivariate_grouped(
df=self.forecasts,
x_col=TIME_COL,
y_col_style_dict={
ACTUAL_COL: {
"line": {
"width": 1,
"dash": "solid"
}
}
},
grouping_x_col="split_num",
grouping_x_col_values=None,
grouping_y_col_style_dict={
f"{config_name}_{PREDICTED_COL}": {
"name": "split",
"line": {
"width": 1,
"dash": "solid"
}
}
},
colors=colors,
xlabel=xlabel,
ylabel=ylabel,
title=title,
showlegend=showlegend)
return fig
    def get_evaluation_metrics(
self,
metric_dict: Dict,
config_names: List = None):
"""Returns rolling train and test evaluation metric values.
Parameters
----------
metric_dict : `dict` [`str`, `callable`]
Evaluation metrics to compute.
- key: evaluation metric name, used to create column name in output.
- value: metric function to apply to forecast df in each split to generate the column value.
              Signature (y_true: `pandas.Series`, y_pred: `pandas.Series`) -> metric value: `float`.
For example::
metric_dict = {
"median_residual": lambda y_true, y_pred: np.median(y_true - y_pred),
"mean_squared_error": lambda y_true, y_pred: np.mean((y_true - y_pred)**2)
}
Some predefined functions are available in
`~greykite.common.evaluation`. For example::
metric_dict = {
"correlation": lambda y_true, y_pred: correlation(y_true, y_pred),
"RMSE": lambda y_true, y_pred: root_mean_squared_error(y_true, y_pred),
"Q_95": lambda y_true, y_pred: partial(quantile_loss(y_true, y_pred, q=0.95))
}
As shorthand, it is sufficient to provide the corresponding ``EvaluationMetricEnum``
member. These are auto-expanded into the appropriate function.
So the following is equivalent::
metric_dict = {
"correlation": EvaluationMetricEnum.Correlation,
"RMSE": EvaluationMetricEnum.RootMeanSquaredError,
"Q_95": EvaluationMetricEnum.Quantile95
}
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
Returns
-------
        evaluation_metrics_df : `pandas.DataFrame`
            A DataFrame containing splitwise train and test evaluation metrics for ``metric_dict``
            and ``config_names``.
            For example, let's assume::

                metric_dict = {
                    "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
                    "Q_95": EvaluationMetricEnum.Quantile95
                }

                config_names = ["default_prophet", "custom_silverkite"]

            These are valid ``config_names`` and there are 2 splits for each.
            Then ``evaluation_metrics_df`` is::

                config_name        split_num  train_RMSE  test_RMSE  train_Q_95  test_Q_95
                default_prophet    0          *           *          *           *
                default_prophet    1          *           *          *           *
                custom_silverkite  0          *           *          *           *
                custom_silverkite  1          *           *          *           *

            where * represents computed values.
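
        A sketch of a typical call (``bm`` is an instance of this class on which
        ``run`` has completed)::

            metrics_df = bm.get_evaluation_metrics(
                metric_dict={"RMSE": EvaluationMetricEnum.RootMeanSquaredError})
            # Average test RMSE of each config across splits
            print(metrics_df.groupby("config_name")["test_RMSE"].mean())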
"""
if not self.is_run:
raise ValueError("Please execute the 'run' method before computing evaluation metrics.")
metric_dict = self.autocomplete_metric_dict(
metric_dict=metric_dict,
enum_class=EvaluationMetricEnum)
config_names = self.get_valid_config_names(config_names=config_names)
evaluation_metrics_df = pd.DataFrame()
for config_name in config_names:
rolling_evaluation = self.result[config_name]["rolling_evaluation"]
for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
forecast = split_value["pipeline_result"].forecast
split_metrics = {
"config_name": config_name,
"split_num": num}
# Updates train metrics
df_train = forecast.df_train
split_metrics.update({
f"train_{metric_name}": metric_func(
df_train[forecast.actual_col],
df_train[forecast.predicted_col]
) for metric_name, metric_func in metric_dict.items()
})
# Updates test metrics
df_test = forecast.df_test
if df_test.shape[0] > 0 and forecast.test_na_count < df_test.shape[0]:
split_metrics.update({
f"test_{metric_name}": metric_func(
df_test[forecast.actual_col],
df_test[forecast.predicted_col]
) for metric_name, metric_func in metric_dict.items()
})
else:
split_metrics.update({
f"test_{metric_name}": np.nan
for metric_name, metric_func in metric_dict.items()
})
split_metrics_df = pd.DataFrame(split_metrics, index=[num])
evaluation_metrics_df = pd.concat([evaluation_metrics_df, split_metrics_df])
# Resets index and fills missing values (e.g. when correlation is not defined) with np.nan
evaluation_metrics_df = evaluation_metrics_df.reset_index(drop=True).fillna(value=np.nan)
temp_df = evaluation_metrics_df.copy()
# Rearranges columns so that train and test error of a config are side by side
evaluation_metrics_df = pd.DataFrame()
evaluation_metrics_df["config_name"] = temp_df["config_name"]
evaluation_metrics_df["split_num"] = temp_df["split_num"]
for metric_name in metric_dict.keys():
evaluation_metrics_df[f"train_{metric_name}"] = temp_df[f"train_{metric_name}"]
evaluation_metrics_df[f"test_{metric_name}"] = temp_df[f"test_{metric_name}"]
return evaluation_metrics_df
    def plot_evaluation_metrics(
self,
metric_dict: Dict,
config_names: List = None,
xlabel: str = None,
ylabel: str = "Metric value",
title: str = None,
showlegend: bool = True):
"""Returns a barplot of the train and test values of ``metric_dict`` of ``config_names``.
        The values of a metric for all ``config_names`` are plotted as a grouped bar.
        Train and test values of a metric are plotted side by side for easy comparison.
Parameters
----------
metric_dict : `dict` [`str`, `callable`]
Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
            For the best visualization, keep the number of metrics <= 2.
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
xlabel : `str` or None, default None
x-axis label.
ylabel : `str` or None, default "Metric value"
y-axis label.
title : `str` or None, default None
Plot title.
showlegend : `bool`, default True
Whether to show the legend.
Returns
-------
fig : `plotly.graph_objects.Figure`
Interactive plotly bar plot.
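
        A sketch of a typical call (``bm`` is an instance of this class on which
        ``run`` has completed)::

            fig = bm.plot_evaluation_metrics(
                metric_dict={"MAPE": EvaluationMetricEnum.MeanAbsolutePercentError})
            fig.show()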
"""
evaluation_metrics_df = self.get_evaluation_metrics(
metric_dict=metric_dict,
config_names=config_names)
        # Averages each metric across splits, grouped by config name
evaluation_metrics_df = (evaluation_metrics_df
.drop(columns=["split_num"])
.groupby("config_name")
.mean()
.dropna(how="all"))
# Rearranges columns so that train and test error of a config are side by side
plot_df = pd.DataFrame()
for metric_name in metric_dict.keys():
plot_df[f"train_{metric_name}"] = evaluation_metrics_df[f"train_{metric_name}"]
plot_df[f"test_{metric_name}"] = evaluation_metrics_df[f"test_{metric_name}"]
if title is None:
title = "Average evaluation metric across rolling windows"
data = []
# Each row (index) is a config. Adds each row to the bar plot.
for index in plot_df.index:
data.append(
go.Bar(
name=index,
x=plot_df.columns,
y=plot_df.loc[index].values
)
)
layout = go.Layout(
xaxis=dict(title=xlabel),
yaxis=dict(title=ylabel),
title=title,
title_x=0.5,
showlegend=showlegend,
barmode="group",
)
fig = go.Figure(data=data, layout=layout)
return fig
    def get_grouping_evaluation_metrics(
self,
metric_dict: Dict,
config_names: List = None,
which: str = "train",
groupby_time_feature: str = None,
groupby_sliding_window_size: int = None,
groupby_custom_column: pd.Series = None):
"""Returns splitwise rolling evaluation metric values.
These values are grouped by the grouping method chosen by ``groupby_time_feature``,
``groupby_sliding_window_size`` and ``groupby_custom_column``.
See `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
for details on grouping method.
Parameters
----------
metric_dict : `dict` [`str`, `callable`]
Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
        which : `str`
"train" or "test". Which dataset to evaluate.
groupby_time_feature : `str` or None, default None
If provided, groups by a column generated by
`~greykite.common.features.timeseries_features.build_time_features_df`.
See that function for valid values.
groupby_sliding_window_size : `int` or None, default None
If provided, sequentially partitions data into groups of size
``groupby_sliding_window_size``.
groupby_custom_column : `pandas.Series` or None, default None
If provided, groups by this column value. Should be same length as the DataFrame.
Returns
-------
grouped_evaluation_df : `pandas.DataFrame`
A DataFrame containing splitwise train and test evaluation metrics for ``metric_dict``
and ``config_names``. The evaluation metrics are grouped by the grouping method.
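
        For example, to compare test error by day of week (a sketch; ``bm`` is an
        instance of this class on which ``run`` has completed, and "dow" is one of the
        time features generated by ``build_time_features_df``)::

            grouped_df = bm.get_grouping_evaluation_metrics(
                metric_dict={"MAPE": EvaluationMetricEnum.MeanAbsolutePercentError},
                which="test",
                groupby_time_feature="dow")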
"""
if not self.is_run:
raise ValueError("Please execute the 'run' method before computing "
"grouped evaluation metrics.")
metric_dict = self.autocomplete_metric_dict(
metric_dict=metric_dict,
enum_class=EvaluationMetricEnum)
config_names = self.get_valid_config_names(config_names=config_names)
grouped_evaluation_df = pd.DataFrame()
for config_name in config_names:
rolling_evaluation = self.result[config_name]["rolling_evaluation"]
for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
forecast = split_value["pipeline_result"].forecast
split_evaluation_df = pd.DataFrame()
for metric_name, metric_func in metric_dict.items():
grouped_df = forecast.get_grouping_evaluation(
score_func=metric_func,
score_func_name=metric_name,
which=which,
groupby_time_feature=groupby_time_feature,
groupby_sliding_window_size=groupby_sliding_window_size,
groupby_custom_column=groupby_custom_column)
# Adds grouped_df to split_evaluation_df, handling the case if split_evaluation_df is empty
# If the actual values are missing, grouped_df.shape[0] might be 0
if grouped_df.shape[0] > 0:
if split_evaluation_df.empty:
split_evaluation_df = grouped_df
else:
groupby_col = split_evaluation_df.columns[0]
split_evaluation_df = pd.merge(split_evaluation_df, grouped_df, on=groupby_col)
else:
# This column name is the same as that obtained from
# `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
split_evaluation_df[f"{which} {metric_name}"] = np.nan
split_evaluation_df.insert(0, "config_name", config_name)
split_evaluation_df.insert(1, "split_num", num)
grouped_evaluation_df = pd.concat([grouped_evaluation_df, split_evaluation_df])
grouped_evaluation_df = grouped_evaluation_df.reset_index(drop=True)
return grouped_evaluation_df
    def plot_grouping_evaluation_metrics(
self,
metric_dict: Dict,
config_names: List = None,
which: str = "train",
groupby_time_feature: str = None,
groupby_sliding_window_size: int = None,
groupby_custom_column: pd.Series = None,
xlabel=None,
ylabel="Metric value",
title=None,
showlegend=True):
"""Returns a line plot of the grouped evaluation values of ``metric_dict`` of ``config_names``.
These values are grouped by the grouping method chosen by ``groupby_time_feature``,
``groupby_sliding_window_size`` and ``groupby_custom_column``.
See `~greykite.framework.output.univariate_forecast.UnivariateForecast.get_grouping_evaluation`
for details on grouping method.
Parameters
----------
metric_dict : `dict` [`str`, `callable`]
Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
            For the best visualization, keep the number of metrics <= 2.
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
        which : `str`
"train" or "test". Which dataset to evaluate.
groupby_time_feature : `str` or None, optional
If provided, groups by a column generated by
`~greykite.common.features.timeseries_features.build_time_features_df`.
See that function for valid values.
groupby_sliding_window_size : `int` or None, optional
If provided, sequentially partitions data into groups of size
``groupby_sliding_window_size``.
groupby_custom_column : `pandas.Series` or None, optional
If provided, groups by this column value. Should be same length as the DataFrame.
xlabel : `str` or None, default None
x-axis label. If None, label is determined by the groupby column name.
ylabel : `str` or None, default "Metric value"
y-axis label.
title : `str` or None, default None
            Plot title. If None, default is based on ``which`` and the groupby column name.
showlegend : `bool`, default True
Whether to show the legend.
Returns
-------
fig : `plotly.graph_objects.Figure`
Interactive plotly graph.
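
        A sketch of a typical call, under the same assumptions as
        `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_grouping_evaluation_metrics`::

            fig = bm.plot_grouping_evaluation_metrics(
                metric_dict={"MAPE": EvaluationMetricEnum.MeanAbsolutePercentError},
                which="test",
                groupby_time_feature="dow")
            fig.show()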
"""
grouped_evaluation_df = self.get_grouping_evaluation_metrics(
metric_dict=metric_dict,
config_names=config_names,
which=which,
groupby_time_feature=groupby_time_feature,
groupby_sliding_window_size=groupby_sliding_window_size,
groupby_custom_column=groupby_custom_column)
# Figures out groupby_col name by process of elimination
cols = [col for col in grouped_evaluation_df.columns if col not in ["config_name", "split_num"]]
groupby_col = get_pattern_cols(cols, pos_pattern=".*", neg_pattern=which)[0]
plot_df = (grouped_evaluation_df
.drop(columns=["split_num"]) # Drops redundant column
.groupby(["config_name", groupby_col]) # Averages values across splits
.mean()
.dropna(how="all") # Drops rows with all NA values
.unstack(level=0) # Moves config_name from multiindex rows to multiindex columns
.sort_index(axis=1) # Sorts on groupby_col to plot groups in logical order
)
# Flattens and renames multiindex columns
cols = [groupby_col] + ["_".join(v) for v in plot_df.columns]
plot_df = pd.DataFrame(plot_df.to_records())
plot_df.columns = cols
if xlabel is None:
xlabel = groupby_col
if title is None:
title = f"{which} performance by {xlabel} across rolling windows"
fig = plot_multivariate(
df=plot_df,
x_col=groupby_col,
y_col_style_dict="plotly",
xlabel=xlabel,
ylabel=ylabel,
title=title,
showlegend=showlegend)
return fig
    def get_runtimes(self, config_names: List = None):
"""Returns rolling average runtime in seconds for ``config_names``.
Parameters
----------
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
Returns
-------
        runtimes_df : `pandas.DataFrame`
            A DataFrame containing splitwise runtime in seconds for ``config_names``.
            For example, let's assume::

                config_names = ["default_prophet", "custom_silverkite"]

            These are valid ``config_names`` and there are 2 splits for each.
            Then ``runtimes_df`` is::

                config_name        split_num  runtime_sec
                default_prophet    0          *
                default_prophet    1          *
                custom_silverkite  0          *
                custom_silverkite  1          *

            where * represents computed values.
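
        A sketch of a typical call (``bm`` is an instance of this class on which
        ``run`` has completed)::

            runtimes_df = bm.get_runtimes()
            # Mean runtime of each config across splits
            print(runtimes_df.groupby("config_name")["runtime_sec"].mean())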
"""
if not self.is_run:
raise ValueError("Please execute the 'run' method to obtain runtimes.")
config_names = self.get_valid_config_names(config_names=config_names)
runtimes_df = pd.DataFrame()
for config_name in config_names:
rolling_evaluation = self.result[config_name]["rolling_evaluation"]
for num, (split_key, split_value) in enumerate(rolling_evaluation.items()):
split_runtime_df = pd.DataFrame({
"config_name": config_name,
"split_num": num,
"runtime_sec": split_value["runtime_sec"]
}, index=[num])
runtimes_df = pd.concat([runtimes_df, split_runtime_df])
return runtimes_df.reset_index(drop=True)
    def plot_runtimes(
self,
config_names: List = None,
xlabel: str = None,
ylabel: str = "Mean runtime in seconds",
title: str = "Average runtime across rolling windows",
showlegend: bool = True):
"""Returns a barplot of the runtimes of ``config_names``.
Parameters
----------
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
xlabel : `str` or None, default None
x-axis label.
ylabel : `str` or None, default "Mean runtime in seconds"
y-axis label.
title : `str` or None, default "Average runtime across rolling windows"
Plot title.
showlegend : `bool`, default True
Whether to show the legend.
Returns
-------
fig : `plotly.graph_objects.Figure`
Interactive plotly bar plot.
"""
runtimes_df = self.get_runtimes(config_names=config_names)
plot_df = runtimes_df.drop(columns=["split_num"]).groupby("config_name").mean()
data = [go.Bar(x=plot_df.index, y=plot_df["runtime_sec"], name="Runtime")]
layout = go.Layout(
xaxis=dict(title=xlabel),
yaxis=dict(title=ylabel),
title=title,
title_x=0.5,
showlegend=showlegend,
)
fig = go.Figure(data=data, layout=layout)
return fig
    def get_valid_config_names(self, config_names: List = None):
"""Validate ``config_names`` against keys of ``configs``.
Raises a ValueError in case of a mismatch.
Parameters
----------
config_names : `list` [`str`], default None
Which config results to plot. A list of config names.
If None, uses all the available config keys.
Returns
-------
config_names : `list`
List of valid config names.
"""
available_config_names = list(self.configs.keys())
if config_names is None:
config_names = available_config_names
else:
missing_config_names = set(config_names) - set(available_config_names)
if len(missing_config_names) > 0:
raise ValueError(f"The following config keys are missing: {missing_config_names}.")
return config_names
    @staticmethod
def autocomplete_metric_dict(metric_dict, enum_class):
"""Sweeps through ``metric_dict``, converting members of ``enum_class`` to
their corresponding evaluation function.
        For example::

            metric_dict = {
                "correlation": EvaluationMetricEnum.Correlation,
                "RMSE": EvaluationMetricEnum.RootMeanSquaredError,
                "Q_95": EvaluationMetricEnum.Quantile95,
                "custom_metric": custom_function
            }

        is converted to::

            metric_dict = {
                "correlation": correlation(y_true, y_pred),
                "RMSE": root_mean_squared_error(y_true, y_pred),
                "Q_95": quantile_loss_q(y_true, y_pred, q=0.95),
                "custom_metric": custom_function
            }
Parameters
----------
metric_dict : `dict` [`str`, `callable`]
Evaluation metrics to compute. Same as
            `~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig.get_evaluation_metrics`.
enum_class : Enum
The enum class ``metric_dict`` elements might be member of.
It must have a method ``get_metric_func``.
Returns
-------
updated_metric_dict : `dict`
Autocompleted metric dict.
"""
updated_metric_dict = {}
for metric_name, metric_value in metric_dict.items():
if isinstance(metric_value, enum_class):
updated_metric_dict[metric_name] = metric_value.get_metric_func()
            else:
                if not callable(metric_value):
                    raise ValueError(f"Value of '{metric_name}' should be a callable or a member of {enum_class}.")
                updated_metric_dict[metric_name] = add_finite_filter_to_scorer(metric_value)
return updated_metric_dict
def save(self):
log_message("Benchmark save is not implemented yet.", LoggingLevelEnum.WARNING)
def summary(self):
log_message("Benchmark summary is not implemented yet.", LoggingLevelEnum.WARNING)