Interpretability

Silverkite generates easily interpretable forecasting models when using its default ML algorithms (e.g. Ridge). This is because, after transforming the raw features into basis functions (the transformed features), the model has an additive structure. Silverkite can therefore break each forecast down into summable components, e.g. long-term growth, seasonality, holidays and events, short-term effects (auto-regression), regressor impact, etc.

The approach to generate these breakdowns consists of two steps:

  1. Group the transformed variables into various meaningful groups.

  2. Calculate the sum of the features multiplied by their regression coefficients within each group.

These breakdowns then can be used to answer questions such as:

  • Question 1: How is the forecast value generated?

  • Question 2: What is driving the change of the forecast as new data comes in?

Forecast components can also help us analyze model behavior and sensitivity. While it is not feasible to compare a large set of individual features across two model settings, it is quite practical and informative to compare a few well-defined components.
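To make the mechanics concrete, here is a minimal, self-contained sketch of the two steps above, using a toy design matrix with made-up feature names and coefficients. It only illustrates the idea of coefficient-weighted column sums per group; it is not the Greykite API.

 # Toy illustration of a forecast breakdown: group the transformed features,
 # then sum coefficient * feature within each group.
 import pandas as pd

 x_mat = pd.DataFrame({                      # toy design matrix (transformed features)
     "ct1": [0.1, 0.2, 0.3],                 # growth term
     "sin1_tow_weekly": [0.5, -0.5, 0.0],    # weekly seasonality term
     "y_lag1": [0.9, 1.1, 1.0]})             # autoregressive lag
 beta = pd.Series({"ct1": 2.0, "sin1_tow_weekly": -1.0, "y_lag1": 0.5})

 groups = {                                  # step 1: group the transformed features
     "trend": ["ct1"],
     "seasonality": ["sin1_tow_weekly"],
     "AR": ["y_lag1"]}

 # step 2: within each group, sum the features multiplied by their coefficients
 components = pd.DataFrame({
     name: (x_mat[cols] * beta[cols]).sum(axis=1)
     for name, cols in groups.items()})
 print(components)
 print(components.sum(axis=1))               # adding the components recovers x_mat @ beta

Because the model is linear in the transformed features, the components add back up to the model's prediction, which is exactly what makes the breakdown interpretable.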

 # required imports
 import plotly
 import warnings
 import pandas as pd
 from greykite.framework.benchmark.data_loader_ts import DataLoaderTS
 from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam
 from greykite.framework.templates.autogen.forecast_config import ForecastConfig
 from greykite.framework.templates.autogen.forecast_config import MetadataParam
 from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
 from greykite.framework.templates.forecaster import Forecaster
 from greykite.framework.templates.model_templates import ModelTemplateEnum
 from greykite.framework.utils.result_summary import summarize_grid_search_results
 from greykite.common.viz.timeseries_plotting import plot_multivariate
 warnings.filterwarnings("ignore")

Function to load and prepare data

This is the code to load and prepare the daily bike-sharing data from Washington, D.C.

 def prepare_bikesharing_data():
     """Loads bike-sharing data and adds proper regressors."""
     dl = DataLoaderTS()
     agg_func = {"count": "sum", "tmin": "mean", "tmax": "mean", "pn": "mean"}
     df = dl.load_bikesharing(agg_freq="daily", agg_func=agg_func)

     # There are some zero values, which cause issues for MAPE
     # Adding a small number to all values avoids that issue
     value_col = "count"
     df[value_col] += 10
     # The last day is dropped because it may be incomplete (the original data is hourly)
     df.drop(df.tail(1).index, inplace=True)
     # We only use data from 2018 for demonstration purposes (run time is shorter)
     df = df.loc[df["ts"] > "2018-01-01"]
     df.reset_index(drop=True, inplace=True)

     print(f"\n df.tail(): \n {df.tail()}")

     # Creates useful regressors from existing raw regressors
     df["bin_pn"] = (df["pn"] > 5).map(float)
     df["bin_heavy_pn"] = (df["pn"] > 20).map(float)
     df.columns = [
         "ts",
         value_col,
         "regressor_tmin",
         "regressor_tmax",
         "regressor_pn",
         "regressor_bin_pn",
         "regressor_bin_heavy_pn"]

     forecast_horizon = 7
     train_df = df.copy()
     test_df = df.tail(forecast_horizon).reset_index(drop=True)
     # When using the pipeline (as done in ``fit_forecast`` below),
     # fitting and prediction are done in one step.
     # Therefore, for demonstration purposes, we remove the response values of the last 7 days.
     # This is needed because we are using regressors,
     # and future regressor data must be appended to ``df``.
     # We mimic that by removing the response values for the forecast period.
     train_df.loc[(len(train_df) - forecast_horizon):, value_col] = None

     print(f"train_df shape: \n {train_df.shape}")
     print(f"test_df shape: \n {test_df.shape}")
     print(f"train_df.tail(14): \n {train_df.tail(14)}")
     print(f"test_df: \n {test_df}")

     return {
         "train_df": train_df,
         "test_df": test_df}

Function to fit Silverkite

This is the code for fitting a Silverkite model to the data.

 def fit_forecast(
         df,
         time_col,
         value_col,
         forecast_horizon):
     """Fits a daily model for this use case.
     The daily model is a generic Silverkite model with regressors."""

     meta_data_params = MetadataParam(
         time_col=time_col,
         value_col=value_col,
         freq="D",
     )

     # Autoregression configuration to be used in the model
     autoregression = {
         "autoreg_dict": {
             "lag_dict": {"orders": [1, 2, 3]},
             "agg_lag_dict": {
                 "orders_list": [[7, 7*2, 7*3]],
                 "interval_list": [(1, 7), (8, 7*2)]},
             "series_na_fill_func": lambda s: s.bfill().ffill()},
         # ``fast_simulation`` speeds up simulation-based forecasts when autoregression is used
         "fast_simulation": True
     }

     # Changepoints configuration
     # The config includes changepoints both in trend and seasonality
     changepoints = {
         "changepoints_dict": {
             "method": "auto",
             "yearly_seasonality_order": 15,
             "resample_freq": "2D",
             "actual_changepoint_min_distance": "100D",
             "potential_changepoint_distance": "50D",
             "no_changepoint_distance_from_end": "50D"},
         "seasonality_changepoints_dict": {
             "method": "auto",
             "yearly_seasonality_order": 15,
             "resample_freq": "2D",
             "actual_changepoint_min_distance": "100D",
             "potential_changepoint_distance": "50D",
             "no_changepoint_distance_from_end": "50D"}
         }

     regressor_cols = [
         "regressor_tmin",
         "regressor_bin_pn",
         "regressor_bin_heavy_pn",
     ]

     # Model parameters
     model_components = ModelComponentsParam(
         growth=dict(growth_term="linear"),
         seasonality=dict(
             yearly_seasonality=[15],
             quarterly_seasonality=[False],
             monthly_seasonality=[False],
             weekly_seasonality=[7],
             daily_seasonality=[False]
         ),
         custom=dict(
             fit_algorithm_dict=dict(fit_algorithm="ridge"),
             extra_pred_cols=None,
             normalize_method="statistical"
         ),
         regressors=dict(regressor_cols=regressor_cols),
         autoregression=autoregression,
         uncertainty=dict(uncertainty_dict=None),
         events=dict(holiday_lookup_countries=["US"]),
         changepoints=changepoints
      )

     # Evaluation is done on the same ``forecast_horizon`` as desired for the output
     evaluation_period_param = EvaluationPeriodParam(
         test_horizon=None,
         cv_horizon=forecast_horizon,
         cv_min_train_periods=365*2,
         cv_expanding_window=True,
         cv_use_most_recent_splits=False,
         cv_periods_between_splits=None,
         cv_periods_between_train_test=0,
         cv_max_splits=5,
     )

     # Runs the forecast model using "SILVERKITE" template
     forecaster = Forecaster()
     result = forecaster.run_forecast_config(
         df=df,
         config=ForecastConfig(
             model_template=ModelTemplateEnum.SILVERKITE.name,
             coverage=0.95,
             forecast_horizon=forecast_horizon,
             metadata_param=meta_data_params,
             evaluation_period_param=evaluation_period_param,
             model_components_param=model_components
         )
     )

     # Gets cross-validation results
     grid_search = result.grid_search
     cv_results = summarize_grid_search_results(
         grid_search=grid_search,
         decimals=2,
         cv_report_metrics=None)
     cv_results = cv_results.transpose()
     cv_results = pd.DataFrame(cv_results)
     cv_results.columns = ["err_value"]
     cv_results["err_name"] = cv_results.index
     cv_results = cv_results.reset_index(drop=True)
     cv_results = cv_results[["err_name", "err_value"]]

     print(f"\n cv_results: \n {cv_results}")

     return result

Loads and prepares data

The data is loaded and some information about the input data is printed. We use the number of daily rented bikes in Washington DC over time. The data is augmented with weather data (precipitation, min/max daily temperature).

 data = prepare_bikesharing_data()

Out:

 df.tail():
             ts  count  tmin  tmax   pn
602 2019-08-27  12216  17.2  26.7  0.0
603 2019-08-28  11401  18.3  27.8  0.0
604 2019-08-29  12685  16.7  28.9  0.0
605 2019-08-30  12097  14.4  32.8  0.0
606 2019-08-31  11281  17.8  31.1  0.0
train_df shape:
 (607, 7)
test_df shape:
 (7, 7)
train_df.tail(14):
             ts    count  regressor_tmin  regressor_tmax  regressor_pn  regressor_bin_pn  regressor_bin_heavy_pn
593 2019-08-18   9655.0            22.2            35.6           0.3               0.0                     0.0
594 2019-08-19  10579.0            21.1            37.2           0.0               0.0                     0.0
595 2019-08-20   8898.0            22.2            36.1           0.0               0.0                     0.0
596 2019-08-21  11648.0            21.7            35.0           1.8               0.0                     0.0
597 2019-08-22  11724.0            21.7            35.0          30.7               1.0                     1.0
598 2019-08-23   8158.0            17.8            23.3           1.8               0.0                     0.0
599 2019-08-24  12475.0            16.7            26.1           0.0               0.0                     0.0
600 2019-08-25      NaN            15.6            26.7           0.0               0.0                     0.0
601 2019-08-26      NaN            17.2            25.0           0.0               0.0                     0.0
602 2019-08-27      NaN            17.2            26.7           0.0               0.0                     0.0
603 2019-08-28      NaN            18.3            27.8           0.0               0.0                     0.0
604 2019-08-29      NaN            16.7            28.9           0.0               0.0                     0.0
605 2019-08-30      NaN            14.4            32.8           0.0               0.0                     0.0
606 2019-08-31      NaN            17.8            31.1           0.0               0.0                     0.0
test_df:
           ts  count  regressor_tmin  regressor_tmax  regressor_pn  regressor_bin_pn  regressor_bin_heavy_pn
0 2019-08-25  11634            15.6            26.7           0.0               0.0                     0.0
1 2019-08-26  11747            17.2            25.0           0.0               0.0                     0.0
2 2019-08-27  12216            17.2            26.7           0.0               0.0                     0.0
3 2019-08-28  11401            18.3            27.8           0.0               0.0                     0.0
4 2019-08-29  12685            16.7            28.9           0.0               0.0                     0.0
5 2019-08-30  12097            14.4            32.8           0.0               0.0                     0.0
6 2019-08-31  11281            17.8            31.1           0.0               0.0                     0.0
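Before fitting, it can help to glance at the training series. This optional sketch reuses the plot_multivariate helper already imported above; the call pattern mirrors its later use in this tutorial, and only the title and column selection are chosen here for illustration.

 # Optional (sketch): quick look at the training series
 fig = plot_multivariate(
     df=data["train_df"][["ts", "count"]],
     x_col="ts",
     title="Daily bike-sharing count (training data)",
     ylabel="count")
 plotly.io.show(fig)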

Fits model to daily data

In this step we fit a Silverkite model to the data, using weather regressors, holidays, auto-regression, etc.

 df = data["train_df"]
 time_col = "ts"
 value_col = "count"
 forecast_horizon = 7

 result = fit_forecast(
     df=df,
     time_col=time_col,
     value_col=value_col,
     forecast_horizon=forecast_horizon)
 trained_estimator = result.model[-1]
 # Checks model coefficients and p-values
 print("\n Model Summary:")
 print(trained_estimator.summary())

Out:

Fitting 1 folds for each of 1 candidates, totalling 1 fits

 cv_results:
                                              err_name                                          err_value
0                                      rank_test_MAPE                                                  1
1                                      mean_test_MAPE                                              10.28
2                                     split_test_MAPE                                           (10.28,)
3                                     mean_train_MAPE                                              21.66
4                                              params                                                 []
5                 param_estimator__yearly_seasonality                                                 15
6                 param_estimator__weekly_seasonality                                                  7
7                   param_estimator__uncertainty_dict                                               None
8                  param_estimator__training_fraction                                               None
9                  param_estimator__train_test_thresh                                               None
10                   param_estimator__time_properties  {'period': 86400, 'simple_freq': SimpleTimeFre...
11                    param_estimator__simulation_num                                                 10
12     param_estimator__seasonality_changepoints_dict  {'method': 'auto', 'yearly_seasonality_order':...
13                    param_estimator__regressor_cols  [regressor_tmin, regressor_bin_pn, regressor_b...
14             param_estimator__regression_weight_col                                               None
15             param_estimator__quarterly_seasonality                                              False
16              param_estimator__origin_for_time_vars                                               None
17                  param_estimator__normalize_method                                        statistical
18               param_estimator__monthly_seasonality                                              False
19              param_estimator__min_admissible_value                                               None
20  param_estimator__max_weekly_seas_interaction_o...                                                  2
21  param_estimator__max_daily_seas_interaction_order                                                  5
22              param_estimator__max_admissible_value                                               None
23             param_estimator__lagged_regressor_dict                                               None
24      param_estimator__holidays_to_model_separately                                               auto
25         param_estimator__holiday_pre_post_num_dict                                               None
26              param_estimator__holiday_pre_num_days                                                  2
27             param_estimator__holiday_post_num_days                                                  2
28          param_estimator__holiday_lookup_countries                                               [US]
29                       param_estimator__growth_term                                             linear
30                param_estimator__fit_algorithm_dict                         {'fit_algorithm': 'ridge'}
31              param_estimator__feature_sets_enabled                                               auto
32                   param_estimator__fast_simulation                                               True
33                   param_estimator__extra_pred_cols                                               None
34                param_estimator__explicit_pred_cols                                               None
35                    param_estimator__drop_pred_cols                                               None
36                 param_estimator__daily_seasonality                                              False
37               param_estimator__daily_event_df_dict                                               None
38                 param_estimator__changepoints_dict  {'method': 'auto', 'yearly_seasonality_order':...
39                      param_estimator__autoreg_dict  {'lag_dict': {'orders': [1, 2, 3]}, 'agg_lag_d...
40                  param_estimator__auto_seasonality                                              False
41                      param_estimator__auto_holiday                                              False
42                       param_estimator__auto_growth                                              False
43                                   split_train_MAPE                                           (21.66,)
44                                      mean_fit_time                                               4.79
45                                       std_fit_time                                                  0
46                                    mean_score_time                                               10.1
47                                     std_score_time                                                  0
48                                   split0_test_MAPE                                              10.28
49                                      std_test_MAPE                                                  0
50                                  split0_train_MAPE                                              21.66
51                                     std_train_MAPE                                                  0

 Model Summary:
================================ Model Summary =================================

Number of observations: 600,   Number of features: 134
Method: Ridge regression
Number of nonzero features: 133
Regularization parameter: 174.3

Residuals:
         Min           1Q       Median           3Q          Max
     -7534.0       -895.1        89.89        997.0       7621.0

             Pred_col Estimate Std. Err Pr(>)_boot sig. code                95%CI
            Intercept   9633.0    73.94     <2e-16       ***     (9504.0, 9787.0)
 events_Christmas Day   -145.0    77.24     <2e-16       ***  (-185.9, 7.917e-28)
  events_C...as Day-1   -135.8     71.9     <2e-16       ***  (-174.6, 8.152e-28)
  events_C...as Day-2   -51.78    29.98      0.004        **  (-81.94, 8.064e-28)
  events_C...as Day+1    -72.6     40.1     <2e-16       ***  (-104.4, 8.861e-28)
  events_C...as Day+2   -23.42    17.43      0.120             (-51.7, 8.766e-28)
  events_I...ence Day    45.72    22.53      0.018         *  (-9.712e-28, 79.14)
  events_I...ce Day-1   -27.68    20.22      0.142                (-64.06, 9.238)
  events_I...ce Day-2   -14.51    28.19      0.572                (-64.88, 38.62)
  events_I...ce Day+1   -15.82    15.06      0.236                (-45.41, 13.12)
  events_I...ce Day+2   -65.05    47.52      0.132                 (-139.9, 12.1)
     events_Labor Day   -61.41    32.19      0.006        **  (-88.66, 6.223e-28)
   events_Labor Day-1    92.37    47.65     <2e-16       ***  (-6.695e-28, 122.2)
   events_Labor Day-2    -59.2    32.58      0.010         *  (-92.38, 5.980e-28)
   events_Labor Day+1   -51.36    29.33      0.024         *  (-83.99, 5.986e-28)
   events_Labor Day+2   -3.448    11.31      0.486                (-30.33, 19.85)
  events_Memorial Day   -42.14    21.76      0.026         *  (-74.79, 1.215e-27)
  events_M...al Day-1    125.1    72.99      0.024         *  (-1.820e-27, 223.7)
  events_M...al Day-2   -29.22    21.61      0.144                 (-66.9, 13.46)
  events_M...al Day+1   -57.53    51.18      0.302                (-130.4, 33.14)
  events_M...al Day+2    -35.5    19.77      0.044         *  (-65.59, 1.754e-27)
 events_New Years Day   -46.89     26.3      0.012         *  (-78.04, 8.683e-28)
  events_N...rs Day-1    -42.7    25.41      0.026         *  (-73.79, 8.004e-28)
  events_N...rs Day-2     7.73    11.58      0.358                (-17.03, 30.31)
  events_N...rs Day+1   -23.69    32.25      0.476                (-82.48, 32.99)
  events_N...rs Day+2    33.59     33.2      0.324                (-29.95, 91.51)
         events_Other   -129.2    49.37      0.008        **     (-210.3, -16.32)
       events_Other-1    29.81    51.27      0.574                (-75.87, 118.6)
       events_Other-2   -107.1    44.54      0.022         *     (-180.5, -8.931)
       events_Other+1     32.0    46.15      0.482                (-51.08, 116.1)
       events_Other+2   -45.72    67.19      0.494                (-162.4, 88.56)
  events_Thanksgiving   -184.4    95.57     <2e-16       ***  (-225.9, 6.168e-28)
  events_T...giving-1   -46.46    26.53      0.016         *  (-73.42, 6.497e-28)
  events_T...giving-2    1.876    8.613      0.506                (-20.18, 17.97)
  events_T...giving+1   -128.7    65.43     <2e-16       ***  (-172.3, 6.463e-28)
  events_T...giving+2   -53.12    31.73      0.036         *  (-90.99, 1.163e-27)
  events_Veterans Day   -28.32    19.92      0.102                (-61.6, 0.9676)
  events_V...ns Day-1   -36.49     24.0      0.072         .  (-74.64, 6.280e-28)
  events_V...ns Day-2   -77.61     41.2     <2e-16       ***  (-104.4, 6.548e-28)
  events_V...ns Day+1    32.43    19.94      0.080         .      (-4.655, 58.78)
  events_V...ns Day+2    4.144    15.34      0.542                (-32.25, 34.99)
        str_dow_2-Tue    20.87    29.08      0.470                (-36.43, 79.57)
        str_dow_3-Wed    19.99    23.17      0.406                (-25.45, 61.91)
        str_dow_4-Thu    28.17    26.45      0.300                (-26.37, 79.17)
        str_dow_5-Fri    40.95    32.01      0.210                (-20.11, 102.0)
        str_dow_6-Sat   -8.635    38.43      0.840                (-77.99, 67.03)
        str_dow_7-Sun   -105.5    28.85      0.002        **     (-163.1, -54.52)
       regressor_tmin    598.5    62.21     <2e-16       ***       (456.7, 701.9)
     regressor_bin_pn   -835.9     61.1     <2e-16       ***     (-941.6, -699.5)
  regresso...heavy_pn   -364.5    80.62     <2e-16       ***     (-530.8, -217.2)
                  ct1   -8.196     30.9      0.810                (-67.44, 49.18)
       is_weekend:ct1   -13.61    24.44      0.586                (-60.22, 32.66)
    str_dow_2-Tue:ct1     21.7    23.97      0.368                (-27.01, 67.37)
    str_dow_3-Wed:ct1    14.95     20.4      0.474                (-26.38, 51.67)
    str_dow_4-Thu:ct1    4.774     22.7      0.830                (-36.84, 53.22)
    str_dow_5-Fri:ct1    8.282     26.9      0.738                (-47.55, 57.74)
    str_dow_6-Sat:ct1    17.64    32.06      0.614                (-42.43, 78.95)
    str_dow_7-Sun:ct1   -36.09    28.37      0.200                 (-89.29, 22.5)
    cp0_2018_07_21_00   -155.5    25.68     <2e-16       ***     (-194.9, -97.24)
  is_weeke...07_21_00   -26.61    28.84      0.338                (-76.35, 32.09)
  str_dow_...07_21_00   -38.97    35.66      0.270                (-110.4, 31.26)
  str_dow_...07_21_00   -33.33    22.34      0.148                 (-72.5, 10.25)
  str_dow_...07_21_00   -14.47    30.92      0.658                (-77.68, 45.39)
  str_dow_...07_21_00   -88.07    42.61      0.042         *     (-170.8, -7.553)
  str_dow_...07_21_00    15.77    44.21      0.724                (-73.23, 102.0)
  str_dow_...07_21_00   -52.68    43.61      0.238                (-129.4, 41.76)
  ct1:sin1_tow_weekly    20.52    22.18      0.364                (-21.43, 62.33)
  ct1:cos1_tow_weekly   -35.92    23.38      0.134                (-82.59, 6.589)
  ct1:sin2_tow_weekly    30.35    21.04      0.128                (-11.45, 70.39)
  ct1:cos2_tow_weekly   -30.92    24.69      0.222                 (-77.93, 18.5)
  cp0_2018...w_weekly   -2.809    26.49      0.878                (-55.41, 45.86)
  cp0_2018...w_weekly   -25.97    32.89      0.432                (-92.45, 40.25)
  cp0_2018...w_weekly   -12.22    33.21      0.690                (-82.95, 46.34)
  cp0_2018...w_weekly   -61.22    31.85      0.042         *     (-121.2, -4.486)
      sin1_tow_weekly    59.81    27.95      0.038         *       (1.393, 115.4)
      cos1_tow_weekly   -56.32     30.1      0.050         .     (-113.0, -4.038)
      sin2_tow_weekly    59.61    30.24      0.046         *       (3.018, 120.4)
      cos2_tow_weekly    27.39    32.29      0.378                (-31.46, 90.84)
      sin3_tow_weekly    9.787    30.41      0.746                (-46.12, 71.02)
      cos3_tow_weekly    35.32    28.01      0.226                (-14.59, 93.59)
      sin4_tow_weekly   -9.787    30.41      0.746                (-71.02, 46.12)
      cos4_tow_weekly    35.32    28.01      0.226                (-14.59, 93.59)
      sin5_tow_weekly   -59.61    30.24      0.046         *     (-120.4, -3.018)
      cos5_tow_weekly    27.39    32.29      0.378                (-31.46, 90.84)
      sin6_tow_weekly   -59.81    27.95      0.038         *     (-115.4, -1.393)
      cos6_tow_weekly   -56.32     30.1      0.050         .     (-113.0, -4.038)
      sin7_tow_weekly    63.92    26.24      0.012         *       (10.52, 112.1)
      cos7_tow_weekly       0.       0.      1.000                       (0., 0.)
      sin1_ct1_yearly    14.61    46.95      0.748                (-84.11, 94.62)
      cos1_ct1_yearly   -523.4    38.97     <2e-16       ***     (-586.6, -439.0)
      sin2_ct1_yearly   -203.5    45.51     <2e-16       ***     (-294.1, -108.1)
      cos2_ct1_yearly   -83.95    56.28      0.162                (-180.6, 34.07)
      sin3_ct1_yearly   -69.93    54.06      0.182                (-170.1, 41.85)
      cos3_ct1_yearly   -40.88    49.84      0.418                (-133.4, 63.62)
      sin4_ct1_yearly    34.08    50.49      0.492                (-72.56, 129.7)
      cos4_ct1_yearly    36.63    55.76      0.550                (-52.64, 152.3)
      sin5_ct1_yearly   -57.09    53.62      0.292                (-155.4, 50.06)
      cos5_ct1_yearly   -59.11    57.11      0.316                (-171.0, 54.01)
      sin6_ct1_yearly   -17.35    52.76      0.744                (-118.0, 77.28)
      cos6_ct1_yearly   -198.7    58.58     <2e-16       ***     (-295.9, -66.54)
      sin7_ct1_yearly   -23.14    54.46      0.672                (-132.0, 80.12)
      cos7_ct1_yearly    60.45    61.08      0.326                 (-51.5, 183.9)
      sin8_ct1_yearly    16.92    63.72      0.806                (-101.6, 130.8)
      cos8_ct1_yearly    29.37    57.44      0.620                (-77.54, 143.5)
      sin9_ct1_yearly   -21.39    60.04      0.718                (-141.6, 88.29)
      cos9_ct1_yearly   -32.55    54.76      0.544                (-129.3, 83.25)
     sin10_ct1_yearly    95.53    59.54      0.098         .      (-24.03, 209.3)
     cos10_ct1_yearly   -24.85    55.15      0.662                (-126.2, 79.35)
     sin11_ct1_yearly   -14.08    56.79      0.844                (-129.9, 92.91)
     cos11_ct1_yearly   -9.692    60.93      0.904                (-129.0, 101.1)
     sin12_ct1_yearly   -42.91    54.95      0.420                (-158.5, 56.08)
     cos12_ct1_yearly    119.3    59.78      0.038         *       (7.019, 231.5)
     sin13_ct1_yearly   -78.89    56.16      0.150                 (-188.1, 32.5)
     cos13_ct1_yearly   -50.95    59.49      0.386                (-167.6, 62.37)
     sin14_ct1_yearly   -54.58    54.88      0.302                (-171.6, 50.61)
     cos14_ct1_yearly   -22.74    59.79      0.700                (-143.8, 90.64)
     sin15_ct1_yearly   -165.8    57.89     <2e-16       ***     (-276.7, -48.78)
     cos15_ct1_yearly   -36.63    60.13      0.568                (-150.3, 82.42)
  sin1_con...07_21_00    38.06    46.58      0.378                (-64.74, 122.8)
  cos1_con...07_21_00   -136.9    53.88      0.010         *     (-252.7, -35.97)
  sin2_con...07_21_00    32.56    45.06      0.472                (-59.63, 113.8)
  cos2_con...07_21_00   -172.8    61.73      0.004        **     (-288.6, -43.99)
  sin3_con...07_21_00   -27.26    56.41      0.628                (-143.6, 76.76)
  cos3_con...07_21_00   -3.896    48.86      0.936                 (-91.08, 97.2)
  sin4_con...07_21_00   -18.09    43.46      0.702                (-108.8, 62.85)
  cos4_con...07_21_00    43.25    53.98      0.422                 (-67.3, 151.9)
  sin5_con...07_21_00    45.09    59.12      0.434                (-66.86, 166.5)
  cos5_con...07_21_00    67.98    48.38      0.156                (-24.62, 156.7)
               y_lag1    606.3    81.57     <2e-16       ***       (419.2, 733.3)
               y_lag2    89.37    66.61      0.146                (-32.68, 229.8)
               y_lag3    158.1    70.68      0.030         *       (26.69, 305.2)
     y_avglag_7_14_21    331.3    57.69     <2e-16       ***       (209.5, 424.2)
      y_avglag_1_to_7    235.6    43.37     <2e-16       ***       (173.6, 331.3)
     y_avglag_8_to_14    334.5    57.35     <2e-16       ***       (218.0, 445.0)
Signif. Code: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multiple R-squared: 0.7796,   Adjusted R-squared: 0.7502
F-statistic: 22.675 on 70 and 528 DF,   p-value: 1.110e-16
Model AIC: 12944.0,   model BIC: 13259.0

WARNING: the F-ratio and its p-value on regularized methods might be misleading, they are provided only for reference purposes.
WARNING: the following columns have estimated coefficients equal to zero, while ridge is not supposed to have zero estimates. This is probably because these columns are degenerate in the design matrix. Make sure these columns do not have constant values.
['cos7_tow_weekly']
WARNING: the following columns are degenerate, do you really want to include them in your model? This may cause some of them to show unrealistic significance. Consider using the `drop_degenerate` transformer.
['Intercept', 'cos7_tow_weekly']
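The printed summary lists well over a hundred coefficients. If your Greykite version provides ModelSummary.get_coef_summary (an assumption here; verify against your installed version), you can filter the coefficient table by category instead of scanning the full dump. A hedged sketch:

 # Sketch: filter the coefficient table by category.
 # Assumes ``get_coef_summary`` exists in your Greykite version; verify before relying on it.
 summary = trained_estimator.summary()
 event_coefs = summary.get_coef_summary(is_event=True, return_df=True)  # event/holiday terms only
 print(event_coefs)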

Grouping of variables

Regex patterns are used to group variables in the breakdown plot. Each group is given by one key of this dictionary. The grouping is done on variable names, and each group can list multiple regex patterns; a variable is assigned to a group if it matches ANY of that group's patterns. Note that this grouping assumes regressor variables start with “regressor_”. Also note that the order of the groups matters (dictionary insertion order is guaranteed in Python 3.7+): variables matched by an earlier group are not considered again by later groups. Variables that do not match any group are collected into “OTHER”. The following breakdown dictionary should work for many use cases, but users can customize it as needed. A small standalone demonstration of this “first match wins” behavior follows the dictionary below.

 grouping_regex_patterns_dict = {
     "regressors": "regressor_.*",  # regressor effects
     "AR": ".*lag",  # autoregression component
     "events": ".*events_.*",  # events and holidays
     "seasonality": ".*quarter.*|.*month.*|.*C\(dow.*|.*C\(dow_hr.*|sin.*|cos.*|.*doq.*|.*dom.*|.*str_dow.*|.*is_weekend.*|.*tow_weekly.*",  # seasonality
     "trend": "ct1|ct2|ct_sqrt|ct3|ct_root3|.*changepoint.*",  # long term trend (includes changepoints)
 }
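As a sanity check on the ordering, the sketch below replays the “first match wins” idea with Python's re module on a few illustrative column names. The names are hypothetical and the matching is a simplification; the actual Greykite implementation may differ in matching details.

 # Sketch: replay the grouping logic on a few illustrative column names
 import re

 example_cols = [
     "regressor_tmin", "y_lag1", "events_Christmas Day",
     "sin1_tow_weekly", "str_dow_7-Sun", "ct1", "Intercept"]
 assigned = {}
 for col in example_cols:
     group = "OTHER"  # fallback group for unmatched columns
     for name, pattern in grouping_regex_patterns_dict.items():
         if re.search(pattern, col):
             group = name
             break  # earlier groups take precedence; later groups never see this column
     assigned[col] = group
 print(assigned)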

Creates forecast breakdown

The breakdown is generated for the observed data plus the prediction period (both available in df). Each component is centered around zero, and the sum of all components equals the forecast.

 breakdown_result = trained_estimator.forecast_breakdown(
     grouping_regex_patterns_dict=grouping_regex_patterns_dict,
     center_components=True,
     plt_title="forecast breakdowns")
 forecast_breakdown_df = breakdown_result["breakdown_df_with_index_col"]
 forecast_components_fig = breakdown_result["breakdown_fig"]
 plotly.io.show(forecast_components_fig)
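Because the breakdown is additive, the component columns (including the intercept and any “OTHER” column) should add back up to the forecast. A quick sanity check on the returned frame, assuming it contains a time/index column plus one numeric column per component:

 # Sanity check (sketch): the numeric component columns should sum to the forecast
 numeric_cols = forecast_breakdown_df.select_dtypes(include="number").columns
 reconstructed = forecast_breakdown_df[numeric_cols].sum(axis=1)
 print(forecast_breakdown_df.head())
 print(reconstructed.head())  # should match the model's forecast values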

Standardization of the components

Next we provide a more “standardized” view of the breakdown. This is achieved by dividing all components by the mean of the absolute value of the observed series. By doing so, the intercept is mapped to approximately 1, and changes on the y-axis can be read relative to the average magnitude of the series. The sum of all components at each time point equals “forecast / obs_abs_mean”.

 column_grouping_result = breakdown_result["column_grouping_result"]
 component_cols = list(grouping_regex_patterns_dict.keys())
 forecast_breakdown_stdzd_df = forecast_breakdown_df.copy()
 obs_abs_mean = abs(df[value_col]).mean()
 for col in component_cols + ["Intercept", "OTHER"]:
     if col in forecast_breakdown_stdzd_df.columns:
         forecast_breakdown_stdzd_df[col] /= obs_abs_mean
 forecast_breakdown_stdzd_fig = plot_multivariate(
     df=forecast_breakdown_stdzd_df,
     x_col=time_col,
     title="forecast breakdowns divided by mean of abs value of response",
     ylabel="component")
 forecast_breakdown_stdzd_fig.update_layout(yaxis_range=[-1.1, 1.1])
 plotly.io.show(forecast_breakdown_stdzd_fig)

Breaking down the predictions

Next we perform a prediction and generate a breakdown plot for that prediction.

 test_df = data["test_df"].reset_index()
 test_df[value_col] = None
 print(f"\n test_df: \n {test_df}")
 pred_df = trained_estimator.predict(test_df)
 forecast_x_mat = trained_estimator.forecast_x_mat
 # Generate the breakdown plot
 breakdown_result = trained_estimator.forecast_breakdown(
     grouping_regex_patterns_dict=grouping_regex_patterns_dict,
     forecast_x_mat=forecast_x_mat,
     time_values=pred_df[time_col])

 breakdown_fig = breakdown_result["breakdown_fig"]
 plotly.io.show(breakdown_fig)

Out:

 test_df:
    index         ts count  regressor_tmin  regressor_tmax  regressor_pn  regressor_bin_pn  regressor_bin_heavy_pn
0      0 2019-08-25  None            15.6            26.7           0.0               0.0                     0.0
1      1 2019-08-26  None            17.2            25.0           0.0               0.0                     0.0
2      2 2019-08-27  None            17.2            26.7           0.0               0.0                     0.0
3      3 2019-08-28  None            18.3            27.8           0.0               0.0                     0.0
4      4 2019-08-29  None            16.7            28.9           0.0               0.0                     0.0
5      5 2019-08-30  None            14.4            32.8           0.0               0.0                     0.0
6      6 2019-08-31  None            17.8            31.1           0.0               0.0                     0.0
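The numbers behind this plot are available in the returned breakdown frame, which can be printed directly for the seven forecast days:

 # The breakdown values behind the plot, one row per forecast day
 print(breakdown_result["breakdown_df_with_index_col"])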

Demonstrating a scenario-based breakdown

We artificially inject a “bad weather” day into the test data on the second day of the prediction period. This lets us check whether the breakdown plot captures the resulting decrease in the collective regressors’ effect. The impact of the changed regressor values can be clearly seen in the updated breakdown.

 # Altering the test data.
 # We alter the normal weather conditions on the second day to heavy precipitation and low temperature.
 test_df["regressor_bin_pn"] = [0, 1, 0, 0, 0, 0, 0]
 test_df["regressor_bin_heavy_pn"] = [0, 1, 0, 0, 0, 0, 0]
 test_df["regressor_tmin"] = [15, 0, 15, 15,  15, 15, 15]
 print(f"altered test_df: \n {test_df}")

 # Gets predictions and the design matrix used during predictions.
 pred_df = trained_estimator.predict(test_df.reset_index())
 forecast_x_mat = trained_estimator.forecast_x_mat

 # Generates the breakdown plot.
 breakdown_result = trained_estimator.forecast_breakdown(
     grouping_regex_patterns_dict=grouping_regex_patterns_dict,
     forecast_x_mat=forecast_x_mat,
     time_values=pred_df[time_col])
 breakdown_fig = breakdown_result["breakdown_fig"]
 plotly.io.show(breakdown_fig)

Out:

altered test_df:
    index         ts count  regressor_tmin  regressor_tmax  regressor_pn  regressor_bin_pn  regressor_bin_heavy_pn
0      0 2019-08-25  None              15            26.7           0.0                 0                       0
1      1 2019-08-26  None               0            25.0           0.0                 1                       1
2      2 2019-08-27  None              15            26.7           0.0                 0                       0
3      3 2019-08-28  None              15            27.8           0.0                 0                       0
4      4 2019-08-29  None              15            28.9           0.0                 0                       0
5      5 2019-08-30  None              15            32.8           0.0                 0                       0
6      6 2019-08-31  None              15            31.1           0.0                 0                       0
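To quantify the scenario's impact day by day, the altered breakdown can be compared against the unaltered one from the previous section. The sketch below assumes you saved that earlier frame under the hypothetical name baseline_breakdown_df before altering test_df (Greykite does not create this variable for you).

 # Sketch: per-day impact of the weather scenario on the "regressors" component.
 # ``baseline_breakdown_df`` is a hypothetical copy of
 # ``breakdown_result["breakdown_df_with_index_col"]`` saved from the unaltered run.
 altered_breakdown_df = breakdown_result["breakdown_df_with_index_col"]
 if "regressors" in altered_breakdown_df.columns:
     impact = (
         altered_breakdown_df["regressors"].values
         - baseline_breakdown_df["regressors"].values)
     print("Change in the regressors component per forecast day:")
     print(impact)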
