Source code for greykite.algo.forecast.silverkite.forecast_simple_silverkite

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
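# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.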
# original author: Albert Chen


from datetime import datetime
from typing import Dict
from typing import List
from typing import Optional
from typing import Type
from typing import Union

import pandas as pd

from greykite.algo.changepoint.adalasso.changepoint_detector import get_changepoints_dict
from greykite.algo.forecast.silverkite.constants.silverkite_column import SilverkiteColumn
from greykite.algo.forecast.silverkite.constants.silverkite_constant import SilverkiteConstant
from greykite.algo.forecast.silverkite.constants.silverkite_constant import default_silverkite_constant
from greykite.algo.forecast.silverkite.constants.silverkite_holiday import SilverkiteHoliday
from greykite.algo.forecast.silverkite.constants.silverkite_time_frequency import SilverkiteTimeFrequencyEnum
from greykite.algo.forecast.silverkite.forecast_silverkite import SilverkiteForecast
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import cols_interact
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import generate_holiday_events
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import get_event_pred_cols
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import patsy_categorical_term
from greykite.common import constants as cst
from greykite.common.constants import GROWTH_COL_ALIAS
from greykite.common.enums import SimpleTimeFrequencyEnum
from greykite.common.enums import TimeEnum
from greykite.common.features.timeseries_features import get_available_holidays_across_countries
from greykite.common.features.timeseries_features import get_changepoint_features_and_values_from_config
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.python_utils import unique_elements_in_list
from greykite.common.python_utils import update_dictionary
from greykite.common.time_properties_forecast import get_forecast_time_properties


class SimpleSilverkiteForecast(SilverkiteForecast):
    """Simplified-configuration variant of
    `~greykite.algo.forecast.silverkite.SilverkiteForecast`.

    Exposes an alternative interface whose configuration parameters are
    simpler to set. Produces the same trained model output and uses the
    same predict functions as the parent class.
    """

    def __init__(
            self,
            constants: SilverkiteConstant = default_silverkite_constant):
        """Initializes the parent class and keeps the constant classes needed here.

        Parameters
        ----------
        constants : `SilverkiteConstant`, default ``default_silverkite_constant``
            Provider of the Silverkite constants. Forwarded to the parent
            constructor. Its time frequency enum, holiday and column classes
            are stored on the instance.
        """
        super().__init__(constants=constants)
        # Looks up the constant classes once, in the same order as they are stored.
        time_frequency_enum: Type[SilverkiteTimeFrequencyEnum] = constants.get_silverkite_time_frequency_enum()
        holiday: Type[SilverkiteHoliday] = constants.get_silverkite_holiday()
        column: Type[SilverkiteColumn] = constants.get_silverkite_column()

        self._silverkite_time_frequency_enum = time_frequency_enum
        self._silverkite_holiday = holiday
        self._silverkite_column = column
[docs] def convert_params( self, df: pd.DataFrame, time_col: str, value_col: str, time_properties: Optional[Dict] = None, freq: Optional[str] = None, forecast_horizon: Optional[int] = None, origin_for_time_vars: Optional[float] = None, train_test_thresh: Optional[datetime] = None, training_fraction: Optional[float] = 0.9, fit_algorithm: str = "ridge", fit_algorithm_params: Optional[Dict] = None, holidays_to_model_separately: Optional[Union[str, List[str]]] = "auto", holiday_lookup_countries: Optional[Union[str, List[str]]] = "auto", holiday_pre_num_days: int = 2, holiday_post_num_days: int = 2, holiday_pre_post_num_dict: Optional[Dict] = None, daily_event_df_dict: Optional[Dict] = None, changepoints_dict: Optional[Dict] = None, yearly_seasonality: Union[bool, str, int] = "auto", quarterly_seasonality: Union[bool, str, int] = "auto", monthly_seasonality: Union[bool, str, int] = "auto", weekly_seasonality: Union[bool, str, int] = "auto", daily_seasonality: Union[bool, str, int] = "auto", max_daily_seas_interaction_order: Optional[int] = None, max_weekly_seas_interaction_order: Optional[int] = None, autoreg_dict: Optional[Dict] = None, past_df: Optional[pd.DataFrame] = None, lagged_regressor_dict: Optional[Dict] = None, seasonality_changepoints_dict: Optional[Dict] = None, min_admissible_value: Optional[float] = None, max_admissible_value: Optional[float] = None, uncertainty_dict: Optional[Dict] = None, normalize_method: Optional[str] = None, growth_term: Optional[str] = "linear", regressor_cols: Optional[List[str]] = None, feature_sets_enabled: Optional[Union[bool, str, Dict[str, Optional[Union[bool, str]]]]] = "auto", extra_pred_cols: Optional[List[str]] = None, drop_pred_cols: Optional[List[str]] = None, explicit_pred_cols: Optional[List[str]] = None, regression_weight_col: Optional[str] = None, simulation_based: Optional[bool] = False, simulation_num: int = 10): """Converts parameters of :func:`~greykite.algo.forecast.silverkite.forecast_simple_silverkite` into 
those of :func:`~greykite.algo.forecast.forecast_silverkite.SilverkiteForecast::forecast`. Makes it easier to set parameters to ``SilverkiteForecast::forecast`` suitable for most forecasting problems. Provides data-aware defaults for seasonality and interaction terms. Provides a simple configuration of holidays from an internal holiday database, and user-friendly configuration for growth and regressors. These parameters can be set from a plain-text config (e.g. no pandas dataframes). The parameter list is intentionally flat to facilitate hyperparameter grid search. Every parameter is either a parameter of ``SilverkiteForecast::forecast`` or a tuning parameter. Notes ----- The basic parameters are identical to ``SilverkiteForecast::forecast``. The more complex parameters are specified via config parameters: * ``daily_event_df_dict`` (via ``holiday*``) * ``fs_components_df`` (via `*_seasonality``) * ``extra_pred_cols`` (via ``holiday*``, ``*seas*``, ``growth_term``, ``regressor_cols``, ``feature_sets_enabled``, ``extra_pred_cols``) Parameters ---------- df : `pandas.DataFrame` A data frame which includes the timestamp column as well as the value column. This is the ``df`` for training the model, not for future prediction. time_col : `str` The column name in `df` representing time for the time series data The time column can be anything that can be parsed by pandas DatetimeIndex value_col: `str` The column name which has the value of interest to be forecasted time_properties : `dict` [`str`, `any`] or None, optional Time properties dictionary (likely produced by `~greykite.common.time_properties_forecast.get_forecast_time_properties`) with keys: ``"ts"`` : `UnivariateTimeSeries` or None ``df`` converted to a ``UnivariateTimeSeries``. ``"period"`` : `int` Period of each observation (i.e. minimum time between observations, in seconds). ``"simple_freq"`` : `SimpleTimeFrequencyEnum` ``SimpleTimeFrequencyEnum`` member corresponding to data frequency. 
``"num_training_points"`` : `int` Number of observations for training. ``"num_training_days"`` : `int` Number of days for training. ``"start_year"`` : `int` Start year of the training period. ``"end_year"`` : `int` End year of the forecast period. ``"origin_for_time_vars"`` : `float` Continuous time representation of the first date in ``df``. In this function, - ``start_year`` and ``end_year`` are used to define ``daily_event_df_dict``. - ``simple_freq`` and ``num_training_days`` are used to define ``fs_components_df``. - ``simple_freq`` and ``num_training_days`` are used to set default ``feature_sets_enabled``. - ``origin_for_time_vars`` is used to set default ``origin_for_time_vars``. - the other parameters are ignored It is okay if ``num_training_points``, ``num_training_days``, ``start_year``, ``end_year`` are computed for a superset of ``df``. This allows CV splits and backtest, which train on partial data, to use the same data-aware model parameters as the forecast on all training data. If None, the values are computed for ``df``. This corresponds to using the same modeling *approach* on the CV splits and backtest from `forecast_pipeline`, without requiring the same parameters. In this case, make sure ``forecast_horizon`` is at least as large as the test period for the split, to ensure all holidays are captured. freq : `str` or None, optional, default `None` Frequency of input data. Used to compute ``time_properties`` only if ``time_properties is None``. Frequency strings can have multiples, e.g. '5H'. See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases for a list of frequency aliases. If None, inferred by `pandas.infer_freq`. Provide this parameter if ``df`` has missing timepoints. forecast_horizon : `int` or None, optional, default `None` Number of periods to forecast into the future. Must be > 0. Used to compute ``time_properties`` only if ``time_properties is None``. 
If None, default is determined by input data frequency. Used to determine forecast end date, to pull the appropriate holiday data. Should be at least as large as the prediction period (if this function is called from ``forecast_pipeline``, the prediction period for different splits is set via ``cv_horizon``, ``test_horizon``, ``forecast_horizon``). origin_for_time_vars : `float` or None, optional, default `None` The time origin used to create continuous variables for time. If None, uses the value from ``time_properties``. train_test_thresh : `datetime.datetime` or None, optional, default `None` e.g. datetime.datetime(2019, 6, 30) The threshold for training and testing split. Note that the final returned model is trained using all data. If None, training split is based on ``training_fraction``. training_fraction : `float` or None, optional, default 0.9 The fraction of data used for training (0.0 to 1.0) Used only if ``train_test_thresh is None``. If this is also None or 1.0, then we skip testing and train on the entire dataset. fit_algorithm : `str`, optional, default "linear" The type of predictive model used in fitting. See `~greykite.algo.common.ml_models.fit_model_via_design_matrix` for available options and their parameters. fit_algorithm_params : `dict` or None, optional, default None Parameters passed to the requested fit_algorithm. If None, uses the defaults in `~greykite.algo.common.ml_models.fit_model_via_design_matrix`. holiday_lookup_countries : `list` [`str`] or "auto" or None, optional, default "auto" The countries that contain the holidays you intend to model (``holidays_to_model_separately``). * If "auto", uses a default list of countries that contain the default ``holidays_to_model_separately``. See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAY_LOOKUP_COUNTRIES_AUTO`. * If a list, must be a list of country names. * If None or an empty list, no holidays are modeled. 
holidays_to_model_separately : `list` [`str`] or "auto" or `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES` or None, optional, default "auto" # noqa: E501 Which holidays to include in the model. The model creates a separate key, value for each item in ``holidays_to_model_separately``. The other holidays in the countries are grouped together as a single effect. * If "auto", uses a default list of important holidays. See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO`. * If `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES`, uses all available holidays in ``holiday_lookup_countries``. This can often create a model that has too many parameters, and should typically be avoided. * If a list, must be a list of holiday names. * If None or an empty list, all holidays in ``holiday_lookup_countries`` are grouped together as a single effect. Use ``holiday_lookup_countries`` to provide a list of countries where these holiday occur. holiday_pre_num_days : `int`, default 2 Model holiday effects for ``holiday_pre_num_days`` days before the holiday. holiday_post_num_days : `int`, default 2 Model holiday effects for ``holiday_post_num_days`` days after the holiday. holiday_pre_post_num_dict : `dict` [`str`, (`int`, `int`)] or None, default None Overrides ``pre_num`` and ``post_num`` for each holiday in ``holidays_to_model_separately``. For example, if ``holidays_to_model_separately`` contains "Thanksgiving" and "Labor Day", this parameter can be set to ``{"Thanksgiving": [1, 3], "Labor Day": [1, 2]}``, denoting that the "Thanksgiving" ``pre_num`` is 1 and ``post_num`` is 3, and "Labor Day" ``pre_num`` is 1 and ``post_num`` is 2. Holidays not specified use the default given by ``pre_num`` and ``post_num``. 
daily_event_df_dict : `dict` [`str`, `pandas.DataFrame`] or None, default None A dictionary of data frames, each representing events data for the corresponding key. Specifies additional events to include besides the holidays specified above. The format is the same as in `~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. The DataFrame has two columns: - The first column contains event dates. Must be in a format recognized by `pandas.to_datetime`. Must be at daily frequency for proper join. It is joined against the time in ``df``, converted to a day: ``pd.to_datetime(pd.DatetimeIndex(df[time_col]).date)``. - the second column contains the event label for each date The column order is important; column names are ignored. The event dates must span their occurrences in both the training and future prediction period. During modeling, each key in the dictionary is mapped to a categorical variable named ``f"{EVENT_PREFIX}_{key}"``, whose value at each timestamp is specified by the corresponding DataFrame. For example, to manually specify a yearly event on September 1 during a training/forecast period that spans 2020-2022:: daily_event_df_dict = { "custom_event": pd.DataFrame({ "date": ["2020-09-01", "2021-09-01", "2022-09-01"], "label": ["is_event", "is_event", "is_event"] }) } It's possible to specify multiple events in the same df. Two events, ``"sep"`` and ``"oct"`` are specified below for 2020-2021:: daily_event_df_dict = { "custom_event": pd.DataFrame({ "date": ["2020-09-01", "2020-10-01", "2021-09-01", "2021-10-01"], "event_name": ["sep", "oct", "sep", "oct"] }) } Use multiple keys if two events may fall on the same date. 
These events must be in separate DataFrames:: daily_event_df_dict = { "fixed_event": pd.DataFrame({ "date": ["2020-09-01", "2021-09-01", "2022-09-01"], "event_name": "fixed_event" }), "moving_event": pd.DataFrame({ "date": ["2020-09-01", "2021-08-28", "2022-09-03"], "event_name": "moving_event" }), } The multiple event specification can be used even if events never overlap. An equivalent specification to the second example:: daily_event_df_dict = { "sep": pd.DataFrame({ "date": ["2020-09-01", "2021-09-01"], "event_name": "is_event" }), "oct": pd.DataFrame({ "date": ["2020-10-01", "2021-10-01"], "event_name": "is_event" }), } Note: All these events are automatically added to the model. There is no need to specify them in ``extra_pred_cols`` as you would for `~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. Note: Do not use `~greykite.common.constants.EVENT_DEFAULT` in the second column. This is reserved to indicate dates that do not correspond to an event. changepoints_dict : `dict` or None, optional, default None Specifies the changepoint configuration. ``"method"``: `str` The method to locate changepoints. Valid options: - "uniform". Places n_changepoints evenly spaced changepoints to allow growth to change. - "custom". Places changepoints at the specified dates. - "auto". Automatically detects change points. For configuration, see `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector.find_trend_changepoints` Additional keys to provide parameters for each particular method are described below. ``"continuous_time_col"``: `str`, optional Column to apply ``growth_func`` to, to generate changepoint features Typically, this should match the growth term in the model ``"growth_func"``: callable or None, optional Growth function (scalar -> scalar). Changepoint features are created by applying ``growth_func`` to ``continuous_time_col`` with offsets. 
If None, uses identity function to use ``continuous_time_col`` directly as growth term If changepoints_dict["method"] == "uniform", this other key is required: ``"n_changepoints"``: int number of changepoints to evenly space across training period If changepoints_dict["method"] == "custom", this other key is required: ``"dates"``: Iterable[Union[int, float, str, datetime]] Changepoint dates. Must be parsable by pd.to_datetime. Changepoints are set at the closest time on or after these dates in the dataset. If changepoints_dict["method"] == "auto", the keys that matches the parameters in `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector.find_trend_changepoints`, except ``df``, ``time_col`` and ``value_col``, are optional. Extra keys also include "dates", "combine_changepoint_min_distance" and "keep_detected" to specify additional custom trend changepoints. These three parameters correspond to the three parameters "custom_changepoint_dates", "min_distance" and "keep_detected" in `~greykite.algo.changepoint.adalasso.changepoints_utils.combine_detected_and_custom_trend_changepoints`. yearly_seasonality : `str` or `bool` or `int` Determines the yearly seasonality. 'auto', True, False, or a number for the Fourier order quarterly_seasonality : `str` or `bool` or `int` Determines the quarterly seasonality. 'auto', True, False, or a number for the Fourier order monthly_seasonality : `str` or `bool` or `int` Determines the monthly seasonality. 'auto', True, False, or a number for the Fourier order weekly_seasonality : `str` or `bool` or `int` Determines the weekly seasonality. 'auto', True, False, or a number for the Fourier order daily_seasonality : `str` or `bool` or `int` Determines the daily seasonality. 'auto', True, False, or a number for the Fourier order max_daily_seas_interaction_order : `int` or None, optional, default `None` Max fourier order for interaction terms with daily seasonality. If None, uses all available terms. 
max_weekly_seas_interaction_order : `int` or None, optional, default `None` Max fourier order for interaction terms with weekly seasonality. If None, uses all available terms. autoreg_dict : `dict` or `str` or None, optional, default `None` If a `dict`: A dictionary with arguments for `~greykite.common.features.timeseries_lags.build_autoreg_df`. That function's parameter ``value_col`` is inferred from the input of current function ``self.forecast``. Other keys are: ``"lag_dict"`` : `dict` or None ``"agg_lag_dict"`` : `dict` or None ``"series_na_fill_func"`` : callable If a `str`: The string will represent a method and a dictionary will be constructed using that `str`. Currently only implemented method is "auto" which uses `~greykite.algo.forecast.silverkite.SilverkiteForecast.__get_default_autoreg_dict` to create a dictionary. See more details for above parameters in `~greykite.common.features.timeseries_lags.build_autoreg_df`. past_df : `pandas.DataFrame` or None, default None The past df used for building autoregression features. This is not necessarily needed since imputation is available, however, if such data is available but not used in training for speed purposes, they can be passed here to build more accurate autoregression features. lagged_regressor_dict : `dict` or None, default None A dictionary with arguments for `greykite.common.features.timeseries_lags.build_autoreg_df_multi`. The keys of the dictionary are the target lagged regressor column names. It can leverage the regressors included in ``df``. The value of each key is either a `dict` or `str`. If `dict`, it has the following keys: ``"lag_dict"`` : `dict` or None ``"agg_lag_dict"`` : `dict` or None ``"series_na_fill_func"`` : callable If `str`, it represents a method and a dictionary will be constructed using that `str`. 
Currently the only implemented method is "auto" which uses `~greykite.algo.forecast.silverkite.SilverkiteForecast.__get_default_lagged_regressor_dict` to create a dictionary for each lagged regressor. An example:: lagged_regressor_dict = { "regressor1": { "lag_dict": {"orders": [1, 2, 3]}, "agg_lag_dict": { "orders_list": [[7, 7 * 2, 7 * 3]], "interval_list": [(8, 7 * 2)]}, "series_na_fill_func": lambda s: s.bfill().ffill()}, "regressor2": "auto"} seasonality_changepoints_dict : `dict` or None, optional, default `None` The parameter dictionary for seasonality change point detection. Parameters are in `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector.find_seasonality_changepoints`. Note ``df``, ``time_col``, ``value_col`` and ``trend_changepoints`` are auto populated, and do not need to be provided. min_admissible_value : `float` or None, optional, default `None` The minimum admissible value to return during prediction. If None, no limit is applied. max_admissible_value : `float` or None, optional, default `None` The maximum admissible value to return during prediction. If None, no limit is applied. uncertainty_dict : `dict` or None, optional, default `None` How to fit the uncertainty model. A dictionary with keys: ``"uncertainty_method"`` : `str` The title of the method. Only "simple_conditional_residuals" is implemented in ``fit_prediction_model`` which calculates CIs using residuals ``"params"``: `dict` A dictionary of parameters needed for the requested ``uncertainty_method``. For example, for ``uncertainty_method="simple_conditional_residuals"``, see parameters of `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval`, listed briefly here: ``"conditional_cols"`` ``"quantiles"`` ``"quantile_estimation_method"`` ``"sample_size_thresh"`` ``"small_sample_size_method"`` ``"small_sample_size_quantile"`` If None, no uncertainty intervals are calculated. 
normalize_method : `str` or None, default None If a string is provided, it will be used as the normalization method in `~greykite.common.features.normalize.normalize_df`, passed via the argument ``method``. Available options are: "min_max", "statistical". If None, no normalization will be performed. See that function for more details. growth_term : `str` or None, optional, default "ct1" How to model the growth. Valid options are {"linear", "quadratic", "sqrt", "cuberoot"}. regressor_cols : `list` [`str`] or None, optional, default None The columns in ``df`` to use as regressors. These must be provided during prediction as well. feature_sets_enabled: `dict` [`str`, `bool` or "auto" or None] or `bool` or "auto" or None, default "auto" Whether to include interaction terms and categorical variables to increase model flexibility. If a `dict`, boolean values indicate whether include various sets of features in the model. The following keys are recognized (from `~greykite.algo.forecast.silverkite.constants.silverkite_column.SilverkiteColumn`): ``"COLS_HOUR_OF_WEEK"`` : `str` Constant hour of week effect ``"COLS_WEEKEND_SEAS"`` : `str` Daily seasonality interaction with is_weekend ``"COLS_DAY_OF_WEEK_SEAS"`` : `str` Daily seasonality interaction with day of week ``"COLS_TREND_DAILY_SEAS"`` : `str` Allow daily seasonality to change over time by is_weekend ``"COLS_EVENT_SEAS"`` : `str` Allow sub-daily event effects ``"COLS_EVENT_WEEKEND_SEAS"`` : `str` Allow sub-daily event effect to interact with is_weekend ``"COLS_DAY_OF_WEEK"`` : `str` Constant day of week effect ``"COLS_TREND_WEEKEND"`` : `str` Allow trend (growth, changepoints) to interact with is_weekend ``"COLS_TREND_DAY_OF_WEEK"`` : `str` Allow trend to interact with day of week ``"COLS_TREND_WEEKLY_SEAS"`` : `str` Allow weekly seasonality to change over time The following dictionary values are recognized: - True: include the feature set in the model - False: do not include the feature set in the model - None: do not 
include the feature set in the model - "auto" or not provided: use the default setting based on data frequency and size If not a `dict`: - if a boolean, equivalent to a dictionary with all values set to the boolean. - if None, equivalent to a dictionary with all values set to False. - if "auto", equivalent to a dictionary with all values set to "auto". extra_pred_cols : `list` [`str`] or None, optional, default `None` Columns to include in ``extra_pred_cols`` for ``SilverkiteForecast::forecast``. Other columns are added to ``extra_pred_cols`` by the other parameters of this function (i.e. ``holidays_*``, ``growth_term``, ``regressors``, ``feature_sets_enabled``). If `None`, treated is the same as []. drop_pred_cols : `list` [`str`] or None, default None Names of predictor columns to be dropped from the final model. Ignored if None. explicit_pred_cols : `list` [`str`] or None, default None Names of the explicit predictor columns which will be the only variables in the final model. Note that this overwrites the generated predictors in the model and may include new terms not appearing in the predictors (e.g. interaction terms). Ignored if None. regression_weight_col : `str` or None, default None The column name for the weights to be used in weighted regression version of applicable machine-learning models. simulation_based : `bool`, default False Boolean to specify if the future predictions are to be using simulations or not. Note that this is only used in deciding what parameters should be used for certain components e.g. autoregression, if automatic methods are requested. However, the auto-settings and the prediction settings regarding using simulations should match. simulation_num : `int`, default 10 The number of simulations for when simulations are used for generating forecasts and prediction intervals. Returns ------- parameters : `dict` Parameters to call :func:`~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. 
""" if extra_pred_cols is None: extra_pred_cols = [] else: # Does not modify the input list extra_pred_cols = extra_pred_cols.copy() # Specifies regressors (via ``extra_pred_cols``) if regressor_cols is None: regressor_cols = [] extra_pred_cols += regressor_cols if time_properties is None: # ``df`` only contains the dates for training, # so we can use ``use_univariate_ts=False``. # ``forecast_horizon`` must be at least as large as # the actual size of the test set / forecast set # in order to pull all holidays time_properties = get_forecast_time_properties( df=df, time_col=time_col, value_col=value_col, freq=freq, regressor_cols=regressor_cols, forecast_horizon=forecast_horizon) if time_properties is not None: forecast_horizon = forecast_horizon or time_properties.get("forecast_horizon") if origin_for_time_vars is None: origin_for_time_vars = time_properties["origin_for_time_vars"] # Specifies seasonality (added to ``extra_pred_cols`` by `SilverkiteForecast::forecast`) seasonality_dict = { "yearly_seasonality": yearly_seasonality, "quarterly_seasonality": quarterly_seasonality, "monthly_seasonality": monthly_seasonality, "weekly_seasonality": weekly_seasonality, "daily_seasonality": daily_seasonality, } fs_components_df = self.__get_silverkite_seasonality( simple_freq=time_properties["simple_freq"].name, num_days=time_properties["num_training_days"], seasonality=seasonality_dict) # Specifies growth (via ``extra_pred_cols``) growth_term_formula = None if growth_term is not None: growth_term_formula = GROWTH_COL_ALIAS[growth_term] extra_pred_cols += [growth_term_formula] # Specifies events (via ``daily_event_df_dict``, ``extra_pred_cols``). # Constant daily effect. 
holiday_df_dict = self.__get_silverkite_holidays( holiday_lookup_countries=holiday_lookup_countries, holidays_to_model_separately=holidays_to_model_separately, start_year=time_properties["start_year"], end_year=time_properties["end_year"], pre_num=holiday_pre_num_days, post_num=holiday_post_num_days, pre_post_num_dict=holiday_pre_post_num_dict) if holiday_df_dict is not None: # Adds holidays to the user-specified events, # giving preference to user events # if there are conflicts daily_event_df_dict = update_dictionary( holiday_df_dict, overwrite_dict=daily_event_df_dict) if not daily_event_df_dict: # Sets empty dictionary to None daily_event_df_dict = None extra_pred_cols += get_event_pred_cols(daily_event_df_dict) # Specifies ``extra_pred_cols`` (interactions and additional model terms). # Seasonality interaction order is limited by the available order and max requested. daily_seas_interaction_order = self.__get_seasonality_order_from_dataframe( seasonality=self._silverkite_seasonality_enum.DAILY_SEASONALITY.value, fs=fs_components_df, max_order=max_daily_seas_interaction_order ) weekly_seas_interaction_order = self.__get_seasonality_order_from_dataframe( seasonality=self._silverkite_seasonality_enum.WEEKLY_SEASONALITY.value, fs=fs_components_df, max_order=max_weekly_seas_interaction_order ) # updates `changepoints_dict`, unchanged if not "method" == "auto" changepoints_dict, changepoint_detector = get_changepoints_dict( df=df, time_col=time_col, value_col=value_col, changepoints_dict=changepoints_dict) # determines changepoint column names if changepoints_dict is not None: changepoints = get_changepoint_features_and_values_from_config( df=df, # the training dataset time_col=time_col, changepoints_dict=changepoints_dict, origin_for_time_vars=origin_for_time_vars) changepoint_cols = changepoints["changepoint_cols"] else: changepoint_cols = [] feature_sets_enabled = self.__get_feature_sets_enabled( simple_freq=time_properties["simple_freq"].name, 
num_days=time_properties["num_training_days"], feature_sets_enabled=feature_sets_enabled) model_feature_terms = self.__get_feature_sets_terms( daily_event_df_dict=daily_event_df_dict, daily_seas_interaction_order=daily_seas_interaction_order, weekly_seas_interaction_order=weekly_seas_interaction_order, growth_term=growth_term_formula, changepoint_cols=changepoint_cols) # extends ``extra_pred_cols`` by the requested feature sets from ``feature_sets_enabled`` for feature_set_name, feature_set_terms in model_feature_terms.items(): if feature_sets_enabled[feature_set_name]: extra_pred_cols += feature_set_terms extra_pred_cols = unique_elements_in_list(extra_pred_cols) # the parameters to call ``SilverkiteForecast::forecast`` # parameters that are directly passed through are noted below parameters = dict( df=df, # pass-through time_col=time_col, # pass-through value_col=value_col, # pass-through origin_for_time_vars=origin_for_time_vars, extra_pred_cols=extra_pred_cols, drop_pred_cols=drop_pred_cols, explicit_pred_cols=explicit_pred_cols, train_test_thresh=train_test_thresh, # pass-through training_fraction=training_fraction, # pass-through fit_algorithm=fit_algorithm, # pass-through fit_algorithm_params=fit_algorithm_params, # pass-through daily_event_df_dict=daily_event_df_dict, fs_components_df=fs_components_df, autoreg_dict=autoreg_dict, # pass-through past_df=past_df, # pass-through lagged_regressor_dict=lagged_regressor_dict, # pass-through changepoints_dict=changepoints_dict, # pass-through seasonality_changepoints_dict=seasonality_changepoints_dict, # pass-through changepoint_detector=changepoint_detector, min_admissible_value=min_admissible_value, # pass-through max_admissible_value=max_admissible_value, # pass-through uncertainty_dict=uncertainty_dict, normalize_method=normalize_method, # pass-through regression_weight_col=regression_weight_col, # pass-through forecast_horizon=forecast_horizon, # pass-through simulation_based=simulation_based, # pass-through 
simulation_num=simulation_num # pass-through ) return parameters
[docs] def forecast_simple( self, *args, **kwargs): """A wrapper around ``SilverkiteForecast::forecast`` that simplifies some of the input parameters. Parameters ---------- args : positional args Positional args to pass to :func:`~greykite.algo.forecast.silverkite.forecast_simple_silverkite.convert_simple_silverkite_params`. See that function for details. kwargs : keyword args Keyword args to pass to :func:`~greykite.algo.forecast.silverkite.forecast_simple_silverkite.convert_simple_silverkite_params`. See that function for details. Returns ------- trained_model : `dict` The return value of :func:`~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast` A dictionary that includes the fitted model from the function :func:`~greykite.algo.common.ml_models.fit_ml_model_with_evaluation`. """ parameters = self.convert_params(*args, **kwargs) trained_model = super().forecast(**parameters) return trained_model
def __get_requested_seasonality_order(
        self,
        requested_seasonality="auto",
        default_order=5,
        is_enabled_auto=True):
    """Returns requested seasonality fourier series order.

    Parameters
    ----------
    requested_seasonality : `str` or `bool` or `int` or None, default = 'auto'
        The requested seasonality.
        'auto', True, False, or a number for the Fourier order.
        None is treated the same as 'auto'.
    default_order : `int`
        The default order to use for 'auto' and True.
    is_enabled_auto : `bool`
        Whether the seasonality should be modeled for 'auto' seasonality.

    Returns
    -------
    order : `int`
        Seasonality fourier series order.

    Raises
    ------
    ValueError, TypeError
        If ``requested_seasonality`` is not 'auto', a boolean, None,
        or something ``int()`` can convert. The problem is logged before
        the exception is re-raised.
    """
    # The seasonality configuration documents None as equivalent to 'auto'.
    if requested_seasonality is None:
        requested_seasonality = "auto"

    if requested_seasonality is True or (requested_seasonality == 'auto' and is_enabled_auto):
        order = default_order
    elif requested_seasonality is False or (requested_seasonality == 'auto' and not is_enabled_auto):
        order = 0
    else:
        try:
            order = int(requested_seasonality)
        except (TypeError, ValueError) as e:
            # `int()` raises TypeError for unsupported types (e.g. a list) and
            # ValueError for unparseable strings; both get the same log message.
            log_message(f"Requested seasonality order '{requested_seasonality}' must be one of:"
                        f" 'auto', True, False, integer", LoggingLevelEnum.ERROR)
            raise e
    return order


def __get_silverkite_seasonality(
        self,
        simple_freq=SimpleTimeFrequencyEnum.DAY.name,
        num_days=1000,
        seasonality=None):
    """Generates `fs_components_df` parameter for `forecast_silverkite`
    for modeling seasonality.

    Parameters
    ----------
    simple_freq : `str`
        SimpleTimeFrequencyEnum member that best matches the input data frequency
        according to `get_simple_time_frequency_from_period`
    num_days : `int`
        Number of days of observations in the input data
    seasonality : `dict` or None
        Seasonality configuration dictionary, with the following optional keys.
        (keys are SilverkiteSeasonalityEnum members in lower case):

            ``"yearly_seasonality"`` : `str` or `bool` or `int` or None, default = 'auto'
                Determines the yearly seasonality
                'auto', True, False, or a number for the Fourier order
            ``"quarterly_seasonality"`` : `str` or `bool` or `int` or None, default = 'auto'
                Determines the quarterly seasonality
                'auto', True, False, or a number for the Fourier order
            ``"monthly_seasonality"`` : `str` or `bool` or `int` or None, default = 'auto'
                Determines the monthly seasonality
                'auto', True, False, or a number for the Fourier order
            ``"weekly_seasonality"`` : `str` or `bool` or `int` or None, default = 'auto'
                Determines the weekly seasonality
                'auto', True, False, or a number for the Fourier order
            ``"daily_seasonality"`` : `str` or `bool` or `int` or None, default = 'auto'
                Determines the daily seasonality
                'auto', True, False, or a number for the Fourier order

        None (for the dictionary or for any value) is equivalent to 'auto'.
        If 'auto', seasonality components are based on input data
        (``num_days``, ``simple_freq``), according to
        `~greykite.algo.forecast.silverkite.constants.silverkite_seasonality.SilverkiteSeasonalityEnum`.
        and
        `~greykite.algo.forecast.silverkite.constants.silverkite_time_frequency.SilverkiteTimeFrequencyEnum`.

    Returns
    -------
    fs_components_df : `pandas.DataFrame` or None
        Contains fourier series specification, one row per seasonality
        with positive order. None if no seasonality is added.
        Columns:

            - "name"
            - "period"
            - "order"
            - "seas_names"

    Raises
    ------
    ValueError
        If ``seasonality`` contains a key that is not a recognized seasonality.
    """
    if seasonality is None:
        seasonality = {}

    # recognized seasonalities for silverkite, keyed by lower case member name
    silverkite_seasonalities = {
        k.lower(): v
        for k, v in self._silverkite_seasonality_enum.__members__.items()}

    # valid seasonalities based on input data frequency
    freq_valid_seas_names = SimpleTimeFrequencyEnum[simple_freq].value.valid_seas
    freq_auto_seas_names = self._silverkite_time_frequency_enum[simple_freq].value.auto_fourier_seas

    for key in seasonality.keys():
        if key not in silverkite_seasonalities.keys():
            raise ValueError(f"{key} must be one of {silverkite_seasonalities.keys()}")

    seasonalities = []  # seasonalities to add to the model
    for seas in silverkite_seasonalities.values():
        # keys are SilverkiteSeasonalityEnum members in lower case
        seas_input = seasonality.get(seas.name.lower(), "auto")
        if seas_input is None:
            # an explicit None is documented as equivalent to 'auto'
            seas_input = "auto"
        # under auto configuration, seasonality is added if it's recommended for both
        # the input frequency and data size
        is_enabled_auto = (
                num_days >= seas.value.default_min_days
                and seas.name in freq_auto_seas_names)
        order = self.__get_requested_seasonality_order(
            requested_seasonality=seas_input,
            default_order=seas.value.order,
            is_enabled_auto=is_enabled_auto)
        if order > 0:
            if seas.name not in freq_valid_seas_names:
                # the user asked for it explicitly, so it is kept, but with a warning
                log_message(f"'{seas.name.lower()}' is typically not valid for "
                            f"data with '{simple_freq}' frequency. Each seasonality period "
                            f"should cover multiple observations in the data. To remove "
                            f"these seasonality terms from the model, remove {seas.name.lower()}={seas_input} "
                            f"or set it to 'auto' or 0.", LoggingLevelEnum.WARNING)
            seasonalities.append({
                "name": seas.value.name,
                "period": seas.value.period,
                "order": order,  # user is allowed to override default order
                "seas_names": seas.value.seas_names
            })

    # constructs dataframe where each seasonality is a row
    if len(seasonalities) > 0:
        fs = pd.DataFrame(
            seasonalities,
            columns=["name", "period", "order", "seas_names"])
    else:
        fs = None
    return fs


def __get_seasonality_order_from_dataframe(
        self,
        seasonality,
        fs=None,
        max_order=None):
    """Returns fourier series order from a `pandas.DataFrame` fourier
    series specification. Return value is capped by ``max_order``.

    Parameters
    ----------
    seasonality : `SilverkiteSeasonalityEnum.Seasonality` namedtuple
        Which seasonality to extract from ``fs``.
        Has attributes ``name``, ``period``, ``order``, ``seas_names``
        Can be a `SilverkiteSeasonalityEnum` member value.
    fs : `pandas.DataFrame` or None, optional, default `None`
        Columns: "name", "period", "order", "seas_names"
        Suitable for ``fs_components_df`` parameter for ``forecast_silverkite``
        for modeling seasonality.
        Could be returned by ``get_silverkite_seasonality``.
        Assumes that ``name`` and ``seas_names`` uniquely identify a row.
    max_order: `int` or None, optional, default `None`
        Upper limit on seasonality_order.

    Returns
    -------
    fs_order : `int`
        The Fourier series order of the row with the given `name` and `seas_names`.
        0 if ``fs`` is None or has no matching row.
    """
    order = 0
    if fs is not None:
        name_match = (fs["name"] == seasonality.name)
        # a missing `seas_names` is matched against missing values in the column
        seas_match = ((fs["seas_names"] == seasonality.seas_names)
                      if seasonality.seas_names is not None
                      else pd.isna(fs["seas_names"]))
        row_match = name_match & seas_match
        if row_match.any():
            # rows are assumed unique, so the first match is used
            order = fs.loc[row_match, "order"].values[0]
            if max_order is not None:
                order = min(order, max_order)
    return order


def __get_feature_sets_enabled(
        self,
        simple_freq=SimpleTimeFrequencyEnum.DAY.name,
        num_days=1000,
        feature_sets_enabled="auto"):
    """Returns default feature sets based on training data frequency and size.

    Parameters
    ----------
    simple_freq: `str`, default SimpleTimeFrequencyEnum.DAY.name
        SimpleTimeFrequencyEnum member that best matches the input data frequency
        according to `get_simple_time_frequency_from_period`
    num_days: `int`, default 1000
        Number of days of observations in the input data
    feature_sets_enabled: `dict` [`str`, `bool` or "auto" or None] or `bool` or "auto" or None, default "auto"
        Whether to include interaction terms and categorical variables to increase model flexibility.

        If a `dict`, boolean values indicate whether include various sets of features in the model.
        The following keys are recognized
        (from `~greykite.algo.forecast.silverkite.constants.silverkite_column.SilverkiteColumn`):

            ``"COLS_HOUR_OF_WEEK"``
                Constant hour of week effect
            ``"COLS_WEEKEND_SEAS"``
                Daily seasonality interaction with is_weekend
            ``"COLS_DAY_OF_WEEK_SEAS"``
                Daily seasonality interaction with day of week
            ``"COLS_TREND_DAILY_SEAS"``
                Allow daily seasonality to change over time by is_weekend
            ``"COLS_EVENT_SEAS"``
                Allow sub-daily event effects
            ``"COLS_EVENT_WEEKEND_SEAS"``
                Allow sub-daily event effect to interact with is_weekend
            ``"COLS_DAY_OF_WEEK"``
                Constant day of week effect
            ``"COLS_TREND_WEEKEND"``
                Allow trend (growth, changepoints) to interact with is_weekend
            ``"COLS_TREND_DAY_OF_WEEK"``
                Allow trend to interact with day of week
            ``"COLS_TREND_WEEKLY_SEAS"``
                Allow weekly seasonality to change over time

        The following dictionary values are recognized:

            - True: include the feature set in the model
            - False: do not include the feature set in the model
            - None: do not include the feature set in the model
            - "auto" or not provided: use the default setting based on data frequency and size

        If not a `dict`:

            - if a boolean, equivalent to a dictionary with all values set to the boolean.
            - if None, equivalent to a dictionary with all values set to False.
            - if "auto", equivalent to a dictionary with all values set to "auto".

    Returns
    -------
    feature_sets_enabled : `dict` [`str`, `bool`]
        Indicates which feature sets will be added to the model.
        Feature sets are determined by `get_model_feature_terms` and may be empty
        (e.g. if there are no events, there is no event:seasonality interaction)

    Raises
    ------
    ValueError
        If ``feature_sets_enabled`` has an unsupported type, or is a dictionary
        with an unrecognized key or an unsupported value.
    """
    feature_sets_enabled_default = {
        self._silverkite_column.COLS_HOUR_OF_WEEK: False,
        self._silverkite_column.COLS_WEEKEND_SEAS: False,
        self._silverkite_column.COLS_DAY_OF_WEEK_SEAS: False,
        self._silverkite_column.COLS_TREND_DAILY_SEAS: False,
        self._silverkite_column.COLS_EVENT_SEAS: False,
        self._silverkite_column.COLS_EVENT_WEEKEND_SEAS: False,
        self._silverkite_column.COLS_DAY_OF_WEEK: False,
        self._silverkite_column.COLS_TREND_WEEKEND: False,
        self._silverkite_column.COLS_TREND_DAY_OF_WEEK: False,
        self._silverkite_column.COLS_TREND_WEEKLY_SEAS: False,
    }
    frequency = SimpleTimeFrequencyEnum[simple_freq].value

    # for sub-daily data
    if (frequency.seconds_per_observation
            <= SimpleTimeFrequencyEnum.HOUR.value.seconds_per_observation):
        if num_days >= TimeEnum.ONE_MONTH_IN_DAYS.value:
            # hour of week offset, helps the fourier terms
            feature_sets_enabled_default[self._silverkite_column.COLS_HOUR_OF_WEEK] = True
            # daily seasonality on weekday vs weekend
            feature_sets_enabled_default[self._silverkite_column.COLS_WEEKEND_SEAS] = True
            # daily seasonality by day of week
            feature_sets_enabled_default[self._silverkite_column.COLS_DAY_OF_WEEK_SEAS] = True
            # daily seasonality trend on weekday, weekend
            feature_sets_enabled_default[self._silverkite_column.COLS_TREND_DAILY_SEAS] = True

        # NOTE(review): the nesting level of this if/else was reconstructed from a
        # whitespace-collapsed source; confirm it belongs to the sub-daily branch
        # rather than to the one-month check above.
        if num_days < 3 * TimeEnum.ONE_YEAR_IN_DAYS.value:
            # holiday daily seasonality
            feature_sets_enabled_default[self._silverkite_column.COLS_EVENT_SEAS] = True
        else:
            # holiday daily seasonality that depends on weekend/weekday
            # By pigeonhole principle, with reasonable assumption that a holiday must fall on a different
            # day of the week for any three consecutive years (or else always be on the same day of week),
            # it takes at most 3 years of training data to observe all weekend/weekday possibilities.
            feature_sets_enabled_default[self._silverkite_column.COLS_EVENT_WEEKEND_SEAS] = True

    # for sub-weekly data
    if (frequency.seconds_per_observation
            <= SimpleTimeFrequencyEnum.DAY.value.seconds_per_observation):
        # day of week offset, helps the fourier terms
        feature_sets_enabled_default[self._silverkite_column.COLS_DAY_OF_WEEK] = True
        # allows different trend on weekday vs weekend
        if num_days >= TimeEnum.ONE_MONTH_IN_DAYS.value:
            feature_sets_enabled_default[self._silverkite_column.COLS_TREND_WEEKEND] = True
        # allows trend interaction with day of week
        if num_days >= TimeEnum.ONE_QUARTER_IN_DAYS.value:
            feature_sets_enabled_default[self._silverkite_column.COLS_TREND_DAY_OF_WEEK] = True
        if num_days >= TimeEnum.ONE_YEAR_IN_DAYS.value:
            # weekly seasonality trend over time
            feature_sets_enabled_default[self._silverkite_column.COLS_TREND_WEEKLY_SEAS] = True

    # None is treated the same as False.
    # Intuitively, feature_sets_enabled=None should
    # mean no feature sets are enabled.
    if feature_sets_enabled is None:
        feature_sets_enabled = False

    # Overrides defaults with user provided dictionary
    if feature_sets_enabled == "auto":
        pass  # uses the automatic defaults directly
    elif isinstance(feature_sets_enabled, bool):
        # All values are set to the provided boolean value
        for k in feature_sets_enabled_default.keys():
            feature_sets_enabled_default[k] = feature_sets_enabled
    elif isinstance(feature_sets_enabled, dict):
        # Uses the boolean values in `feature_sets_enabled` to override `feature_sets_enabled_default`
        for setting, is_enabled in feature_sets_enabled.items():
            if setting not in feature_sets_enabled_default:
                raise ValueError(f"Unrecognized feature set: '{setting}'. Valid feature sets are "
                                 f"{list(feature_sets_enabled_default.keys())}")
            if is_enabled == "auto":
                # "auto" values are considered not set by the user and fall back to the default
                continue
            if is_enabled is True:
                # User explicitly turned on this feature set.
                feature_sets_enabled_default[setting] = True
            elif is_enabled is False or is_enabled is None:
                # User explicitly turned off this feature set.
                # None values are treated the same as False.
                feature_sets_enabled_default[setting] = False
            else:
                raise ValueError(
                    f"Unrecognized `feature_sets_enabled` dictionary value for key {setting}: "
                    f"expected bool or 'auto' or None. Found: {is_enabled}")
    else:
        raise ValueError(
            f"Unrecognized type for `feature_sets_enabled`: expected bool, dict, 'auto', or None. "
            f"Found: {feature_sets_enabled}")

    return feature_sets_enabled_default


def __get_feature_sets_terms(
        self,
        daily_event_df_dict=None,
        daily_seas_interaction_order=0,
        weekly_seas_interaction_order=0,
        growth_term=None,
        changepoint_cols=None):
    """Defines features sets for use in the `extra_pred_cols` parameter
    to `forecast_silverkite`. Derived from events, seasonality,
    and trend (growth + changepoints).

    Parameters
    ----------
    daily_event_df_dict : `dict` [`str`, `pandas.DataFrame`] or None, default None
        Suitable for use as `daily_event_df_dict` parameter in `forecast_silverkite`.
        Each event is modeled as its own effect.
    daily_seas_interaction_order : `int`, default 0
        Order on interaction terms with daily seasonality.
        No daily seasonality interactions are generated unless this is positive.
    weekly_seas_interaction_order : `int`, default 0
        Order on interaction terms with weekly seasonality.
        No weekly seasonality interactions are generated unless this is positive.
    growth_term : `str` or None, default None
        How to model the growth.
        Valid options are "linear", "quadratic", "sqrt", "cubic", "cuberoot"
    changepoint_cols : `list` [`str`] or None, default None
        Names of the changepoint feature columns to be
        generated by `build_silverkite_features`

    Returns
    -------
    extra_pred_cols_grouped : `dict` [`str`, `list` [`str`]]
        The patsy model terms for each feature set.

            - key: feature set name
            - value: list of patsy model terms

        If there are no valid patsy model terms according to the input
        configuration, the list is empty. For example, if there
        are no events, the event related effects will be empty.
    """
    # enumerates all possible keys
    extra_pred_cols_grouped = {
        self._silverkite_column.COLS_HOUR_OF_WEEK: [],
        self._silverkite_column.COLS_WEEKEND_SEAS: [],
        self._silverkite_column.COLS_DAY_OF_WEEK_SEAS: [],
        self._silverkite_column.COLS_TREND_DAILY_SEAS: [],
        self._silverkite_column.COLS_EVENT_SEAS: [],
        self._silverkite_column.COLS_EVENT_WEEKEND_SEAS: [],
        self._silverkite_column.COLS_DAY_OF_WEEK: [],
        self._silverkite_column.COLS_TREND_WEEKEND: [],
        self._silverkite_column.COLS_TREND_DAY_OF_WEEK: [],
        self._silverkite_column.COLS_TREND_WEEKLY_SEAS: [],
    }

    # the columns which constitute the trend
    if changepoint_cols is None:
        changepoint_cols = []
    growth_col = [growth_term] if growth_term is not None else []
    trend_cols = growth_col + changepoint_cols

    # all possible values of `dow` and `dow_hr` from `build_time_features_df`
    dow_levels = ["1-Mon", "2-Tue", "3-Wed", "4-Thu", "5-Fri", "6-Sat", "7-Sun"]
    dow_hr_levels = [f"{day + 1}_{str(hour).zfill(2)}" for day in range(7) for hour in range(24)]
    day_of_week = patsy_categorical_term(term="str_dow", levels=dow_levels)
    hour_of_week = patsy_categorical_term(term="dow_hr", levels=dow_hr_levels)

    extra_pred_cols_grouped[self._silverkite_column.COLS_DAY_OF_WEEK] = [day_of_week]
    extra_pred_cols_grouped[self._silverkite_column.COLS_HOUR_OF_WEEK] = [hour_of_week]
    extra_pred_cols_grouped[self._silverkite_column.COLS_TREND_WEEKEND] = [
        f"is_weekend:{col}" for col in trend_cols]
    extra_pred_cols_grouped[self._silverkite_column.COLS_TREND_DAY_OF_WEEK] = [
        f"{day_of_week}:{col}" for col in trend_cols]

    # allows major holidays to have different daily seasonality
    # interact with fourier series terms up to fs_daily_interaction_order
    daily_seasonality = self._silverkite_seasonality_enum.DAILY_SEASONALITY.value
    weekly_seasonality = self._silverkite_seasonality_enum.WEEKLY_SEASONALITY.value

    if daily_seas_interaction_order > 0:
        for holiday in self._silverkite_holiday.HOLIDAYS_TO_INTERACT:
            if daily_event_df_dict is not None and holiday in daily_event_df_dict:
                event_levels = [cst.EVENT_DEFAULT]  # reference level for non-event days, added by `add_daily_events`
                # This event's levels
                event_levels += list(daily_event_df_dict[holiday][cst.EVENT_DF_LABEL_COL].unique())
                # `term` matches new_col in `add_daily_events`
                term = f"{cst.EVENT_PREFIX}_{holiday}"
                # built once and shared by both event feature sets
                event_term = patsy_categorical_term(term=term, levels=event_levels)
                extra_pred_cols_grouped[self._silverkite_column.COLS_EVENT_SEAS] += cols_interact(
                    static_col=f"{event_term}",
                    fs_name=daily_seasonality.name,
                    fs_order=daily_seas_interaction_order,
                    fs_seas_name=daily_seasonality.seas_names)
                extra_pred_cols_grouped[self._silverkite_column.COLS_EVENT_WEEKEND_SEAS] += cols_interact(
                    static_col=f"is_weekend:{event_term}",
                    fs_name=daily_seasonality.name,
                    fs_order=daily_seas_interaction_order,
                    fs_seas_name=daily_seasonality.seas_names)
        extra_pred_cols_grouped[self._silverkite_column.COLS_WEEKEND_SEAS] = cols_interact(
            static_col="is_weekend",
            fs_name=daily_seasonality.name,
            fs_order=daily_seas_interaction_order,
            fs_seas_name=daily_seasonality.seas_names)
        extra_pred_cols_grouped[self._silverkite_column.COLS_DAY_OF_WEEK_SEAS] = cols_interact(
            static_col=day_of_week,
            fs_name=daily_seasonality.name,
            fs_order=daily_seas_interaction_order,
            fs_seas_name=daily_seasonality.seas_names)
        for col in trend_cols:
            extra_pred_cols_grouped[self._silverkite_column.COLS_TREND_DAILY_SEAS] += cols_interact(
                static_col=f"is_weekend:{col}",
                fs_name=daily_seasonality.name,
                fs_order=daily_seas_interaction_order,
                fs_seas_name=daily_seasonality.seas_names)

    if weekly_seas_interaction_order > 0:
        for col in trend_cols:
            extra_pred_cols_grouped[self._silverkite_column.COLS_TREND_WEEKLY_SEAS] += cols_interact(
                static_col=col,
                fs_name=weekly_seasonality.name,
                fs_order=weekly_seas_interaction_order,
                fs_seas_name=weekly_seasonality.seas_names)

    return extra_pred_cols_grouped
def __get_silverkite_holidays(
        self,
        holiday_lookup_countries="auto",
        holidays_to_model_separately="auto",
        start_year=2015,
        end_year=2030,
        pre_num=2,
        post_num=2,
        pre_post_num_dict=None):
    """Builds the holiday events dictionary used as the ``daily_event_df_dict``
    parameter of the silverkite model.

    Mainly supplies reasonable defaults for the holiday names and countries.

    Parameters
    ----------
    holiday_lookup_countries : `list` [`str`] or "auto" or None, optional, default "auto"
        The countries that contain the holidays you intend to model
        (``holidays_to_model_separately``).

            * If "auto", uses a default list of countries
              that contain the default ``holidays_to_model_separately``.
              See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAY_LOOKUP_COUNTRIES_AUTO`.
            * If a list (or tuple), must contain country names.
            * If None or an empty list, no holidays are modeled.

    holidays_to_model_separately : `list` [`str`] or "auto" or `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES` or None, optional, default "auto"  # noqa: E501
        Which holidays to include in the model.
        The model creates a separate key, value for each item in ``holidays_to_model_separately``.
        The other holidays in the countries are grouped together as a single effect.

            * If "auto", uses a default list of important holidays.
              See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO`.
            * If `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES`,
              uses all available holidays in ``holiday_lookup_countries``. This can often
              create a model that has too many parameters, and should typically be avoided.
            * If a list (or tuple), must contain holiday names.
            * If None or an empty list, all holidays in ``holiday_lookup_countries`` are grouped together
              as a single effect.

        Use ``holiday_lookup_countries`` to provide a list of countries where these holiday occur.
    start_year : `int`
        Year of first training data point, used to generate holiday events.
    end_year : `int`
        Year of last forecast data point, used to generate holiday events.
    pre_num : `int`
        Model holiday effects for ``pre_num`` days before the holiday.
    post_num : `int`
        Model holiday effects for ``post_num`` days after the holiday.
    pre_post_num_dict : `dict` [`str`, (`int`, `int`)] or None, default None
        Overrides ``pre_num`` and ``post_num`` for each holiday in
        ``holidays_to_model_separately``.
        For example, if ``holidays_to_model_separately`` contains "Thanksgiving" and "Labor Day",
        this parameter can be set to ``{"Thanksgiving": [1, 3], "Labor Day": [1, 2]}``,
        denoting that the "Thanksgiving" ``pre_num`` is 1 and ``post_num`` is 3, and "Labor Day"
        ``pre_num`` is 1 and ``post_num`` is 2.
        Holidays not specified use the default given by ``pre_num`` and ``post_num``.

    Returns
    -------
    daily_event_df_dict : `dict` [`str`, `pandas.DataFrame` [EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]]
        The return value of ``generate_holiday_events``.
        Suitable for use as `daily_event_df_dict` parameter in `forecast_silverkite`.
        Each holiday is modeled as its own effect (not specific to each country).

    Raises
    ------
    ValueError
        If ``holiday_lookup_countries`` or ``holidays_to_model_separately`` is
        neither one of the recognized special values nor a list/tuple.

    See Also
    --------
    `~greykite.common.features.timeseries_features.get_available_holiday_lookup_countries`
    to list available countries for modeling.

    `~greykite.common.features.timeseries_features.get_available_holidays_across_countries`
    to see available holidays in those countries.
    """
    # Resolves the countries whose holidays are looked up.
    countries = holiday_lookup_countries
    if countries is None:
        # `None` will not model any holidays
        countries = []
    elif countries == "auto":
        # countries that contain the default `holidays_to_model_separately`
        countries = self._silverkite_holiday.HOLIDAY_LOOKUP_COUNTRIES_AUTO
    elif not isinstance(countries, (list, tuple)):
        raise ValueError(
            f"`holiday_lookup_countries` should be a list, found {countries}")

    # Resolves the holidays that each get their own effect.
    separate_holidays = holidays_to_model_separately
    if separate_holidays is None:
        separate_holidays = []
    elif separate_holidays == "auto":
        # important holidays
        separate_holidays = self._silverkite_holiday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO
    elif separate_holidays == self._silverkite_holiday.ALL_HOLIDAYS_IN_COUNTRIES:
        # every holiday found in the resolved countries over the padded year range
        separate_holidays = get_available_holidays_across_countries(
            countries=countries,
            year_start=start_year - 1,
            year_end=end_year + 1)
    elif not isinstance(separate_holidays, (list, tuple)):
        raise ValueError(
            f"`holidays_to_model_separately` should be a list, found {separate_holidays}")

    event_kwargs = {
        "countries": countries,
        "holidays_to_model_separately": separate_holidays,
        "year_start": start_year - 1,  # subtract 1 just in case, to ensure coverage of all holidays
        "year_end": end_year + 1,  # add 1 just in case, to ensure coverage of all holidays
        "pre_num": pre_num,
        "post_num": post_num,
        "pre_post_num_dict": pre_post_num_dict,
    }
    return generate_holiday_events(**event_kwargs)