Source code for greykite.algo.forecast.silverkite.forecast_silverkite_helper

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Reza Hosseini
"""Helper functions for
`~greykite.algo.forecast.silverkite.forecast_silverkite.py.`
"""

import math
import warnings

from greykite.common.enums import SimpleTimeFrequencyEnum


def get_similar_lag(freq_in_days):
    """For a given frequency, it returns a lag which is likely to be most correlated
    to the observation at current time.

    For daily data, this will return 7 and for hourly data it will return 24*7.
    In general for sub-weekly frequencies, it returns the lag which corresponds to
    the same time in the last week.
    For data which is weekly or with frequencies larger than a week, it returns None.

    Parameters
    ----------
    freq_in_days : `float`
        The time frequency of the timeseries given in day units.

    Returns
    -------
    similar_lag : `int` or None
        The returned lag or None.
    """
    similar_lag = None
    # Get the number of observations per week
    obs_num_per_week = 7 / freq_in_days

    if obs_num_per_week > 1:
        similar_lag = math.ceil(obs_num_per_week)

    return similar_lag


def get_default_changepoints_dict(
        changepoints_method,
        num_days,
        forecast_horizon_in_days):
    """Get a changepoint dictionary based on the number of days in the observed
    timeseries and forecast horizon length in days to be provided as input to
    `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
    For the "uniform" method, we place the change points at a distance of
    ``max(28, forecast_horizon)``.
    For the "auto" method, we have used some defaults which seem to work for general
    applications::

        changepoints_dict = {
            "method": "auto",
            "yearly_seasonality_order": 10,
            "resample_freq": "7D",
            "regularization_strength": 0.8,
            "actual_changepoint_min_distance": "14D",
            "potential_changepoint_distance": "7D",
            "no_changepoint_distance_from_end": "14D"}

    If the length of data is smaller than ``2*max(28, forecast_horizon)``,
    the function will return None for all methods.

    Parameters
    ----------
    changepoints_method : `str`
        The method to locate changepoints.
        Valid options:

            - "uniform". Places changepoints evenly spaced changepoints to allow
            growth to change. The distance between the uniform change points is
            set to be ``max(28, forecast_horizon)``
            - "auto". Automatically detects change points.
            For configuration, see
            `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector.find_trend_changepoints`

        For more details for both methods, also check the documentation for
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
    num_days : `int`
        Number of days appearing in the observed timeseries.
    forecast_horizon_in_days : `float`
        The length of the forecast horizon in days.

    Returns
    -------
    changepoints_dict : `dict` or None
        A dictionary with change points information to be used as input to
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
        See that function's documentation for more details.
    """

    changepoints_dict = None
    # A reasonable distance defined based on ``forecast_horizon``
    # Here the minimum is set at 28 days
    uniform_distance = max(28, forecast_horizon_in_days)
    # Number of change points for "uniform"
    # Also if this number is zero both methods will return `None`
    changepoint_num = num_days // uniform_distance - 1

    if changepoint_num > 0:
        if changepoints_method == "uniform":
            changepoints_dict = {
                "method": "uniform",
                "n_changepoints": changepoint_num,
                "continuous_time_col": "ct1",
                "growth_func": lambda x: x}

        elif changepoints_method == "auto":
            changepoints_dict = {
                "method": "auto",
                "yearly_seasonality_order": 10,
                "resample_freq": "7D",
                "regularization_strength": 0.8,
                "actual_changepoint_min_distance": "14D",
                "potential_changepoint_distance": "7D",
                "no_changepoint_distance_from_end": "14D"}

    return changepoints_dict


[docs]def get_silverkite_uncertainty_dict( uncertainty, simple_freq=SimpleTimeFrequencyEnum.DAY.name, coverage=None): """Returns an uncertainty_dict for `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast` input parameter: uncertainty_dict. The logic is as follows: - If ``uncertainty`` is passed as dict: - If ``quantiles`` are not passed through ``uncertainty`` we fill them using `coverage`. - If ``coverage`` also missing or quantiles calculated in two ways (via ``uncertainty["params"]["quantiles"]`` and ``coverage``) do not match, we throw Exceptions - If ``uncertainty=="auto"``: - We provide defaults based on time frequency of data. - Specify ``uncertainty["params"]["quantiles"]`` based on ``coverage`` if provided, otherwise the default coverage is 0.95. Parameters ---------- uncertainty : `str` or `dict` or None It specifies what method should be used for uncertainty. If a dict is passed then it is directly returned to be passed to `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast` as `uncertainty_dict`. If "auto", it builds a generic dict depending on frequency. - For frequencies less than or equal to one day it sets `conditional_cols` to be ["dow_hr"]. - Otherwise it sets the conditional_cols to be `None` If None and `coverage` is None, the upper/lower predictions are not returned simple_freq : `str`, optional SimpleTimeFrequencyEnum member that best matches the input data frequency according to `get_simple_time_frequency_from_period` coverage : `float` or None, optional Intended coverage of the prediction bands (0.0 to 1.0) If None and `uncertainty` is None, the upper/lower predictions are not returned Returns ------- uncertainty : `dict` or None An uncertainty dict to be used as input to `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`. See that function's docstring for more details. """ frequency = SimpleTimeFrequencyEnum[simple_freq].value # boolean to determine if freq is longer than one day freq_is_longer_than_day = ( frequency.seconds_per_observation > SimpleTimeFrequencyEnum.DAY.value.seconds_per_observation) uncertainty_dict = None # if both `uncertainty` and `coverage` are None, we return None if uncertainty is None and coverage is None: return None # checking if coverage input is sensible if coverage is not None and (coverage < 0 or coverage > 1): raise ValueError("coverage must be between 0 and 1") # if only coverage is provided, consider uncertainty to be "auto" if coverage is not None and uncertainty is None: uncertainty = "auto" # The case where `uncertainty` is input as a dict # We check if quantiles are passed through `uncertainty` # If not, we use `coverage` to fill them in # If quantiles are passed in `uncertainty` and inferrable from `coverage`: # and they are inconsistent, we throw an Exception if isinstance(uncertainty, dict): uncertainty_dict = uncertainty # boolean to check if quantiles are passed through uncertainty try: quantiles_specified = (uncertainty["params"]["quantiles"] is not None) except KeyError: quantiles_specified = False if "params" not in uncertainty_dict: uncertainty_dict["params"] = {} if quantiles_specified: quantiles = uncertainty["params"]["quantiles"] # If quantiles are specified, we do some sanity checks on their values: # We give warnings if more than two quantiles were passed # or if they are not symmetric i.e. first quantiles distance to zero # is not the same as last quantile distance to 1 # We throw exceptions if quantiles are not increasing # or if `coverage` is also passed and inconsistent with `quantiles` if len(quantiles) > 2: warnings.warn( "More than two quantiles are passed in `uncertainty`." " Confidence intervals will be based on" " the first (lower limit) and last (upper limit) quantile", Warning) coverage_via_uncertainty = quantiles[-1] - quantiles[0] if coverage_via_uncertainty <= 0: raise ValueError( "`quantiles` is expected to be an increasing sequence" " of at least two elements." f"These quantiles were passed: quantiles = {quantiles}") if round(quantiles[-1], 3) != round(1 - quantiles[0], 3): warnings.warn( "1 - (quantiles upper limit) is not equal to (quantiles lower limit)" " (lack of symmetry)." f" Asymmetric quantiles: {quantiles} were used.", Warning) if coverage is not None: # The case where quantiles are both provided through `uncertainty` # and inferrable using `coverage` # We check for conflict in coverage specification if round(coverage_via_uncertainty, 3) != round(coverage, 3): raise ValueError( "Coverage is specified/inferred both via `coverage` and via `uncertainty` input" " and values do not match." f" Coverage specified via `coverage`: {round(coverage, 3)}." f" Coverage inferred via `uncertainty`: {round(coverage_via_uncertainty, 2)}.") if not quantiles_specified: if coverage is None: raise ValueError( "`quantiles` are not specified in `uncertainty`" " and `coverage` is not provided to infer them") else: # The case where quantiles is not provided through `uncertainty` # but coverage is passed q1 = (1 - coverage)/2 q2 = 1 - q1 uncertainty_dict["params"]["quantiles"] = [q1, q2] # The case where `uncertainty` is passed as "auto" # The auto case conditions data on `dow_hr` which represents day of week and hour # for data with frequency less than or equal to a day (e.g. hourly, daily) # note that for daily case this works too as dow_hr will only depend on dow if uncertainty == "auto": if not freq_is_longer_than_day: uncertainty_dict = { "uncertainty_method": "simple_conditional_residuals", "params": { "conditional_cols": ["dow_hr"], "quantiles": [0.025, 0.975], "quantile_estimation_method": "normal_fit", "sample_size_thresh": 5, "small_sample_size_method": "std_quantiles", "small_sample_size_quantile": 0.98}} else: uncertainty_dict = { "uncertainty_method": "simple_conditional_residuals", "params": { "conditional_cols": None, "quantiles": [0.025, 0.975], "quantile_estimation_method": "normal_fit", "sample_size_thresh": 5, "small_sample_size_method": "std_quantiles", "small_sample_size_quantile": 0.98}} # if coverage is provided the quantiles are overridden in auto # we do not give warnings as it is the auto case and # user expects using the coverage provided if coverage is not None: q1 = (1 - coverage)/2 q2 = 1 - q1 uncertainty_dict["params"]["quantiles"] = [q1, q2] return uncertainty_dict