# Source code for greykite.algo.forecast.silverkite.forecast_silverkite_helper

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Reza Hosseini
"""Helper functions for
`~greykite.algo.forecast.silverkite.forecast_silverkite.py.`
"""

import math
import warnings

from greykite.common.enums import SimpleTimeFrequencyEnum


def get_similar_lag(freq_in_days):
    """Returns the lag that points to the same time in the previous week.

    The lag is the number of observations in one week, rounded up to the
    nearest integer. For example daily data (``freq_in_days=1``) gives 7 and
    hourly data (``freq_in_days=1/24``) gives ``24*7``.
    When a week contains at most one observation (weekly data or anything
    coarser), no such lag exists and None is returned.

    Parameters
    ----------
    freq_in_days : `float`
        The time frequency of the timeseries, expressed in days.

    Returns
    -------
    similar_lag : `int` or None
        The number of observations per week rounded up,
        or None if that number is not greater than 1.
    """
    weekly_obs_count = 7 / freq_in_days
    # Only sub-weekly data has a "same time last week" observation
    return math.ceil(weekly_obs_count) if weekly_obs_count > 1 else None


def get_default_changepoints_dict(
        changepoints_method,
        num_days,
        forecast_horizon_in_days):
    """Get a changepoint dictionary based on the number of days in the observed
    timeseries and forecast horizon length in days to be provided as input to
    `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
    For the "uniform" method, we place the change points at a distance of
    ``max(28, forecast_horizon)``.
    For the "auto" method, we have used some defaults which seem to work for general
    applications::

        changepoints_dict = {
            "method": "auto",
            "yearly_seasonality_order": 10,
            "resample_freq": "7D",
            "regularization_strength": 0.8,
            "actual_changepoint_min_distance": "14D",
            "potential_changepoint_distance": "7D",
            "no_changepoint_distance_from_end": "14D"}

    If the length of data is smaller than ``2*max(28, forecast_horizon)``,
    the function will return None for all methods.
    None is also returned if ``changepoints_method`` is not one of the
    valid options listed below.

    Parameters
    ----------
    changepoints_method : `str`
        The method to locate changepoints.
        Valid options:

            - "uniform". Places evenly spaced changepoints to allow
              growth to change. The distance between the uniform change points is
              set to be ``max(28, forecast_horizon)``
            - "auto". Automatically detects change points.
              For configuration, see
              `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector.find_trend_changepoints`

        For more details for both methods, also check the documentation for
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
    num_days : `int`
        Number of days appearing in the observed timeseries.
    forecast_horizon_in_days : `float`
        The length of the forecast horizon in days.

    Returns
    -------
    changepoints_dict : `dict` or None
        A dictionary with change points information to be used as input to
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
        See that function's documentation for more details.
        For the "uniform" method, ``"n_changepoints"`` is always an `int`,
        even when ``forecast_horizon_in_days`` is a `float`.
    """
    # A reasonable distance defined based on ``forecast_horizon``
    # Here the minimum is set at 28 days
    uniform_distance = max(28, forecast_horizon_in_days)
    # Number of change points for "uniform"
    # Note that this is a float whenever ``uniform_distance`` is a float,
    # because floor division of floats returns a float
    changepoint_num = num_days // uniform_distance - 1

    # Not enough data for even one change point: both methods return `None`
    if not changepoint_num > 0:
        return None

    if changepoints_method == "uniform":
        return {
            "method": "uniform",
            # A count of change points is reported as an integer
            "n_changepoints": int(changepoint_num),
            "continuous_time_col": "ct1",
            "growth_func": lambda x: x}

    if changepoints_method == "auto":
        return {
            "method": "auto",
            "yearly_seasonality_order": 10,
            "resample_freq": "7D",
            "regularization_strength": 0.8,
            "actual_changepoint_min_distance": "14D",
            "potential_changepoint_distance": "7D",
            "no_changepoint_distance_from_end": "14D"}

    # Unrecognized method
    return None


def get_silverkite_uncertainty_dict(
        uncertainty,
        simple_freq=SimpleTimeFrequencyEnum.DAY.name,
        coverage=None):
    """Returns an uncertainty_dict for
    `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`
    input parameter: uncertainty_dict.

    The logic is as follows:

        - If ``uncertainty`` is passed as dict:
            - If ``quantiles`` are not passed through ``uncertainty`` we fill them
              using `coverage`.
            - If ``coverage`` also missing or quantiles calculated
              in two ways (via ``uncertainty["params"]["quantiles"]`` and ``coverage``)
              do not match, we throw Exceptions

        - If ``uncertainty=="auto"``:
            - We provide defaults based on time frequency of data.
            - Specify ``uncertainty["params"]["quantiles"]`` based on
              ``coverage`` if provided, otherwise the default coverage is 0.95.

    Parameters
    ----------
    uncertainty : `str` or `dict` or None
        It specifies what method should be used for uncertainty.
        If a dict is passed, a copy of it (with ``"params"`` copied as well and
        ``"quantiles"`` filled in from ``coverage`` when missing) is returned to be passed to
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast` as `uncertainty_dict`.
        The dict passed by the caller is not modified.

        If "auto", it builds a generic dict depending on frequency.
            - For frequencies less than or equal to one day it sets
              `conditional_cols` to be ["dow_hr"].
            - Otherwise it sets the conditional_cols to be `None`

        If None and `coverage` is None, the upper/lower predictions are not returned
    simple_freq : `str`, optional
        SimpleTimeFrequencyEnum member that best matches the input data frequency
        according to `get_simple_time_frequency_from_period`
    coverage : `float` or None, optional
        Intended coverage of the prediction bands (0.0 to 1.0)
        If None and `uncertainty` is None, the upper/lower predictions are not returned

    Returns
    -------
    uncertainty : `dict` or None
        An uncertainty dict to be used as input to
        `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`.
        See that function's docstring for more details.
        None is returned if both ``uncertainty`` and ``coverage`` are None,
        or if ``uncertainty`` is neither a dict nor "auto".

    Raises
    ------
    ValueError
        If ``coverage`` is outside [0, 1]; if ``uncertainty`` is a dict and
        quantiles can be obtained neither from it nor from ``coverage``;
        if the passed quantiles have fewer than two elements or the last one
        is not larger than the first one; or if the coverage implied by the
        passed quantiles disagrees with ``coverage`` (compared after rounding
        to 3 decimals).
    """
    frequency = SimpleTimeFrequencyEnum[simple_freq].value

    # boolean to determine if freq is longer than one day
    freq_is_longer_than_day = (
        frequency.seconds_per_observation
        > SimpleTimeFrequencyEnum.DAY.value.seconds_per_observation)

    # if both `uncertainty` and `coverage` are None, we return None
    if uncertainty is None and coverage is None:
        return None

    # checking if coverage input is sensible
    if coverage is not None and (coverage < 0 or coverage > 1):
        raise ValueError("coverage must be between 0 and 1")

    # Symmetric quantiles implied by `coverage` (None if `coverage` is not given)
    coverage_quantiles = None
    if coverage is not None:
        q1 = (1 - coverage)/2
        q2 = 1 - q1
        coverage_quantiles = [q1, q2]

    # if only coverage is provided, consider uncertainty to be "auto"
    # (`coverage` cannot be None here because of the early return above)
    if uncertainty is None:
        uncertainty = "auto"

    # The case where `uncertainty` is input as a dict
    # We check if quantiles are passed through `uncertainty`
    # If not, we use `coverage` to fill them in
    # If quantiles are passed in `uncertainty` and inferrable from `coverage`:
    # and they are inconsistent, we throw an Exception
    if isinstance(uncertainty, dict):
        # Work on copies of the dict and its "params" so that the caller's
        # object is never modified; otherwise re-using the same input dict
        # with a different `coverage` would raise a spurious conflict error
        uncertainty_dict = dict(uncertainty)
        params = uncertainty_dict.get("params")
        # A missing or None "params" is treated as empty
        params = {} if params is None else dict(params)
        uncertainty_dict["params"] = params

        quantiles = params.get("quantiles")
        if quantiles is not None:
            # If quantiles are specified, we do some sanity checks on their values:
            # We give warnings if more than two quantiles were passed
            # or if they are not symmetric i.e. first quantiles distance to zero
            # is not the same as last quantile distance to 1
            # We throw exceptions if quantiles are not increasing
            # or if `coverage` is also passed and inconsistent with `quantiles`
            if len(quantiles) > 2:
                warnings.warn(
                    "More than two quantiles are passed in `uncertainty`."
                    " Confidence intervals will be based on"
                    " the first (lower limit) and last (upper limit) quantile",
                    Warning)
            # The length check guards the indexing below against empty input
            if len(quantiles) < 2 or quantiles[-1] - quantiles[0] <= 0:
                raise ValueError(
                    "`quantiles` is expected to be an increasing sequence"
                    " of at least two elements."
                    f" These quantiles were passed: quantiles = {quantiles}")
            coverage_via_uncertainty = quantiles[-1] - quantiles[0]
            if round(quantiles[-1], 3) != round(1 - quantiles[0], 3):
                warnings.warn(
                    "1 - (quantiles upper limit) is not equal to (quantiles lower limit)"
                    " (lack of symmetry)."
                    f" Asymmetric quantiles: {quantiles} were used.",
                    Warning)
            # The case where quantiles are both provided through `uncertainty`
            # and inferrable using `coverage`
            # We check for conflict in coverage specification
            # Both values are reported with the same precision used in the
            # comparison, so that the message never shows two equal numbers
            if (coverage is not None
                    and round(coverage_via_uncertainty, 3) != round(coverage, 3)):
                raise ValueError(
                    "Coverage is specified/inferred both via `coverage` and via `uncertainty` input"
                    " and values do not match."
                    f" Coverage specified via `coverage`: {round(coverage, 3)}."
                    f" Coverage inferred via `uncertainty`: {round(coverage_via_uncertainty, 3)}.")
        elif coverage_quantiles is None:
            raise ValueError(
                "`quantiles` are not specified in `uncertainty`"
                " and `coverage` is not provided to infer them")
        else:
            # The case where quantiles is not provided through `uncertainty`
            # but coverage is passed
            params["quantiles"] = coverage_quantiles

        return uncertainty_dict

    # The case where `uncertainty` is passed as "auto"
    # The auto case conditions data on `dow_hr` which represents day of week and hour
    # for data with frequency less than or equal to a day (e.g. hourly, daily)
    # note that for daily case this works too as dow_hr will only depend on dow
    if uncertainty == "auto":
        # if coverage is provided the quantiles are overridden in auto
        # we do not give warnings as it is the auto case and
        # user expects using the coverage provided
        if coverage_quantiles is None:
            auto_quantiles = [0.025, 0.975]
        else:
            auto_quantiles = coverage_quantiles
        return {
            "uncertainty_method": "simple_conditional_residuals",
            "params": {
                "conditional_cols": None if freq_is_longer_than_day else ["dow_hr"],
                "quantiles": auto_quantiles,
                "quantile_estimation_method": "normal_fit",
                "sample_size_thresh": 5,
                "small_sample_size_method": "std_quantiles",
                "small_sample_size_quantile": 0.98}}

    # `uncertainty` is neither a dict nor "auto"
    return None