# Source code for greykite.algo.uncertainty.conditional.conf_interval

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Reza Hosseini
"""Calculates uncertainty intervals from the conditional
empirical distribution of the residual.
"""

import warnings

import numpy as np
import pandas as pd

from greykite.algo.uncertainty.conditional.dataframe_utils import limit_tuple_col
from greykite.algo.uncertainty.conditional.dataframe_utils import offset_tuple_col
from greykite.algo.uncertainty.conditional.estimate_distribution import estimate_empirical_distribution
from greykite.algo.uncertainty.conditional.normal_quantiles import normal_quantiles_df
from greykite.common.constants import ERR_STD_COL


def conf_interval(
        df,
        value_col,
        residual_col=None,
        conditional_cols=None,
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="normal_fit",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=None,
        max_admissible_value=None):
    """A function to calculate confidence intervals (ci) for values given
    in ``value_col``
    for each slice of data (given in ``conditional_cols``)
    using approximate distributions estimated via
    ``estimate_empirical_distribution``.
    The variability of the CI's either come from "value_col" itself or
    alternatively come from the ``residual_col`` if provided.

    We allow for calculating as many quantiles as needed (specified by `quantiles`)
    as opposed to only two quantiles representing a typical ci interval.

    Two options are available for method of calculation of quantiles for
    each slice,

        - for each slice a confidence interval is calculated
            simply using empirical quantiles
        - using a normal distribution fit.

    There are two main possibilities:

        - ``residual_col`` is not provided and values in value_col are used
            directly to calculate quantiles by using the distribution of the
            values in each slices
        - ``residual_col`` is provided, we calculate quantiles
            for the residuals distribution for each slice and then
            offset the quantiles by the value given in value_col.
            In that case we use a fixed_mean of zero when constructing quantiles
            for the residuals. This is done so that our predicted values given
            in value_col are not perturbed as they might be coming from a much
            more complex fitting model taking into account many more variables
            as compared with conditional_cols

    Parameters
    ----------
    df : `pandas.Dataframe`
        The dataframe with the needed columns:

            - value_col,
            - conditional_cols,
            - residual_col (optional column)

    value_col : str
        The column containing the values for the variable for which confidence
        interval is needed
    residual_col : str
        If a residual column is given, quantiles will be built for the residual
        values and the interval is then offset using the value given in
        value_col itself
    conditional_cols : `list` [`str`]
        These columns are used to slice the data first then calculate quantiles
        for each slice
    quantiles : `list` [`float`]
        The quantiles calculated for each slice.
        These quantiles can be then used to construct the desired CIs.
        The default values [0.005, 0.025, 0.975, 0.995] can be used to construct
        99 and 95 percent CIs.
    quantile_estimation_method : `str`
        There are two options implemented for the quantile estimation method
        (conditional on slice):

            - "normal_fit": it uses the standard deviation of the values in each
              slice to compute normal distribution quantiles
            - "ecdf": it uses the values directly to calculate sample quantiles

    sample_size_thresh : int
        The minimum sample size for each slice where we allow for using the conditional
        distribution (conditioned on the "conditional_cols" argument).
        If sample size for that slice is smaller than this,
        we fall back to a fallback method
    small_sample_size_method : str
        The method to use for slices with small sample size

            - "std_quantiles" method is implemented and it looks at the response
              std for each slice with
              sample size >= "sample_size_thresh"
              and takes the row whose std rank (divided by the number of
              such slices) is closest
              to "small_sample_size_quantile".
              It assigns that row to act as fall-back for calculating conf
              intervals.
            - None: no special handling, the overall distribution is the
              fall-back.

    small_sample_size_quantile : float
        The target quantile (between 0 and 1) of the per-slice std used by
        the "std_quantiles" method to pick the fall-back slice.
    min_admissible_value : Union[float, double, int]
        This is the lowest admissible value for the obtained ci limits
        and any value below this will be mapped back to this value.
    max_admissible_value : Union[float, double, int]
        This is the highest admissible value for the obtained ci limits
        and any higher value will be mapped back to this value.

    Returns
    -------
    uncertainty_model : `dict`
        Dict with following items (it is the ``ci_model`` input of ``predict_ci``).

            - "ecdf_df": pandas.DataFrame
                ecdf_df generated by "estimate_empirical_distribution"
                (with normal quantiles attached for "normal_fit").
                With small_sample_size_method = "std_quantiles" only slices
                with sample size >= ``sample_size_thresh`` are kept; if there
                is no such slice it is the overall fall-back data.
            - "ecdf_df_overall": pandas.DataFrame
                ecdf_df_overall generated by "estimate_empirical_distribution"
            - "ecdf_df_fallback": pandas.DataFrame
                ecdf_df_fallback, a fall back data to get the CI quantiles
                when the sample size for that slice is small or that slice
                is unobserved in that case.

                    - if small_sample_size_method = "std_quantiles",
                      we use std quantiles to pick a slice which has a std close
                      to that quantile and fall-back to that slice.
                    - otherwise we fallback to "ecdf_overall"

            - "value_col": the input ``value_col``
            - "min_admissible_value": the input ``min_admissible_value``
            - "max_admissible_value": the input ``max_admissible_value``
            - "conditional_cols": the input ``conditional_cols``
            - "std_col": str
                name of the std column, "{distribution_col}_std" where the
                distribution column is ``residual_col`` if given and
                ``value_col`` otherwise
            - "quantile_summary_col": str
                name of the column holding the quantiles:
                "{distribution_col}_quantile_summary" for "ecdf" and
                "normal_quantiles" for "normal_fit"
            - "fall_back_for_all": bool
                True if no slice had sufficient sample size
            - "is_residual_based": bool
                True if ``residual_col`` was provided

    Raises
    ------
    NotImplementedError
        If ``quantile_estimation_method`` is not "ecdf" or "normal_fit", or
        if ``small_sample_size_method`` is neither "std_quantiles" nor None.
    """
    is_residual_based = (residual_col is not None)
    distribution_col = residual_col if is_residual_based else value_col
    std_col = f"{distribution_col}_std"
    sample_size_col = f"{distribution_col}_count"

    model_dict = estimate_empirical_distribution(
        df=df,
        value_col=distribution_col,
        quantile_grid_size=None,
        quantiles=quantiles,
        conditional_cols=conditional_cols)

    ecdf_df = model_dict["ecdf_df"]
    ecdf_df_overall = model_dict["ecdf_df_overall"]
    ecdf_df_fallback = ecdf_df_overall.copy()

    # two methods are implemented: ecdf; normal_fit.
    # we re-assign the quantile_summary_col if
    # the quantile_estimation_method is normal_fit
    if quantile_estimation_method == "ecdf":
        quantile_summary_col = f"{distribution_col}_quantile_summary"
    elif quantile_estimation_method == "normal_fit":
        quantile_summary_col = "normal_quantiles"
        if is_residual_based:
            # residual quantiles are centered at zero so that the values in
            # ``value_col`` are only offset, not perturbed
            mean_col = None
            fixed_mean = 0.0
        else:
            # NOTE(review): this assumes the frames returned by
            # ``estimate_empirical_distribution`` have a column named
            # ``value_col`` to act as the mean -- confirm.
            mean_col = value_col
            fixed_mean = None

        ecdf_df = normal_quantiles_df(
            df=ecdf_df,
            std_col=std_col,
            mean_col=mean_col,
            fixed_mean=fixed_mean,
            quantiles=quantiles)

        ecdf_df_fallback = normal_quantiles_df(
            df=ecdf_df_fallback,
            std_col=std_col,
            mean_col=mean_col,
            fixed_mean=fixed_mean,
            quantiles=quantiles)
    else:
        raise NotImplementedError(
                f"CI calculation method {quantile_estimation_method} is not either of: normal_fit; ecdf")

    # handling slices with small sample size
    # if a method is provided via the argument "small_sample_size_method" then it is used here
    # the idea is to take a relatively high volatility
    # when the new point does not have enough (as specified by "sample_size_thresh")
    # similar points in the past
    fall_back_for_all = False
    if small_sample_size_method == "std_quantiles":
        ecdf_df_large_ss = ecdf_df.loc[ecdf_df[sample_size_col] >= sample_size_thresh].reset_index(drop=True)
        if len(ecdf_df_large_ss) == 0:
            warnings.warn("No slice had sufficient sample size. We fall back to the overall distribution.")
            # If ``ecdf_df_large_ss`` is empty it means we do not have any sufficient
            # samples for any slices.
            # Therefore we have to fall back in all cases and we set ``ecdf_df``
            # to ``ecdf_df_fallback``
            ecdf_df = ecdf_df_fallback
            fall_back_for_all = True
        else:
            # Empirical quantile of each slice's std among the large slices:
            # zero-based rank divided by the number of slices.
            # ``rank`` is needed here; ``argsort`` returns the sorting
            # permutation, which is not the rank of each row.
            # Ties are broken by row order; NaN stds get a NaN rank.
            # The helper series are kept out of the dataframe, so they can not
            # clash with existing column names.
            std_rank = ecdf_df_large_ss[std_col].rank(method="first") - 1
            std_quantile = std_rank / len(ecdf_df_large_ss)
            # Distance between each slice's std quantile and ``small_sample_size_quantile``
            std_quantile_diff = (std_quantile - small_sample_size_quantile).abs()
            # Chooses the row with the closest std quantile to ``small_sample_size_quantile``
            # (the first such row in case of ties; NaN distances are skipped).
            # Note the resulting dataframe ``ecdf_df_fallback`` will have one row.
            # If every std is NaN no row can be chosen and the overall
            # distribution remains the fall-back.
            if std_quantile_diff.notna().any():
                ecdf_df_fallback = ecdf_df_large_ss.loc[[std_quantile_diff.idxmin()]]
            # we re-assign ecdf_df by removing the combinations with small sample size
            # this is done so that in predict phase those values are not populated from
            # small sample sizes and use ``ecdf_df_fallback``
            ecdf_df = ecdf_df_large_ss
    elif small_sample_size_method is not None:
        raise NotImplementedError(
            f"small_sample_size_method {small_sample_size_method} is not implemented.")

    return {
        "ecdf_df": ecdf_df,
        "ecdf_df_overall": ecdf_df_overall,
        "ecdf_df_fallback": ecdf_df_fallback,
        "value_col": value_col,
        "min_admissible_value": min_admissible_value,
        "max_admissible_value": max_admissible_value,
        "conditional_cols": conditional_cols,
        "std_col": std_col,
        "quantile_summary_col": quantile_summary_col,
        "fall_back_for_all": fall_back_for_all,
        "is_residual_based": is_residual_based}


def predict_ci(
        new_df,
        ci_model):
    """It applies on a dataframe (``new_df``) and attaches the quantiles needed.

    Rows whose slice (combination of ``conditional_cols``) has no match in
    the model's ``ecdf_df`` get their quantiles and std from the model's
    one-row fall-back data ``ecdf_df_fallback``.

    :param new_df: pd.Dataframe
        A dataframe with ``value_col`` column as mandatory and
        ``conditional_cols`` as optional depending on
        how the function ``conf_interval`` is called.
        The input is not modified.
    :param ci_model: dict
        Returned CI model from ``conf_interval``.
    :return: pd.Dataframe
        A dataframe with one row per row of ``new_df`` (index reset) and
        these columns only:

            - ``conditional_cols`` (if not None),
            - "{value_col}_quantile_summary": the quantiles, limited to
              [``min_admissible_value``, ``max_admissible_value``],
            - ``ERR_STD_COL``: the std of the matched (or fall-back) slice.
    """

    ecdf_df = ci_model["ecdf_df"]
    ecdf_df_fallback = ci_model["ecdf_df_fallback"]
    value_col = ci_model["value_col"]
    min_admissible_value = ci_model["min_admissible_value"]
    max_admissible_value = ci_model["max_admissible_value"]
    conditional_cols = ci_model["conditional_cols"]
    std_col = ci_model["std_col"]
    quantile_summary_col = ci_model["quantile_summary_col"]
    fall_back_for_all = ci_model["fall_back_for_all"]
    is_residual_based = ci_model["is_residual_based"]

    # copy ``new_df`` so that input df to predict is not altered
    new_df = new_df.copy().reset_index(drop=True)
    # a constant key column lets us attach the fall-back row to every row
    # of ``new_df`` with a plain merge
    new_df["temporary_overall_dummy"] = 0
    ecdf_df_fallback_dummy = ecdf_df_fallback.copy()
    ecdf_df_fallback_dummy["temporary_overall_dummy"] = 0
    new_df_fallback = pd.merge(
        new_df,
        ecdf_df_fallback_dummy,
        on=["temporary_overall_dummy"],
        how="left")

    if conditional_cols is None or fall_back_for_all:
        # no slicing is possible: every row uses the fall-back
        new_df_conditional = new_df_fallback.copy()
    else:
        new_df_conditional = pd.merge(
            new_df,
            ecdf_df,
            on=conditional_cols,
            how="left")
    del new_df_conditional["temporary_overall_dummy"]

    # when we have missing in the grouped case (which can happen if a level
    # in ``conditional_cols`` didn't appear in train dataset)
    # we fall back to the overall case
    # (the assignment is aligned on the row index of the two merged frames)
    for col in [quantile_summary_col, std_col]:
        na_index = new_df_conditional[col].isnull()
        new_df_conditional.loc[na_index, col] = (
            new_df_fallback.loc[na_index, col])

    # offsetting the values in ``value_col`` by residual quantiles
    # if ``is_residual_based``
    # NOTE(review): when not residual based, the column
    # "{value_col}_quantile_summary" is not created here and must already be
    # present after the merge; it equals ``quantile_summary_col`` only when the
    # model was fit with the "ecdf" method -- confirm the "normal_fit" case.
    value_quantile_summary_col = f"{value_col}_quantile_summary"
    if is_residual_based:
        new_df_conditional[value_quantile_summary_col] = offset_tuple_col(
            df=new_df_conditional,
            offset_col=value_col,
            tuple_col=quantile_summary_col)

    new_df_conditional = limit_tuple_col(
        df=new_df_conditional,
        tuple_col=value_quantile_summary_col,
        lower=min_admissible_value,
        upper=max_admissible_value)

    # only returning needed cols
    returned_cols = [value_quantile_summary_col, std_col]
    if conditional_cols is not None:
        returned_cols = conditional_cols + returned_cols

    # rename returns a new frame; an in-place rename on the column selection
    # would modify a derived frame and can trigger ``SettingWithCopyWarning``
    pred_df = new_df_conditional[returned_cols].rename(
        columns={std_col: ERR_STD_COL})

    return pred_df