
# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original authors: Reza Hosseini, Sayan Patra
"""Calculates uncertainty intervals from the conditional
empirical distribution of the residual.
"""

import warnings

import numpy as np
import pandas as pd

from greykite.algo.uncertainty.conditional.dataframe_utils import limit_tuple_col
from greykite.algo.uncertainty.conditional.dataframe_utils import offset_tuple_col
from greykite.algo.uncertainty.conditional.estimate_distribution import estimate_empirical_distribution
from greykite.algo.uncertainty.conditional.normal_quantiles import normal_quantiles_df
from greykite.common.constants import ERR_STD_COL
from greykite.common.constants import QUANTILE_SUMMARY_COL


def conf_interval(
        df,
        distribution_col,
        offset_col=None,
        conditional_cols=None,
        quantiles=(0.005, 0.025, 0.975, 0.995),
        quantile_estimation_method="normal_fit",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=None,
        max_admissible_value=None):
    """Calculates confidence intervals (CI) for the values given in
    ``distribution_col``. As many quantiles as needed can be calculated
    (specified by ``quantiles``), as opposed to only the two quantiles of a
    typical CI. Two methods are available to calculate the quantiles for each
    slice of the data (slices are given by ``conditional_cols``):

        - "normal_fit": CI is calculated using the quantiles of a fitted
          normal distribution.
        - "ecdf": CI is calculated using the quantiles of the empirical
          cumulative distribution function.

    ``offset_col`` is used in the prediction phase to shift the calculated
    quantiles appropriately.

    Parameters
    ----------
    df : `pandas.DataFrame`
        The dataframe with the following columns:

            - ``distribution_col``,
            - ``conditional_cols`` (optional),
            - ``offset_col`` (optional).
    distribution_col : `str`
        The column containing the values of the variable for which the
        confidence interval is needed.
    offset_col : `str` or None, default None
        The column containing the values by which the computed quantiles for
        ``distribution_col`` are shifted. Only used during the prediction
        phase. If None, quantiles are not shifted.
    conditional_cols : `list` [`str`] or None, default None
        The columns used to slice the data; quantiles are then calculated for
        each slice.
    quantiles : `list` [`float`], default (0.005, 0.025, 0.975, 0.995)
        The quantiles calculated for each slice, which can then be used to
        construct the desired CIs. The default values
        (0.005, 0.025, 0.975, 0.995) can be used to construct 99 and 95
        percent CIs.
    quantile_estimation_method : `str`, default "normal_fit"
        There are two options implemented for the quantile estimation method
        (conditional on the slice):

            - "normal_fit": Uses the standard deviation of the values in each
              slice to compute normal distribution quantiles.
            - "ecdf": Uses the empirical cumulative distribution function to
              calculate sample quantiles.
    sample_size_thresh : `int`, default 5
        The minimum sample size for a slice to use its conditional
        distribution (conditioned on ``conditional_cols``). If the sample
        size for a slice is smaller than this, the fallback method is used.
    small_sample_size_method : `str` or None, default "std_quantiles"
        The method to use for slices with a small sample size. Only the
        "std_quantiles" method is implemented: it looks at the std of the
        response for each slice with sample size >= ``sample_size_thresh``
        and takes the row whose std is closest to the
        ``small_sample_size_quantile`` quantile of those stds. That row acts
        as the fallback for calculating the CIs.
    small_sample_size_quantile : `float`, default 0.95
        The quantile of the slice stds used to pick the fallback slice.
    min_admissible_value : `float` or `int` or None, default None
        The lowest admissible value for the obtained CI limits; any value
        below this is mapped back to this value.
    max_admissible_value : `float` or `int` or None, default None
        The highest admissible value for the obtained CI limits; any value
        above this is mapped back to this value.

    Returns
    -------
    uncertainty_model : `dict`
        A dictionary with the following items, consumed by ``predict_ci`` to
        compute the quantiles:

            - "ecdf_df" : `pandas.DataFrame`
              ``ecdf_df`` generated by ``estimate_empirical_distribution``.
            - "ecdf_df_overall" : `pandas.DataFrame`
              ``ecdf_df_overall`` generated by
              ``estimate_empirical_distribution``.
            - "ecdf_df_fallback" : `pandas.DataFrame`
              Fallback data used to get the CI quantiles when the sample size
              for a slice is small or the slice was not observed in the
              training data.

                - If ``small_sample_size_method="std_quantiles"``, we use the
                  std quantiles to pick a slice whose std is close to
                  ``small_sample_size_quantile`` and fall back to that slice.
                - Otherwise we fall back to ``ecdf_df_overall``.
            - "distribution_col" : `str`
              Input ``distribution_col``.
            - "offset_col" : `str`
              Input ``offset_col``.
            - "quantiles" : `list` [`float`]
              Input ``quantiles``.
            - "min_admissible_value" : `float`
              Input ``min_admissible_value``.
            - "max_admissible_value" : `float`
              Input ``max_admissible_value``.
            - "conditional_cols" : `list` [`str`]
              Input ``conditional_cols``.
            - "std_col" : `str`
              The column name with the standard deviations.
            - "quantile_summary_col" : `str`
              The column name with the computed quantiles.
            - "fall_back_for_all" : `bool`
              Whether the fallback method is used for the whole dataset.
    """
    std_col = f"{distribution_col}_std"
    sample_size_col = f"{distribution_col}_count"

    model_dict = estimate_empirical_distribution(
        df=df,
        distribution_col=distribution_col,
        quantile_grid_size=None,
        quantiles=quantiles,
        conditional_cols=conditional_cols,
        remove_conditional_mean=True)
    ecdf_df = model_dict["ecdf_df"]
    ecdf_df_overall = model_dict["ecdf_df_overall"]

    # Two methods are implemented: "ecdf" and "normal_fit".
    if quantile_estimation_method == "ecdf":
        quantile_summary_col = f"{distribution_col}_ecdf_quantile_summary"
        # The sample quantiles are already available in ``ecdf_df``; the
        # overall distribution serves as the fallback. (Without this
        # assignment ``ecdf_df_fallback`` would be unbound in this branch.)
        ecdf_df_fallback = ecdf_df_overall
    elif quantile_estimation_method == "normal_fit":
        quantile_summary_col = f"{distribution_col}_normal_quantile_summary"
        ecdf_df = normal_quantiles_df(
            df=ecdf_df,
            std_col=std_col,
            mean_col=None,
            fixed_mean=0.0,
            quantiles=quantiles,
            quantile_summary_col=quantile_summary_col)
        ecdf_df_fallback = normal_quantiles_df(
            df=ecdf_df_overall,
            std_col=std_col,
            mean_col=None,
            fixed_mean=0.0,
            quantiles=quantiles,
            quantile_summary_col=quantile_summary_col)
    else:
        raise NotImplementedError(
            f"CI calculation method {quantile_estimation_method} "
            "is not one of: normal_fit; ecdf.")

    # Handles slices with small sample size.
    # If a method is provided via ``small_sample_size_method``, it is used
    # here. The idea is to assume a relatively high volatility when the new
    # point does not have enough similar points in the past
    # (as specified by ``sample_size_thresh``).
    fall_back_for_all = False
    if small_sample_size_method == "std_quantiles":
        ecdf_df_large_ss = ecdf_df.loc[
            ecdf_df[sample_size_col] >= sample_size_thresh].reset_index(drop=True)
        assert set(ecdf_df_large_ss.columns).intersection(
            ["std_quantile", "std_quantile_diff"]) == set(), (
            "column names: std_quantile, std_quantile_diff should not appear in ecdf_df")
        if len(ecdf_df_large_ss) == 0:
            warnings.warn(
                "No slice had sufficient sample size. "
                "We fall back to the overall distribution.")
            # If ``ecdf_df_large_ss`` is empty, no slice has a sufficient
            # sample size. Therefore we have to fall back in all cases and we
            # set ``ecdf_df`` to ``ecdf_df_fallback``.
            ecdf_df = ecdf_df_fallback
            fall_back_for_all = True
        else:
            # Computes each slice's quantile level from the rank of its std.
            # (``rank`` is used rather than ``numpy.argsort``, which returns
            # sorting indices, not ranks.)
            ecdf_df_large_ss["std_quantile"] = (
                ecdf_df_large_ss[std_col].rank() / ecdf_df_large_ss.shape[0])
            # Calculates the distance between the "std_quantile" column values
            # and ``small_sample_size_quantile``.
            ecdf_df_large_ss["std_quantile_diff"] = abs(
                ecdf_df_large_ss["std_quantile"] - small_sample_size_quantile)
            # Chooses the row whose "std_quantile" value is closest to
            # ``small_sample_size_quantile``.
            # Note that the resulting dataframe ``ecdf_df_fallback`` has
            # exactly one row.
            ecdf_df_fallback = ecdf_df_large_ss.loc[
                [ecdf_df_large_ss["std_quantile_diff"].idxmin()]]
            del ecdf_df_fallback["std_quantile"]
            del ecdf_df_fallback["std_quantile_diff"]
            del ecdf_df_large_ss["std_quantile"]
            del ecdf_df_large_ss["std_quantile_diff"]
            # Re-assigns ``ecdf_df`` with the small-sample combinations
            # removed. This is done so that in the predict phase those values
            # are not populated from small samples; ``ecdf_df_fallback`` is
            # used instead.
            ecdf_df = ecdf_df_large_ss
    elif small_sample_size_method is not None:
        raise NotImplementedError(
            f"small_sample_size_method {small_sample_size_method} is not implemented.")

    return {
        "ecdf_df": ecdf_df,
        "ecdf_df_overall": ecdf_df_overall,
        "ecdf_df_fallback": ecdf_df_fallback,
        "distribution_col": distribution_col,
        "offset_col": offset_col,
        "quantiles": quantiles,
        "min_admissible_value": min_admissible_value,
        "max_admissible_value": max_admissible_value,
        "conditional_cols": conditional_cols,
        "std_col": std_col,
        "quantile_summary_col": quantile_summary_col,
        "fall_back_for_all": fall_back_for_all}
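
# A minimal usage sketch of ``conf_interval`` (illustrative only; the toy
# data and the "dow"/"residual"/"forecast" column names are hypothetical,
# not part of the library API).
if __name__ == "__main__":
    np.random.seed(0)
    dow = np.tile(np.arange(7), 100)  # 7 slices, 100 samples each
    example_df = pd.DataFrame({
        "dow": dow,
        # Residual volatility grows with the day of week, so each slice gets
        # its own standard deviation.
        "residual": np.random.normal(size=dow.size) * (1.0 + dow / 7.0),
        "forecast": 10.0 + np.random.normal(size=dow.size)})
    example_model = conf_interval(
        df=example_df,
        distribution_col="residual",
        offset_col="forecast",
        conditional_cols=["dow"],
        quantiles=(0.025, 0.975))
    # One row per "dow" slice, with std, count and quantile-summary columns.
    print(example_model["ecdf_df"])
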
def predict_ci(
        new_df,
        ci_model):
    """Predicts the quantiles of ``distribution_col`` (defined in
    ``ci_model``) for each row of ``new_df``, shifting them by ``offset_col``
    when provided.

    Parameters
    ----------
    new_df : `pandas.DataFrame`
        Prediction dataframe which minimally includes ``offset_col`` and
        ``conditional_cols`` defined in ``ci_model``.
    ci_model : `dict`
        The CI model returned by ``conf_interval``.

    Returns
    -------
    pred_df : `pandas.DataFrame`
        A dataframe which includes ``new_df`` and new columns containing the
        quantiles.
    """
    ecdf_df = ci_model["ecdf_df"]
    ecdf_df_fallback = ci_model["ecdf_df_fallback"]
    offset_col = ci_model["offset_col"]
    min_admissible_value = ci_model["min_admissible_value"]
    max_admissible_value = ci_model["max_admissible_value"]
    conditional_cols = ci_model["conditional_cols"]
    std_col = ci_model["std_col"]
    quantile_summary_col = ci_model["quantile_summary_col"]
    fall_back_for_all = ci_model["fall_back_for_all"]

    # Copies ``new_df`` (via ``reset_index``) so that the input df is not
    # altered.
    pred_df = new_df.reset_index(drop=True)
    pred_df["temporary_overall_dummy"] = 0
    ecdf_df_fallback_dummy = ecdf_df_fallback.copy()
    ecdf_df_fallback_dummy["temporary_overall_dummy"] = 0
    pred_df_fallback = pd.merge(
        pred_df,
        ecdf_df_fallback_dummy,
        on=["temporary_overall_dummy"],
        how="left")
    del pred_df["temporary_overall_dummy"]
    del pred_df_fallback["temporary_overall_dummy"]

    if (conditional_cols is None) or (conditional_cols == []) or fall_back_for_all:
        pred_df_conditional = pred_df_fallback.copy()
    else:
        pred_df_conditional = pd.merge(
            pred_df,
            ecdf_df,
            on=conditional_cols,
            how="left")
        # When values are missing in the conditional case (which can happen
        # if a level in ``conditional_cols`` did not appear in the training
        # dataset), we fall back to the overall case.
        for col in [quantile_summary_col, std_col]:
            na_index = pred_df_conditional[col].isnull()
            pred_df_conditional.loc[na_index, col] = (
                pred_df_fallback.loc[na_index, col])

    # Offsets the computed quantiles by ``offset_col``.
    if offset_col is None:
        pred_df_conditional[QUANTILE_SUMMARY_COL] = pred_df_conditional[quantile_summary_col]
    else:
        pred_df_conditional[QUANTILE_SUMMARY_COL] = offset_tuple_col(
            df=pred_df_conditional,
            offset_col=offset_col,
            tuple_col=quantile_summary_col)
    pred_df_conditional = limit_tuple_col(
        df=pred_df_conditional,
        tuple_col=QUANTILE_SUMMARY_COL,
        lower=min_admissible_value,
        upper=max_admissible_value)

    # Only returns the needed columns.
    returned_cols = [QUANTILE_SUMMARY_COL, std_col]
    if conditional_cols is not None:
        returned_cols = conditional_cols + returned_cols
    pred_df[returned_cols] = pred_df_conditional[returned_cols]
    pred_df.rename(columns={std_col: ERR_STD_COL}, inplace=True)

    return pred_df
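
# A minimal end-to-end sketch (illustrative only; column names are
# hypothetical): fit the uncertainty model with ``conf_interval``, then
# compute shifted, clipped quantiles for new rows with ``predict_ci``.
if __name__ == "__main__":
    np.random.seed(1)
    dow = np.tile(np.arange(7), 100)
    train_df = pd.DataFrame({
        "dow": dow,
        "residual": np.random.normal(size=dow.size) * (1.0 + dow / 7.0),
        "forecast": 10.0 + dow})
    ci_model = conf_interval(
        df=train_df,
        distribution_col="residual",
        offset_col="forecast",
        conditional_cols=["dow"],
        quantiles=(0.025, 0.975),
        min_admissible_value=0.0)
    new_df = pd.DataFrame({"dow": [0, 3, 6], "forecast": [10.0, 13.0, 16.0]})
    pred_df = predict_ci(new_df=new_df, ci_model=ci_model)
    # ``QUANTILE_SUMMARY_COL`` holds a (lower, upper) tuple per row and
    # ``ERR_STD_COL`` the slice std; intervals are wider for larger "dow".
    print(pred_df)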