# Source code for greykite.framework.input.univariate_time_series

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen
"""Input timeseries."""

import warnings
from datetime import datetime
from functools import partial
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import numpy as np
import pandas as pd

from greykite.common.constants import TIME_COL
from greykite.common.constants import VALUE_COL
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.time_properties import describe_timeseries
from greykite.common.time_properties import get_canonical_data
from greykite.common.viz.timeseries_plotting import add_groupby_column
from greykite.common.viz.timeseries_plotting import flexible_grouping_evaluation
from greykite.common.viz.timeseries_plotting import grouping_evaluation
from greykite.common.viz.timeseries_plotting import plot_multivariate
from greykite.common.viz.timeseries_plotting import plot_univariate
from greykite.framework.constants import MEAN_COL_GROUP
from greykite.framework.constants import OVERLAY_COL_GROUP
from greykite.framework.constants import QUANTILE_COL_GROUP


class UnivariateTimeSeries:
    """Defines univariate time series input.

    The dataset can include regressors, but only one metric is designated
    as the target metric to forecast.

    Loads time series into a standard format. Provides statistics, plotting
    functions, and ability to generate future dataframe for prediction.

    Attributes
    ----------
    df : `pandas.DataFrame`
        Data frame containing timestamp and value, with standardized column
        names for internal use (TIME_COL, VALUE_COL). Rows are sorted by time
        index, and missing gaps between dates are filled in so that dates are
        spaced at regular intervals. Values are adjusted for anomalies
        according to ``anomaly_info``.
        The index can be timezone aware (but TIME_COL is not).
    y : `pandas.Series`, dtype float64
        Value of time series to forecast.
    time_stats : `dict`
        Summary statistics about the timestamp column.
    value_stats : `dict`
        Summary statistics about the value column.
    original_time_col : `str`
        Name of time column in original input data.
    original_value_col : `str`
        Name of value column in original input data.
    regressor_cols : `list` [`str`]
        A list of regressor columns in the training and prediction DataFrames.
    lagged_regressor_cols : `list` [`str`]
        A list of additional columns needed for lagged regressors
        in the training and prediction DataFrames.
    last_date_for_val : `datetime.datetime` or None, default None
        Date or timestamp corresponding to last non-null value in
        ``df[original_value_col]``.
    last_date_for_reg : `datetime.datetime` or None, default None
        Date or timestamp corresponding to last non-null value in
        ``df[regressor_cols]``. If ``regressor_cols`` is None,
        ``last_date_for_reg`` is None.
    last_date_for_lag_reg : `datetime.datetime` or None, default None
        Date or timestamp corresponding to last non-null value in
        ``df[lagged_regressor_cols]``. If ``lagged_regressor_cols`` is None,
        ``last_date_for_lag_reg`` is None.
    train_end_date : `datetime.datetime`
        Last date or timestamp in ``fit_df``. It is always less than or equal
        to minimum non-null values of ``last_date_for_val`` and
        ``last_date_for_reg``.
    fit_cols : `list` [`str`]
        A list of columns used in the training and prediction DataFrames.
    fit_df : `pandas.DataFrame`
        Data frame containing timestamp and value, with standardized column
        names for internal use. Will be used for fitting (train, cv, backtest).
    fit_y : `pandas.Series`, dtype float64
        Value of time series for fit_df.
    freq : `str`
        timeseries frequency, DateOffset alias, e.g. {'T' (minute), 'H', 'D',
        'W', 'M' (month end), 'MS' (month start), 'Y' (year end), 'YS' (year start)}
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    anomaly_info : `dict` or `list` [`dict`] or None, default None
        Anomaly adjustment info. Anomalies in ``df`` are corrected before
        any forecasting is done. See ``self.load_data()``.
    df_before_adjustment : `pandas.DataFrame` or None, default None
        ``self.df`` before adjustment by ``anomaly_info``.
        Used by ``self.plot()`` to show the adjustment.
    """
    def __init__(self) -> None:
        self.df: Optional[pd.DataFrame] = None
        self.y: Optional[pd.Series] = None
        self.time_stats: Optional[Dict] = None
        self.value_stats: Optional[Dict] = None
        self.original_time_col: Optional[str] = None
        self.original_value_col: Optional[str] = None
        self.regressor_cols: List[str] = []
        self.lagged_regressor_cols: List[str] = []
        self.last_date_for_val: Optional[datetime] = None
        self.last_date_for_reg: Optional[datetime] = None
        self.last_date_for_lag_reg: Optional[datetime] = None
        # Fixed annotation: holds a datetime (last date in ``fit_df``), not a str.
        self.train_end_date: Optional[datetime] = None
        self.fit_cols: List[str] = []
        self.fit_df: Optional[pd.DataFrame] = None
        # Fixed annotation: holds a Series (``fit_df[VALUE_COL]``), not a DataFrame.
        self.fit_y: Optional[pd.Series] = None
        self.freq: Optional[str] = None
        self.anomaly_info: Optional[Union[Dict, List[Dict]]] = None
        self.df_before_adjustment: Optional[pd.DataFrame] = None
[docs] def load_data( self, df: pd.DataFrame, time_col: str = TIME_COL, value_col: str = VALUE_COL, freq: str = None, date_format: str = None, tz: str = None, train_end_date: datetime = None, regressor_cols: List[str] = None, lagged_regressor_cols: List[str] = None, anomaly_info: Optional[Union[Dict, List[Dict]]] = None): """Loads data to internal representation. Parses date column, sets timezone aware index. Checks for irregularities and raises an error if input is invalid. Adjusts for anomalies according to ``anomaly_info``. Parameters ---------- df : `pandas.DataFrame` Input timeseries. A data frame which includes the timestamp column as well as the value column. time_col : `str` The column name in ``df`` representing time for the time series data. The time column can be anything that can be parsed by pandas DatetimeIndex. value_col: `str` The column name which has the value of interest to be forecasted. freq : `str` or None, default None Timeseries frequency, DateOffset alias, If None automatically inferred. date_format : `str` or None, default None strftime format to parse time column, eg ``%m/%d/%Y``. Note that ``%f`` will parse all the way up to nanoseconds. If None (recommended), inferred by `pandas.to_datetime`. tz : `str` or pytz.timezone object or None, default None Passed to `pandas.tz_localize` to localize the timestamp. train_end_date : `datetime.datetime` or None, default None Last date to use for fitting the model. Forecasts are generated after this date. If None, it is set to the minimum of ``self.last_date_for_val`` and ``self.last_date_for_reg``. regressor_cols: `list` [`str`] or None, default None A list of regressor columns used in the training and prediction DataFrames. If None, no regressor columns are used. Regressor columns that are unavailable in ``df`` are dropped. lagged_regressor_cols: `list` [`str`] or None, default None A list of additional columns needed for lagged regressors in the training and prediction DataFrames. 
This list can have overlap with ``regressor_cols``. If None, no additional columns are added to the DataFrame. Lagged regressor columns that are unavailable in ``df`` are dropped. anomaly_info : `dict` or `list` [`dict`] or None, default None Anomaly adjustment info. Anomalies in ``df`` are corrected before any forecasting is done. If None, no adjustments are made. A dictionary containing the parameters to `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`. See that function for details. The possible keys are: ``"value_col"`` : `str` The name of the column in ``df`` to adjust. You may adjust the value to forecast as well as any numeric regressors. ``"anomaly_df"`` : `pandas.DataFrame` Adjustments to correct the anomalies. ``"start_date_col"``: `str`, default START_DATE_COL Start date column in ``anomaly_df``. ``"end_date_col"``: `str`, default END_DATE_COL End date column in ``anomaly_df``. ``"adjustment_delta_col"``: `str` or None, default None Impact column in ``anomaly_df``. ``"filter_by_dict"``: `dict` or None, default None Used to filter ``anomaly_df`` to the relevant anomalies for the ``value_col`` in this dictionary. Key specifies the column name, value specifies the filter value. ``"filter_by_value_col""``: `str` or None, default None Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict`` if not None, for the ``value_col`` in this dictionary. ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add" How to make the adjustment, if ``adjustment_delta_col`` is provided. Accepts a list of such dictionaries to adjust multiple columns in ``df``. Returns ------- self : Returns self. Sets ``self.df`` with standard column names, value adjusted for anomalies, and time gaps filled in, sorted by time index. 
""" self.original_time_col = time_col self.original_value_col = value_col self.anomaly_info = anomaly_info canonical_data_dict = get_canonical_data( df=df, time_col=time_col, value_col=value_col, freq=freq, date_format=date_format, tz=tz, train_end_date=train_end_date, regressor_cols=regressor_cols, lagged_regressor_cols=lagged_regressor_cols, anomaly_info=anomaly_info) self.df = canonical_data_dict["df"] self.df_before_adjustment = canonical_data_dict["df_before_adjustment"] self.fit_df = canonical_data_dict["fit_df"] self.freq = canonical_data_dict["freq"] self.time_stats = canonical_data_dict["time_stats"] self.regressor_cols = canonical_data_dict["regressor_cols"] self.lagged_regressor_cols = canonical_data_dict["lagged_regressor_cols"] self.fit_cols = canonical_data_dict["fit_cols"] self.train_end_date = canonical_data_dict["train_end_date"] self.last_date_for_val = canonical_data_dict["last_date_for_val"] self.last_date_for_reg = canonical_data_dict["last_date_for_reg"] self.last_date_for_lag_reg = canonical_data_dict["last_date_for_lag_reg"] # y (possibly with null values) after gaps have been filled in and anomalies corrected self.y = self.df[VALUE_COL] self.fit_y = self.fit_df[VALUE_COL] # computes statistics of processed dataset self.describe_time_col() self.describe_value_col() # compute value statistics log_message(f"last date for fit: {self.train_end_date}", LoggingLevelEnum.INFO) log_message(f"last date for {self.original_value_col}: {self.last_date_for_val}", LoggingLevelEnum.INFO) log_message(f"last date with any regressor: {self.last_date_for_reg}", LoggingLevelEnum.INFO) log_message(f"columns available to use as regressors: {', '.join(self.regressor_cols)}", LoggingLevelEnum.INFO) log_message(f"columns available to use as lagged regressors: {', '.join(self.lagged_regressor_cols)}", LoggingLevelEnum.INFO) return self
[docs] def describe_time_col(self): """Basic descriptive stats on the timeseries time column. Returns ------- time_stats: `dict` Dictionary with descriptive stats on the timeseries time column. * data_points: int number of time points * mean_increment_secs: float mean frequency * min_timestamp: datetime64 start date * max_timestamp: datetime64 end date """ if self.df is None: raise RuntimeError("Must load data before describing dataset") timeseries_info = describe_timeseries(df=self.df, time_col=TIME_COL) data_points = self.df.shape[0] mean_increment_secs = timeseries_info["mean_increment_secs"] min_timestamp = timeseries_info["min_timestamp"] max_timestamp = timeseries_info["max_timestamp"] log_message("Input time stats:", LoggingLevelEnum.INFO) log_message(f" data points: {data_points}", LoggingLevelEnum.INFO) log_message(f" avg increment (sec): {mean_increment_secs:.2f}", LoggingLevelEnum.INFO) log_message(f" start date: {min_timestamp}", LoggingLevelEnum.INFO) log_message(f" end date: {max_timestamp}", LoggingLevelEnum.INFO) time_stats = { "data_points": data_points, # total number of time points, including missing ones "mean_increment_secs": mean_increment_secs, # after filling in gaps "min_timestamp": min_timestamp, "max_timestamp": max_timestamp, } self.time_stats.update(time_stats) # compute time statistics return time_stats
[docs] def describe_value_col(self): """Basic descriptive stats on the timeseries value column. Returns ------- value_stats : `dict` [`str`, `float`] Dict with keys: count, mean, std, min, 25%, 50%, 75%, max """ if self.df is None: raise RuntimeError("Must load data before describing values") self.value_stats = self.df[VALUE_COL].describe() # count is the total number of provided timepoints log_message("Input value stats:", LoggingLevelEnum.INFO) log_message(repr(self.value_stats), LoggingLevelEnum.INFO) return self.value_stats
[docs] def make_future_dataframe(self, periods: int = None, include_history=True): """Extends the input data for prediction into the future. Includes the historical values (VALUE_COL) so this can be fed into a Pipeline that transforms input data for fitting, and for use in evaluation. Parameters ---------- periods : int or None Number of periods to forecast. If there are no regressors, default is 30. If there are regressors, default is to predict all available dates. include_history : bool Whether to return historical dates and values with future dates. Returns ------- future_df : `pandas.DataFrame` Dataframe with future timestamps for prediction. Contains columns for: * prediction dates (``TIME_COL``), * values (``VALUE_COL``), * optional regressors """ if self.df is None: raise RuntimeError("Must load data before generating future dates.") # determines the number of future periods to predict if self.regressor_cols: max_regressor_periods = len(self.df[ (self.df[TIME_COL] > self.train_end_date) & (self.df[TIME_COL] <= self.last_date_for_reg) ]) if periods is None: periods = max_regressor_periods elif periods > max_regressor_periods: warnings.warn( f"Provided periods '{periods}' is more than allowed ('{max_regressor_periods}') due to " f"the length of regressor columns. 
Using '{max_regressor_periods}'.", UserWarning) periods = max_regressor_periods elif periods is None: periods = 30 # the future dates for prediction dates = pd.date_range( start=self.train_end_date, periods=periods + 1, # an extra in case we include start freq=self.freq) dates = dates[dates > self.train_end_date] # drops values up to train_end_date dates = dates[:periods] # returns the correct number of periods if self.regressor_cols: # return TIME_COL, VALUE_COL, and regressors last_date_for_predict = dates.max() if include_history: valid_indices = (self.df[TIME_COL] <= last_date_for_predict) else: valid_indices = ((self.df[TIME_COL] > self.train_end_date) & (self.df[TIME_COL] <= last_date_for_predict)) future_df = self.df[valid_indices] else: # return TIME_COL, VALUE_COL future_df = self.df.reindex(index=dates) future_df[TIME_COL] = future_df.index if include_history: future_df = pd.concat([self.fit_df, future_df], axis=0, sort=False) return future_df[self.fit_cols]
[docs] def plot( self, color="rgb(32, 149, 212)", show_anomaly_adjustment=False, **kwargs): """Returns interactive plotly graph of the value against time. If anomaly info is provided, there is an option to show the anomaly adjustment. Parameters ---------- color : `str`, default "rgb(32, 149, 212)" (light blue) Color of the value line (after adjustment, if applicable). show_anomaly_adjustment : `bool`, default False Whether to show the anomaly adjustment. kwargs : additional parameters Additional parameters to pass to `~greykite.common.viz.timeseries_plotting.plot_univariate` such as title and color. Returns ------- fig : `plotly.graph_objects.Figure` Interactive plotly graph of the value against time. See `~greykite.common.viz.timeseries_plotting.plot_forecast_vs_actual` return value for how to plot the figure and add customization. """ df = self.df.copy() # Plots value after anomaly adjustment y_col_style_dict = { VALUE_COL: dict( name=self.original_value_col, mode="lines", line=dict( color=color, ), opacity=0.8 ) } if show_anomaly_adjustment: if self.anomaly_info is not None: # Adds value before adjustment to ``df`` postfix = "_unadjusted" df[f"{VALUE_COL}{postfix}"] = self.df_before_adjustment[VALUE_COL] y_col_style_dict[f"{VALUE_COL}{postfix}"] = dict( name=f"{self.original_value_col}{postfix}", mode="lines", line=dict( color="#B3B3B3", # light gray ), opacity=0.8 ) else: raise ValueError("There is no `anomaly_info` to show. `show_anomaly_adjustment` must be False.") return plot_multivariate( df, TIME_COL, y_col_style_dict, xlabel=self.original_time_col, ylabel=self.original_value_col, **kwargs)
[docs] def get_grouping_evaluation( self, aggregation_func=np.nanmean, aggregation_func_name="mean", groupby_time_feature=None, groupby_sliding_window_size=None, groupby_custom_column=None): """Group-wise computation of aggregated timeSeries value. Can be used to evaluate error/ aggregated value by a time feature, over time, or by a user-provided column. Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``, ``groupby_custom_column`` must be provided. Parameters ---------- aggregation_func : callable, optional, default ``numpy.nanmean`` Function that aggregates an array to a number. Signature (y: array) -> aggregated value: float. aggregation_func_name : `str` or None, optional, default "mean" Name of grouping function, used to report results. If None, defaults to "aggregation". groupby_time_feature : `str` or None, optional If provided, groups by a column generated by `~greykite.common.features.timeseries_features.build_time_features_df`. See that function for valid values. groupby_sliding_window_size : `int` or None, optional If provided, sequentially partitions data into groups of size ``groupby_sliding_window_size``. groupby_custom_column : `pandas.Series` or None, optional If provided, groups by this column value. Should be same length as the DataFrame. Returns ------- grouped_df : `pandas.DataFrame` with two columns: (1) grouping_func_name: evaluation metric for aggregation of timeseries. (2) group name: group name depends on the grouping method: ``groupby_time_feature`` for ``groupby_time_feature`` ``cst.TIME_COL`` for ``groupby_sliding_window_size`` ``groupby_custom_column.name`` for ``groupby_custom_column``. 
""" df = self.df.copy() if aggregation_func_name: grouping_func_name = f"{aggregation_func_name} of {VALUE_COL}" else: grouping_func_name = f"aggregation of {VALUE_COL}" def grouping_func(grp): return aggregation_func(grp[VALUE_COL]) result = add_groupby_column( df=df, time_col=TIME_COL, groupby_time_feature=groupby_time_feature, groupby_sliding_window_size=groupby_sliding_window_size, groupby_custom_column=groupby_custom_column) grouped_df = grouping_evaluation( df=result["df"], groupby_col=result["groupby_col"], grouping_func=grouping_func, grouping_func_name=grouping_func_name) return grouped_df
[docs] def plot_grouping_evaluation( self, aggregation_func=np.nanmean, aggregation_func_name="mean", groupby_time_feature=None, groupby_sliding_window_size=None, groupby_custom_column=None, xlabel=None, ylabel=None, title=None): """Computes aggregated timeseries by group and plots the result. Can be used to plot aggregated timeseries by a time feature, over time, or by a user-provided column. Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``, ``groupby_custom_column`` must be provided. Parameters ---------- aggregation_func : callable, optional, default ``numpy.nanmean`` Function that aggregates an array to a number. Signature (y: array) -> aggregated value: float. aggregation_func_name : `str` or None, optional, default "mean" Name of grouping function, used to report results. If None, defaults to "aggregation". groupby_time_feature : `str` or None, optional If provided, groups by a column generated by `~greykite.common.features.timeseries_features.build_time_features_df`. See that function for valid values. groupby_sliding_window_size : `int` or None, optional If provided, sequentially partitions data into groups of size ``groupby_sliding_window_size``. groupby_custom_column : `pandas.Series` or None, optional If provided, groups by this column value. Should be same length as the DataFrame. xlabel : `str`, optional, default None X-axis label of the plot. ylabel : `str`, optional, default None Y-axis label of the plot. title : `str` or None, optional Plot title. If None, default is based on axis labels. Returns ------- fig : `plotly.graph_objects.Figure` plotly graph object showing aggregated timeseries by group. x-axis label depends on the grouping method: ``groupby_time_feature`` for ``groupby_time_feature`` ``TIME_COL`` for ``groupby_sliding_window_size`` ``groupby_custom_column.name`` for ``groupby_custom_column``. 
""" grouped_df = self.get_grouping_evaluation( aggregation_func=aggregation_func, aggregation_func_name=aggregation_func_name, groupby_time_feature=groupby_time_feature, groupby_sliding_window_size=groupby_sliding_window_size, groupby_custom_column=groupby_custom_column) xcol, ycol = grouped_df.columns fig = plot_univariate( df=grouped_df, x_col=xcol, y_col=ycol, xlabel=xlabel, ylabel=ylabel, title=title) return fig
[docs] def get_quantiles_and_overlays( self, groupby_time_feature=None, groupby_sliding_window_size=None, groupby_custom_column=None, show_mean=False, show_quantiles=False, show_overlays=False, overlay_label_time_feature=None, overlay_label_sliding_window_size=None, overlay_label_custom_column=None, center_values=False, value_col=VALUE_COL, mean_col_name="mean", quantile_col_prefix="Q", **overlay_pivot_table_kwargs): """Computes mean, quantiles, and overlays by the requested grouping dimension. Overlays are best explained in the plotting context. The grouping dimension goes on the x-axis, and one line is shown for each level of the overlay dimension. This function returns a column for each line to plot (e.g. mean, each quantile, each overlay value). Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``, ``groupby_custom_column`` must be provided as the grouping dimension. If ``show_overlays`` is True, exactly one of: ``overlay_label_time_feature``, ``overlay_label_sliding_window_size``, ``overlay_label_custom_column`` can be provided to specify the ``label_col`` (overlay dimension). Internally, the function calls `pandas.DataFrame.pivot_table` with ``index=groupby_col``, ``columns=label_col``, ``values=value_col`` to get the overlay values for plotting. You can pass additional parameters to `pandas.DataFrame.pivot_table` via ``overlay_pivot_table_kwargs``, e.g. to change the aggregation method. If an explicit label is not provided, the records are labeled by their position within the group. 
For example, to show yearly seasonality mean, quantiles, and overlay plots for each individual year, use:: self.get_quantiles_and_overlays( groupby_time_feature="doy", # Rows: a row for each day of year (1, 2, ..., 366) show_mean=True, # mean value on that day show_quantiles=[0.1, 0.9], # quantiles of the observed distribution on that day show_overlays=True, # Include overlays defined by ``overlay_label_time_feature`` overlay_label_time_feature="year") # One column for each observed "year" (2016, 2017, 2018, ...) To show weekly seasonality over time, use:: self.get_quantiles_and_overlays( groupby_time_feature="dow", # Rows: a row for each day of week (1, 2, ..., 7) show_mean=True, # mean value on that day show_quantiles=[0.1, 0.5, 0.9], # quantiles of the observed distribution on that day show_overlays=True, # Include overlays defined by ``overlay_label_time_feature`` overlay_label_sliding_window_size=90, # One column for each 90 period sliding window in the dataset, aggfunc="median") # overlay value is the median value for the dow over the period (default="mean"). It may be difficult to assess the weekly seasonality from the previous result, because overlays shift up/down over time due to trend/yearly seasonality. Use ``center_values=True`` to adjust each overlay so its average value is centered at 0. Mean and quantiles are shifted by a single constant to center the mean at 0, while preserving their relative values:: self.get_quantiles_and_overlays( groupby_time_feature="dow", show_mean=True, show_quantiles=[0.1, 0.5, 0.9], show_overlays=True, overlay_label_sliding_window_size=90, aggfunc="median", center_values=True) # Centers the output Centering reduces the variability in the overlays to make it easier to isolate the effect by the groupby column. As a result, centered overlays have smaller variability than that reported by the quantiles, which operate on the original, uncentered data points. Similarly, if overlays are aggregates of individual values (i.e. 
``aggfunc`` is needed in the call to `pandas.DataFrame.pivot_table`), the quantiles of overlays will be less extreme than those of the original data. - To assess variability conditioned on the groupby value, check the quantiles. - To assess variability conditioned on both the groupby and overlay value, after any necessary aggregation, check the variability of the overlay values. Compute quantiles of overlays from the return value if desired. Parameters ---------- groupby_time_feature : `str` or None, default None If provided, groups by a column generated by `~greykite.common.features.timeseries_features.build_time_features_df`. See that function for valid values. groupby_sliding_window_size : `int` or None, default None If provided, sequentially partitions data into groups of size ``groupby_sliding_window_size``. groupby_custom_column : `pandas.Series` or None, default None If provided, groups by this column value. Should be same length as the DataFrame. show_mean : `bool`, default False Whether to return the mean value by the groupby column. show_quantiles : `bool` or `list` [`float`] or `numpy.array`, default False Whether to return the quantiles of the value by the groupby column. If False, does not return quantiles. If True, returns default quantiles (0.1 and 0.9). If array-like, a list of quantiles to compute (e.g. (0.1, 0.25, 0.75, 0.9)). show_overlays : `bool` or `int` or array-like [`int` or `str`] or None, default False Whether to return overlays of the value by the groupby column. If False, no overlays are shown. If True and ``label_col`` is defined, calls `pandas.DataFrame.pivot_table` with ``index=groupby_col``, ``columns=label_col``, ``values=value_col``. ``label_col`` is defined by one of ``overlay_label_time_feature``, ``overlay_label_sliding_window_size``, or ``overlay_label_custom_column``. Returns one column for each value of the ``label_col``. If True and the ``label_col`` is not defined, returns the raw values within each group. 
Values across groups are put into columns by their position in the group (1st element in group, 2nd, 3rd, etc.). Positional order in a group is not guaranteed to correspond to anything meaningful, so the items within a column may not have anything in common. It is better to specify one of ``overlay_*`` to explicitly define the overlay labels. If an integer, the number of overlays to randomly sample. The same as True, then randomly samples up to `int` columns. This is useful if there are too many values. If a list [int], a list of column indices (int type). The same as True, then selects the specified columns by index. If a list [str], a list of column names. Column names are matched by their string representation to the names in this list. The same as True, then selects the specified columns by name. overlay_label_time_feature : `str` or None, default None If ``show_overlays`` is True, can be used to define ``label_col``, i.e. which dimension to show separately as overlays. If provided, uses a column generated by `~greykite.common.features.timeseries_features.build_time_features_df`. See that function for valid values. overlay_label_sliding_window_size : `int` or None, default None If ``show_overlays`` is True, can be used to define ``label_col``, i.e. which dimension to show separately as overlays. If provided, uses a column that sequentially partitions data into groups of size ``groupby_sliding_window_size``. overlay_label_custom_column : `pandas.Series` or None, default None If ``show_overlays`` is True, can be used to define ``label_col``, i.e. which dimension to show separately as overlays. If provided, uses this column value. Should be same length as the DataFrame. value_col : `str`, default VALUE_COL The column name for the value column. By default, shows the univariate time series value, but it can be any other column in ``self.df``. mean_col_name : `str`, default "mean" The name to use for the mean column in the output. Applies if ``show_mean=True``. 
quantile_col_prefix : `str`, default "Q" The prefix to use for quantile column names in the output. Columns are named with this prefix followed by the quantile, rounded to 2 decimal places. center_values : `bool`, default False Whether to center the return values. If True, shifts each overlay so its average value is centered at 0. Shifts mean and quantiles by a constant to center the mean at 0, while preserving their relative values. If False, values are not centered. overlay_pivot_table_kwargs : additional parameters Additional keyword parameters to pass to `pandas.DataFrame.pivot_table`, used in generating the overlays. See above description for details. Returns ------- grouped_df : `pandas.DataFrame` Dataframe with mean, quantiles, and overlays by the grouping column. Overlays are defined by the grouping column and overlay dimension. ColumnIndex is a multiindex with first level as the "category", a subset of [MEAN_COL_GROUP, QUANTILE_COL_GROUP, OVERLAY_COL_GROUP] depending on what is requests. - grouped_df[MEAN_COL_GROUP] = df with single column, named ``mean_col_name``. - grouped_df[QUANTILE_COL_GROUP] = df with a column for each quantile, named f"{quantile_col_prefix}{round(str(q))}", where ``q`` is the quantile. - grouped_df[OVERLAY_COL_GROUP] = df with one column per overlay value, named by the overlay value. For example, it might look like:: category mean quantile overlay name mean Q0.1 Q0.9 2007 2008 2009 doy 1 8.42 7.72 9.08 8.29 7.75 8.33 2 8.82 8.20 9.56 8.43 8.80 8.53 3 8.95 8.25 9.88 8.26 9.12 8.70 4 9.07 8.60 9.49 8.10 9.99 8.73 5 8.73 8.29 9.24 7.95 9.26 8.37 ... ... ... ... ... ... ... 
""" # Default quantiles to show if `show_quantiles` is boolean if isinstance(show_quantiles, bool): if show_quantiles: show_quantiles = [0.1, 0.9] else: show_quantiles = None # Adds grouping dimension result = add_groupby_column( df=self.df, time_col=TIME_COL, # Already standardized groupby_time_feature=groupby_time_feature, groupby_sliding_window_size=groupby_sliding_window_size, groupby_custom_column=groupby_custom_column) df = result["df"] groupby_col = result["groupby_col"] grouped_df = None # Whether an overlay label is provided add_overlay_label = (overlay_label_time_feature is not None) or \ (overlay_label_sliding_window_size is not None) or \ (overlay_label_custom_column is not None) overlay_df = None # Defines an aggregation function to compute mean, quantiles, and overlays agg_kwargs = {} if show_mean: agg_kwargs.update({mean_col_name: pd.NamedAgg(column=value_col, aggfunc=np.nanmean)}) if show_quantiles is not None: # Returns the quantiles of the group's `value_col` as a list agg_kwargs.update({quantile_col_prefix: pd.NamedAgg( column=value_col, aggfunc=lambda grp_values: partial(np.nanquantile, q=show_quantiles)(grp_values).tolist())}) if show_overlays is not False: if add_overlay_label: # Uses DataFrame pivot_table to get overlay labels as columns, `groupby_col` as index label_result = add_groupby_column( df=df, time_col=TIME_COL, groupby_time_feature=overlay_label_time_feature, groupby_sliding_window_size=overlay_label_sliding_window_size, groupby_custom_column=overlay_label_custom_column) label_col = label_result["groupby_col"] overlay_df = label_result["df"].pivot_table( index=groupby_col, columns=label_col, values=value_col, **overlay_pivot_table_kwargs) else: # Uses aggregation to get overlays. # Takes original values within each group. # Values across groups are put into columns by their position # within the group (1st element in group, 2nd, 3rd, etc.) 
agg_kwargs.update({"overlay": pd.NamedAgg(column=value_col, aggfunc=tuple)}) # Names the quantile columns # Keeps to 2 decimal places to handle numerical imprecision. list_names_dict = {quantile_col_prefix: [ f"{quantile_col_prefix}{str(round(x, 2))}" for x in show_quantiles]}\ if show_quantiles is not None else {} if agg_kwargs: grouped_df = flexible_grouping_evaluation( result["df"], map_func_dict=None, groupby_col=result["groupby_col"], agg_kwargs=agg_kwargs, extend_col_names=False, unpack_list=True, list_names_dict=list_names_dict) # Adds overlays if requested and not already computed during aggregation if overlay_df is not None: overlay_df.columns = map(str, overlay_df.columns) # Either overlay_df or grouped_df is populated if grouped_df is None and overlay_df is None: raise ValueError("Must enable at least one of: show_mean, show_quantiles, show_overlays.") grouped_df = pd.concat([grouped_df, overlay_df], axis=1) # Creates MultiIndex for column names to categorize the column names by their type mean_cols = [mean_col_name] if show_mean else [] quantile_cols = list_names_dict.get(quantile_col_prefix, []) overlay_cols = [col for col in list(grouped_df.columns) if col not in mean_cols + quantile_cols] if isinstance(show_overlays, int) and not isinstance(show_overlays, bool): # Samples from `overlay_cols` which_overlays = sorted(np.random.choice( range(len(overlay_cols)), size=min(show_overlays, len(overlay_cols)), replace=False)) overlay_cols = list(np.array(overlay_cols)[which_overlays]) elif isinstance(show_overlays, (list, tuple, np.ndarray)): # Selects from `overlay_cols` all_integers = np.issubdtype(np.array(show_overlays).dtype, np.integer) if all_integers: overlay_cols = [col for i, col in enumerate(overlay_cols) if i in show_overlays] else: overlay_cols = [col for col in overlay_cols if str(col) in show_overlays] cols = mean_cols + quantile_cols + overlay_cols # Reorders columns by group grouped_df = grouped_df[cols] categories = list(np.repeat( 
[MEAN_COL_GROUP, QUANTILE_COL_GROUP, OVERLAY_COL_GROUP], # Labels columns by category [len(mean_cols), len(quantile_cols), len(overlay_cols)])) cateory_col_index = pd.MultiIndex.from_arrays([categories, cols], names=["category", "name"]) grouped_df.columns = cateory_col_index if center_values: # Each overlay is independently shifted to have mean 0. if OVERLAY_COL_GROUP in grouped_df: grouped_df[OVERLAY_COL_GROUP] -= grouped_df[OVERLAY_COL_GROUP].mean() # Mean and quantiles are shifted by the same constant, so the mean column is centered at 0. if MEAN_COL_GROUP in grouped_df: mean_shift = grouped_df[MEAN_COL_GROUP].mean()[0] grouped_df[MEAN_COL_GROUP] -= mean_shift else: mean_shift = self.df[value_col].mean() if QUANTILE_COL_GROUP in grouped_df: grouped_df[QUANTILE_COL_GROUP] -= mean_shift return grouped_df
def plot_quantiles_and_overlays(
        self,
        groupby_time_feature=None,
        groupby_sliding_window_size=None,
        groupby_custom_column=None,
        show_mean=False,
        show_quantiles=False,
        show_overlays=False,
        overlay_label_time_feature=None,
        overlay_label_sliding_window_size=None,
        overlay_label_custom_column=None,
        center_values=False,
        value_col=VALUE_COL,
        mean_col_name="mean",
        quantile_col_prefix="Q",
        mean_style=None,
        quantile_style=None,
        overlay_style=None,
        xlabel=None,
        ylabel=None,
        title=None,
        showlegend=True,
        **overlay_pivot_table_kwargs):
    """Plots mean, quantiles, and overlays of ``value_col`` against the requested
    grouping dimension.

    The grouping dimension goes on the x-axis. One line is drawn for the mean,
    for each requested quantile, and for each level of the overlay dimension.
    Shading is applied between quantile lines by default (via plotly "tonexty" fill).

    Exactly one of ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided as the grouping dimension.
    If ``show_overlays`` is enabled, at most one of ``overlay_label_time_feature``,
    ``overlay_label_sliding_window_size``, ``overlay_label_custom_column`` defines
    the overlay dimension (``label_col``); `pandas.DataFrame.pivot_table` is called
    internally with ``index=groupby_col``, ``columns=label_col``, ``values=value_col``
    (extra keyword arguments are forwarded via ``overlay_pivot_table_kwargs``, e.g.
    ``aggfunc``). Without an explicit label, records are labeled by their position
    within the group.

    For example, to show yearly seasonality with one overlay per year::

        self.plot_quantiles_and_overlays(
            groupby_time_feature="doy",
            show_mean=True,
            show_quantiles=[0.1, 0.9],
            show_overlays=True,
            overlay_label_time_feature="year")

    Use ``center_values=True`` to shift each overlay so its average is 0; the mean
    and quantiles are then shifted by a single constant to center the mean at 0
    while preserving their relative values.

    Parameters
    ----------
    groupby_time_feature : `str` or None, default None
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
    groupby_sliding_window_size : `int` or None, default None
        If provided, sequentially partitions data into groups of this size.
    groupby_custom_column : `pandas.Series` or None, default None
        If provided, groups by this column value. Same length as the DataFrame.
    show_mean : `bool`, default False
        Whether to plot the mean value by the groupby column.
    show_quantiles : `bool` or `list` [`float`] or `numpy.array`, default False
        Whether to plot quantiles by the groupby column. True uses the default
        quantiles (0.1 and 0.9); array-like gives the quantiles to compute.
    show_overlays : `bool` or `int` or array-like [`int` or `str`], default False
        Whether to plot overlays by the groupby column. If an integer, randomly
        samples up to that many overlay columns; if a list of int/str, selects
        overlay columns by index/name.
    overlay_label_time_feature : `str` or None, default None
        Overlay dimension from
        `~greykite.common.features.timeseries_features.build_time_features_df`.
    overlay_label_sliding_window_size : `int` or None, default None
        Overlay dimension from a sequential partition of this size.
    overlay_label_custom_column : `pandas.Series` or None, default None
        Overlay dimension from this column value. Same length as the DataFrame.
    center_values : `bool`, default False
        Whether to center the plotted values as described above.
    value_col : `str`, default VALUE_COL
        The column whose values are plotted. Can be any column in ``self.df``.
    mean_col_name : `str`, default "mean"
        Name for the mean column, if ``show_mean=True``.
    quantile_col_prefix : `str`, default "Q"
        Prefix for quantile column names.
    mean_style : `dict` or None, default None
        Keyword arguments for `plotly.graph_objects.Scatter` styling the mean line.
        If None, a gray line of width 2 grouped under MEAN_COL_GROUP is used.
    quantile_style : `dict` or None, default None
        Keyword arguments for `plotly.graph_objects.Scatter` styling quantile lines.
        If None, blue solid lines of width 2 with "tonexty" fill, grouped under
        QUANTILE_COL_GROUP. Note that the fill is removed from the first quantile
        line, so that filling happens only between lines of the same category.
    overlay_style : `dict` or None, default None
        Keyword arguments for `plotly.graph_objects.Scatter` styling overlay lines.
        If None, semi-transparent light gray solid lines of width 1, grouped under
        OVERLAY_COL_GROUP.
    xlabel : `str`, optional, default None
        X-axis label of the plot.
    ylabel : `str`, optional, default None
        Y-axis label of the plot. If None, uses ``value_col``.
    title : `str` or None, default None
        Plot title. If None, a default is derived from the axis labels.
    showlegend : `bool`, default True
        Whether to show the legend.
    overlay_pivot_table_kwargs : additional parameters
        Additional keyword parameters passed to `pandas.DataFrame.pivot_table`
        when generating the overlays.

    Returns
    -------
    fig : `plotly.graph_objects.Figure`
        plotly graph object showing the mean, quantiles, and overlays.

    See Also
    --------
    `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.get_quantiles_and_overlays`
        Returns the same quantities as a `pandas.DataFrame` without plotting.
    """
    if ylabel is None:
        ylabel = value_col
    # Computes the values to plot; columns are a MultiIndex (category, name).
    grouped_df = self.get_quantiles_and_overlays(
        groupby_time_feature=groupby_time_feature,
        groupby_sliding_window_size=groupby_sliding_window_size,
        groupby_custom_column=groupby_custom_column,
        show_mean=show_mean,
        show_quantiles=show_quantiles,
        show_overlays=show_overlays,
        overlay_label_time_feature=overlay_label_time_feature,
        overlay_label_sliding_window_size=overlay_label_sliding_window_size,
        overlay_label_custom_column=overlay_label_custom_column,
        center_values=center_values,
        value_col=value_col,
        mean_col_name=mean_col_name,
        quantile_col_prefix=quantile_col_prefix,
        **overlay_pivot_table_kwargs)

    # Default plotly styles, used for any style argument left as None.
    default_styles = {
        MEAN_COL_GROUP: {
            "line": dict(
                width=2,
                color="#595959"),  # gray
            "legendgroup": MEAN_COL_GROUP},
        QUANTILE_COL_GROUP: {
            "line": dict(
                width=2,
                color="#1F9AFF",  # blue
                dash="solid"),
            "legendgroup": QUANTILE_COL_GROUP,  # show/hide them together
            "fill": "tonexty"},
        OVERLAY_COL_GROUP: {
            "opacity": 0.5,  # makes it easier to see density
            "line": dict(
                width=1,
                color="#B3B3B3",  # light gray
                dash="solid"),
            "legendgroup": OVERLAY_COL_GROUP}}
    style_dict = {
        MEAN_COL_GROUP: default_styles[MEAN_COL_GROUP] if mean_style is None else mean_style,
        QUANTILE_COL_GROUP: default_styles[QUANTILE_COL_GROUP] if quantile_style is None else quantile_style,
        OVERLAY_COL_GROUP: default_styles[OVERLAY_COL_GROUP] if overlay_style is None else overlay_style}

    # Maps each output column to its plot style.
    # Categories are traversed in reverse so the first category is plotted last (on top).
    y_col_style_dict = {}
    for category in reversed(list(grouped_df.columns.get_level_values(0).unique())):
        style = style_dict.get(category, {})
        for position, col in enumerate(grouped_df[category].columns):
            if position == 0 and "fill" in style:
                # With "fill", plotly shades the area between this line and the
                # previously added line. We only want shading between lines of the
                # same category (e.g. between quantiles), so the first line of each
                # category drops "fill"; otherwise it would fill down to the last
                # line of the previous category.
                y_col_style_dict[col] = {k: v for k, v in style.items() if k != "fill"}
            else:
                y_col_style_dict[col] = style

    # The MultiIndex is not needed for plotting; keeps only the column names.
    grouped_df.columns = list(grouped_df.columns.get_level_values(1))
    x_col = grouped_df.index.name
    grouped_df.reset_index(inplace=True)
    fig = plot_multivariate(
        grouped_df,
        x_col=x_col,
        y_col_style_dict=y_col_style_dict,
        xlabel=xlabel,
        ylabel=ylabel,
        title=title,
        showlegend=showlegend)
    return fig