# BSD 2-CLAUSE LICENSE
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen
"""Input timeseries."""
import warnings
from datetime import datetime
from functools import partial
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import pandas as pd
from greykite.common.constants import TIME_COL
from greykite.common.constants import VALUE_COL
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.time_properties import describe_timeseries
from greykite.common.time_properties import get_canonical_data
from greykite.common.viz.timeseries_plotting import add_groupby_column
from greykite.common.viz.timeseries_plotting import flexible_grouping_evaluation
from greykite.common.viz.timeseries_plotting import grouping_evaluation
from greykite.common.viz.timeseries_plotting import plot_multivariate
from greykite.common.viz.timeseries_plotting import plot_univariate
from greykite.framework.constants import MEAN_COL_GROUP
from greykite.framework.constants import OVERLAY_COL_GROUP
from greykite.framework.constants import QUANTILE_COL_GROUP
[docs]class UnivariateTimeSeries:
"""Defines univariate time series input. The dataset can include regressors,
but only one metric is designated as the target metric to forecast.
Loads time series into a standard format. Provides statistics, plotting
functions, and ability to generate future dataframe for prediction.
Attributes
----------
df: `pandas.DataFrame`
Data frame containing timestamp and value, with standardized column names for internal use
(TIME_COL, VALUE_COL). Rows are sorted by time index, and missing gaps between dates are filled
in so that dates are spaced at regular intervals. Values are adjusted for anomalies
according to ``anomaly_info``.
The index can be timezone aware (but TIME_COL is not).
y: `pandas.Series`, dtype float64
Value of time series to forecast.
time_stats: `dict`
Summary statistics about the timestamp column.
value_stats: `dict`
Summary statistics about the value column.
original_time_col: `str`
Name of time column in original input data.
original_value_col: `str`
Name of value column in original input data.
regressor_cols: `list` [`str`]
A list of regressor columns in the training and prediction DataFrames.
lagged_regressor_cols: `list` [`str`]
A list of additional columns needed for lagged regressors in the training and prediction DataFrames.
last_date_for_val: `datetime.datetime` or None, default None
Date or timestamp corresponding to last non-null value in ``df[original_value_col]``.
last_date_for_reg: `datetime.datetime` or None, default None
Date or timestamp corresponding to last non-null value in ``df[regressor_cols]``.
If ``regressor_cols`` is None, ``last_date_for_reg`` is None.
last_date_for_lag_reg: `datetime.datetime` or None, default None
Date or timestamp corresponding to last non-null value in ``df[lagged_regressor_cols]``.
If ``lagged_regressor_cols`` is None, ``last_date_for_lag_reg`` is None.
train_end_date: `datetime.datetime`
Last date or timestamp in ``fit_df``. It is always less than or equal to
minimum non-null values of ``last_date_for_val`` and ``last_date_for_reg``.
fit_cols: `list` [`str`]
A list of columns used in the training and prediction DataFrames.
fit_df: `pandas.DataFrame`
Data frame containing timestamp and value, with standardized column names for internal use.
Will be used for fitting (train, cv, backtest).
fit_y: `pandas.Series`, dtype float64
Value of time series for fit_df.
freq: `str`
Timeseries frequency, DateOffset alias, e.g. {'T' (minute), 'H', 'D', 'W', 'M' (month end), 'MS' (month start),
'Y' (year end), 'YS' (year start)}
See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
anomaly_info : `dict` or `list` [`dict`] or None, default None
Anomaly adjustment info. Anomalies in ``df``
are corrected before any forecasting is done.
See ``self.load_data()``
df_before_adjustment : `pandas.DataFrame` or None, default None
``self.df`` before adjustment by ``anomaly_info``.
Used by ``self.plot()`` to show the adjustment.
"""
def __init__(self) -> None:
    """Creates an empty time series container.

    No data is read here. Data frames, series, statistics and dates start
    as None and the column lists start empty; ``load_data`` assigns all of
    them.
    """
    # Canonical data and the target series taken from it.
    self.df: Optional[pd.DataFrame] = None
    self.y: Optional[pd.Series] = None
    # Summary statistics about the time and value columns.
    self.time_stats: Optional[Dict] = None
    # ``describe_value_col`` stores the output of ``pandas.Series.describe`` here.
    self.value_stats: Optional[pd.Series] = None
    # Column names as they appeared in the caller's input data.
    self.original_time_col: Optional[str] = None
    self.original_value_col: Optional[str] = None
    self.regressor_cols: List[str] = []
    self.lagged_regressor_cols: List[str] = []
    # Dates of the last non-null value / regressor / lagged regressor.
    self.last_date_for_val: Optional[datetime] = None
    self.last_date_for_reg: Optional[datetime] = None
    self.last_date_for_lag_reg: Optional[datetime] = None
    # Last date used for fitting; a date/timestamp, not a string.
    self.train_end_date: Optional[datetime] = None
    self.fit_cols: List[str] = []
    self.fit_df: Optional[pd.DataFrame] = None
    # ``load_data`` assigns ``fit_df[VALUE_COL]`` here, which is a Series.
    self.fit_y: Optional[pd.Series] = None
    self.freq: Optional[str] = None
    # Anomaly adjustment spec and the data as it was before that adjustment.
    self.anomaly_info: Optional[Union[Dict, List[Dict]]] = None
    self.df_before_adjustment: Optional[pd.DataFrame] = None
def load_data(
        self,
        df: pd.DataFrame,
        time_col: str = TIME_COL,
        value_col: str = VALUE_COL,
        freq: str = None,
        date_format: str = None,
        tz: str = None,
        train_end_date: datetime = None,
        regressor_cols: List[str] = None,
        lagged_regressor_cols: List[str] = None,
        anomaly_info: Optional[Union[Dict, List[Dict]]] = None):
    """Converts ``df`` to the internal representation and stores it on ``self``.

    All parsing, validation, gap filling and anomaly adjustment is delegated
    to `~greykite.common.time_properties.get_canonical_data`; this method
    copies its results onto the instance, derives ``y`` / ``fit_y``,
    recomputes the descriptive statistics and logs a short summary.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Input timeseries. Must include the timestamp column and the
        value column.
    time_col : `str`, default TIME_COL
        Name of the time column in ``df``. Its values can be anything
        that pandas DatetimeIndex can parse.
    value_col : `str`, default VALUE_COL
        Name of the column in ``df`` holding the value to forecast.
    freq : `str` or None, default None
        Timeseries frequency, DateOffset alias.
        If None, it is inferred automatically.
    date_format : `str` or None, default None
        strftime format used to parse the time column, e.g. ``%m/%d/%Y``.
        Note that ``%f`` will parse all the way up to nanoseconds.
        If None (recommended), inferred by `pandas.to_datetime`.
    tz : `str` or pytz.timezone object or None, default None
        Passed to `pandas.tz_localize` to localize the timestamp.
    train_end_date : `datetime.datetime` or None, default None
        Last date to use for fitting the model. Forecasts are generated
        after this date. If None, it is set to the minimum of
        ``self.last_date_for_val`` and ``self.last_date_for_reg``.
    regressor_cols : `list` [`str`] or None, default None
        Regressor columns used in the training and prediction DataFrames.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.
    lagged_regressor_cols : `list` [`str`] or None, default None
        Additional columns needed for lagged regressors in the training
        and prediction DataFrames. May overlap with ``regressor_cols``.
        If None, no additional columns are added.
        Lagged regressor columns that are unavailable in ``df`` are dropped.
    anomaly_info : `dict` or `list` [`dict`] or None, default None
        Anomaly adjustment info. Anomalies in ``df`` are corrected before
        any forecasting is done. If None, no adjustments are made.

        A dictionary holds the parameters to
        `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`
        (see that function for details). The possible keys are:

            ``"value_col"`` : `str`
                The name of the column in ``df`` to adjust. You may adjust
                the value to forecast as well as any numeric regressors.
            ``"anomaly_df"`` : `pandas.DataFrame`
                Adjustments to correct the anomalies.
            ``"start_date_col"`` : `str`, default START_DATE_COL
                Start date column in ``anomaly_df``.
            ``"end_date_col"`` : `str`, default END_DATE_COL
                End date column in ``anomaly_df``.
            ``"adjustment_delta_col"`` : `str` or None, default None
                Impact column in ``anomaly_df``.
            ``"filter_by_dict"`` : `dict` or None, default None
                Used to filter ``anomaly_df`` to the relevant anomalies for
                the ``value_col`` in this dictionary.
                Key specifies the column name, value specifies the filter value.
            ``"filter_by_value_col"`` : `str` or None, default None
                Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict``
                if not None, for the ``value_col`` in this dictionary.
            ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add"
                How to make the adjustment, if ``adjustment_delta_col``
                is provided.

        A list of such dictionaries adjusts multiple columns in ``df``.

    Returns
    -------
    self : Returns self.
        ``self.df`` has standard column names, values adjusted for
        anomalies, time gaps filled in, and is sorted by time index.
    """
    # Remembers how the caller named things, and the requested adjustment.
    self.original_time_col, self.original_value_col = time_col, value_col
    self.anomaly_info = anomaly_info

    canonical = get_canonical_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        date_format=date_format,
        tz=tz,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols,
        anomaly_info=anomaly_info)

    # Each result is stored on the attribute with the same name as its key.
    for attr_name in (
            "df",
            "df_before_adjustment",
            "fit_df",
            "freq",
            "time_stats",
            "regressor_cols",
            "lagged_regressor_cols",
            "fit_cols",
            "train_end_date",
            "last_date_for_val",
            "last_date_for_reg",
            "last_date_for_lag_reg"):
        setattr(self, attr_name, canonical[attr_name])

    # Target values (may contain nulls) after gap filling and anomaly correction.
    self.y = self.df[VALUE_COL]
    self.fit_y = self.fit_df[VALUE_COL]

    # Refreshes the statistics so they describe the processed dataset.
    self.describe_time_col()
    self.describe_value_col()

    def _info(message):
        log_message(message, LoggingLevelEnum.INFO)

    _info(f"last date for fit: {self.train_end_date}")
    _info(f"last date for {self.original_value_col}: {self.last_date_for_val}")
    _info(f"last date with any regressor: {self.last_date_for_reg}")
    _info(f"columns available to use as regressors: {', '.join(self.regressor_cols)}")
    _info(f"columns available to use as lagged regressors: {', '.join(self.lagged_regressor_cols)}")
    return self
def describe_time_col(self):
    """Computes and logs basic descriptive stats on the time column.

    The stats are also merged into ``self.time_stats``, overwriting any
    existing entries with the same keys.

    Returns
    -------
    time_stats : `dict`
        Dictionary with descriptive stats on the timeseries time column.

            * data_points: int
                number of rows in ``self.df``
            * mean_increment_secs: float
                mean increment between timestamps, in seconds
            * min_timestamp: datetime64
                start date
            * max_timestamp: datetime64
                end date

    Raises
    ------
    RuntimeError
        If ``self.df`` has not been set.
    """
    if self.df is None:
        raise RuntimeError("Must load data before describing dataset")

    timeseries_info = describe_timeseries(df=self.df, time_col=TIME_COL)
    time_stats = {
        "data_points": self.df.shape[0],  # total number of time points, including missing ones
        "mean_increment_secs": timeseries_info["mean_increment_secs"],  # after filling in gaps
        "min_timestamp": timeseries_info["min_timestamp"],
        "max_timestamp": timeseries_info["max_timestamp"],
    }

    log_message("Input time stats:", LoggingLevelEnum.INFO)
    log_message(f"  data points: {time_stats['data_points']}", LoggingLevelEnum.INFO)
    log_message(f"  avg increment (sec): {time_stats['mean_increment_secs']:.2f}", LoggingLevelEnum.INFO)
    log_message(f"  start date: {time_stats['min_timestamp']}", LoggingLevelEnum.INFO)
    log_message(f"  end date: {time_stats['max_timestamp']}", LoggingLevelEnum.INFO)

    # ``time_stats`` is still None when ``self.df`` was assigned without going
    # through ``load_data``; start from an empty dict instead of failing on
    # ``None.update``.
    if self.time_stats is None:
        self.time_stats = {}
    self.time_stats.update(time_stats)
    return time_stats
def describe_value_col(self):
    """Computes and logs basic descriptive stats on the value column.

    The result is stored in ``self.value_stats`` and returned.

    Returns
    -------
    value_stats : output of `pandas.Series.describe` on ``self.df[VALUE_COL]``
        Indexed by statistic name: count, mean, std, min, 25%, 50%, 75%, max.

    Raises
    ------
    RuntimeError
        If ``self.df`` has not been set.
    """
    if self.df is None:
        raise RuntimeError("Must load data before describing values")

    # count is the total number of provided timepoints
    summary = self.df[VALUE_COL].describe()
    self.value_stats = summary

    log_message("Input value stats:", LoggingLevelEnum.INFO)
    log_message(repr(summary), LoggingLevelEnum.INFO)
    return summary
def make_future_dataframe(self, periods: Optional[int] = None, include_history=True):
    """Extends the input data for prediction into the future.

    Includes the historical values (VALUE_COL) so this can be fed
    into a Pipeline that transforms input data for fitting, and for
    use in evaluation.

    Parameters
    ----------
    periods : `int` or None, default None
        Number of periods to forecast.
        If there are no regressors, default is 30.
        If there are regressors, default is to predict all available dates,
        and a larger request is reduced to that number with a warning.
    include_history : `bool`, default True
        Whether to return historical dates and values with future dates.

    Returns
    -------
    future_df : `pandas.DataFrame`
        Dataframe with future timestamps for prediction.
        Contains the columns in ``self.fit_cols``:

            * prediction dates (``TIME_COL``),
            * values (``VALUE_COL``),
            * optional regressors

        If there are no future periods, the result has no future rows;
        with ``include_history=True`` it still contains the history.

    Raises
    ------
    RuntimeError
        If ``self.df`` has not been set.
    """
    if self.df is None:
        raise RuntimeError("Must load data before generating future dates.")

    has_regressors = bool(self.regressor_cols)

    # determines the number of future periods to predict
    if has_regressors:
        # Future rows are only usable while regressor values are available.
        max_regressor_periods = len(self.df[
            (self.df[TIME_COL] > self.train_end_date)
            & (self.df[TIME_COL] <= self.last_date_for_reg)
        ])
        if periods is None:
            periods = max_regressor_periods
        elif periods > max_regressor_periods:
            warnings.warn(
                f"Provided periods '{periods}' is more than allowed ('{max_regressor_periods}') due to "
                f"the length of regressor columns. Using '{max_regressor_periods}'.",
                UserWarning)
            periods = max_regressor_periods
    elif periods is None:
        periods = 30

    # the future dates for prediction
    dates = pd.date_range(
        start=self.train_end_date,
        periods=periods + 1,  # an extra in case we include start
        freq=self.freq)
    dates = dates[dates > self.train_end_date]  # drops values up to train_end_date
    dates = dates[:periods]  # returns the correct number of periods

    if has_regressors:
        # return TIME_COL, VALUE_COL, and regressors
        if len(dates) > 0:
            last_date_for_predict = dates.max()
        else:
            # ``max()`` of an empty index is NaT, and every comparison with
            # NaT is False, which would drop the history as well. Falling
            # back to ``train_end_date`` keeps the history and yields no
            # future rows, matching the branch without regressors.
            last_date_for_predict = self.train_end_date
        valid_indices = self.df[TIME_COL] <= last_date_for_predict
        if not include_history:
            valid_indices = (self.df[TIME_COL] > self.train_end_date) & valid_indices
        future_df = self.df[valid_indices]
    else:
        # return TIME_COL, VALUE_COL
        future_df = self.df.reindex(index=dates)
        future_df[TIME_COL] = future_df.index
        if include_history:
            future_df = pd.concat([self.fit_df, future_df], axis=0, sort=False)

    return future_df[self.fit_cols]
def plot(
        self,
        color="rgb(32, 149, 212)",
        show_anomaly_adjustment=False,
        **kwargs):
    """Returns interactive plotly graph of the value against time.

    If anomaly info is provided, there is an option to show the anomaly adjustment.

    Parameters
    ----------
    color : `str`, default "rgb(32, 149, 212)" (light blue)
        Color of the value line (after adjustment, if applicable).
    show_anomaly_adjustment : `bool`, default False
        Whether to also show the value before anomaly adjustment,
        as a light gray line.
    kwargs : additional parameters
        Additional parameters to pass to
        `~greykite.common.viz.timeseries_plotting.plot_multivariate`
        such as title.

    Returns
    -------
    fig : `plotly.graph_objects.Figure`
        Interactive plotly graph of the value against time.

        See `~greykite.common.viz.timeseries_plotting.plot_forecast_vs_actual`
        return value for how to plot the figure and add customization.

    Raises
    ------
    RuntimeError
        If ``self.df`` has not been set.
    ValueError
        If ``show_anomaly_adjustment`` is True but ``self.anomaly_info`` is None.
    """
    if self.df is None:
        raise RuntimeError("Must load data before plotting.")
    # Validates the request before copying the data.
    if show_anomaly_adjustment and self.anomaly_info is None:
        raise ValueError("There is no `anomaly_info` to show. `show_anomaly_adjustment` must be False.")

    df = self.df.copy()
    # Plots value after anomaly adjustment
    y_col_style_dict = {
        VALUE_COL: dict(
            name=self.original_value_col,
            mode="lines",
            line=dict(
                color=color,
            ),
            opacity=0.8
        )
    }
    if show_anomaly_adjustment:
        # Adds value before adjustment to ``df``
        postfix = "_unadjusted"
        unadjusted_col = f"{VALUE_COL}{postfix}"
        df[unadjusted_col] = self.df_before_adjustment[VALUE_COL]
        y_col_style_dict[unadjusted_col] = dict(
            name=f"{self.original_value_col}{postfix}",
            mode="lines",
            line=dict(
                color="#B3B3B3",  # light gray
            ),
            opacity=0.8
        )

    return plot_multivariate(
        df,
        TIME_COL,
        y_col_style_dict,
        xlabel=self.original_time_col,
        ylabel=self.original_value_col,
        **kwargs)
def get_grouping_evaluation(
        self,
        aggregation_func=np.nanmean,
        aggregation_func_name="mean",
        groupby_time_feature=None,
        groupby_sliding_window_size=None,
        groupby_custom_column=None):
    """Group-wise computation of aggregated timeseries value.

    Can be used to evaluate error/ aggregated value by a time feature,
    over time, or by a user-provided column.

    Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided.

    Parameters
    ----------
    aggregation_func : callable, optional, default ``numpy.nanmean``
        Function that aggregates an array to a number.
        Signature (y: array) -> aggregated value: float.
        It is applied to the ``VALUE_COL`` values of each group.
    aggregation_func_name : `str` or None, optional, default "mean"
        Name of grouping function, used to report results.
        If None (or empty), defaults to "aggregation".
    groupby_time_feature : `str` or None, optional
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    groupby_sliding_window_size : `int` or None, optional
        If provided, sequentially partitions data into groups of size
        ``groupby_sliding_window_size``.
    groupby_custom_column : `pandas.Series` or None, optional
        If provided, groups by this column value. Should be same length as the DataFrame.

    Returns
    -------
    grouped_df : `pandas.DataFrame` with two columns:

        (1) grouping_func_name:
            evaluation metric for aggregation of timeseries,
            named f"{aggregation_func_name} of {VALUE_COL}".
        (2) group name:
            group name depends on the grouping method:
            ``groupby_time_feature`` for ``groupby_time_feature``
            ``TIME_COL`` for ``groupby_sliding_window_size``
            ``groupby_custom_column.name`` for ``groupby_custom_column``.

    Raises
    ------
    RuntimeError
        If ``self.df`` has not been set.
    """
    if self.df is None:
        raise RuntimeError("Must load data before computing grouped aggregates.")

    df = self.df.copy()
    label = aggregation_func_name if aggregation_func_name else "aggregation"
    grouping_func_name = f"{label} of {VALUE_COL}"

    def grouping_func(grp):
        # Aggregates only the value column of each group.
        return aggregation_func(grp[VALUE_COL])

    # Adds the column to group by; ``result`` carries the frame and that column's name.
    result = add_groupby_column(
        df=df,
        time_col=TIME_COL,
        groupby_time_feature=groupby_time_feature,
        groupby_sliding_window_size=groupby_sliding_window_size,
        groupby_custom_column=groupby_custom_column)

    return grouping_evaluation(
        df=result["df"],
        groupby_col=result["groupby_col"],
        grouping_func=grouping_func,
        grouping_func_name=grouping_func_name)
def plot_grouping_evaluation(
        self,
        aggregation_func=np.nanmean,
        aggregation_func_name="mean",
        groupby_time_feature=None,
        groupby_sliding_window_size=None,
        groupby_custom_column=None,
        xlabel=None,
        ylabel=None,
        title=None):
    """Plots the timeseries value aggregated by group.

    Thin wrapper: the aggregation comes from ``self.get_grouping_evaluation``
    and the figure from
    `~greykite.common.viz.timeseries_plotting.plot_univariate`.
    Can be used to plot aggregated timeseries by a time feature, over time,
    or by a user-provided column.

    Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided.

    Parameters
    ----------
    aggregation_func : callable, optional, default ``numpy.nanmean``
        Function that aggregates an array to a number.
        Signature (y: array) -> aggregated value: float.
    aggregation_func_name : `str` or None, optional, default "mean"
        Name of grouping function, used to report results.
        If None, defaults to "aggregation".
    groupby_time_feature : `str` or None, optional
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    groupby_sliding_window_size : `int` or None, optional
        If provided, sequentially partitions data into groups of size
        ``groupby_sliding_window_size``.
    groupby_custom_column : `pandas.Series` or None, optional
        If provided, groups by this column value. Should be same length as the DataFrame.
    xlabel : `str`, optional, default None
        X-axis label of the plot.
    ylabel : `str`, optional, default None
        Y-axis label of the plot.
    title : `str` or None, optional
        Plot title. If None, default is based on axis labels.

    Returns
    -------
    fig : `plotly.graph_objects.Figure`
        plotly graph object showing aggregated timeseries by group.
        x-axis label depends on the grouping method:
        ``groupby_time_feature`` for ``groupby_time_feature``
        ``TIME_COL`` for ``groupby_sliding_window_size``
        ``groupby_custom_column.name`` for ``groupby_custom_column``.
    """
    aggregated = self.get_grouping_evaluation(
        aggregation_func=aggregation_func,
        aggregation_func_name=aggregation_func_name,
        groupby_time_feature=groupby_time_feature,
        groupby_sliding_window_size=groupby_sliding_window_size,
        groupby_custom_column=groupby_custom_column)

    # The aggregated frame has exactly two columns: first goes on x, second on y.
    x_name, y_name = aggregated.columns
    return plot_univariate(
        df=aggregated,
        x_col=x_name,
        y_col=y_name,
        xlabel=xlabel,
        ylabel=ylabel,
        title=title)
[docs] def get_quantiles_and_overlays(
self,
groupby_time_feature=None,
groupby_sliding_window_size=None,
groupby_custom_column=None,
show_mean=False,
show_quantiles=False,
show_overlays=False,
overlay_label_time_feature=None,
overlay_label_sliding_window_size=None,
overlay_label_custom_column=None,
center_values=False,
value_col=VALUE_COL,
mean_col_name="mean",
quantile_col_prefix="Q",
**overlay_pivot_table_kwargs):
"""Computes mean, quantiles, and overlays by the requested grouping dimension.
Overlays are best explained in the plotting context. The grouping dimension goes on
the x-axis, and one line is shown for each level of the overlay dimension. This
function returns a column for each line to plot (e.g. mean, each quantile,
each overlay value).
Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``,
``groupby_custom_column`` must be provided as the grouping dimension.
If ``show_overlays`` is True, exactly one of: ``overlay_label_time_feature``,
``overlay_label_sliding_window_size``, ``overlay_label_custom_column`` can be
provided to specify the ``label_col`` (overlay dimension). Internally, the
function calls `pandas.DataFrame.pivot_table` with ``index=groupby_col``,
``columns=label_col``, ``values=value_col`` to get the overlay values for plotting.
You can pass additional parameters to `pandas.DataFrame.pivot_table` via
``overlay_pivot_table_kwargs``, e.g. to change the aggregation method. If an explicit
label is not provided, the records are labeled by their position within the group.
For example, to show yearly seasonality mean, quantiles, and overlay plots for
each individual year, use::
self.get_quantiles_and_overlays(
groupby_time_feature="doy", # Rows: a row for each day of year (1, 2, ..., 366)
show_mean=True, # mean value on that day
show_quantiles=[0.1, 0.9], # quantiles of the observed distribution on that day
show_overlays=True, # Include overlays defined by ``overlay_label_time_feature``
overlay_label_time_feature="year") # One column for each observed "year" (2016, 2017, 2018, ...)
To show weekly seasonality over time, use::
self.get_quantiles_and_overlays(
groupby_time_feature="dow", # Rows: a row for each day of week (1, 2, ..., 7)
show_mean=True, # mean value on that day
show_quantiles=[0.1, 0.5, 0.9], # quantiles of the observed distribution on that day
show_overlays=True, # Include overlays defined by ``overlay_label_time_feature``
overlay_label_sliding_window_size=90, # One column for each 90 period sliding window in the dataset,
aggfunc="median") # overlay value is the median value for the dow over the period (default="mean").
It may be difficult to assess the weekly seasonality from the previous result,
because overlays shift up/down over time due to trend/yearly seasonality.
Use ``center_values=True`` to adjust each overlay so its average value is centered at 0.
Mean and quantiles are shifted by a single constant to center the mean at 0, while
preserving their relative values::
self.get_quantiles_and_overlays(
groupby_time_feature="dow",
show_mean=True,
show_quantiles=[0.1, 0.5, 0.9],
show_overlays=True,
overlay_label_sliding_window_size=90,
aggfunc="median",
center_values=True) # Centers the output
Centering reduces the variability in the overlays to make it easier to isolate
the effect by the groupby column. As a result, centered overlays have smaller
variability than that reported by the quantiles, which operate on the original,
uncentered data points. Similarly, if overlays are aggregates of individual values
(i.e. ``aggfunc`` is needed in the call to `pandas.DataFrame.pivot_table`),
the quantiles of overlays will be less extreme than those of the original data.
- To assess variability conditioned on the groupby value, check the quantiles.
- To assess variability conditioned on both the groupby and overlay value,
after any necessary aggregation, check the variability of the overlay values.
Compute quantiles of overlays from the return value if desired.
Parameters
----------
groupby_time_feature : `str` or None, default None
If provided, groups by a column generated by
`~greykite.common.features.timeseries_features.build_time_features_df`.
See that function for valid values.
groupby_sliding_window_size : `int` or None, default None
If provided, sequentially partitions data into groups of size
``groupby_sliding_window_size``.
groupby_custom_column : `pandas.Series` or None, default None
If provided, groups by this column value. Should be same length as the DataFrame.
show_mean : `bool`, default False
Whether to return the mean value by the groupby column.
show_quantiles : `bool` or `list` [`float`] or `numpy.array`, default False
Whether to return the quantiles of the value by the groupby column.
If False, does not return quantiles. If True, returns default
quantiles (0.1 and 0.9). If array-like, a list of quantiles
to compute (e.g. (0.1, 0.25, 0.75, 0.9)).
show_overlays : `bool` or `int` or array-like [`int` or `str`] or None, default False
Whether to return overlays of the value by the groupby column.
If False, no overlays are shown.
If True and ``label_col`` is defined, calls `pandas.DataFrame.pivot_table` with
``index=groupby_col``, ``columns=label_col``, ``values=value_col``.
``label_col`` is defined by one of ``overlay_label_time_feature``,
``overlay_label_sliding_window_size``, or ``overlay_label_custom_column``.
Returns one column for each value of the ``label_col``.
If True and the ``label_col`` is not defined, returns the raw values within
each group. Values across groups are put into columns by their position in
the group (1st element in group, 2nd, 3rd, etc.). Positional order in a group
is not guaranteed to correspond to anything meaningful, so the items within a
column may not have anything in common. It is better to specify one of ``overlay_*``
to explicitly define the overlay labels.
If an integer, the number of overlays to randomly sample. The same as True,
then randomly samples up to `int` columns. This is useful if there are too many values.
If a list [int], a list of column indices (int type). The same as True,
then selects the specified columns by index.
If a list [str], a list of column names. Column names are matched by their
string representation to the names in this list. The same as True,
then selects the specified columns by name.
overlay_label_time_feature : `str` or None, default None
If ``show_overlays`` is True, can be used to define ``label_col``,
i.e. which dimension to show separately as overlays.
If provided, uses a column generated by
`~greykite.common.features.timeseries_features.build_time_features_df`.
See that function for valid values.
overlay_label_sliding_window_size : `int` or None, default None
If ``show_overlays`` is True, can be used to define ``label_col``,
i.e. which dimension to show separately as overlays.
If provided, uses a column that sequentially partitions data into groups
of size ``overlay_label_sliding_window_size``.
overlay_label_custom_column : `pandas.Series` or None, default None
If ``show_overlays`` is True, can be used to define ``label_col``,
i.e. which dimension to show separately as overlays.
If provided, uses this column value. Should be same length as the DataFrame.
value_col : `str`, default VALUE_COL
The column name for the value column. By default,
shows the univariate time series value, but it can be any
other column in ``self.df``.
mean_col_name : `str`, default "mean"
The name to use for the mean column in the output.
Applies if ``show_mean=True``.
quantile_col_prefix : `str`, default "Q"
The prefix to use for quantile column names in the output.
Columns are named with this prefix followed by the quantile,
rounded to 2 decimal places.
center_values : `bool`, default False
Whether to center the return values.
If True, shifts each overlay so its average value is centered at 0.
Shifts mean and quantiles by a constant to center the mean at 0, while
preserving their relative values.
If False, values are not centered.
overlay_pivot_table_kwargs : additional parameters
Additional keyword parameters to pass to `pandas.DataFrame.pivot_table`,
used in generating the overlays. See above description for details.
Returns
-------
grouped_df : `pandas.DataFrame`
Dataframe with mean, quantiles, and overlays by the grouping column. Overlays
are defined by the grouping column and overlay dimension.
ColumnIndex is a multiindex with first level as the "category", a subset of
[MEAN_COL_GROUP, QUANTILE_COL_GROUP, OVERLAY_COL_GROUP] depending on what is requested.
- grouped_df[MEAN_COL_GROUP] = df with single column, named ``mean_col_name``.
- grouped_df[QUANTILE_COL_GROUP] = df with a column for each quantile, named
f"{quantile_col_prefix}{str(round(q, 2))}", where ``q`` is the quantile.
- grouped_df[OVERLAY_COL_GROUP] = df with one column per overlay value, named
by the overlay value.
For example, it might look like::
category mean quantile overlay
name mean Q0.1 Q0.9 2007 2008 2009
doy
1 8.42 7.72 9.08 8.29 7.75 8.33
2 8.82 8.20 9.56 8.43 8.80 8.53
3 8.95 8.25 9.88 8.26 9.12 8.70
4 9.07 8.60 9.49 8.10 9.99 8.73
5 8.73 8.29 9.24 7.95 9.26 8.37
... ... ... ... ... ... ...
"""
# Default quantiles to show if `show_quantiles` is boolean
if isinstance(show_quantiles, bool):
if show_quantiles:
show_quantiles = [0.1, 0.9]
else:
show_quantiles = None
# Adds grouping dimension
result = add_groupby_column(
df=self.df,
time_col=TIME_COL, # Already standardized
groupby_time_feature=groupby_time_feature,
groupby_sliding_window_size=groupby_sliding_window_size,
groupby_custom_column=groupby_custom_column)
df = result["df"]
groupby_col = result["groupby_col"]
grouped_df = None
# Whether an overlay label is provided
add_overlay_label = (overlay_label_time_feature is not None) or \
(overlay_label_sliding_window_size is not None) or \
(overlay_label_custom_column is not None)
overlay_df = None
# Defines an aggregation function to compute mean, quantiles, and overlays
agg_kwargs = {}
if show_mean:
agg_kwargs.update({mean_col_name: pd.NamedAgg(column=value_col, aggfunc=np.nanmean)})
if show_quantiles is not None:
# Returns the quantiles of the group's `value_col` as a list
agg_kwargs.update({quantile_col_prefix: pd.NamedAgg(
column=value_col,
aggfunc=lambda grp_values: partial(np.nanquantile, q=show_quantiles)(grp_values).tolist())})
if show_overlays is not False:
if add_overlay_label:
# Uses DataFrame pivot_table to get overlay labels as columns, `groupby_col` as index
label_result = add_groupby_column(
df=df,
time_col=TIME_COL,
groupby_time_feature=overlay_label_time_feature,
groupby_sliding_window_size=overlay_label_sliding_window_size,
groupby_custom_column=overlay_label_custom_column)
label_col = label_result["groupby_col"]
overlay_df = label_result["df"].pivot_table(
index=groupby_col,
columns=label_col,
values=value_col,
**overlay_pivot_table_kwargs)
else:
# Uses aggregation to get overlays.
# Takes original values within each group.
# Values across groups are put into columns by their position
# within the group (1st element in group, 2nd, 3rd, etc.)
agg_kwargs.update({"overlay": pd.NamedAgg(column=value_col, aggfunc=tuple)})
# Names the quantile columns
# Keeps to 2 decimal places to handle numerical imprecision.
list_names_dict = {quantile_col_prefix: [
f"{quantile_col_prefix}{str(round(x, 2))}" for x in show_quantiles]}\
if show_quantiles is not None else {}
if agg_kwargs:
grouped_df = flexible_grouping_evaluation(
result["df"],
map_func_dict=None,
groupby_col=result["groupby_col"],
agg_kwargs=agg_kwargs,
extend_col_names=False,
unpack_list=True,
list_names_dict=list_names_dict)
# Adds overlays if requested and not already computed during aggregation
if overlay_df is not None:
overlay_df.columns = map(str, overlay_df.columns)
# Either overlay_df or grouped_df is populated
if grouped_df is None and overlay_df is None:
raise ValueError("Must enable at least one of: show_mean, show_quantiles, show_overlays.")
grouped_df = pd.concat([grouped_df, overlay_df], axis=1)
# Creates MultiIndex for column names to categorize the column names by their type
mean_cols = [mean_col_name] if show_mean else []
quantile_cols = list_names_dict.get(quantile_col_prefix, [])
overlay_cols = [col for col in list(grouped_df.columns) if col not in mean_cols + quantile_cols]
if isinstance(show_overlays, int) and not isinstance(show_overlays, bool):
# Samples from `overlay_cols`
which_overlays = sorted(np.random.choice(
range(len(overlay_cols)),
size=min(show_overlays, len(overlay_cols)),
replace=False))
overlay_cols = list(np.array(overlay_cols)[which_overlays])
elif isinstance(show_overlays, (list, tuple, np.ndarray)):
# Selects from `overlay_cols`
all_integers = np.issubdtype(np.array(show_overlays).dtype, np.integer)
if all_integers:
overlay_cols = [col for i, col in enumerate(overlay_cols) if i in show_overlays]
else:
overlay_cols = [col for col in overlay_cols if str(col) in show_overlays]
cols = mean_cols + quantile_cols + overlay_cols # Reorders columns by group
grouped_df = grouped_df[cols]
categories = list(np.repeat(
[MEAN_COL_GROUP, QUANTILE_COL_GROUP, OVERLAY_COL_GROUP], # Labels columns by category
[len(mean_cols), len(quantile_cols), len(overlay_cols)]))
cateory_col_index = pd.MultiIndex.from_arrays([categories, cols], names=["category", "name"])
grouped_df.columns = cateory_col_index
if center_values:
# Each overlay is independently shifted to have mean 0.
if OVERLAY_COL_GROUP in grouped_df:
grouped_df[OVERLAY_COL_GROUP] -= grouped_df[OVERLAY_COL_GROUP].mean()
# Mean and quantiles are shifted by the same constant, so the mean column is centered at 0.
if MEAN_COL_GROUP in grouped_df:
mean_shift = grouped_df[MEAN_COL_GROUP].mean()[0]
grouped_df[MEAN_COL_GROUP] -= mean_shift
else:
mean_shift = self.df[value_col].mean()
if QUANTILE_COL_GROUP in grouped_df:
grouped_df[QUANTILE_COL_GROUP] -= mean_shift
return grouped_df
def plot_quantiles_and_overlays(
        self,
        groupby_time_feature=None,
        groupby_sliding_window_size=None,
        groupby_custom_column=None,
        show_mean=False,
        show_quantiles=False,
        show_overlays=False,
        overlay_label_time_feature=None,
        overlay_label_sliding_window_size=None,
        overlay_label_custom_column=None,
        center_values=False,
        value_col=VALUE_COL,
        mean_col_name="mean",
        quantile_col_prefix="Q",
        mean_style=None,
        quantile_style=None,
        overlay_style=None,
        xlabel=None,
        ylabel=None,
        title=None,
        showlegend=True,
        **overlay_pivot_table_kwargs):
    """Plots mean, quantiles, and overlays by the requested grouping dimension.

    The grouping dimension goes on the x-axis, and one line is shown for the mean,
    each quantile, and each level of the overlay dimension, as requested. By default,
    shading is applied between the quantiles.

    Exactly one of: ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided as the grouping dimension.

    If ``show_overlays`` is True, exactly one of: ``overlay_label_time_feature``,
    ``overlay_label_sliding_window_size``, ``overlay_label_custom_column`` can be
    provided to specify the ``label_col`` (overlay dimension). Internally, the
    function calls `pandas.DataFrame.pivot_table` with ``index=groupby_col``,
    ``columns=label_col``, ``values=value_col`` to get the overlay values for plotting.
    You can pass additional parameters to `pandas.DataFrame.pivot_table` via
    ``overlay_pivot_table_kwargs``, e.g. to change the aggregation method. If an explicit
    label is not provided, the records are labeled by their position within the group.

    For example, to show yearly seasonality mean, quantiles, and overlay plots for
    each individual year, use::

        self.plot_quantiles_and_overlays(
            groupby_time_feature="doy",         # Rows: a row for each day of year (1, 2, ..., 366)
            show_mean=True,                     # mean value on that day
            show_quantiles=[0.1, 0.9],          # quantiles of the observed distribution on that day
            show_overlays=True,                 # Include overlays defined by ``overlay_label_time_feature``
            overlay_label_time_feature="year")  # One column for each observed "year" (2016, 2017, 2018, ...)

    To show weekly seasonality over time, use::

        self.plot_quantiles_and_overlays(
            groupby_time_feature="dow",            # Rows: a row for each day of week (1, 2, ..., 7)
            show_mean=True,                        # mean value on that day
            show_quantiles=[0.1, 0.5, 0.9],        # quantiles of the observed distribution on that day
            show_overlays=True,                    # Include overlays defined by ``overlay_label_time_feature``
            overlay_label_sliding_window_size=90,  # One column for each 90 period sliding window in the dataset,
            aggfunc="median")                      # overlay value is the median value for the dow over the period (default="mean").

    It may be difficult to assess the weekly seasonality from the previous result,
    because overlays shift up/down over time due to trend/yearly seasonality.
    Use ``center_values=True`` to adjust each overlay so its average value is centered at 0.
    Mean and quantiles are shifted by a single constant to center the mean at 0, while
    preserving their relative values::

        self.plot_quantiles_and_overlays(
            groupby_time_feature="dow",
            show_mean=True,
            show_quantiles=[0.1, 0.5, 0.9],
            show_overlays=True,
            overlay_label_sliding_window_size=90,
            aggfunc="median",
            center_values=True)  # Centers the output

    Centering reduces the variability in the overlays to make it easier to isolate
    the effect by the groupby column. As a result, centered overlays have smaller
    variability than that reported by the quantiles, which operate on the original,
    uncentered data points. Similarly, if overlays are aggregates of individual values
    (i.e. ``aggfunc`` is needed in the call to `pandas.DataFrame.pivot_table`),
    the quantiles of overlays will be less extreme than those of the original data.

        - To assess variability conditioned on the groupby value, check the quantiles.
        - To assess variability conditioned on both the groupby and overlay value,
          after any necessary aggregation, check the variability of the overlay values.
          Compute quantiles of overlays from the return value if desired.

    Parameters
    ----------
    groupby_time_feature : `str` or None, default None
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    groupby_sliding_window_size : `int` or None, default None
        If provided, sequentially partitions data into groups of size
        ``groupby_sliding_window_size``.
    groupby_custom_column : `pandas.Series` or None, default None
        If provided, groups by this column value. Should be same length as the DataFrame.
    show_mean : `bool`, default False
        Whether to return the mean value by the groupby column.
    show_quantiles : `bool` or `list` [`float`] or `numpy.array`, default False
        Whether to return the quantiles of the value by the groupby column.
        If False, does not return quantiles. If True, returns default
        quantiles (0.1 and 0.9). If array-like, a list of quantiles
        to compute (e.g. (0.1, 0.25, 0.75, 0.9)).
    show_overlays : `bool` or `int` or array-like [`int` or `str`], default False
        Whether to return overlays of the value by the groupby column.

        If False, no overlays are shown.

        If True and ``label_col`` is defined, calls `pandas.DataFrame.pivot_table` with
        ``index=groupby_col``, ``columns=label_col``, ``values=value_col``.
        ``label_col`` is defined by one of ``overlay_label_time_feature``,
        ``overlay_label_sliding_window_size``, or ``overlay_label_custom_column``.
        Returns one column for each value of the ``label_col``.

        If True and the ``label_col`` is not defined, returns the raw values within
        each group. Values across groups are put into columns by their position in
        the group (1st element in group, 2nd, 3rd, etc.). Positional order in a group
        is not guaranteed to correspond to anything meaningful, so the items within a
        column may not have anything in common. It is better to specify one of ``overlay_*``
        to explicitly define the overlay labels.

        If an integer, the number of overlays to randomly sample. The same as True,
        then randomly samples up to `int` columns. This is useful if there are too many values.

        If a list [int], a list of column indices (int type). The same as True,
        then selects the specified columns by index.

        If a list [str], a list of column names. Column names are matched by their
        string representation to the names in this list. The same as True,
        then selects the specified columns by name.
    overlay_label_time_feature : `str` or None, default None
        If ``show_overlays`` is True, can be used to define ``label_col``,
        i.e. which dimension to show separately as overlays.

        If provided, uses a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    overlay_label_sliding_window_size : `int` or None, default None
        If ``show_overlays`` is True, can be used to define ``label_col``,
        i.e. which dimension to show separately as overlays.

        If provided, uses a column that sequentially partitions data into groups
        of size ``overlay_label_sliding_window_size``.
    overlay_label_custom_column : `pandas.Series` or None, default None
        If ``show_overlays`` is True, can be used to define ``label_col``,
        i.e. which dimension to show separately as overlays.

        If provided, uses this column value. Should be same length as the DataFrame.
    center_values : `bool`, default False
        Whether to center the return values.
        If True, shifts each overlay so its average value is centered at 0.
        Shifts mean and quantiles by a constant to center the mean at 0, while
        preserving their relative values.

        If False, values are not centered.
    value_col : `str`, default VALUE_COL
        The column name for the value column. By default,
        shows the univariate time series value, but it can be any
        other column in ``self.df``.
    mean_col_name : `str`, default "mean"
        The name to use for the mean column in the output.
        Applies if ``show_mean=True``.
    quantile_col_prefix : `str`, default "Q"
        The prefix to use for quantile column names in the output.
        Columns are named with this prefix followed by the quantile,
        rounded to 2 decimal places.
    mean_style: `dict` or None, default None
        How to style the mean line, passed as keyword arguments to
        `plotly.graph_objects.Scatter`. If None, the default is::

            mean_style = {
                "line": dict(
                    width=2,
                    color="#595959"),  # gray
                "legendgroup": MEAN_COL_GROUP}

    quantile_style: `dict` or None, default None
        How to style the quantile lines, passed as keyword arguments to
        `plotly.graph_objects.Scatter`. If None, the default is::

            quantile_style = {
                "line": dict(
                    width=2,
                    color="#1F9AFF",  # blue
                    dash="solid"),
                "legendgroup": QUANTILE_COL_GROUP,  # show/hide them together
                "fill": "tonexty"}

        Note that the fill style is removed from the first quantile line, to
        fill only between items in the same category.
    overlay_style: `dict` or None, default None
        How to style the overlay lines, passed as keyword arguments to
        `plotly.graph_objects.Scatter`. If None, the default is::

            overlay_style = {
                "opacity": 0.5,  # makes it easier to see density
                "line": dict(
                    width=1,
                    color="#B3B3B3",  # light gray
                    dash="solid"),
                "legendgroup": OVERLAY_COL_GROUP}

    xlabel : `str`, optional, default None
        X-axis label of the plot.
    ylabel : `str`, optional, default None
        Y-axis label of the plot. If None, uses ``value_col``.
    title : `str` or None, default None
        Plot title. If None, default is based on axis labels.
    showlegend : `bool`, default True
        Whether to show the legend.
    overlay_pivot_table_kwargs : additional parameters
        Additional keyword parameters to pass to `pandas.DataFrame.pivot_table`,
        used in generating the overlays.
        See `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.get_quantiles_and_overlays`
        description for details.

    Returns
    -------
    fig : `plotly.graph_objects.Figure`
        plotly graph object showing the mean, quantiles, and overlays.

    See Also
    --------
    `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.get_quantiles_and_overlays`
        To get the mean, quantiles, and overlays as a `pandas.DataFrame` without plotting.
    """
    if ylabel is None:
        ylabel = value_col

    # Computes the values to plot. Columns are a MultiIndex of (category, name).
    grouped_df = self.get_quantiles_and_overlays(
        groupby_time_feature=groupby_time_feature,
        groupby_sliding_window_size=groupby_sliding_window_size,
        groupby_custom_column=groupby_custom_column,
        show_mean=show_mean,
        show_quantiles=show_quantiles,
        show_overlays=show_overlays,
        overlay_label_time_feature=overlay_label_time_feature,
        overlay_label_sliding_window_size=overlay_label_sliding_window_size,
        overlay_label_custom_column=overlay_label_custom_column,
        center_values=center_values,
        value_col=value_col,
        mean_col_name=mean_col_name,
        quantile_col_prefix=quantile_col_prefix,
        **overlay_pivot_table_kwargs)

    # Default styles for each category of line
    if mean_style is None:
        mean_style = {
            "line": dict(
                width=2,
                color="#595959"),  # gray
            "legendgroup": MEAN_COL_GROUP}
    if quantile_style is None:
        quantile_style = {
            "line": dict(
                width=2,
                color="#1F9AFF",  # blue
                dash="solid"),
            "legendgroup": QUANTILE_COL_GROUP,  # show/hide them together
            "fill": "tonexty"}
    if overlay_style is None:
        overlay_style = {
            "opacity": 0.5,  # makes it easier to see density
            "line": dict(
                width=1,
                color="#B3B3B3",  # light gray
                dash="solid"),
            "legendgroup": OVERLAY_COL_GROUP}

    style_dict = {
        MEAN_COL_GROUP: mean_style,
        QUANTILE_COL_GROUP: quantile_style,
        OVERLAY_COL_GROUP: overlay_style}

    # Maps each column name to its style
    y_col_style_dict = {}
    # All categories in grouped_df. Reverses the order so the first category is plotted last (on top).
    categories = grouped_df.columns.get_level_values(0).unique()[::-1]
    for category in categories:
        style = style_dict.get(category, {})
        category_cols = grouped_df[category].columns
        if "fill" in style:
            # If fill is part of the style, plotly fills the area between this line and
            # the previous line added to the plot.
            # Since we only want to fill between lines in the same category (e.g. between quantiles),
            # we remove the "fill" from the first line within each category. Otherwise the first
            # line in this category would fill to the last line in the previous category.
            style_without_fill = {k: v for k, v in style.items() if k != "fill"}
            category_style_dict = {category_cols[0]: style_without_fill}
            category_style_dict.update({col: style for col in category_cols[1:]})
        else:
            category_style_dict = {col: style for col in category_cols}
        y_col_style_dict.update(category_style_dict)

    # MultiIndex is not needed for plotting; keeps only the column names
    grouped_df.columns = list(grouped_df.columns.get_level_values(1))
    # The groupby column is the index; moves it to a regular column for the x-axis
    x_col = grouped_df.index.name
    grouped_df.reset_index(inplace=True)
    fig = plot_multivariate(
        grouped_df,
        x_col=x_col,
        y_col_style_dict=y_col_style_dict,
        xlabel=xlabel,
        ylabel=ylabel,
        title=title,
        showlegend=showlegend)
    return fig