# Source code for greykite.sklearn.transform.null_transformer

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen

import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.exceptions import NotFittedError

from greykite.common.features.timeseries_impute import impute_with_lags_multi
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.python_utils import update_dictionary


# Default keyword arguments for each supported imputation algorithm,
# keyed by the ``impute_algorithm`` name.
DEFAULT_PARAMS = {
    "interpolate": {
        "method": "linear",
        "limit_direction": "both",
        "axis": 0,  # fills column-by-column
    },
    "ts_interpolate": {
        "orders": [7, 14, 21],
        "agg_func": np.mean,
        "iter_num": 5,
    },
}


class NullTransformer(BaseEstimator, TransformerMixin):
    """Imputes nulls in time series data.

    This transform is stateless in the sense that ``transform`` output
    does not depend on the data passed to ``fit``. The dataset passed to
    ``transform`` is used to impute itself.

    Parameters
    ----------
    max_frac : `float`, default 0.10
        Issues a warning if the fraction of nulls in any column is above this value.
    impute_algorithm : `str` or None, default None
        Which imputation algorithm to use.
        Valid options are:

            - "interpolate" : `pandas.DataFrame.interpolate`
            - "ts_interpolate" : `~greykite.common.features.timeseries_impute.impute_with_lags_multi`.

        If None, this transformer is a no-op. No null imputation is done.

    impute_params : `dict` or None, default None
        Params to pass to the imputation algorithm.
        See `pandas.DataFrame.interpolate` and
        `~greykite.common.features.timeseries_impute.impute_with_lags_multi`
        for their respective options.

        For pandas "interpolate", the "ffill", "pad", "bfill", "backfill" methods
        are not allowed to avoid confusion with the fill axis parameter. Use "linear"
        with ``axis=0`` instead, with direction controlled by ``limit_direction``.

        If None, uses the defaults provided in ``DEFAULT_PARAMS``.
    impute_all : `bool`, default True
        Whether to impute all values. If True, NaNs are not allowed in the
        transformed result. Ignored if ``impute_algorithm`` is None.

        The transform specified by ``impute_algorithm`` and
        ``impute_params`` may leave NaNs in the dataset. For example,
        if it fills in the forward direction but the first value in a
        column is NaN.

        A first pass is taken with the impute algorithm specified.
        A second pass is taken with the "interpolate" algorithm (method="linear",
        limit_direction="both") to fill in remaining NaNs.

    Attributes
    ----------
    null_frac : `pandas.Series` or None
        The fraction of data points that are null in each column
        of the data passed to ``transform``. None until ``transform`` is called.
    _is_fitted : `bool` or None
        Whether the transformer is fitted. None until ``fit`` is called.
    missing_info : `dict` or None
        Information about the missing data.
        Set by ``transform`` if ``impute_algorithm = "ts_interpolate"``.
    """
    # `pandas.DataFrame.interpolate` methods rejected by `_check_impute_method`.
    _DISALLOWED_INTERPOLATE_METHODS = ("ffill", "pad", "bfill", "backfill")

    def __init__(
            self,
            max_frac=0.10,
            impute_algorithm=None,
            impute_params=None,
            impute_all=True):
        # sets params without modification to ensure get_params() works in grid search
        self.max_frac = max_frac
        self.impute_algorithm = impute_algorithm
        self.impute_params = impute_params
        self.impute_all = impute_all

        self.null_frac = None
        self._is_fitted = None
        self.missing_info = None
        self._check_impute_method()

    def _check_impute_method(self):
        """Rejects the disallowed fill methods for the "interpolate" algorithm.

        Raises
        ------
        ValueError
            If ``impute_algorithm`` is "interpolate" and
            ``impute_params["method"]`` is one of "ffill", "pad", "bfill", "backfill".
        """
        if self.impute_algorithm != "interpolate" or self.impute_params is None:
            return
        method = self.impute_params.get("method")
        if method in self._DISALLOWED_INTERPOLATE_METHODS:
            # These four methods treat "axis=0" as rows, "axis=1" as columns,
            # contrary to the pandas documentation. Avoid them to prevent misuse.
            raise ValueError(
                f"method '{method}' is not allowed. "
                f"Use method='linear' with `limit_direction` instead")

    def fit(self, X, y=None):
        """Updates ``self.impute_params`` with the defaults for the chosen algorithm.

        User-provided values in ``self.impute_params`` take precedence over
        the defaults in ``DEFAULT_PARAMS``. Nothing is learned from ``X``.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Training input data. e.g. each column is a timeseries.
            Columns are expected to be numeric.
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If a disallowed "interpolate" method is configured.
        """
        assert isinstance(X, pd.DataFrame)
        # Re-validates here because parameters assigned through `set_params`
        # (e.g. during grid search) never pass through `__init__`.
        self._check_impute_method()
        self._is_fitted = True
        # sets default parameters
        if self.impute_algorithm is not None:
            default_params = DEFAULT_PARAMS.get(self.impute_algorithm, {})
            self.impute_params = update_dictionary(default_params, overwrite_dict=self.impute_params)
        return self

    def transform(self, X):
        """Imputes missing values in input time series.

        Checks the % of data points that are null, and provides warning if
        it exceeds ``self.max_frac`` in any column.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Data to transform. e.g. each column is a timeseries.
            Columns are expected to be numeric.

        Returns
        -------
        X_imputed : `pandas.DataFrame`
            A copy of the data frame with original values and missing values imputed

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``impute_algorithm`` is not None, "interpolate" or "ts_interpolate".
        """
        if self._is_fitted is None:
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                "before calling 'transform'.")
        assert isinstance(X, pd.DataFrame)

        self.null_frac = X.isna().mean()  # fraction of NaNs in each column
        if (self.null_frac > self.max_frac).any():
            warnings.warn(f"Input data has many null values. Missing {self.null_frac.max():.2%} of one input.",
                          RuntimeWarning)
        if (self.null_frac > 0.0).any():
            log_message(f"Missing data detected: {self.null_frac.mean():.2%} of all input values "
                        f"are null. (If future external regressor(s) are used, some missing values in "
                        f"`value_col` are expected.)",
                        LoggingLevelEnum.INFO)

        if self.impute_algorithm is not None:
            if self.impute_algorithm == "interpolate":
                # Uses `pandas.DataFrame.interpolate`
                X_imputed = X.interpolate(**self.impute_params)
            elif self.impute_algorithm == "ts_interpolate":
                # Uses `impute_with_lags_multi`
                impute_info = impute_with_lags_multi(df=X, **self.impute_params)
                X_imputed = impute_info["df"]
                self.missing_info = impute_info["missing_info"]
            else:
                raise ValueError(f"`impute_algorithm` '{self.impute_algorithm}' is not recognized. "
                                 f"Must be one of 'ts_interpolate', 'interpolate'")

            if self.impute_all:
                # A second pass is taken to make sure there are no NaNs.
                X_imputed = X_imputed.interpolate(**DEFAULT_PARAMS["interpolate"])
        else:
            # no-op
            X_imputed = X.copy()
        return X_imputed