Source code for greykite.sklearn.transform.null_transformer

# BSD 2-CLAUSE LICENSE

# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:

# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen

import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.exceptions import NotFittedError

from greykite.common.features.timeseries_impute import impute_with_lags_multi
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
from greykite.common.python_utils import update_dictionary


# Default keyword arguments for each supported imputation algorithm,
# keyed by the ``impute_algorithm`` name.
DEFAULT_PARAMS = {
    # Defaults for the "interpolate" algorithm.
    "interpolate": {
        "method": "linear",
        "limit_direction": "both",
        "axis": 0,  # fills column-by-column
    },
    # Defaults for the "ts_interpolate" algorithm.
    "ts_interpolate": {
        "orders": [7, 14, 21],
        "agg_func": np.mean,
        "iter_num": 5,
    },
}


class NullTransformer(BaseEstimator, TransformerMixin):
    """Imputes nulls in time series data.

    This transform is stateless in the sense that ``transform`` output
    does not depend on the data passed to ``fit``.
    The dataset passed to ``transform`` is used to impute itself.

    Parameters
    ----------
    max_frac : `float`, default 0.10
        Issues a warning if the fraction of nulls in any column is above this value.
    impute_algorithm : `str` or None, default None
        Which imputation algorithm to use.
        Valid options are:

            - "interpolate" : `pandas.DataFrame.interpolate`
            - "ts_interpolate" : `~greykite.common.features.timeseries_impute.impute_with_lags_multi`.

        If None, this transformer is a no-op.
        No null imputation is done.
    impute_params : `dict` or None, default None
        Params to pass to the imputation algorithm.
        See `pandas.DataFrame.interpolate` and
        `~greykite.common.features.timeseries_impute.impute_with_lags_multi`
        for their respective options.

        For pandas "interpolate", the "ffill", "pad", "bfill", "backfill" methods
        are not allowed to avoid confusion with the fill axis parameter.
        Use "linear" with ``axis=0`` instead, with direction controlled by ``limit_direction``.

        If None, uses the defaults provided in this class.
    impute_all : `bool`, default True
        Whether to impute all values. If True, NaNs are not allowed in
        the transformed result. Ignored if ``impute_algorithm`` is None.

        The transform specified by ``impute_algorithm`` and ``impute_params``
        may leave NaNs in the dataset. For example, if it fills in the forward
        direction but the first value in a column is NaN.

        A first pass is taken with the impute algorithm specified.
        A second pass is taken with the "interpolate" algorithm
        (method="linear", limit_direction="both") to fill in remaining NaNs.

    Attributes
    ----------
    null_frac : `pandas.Series` or None
        The fraction of data points that are null in each column.
        None until ``transform`` is called.
    _is_fitted : `bool` or None
        Whether the transformer is fitted. None until ``fit`` is called.
    missing_info : `dict` or None
        Information about the missing data.
        Set by ``transform`` if ``impute_algorithm = "ts_interpolate"``.
    """

    def __init__(
            self,
            max_frac=0.10,
            impute_algorithm=None,
            impute_params=None,
            impute_all=True):
        # sets params without modification to ensure get_params() works in grid search
        self.max_frac = max_frac
        self.impute_algorithm = impute_algorithm
        self.impute_params = impute_params
        self.impute_all = impute_all

        self.null_frac = None
        self._is_fitted = None
        self.missing_info = None

        self._check_impute_params()

    def _check_impute_params(self):
        """Rejects the pandas "interpolate" fill methods that this class disallows.

        Raises
        ------
        ValueError
            If ``impute_algorithm`` is "interpolate" and ``impute_params["method"]``
            is one of "ffill", "pad", "bfill", "backfill".
        """
        if (self.impute_algorithm == "interpolate"
                and self.impute_params is not None
                and self.impute_params.get("method") in ["ffill", "pad", "bfill", "backfill"]):
            # These four methods treat "axis=0" as rows, "axis=1" as columns,
            # contrary to the pandas documentation. Avoid them to prevent misuse.
            raise ValueError(
                f"method '{self.impute_params['method']}' is not allowed. "
                f"Use method='linear' with `limit_direction` instead")

    def fit(self, X, y=None):
        """Updates `self.impute_params`.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Training input data. e.g. each column is a timeseries.
            Columns are expected to be numeric.
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``impute_params`` requests a disallowed "interpolate" method.
        """
        assert isinstance(X, pd.DataFrame)
        # Validates again here: parameters assigned through ``set_params``
        # (e.g. during grid search) do not pass through ``__init__``.
        self._check_impute_params()
        self._is_fitted = True
        # sets default parameters
        # NOTE: this overwrites the constructor parameter ``impute_params``
        # with the result of merging it onto the algorithm defaults.
        if self.impute_algorithm is not None:
            default_params = DEFAULT_PARAMS.get(self.impute_algorithm, {})
            self.impute_params = update_dictionary(default_params, overwrite_dict=self.impute_params)
        return self

    def transform(self, X):
        """Imputes missing values in input time series.

        Checks the % of data points that are null, and provides warning if
        it exceeds ``self.max_frac``.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Data to transform. e.g. each column is a timeseries.
            Columns are expected to be numeric.

        Returns
        -------
        X_imputed : `pandas.DataFrame`
            A copy of the data frame with original values and missing values imputed

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``impute_algorithm`` is not None, "interpolate" or "ts_interpolate".
        """
        if self._is_fitted is None:
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                "before calling 'transform'.")
        assert isinstance(X, pd.DataFrame)

        self.null_frac = X.isna().mean()  # fraction of NaNs in each column
        if (self.null_frac > self.max_frac).any():
            warnings.warn(f"Input data has many null values. Missing {self.null_frac.max():.2%} of one input.",
                          RuntimeWarning)

        if (self.null_frac > 0.0).any():
            log_message(f"Missing data detected: {self.null_frac.mean():.2%} of all input values "
                        f"are null. (If future external regressor(s) are used, some missing values in "
                        f"`value_col` are expected.)",
                        LoggingLevelEnum.INFO)

        if self.impute_algorithm is not None:
            if self.impute_algorithm == "interpolate":
                # Uses `pandas.DataFrame.interpolate`
                X_imputed = X.interpolate(**self.impute_params)
            elif self.impute_algorithm == "ts_interpolate":
                # Uses `impute_with_lags_multi`
                impute_info = impute_with_lags_multi(df=X, **self.impute_params)
                X_imputed = impute_info["df"]
                self.missing_info = impute_info["missing_info"]
            else:
                raise ValueError(f"`impute_algorithm` '{self.impute_algorithm}' is not recognized. "
                                 f"Must be one of 'ts_interpolate', 'interpolate'")

            if self.impute_all:
                # A second pass is taken to make sure there are no NaNs.
                X_imputed = X_imputed.interpolate(**DEFAULT_PARAMS["interpolate"])
        else:
            # no-op
            X_imputed = X.copy()
        return X_imputed