Source code for greykite.sklearn.transform.zscore_outlier_transformer
# BSD 2-CLAUSE LICENSE
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# original author: Albert Chen
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.exceptions import NotFittedError
from greykite.common.logging import LoggingLevelEnum
from greykite.common.logging import log_message
[docs]class ZscoreOutlierTransformer(BaseEstimator, TransformerMixin):
"""Replaces outliers in data with NaN.
Outliers are determined by z-score cutoff. Columns are handled independently.
Parameters
----------
z_cutoff : `float` or None, default None
z-score cutoff to define outliers. If None, this transformer is a no-op.
use_fit_baseline : `bool`, default False
If True, the z-scores are calculated using the mean and standard
deviation of the dataset passed to ``fit``.
If False, the transformer is stateless. z-scores are calculated
for the dataset passed to ``transform``, regardless of ``fit``.
Attributes
----------
mean : `pandas.Series`
Mean of each column. NaNs are ignored.
std : `pandas.Series`
Standard deviation of each column. NaNs are ignored.
_is_fitted : `bool`
Whether the transformer is fitted.
"""
def __init__(self, z_cutoff=None, use_fit_baseline=False):
# sets params without modification to ensure get_params() works in grid search
self.z_cutoff = z_cutoff
self.use_fit_baseline = use_fit_baseline
self.mean = None
self.std = None
self._is_fitted = None
[docs] def fit(self, X, y=None):
"""Computes the column mean and standard deviation,
stored as ``mean`` and ``std`` attributes.
Parameters
----------
X : `pandas.DataFrame`
Training input data. e.g. each column is a timeseries.
Columns are expected to be numeric.
y : None
There is no need of a target in a transformer, yet the pipeline API
requires this parameter.
Returns
-------
self : object
Returns self.
"""
assert isinstance(X, pd.DataFrame)
self._is_fitted = True
if self.z_cutoff is not None and self.use_fit_baseline:
self.mean = X.mean()
self.std = X.std()
return self
[docs] def transform(self, X):
"""Replaces outliers with NaN.
Parameters
----------
X : `pandas.DataFrame`
Data to transform. e.g. each column is a timeseries.
Columns are expected to be numeric.
Returns
-------
X_outlier : `pandas.DataFrame`
A copy of the data frame with original values and outliers replaced with NaN.
"""
assert isinstance(X, pd.DataFrame)
result = X.copy()
if self.z_cutoff is not None:
if self.use_fit_baseline:
if self._is_fitted is None:
raise NotFittedError(
"This instance is not fitted yet. Call 'fit' with appropriate arguments "
"before calling 'transform'.")
mean = self.mean
std = self.std
else:
mean = X.mean()
std = X.std()
outlier_indices = np.abs(X - mean) > std * self.z_cutoff
if np.any(outlier_indices):
total_na = outlier_indices.sum().sum()
log_message(f"Detected {total_na} outlier(s).", LoggingLevelEnum.INFO)
result = result.mask(outlier_indices)
return result