# -*- coding: utf-8 -*-
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from tsfresh.utilities.dataframe_functions import (
get_range_values_per_column,
impute_dataframe_range,
)
[docs]
class PerColumnImputer(BaseEstimator, TransformerMixin):
"""
Sklearn-compatible estimator, for column-wise imputing DataFrames by replacing all ``NaNs`` and ``infs``
with with average/extreme values from the same columns. It is basically a wrapper around
:func:`~tsfresh.utilities.dataframe_functions.impute`.
Each occurring ``inf`` or ``NaN`` in the DataFrame is replaced by
* ``-inf`` -> ``min``
* ``+inf`` -> ``max``
* ``NaN`` -> ``median``
This estimator - as most of the sklearn estimators - works in a two step procedure. First, the ``.fit``
function is called where for each column the min, max and median are computed.
Secondly, the ``.transform`` function is called which replaces the occurances of ``NaNs`` and ``infs`` using
the column-wise computed min, max and median values.
"""
def __init__(
self,
col_to_NINF_repl_preset=None,
col_to_PINF_repl_preset=None,
col_to_NAN_repl_preset=None,
):
"""
Create a new PerColumnImputer instance, optionally with dictionaries containing replacements for
``NaNs`` and ``infs``.
:param col_to_NINF_repl: Dictionary mapping column names to ``-inf`` replacement values
:type col_to_NINF_repl: dict
:param col_to_PINF_repl: Dictionary mapping column names to ``+inf`` replacement values
:type col_to_PINF_repl: dict
:param col_to_NAN_repl: Dictionary mapping column names to ``NaN`` replacement values
:type col_to_NAN_repl: dict
"""
self._col_to_NINF_repl = None
self._col_to_PINF_repl = None
self._col_to_NAN_repl = None
self.col_to_NINF_repl_preset = col_to_NINF_repl_preset
self.col_to_PINF_repl_preset = col_to_PINF_repl_preset
self.col_to_NAN_repl_preset = col_to_NAN_repl_preset
[docs]
def fit(self, X, y=None):
"""
Compute the min, max and median for all columns in the DataFrame. For more information,
please see the :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function.
:param X: DataFrame to calculate min, max and median values on
:type X: pandas.DataFrame
:param y: Unneeded.
:type y: Any
:return: the estimator with the computed min, max and median values
:rtype: Imputer
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
col_to_max, col_to_min, col_to_median = get_range_values_per_column(X)
if self.col_to_NINF_repl_preset is not None:
if not set(X.columns) >= set(self.col_to_NINF_repl_preset.keys()):
raise ValueError(
"Preset dictionary 'col_to_NINF_repl_preset' contain more keys "
"than the column names in X"
)
col_to_min.update(self.col_to_NINF_repl_preset)
self._col_to_NINF_repl = col_to_min
if self.col_to_PINF_repl_preset is not None:
if not set(X.columns) >= set(self.col_to_PINF_repl_preset.keys()):
raise ValueError(
"Preset dictionary 'col_to_PINF_repl_preset' contain more keys "
"than the column names in X"
)
col_to_max.update(self.col_to_PINF_repl_preset)
self._col_to_PINF_repl = col_to_max
if self.col_to_NAN_repl_preset is not None:
if not set(X.columns) >= set(self.col_to_NAN_repl_preset.keys()):
raise ValueError(
"Preset dictionary 'col_to_NAN_repl_preset' contain more keys "
"than the column names in X"
)
col_to_median.update(self.col_to_NAN_repl_preset)
self._col_to_NAN_repl = col_to_median
return self