# -*- coding: utf-8 -*-
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016
"""
Utility functions for handling the DataFrame conversions to the internal normalized format
(see ``normalize_input_to_internal_representation``) or on how to handle ``NaN`` and ``inf`` in the DataFrames.
"""
import warnings
import numpy as np
import pandas as pd
import logging
_logger = logging.getLogger(__name__)
def check_for_nans_in_columns(df, columns=None):
    """
    Helper function to check for ``NaN`` in the data frame and raise a ``ValueError`` if there is one.

    :param df: the pandas DataFrame to test for NaNs
    :type df: pandas.DataFrame
    :param columns: a list of columns to test for NaNs. If left empty, all columns of the DataFrame will be tested.
        A single (non list-like) column label is accepted as well and is treated as a one-element list.
    :type columns: list

    :return: None
    :rtype: None
    :raise: ``ValueError`` if ``NaNs`` are found in the DataFrame.
    """
    if columns is None:
        # No selection requested: test the frame as it is.
        subset = df
    else:
        if not pd.api.types.is_list_like(columns):
            # A bare label such as "value" must not be split into its characters by list().
            columns = [columns]
        subset = df.loc[:, list(columns)]

    # Count the missing entries once per column and reuse the result for the error message.
    null_counts = pd.isnull(subset).sum()
    columns_with_nans = null_counts.index[(null_counts > 0).values].tolist()

    if columns_with_nans:
        raise ValueError("Columns {} of DataFrame must not contain NaN values".format(columns_with_nans))
def impute(df_impute):
    """
    Columnwise replaces all ``NaNs`` and ``infs`` from the DataFrame `df_impute` with average/extreme values from
    the same columns. This is done as follows: Each occurring ``inf`` or ``NaN`` in `df_impute` is replaced by

        * ``-inf`` -> ``min``
        * ``+inf`` -> ``max``
        * ``NaN`` -> ``median``

    If the column does not contain finite values at all, it is filled with zeros.

    This function modifies `df_impute` in place. After that, df_impute is guaranteed to not contain any non-finite
    values. Also, all columns will be guaranteed to be of type ``np.float64``.

    :param df_impute: DataFrame to impute
    :type df_impute: pandas.DataFrame

    :return df_impute: imputed DataFrame
    :rtype df_impute: pandas.DataFrame
    """
    # The replacement values are derived from the very frame that gets imputed.
    col_to_max, col_to_min, col_to_median = get_range_values_per_column(df_impute)
    df_impute = impute_dataframe_range(df_impute, col_to_max, col_to_min, col_to_median)

    # Ensure a type of "np.float64". DataFrame.astype returns a new object instead of
    # converting in place, so the converted columns have to be written back explicitly.
    for column, dtype in zip(df_impute.columns, df_impute.dtypes):
        if dtype != np.float64:
            df_impute[column] = df_impute[column].astype(np.float64)

    return df_impute
def impute_dataframe_zero(df_impute):
    """
    Replaces all ``NaNs``, ``-infs`` and ``+infs`` from the DataFrame `df_impute` with 0s.
    The `df_impute` will be modified in place. All its columns will be converted into dtype ``np.float64``.

    :param df_impute: DataFrame to impute
    :type df_impute: pandas.DataFrame

    :return df_impute: imputed DataFrame
    :rtype df_impute: pandas.DataFrame
    """
    # np.inf / -np.inf are used instead of the aliases np.PINF / np.NINF, which newer numpy releases dropped.
    df_impute.replace([np.inf, -np.inf], 0, inplace=True)
    df_impute.fillna(0, inplace=True)

    # Ensure a type of "np.float64". DataFrame.astype returns a new object instead of
    # converting in place, so the converted columns have to be written back explicitly.
    for column, dtype in zip(df_impute.columns, df_impute.dtypes):
        if dtype != np.float64:
            df_impute[column] = df_impute[column].astype(np.float64)

    return df_impute
def impute_dataframe_range(df_impute, col_to_max, col_to_min, col_to_median):
    """
    Columnwise replaces all ``NaNs``, ``-inf`` and ``+inf`` from the DataFrame `df_impute` with average/extreme values
    from the provided dictionaries.

    This is done as follows: Each occurring ``inf`` or ``NaN`` in `df_impute` is replaced by

        * ``-inf`` -> by value in col_to_min
        * ``+inf`` -> by value in col_to_max
        * ``NaN`` -> by value in col_to_median

    If a column of df_impute is not found in the one of the dictionaries, this method will raise a ValueError.
    Also, if one of the values to replace is not finite a ValueError is raised.

    This function modifies `df_impute` in place. Afterwards df_impute is
    guaranteed to not contain any non-finite values.
    Also, all columns will be guaranteed to be of type ``np.float64``.

    :param df_impute: DataFrame to impute
    :type df_impute: pandas.DataFrame
    :param col_to_max: Dictionary mapping column names to max values
    :type col_to_max: dict
    :param col_to_min: Dictionary mapping column names to min values
    :type col_to_min: dict
    :param col_to_median: Dictionary mapping column names to median values
    :type col_to_median: dict

    :return df_impute: imputed DataFrame
    :rtype df_impute: pandas.DataFrame
    :raise ValueError: if a column of df_impute is missing in col_to_max, col_to_min or col_to_median or a value
                       to replace is non finite
    """
    columns = df_impute.columns

    # Making sure col_to_median, col_to_max and col_to_min have entries for every column
    if not set(columns) <= set(col_to_median.keys()) or \
            not set(columns) <= set(col_to_max.keys()) or \
            not set(columns) <= set(col_to_min.keys()):
        raise ValueError("Some of the dictionaries col_to_median, col_to_max, col_to_min contains more or less keys "
                         "than the column names in df")

    # check if there are non finite values for the replacement
    if np.any(~np.isfinite(list(col_to_median.values()))) or \
            np.any(~np.isfinite(list(col_to_min.values()))) or \
            np.any(~np.isfinite(list(col_to_max.values()))):
        raise ValueError("Some of the dictionaries col_to_median, col_to_max, col_to_min contains non finite values "
                         "to replace")

    # Work on a float64 copy of the data. The replacement is done cell by cell with numpy, because
    # indexing a DataFrame with ``iloc[row_positions, column_positions]`` selects the cross product of
    # rows and columns and would therefore also overwrite finite cells.
    values = np.array(df_impute.values, dtype=np.float64)
    needs_replacement = (~np.isfinite(values)).any(axis=0)

    # One replacement value per column; broadcast along the rows by np.where.
    max_per_column = np.array([col_to_max[column] for column in columns], dtype=np.float64)
    min_per_column = np.array([col_to_min[column] for column in columns], dtype=np.float64)
    median_per_column = np.array([col_to_median[column] for column in columns], dtype=np.float64)

    # +inf -> max
    values = np.where(values == np.inf, max_per_column, values)
    # -inf -> min
    values = np.where(values == -np.inf, min_per_column, values)
    # NaN -> median
    values = np.where(np.isnan(values), median_per_column, values)

    # Write back only the columns that changed or that are not float64 yet; the others stay untouched.
    for position, (column, dtype) in enumerate(zip(columns, df_impute.dtypes)):
        if needs_replacement[position] or dtype != np.float64:
            df_impute[column] = values[:, position]

    return df_impute
def get_range_values_per_column(df):
    """
    Retrieves the finite max, min and median values per column in the DataFrame `df` and stores them in three
    dictionaries. Those dictionaries `col_to_max`, `col_to_min`, `col_to_median` map the columnname to the maximal,
    minimal or median value of that column.

    If a column does not contain any finite values at all, a 0 is stored instead.

    :param df: the Dataframe to get columnswise max, min and median from
    :type df: pandas.DataFrame

    :return: Dictionaries mapping column names to max, min, median values
    :rtype: (dict, dict, dict)
    """
    # ``.values`` is used instead of ``get_values()``, which newer pandas releases no longer provide.
    data = df.values
    columns = df.columns

    if data.shape[0] == 0:
        # Without any row no column has a finite value and the reductions below would fail
        # on the zero-size axis, so 0 is stored for every column.
        if len(columns) > 0:
            _logger.warning("The columns {} did not have any finite values. Filling with zeros.".format(
                columns.values))
        return ({column: 0 for column in columns},
                {column: 0 for column in columns},
                {column: 0 for column in columns})

    # masked_invalid copies the data by default, so the frame itself is not modified below.
    masked = np.ma.masked_invalid(data)

    # A column is non finite if every single one of its entries is masked.
    is_col_non_finite = np.ma.getmaskarray(masked).sum(axis=0) == masked.data.shape[0]

    if np.any(is_col_non_finite):
        # We have columns that does not contain any finite value at all, so we will store 0 instead.
        _logger.warning("The columns {} did not have any finite values. Filling with zeros.".format(
            df.iloc[:, np.where(is_col_non_finite)[0]].columns.values))

        masked.data[:, is_col_non_finite] = 0  # Set the values of the columns to 0
        masked.mask[:, is_col_non_finite] = False  # Remove the mask for this column

    # fetch max, min and median for all columns
    col_to_max = dict(zip(columns, np.max(masked, axis=0)))
    col_to_min = dict(zip(columns, np.min(masked, axis=0)))
    col_to_median = dict(zip(columns, np.ma.median(masked, axis=0)))

    return col_to_max, col_to_min, col_to_median
# todo: add more testcases
# todo: rewrite in a more straightforward way
def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction,
                     maximum_number_of_timeshifts=None):
    """
    Roll the (sorted) data frames for each kind and each id separately in the "time" domain
    (which is represented by the sort order of the sort column given by `column_sort`).

    For each rolling step, a new id is created by the scheme "id={id}, shift={shift}", here id is the former id of the
    column and shift is the amount of "time" shifts.

    A few remarks:

     * This method will create new IDs!
     * The sign of rolling defines the direction of time rolling, a positive value means we are going back in time
     * It is possible to shift time series of different lengths but
     * We assume that the time series are uniformly sampled
     * For more information, please see :ref:`rolling-label`.

    :param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of
        the passed arguments.
    :type df_or_dict: pandas.DataFrame or dict
    :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
        It is not allowed to have NaN values in this column.
    :type column_id: basestring or None
    :param column_sort: if not None, sort the rows by this column. It is not allowed to
        have NaN values in this column.
    :type column_sort: basestring or None
    :param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be
        grouped by the kind). Is must be present in the DataFrame and no NaN values are allowed.
        If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or
        sort column) is a possible kind.
    :type column_kind: basestring or None
    :param rolling_direction: The sign decides, if to roll backwards or forwards in "time"
    :type rolling_direction: int
    :param maximum_number_of_timeshifts: If not None, shift only up to maximum_number_of_timeshifts.
        If None, shift as often as possible.
    :type maximum_number_of_timeshifts: int

    :return: The rolled data frame or dictionary of data frames
    :rtype: the one from df_or_dict
    :raise ValueError: if `rolling_direction` is 0, if a dictionary is passed together with a `column_kind`,
        if `column_id` is None, if the sort column contains NaN values or if the maximal length of the time
        series can not be determined
    :raise AttributeError: if `column_id` is not present in the data
    """
    if rolling_direction == 0:
        raise ValueError("Rolling direction of 0 is not possible")

    if isinstance(df_or_dict, dict):
        if column_kind is not None:
            raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.")

        # Roll every frame on its own; the limit on the number of shifts has to be handed down as well.
        return {key: roll_time_series(df_or_dict=df_or_dict[key],
                                      column_id=column_id,
                                      column_sort=column_sort,
                                      column_kind=column_kind,
                                      rolling_direction=rolling_direction,
                                      maximum_number_of_timeshifts=maximum_number_of_timeshifts)
                for key in df_or_dict}

    # Now we know that this is a pandas data frame
    df = df_or_dict

    if column_id is not None:
        if column_id not in df:
            raise AttributeError("The given column for the id is not present in the data.")
    else:
        raise ValueError("You have to set the column_id which contains the ids of the different time series")

    # The group keys are passed as a list: pandas treats a tuple as one single key.
    if column_kind is not None:
        grouper = [column_kind, column_id]
    else:
        grouper = [column_id]

    if column_sort is not None:
        # Require no Nans in column
        if df[column_sort].isnull().any():
            raise ValueError("You have NaN values in your sort column.")

        df = df.sort_values(column_sort)

        # if rolling is enabled, the data should be uniformly sampled in this column
        # Build the differences between consecutive time sort values within each group
        # (the first entry of every group has no predecessor and is dropped)
        sampling_steps = df.groupby(grouper)[column_sort].diff().dropna()

        # Test if all differences are the same
        if sampling_steps.nunique() > 1:
            warnings.warn("Your time stamps are not uniformly sampled, which makes rolling "
                          "nonsensical in some domains.")

    # Roll the data frames if requested
    rolling_direction = np.sign(rolling_direction)

    grouped_data = df.groupby(grouper)
    # Without an explicit limit, shift as often as the longest group has entries.
    maximum_number_of_timeshifts = maximum_number_of_timeshifts or grouped_data.count().max().max()

    if np.isnan(maximum_number_of_timeshifts):
        raise ValueError("Somehow the maximum length of your time series is NaN (Does your time series container have "
                         "only one row?). Can not perform rolling.")

    if rolling_direction > 0:
        range_of_shifts = range(maximum_number_of_timeshifts, -1, -1)
    else:
        range_of_shifts = range(-maximum_number_of_timeshifts, 1)

    def roll_out_time_series(time_shift):
        # Shift out only the first "time_shift" rows
        df_temp = grouped_data.shift(time_shift)
        # Every shift gets its own ids, built from the former id and the shift.
        df_temp[column_id] = "id=" + df[column_id].map(str) + ", shift={}".format(time_shift)
        if column_kind:
            df_temp[column_kind] = df[column_kind]
        # The rows that were shifted out only hold NaNs and are removed here.
        return df_temp.dropna()

    return pd.concat([roll_out_time_series(time_shift) for time_shift in range_of_shifts],
                     ignore_index=True)