Source code for tsfresh.feature_selection.benjamini_hochberg_test

# -*- coding: utf-8 -*-
# This file as well as the whole tsfresh package are licenced under the MIT licence (see the LICENCE.txt)
# Maximilian Christ (maximilianchrist.com), Blue Yonder Gmbh, 2016

[docs]def benjamini_hochberg_test(df_pvalues, hypotheses_independent, fdr_level):
    """
    This is an implementation of the benjamini hochberg procedure that determines if the null hypothesis
    for a given feature can be rejected. For this the test regards the features' p-values
    and controls the global false discovery rate, which is the ratio of false rejections by all rejections:

    .. math::

        FDR = \\mathbb{E} \\left [ \\frac{ |\\text{false rejections}| }{ |\\text{all rejections}|} \\right]


    References
    ----------

    .. [1] Benjamini, Yoav and Yekutieli, Daniel (2001).
        The control of the false discovery rate in multiple testing under dependency.
        Annals of statistics, 1165--1188


    :param df_pvalues: This DataFrame should contain the p_values of the different hypotheses in a column named
                       "p_values".
    :type df_pvalues: pandas.DataFrame

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :return: The same DataFrame as the input, but with an added boolean column "relevant"
             denoting if the null hypotheses has been rejected for a given feature.
    :rtype: pandas.DataFrame
    """

    # Get auxiliary variables and vectors
    df_pvalues = df_pvalues.sort_values(by="p_value")
    m = len(df_pvalues)
    K = list(range(1, m + 1))

    # Calculate the weight vector C
    if hypotheses_independent:
        # c(k) = 1
        C = [1] * m
    else:
        # c(k) = \sum_{i=1}^m 1/i
        C = [sum([1.0 / i for i in range(1, k + 1)]) for k in K]

    # Calculate the vector T to compare to the p_value
    T = [fdr_level * k / m * 1.0 / c for k, c in zip(K, C)]

    # Get the last p_value for which H0 has been rejected
    try:
        k_max = list(df_pvalues.p_value <= T).index(False)
    except ValueError:
        k_max = m

    # Add the column denoting if null hypothesis has been rejected
    df_pvalues["relevant"] = [True] * k_max + [False] * (m - k_max)

    return df_pvalues