Source code for graphpype.utils_stats

import scipy.stats as stat
import numpy as np
import itertools as it
"""
Functions for computing statistics over symetrical matrices (pairwise)

Input parameters:

All functions accept two (or one for test vs 0) stack of matrices (3D objects).

Old order (old_order = True) refers to a framework where the last dimension was 
referring to the stacking of matrices (10*10*20 for 20 samples of matrices with 
10 nodes) whereas the new order (i.e. old_order = False # noqa

keep_intracon=True means computing the tests of the diagonal as well. It is 
relevant for the test of number of inter-modular edges (in the case the 
diagonal will be the number of edges within a module). For fonctional 
connectivity the inclusion of diagonal is irrelevant. # noqa

Returns:
All the functions returns 3 objects as a tuple:
- a matrix of significance level, with the following code:
0 -> not significant
1 -> uncorrected significant (according to the parameter uncor_alpha, setting 
the uncorrected threshold) # noqa
2 -> FP significant (i.e., passing a threshold of 1/N_tests, given it is more 
stringent than the uncor_alpha) # noqa
    This threshold is mildly accepted in the scientific community but may be 
justified in case of graph connectivity, see Lynall and Bassett, 2013 # noqa
3 -> FDR significant (as set as the fdr_alpha, = cor_alpha in most cases)
4 -> Bonferroni significant (as set as the bon_alpha  = cor_alpha in most 
cases) # noqa
The sign of the code (+1 or -1 give the direction of the significant: +1: X > 
Y, -1: Y > X) # noqa

- a matrix of p-value associated with the statitical test

- the statistics value (T values for t-test, R^2 for correlation, etc)

"""


def _return_signif_code(p_values, uncor_alpha=0.001, fdr_alpha=0.05,
                        bon_alpha=0.05):
    """private function for sorting p-values and computing signif codes"""
    N = p_values.shape[0]

    # by default, code = 1 (cor at 0.05)
    signif_code = np.ones(shape=N)

    # uncor
    # code = 0 for all correlation below uncor_alpha
    signif_code[p_values >= uncor_alpha] = 0

    # FPcor
    if 1.0/N < uncor_alpha:
        signif_code[p_values < 1.0/N] = 2

    # fdr
    seq = np.arange(N, 0, -1)
    seq_fdr_p_values = fdr_alpha/seq
    order = p_values.argsort()
    signif_sorted = p_values[order] < seq_fdr_p_values
    signif_code[order[signif_sorted]] = 3

    # bonferroni
    signif_code[p_values < bon_alpha/N] = 4

    return signif_code


def _return_signif_code_Z(Z_values, uncor_alpha=0.001, fdr_alpha=0.05,
                          bon_alpha=0.05):
    """ private function for sorting Z-score and return signifcode"""
    N = Z_values.shape[0]

    # by default, code = 1 (cor at 0.05)
    signif_code = np.ones(shape=N)

    # uncor
    Z_uncor = stat.norm.ppf(1-uncor_alpha/2)
    signif_code[Z_values < Z_uncor] = 0

    # FPcor
    Z_FPcor = stat.norm.ppf(1-(1.0/(2*N)))
    signif_code[Z_values > Z_FPcor] = 2

    # fdr
    seq = np.arange(N, 0, -1)
    seq_fdr_p_values = fdr_alpha/seq
    seq_Z_val = stat.norm.ppf(1-seq_fdr_p_values/2)
    order = (-Z_values).argsort()  # sorted in reversed order
    sorted_Z_values = Z_values[order]  # sorted in reversed order
    signif_sorted = sorted_Z_values > seq_Z_val
    signif_code[order[signif_sorted]] = 3

    # bonferroni
    Z_bon = stat.norm.ppf(1-bon_alpha/(2*N))
    signif_code[Z_values > Z_bon] = 4

    return signif_code


[docs]def compute_pairwise_ttest_fdr(X, Y, cor_alpha, uncor_alpha, paired=True,
                               old_order=True, keep_intracon=False):
    """Two-way pairwise T-test stats"""
    # old order was n_nodes, n_nodes, sample_size
    # new order is sample_size, n_nodes, n_nodes,

    # number of nodes
    if old_order:
        X = np.moveaxis(X, 2, 0)
        Y = np.moveaxis(Y, 2, 0)

    # test squared matrices
    assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X \
        {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2],
                                                     Y.shape[1], Y.shape[2]))

    # test same number of nodes between X and Y
    assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \
        same number of nodes".format(X.shape[1], Y.shape[1]))

    # test if same number of sample (paired t-test only)
    if paired:
        assert X.shape[0] == Y.shape[0], ("Error, X and Y are paired but do\
           not have the same number od samples{}{}".format(X.shape[0],
                                                           Y.shape[0]))
    # nb nodes
    N = X.shape[1]

    # tests are also done on the diagonal of the matrix
    if keep_intracon:
        iter_indexes = it.combinations_with_replacement(list(range(N)), 2)
    else:
        iter_indexes = it.combinations(list(range(N)), 2)

    # computing signif t-tests for each relevant pair
    list_diff = []

    for i, j in iter_indexes:

        # removing the nan
        X_nonan = X[np.logical_not(np.isnan(X[:, i, j])), i, j]
        Y_nonan = Y[np.logical_not(np.isnan(Y[:, i, j])), i, j]

        if len(X_nonan) < 2 or len(Y_nonan) < 2:
            print("Not enough values for sample {} {}, len = {} and {},\
                    skipping".format(i, j, len(X_nonan), len(Y_nonan)))
            continue

        if paired:
            t_stat, p_val = stat.ttest_rel(X_nonan, Y_nonan)
            if np.isnan(p_val):
                print("Warning, unable to compute T-test: ")

            # TODO pas encore present (version scipy 0.18)
            # t_stat,p_val = stat.ttest_rel(X[i,j,:],Y[i,j,:],
            # nan_policy = 'omit')

        else:
            t_stat, p_val = stat.ttest_ind(X_nonan, Y_nonan)

        list_diff.append([i, j, p_val, np.sign(
            np.mean(X_nonan)-np.mean(Y_nonan)), t_stat])

    assert len(list_diff) != 0, "Error, list_diff is empty"

    np_list_diff = np.array(list_diff)

    signif_code = _return_signif_code(np_list_diff[:, 2],
                                      uncor_alpha=uncor_alpha,
                                      fdr_alpha=cor_alpha,
                                      bon_alpha=cor_alpha)

    np_list_diff[:, 3] *= signif_code

    # formatting signif_mat, p_val_mat and T_stat_mat
    signif_mat = np.zeros((N, N), dtype='int')
    p_val_mat = np.zeros((N, N), dtype='float')
    T_stat_mat = np.zeros((N, N), dtype='float')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)

    signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int)
    signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int)

    p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2]
    T_stat_mat[s_i, s_j] = T_stat_mat[s_j, s_i] = np_list_diff[:, 4]

    return signif_mat, p_val_mat, T_stat_mat


[docs]def compute_pairwise_oneway_ttest_fdr(X, cor_alpha, uncor_alpha,
                                      old_order=True):
    """Oneway pairwise T-test stats"""
    if old_order:
        X = np.moveaxis(X, 2, 0)

    # number of nodes
    assert X.shape[1] == X.shape[2],  ("Error, X {}{} and/or Y {}{} are not \
        squared".format(X.shape[1], X.shape[2]))

    N = X.shape[1]

    list_diff = []

    for i, j in it.combinations(list(range(N)), 2):

        X_nonan = X[np.logical_not(np.isnan(X[:, i, j])), i, j]

        if len(X_nonan) < 2:
            print("Not enough values for sample {} {}, len = {}, \
                   skipping".format(i, j, len(X_nonan)))
            continue

        t_stat, p_val = stat.ttest_1samp(X_nonan, 0.0)  # 0.0 ?

        if np.isnan(p_val):

            print("Warning, unable to compute T-test: ")
            print(t_stat, p_val, X_nonan)

        list_diff.append([i, j, p_val, np.sign(np.mean(X_nonan)), t_stat])

    print(list_diff)

    assert len(list_diff) != 0, "Error, list_diff is empty"

    np_list_diff = np.array(list_diff)

    print(np_list_diff)

    signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha,
                                      fdr_alpha=cor_alpha, bon_alpha=cor_alpha)

    np_list_diff[:, 3] *= signif_code

    signif_mat = np.zeros((N, N), dtype='int')
    p_val_mat = np.zeros((N, N), dtype='float')
    T_stat_mat = np.zeros((N, N), dtype='float')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)

    signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int)
    signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int)
    p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2]
    T_stat_mat[s_i, s_j] = T_stat_mat[s_j, s_i] = np_list_diff[:, 4]

    print(T_stat_mat)

    return signif_mat, p_val_mat, T_stat_mat


[docs]def compute_pairwise_mannwhitney_fdr(X, Y, cor_alpha, uncor_alpha=0.01,
                                     old_order=True):
    """compute pairwise Mann Whitney test"""
    """modified to be compatible with old_order = True
    (was only developed for old order) + assert"""
    # TODO : test if OK with moveaxis and 'new order'?
    # TODO : return parameter of mannwhitneyu (i.e. "U" values)?

    if old_order:
        X = np.moveaxis(X, 2, 0)
        Y = np.moveaxis(Y, 2, 0)

    # Assert test squared matrices
    assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X\
        {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2],
                                                     Y.shape[1], Y.shape[2]))

    # Assert test same number of nodes between X and Y
    assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \
        same number of nodes".format(X.shape[1], Y.shape[1]))

    # number of nodes
    N = X.shape[1]

    # compute pairwise test
    list_diff = []

    for i, j in it.combinations(list(range(N)), 2):

        # TODO: handles nan correctly??
        X_val = X[:, i, j]
        Y_val = Y[:, i, j]
        u_stat, p_val = stat.mannwhitneyu(X_val, Y_val, use_continuity=False,
                                          alternative="two-sided")

        sign_diff = np.sign(np.mean(X_val)-np.mean(Y_val))

        list_diff.append([i, j, p_val, sign_diff])

    np_list_diff = np.array(list_diff)

    signif_code = _return_signif_code(np_list_diff[:, 2],
                                      uncor_alpha=uncor_alpha,
                                      fdr_alpha=cor_alpha, bon_alpha=cor_alpha)

    np_list_diff[:, 3] = np_list_diff[:, 3] * signif_code

    signif_mat = np.zeros((N, N), dtype='int')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)
    signif_sign = np.array(np_list_diff[:, 3], dtype=int)

    signif_mat[s_i, s_j] = signif_mat[s_j, s_i] = signif_sign

    return signif_mat


def _info_CI(X, Y):
    """ Compute binomial comparaison"""
    nX = len(X) * 1.
    nY = len(Y) * 1.

    pX = np.sum(X == 1)/nX
    pY = np.sum(Y == 1)/nY

    SE = np.sqrt(pX * (1-pX)/nX + pY * (1-pY)/nY)

    return np.absolute(pX-pY), SE, np.sign(pX-pY)


def compute_pairwise_binom_fdr(X, Y, uncor_alpha=0.001, cor_alpha=0.05,
                               old_order=True):
    """modified to be compatible with old_order = True
    (was only developed for old order) + assert"""
    # TODO : test if OK with moveaxis and 'new order'?

    if old_order:
        X = np.moveaxis(X, 2, 0)
        Y = np.moveaxis(Y, 2, 0)

    # Assert test squared matrices
    assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X\
        {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2],
                                                     Y.shape[1], Y.shape[2]))

    # assert test same number of nodes between X and Y
    assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \
        same number of nodes".format(X.shape[1], Y.shape[1]))

    # number of nodes
    N = X.shape[1]

    # Perform binomial test at each edge
    list_diff = []

    for i, j in it.combinations(list(range(N)), 2):
        abs_diff, SE, sign_diff = _info_CI(X[:, i, j], Y[:, i, j])
        list_diff.append([i, j, abs_diff/SE, sign_diff])

    np_list_diff = np.array(list_diff)

    signif_code = _return_signif_code_Z(np_list_diff[:, 2],
                                        uncor_alpha=uncor_alpha,
                                        fdr_alpha=cor_alpha,
                                        bon_alpha=cor_alpha)

    np_list_diff[:, 3] = np_list_diff[:, 3] * signif_code

    signif_mat = np.zeros((N, N), dtype='int')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)

    signif_sign = np.array(np_list_diff[:, 3], dtype=int)
    signif_mat[s_i, s_j] = signif_mat[s_j, s_i] = signif_sign

    return signif_mat


def compute_oneway_anova_fwe(list_of_list_matrices, cor_alpha=0.05,
                             uncor_alpha=0.001, keep_intracon=False):
    """OneWay Anova (F-test)"""
    # TODO : warning, this is very different than previous functions,
    # needs tobe checked where it is called
    assert False, ("Warning, very old function, check your call and report it \
        to developer")

    for group_mat in list_of_list_matrices:
        assert group_mat.shape[1] == group_mat.shape[2], ("warning, matrices \
            are not squared {} {}".format(group_mat.shape[1],
                                          group_mat.shape[2]))

    N = group_mat.shape[2]

    list_diff = []

    if keep_intracon:
        iter_indexes = it.combinations_with_replacement(list(range(N)), 2)

    else:
        iter_indexes = it.combinations(list(range(N)), 2)

    for i, j in iter_indexes:

        list_val = [group_mat[:, i, j].tolist()
                    for group_mat in list_of_list_matrices]

        F_stat, p_val = stat.f_oneway(*list_val)

        list_diff.append([i, j, p_val, F_stat])

    # computing significance code
    np_list_diff = np.array(list_diff)

    signif_code = _return_signif_code(np_list_diff[:, 2],
                                      uncor_alpha=uncor_alpha,
                                      fdr_alpha=cor_alpha,
                                      bon_alpha=cor_alpha)

    signif_code[np.isnan(np_list_diff[:, 2])] = 0

    # converting to matrix
    signif_adj_mat = np.zeros((N, N), dtype='int')
    p_val_mat = np.zeros((N, N), dtype='float')
    F_stat_mat = np.zeros((N, N), dtype='float')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)

    signif_adj_mat[s_i, s_j] = signif_adj_mat[s_j, s_i] = signif_code
    p_val_mat[s_i, s_j] = p_val_mat[s_i, s_j] = np_list_diff[:, 2]
    F_stat_mat[s_i, s_j] = F_stat_mat[s_i, s_j] = np_list_diff[:, 3]

    return signif_adj_mat, p_val_mat, F_stat_mat


def compute_correl_behav(X, reg_interest, uncor_alpha=0.001, cor_alpha=0.05,
                         old_order=False, keep_intracon=False):
    """correlation with behaviour (1D vector)"""
    if old_order:
        X = X.moveaxis(X, 0, 2)

    N = X.shape[1]

    print(reg_interest)
    print(reg_interest.dtype)

    if keep_intracon:
        iter_indexes = it.combinations_with_replacement(list(range(N)), 2)
    else:
        iter_indexes = it.combinations(list(range(N)), 2)

    # number of nodes
    assert X.shape[1] == X.shape[2] and "Error, X {}{} is not squared".format(
        X.shape[1], X.shape[2])

    assert X.shape[0] == reg_interest.shape[0], ("Incompatible number of \
        fields in dataframe and nb matrices")

    list_diff = []

    for i, j in iter_indexes:

        keep_val = (~np.isnan(X[:, i, j])) & (~np.isnan(reg_interest))
        X_nonan = X[keep_val, i, j]
        reg_nonan = reg_interest[keep_val]
        r_stat, p_val = stat.pearsonr(X_nonan, reg_nonan)

        if np.isnan(p_val):

            print("Warning, unable to compute T-test: ")
            print(r_stat, p_val, X_nonan)

        list_diff.append([i, j, p_val, np.sign(r_stat), r_stat])

    assert len(list_diff) != 0, "Error, list_diff is empty"

    np_list_diff = np.array(list_diff)

    signif_code = _return_signif_code(np_list_diff[:, 2],
                                      uncor_alpha=uncor_alpha,
                                      fdr_alpha=cor_alpha,
                                      bon_alpha=cor_alpha)

    np_list_diff[:, 3] *= signif_code

    signif_mat = np.zeros((N, N), dtype='int')
    p_val_mat = np.zeros((N, N), dtype='float')
    r_stat_mat = np.zeros((N, N), dtype='float')

    s_i = np.array(np_list_diff[:, 0], dtype=int)
    s_j = np.array(np_list_diff[:, 1], dtype=int)

    signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int)
    signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int)

    p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2]
    r_stat_mat[s_i, s_j] = r_stat_mat[s_j, s_i] = np_list_diff[:, 4]

    print(r_stat_mat)

    return signif_mat, p_val_mat, r_stat_mat