Source code for graphpype.utils_stats

import scipy.stats as stat
import numpy as np
import itertools as it
"""
Functions for computing statistics over symetrical matrices (pairwise)

Input parameters:

All functions accept two (or one for test vs 0) stack of matrices (3D objects).

Old order (old_order = True) refers to a framework where the last dimension was 
referring to the stacking of matrices (10*10*20 for 20 samples of matrices with 
10 nodes) whereas the new order (i.e. old_order = False # noqa

keep_intracon=True means computing the tests of the diagonal as well. It is 
relevant for the test of number of inter-modular edges (in the case the 
diagonal will be the number of edges within a module). For fonctional 
connectivity the inclusion of diagonal is irrelevant. # noqa

Returns:
All the functions returns 3 objects as a tuple:
- a matrix of significance level, with the following code:
0 -> not significant
1 -> uncorrected significant (according to the parameter uncor_alpha, setting 
the uncorrected threshold) # noqa
2 -> FP significant (i.e., passing a threshold of 1/N_tests, given it is more 
stringent than the uncor_alpha) # noqa
    This threshold is mildly accepted in the scientific community but may be 
justified in case of graph connectivity, see Lynall and Bassett, 2013 # noqa
3 -> FDR significant (as set as the fdr_alpha, = cor_alpha in most cases)
4 -> Bonferroni significant (as set as the bon_alpha  = cor_alpha in most 
cases) # noqa
The sign of the code (+1 or -1 give the direction of the significant: +1: X > 
Y, -1: Y > X) # noqa

- a matrix of p-value associated with the statitical test

- the statistics value (T values for t-test, R^2 for correlation, etc)

"""


def _return_signif_code(p_values, uncor_alpha=0.001, fdr_alpha=0.05,
                        bon_alpha=0.05):
    """private function for sorting p-values and computing signif codes"""
    N = p_values.shape[0]

    # by default, code = 1 (cor at 0.05)
    signif_code = np.ones(shape=N)

    # uncor
    # code = 0 for all correlation below uncor_alpha
    signif_code[p_values >= uncor_alpha] = 0

    # FPcor
    if 1.0/N < uncor_alpha:
        signif_code[p_values < 1.0/N] = 2

    # fdr
    seq = np.arange(N, 0, -1)
    seq_fdr_p_values = fdr_alpha/seq
    order = p_values.argsort()
    signif_sorted = p_values[order] < seq_fdr_p_values
    signif_code[order[signif_sorted]] = 3

    # bonferroni
    signif_code[p_values < bon_alpha/N] = 4

    return signif_code


def _return_signif_code_Z(Z_values, uncor_alpha=0.001, fdr_alpha=0.05,
                          bon_alpha=0.05):
    """ private function for sorting Z-score and return signifcode"""
    N = Z_values.shape[0]

    # by default, code = 1 (cor at 0.05)
    signif_code = np.ones(shape=N)

    # uncor
    Z_uncor = stat.norm.ppf(1-uncor_alpha/2)
    signif_code[Z_values < Z_uncor] = 0

    # FPcor
    Z_FPcor = stat.norm.ppf(1-(1.0/(2*N)))
    signif_code[Z_values > Z_FPcor] = 2

    # fdr
    seq = np.arange(N, 0, -1)
    seq_fdr_p_values = fdr_alpha/seq
    seq_Z_val = stat.norm.ppf(1-seq_fdr_p_values/2)
    order = (-Z_values).argsort()  # sorted in reversed order
    sorted_Z_values = Z_values[order]  # sorted in reversed order
    signif_sorted = sorted_Z_values > seq_Z_val
    signif_code[order[signif_sorted]] = 3

    # bonferroni
    Z_bon = stat.norm.ppf(1-bon_alpha/(2*N))
    signif_code[Z_values > Z_bon] = 4

    return signif_code


[docs]def compute_pairwise_ttest_fdr(X, Y, cor_alpha, uncor_alpha, paired=True, old_order=True, keep_intracon=False): """Two-way pairwise T-test stats""" # old order was n_nodes, n_nodes, sample_size # new order is sample_size, n_nodes, n_nodes, # number of nodes if old_order: X = np.moveaxis(X, 2, 0) Y = np.moveaxis(Y, 2, 0) # test squared matrices assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X \ {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2], Y.shape[1], Y.shape[2])) # test same number of nodes between X and Y assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \ same number of nodes".format(X.shape[1], Y.shape[1])) # test if same number of sample (paired t-test only) if paired: assert X.shape[0] == Y.shape[0], ("Error, X and Y are paired but do\ not have the same number od samples{}{}".format(X.shape[0], Y.shape[0])) # nb nodes N = X.shape[1] # tests are also done on the diagonal of the matrix if keep_intracon: iter_indexes = it.combinations_with_replacement(list(range(N)), 2) else: iter_indexes = it.combinations(list(range(N)), 2) # computing signif t-tests for each relevant pair list_diff = [] for i, j in iter_indexes: # removing the nan X_nonan = X[np.logical_not(np.isnan(X[:, i, j])), i, j] Y_nonan = Y[np.logical_not(np.isnan(Y[:, i, j])), i, j] if len(X_nonan) < 2 or len(Y_nonan) < 2: print("Not enough values for sample {} {}, len = {} and {},\ skipping".format(i, j, len(X_nonan), len(Y_nonan))) continue if paired: t_stat, p_val = stat.ttest_rel(X_nonan, Y_nonan) if np.isnan(p_val): print("Warning, unable to compute T-test: ") # TODO pas encore present (version scipy 0.18) # t_stat,p_val = stat.ttest_rel(X[i,j,:],Y[i,j,:], # nan_policy = 'omit') else: t_stat, p_val = stat.ttest_ind(X_nonan, Y_nonan) list_diff.append([i, j, p_val, np.sign( np.mean(X_nonan)-np.mean(Y_nonan)), t_stat]) assert len(list_diff) != 0, "Error, list_diff is empty" np_list_diff = np.array(list_diff) signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha=uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) np_list_diff[:, 3] *= signif_code # formatting signif_mat, p_val_mat and T_stat_mat signif_mat = np.zeros((N, N), dtype='int') p_val_mat = np.zeros((N, N), dtype='float') T_stat_mat = np.zeros((N, N), dtype='float') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int) signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int) p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2] T_stat_mat[s_i, s_j] = T_stat_mat[s_j, s_i] = np_list_diff[:, 4] return signif_mat, p_val_mat, T_stat_mat
[docs]def compute_pairwise_oneway_ttest_fdr(X, cor_alpha, uncor_alpha, old_order=True): """Oneway pairwise T-test stats""" if old_order: X = np.moveaxis(X, 2, 0) # number of nodes assert X.shape[1] == X.shape[2], ("Error, X {}{} and/or Y {}{} are not \ squared".format(X.shape[1], X.shape[2])) N = X.shape[1] list_diff = [] for i, j in it.combinations(list(range(N)), 2): X_nonan = X[np.logical_not(np.isnan(X[:, i, j])), i, j] if len(X_nonan) < 2: print("Not enough values for sample {} {}, len = {}, \ skipping".format(i, j, len(X_nonan))) continue t_stat, p_val = stat.ttest_1samp(X_nonan, 0.0) # 0.0 ? if np.isnan(p_val): print("Warning, unable to compute T-test: ") print(t_stat, p_val, X_nonan) list_diff.append([i, j, p_val, np.sign(np.mean(X_nonan)), t_stat]) print(list_diff) assert len(list_diff) != 0, "Error, list_diff is empty" np_list_diff = np.array(list_diff) print(np_list_diff) signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) np_list_diff[:, 3] *= signif_code signif_mat = np.zeros((N, N), dtype='int') p_val_mat = np.zeros((N, N), dtype='float') T_stat_mat = np.zeros((N, N), dtype='float') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int) signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int) p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2] T_stat_mat[s_i, s_j] = T_stat_mat[s_j, s_i] = np_list_diff[:, 4] print(T_stat_mat) return signif_mat, p_val_mat, T_stat_mat
[docs]def compute_pairwise_mannwhitney_fdr(X, Y, cor_alpha, uncor_alpha=0.01, old_order=True): """compute pairwise Mann Whitney test""" """modified to be compatible with old_order = True (was only developed for old order) + assert""" # TODO : test if OK with moveaxis and 'new order'? # TODO : return parameter of mannwhitneyu (i.e. "U" values)? if old_order: X = np.moveaxis(X, 2, 0) Y = np.moveaxis(Y, 2, 0) # Assert test squared matrices assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X\ {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2], Y.shape[1], Y.shape[2])) # Assert test same number of nodes between X and Y assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \ same number of nodes".format(X.shape[1], Y.shape[1])) # number of nodes N = X.shape[1] # compute pairwise test list_diff = [] for i, j in it.combinations(list(range(N)), 2): # TODO: handles nan correctly?? X_val = X[:, i, j] Y_val = Y[:, i, j] u_stat, p_val = stat.mannwhitneyu(X_val, Y_val, use_continuity=False, alternative="two-sided") sign_diff = np.sign(np.mean(X_val)-np.mean(Y_val)) list_diff.append([i, j, p_val, sign_diff]) np_list_diff = np.array(list_diff) signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha=uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) np_list_diff[:, 3] = np_list_diff[:, 3] * signif_code signif_mat = np.zeros((N, N), dtype='int') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_sign = np.array(np_list_diff[:, 3], dtype=int) signif_mat[s_i, s_j] = signif_mat[s_j, s_i] = signif_sign return signif_mat
def _info_CI(X, Y): """ Compute binomial comparaison""" nX = len(X) * 1. nY = len(Y) * 1. pX = np.sum(X == 1)/nX pY = np.sum(Y == 1)/nY SE = np.sqrt(pX * (1-pX)/nX + pY * (1-pY)/nY) return np.absolute(pX-pY), SE, np.sign(pX-pY) def compute_pairwise_binom_fdr(X, Y, uncor_alpha=0.001, cor_alpha=0.05, old_order=True): """modified to be compatible with old_order = True (was only developed for old order) + assert""" # TODO : test if OK with moveaxis and 'new order'? if old_order: X = np.moveaxis(X, 2, 0) Y = np.moveaxis(Y, 2, 0) # Assert test squared matrices assert X.shape[1] == X.shape[2] and Y.shape[1] == Y.shape[2], ("Error, X\ {} {} and/or Y {} {} are not squared".format(X.shape[1], X.shape[2], Y.shape[1], Y.shape[2])) # assert test same number of nodes between X and Y assert X.shape[1] == Y.shape[1], ("Error, X {} and Y {}do not have the \ same number of nodes".format(X.shape[1], Y.shape[1])) # number of nodes N = X.shape[1] # Perform binomial test at each edge list_diff = [] for i, j in it.combinations(list(range(N)), 2): abs_diff, SE, sign_diff = _info_CI(X[:, i, j], Y[:, i, j]) list_diff.append([i, j, abs_diff/SE, sign_diff]) np_list_diff = np.array(list_diff) signif_code = _return_signif_code_Z(np_list_diff[:, 2], uncor_alpha=uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) np_list_diff[:, 3] = np_list_diff[:, 3] * signif_code signif_mat = np.zeros((N, N), dtype='int') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_sign = np.array(np_list_diff[:, 3], dtype=int) signif_mat[s_i, s_j] = signif_mat[s_j, s_i] = signif_sign return signif_mat def compute_oneway_anova_fwe(list_of_list_matrices, cor_alpha=0.05, uncor_alpha=0.001, keep_intracon=False): """OneWay Anova (F-test)""" # TODO : warning, this is very different than previous functions, # needs tobe checked where it is called assert False, ("Warning, very old function, check your call and report it \ to developer") for group_mat in list_of_list_matrices: assert group_mat.shape[1] == group_mat.shape[2], ("warning, matrices \ are not squared {} {}".format(group_mat.shape[1], group_mat.shape[2])) N = group_mat.shape[2] list_diff = [] if keep_intracon: iter_indexes = it.combinations_with_replacement(list(range(N)), 2) else: iter_indexes = it.combinations(list(range(N)), 2) for i, j in iter_indexes: list_val = [group_mat[:, i, j].tolist() for group_mat in list_of_list_matrices] F_stat, p_val = stat.f_oneway(*list_val) list_diff.append([i, j, p_val, F_stat]) # computing significance code np_list_diff = np.array(list_diff) signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha=uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) signif_code[np.isnan(np_list_diff[:, 2])] = 0 # converting to matrix signif_adj_mat = np.zeros((N, N), dtype='int') p_val_mat = np.zeros((N, N), dtype='float') F_stat_mat = np.zeros((N, N), dtype='float') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_adj_mat[s_i, s_j] = signif_adj_mat[s_j, s_i] = signif_code p_val_mat[s_i, s_j] = p_val_mat[s_i, s_j] = np_list_diff[:, 2] F_stat_mat[s_i, s_j] = F_stat_mat[s_i, s_j] = np_list_diff[:, 3] return signif_adj_mat, p_val_mat, F_stat_mat def compute_correl_behav(X, reg_interest, uncor_alpha=0.001, cor_alpha=0.05, old_order=False, keep_intracon=False): """correlation with behaviour (1D vector)""" if old_order: X = X.moveaxis(X, 0, 2) N = X.shape[1] print(reg_interest) print(reg_interest.dtype) if keep_intracon: iter_indexes = it.combinations_with_replacement(list(range(N)), 2) else: iter_indexes = it.combinations(list(range(N)), 2) # number of nodes assert X.shape[1] == X.shape[2] and "Error, X {}{} is not squared".format( X.shape[1], X.shape[2]) assert X.shape[0] == reg_interest.shape[0], ("Incompatible number of \ fields in dataframe and nb matrices") list_diff = [] for i, j in iter_indexes: keep_val = (~np.isnan(X[:, i, j])) & (~np.isnan(reg_interest)) X_nonan = X[keep_val, i, j] reg_nonan = reg_interest[keep_val] r_stat, p_val = stat.pearsonr(X_nonan, reg_nonan) if np.isnan(p_val): print("Warning, unable to compute T-test: ") print(r_stat, p_val, X_nonan) list_diff.append([i, j, p_val, np.sign(r_stat), r_stat]) assert len(list_diff) != 0, "Error, list_diff is empty" np_list_diff = np.array(list_diff) signif_code = _return_signif_code(np_list_diff[:, 2], uncor_alpha=uncor_alpha, fdr_alpha=cor_alpha, bon_alpha=cor_alpha) np_list_diff[:, 3] *= signif_code signif_mat = np.zeros((N, N), dtype='int') p_val_mat = np.zeros((N, N), dtype='float') r_stat_mat = np.zeros((N, N), dtype='float') s_i = np.array(np_list_diff[:, 0], dtype=int) s_j = np.array(np_list_diff[:, 1], dtype=int) signif_mat[s_i, s_j] = np_list_diff[:, 3].astype(int) signif_mat[s_j, s_i] = np_list_diff[:, 3].astype(int) p_val_mat[s_i, s_j] = p_val_mat[s_j, s_i] = np_list_diff[:, 2] r_stat_mat[s_i, s_j] = r_stat_mat[s_j, s_i] = np_list_diff[:, 4] print(r_stat_mat) return signif_mat, p_val_mat, r_stat_mat