Source code for naplib.stats.ttest

from copy import deepcopy
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrix



[docs]
def ttest(*args, classes=None, cat_feats={}, con_feats={}, return_ols_result=False):
    '''
    Perform a t-test under the linear model framework, allowing for additional features
    to be controlled. For example, performing a paired t-test between samples
    while controlling for the group identity that each sample comes from.

    If no categorical or continuous features are given, then this function can reproduce
    scipy's `ttest_1samp <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html>`_, 
    `ttest_rel <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html>`_, 
    or `ttest_ind <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html>`_.
    Please see the Examples below for details on this equivalence.  However, if any
    additional features are provided, they are treated as extra predictors in a linear model
    framework, therefore controlling for their impact when estimating the test statistic.

    In the case of neural data, a common use-case for this is when testing a distribution of data points,
    each of which comes from a group, like a subject. For example, we may have 3 subjects, each of which
    has 10 electrodes, and we want to perform a relative t-test between each electrode's value
    during condition A and condition B. This function allows us to perform that test while controlling
    for the impact of subject identity by providing a subject identity categorical feature to the model
    which it uses to aid it its prediction.
 
    Parameters
    ----------
    x : array-like, shape (n_samples,)
        Data to test. All samples may come from a single class, in which case a one sample t-test
        is performed compared to a null hypothesis of mean=0, or from multiple classes if ``classes``
        is provided as well, in which case an independent two-sample t-test is performed.
    y : array-like, shape (n_samples,), optional
        If provided, must be the same shape as *x* and a paired t-test is performed between
        *x* and *y*.
    classes : array-like, shape (n_samples,), optional
        An array of 0's and 1's to separate the two classes to compare in an
        independent two-sample t-test if only *x* is provided without *y*.
    cat_feats : array-like, dict or DataFrame, optional
        Categorical features to control for, given as an array-like or dict-like object (including a
        dict, a pd.DataFrame, etc). If given as dict-like, each key specifies the name of the
        factor to control for, and the value is an array-like of shape (n_samples,) of that factor's values.
        If an array, then each column provides a categorical factor to control for.
        All features are automatically one-hot encoded. If performing an independent 2-sample
        ttest where `classes` were provided, then if this is dict-like, it cannot contain a key called "classes".
    con_feats : array-like, dict or DataFrame, optional
        Continuous features to control for, given as an array-like or dict-like object (including a
        dict, a pd.DataFrame, etc). If given as dict-like, each key specifies the name of the
        factor to control for, and the value is an array-like of shape (n_samples,) of that factor's values.
        If an array, then each column provides a continuous factor to control for. These features
        are converted to floats automatically.
        Key names must not overlap with cat_feats if given as dict-like.
    return_ols_result : bool, default=False
        Whether to return the ols result object as a third item in the tuple.
    
    Returns
    -------
    statistic : float
        T-value statistic.
    pvalue : float
        p-value.
    ols_result : statsmodels.regression.linear_model.RegressionResultsWrapper
        Only returned if ``return_ols_result=True``
    
    Examples
    --------
    >>> from naplib.stats import ttest
    >>> import numpy as np
    >>> rng = np.random.default_rng(10)
    >>> # Imagine we have data that comes from two underlying subjects pooled together
    >>> x1 = rng.normal(size=(10,1))    # subject 1 data
    >>> x2 = rng.normal(size=(10,1))+4  # subject 2 data
    >>> x = np.concatenate([x1,x2])     # pooled data
    >>> x1_group = np.zeros((10,))
    >>> x2_group = np.ones((10,))
    >>> groups = np.concatenate([x1_group,x2_group])    # pooled subject identity vector
    >>> # Also, each data point from each subject comes from one of three behavior types
    >>> behavior = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 0])

    We can perform a 1-sample t-test on all the x values assuming a null
    population mean of 0. This is equivalent to
    scipy.stats.ttest_1samp(x, popmean=0)

    >>> ttest(x)
    (3.9115219870652487, 0.0009378023836864556)
    
    Or we can do the same test but take into account the subject identity
    of the samples.
    
    >>> ttest(x, cat_feats=groups)
    (-0.33532229593097984, 0.7412584236348754)
    
    If x1 and x2 are paired data, we can do a paired relative t-test.
    This is equivalent to scipy.stats.ttest_rel(x2, x1)
    
    >>> ttest(x2, x1)
    (11.052116218441915, 1.5470048033033527e-06)

    Or the same test but take into account the behavior type of each sample.
    
    >>> ttest(x2, x1, cat_feats=behavior)
    (11.761977075053782, 7.272560323998328e-06)
    
    If x1 and x2 are not paired, we can do an independent 2 sample t-test.
    This is equivalent to scipy.stats.ttest_ind(x2, x1)
    
    >>> ttest(x, classes=groups)
    (11.335049583473713, 1.256342249131581e-09)

    Or the same test but take behavior into account. Since the cat_feats parameter
    must have the same number of samples as the input data, we need to concatenate
    the behavior so that we have a behavior indicator for each sample in x.

    >>> behavior_x1x2 = np.concatenate([behavior, behavior], axis=0)
    >>> ttest(x, classes=groups, cat_feats=behavior_x1x2)
    (11.682487719722063, 3.0312825227658624e-09)
    
    '''

    if not isinstance(cat_feats, np.ndarray) and not isinstance(cat_feats, dict) and not isinstance(cat_feats, pd.DataFrame):
        raise TypeError(f'cat_feats must be a np.ndarray, dict, or DataFrame, but got {type(cat_feats)}')
    if not isinstance(con_feats, np.ndarray) and not isinstance(con_feats, dict) and not isinstance(con_feats, pd.DataFrame):
        raise TypeError(f'con_feats must be a np.ndarray, dict, or DataFrame, but got {type(con_feats)}')

    cat_feats_ = deepcopy(cat_feats)
    con_feats_ = deepcopy(con_feats)

    if isinstance(cat_feats_, pd.DataFrame):
        len_cat = len(cat_feats_.columns)
    elif isinstance(cat_feats_, np.ndarray):
        if cat_feats_.ndim == 1:
            len_cat = 1
            cat_feats_ = cat_feats_[:,np.newaxis]
        cat_feats_ = dict(zip([f'cat_{i}' for i in range(len_cat)], cat_feats_.T))
    else: # already a dict
        len_cat = len(cat_feats_)
    
    
    if isinstance(con_feats_, pd.DataFrame):
        len_con = len(con_feats_.columns)
    elif isinstance(con_feats_, np.ndarray):
        if con_feats_.ndim == 1:
            con_feats_ = con_feats_[:,np.newaxis]
        len_con = con_feats_.shape[1]
        con_feats_ = dict(zip([f'con_{i}' for i in range(len_con)], con_feats_.T))
    else: # already a dict
        len_con = len(con_feats_)
        
    test_type = None
        
    if len(args) == 1:
        y = np.asarray(args[0])
        if classes is not None:
            # independent 2-sample ttest
            assert 'classes' not in cat_feats_, 'the key name "classes" cannot be in the control dict'
            if not np.array_equal(sorted(np.unique(np.asarray(classes))), np.array([0,1])):
                raise ValueError(f'classes must be an array of only zeros and ones')
            cat_feats_['classes'] = np.asarray(classes).astype('int')
            test_type = "ind"
        else:
            test_type = "1_samp"
    elif len(args) == 2:
        x, y = np.asarray(args[0]), np.asarray(args[1])
        assert len(x) == len(y), 'x and y must be the same length'
        y = x - y
        test_type = 'rel'
    else:
        raise ValueError(f'Must provide either 1 or 2 positional arguments (x and y), but got {len(args)}')
    
    new_data = {}
    for k in cat_feats_.keys():
        new_data[k] = cat_feats_[k].astype('str')
    for k in con_feats_.keys():
        new_data[k] = con_feats_[k].astype('float')
        
    if len(new_data) < len_con + len_cat:
        raise ValueError(f'cat_feats and con_feats have overlapping key names.')
        
    formula = ""
    for k in new_data.keys():
        formula += k + " + "
    formula = formula [:-2] # remove the last +

    
    if (test_type == 'rel' or test_type == '1_samp') and len(new_data) == 0:
        X = np.ones((len(y),1)) # need a dummy for the intercept
        names = ['Intercept']
    else:
        X = dmatrix(formula, new_data)
        names = X.design_info.column_names

    X_df = pd.DataFrame(dict(zip(names, X.astype('float').T)))
    
    ols = sm.OLS(y, X_df)
    ols_result = ols.fit()
    
    if test_type == 'rel' or test_type == '1_samp':
        tval = ols_result.tvalues['Intercept']
        pval = ols_result.pvalues['Intercept']
    else:
        tval = ols_result.tvalues['classes[T.1]']
        pval = ols_result.pvalues['classes[T.1]']
    
    if return_ols_result:
        return tval, pval, ols_result
    else:
        return tval, pval