Source code for naplib.stats.mixedeffectsmodel

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


[docs] class LinearMixedEffectsModel(): ''' A linear mixed effects model which can be used for main effect plots. A mixed effects model is useful when the data has some non-independence whose effect should be accounted for separately from the effects of the independent variables. For example, if half of the data comes from subject A and the other half from subject B, and you want to know the main effects of other predictor features. Parameters ---------- alpha : float, default=0.05 Alpha level for confidence interval. zscore_x : bool, default False Whether or not to zscore the columns of X when calling .fit(). Attributes ---------- varnames : list List of variable names mixedlm : statsmodels.regression.mixed_linear_model.MixedLM object Object which performs mixed linear model fitting mixedlm_results : statsmodels.regression.mixed_linear_model.MixedLMResults object Object which contains outputs from fitting. params : pd.DataFrame Coefficients for each variable. pvalues : pd.DataFrame Pvalues for each param. conf_int : pd.DataFrame Confidence interval around each param. rsquared : np.float R-squared value. ''' def __init__(self, alpha=0.05, zscore_x=False): self._alpha = alpha self._zscore_x = zscore_x self.varnames = [] self._isfit = False
[docs] def fit(self, X, y, random_effect=None, varnames=None): ''' Parameters ---------- X : np.array, shape (num_samples, num_features) y : np.array, shape (num_samples,) random_effect : np.array, shape (num_samples,), optional If given, used as a random effect in the model. For example, to use group identity as a random effect, this should be an array of integers specifying what group each sample belongs to. The values do not matter, only the categorical groups they form. varnames : list, optional, default None List of variable names, must be length (num_features+1) or (num_features + num_features_r + 1), giving the name for each feature in X, each feature in random_effect (if given) as well as a name for the predicted output in y. Returns ------- model : returns an instance of self ''' if random_effect is not None: if random_effect.ndim == 1: random_effect = random_effect[:, np.newaxis] elif random_effect.shape[1] > 1: raise ValueError(f'Only 1 random effect variable is currently supported, but got {random_effect.shape[1]}') if varnames is None: varnames = [f'feature-{i}' for i in range(X.shape[1])] varnames.append('y') if random_effect is None and len(varnames) != X.shape[1] + 1: raise Exception(f'Error: incorrect length of input "varnames". Must include name for each feature in X as well as a name for predicted y.') if random_effect is not None and len(varnames) != X.shape[1] + random_effect.shape[1] + 1: raise Exception(f'Error: incorrect length of input "varnames". Must include name for each feature in X, each feature in random_effect,' f' as well as a name for predicted y.') self.varnames = varnames[:X.shape[1]] + varnames[-1:] # remove spaces and dashes because they break the smf.mixedlm formula varnames_formula = varnames.copy() for bad_char in [' ', '-']: varnames_formula = [v.replace(bad_char, '') for v in varnames_formula] warnings.simplefilter('ignore', ConvergenceWarning) if self._zscore_x: X = (X - X.mean(0)) / X.std(0) self._y = y if random_effect is None: df_tmp = pd.DataFrame(np.concatenate((X, y.reshape(-1,1)), axis=1), columns=varnames_formula) formula = f'{varnames_formula[-1]} ~ {varnames_formula[0]}' for varname in varnames_formula[1:-1]: formula += f' + {varname}' mixedlm = smf.mixedlm(formula=formula, data=df_tmp, groups=[1 for _ in range(df_tmp.shape[0])]) else: df_tmp = pd.DataFrame(np.concatenate((X, random_effect, y.reshape(-1,1)), axis=1), columns=varnames_formula) formula = f'{varnames_formula[-1]} ~ {varnames_formula[0]}' for varname in varnames_formula[1:X.shape[1]]: formula += f' + {varname}' mixedlm = smf.mixedlm(formula=formula, data=df_tmp, groups=varnames[-2], re_formula='1') mixedlm_results = mixedlm.fit() self.mixedlm = mixedlm self.mixedlm_results = mixedlm_results self.params = mixedlm_results.params self.pvalues = mixedlm_results.pvalues self.conf_int = mixedlm_results.conf_int(alpha=self._alpha) self._isfit = True return self
[docs] def get_model_params(self): ''' Get model params after fitting. Returns ------- param_dict : dict dict of model params, pvalues, and confidence intervals for each variable. ''' if not self._isfit: raise Exception('Must call .fit() before trying to acces model params.') return {'params': np.array(self.params)[1:-1], 'pvalues': np.array(self.pvalues)[1:-1], 'conf_int': np.array(self.conf_int)[1:-1,:]}
[docs] def plot_effects(self, ax=None, plus_minus_colors=None, line_alpha=0.6, markersize=8, center_zero=True, print_ylabels=True): ''' Create main effects plot. Parameters ---------- ax : pyplot axes, optional, default=gca() plus_minus_colors : list or np.array, length=2, default red, blue Colors to use for the effect lines when they are significantly positive or negative, respectively. line_alpha : float, default=0.06 alpha value for lines markersize : float, default=8 Size of marker at each effect center_zero : bool, default=True Whether to put 0 in the center of x-axis. print_ylabels : bool, default=True Whether to print y labels ''' if ax is None: ax = plt.gca() if plus_minus_colors is None: plus_minus_colors = np.array([[255., 44, 0], [97., 164, 205]]) / 255 # blue and red model_params = self.get_model_params() for effect_idx in range(len(model_params['params'])): # check if this effect is significant if model_params['params'][effect_idx] > 0 and model_params['conf_int'][effect_idx].min() > 0: color_to_use = plus_minus_colors[0] elif model_params['params'][effect_idx] < 0 and model_params['conf_int'][effect_idx].max() < 0: color_to_use = plus_minus_colors[1] else: color_to_use = [0.5, 0.5, 0.5] ax.plot(model_params['conf_int'][effect_idx,:], effect_idx * np.ones_like(model_params['conf_int'][effect_idx-1,:]), color=color_to_use, alpha=line_alpha) ax.plot(model_params['params'][effect_idx], np.array([effect_idx]), marker="o", markersize=markersize, markeredgecolor=color_to_use, markerfacecolor=color_to_use) ax.set_ylim([-0.5, 1.5]) if center_zero: xabs_max = abs(max(ax.get_xlim(), key=abs)) * 1.1 ax.set_xlim(xmin=-xabs_max, xmax=xabs_max) ylims = ax.get_ylim() ax.set_ylim([-0.5, len(model_params['params'])-0.5]) ylims = ax.get_ylim() ax.vlines(0, ylims[0], ylims[1],'k',linestyles='dashed', linewidth=1, alpha=0.5) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) if print_ylabels: ax.tick_params(top=False, bottom=True, left=False, right=False, labelleft=True, labelbottom=True) else: ax.tick_params(top=False, bottom=True, left=False, right=False, labelleft=False, labelbottom=True) ax.set_yticks([nn for nn in range(len(model_params['params']))]) ax.set_yticklabels(self.varnames[:-1]) return ax
[docs] def summary(self): ''' Print summary of model. ''' print(self.mixedlm_results.summary())
@property def rsquared(self): '''r-squared : float''' rss_tot = np.sum(np.square(self._y-self._y.mean())) rss_res = np.sum(np.square(self.mixedlm_results.resid)) return 1.0 - rss_res/rss_tot