diff --git a/docs/api.rst b/docs/api.rst
index 5d12afa4c..ad9f0e267 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -181,8 +181,8 @@ API
    :toctree: generated/
    :template: function.rst

-   tedana.stats.get_coeffs
-   tedana.stats.computefeats2
+   tedana.stats.get_ls_coeffs
+   tedana.stats.get_ls_zvalues
    tedana.stats.getfbounds

diff --git a/tedana/decomposition/pca.py b/tedana/decomposition/pca.py
index 27ea23af3..aadc874d4 100644
--- a/tedana/decomposition/pca.py
+++ b/tedana/decomposition/pca.py
@@ -12,7 +12,7 @@
 from tedana import metrics, utils, io
 from tedana.decomposition import ma_pca
-from tedana.stats import computefeats2
+from tedana.stats import get_ls_zvalues
 from tedana.selection import kundu_tedpca

 LGR = logging.getLogger(__name__)
@@ -243,8 +243,13 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
     comptable['normalized variance explained'] = varex_norm

     # write component maps to 4D image
+    # Compute component spatial maps by regressing the data on the component
+    # time series. Internally, the regression (ordinary least squares) is
+    # performed after z-normalization of both the data and the component time
+    # series. Finally, write the component spatial maps to a 4D file, in which
+    # each spatial map is divided by its standard deviation.
     comp_ts_z = stats.zscore(comp_ts, axis=0)
-    comp_maps = utils.unmask(computefeats2(data_oc, comp_ts_z, mask), mask)
+    comp_maps = utils.unmask(get_ls_zvalues(data_oc, comp_ts_z, mask), mask)
     io.filewrite(comp_maps, op.join(out_dir, 'pca_components.nii.gz'), ref_img)

     # Select components using decision tree
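Reviewer note: the comment block added above summarizes the whole map computation. Here is a minimal, illustrative numpy sketch of that step, with toy shapes; it is not tedana's implementation, and the real `get_ls_zvalues` additionally converts the fits to z-values via `t_to_z`, as the `tedana/stats.py` hunks further down show.

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n_voxels, n_trs, n_comps = 200, 50, 5
data_oc = rng.standard_normal((n_voxels, n_trs))  # samples x time
comp_ts = rng.standard_normal((n_trs, n_comps))   # time x components
mask = np.ones(n_voxels, dtype=bool)

comp_ts_z = stats.zscore(comp_ts, axis=0)         # as in the hunk above
data_z = stats.zscore(data_oc[mask], axis=-1)     # z-normalize each voxel's series
# least-squares fit of each voxel's time series on the component time series
betas = np.linalg.lstsq(comp_ts_z, data_z.T, rcond=None)[0].T
print(betas.shape)  # (200, 5): one spatial map per component
```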
diff --git a/tedana/io.py b/tedana/io.py
index 8e5e70c5b..1d587fe8f 100644
--- a/tedana/io.py
+++ b/tedana/io.py
@@ -13,7 +13,7 @@
 from nilearn.image import new_img_like

 from tedana import utils
-from tedana.stats import computefeats2, get_coeffs
+from tedana.stats import get_ls_zvalues, get_ls_coeffs

 LGR = logging.getLogger(__name__)
 RepLGR = logging.getLogger('REPORT')
@@ -47,8 +47,8 @@ def split_ts(data, mmix, mask, comptable):
     """
     acc = comptable[comptable.classification == 'accepted'].index.values

-    cbetas = get_coeffs(data - data.mean(axis=-1, keepdims=True),
-                        mmix, mask)
+    cbetas = get_ls_coeffs(data - data.mean(axis=-1, keepdims=True),
+                           mmix, mask)
     betas = cbetas[mask]
     if len(acc) != 0:
         hikts = utils.unmask(betas[:, acc].dot(mmix.T[acc, :]), mask)
@@ -106,7 +106,7 @@ def write_split_ts(data, mmix, mask, comptable, ref_img, out_dir='.', suffix='')
     dmdata = mdata.T - mdata.T.mean(axis=0)

     # get variance explained by retained components
-    betas = get_coeffs(dmdata.T, mmix, mask=None)
+    betas = get_ls_coeffs(dmdata.T, mmix, mask=None)
     varexpl = (1 - ((dmdata.T - betas.dot(mmix.T))**2.).sum() /
                (dmdata**2.).sum()) * 100
     LGR.info('Variance explained by ICA decomposition: {:.02f}%'.format(varexpl))
@@ -169,7 +169,7 @@ def writefeats(data, mmix, mask, ref_img, out_dir='.', suffix=''):
     """
     # write feature versions of components
-    feats = utils.unmask(computefeats2(data, mmix, mask), mask)
+    feats = utils.unmask(get_ls_zvalues(data, mmix, mask), mask)
     fname = filewrite(feats, op.join(out_dir, 'feats_{0}'.format(suffix)), ref_img)
     return fname
@@ -227,7 +227,7 @@ def writeresults(ts, mask, comptable, mmix, n_vols, ref_img, out_dir='.'):
     write_split_ts(ts, mmix, mask, comptable, ref_img, out_dir=out_dir, suffix='OC')

-    ts_B = get_coeffs(ts, mmix, mask)
+    ts_B = get_ls_coeffs(ts, mmix, mask)
     fout = filewrite(ts_B, op.join(out_dir, 'betas_OC'), ref_img)
     LGR.info('Writing full ICA coefficient feature set: {}'.format(op.abspath(fout)))
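The `write_split_ts` hunk keeps the variance-explained formula from the context lines, `varexpl = (1 - RSS / TSS) * 100` on demeaned data. A self-contained toy check of that formula (assumed shapes, not tedana code):

```python
import numpy as np

rng = np.random.default_rng(42)
mmix = rng.standard_normal((100, 3))        # time x components
true_b = rng.standard_normal((500, 3))      # voxels x components
dmdata = true_b.dot(mmix.T)                 # synthetic voxel time series
dmdata += 0.1 * rng.standard_normal(dmdata.shape)

betas = np.linalg.lstsq(mmix, dmdata.T, rcond=None)[0].T
varexpl = (1 - ((dmdata - betas.dot(mmix.T))**2.).sum() /
           (dmdata**2.).sum()) * 100
print('{:.02f}% explained'.format(varexpl))  # close to 100 at this noise level
```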
diff --git a/tedana/metrics/kundu_fit.py b/tedana/metrics/kundu_fit.py
index 62813b655..03b4fa8f6 100644
--- a/tedana/metrics/kundu_fit.py
+++ b/tedana/metrics/kundu_fit.py
@@ -9,7 +9,7 @@
 from scipy import stats

 from tedana import io, utils
-from tedana.stats import getfbounds, computefeats2, get_coeffs
+from tedana.stats import getfbounds, get_ls_zvalues, get_ls_coeffs

 LGR = logging.getLogger(__name__)
@@ -109,10 +109,10 @@ def dependence_metrics(catd, tsoc, mmix, adaptive_mask, tes, ref_img,
     # compute un-normalized weight dataset (features)
     if mmixN is None:
         mmixN = mmix
-    WTS = computefeats2(tsoc, mmixN, mask=None, normalize=False)
+    WTS = get_ls_zvalues(tsoc, mmixN, mask=None)

     # compute PSC dataset - shouldn't have to refit data
-    tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None, add_const=False)
+    tsoc_B = get_ls_coeffs(tsoc_dm, mmix, mask=None, add_const=False)
     del tsoc_dm
     tsoc_Babs = np.abs(tsoc_B)
     PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100
@@ -128,10 +128,10 @@
     totvar_norm = (WTS**2).sum()

     # compute Betas and means over TEs for TE-dependence analysis
-    betas = get_coeffs(utils.unmask(catd, mask),
-                       mmix_corrected,
-                       np.repeat(mask[:, np.newaxis], len(tes), axis=1),
-                       add_const=True)
+    betas = get_ls_coeffs(utils.unmask(catd, mask),
+                          mmix_corrected,
+                          np.repeat(mask[:, np.newaxis], len(tes), axis=1),
+                          add_const=True)
     betas = betas[mask, ...]
     n_voxels, n_echos, n_components = betas.shape
     mu = catd.mean(axis=-1, dtype=float)
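For the PSC line kept in context above: component betas are fit on demeaned data and rescaled by each voxel's mean signal to give percent signal change. A toy version with assumed shapes (not the tedana API):

```python
import numpy as np

rng = np.random.default_rng(0)
tsoc = 1000 + rng.standard_normal((4, 50))  # optimally combined data: voxels x time
mmix = rng.standard_normal((50, 2))         # mixing matrix: time x components
tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)

tsoc_B = np.linalg.lstsq(mmix, tsoc_dm.T, rcond=None)[0].T
PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100
print(PSC.shape)  # (4, 2): percent signal change per voxel and component
```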
diff --git a/tedana/reporting/static_figures.py b/tedana/reporting/static_figures.py
index fb6a0e44b..19649c2c8 100644
--- a/tedana/reporting/static_figures.py
+++ b/tedana/reporting/static_figures.py
@@ -71,7 +71,7 @@ def comp_figures(ts, mask, comptable, mmix, ref_img, out_dir, png_cmap):
     n_vols = len(mmix)

     # regenerate the beta images
-    ts_B = stats.get_coeffs(ts, mmix, mask)
+    ts_B = stats.get_ls_coeffs(ts, mmix, mask)
     ts_B = ts_B.reshape(ref_img.shape[:3] + ts_B.shape[1:])
     # trim edges from ts_B array
     ts_B = _trim_edge_zeros(ts_B)
diff --git a/tedana/stats.py b/tedana/stats.py
index cea7846bd..ad1b46327 100644
--- a/tedana/stats.py
+++ b/tedana/stats.py
@@ -7,11 +7,70 @@
 from scipy import stats

 from tedana import utils
+from tedana.due import due, BibTeX, Doi

 LGR = logging.getLogger(__name__)
 RepLGR = logging.getLogger('REPORT')
 RefLGR = logging.getLogger('REFERENCES')

+T2Z_TRANSFORM = BibTeX("""
+    @article{hughett2007accurate,
+      title={Accurate Computation of the F-to-z and t-to-z Transforms
+             for Large Arguments},
+      author={Hughett, Paul and others},
+      journal={Journal of Statistical Software},
+      volume={23},
+      number={1},
+      pages={1--5},
+      year={2007},
+      publisher={Foundation for Open Access Statistics}
+    }
+    """)
+T2Z_IMPLEMENTATION = Doi('10.5281/zenodo.32508')
+
+
+@due.dcite(T2Z_TRANSFORM,
+           description='Introduces T-to-Z transform.')
+@due.dcite(T2Z_IMPLEMENTATION,
+           description='Python implementation of T-to-Z transform.')
+def t_to_z(t_values, dof):
+    """
+    From Vanessa Sochat's TtoZ package.
+    Copyright (c) 2015 Vanessa Sochat
+    MIT Licensed
+    """
+    # check if t_values is np.array, and convert if required
+    t_values = np.asanyarray(t_values)
+
+    # Select just the nonzero voxels
+    nonzero = t_values[t_values != 0]
+
+    # We will store our results here
+    z_values = np.zeros(len(nonzero))
+
+    # Select values less than or == 0, and greater than zero
+    c = np.zeros(len(nonzero))
+    k1 = (nonzero <= c)
+    k2 = (nonzero > c)
+    # Subset the data into two sets
+    t1 = nonzero[k1]
+    t2 = nonzero[k2]
+
+    # Calculate p values for <=0
+    p_values_t1 = stats.t.cdf(t1, df=dof)
+    z_values_t1 = stats.norm.ppf(p_values_t1)
+
+    # Calculate p values for > 0
+    p_values_t2 = stats.t.cdf(-t2, df=dof)
+    z_values_t2 = -stats.norm.ppf(p_values_t2)
+    z_values[k1] = z_values_t1
+    z_values[k2] = z_values_t2
+    # Map the z-values back into an array of the original shape
+    out = np.zeros(t_values.shape)
+    out[t_values != 0] = z_values
+    return out
+

 def getfbounds(n_echos):
     """
@@ -34,7 +93,7 @@
     return f05, f025, f01


-def computefeats2(data, mmix, mask=None, normalize=True):
+def get_ls_zvalues(data, mmix, mask=None):
     """
     Converts `data` to component space using `mmix`
@@ -56,12 +115,14 @@
         Data in component space
     """
     if data.ndim != 2:
-        raise ValueError('Parameter data should be 2d, not {0}d'.format(data.ndim))
+        raise ValueError('Parameter data should be 2d, not '
+                         '{0}d'.format(data.ndim))
     elif mmix.ndim not in [2]:
         raise ValueError('Parameter mmix should be 2d, not '
                          '{0}d'.format(mmix.ndim))
     elif (mask is not None) and (mask.ndim != 1):
-        raise ValueError('Parameter mask should be 1d, not {0}d'.format(mask.ndim))
+        raise ValueError('Parameter mask should be 1d, not '
+                         '{0}d'.format(mask.ndim))
     elif (mask is not None) and (data.shape[0] != mask.shape[0]):
         raise ValueError('First dimensions (number of samples) of data ({0}) '
                          'and mask ({1}) do not match.'.format(data.shape[0],
                                                                mask.shape[0]))
@@ -74,34 +135,20 @@
     # demean masked data
     if mask is not None:
         data = data[mask, ...]

-    # normalize data (subtract mean and divide by standard deviation) in the last dimension
-    # so that least-squares estimates represent "approximate" correlation values (data_R)
-    # assuming mixing matrix (mmix) values are also normalized
+    # normalize data (subtract mean and divide by standard deviation)
    data_vn = stats.zscore(data, axis=-1)

-    # get betas of `data`~`mmix` and limit to range [-0.999, 0.999]
-    data_R = get_coeffs(data_vn, mmix, mask=None)
-    # Avoid abs(data_R) => 1, otherwise Fisher's transform will return Inf or -Inf
-    data_R[data_R < -0.999] = -0.999
-    data_R[data_R > 0.999] = 0.999
-
-    # R-to-Z transform
-    data_Z = np.arctanh(data_R)
+    # get betas and z-values of `data` ~ `mmix`;
+    # mmix is normalized internally
+    _, data_Z = get_ls_coeffs(data_vn, mmix, mask=None, add_const=False,
+                              compute_zvalues=True)
     if data_Z.ndim == 1:
         data_Z = np.atleast_2d(data_Z).T

-    # normalize data (only division by std)
-    if normalize:
-        # subtract mean and dividing by standard deviation
-        data_Zm = stats.zscore(data_Z, axis=0)
-        # adding back the mean
-        data_Z = data_Zm + (data_Z.mean(axis=0, keepdims=True) /
-                            data_Z.std(axis=0, keepdims=True))
-
     return data_Z


-def get_coeffs(data, X, mask=None, add_const=False):
+def get_ls_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_df=1):
     """
     Performs least-squares fit of `X` against `data`
@@ -115,28 +162,37 @@
         Boolean mask array
     add_const : bool, optional
         Add intercept column to `X` before fitting. Default: False
+    compute_zvalues : bool, optional
+        Whether to also compute z-values of the betas. Default: False
+    min_df : int, optional
+        Warn if the residual degrees of freedom are <= min_df. Default: 1

     Returns
     -------
     betas : (S [x E] x C) :obj:`numpy.ndarray`
         Array of `S` sample betas for `C` predictors
+    z_values : (S [x E] x C) :obj:`numpy.ndarray`
+        Array of `S` sample z-values for `C` predictors (only returned if
+        `compute_zvalues` is True)
     """
     if data.ndim not in [2, 3]:
-        raise ValueError('Parameter data should be 2d or 3d, not {0}d'.format(data.ndim))
+        raise ValueError('Parameter data should be 2d or 3d, not '
+                         '{0}d'.format(data.ndim))
     elif X.ndim not in [2]:
         raise ValueError('Parameter X should be 2d, not {0}d'.format(X.ndim))
     elif data.shape[-1] != X.shape[0]:
-        raise ValueError('Last dimension (dimension {0}) of data ({1}) does not '
-                         'match first dimension of '
-                         'X ({2})'.format(data.ndim, data.shape[-1], X.shape[0]))
+        raise ValueError('Last dimension (dimension {0}) of data ({1}) does '
+                         'not match first dimension of X '
+                         '({2})'.format(data.ndim, data.shape[-1], X.shape[0]))

     # mask data and flip (time x samples)
     if mask is not None:
         if mask.ndim not in [1, 2]:
-            raise ValueError('Parameter data should be 1d or 2d, not {0}d'.format(mask.ndim))
+            raise ValueError('Parameter mask should be 1d or 2d, not '
+                             '{0}d'.format(mask.ndim))
         elif data.shape[0] != mask.shape[0]:
-            raise ValueError('First dimensions of data ({0}) and mask ({1}) do not '
-                             'match'.format(data.shape[0], mask.shape[0]))
+            raise ValueError('First dimensions of data ({0}) and mask ({1}) do'
+                             ' not match'.format(data.shape[0], mask.shape[0]))
         mdata = data[mask, :].T
     else:
         mdata = data.T
@@ -150,11 +206,45 @@
     if add_const:  # add intercept, if specified
         X = np.column_stack([X, np.ones((len(X), 1))])

+    # least-squares estimation: betas = (X^T * X)^(-1) * X^T * mdata
+    # betas is transposed for backward compatibility with the rest of the code
     betas = np.linalg.lstsq(X, mdata, rcond=None)[0].T
+
+    if compute_zvalues:
+        # compute t-values of betas (estimates) and then convert to z-values
+        # first compute the number of residual degrees of freedom
+        df = mdata.shape[0] - X.shape[1]
+        if df == 0:
+            LGR.error('No degrees of freedom left in least-squares '
+                      'calculation.')
+        elif df <= min_df:
+            LGR.warning('Number of degrees of freedom in least-squares '
+                        'estimation is less than {}'.format(min_df + 1))
+        # compute sigma:
+        # RSS = sum{[mdata - (X * betas)]^2}
+        # sigma = RSS / degrees_of_freedom
+        sigma = np.sum(np.power(mdata - np.dot(X, betas.T), 2), axis=0) / df
+        sigma = sigma[:, np.newaxis]
+        # compute std of betas:
+        # C = diag((X^T * X)^(-1))
+        # std(betas) = sqrt(sigma * C)
+        C = np.diag(np.linalg.pinv(np.dot(X.T, X)))
+        C = C[:, np.newaxis]
+        std_betas = np.sqrt(np.dot(sigma, C.T))
+        z_values = t_to_z(betas / std_betas, df)
+        z_values = np.clip(z_values, -40, 40)
+
     if add_const:  # drop beta for intercept, if specified
         betas = betas[:, :-1]
+        if compute_zvalues:
+            z_values = z_values[:, :-1]

     if mask is not None:
         betas = utils.unmask(betas, mask)
+        if compute_zvalues:
+            z_values = utils.unmask(z_values, mask)

-    return betas
+    if compute_zvalues:
+        return betas, z_values
+    else:
+        return betas
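The arithmetic added to `get_ls_coeffs` is standard OLS inference. For review, here is a compact standalone version of the same steps on toy data (illustrative only, not the tedana API; the `t_to_z` conversion is inlined via its tail-probability definition):

```python
import numpy as np
from scipy import stats

# OLS betas, their standard errors, t = beta / SE, then t -> z
rng = np.random.default_rng(1)
n_time, n_comps, n_voxels = 60, 3, 10
X = rng.standard_normal((n_time, n_comps))          # design / mixing matrix
mdata = rng.standard_normal((n_time, n_voxels))     # time x samples

betas = np.linalg.lstsq(X, mdata, rcond=None)[0].T  # samples x predictors
df = mdata.shape[0] - X.shape[1]                    # residual degrees of freedom
sigma = np.sum((mdata - X.dot(betas.T))**2, axis=0) / df
C = np.diag(np.linalg.pinv(X.T.dot(X)))             # diag((X^T X)^(-1))
std_betas = np.sqrt(sigma[:, None] * C[None, :])
t_values = betas / std_betas

# what t_to_z does, in one line: match the t tail probability to a normal
z_values = np.sign(t_values) * -stats.norm.ppf(stats.t.sf(np.abs(t_values), df))
z_values = np.clip(z_values, -40, 40)               # as in the patch
```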
""" n_samples, n_vols, n_comps = 10000, 100, 50 @@ -22,43 +22,43 @@ def test_break_computefeats2(): data = np.empty((n_samples)) with pytest.raises(ValueError): - computefeats2(data, mmix, mask, normalize=True) + get_ls_zvalues(data, mmix, mask) data = np.empty((n_samples, n_vols)) mmix = np.empty((n_vols)) with pytest.raises(ValueError): - computefeats2(data, mmix, mask, normalize=True) + get_ls_zvalues(data, mmix, mask) mmix = np.empty((n_vols, n_comps)) mask = np.empty((n_samples, n_vols)) with pytest.raises(ValueError): - computefeats2(data, mmix, mask, normalize=True) + get_ls_zvalues(data, mmix, mask) mask = np.empty((n_samples + 1)) with pytest.raises(ValueError): - computefeats2(data, mmix, mask, normalize=True) + get_ls_zvalues(data, mmix, mask) data.shape[1] != mmix.shape[0] mask = np.empty((n_samples)) mmix = np.empty((n_vols + 1, n_comps)) with pytest.raises(ValueError): - computefeats2(data, mmix, mask, normalize=True) + get_ls_zvalues(data, mmix, mask) -def test_smoke_computefeats2(): +def test_smoke_get_ls_zvalues(): """ - Ensures that computefeats2 works with random inputs and different optional parameters + Ensures that get_ls_zvalues works with random inputs and different optional parameters """ n_samples, n_times, n_components = 100, 20, 6 data = np.random.random((n_samples, n_times)) mmix = np.random.random((n_times, n_components)) mask = np.random.randint(2, size=n_samples) - assert computefeats2(data, mmix) is not None - assert computefeats2(data, mmix, mask=mask) is not None - assert computefeats2(data, mmix, normalize=False) is not None + assert get_ls_zvalues(data, mmix) is not None + assert get_ls_zvalues(data, mmix, mask=mask) is not None + assert get_ls_zvalues(data, mmix) is not None -def test_get_coeffs(): +def test_get_ls_coeffs(): """ Check least squares coefficients. """ @@ -69,26 +69,26 @@ def test_get_coeffs(): X = np.arange(0, 40)[:, np.newaxis] mask = np.array([True, False]) - betas = get_coeffs(data, X, mask=None, add_const=False) + betas = get_ls_coeffs(data, X, mask=None, add_const=False) betas = np.squeeze(betas) assert np.allclose(betas, np.array([5., 5.])) - betas = get_coeffs(data, X, mask=None, add_const=True) + betas = get_ls_coeffs(data, X, mask=None, add_const=True) betas = np.squeeze(betas) assert np.allclose(betas, np.array([5., 5.])) - betas = get_coeffs(data, X, mask=mask, add_const=False) + betas = get_ls_coeffs(data, X, mask=mask, add_const=False) betas = np.squeeze(betas) assert np.allclose(betas, np.array([5, 0])) - betas = get_coeffs(data, X, mask=mask, add_const=True) + betas = get_ls_coeffs(data, X, mask=mask, add_const=True) betas = np.squeeze(betas) assert np.allclose(betas, np.array([5, 0])) -def test_break_get_coeffs(): +def test_break_get_ls_coeffs(): """ - Ensure that get_coeffs fails when input data do not have the right + Ensure that get_ls_coeffs fails when input data do not have the right shapes. 
""" n_samples, n_echos, n_vols, n_comps = 10000, 5, 100, 50 @@ -98,41 +98,41 @@ def test_break_get_coeffs(): data = np.empty((n_samples)) with pytest.raises(ValueError): - get_coeffs(data, X, mask, add_const=False) + get_ls_coeffs(data, X, mask, add_const=False) data = np.empty((n_samples, n_vols)) X = np.empty((n_vols)) with pytest.raises(ValueError): - get_coeffs(data, X, mask, add_const=False) + get_ls_coeffs(data, X, mask, add_const=False) data = np.empty((n_samples, n_echos, n_vols + 1)) X = np.empty((n_vols, n_comps)) with pytest.raises(ValueError): - get_coeffs(data, X, mask, add_const=False) + get_ls_coeffs(data, X, mask, add_const=False) data = np.empty((n_samples, n_echos, n_vols)) mask = np.empty((n_samples, n_echos, n_vols)) with pytest.raises(ValueError): - get_coeffs(data, X, mask, add_const=False) + get_ls_coeffs(data, X, mask, add_const=False) mask = np.empty((n_samples + 1, n_echos)) with pytest.raises(ValueError): - get_coeffs(data, X, mask, add_const=False) + get_ls_coeffs(data, X, mask, add_const=False) -def test_smoke_get_coeffs(): +def test_smoke_get_ls_coeffs(): """ - Ensure that get_coeffs returns outputs with different inputs and optional paramters + Ensure that get_ls_coeffs returns outputs with different inputs and optional paramters """ n_samples, _, n_times, n_components = 100, 5, 20, 6 data_2d = np.random.random((n_samples, n_times)) x = np.random.random((n_times, n_components)) mask = np.random.randint(2, size=n_samples) - assert get_coeffs(data_2d, x) is not None - # assert get_coeffs(data_3d, x) is not None TODO: submit an issue for the bug - assert get_coeffs(data_2d, x, mask=mask) is not None - assert get_coeffs(data_2d, x, add_const=True) is not None + assert get_ls_coeffs(data_2d, x) is not None + # assert get_ls_coeffs(data_3d, x) is not None TODO: submit an issue for the bug + assert get_ls_coeffs(data_2d, x, mask=mask) is not None + assert get_ls_coeffs(data_2d, x, add_const=True) is not None def test_getfbounds(): diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py index 0b4d7c15f..61af4628f 100644 --- a/tedana/workflows/tedana.py +++ b/tedana/workflows/tedana.py @@ -19,7 +19,7 @@ from tedana import (decay, combine, decomposition, io, metrics, reporting, selection, utils) import tedana.gscontrol as gsc -from tedana.stats import computefeats2 +from tedana.stats import get_ls_zvalues from tedana.workflows.parser_utils import is_valid_file, check_tedpca_value, ContextFilter LGR = logging.getLogger(__name__) @@ -558,7 +558,7 @@ def tedana_workflow(data, tes, out_dir='.', mask=None, for comp in comptable.index.values] mixing_df = pd.DataFrame(data=mmix, columns=comp_names) mixing_df.to_csv(op.join(out_dir, 'ica_mixing.tsv'), sep='\t', index=False) - betas_oc = utils.unmask(computefeats2(data_oc, mmix, mask), mask) + betas_oc = utils.unmask(get_ls_zvalues(data_oc, mmix, mask), mask) io.filewrite(betas_oc, op.join(out_dir, 'ica_components.nii.gz'), ref_img) @@ -578,7 +578,7 @@ def tedana_workflow(data, tes, out_dir='.', mask=None, comptable = io.load_comptable(ctab) if manacc is not None: comptable = selection.manual_selection(comptable, acc=manacc) - betas_oc = utils.unmask(computefeats2(data_oc, mmix, mask), mask) + betas_oc = utils.unmask(get_ls_zvalues(data_oc, mmix, mask), mask) io.filewrite(betas_oc, op.join(out_dir, 'ica_components.nii.gz'), ref_img)