[FIX] Fix z statistic computation in computefeats2 #644

Draft · wants to merge 26 commits into base: main
Changes from 22 commits
Commits (26)
904bc01
change stats.py to compute OLS based on Pseudo-Inverse & Compute Z-st…
CesarCaballeroGaudes Nov 25, 2019
8587b79
updating pca.py to same as tedana in upstream
CesarCaballeroGaudes Nov 25, 2019
1617d2a
Changed default compute_zvalues of get_coeffs to F
Nov 25, 2019
2453c78
LIntered and changed name of function get_coeff into compute_least_sq…
Nov 27, 2019
94cc77e
Changed orientation of "beta" matrix in RSS
Nov 27, 2019
b3baa52
Renmaed compute_least_squares into get_ls_coeffs
Nov 27, 2019
487416d
Lintering and Removing part of code that is not needed anymore due to…
Nov 27, 2019
b96c3b2
Improved comments on what's going on in get_ls_coeffs
Nov 27, 2019
7061614
Corrected use of "due" in stats.py
Nov 27, 2019
4820c19
Corrected use of duecredit
Nov 27, 2019
1def734
Merge pull request #1 from smoia/patch-1
CesarCaballeroGaudes Nov 29, 2019
ba3aa26
Merge branch 'OLS_stats_modification' into OLS_stats_modification
CesarCaballeroGaudes Nov 29, 2019
2ec3b3c
Merge pull request #2 from smoia/OLS_stats_modification
CesarCaballeroGaudes Nov 29, 2019
25b5ea3
compute OLS with np.linalg.lstsq instead of np.linalg.pinv
CesarCaballeroGaudes Nov 29, 2019
8b03c2b
Removed unused parameter from computefeats2
Dec 2, 2019
88d049d
Removed unused parameter
Dec 2, 2019
adc2850
Removed unused parameter
Dec 8, 2019
f8dfc8b
Removed unused parameter
Dec 8, 2019
c679600
Added limits to betas and z_scores to avoid breaking code due to INFs.
Dec 11, 2019
2e2c63b
Merge branch 'OLS_stats_modification' of github.com:smoia/tedana into…
Dec 11, 2019
a4d2c35
Merge remote-tracking branch 'upstream/master' into OLS_stats_modific…
Dec 11, 2019
ae582b6
Modified nan_to_num to clip in order to limit arrays, removed limit o…
Dec 11, 2019
cd42798
Merge branch 'master' into OLS_stats_modification
Jan 15, 2021
d8bd355
Align indented line and keep the linter happy.
Jan 15, 2021
4c85723
Merge remote-tracking branch 'origin/main' into OLS_stats_modification
Feb 15, 2021
8cfa3ca
Rename computefeats2 as get_ls_zvalues
Feb 15, 2021
2 changes: 1 addition & 1 deletion docs/api.rst
@@ -177,7 +177,7 @@ API
:toctree: generated/
:template: function.rst

tedana.stats.get_coeffs
tedana.stats.get_ls_coeffs
tedana.stats.computefeats2
tedana.stats.getfbounds

5 changes: 5 additions & 0 deletions tedana/decomposition/pca.py
@@ -312,6 +312,11 @@ def tedpca(data_cat, data_oc, combmode, mask, t2s, t2sG,
comptable['normalized variance explained'] = varex_norm

# write component maps to 4D image
# compute component spatial maps based on regression of the data on the
# component time series. Internally, regression (ordinary least squares)
# is performed after z-normalization of the data and component time series.
# Finally, write the component spatial maps to 4D files, where each spatial
# map is divided by its standard deviation (option normalize=True)
comp_ts_z = stats.zscore(comp_ts, axis=0)
comp_maps = utils.unmask(computefeats2(data_oc, comp_ts_z, mask), mask)
io.filewrite(comp_maps, op.join(out_dir, 'pca_components.nii.gz'), ref_img)
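As a rough, self-contained illustration of the regression described in the comment above (simulated arrays; not the tedana implementation, which additionally converts the fits to z-statistics):

```python
import numpy as np
from scipy import stats

# toy dimensions: S voxels, T time points, C components (illustrative only)
S, T, C = 100, 50, 3
rng = np.random.default_rng(0)
data_oc = rng.standard_normal((S, T))   # optimally combined data (voxels x time)
comp_ts = rng.standard_normal((T, C))   # component time series (time x components)

# z-score the data and the component time series, as described above
data_z = stats.zscore(data_oc, axis=-1)
comp_ts_z = stats.zscore(comp_ts, axis=0)

# ordinary least squares fit of each voxel's time series on the components
betas = np.linalg.lstsq(comp_ts_z, data_z.T, rcond=None)[0].T   # (S x C)

# divide each spatial map by its standard deviation (the normalize=True step)
comp_maps = betas / betas.std(axis=0, keepdims=True)
print(comp_maps.shape)  # (100, 3)
```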
10 changes: 5 additions & 5 deletions tedana/io.py
@@ -13,7 +13,7 @@
from nilearn.image import new_img_like

from tedana import utils
from tedana.stats import computefeats2, get_coeffs
from tedana.stats import computefeats2, get_ls_coeffs

LGR = logging.getLogger(__name__)
RepLGR = logging.getLogger('REPORT')
@@ -47,8 +47,8 @@ def split_ts(data, mmix, mask, comptable):
"""
acc = comptable[comptable.classification == 'accepted'].index.values

cbetas = get_coeffs(data - data.mean(axis=-1, keepdims=True),
mmix, mask)
cbetas = get_ls_coeffs(data - data.mean(axis=-1, keepdims=True),
mmix, mask)
betas = cbetas[mask]
if len(acc) != 0:
hikts = utils.unmask(betas[:, acc].dot(mmix.T[acc, :]), mask)
@@ -104,7 +104,7 @@ def write_split_ts(data, mmix, mask, comptable, ref_img, suffix=''):
dmdata = mdata.T - mdata.T.mean(axis=0)

# get variance explained by retained components
betas = get_coeffs(dmdata.T, mmix, mask=None)
betas = get_ls_coeffs(dmdata.T, mmix, mask=None)
varexpl = (1 - ((dmdata.T - betas.dot(mmix.T))**2.).sum() /
(dmdata**2.).sum()) * 100
LGR.info('Variance explained by ICA decomposition: {:.02f}%'.format(varexpl))
@@ -223,7 +223,7 @@ def writeresults(ts, mask, comptable, mmix, n_vols, ref_img):

write_split_ts(ts, mmix, mask, comptable, ref_img, suffix='OC')

ts_B = get_coeffs(ts, mmix, mask)
ts_B = get_ls_coeffs(ts, mmix, mask)
fout = filewrite(ts_B, 'betas_OC', ref_img)
LGR.info('Writing full ICA coefficient feature set: {}'.format(op.abspath(fout)))

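A small illustrative check of the variance-explained formula shown in this hunk, using simulated arrays (names and shapes are made up for the example, and plain lstsq stands in for get_ls_coeffs):

```python
import numpy as np

# varexpl = (1 - RSS / total sum of squares) * 100
rng = np.random.default_rng(3)
T, C, S = 50, 3, 10
mmix = rng.standard_normal((T, C))     # mixing matrix (time x components)
dmdata = rng.standard_normal((S, T))   # demeaned data (samples x time)

# betas from least squares, samples x components
betas = np.linalg.lstsq(mmix, dmdata.T, rcond=None)[0].T

varexpl = (1 - ((dmdata - betas.dot(mmix.T)) ** 2).sum() /
           (dmdata ** 2).sum()) * 100
print('Variance explained: {:.02f}%'.format(varexpl))
```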
4 changes: 2 additions & 2 deletions tedana/metrics/__init__.py
@@ -2,8 +2,8 @@
# ex: set sts=4 ts=4 sw=4 et:

from .kundu_fit import (
dependence_metrics, kundu_metrics, get_coeffs, computefeats2
dependence_metrics, kundu_metrics, get_ls_coeffs, computefeats2
)

__all__ = [
'dependence_metrics', 'kundu_metrics', 'get_coeffs', 'computefeats2']
'dependence_metrics', 'kundu_metrics', 'get_ls_coeffs', 'computefeats2']
12 changes: 6 additions & 6 deletions tedana/metrics/kundu_fit.py
@@ -9,7 +9,7 @@
from scipy import stats

from tedana import io, utils
from tedana.stats import getfbounds, computefeats2, get_coeffs
from tedana.stats import getfbounds, computefeats2, get_ls_coeffs


LGR = logging.getLogger(__name__)
@@ -104,10 +104,10 @@ def dependence_metrics(catd, tsoc, mmix, t2s, tes, ref_img,
# compute un-normalized weight dataset (features)
if mmixN is None:
mmixN = mmix
WTS = computefeats2(tsoc, mmixN, mask=None, normalize=False)
WTS = computefeats2(tsoc, mmixN, mask=None)

# compute PSC dataset - shouldn't have to refit data
tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None)
tsoc_B = get_ls_coeffs(tsoc_dm, mmix, mask=None)
del tsoc_dm
tsoc_Babs = np.abs(tsoc_B)
PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100
@@ -124,9 +124,9 @@
totvar_norm = (WTS**2).sum()

# compute Betas and means over TEs for TE-dependence analysis
betas = get_coeffs(utils.unmask(catd, mask),
mmix,
np.repeat(mask[:, np.newaxis], len(tes), axis=1))
betas = get_ls_coeffs(utils.unmask(catd, mask),
mmix,
np.repeat(mask[:, np.newaxis], len(tes), axis=1))
betas = betas[mask, ...]
n_voxels, n_echos, n_components = betas.shape
mu = catd.mean(axis=-1, dtype=float)
146 changes: 121 additions & 25 deletions tedana/stats.py
@@ -7,11 +7,70 @@
from scipy import stats

from tedana import utils
from tedana.due import due, BibTeX, Doi

LGR = logging.getLogger(__name__)
RepLGR = logging.getLogger('REPORT')
RefLGR = logging.getLogger('REFERENCES')

T2Z_TRANSFORM = BibTeX("""
@article{hughett2007accurate,
title={Accurate Computation of the F-to-z and t-to-z Transforms
for Large Arguments},
author={Hughett, Paul and others},
journal={Journal of Statistical Software},
volume={23},
number={1},
pages={1--5},
year={2007},
publisher={Foundation for Open Access Statistics}
}
""")
T2Z_IMPLEMENTATION = Doi('10.5281/zenodo.32508')


@due.dcite(T2Z_TRANSFORM,
description='Introduces T-to-Z transform.')
@due.dcite(T2Z_IMPLEMENTATION,
description='Python implementation of T-to-Z transform.')
def t_to_z(t_values, dof):
"""
From Vanessa Sochat's TtoZ package.
Copyright (c) 2015 Vanessa Sochat
MIT Licensed
"""
Review comment on lines +37 to +41 (Member): Just recommending the docstring I use in NiMARE.

Suggested change:
"""
From Vanessa Sochat's TtoZ package.
Copyright (c) 2015 Vanessa Sochat
MIT Licensed
"""
"""Convert t-statistics to z-statistics.
An implementation of [1]_ from Vanessa Sochat's TtoZ package [2]_.
Parameters
----------
t_values : array_like
T-statistics
dof : int
Degrees of freedom
Returns
-------
z_values : array_like
Z-statistics
References
----------
.. [1] Hughett, P. (2007). Accurate Computation of the F-to-z and t-to-z
Transforms for Large Arguments. Journal of Statistical Software,
23(1), 1-5.
.. [2] Sochat, V. (2015, October 21). TtoZ Original Release. Zenodo.
http://doi.org/10.5281/zenodo.32508
"""

Review comment (Member): The actual code from NiMARE might also be good to use since I improved the internal variable names.


# check if t_values is np.array, and convert if required
t_values = np.asanyarray(t_values)

# Select just the nonzero voxels
nonzero = t_values[t_values != 0]

# We will store our results here
z_values = np.zeros(len(nonzero))

# Select values less than or == 0, and greater than zero
c = np.zeros(len(nonzero))
k1 = (nonzero <= c)
k2 = (nonzero > c)
# Subset the data into two sets
t1 = nonzero[k1]
t2 = nonzero[k2]

# Calculate p values for <=0
p_values_t1 = stats.t.cdf(t1, df=dof)
z_values_t1 = stats.norm.ppf(p_values_t1)

# Calculate p values for > 0
p_values_t2 = stats.t.cdf(-t2, df=dof)
z_values_t2 = -stats.norm.ppf(p_values_t2)
z_values[k1] = z_values_t1
z_values[k2] = z_values_t2
# Write new image to file
out = np.zeros(t_values.shape)
out[t_values != 0] = z_values
return out


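For readers unfamiliar with the transform, a minimal scipy sketch of the same two-sided t-to-z mapping implemented above (the t-values and degrees of freedom are made up):

```python
import numpy as np
from scipy import stats

# hypothetical t-statistics and degrees of freedom (illustrative values only)
t_values = np.array([-3.2, -0.5, 0.0, 1.7, 4.1])
dof = 20

# map each t-value to the z-value with the same tail probability,
# handling the two signs separately to preserve numerical precision
z_values = np.zeros_like(t_values)
neg = t_values <= 0
z_values[neg] = stats.norm.ppf(stats.t.cdf(t_values[neg], df=dof))
z_values[~neg] = -stats.norm.ppf(stats.t.cdf(-t_values[~neg], df=dof))
print(np.round(z_values, 3))
```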
def getfbounds(n_echos):
"""
@@ -34,7 +34,7 @@ def getfbounds(n_echos):
return f05, f025, f01


def computefeats2(data, mmix, mask=None, normalize=True):
def computefeats2(data, mmix, mask=None):
smoia marked this conversation as resolved.
"""
Converts `data` to component space using `mmix`
Review comment on lines 97 to 98 (Member):

Suggested change:
"""
Converts `data` to component space using `mmix`
"""Fits mmix to data with OLS and calculates standardized beta values and z-statistics.


@@ -56,12 +56,14 @@
Data in component space
"""
if data.ndim != 2:
raise ValueError('Parameter data should be 2d, not {0}d'.format(data.ndim))
raise ValueError('Parameter data should be 2d, not '
'{0}d'.format(data.ndim))
elif mmix.ndim not in [2]:
raise ValueError('Parameter mmix should be 2d, not '
'{0}d'.format(mmix.ndim))
elif (mask is not None) and (mask.ndim != 1):
raise ValueError('Parameter mask should be 1d, not {0}d'.format(mask.ndim))
raise ValueError('Parameter mask should be 1d, not '
'{0}d'.format(mask.ndim))
elif (mask is not None) and (data.shape[0] != mask.shape[0]):
raise ValueError('First dimensions (number of samples) of data ({0}) '
'and mask ({1}) do not match.'.format(data.shape[0],
@@ -74,28 +74,20 @@
# demean masked data
if mask is not None:
data = data[mask, ...]
# normalize data (minus mean and divide by std)
data_vn = stats.zscore(data, axis=-1)

# get betas of `data`~`mmix` and limit to range [-0.999, 0.999]
data_R = get_coeffs(data_vn, mmix, mask=None)
data_R[data_R < -0.999] = -0.999
data_R[data_R > 0.999] = 0.999

# R-to-Z transform
data_Z = np.arctanh(data_R)
# get betas and z-values of `data`~`mmix`
# mmix is normalized internally
Review comment (Member): I think we should do it here anyway.

Review reply (Collaborator, Author): What if we use add_const=True while calling get_ls_coeffs instead, now that there's the option? I think it would make more sense given how the program runs.

In fact, I would set add_const default as True, given that if the data is not demeaned, you need the intercept, and if it's demeaned, the intercept doesn't bother you (it's 0).

@CesarCaballeroGaudes is there any other difference between normalising before computing LS and adding the intercept in the model?

Review reply (Member): I like including the intercept by default, but isn't variance normalization required to get standardized parameter estimates?

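For context on this thread, a minimal self-contained sketch (simulated data, not tedana code) of why the intercept only matters when the data are not demeaned, and why z-scoring is still needed for standardized betas:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
T, C = 60, 2
X = rng.standard_normal((T, C))                  # regressors (e.g. component time series)
y = 3.0 + X @ np.array([1.5, -0.7]) + 0.1 * rng.standard_normal(T)

# (a) raw data with an explicit intercept column (add_const=True)
X_const = np.column_stack([X, np.ones(T)])
b_const = np.linalg.lstsq(X_const, y, rcond=None)[0][:-1]

# (b) demeaned data and regressors, no intercept: identical slopes
b_dm = np.linalg.lstsq(X - X.mean(axis=0), y - y.mean(), rcond=None)[0]
print(np.allclose(b_const, b_dm))                # True

# (c) standardized betas additionally require z-scoring data and regressors
b_std = np.linalg.lstsq(stats.zscore(X, axis=0), stats.zscore(y), rcond=None)[0]
print(np.round(b_std, 3))
```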
_, data_Z = get_ls_coeffs(data_vn, mmix, mask=None, add_const=False,
compute_zvalues=True)
if data_Z.ndim == 1:
data_Z = np.atleast_2d(data_Z).T

# normalize data
if normalize:
data_Zm = stats.zscore(data_Z, axis=0)
data_Z = data_Zm + (data_Z.mean(axis=0, keepdims=True) /
data_Z.std(axis=0, keepdims=True))

return data_Z


def get_coeffs(data, X, mask=None, add_const=False):
def get_ls_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_df=1):
"""
Performs least-squares fit of `X` against `data`

@@ -109,28 +109,37 @@ def get_coeffs(data, X, mask=None, add_const=False):
Boolean mask array
add_const : bool, optional
Add intercept column to `X` before fitting. Default: False
compute_zvalues : bool, optional
Compute z-values of the betas (predictors)
min_df : integer, optional
Integer to give warning if # df <= min_df

Returns
-------
betas : (S [x E] x C) :obj:`numpy.ndarray`
Array of `S` sample betas for `C` predictors
z_values : (S [x E] x C) :obj:`numpy.ndarray`
Array of `S` sample z-values for `C` predictors

"""
if data.ndim not in [2, 3]:
raise ValueError('Parameter data should be 2d or 3d, not {0}d'.format(data.ndim))
raise ValueError('Parameter data should be 2d or 3d, not '
'{0}d'.format(data.ndim))
elif X.ndim not in [2]:
raise ValueError('Parameter X should be 2d, not {0}d'.format(X.ndim))
elif data.shape[-1] != X.shape[0]:
raise ValueError('Last dimension (dimension {0}) of data ({1}) does not '
'match first dimension of '
'X ({2})'.format(data.ndim, data.shape[-1], X.shape[0]))
raise ValueError('Last dimension (dimension {0}) of data ({1}) does '
'not match first dimension of X '
'({2})'.format(data.ndim, data.shape[-1], X.shape[0]))

# mask data and flip (time x samples)
if mask is not None:
if mask.ndim not in [1, 2]:
raise ValueError('Parameter data should be 1d or 2d, not {0}d'.format(mask.ndim))
raise ValueError('Parameter data should be 1d or 2d, not '
'{0}d'.format(mask.ndim))
elif data.shape[0] != mask.shape[0]:
raise ValueError('First dimensions of data ({0}) and mask ({1}) do not '
'match'.format(data.shape[0], mask.shape[0]))
raise ValueError('First dimensions of data ({0}) and mask ({1}) do'
' not match'.format(data.shape[0], mask.shape[0]))
mdata = data[mask, :].T
else:
mdata = data.T
@@ -144,11 +144,45 @@
if add_const: # add intercept, if specified
X = np.column_stack([X, np.ones((len(X), 1))])

# least squares estimation: beta = (X^T * X)^(-1) * X^T * mdata
# betas is transposed due to backward compatibility with rest of code.
betas = np.linalg.lstsq(X, mdata, rcond=None)[0].T

if compute_zvalues:
# compute t-values of betas (estimates) and then convert to z-values
# first compute number of degrees of freedom
df = mdata.shape[0] - X.shape[1]
if df == 0:
LGR.error('ERROR: No degrees of freedom left in least squares '
'calculation. Stopping!!')
elif df <= min_df:
LGR.warning('Number of degrees of freedom in least-square '
'estimation is less than {}'.format(min_df + 1))
# compute sigma:
# RSS = sum{[mdata - (X * betas)]^2}
# sigma = RSS / Degrees_of_Freedom
sigma = np.sum(np.power(mdata - np.dot(X, betas.T), 2), axis=0) / df
sigma = sigma[:, np.newaxis]
# Copmute std of betas:
Review comment (Member):

Suggested change:
# Copmute std of betas:
# compute std of betas:

# C = (X^T * X)_ii^(-1)
# std(betas) = sqrt(sigma * C)
C = np.diag(np.linalg.pinv(np.dot(X.T, X)))
C = C[:, np.newaxis]
std_betas = np.sqrt(np.dot(sigma, C.T))
z_values = t_to_z(betas / std_betas, df)
z_values = np.clip(z_values, -40, 40)

if add_const: # drop beta for intercept, if specified
betas = betas[:, :-1]
if compute_zvalues:
z_values = z_values[:, :-1]

if mask is not None:
betas = utils.unmask(betas, mask)
if compute_zvalues:
z_values = utils.unmask(z_values, mask)

return betas
if compute_zvalues:
return betas, z_values
else:
return betas
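A quick cross-check of the t-value recipe used in get_ls_coeffs (sigma = RSS / df, std(betas) = sqrt(sigma * C)) against scipy.stats.linregress, using simulated single-regressor data (names are illustrative):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
n = 40
x = rng.standard_normal(n)
y = 0.8 * x + 0.3 + rng.standard_normal(n)

# design matrix with an intercept column, as with add_const=True
X = np.column_stack([x, np.ones(n)])
betas = np.linalg.lstsq(X, y, rcond=None)[0]

# same recipe as above: sigma = RSS / df, std(betas) = sqrt(sigma * C)
df = n - X.shape[1]
sigma = np.sum((y - X @ betas) ** 2) / df
C = np.diag(np.linalg.pinv(X.T @ X))
t_vals = betas / np.sqrt(sigma * C)

# cross-check the slope's t-statistic against scipy's linregress
res = stats.linregress(x, y)
print(np.isclose(t_vals[0], res.slope / res.stderr))  # True
```

The z-values in the function above then follow from t_to_z(betas / std_betas, df), clipped to [-40, 40].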