-
Notifications
You must be signed in to change notification settings - Fork 95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FIX] Fix z statistic computation in computefeats2 #644
base: main
Are you sure you want to change the base?
Changes from 22 commits
904bc01
8587b79
1617d2a
2453c78
94cc77e
b3baa52
487416d
b96c3b2
7061614
4820c19
1def734
ba3aa26
2ec3b3c
25b5ea3
8b03c2b
88d049d
adc2850
f8dfc8b
c679600
2e2c63b
a4d2c35
ae582b6
cd42798
d8bd355
4c85723
8cfa3ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -7,11 +7,70 @@ | |||||||
from scipy import stats | ||||||||
|
||||||||
from tedana import utils | ||||||||
from tedana.due import due, BibTeX, Doi | ||||||||
|
||||||||
LGR = logging.getLogger(__name__) | ||||||||
RepLGR = logging.getLogger('REPORT') | ||||||||
RefLGR = logging.getLogger('REFERENCES') | ||||||||
|
||||||||
T2Z_TRANSFORM = BibTeX(""" | ||||||||
@article{hughett2007accurate, | ||||||||
title={Accurate Computation of the F-to-z and t-to-z Transforms | ||||||||
for Large Arguments}, | ||||||||
author={Hughett, Paul and others}, | ||||||||
journal={Journal of Statistical Software}, | ||||||||
volume={23}, | ||||||||
number={1}, | ||||||||
pages={1--5}, | ||||||||
year={2007}, | ||||||||
publisher={Foundation for Open Access Statistics} | ||||||||
} | ||||||||
""") | ||||||||
T2Z_IMPLEMENTATION = Doi('10.5281/zenodo.32508') | ||||||||
|
||||||||
|
||||||||
@due.dcite(T2Z_TRANSFORM, | ||||||||
description='Introduces T-to-Z transform.') | ||||||||
@due.dcite(T2Z_IMPLEMENTATION, | ||||||||
description='Python implementation of T-to-Z transform.') | ||||||||
def t_to_z(t_values, dof):
    """
    Convert t-statistics to z-statistics for a given number of degrees
    of freedom, transforming each tail separately for numerical stability.

    From Vanessa Sochat's TtoZ package.
    Copyright (c) 2015 Vanessa Sochat
    MIT Licensed
    """
    # Accept any array-like input (lists, tuples, ndarrays).
    t_values = np.asanyarray(t_values)

    # Only transform the nonzero entries; zero t-values map to zero z-values
    # and are filled in at the end.
    nonzero_mask = (t_values != 0)
    t_nonzero = t_values[nonzero_mask]

    # Partition into the non-positive and positive tails so that the normal
    # quantile function is always evaluated on the lower (small-p) tail.
    is_nonpos = (t_nonzero <= 0)
    is_pos = (t_nonzero > 0)

    z_nonzero = np.zeros(len(t_nonzero))

    # t <= 0: map through the lower-tail CDF directly.
    p_nonpos = stats.t.cdf(t_nonzero[is_nonpos], df=dof)
    z_nonzero[is_nonpos] = stats.norm.ppf(p_nonpos)

    # t > 0: reflect onto the lower tail, transform, then flip the sign back.
    p_pos = stats.t.cdf(-t_nonzero[is_pos], df=dof)
    z_nonzero[is_pos] = -stats.norm.ppf(p_pos)

    # Scatter the transformed values back into an array shaped like the input.
    out = np.zeros(t_values.shape)
    out[nonzero_mask] = z_nonzero
    return out
|
||||||||
|
||||||||
def getfbounds(n_echos): | ||||||||
""" | ||||||||
|
@@ -34,7 +93,7 @@ def getfbounds(n_echos): | |||||||
return f05, f025, f01 | ||||||||
|
||||||||
|
||||||||
def computefeats2(data, mmix, mask=None, normalize=True): | ||||||||
def computefeats2(data, mmix, mask=None): | ||||||||
smoia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
""" | ||||||||
Converts `data` to component space using `mmix` | ||||||||
Comment on lines
97
to
98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
|
@@ -56,12 +115,14 @@ def computefeats2(data, mmix, mask=None, normalize=True): | |||||||
Data in component space | ||||||||
""" | ||||||||
if data.ndim != 2: | ||||||||
raise ValueError('Parameter data should be 2d, not {0}d'.format(data.ndim)) | ||||||||
raise ValueError('Parameter data should be 2d, not ' | ||||||||
'{0}d'.format(data.ndim)) | ||||||||
elif mmix.ndim not in [2]: | ||||||||
raise ValueError('Parameter mmix should be 2d, not ' | ||||||||
'{0}d'.format(mmix.ndim)) | ||||||||
elif (mask is not None) and (mask.ndim != 1): | ||||||||
raise ValueError('Parameter mask should be 1d, not {0}d'.format(mask.ndim)) | ||||||||
raise ValueError('Parameter mask should be 1d, not ' | ||||||||
'{0}d'.format(mask.ndim)) | ||||||||
elif (mask is not None) and (data.shape[0] != mask.shape[0]): | ||||||||
raise ValueError('First dimensions (number of samples) of data ({0}) ' | ||||||||
'and mask ({1}) do not match.'.format(data.shape[0], | ||||||||
|
@@ -74,28 +135,20 @@ def computefeats2(data, mmix, mask=None, normalize=True): | |||||||
# demean masked data | ||||||||
if mask is not None: | ||||||||
data = data[mask, ...] | ||||||||
# normalize data (minus mean and divide by std) | ||||||||
data_vn = stats.zscore(data, axis=-1) | ||||||||
|
||||||||
# get betas of `data`~`mmix` and limit to range [-0.999, 0.999] | ||||||||
data_R = get_coeffs(data_vn, mmix, mask=None) | ||||||||
data_R[data_R < -0.999] = -0.999 | ||||||||
data_R[data_R > 0.999] = 0.999 | ||||||||
|
||||||||
# R-to-Z transform | ||||||||
data_Z = np.arctanh(data_R) | ||||||||
# get betas and z-values of `data`~`mmix` | ||||||||
# mmix is normalized internally | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should do it here anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if we use In fact, I would set add_const default as True, given that if the data is not demeaned, you need the intercept, and if it's demeaned, the intercept doesn't bother you (it's 0). @CesarCaballeroGaudes is there any other difference between normalising before computing LS and adding the intercept in the model? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like including the intercept by default, but isn't variance normalization required to get standardized parameter estimates? |
||||||||
_, data_Z = get_ls_coeffs(data_vn, mmix, mask=None, add_const=False, | ||||||||
compute_zvalues=True) | ||||||||
if data_Z.ndim == 1: | ||||||||
data_Z = np.atleast_2d(data_Z).T | ||||||||
|
||||||||
# normalize data | ||||||||
if normalize: | ||||||||
data_Zm = stats.zscore(data_Z, axis=0) | ||||||||
data_Z = data_Zm + (data_Z.mean(axis=0, keepdims=True) / | ||||||||
data_Z.std(axis=0, keepdims=True)) | ||||||||
|
||||||||
return data_Z | ||||||||
|
||||||||
|
||||||||
def get_coeffs(data, X, mask=None, add_const=False): | ||||||||
def get_ls_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_df=1): | ||||||||
""" | ||||||||
Performs least-squares fit of `X` against `data` | ||||||||
|
||||||||
|
@@ -109,28 +162,37 @@ def get_coeffs(data, X, mask=None, add_const=False): | |||||||
Boolean mask array | ||||||||
add_const : bool, optional | ||||||||
Add intercept column to `X` before fitting. Default: False | ||||||||
compute_zvalues : bool, optional | ||||||||
Compute z-values of the betas (predictors) | ||||||||
min_df : integer, optional | ||||||||
Integer to give warning if # df <= min_df | ||||||||
|
||||||||
Returns | ||||||||
------- | ||||||||
betas : (S [x E] x C) :obj:`numpy.ndarray` | ||||||||
Array of `S` sample betas for `C` predictors | ||||||||
z_values : (S [x E] x C) :obj:`numpy.ndarray` | ||||||||
Array of `S` sample z-values for `C` predictors | ||||||||
|
||||||||
""" | ||||||||
if data.ndim not in [2, 3]: | ||||||||
raise ValueError('Parameter data should be 2d or 3d, not {0}d'.format(data.ndim)) | ||||||||
raise ValueError('Parameter data should be 2d or 3d, not ' | ||||||||
'{0}d'.format(data.ndim)) | ||||||||
elif X.ndim not in [2]: | ||||||||
raise ValueError('Parameter X should be 2d, not {0}d'.format(X.ndim)) | ||||||||
elif data.shape[-1] != X.shape[0]: | ||||||||
raise ValueError('Last dimension (dimension {0}) of data ({1}) does not ' | ||||||||
'match first dimension of ' | ||||||||
'X ({2})'.format(data.ndim, data.shape[-1], X.shape[0])) | ||||||||
raise ValueError('Last dimension (dimension {0}) of data ({1}) does ' | ||||||||
'not match first dimension of X ' | ||||||||
'({2})'.format(data.ndim, data.shape[-1], X.shape[0])) | ||||||||
|
||||||||
# mask data and flip (time x samples) | ||||||||
if mask is not None: | ||||||||
if mask.ndim not in [1, 2]: | ||||||||
raise ValueError('Parameter data should be 1d or 2d, not {0}d'.format(mask.ndim)) | ||||||||
raise ValueError('Parameter data should be 1d or 2d, not ' | ||||||||
'{0}d'.format(mask.ndim)) | ||||||||
elif data.shape[0] != mask.shape[0]: | ||||||||
raise ValueError('First dimensions of data ({0}) and mask ({1}) do not ' | ||||||||
'match'.format(data.shape[0], mask.shape[0])) | ||||||||
raise ValueError('First dimensions of data ({0}) and mask ({1}) do' | ||||||||
' not match'.format(data.shape[0], mask.shape[0])) | ||||||||
mdata = data[mask, :].T | ||||||||
else: | ||||||||
mdata = data.T | ||||||||
|
@@ -144,11 +206,45 @@ def get_coeffs(data, X, mask=None, add_const=False): | |||||||
if add_const: # add intercept, if specified | ||||||||
X = np.column_stack([X, np.ones((len(X), 1))]) | ||||||||
|
||||||||
# least squares estimation: beta = (X^T * X)^(-1) * X^T * mdata | ||||||||
# betas is transposed due to backward compatibility with rest of code. | ||||||||
betas = np.linalg.lstsq(X, mdata, rcond=None)[0].T | ||||||||
|
||||||||
if compute_zvalues: | ||||||||
# compute t-values of betas (estimates) and then convert to z-values | ||||||||
# first compute number of degrees of freedom | ||||||||
df = mdata.shape[0] - X.shape[1] | ||||||||
if df == 0: | ||||||||
LGR.error('ERROR: No degrees of freedom left in least squares ' | ||||||||
'calculation. Stopping!!') | ||||||||
elif df <= min_df: | ||||||||
LGR.warning('Number of degrees of freedom in least-square ' | ||||||||
'estimation is less than {}'.format(min_df + 1)) | ||||||||
# compute sigma: | ||||||||
# RSS = sum{[mdata - (X * betas)]^2} | ||||||||
# sigma = RSS / Degrees_of_Freedom | ||||||||
sigma = np.sum(np.power(mdata - np.dot(X, betas.T), 2), axis=0) / df | ||||||||
sigma = sigma[:, np.newaxis] | ||||||||
# Compute std of betas: | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# C = (X^T * X)_ii^(-1) | ||||||||
# std(betas) = sqrt(sigma * C) | ||||||||
C = np.diag(np.linalg.pinv(np.dot(X.T, X))) | ||||||||
C = C[:, np.newaxis] | ||||||||
std_betas = np.sqrt(np.dot(sigma, C.T)) | ||||||||
z_values = t_to_z(betas / std_betas, df) | ||||||||
z_values = np.clip(z_values, -40, 40) | ||||||||
|
||||||||
if add_const: # drop beta for intercept, if specified | ||||||||
betas = betas[:, :-1] | ||||||||
if compute_zvalues: | ||||||||
z_values = z_values[:, :-1] | ||||||||
|
||||||||
if mask is not None: | ||||||||
betas = utils.unmask(betas, mask) | ||||||||
if compute_zvalues: | ||||||||
z_values = utils.unmask(z_values, mask) | ||||||||
|
||||||||
return betas | ||||||||
if compute_zvalues: | ||||||||
return betas, z_values | ||||||||
else: | ||||||||
return betas |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just recommending the docstring I use in
NiMARE
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The actual code from NiMARE might also be good to use since I improved the internal variable names.