Skip to content

Commit

Permalink
Remove the 'dhdl' method in decorrelating the u_nk (#250)
Browse files Browse the repository at this point in the history
* update

* update

* update

* Apply suggestions from code review

Co-authored-by: Zhiyi Wu <zwu@exscientia.co.uk>
Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
  • Loading branch information
3 people authored Oct 19, 2022
1 parent be3e73a commit 9da8da6
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 40 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ Fixes
"dHdl" and "u_nk" data (or None). THe AMBER parser when using this function will read the
file just once, extracting all data at once. (issue #222, PR #240)
- Fix dhdl2series and u_nk2series would not reattach the unit. (PR #248)
- Removed the 'dhdl' keyword for uncorrelating the u_nk (see `u_nk2series()`). Use 'dE'
as an alternative or use 'all' (instead of the deprecated 'dhdl_all') (PR #250).


07/22/2022 xiki-tempula, IAlibay, dotsdl, orbeckst, ptmerz

Expand Down
2 changes: 1 addition & 1 deletion docs/preprocessing/alchemlyb.preprocessing.subsampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ the autocorrection analysis. The series could be generated with
>>> dhdl2series, statistical_inefficiency, equilibrium_detection)
>>> bz = load_benzene().data
>>> u_nk = extract_u_nk(bz['Coulomb'], T=300)
>>> u_nk_series = u_nk2series(u_nk, method='dhdl')
>>> u_nk_series = u_nk2series(u_nk, method='dE')
>>> decorrelate_u_nk = statistical_inefficiency(u_nk, series=u_nk_series)
>>> decorrelate_u_nk = equilibrium_detection(u_nk, series=u_nk_series)
>>> dhdl = extract_dHdl(bz['Coulomb'], T=300)
Expand Down
42 changes: 20 additions & 22 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
"""Functions for subsampling datasets.
"""
import warnings

import pandas as pd
from pymbar.timeseries import (statisticalInefficiency,
detectEquilibration,
subsampleCorrelatedData, )
from .. import pass_attrs

def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
def decorrelate_u_nk(df, method='dE', drop_duplicates=True,
sort=True, remove_burnin=False, **kwargs):
"""Subsample a u_nk DataFrame based on the selected method.
The method can be either 'dhdl_all' (obtained as a sum over all energy
components) or 'dhdl' (obtained as the energy components that are
changing; default) or 'dE'. In the latter case the energy differences
"""Subsample an u_nk DataFrame based on the selected method.
The method can be either 'all' (obtained as a sum over all energy
components) or 'dE'. In the latter case the energy differences
dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.
This is a wrapper function around the function
:func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
Expand All @@ -22,7 +23,7 @@ def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
----------
df : DataFrame
DataFrame to be subsampled according to the selected method.
method : {'dhdl', 'dhdl_all', 'dE'}
method : {'all', 'dE'}
Method for decorrelating the data.
drop_duplicates : bool
Drop the duplicated lines based on time.
Expand All @@ -48,6 +49,7 @@ def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
.. versionadded:: 0.6.0
.. versionchanged:: 1.0.0
Add the remove_burnin keyword to allow unequilibrated frames to be removed.
Rename value 'dhdl_all' to 'all' and deprecate the 'dhdl'.
"""
kwargs['drop_duplicates'] = drop_duplicates
kwargs['sort'] = sort
Expand Down Expand Up @@ -108,20 +110,19 @@ def decorrelate_dhdl(df, drop_duplicates=True, sort=True,
return statistical_inefficiency(df, series, **kwargs)

@pass_attrs
def u_nk2series(df, method='dhdl'):
def u_nk2series(df, method='dE'):
"""Convert an u_nk DataFrame into a series based on the selected method
for subsampling.
The method can be either 'dhdl_all' (obtained as a sum over all energy
components) or 'dhdl' (obtained as the energy components that are
changing; default) or 'dE'. In the latter case the energy differences
The method can be either 'all' (obtained as a sum over all energy
components) or 'dE'. In the latter case the energy differences
dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.
Parameters
----------
df : DataFrame
DataFrame to be converted according to the selected method.
method : {'dhdl', 'dhdl_all', 'dE'}
method : {'all', 'dE'}
Method for converting the data.
Returns
Expand All @@ -136,6 +137,13 @@ def u_nk2series(df, method='dhdl'):
.. versionadded:: 1.0.0
"""
# Check if the input is u_nk
if method == 'dhdl': # pragma: no cover
warnings.warn(DeprecationWarning("Method 'dhdl' has been deprecated, using 'dE' instead"))
method = 'dE'
elif method == 'dhdl_all': # pragma: no cover
warnings.warn(DeprecationWarning("Method 'dhdl_all' has been deprecated, using 'all' instead"))
method = 'all'

try:
key = df.index.values[0][1:]
if len(key) == 1:
Expand All @@ -144,17 +152,7 @@ def u_nk2series(df, method='dhdl'):
except KeyError:
raise ValueError('The input should be u_nk')

if method == 'dhdl':
# Find the current column index
# Select the first row and remove the first column (Time)
key = df.index.values[0][1:]
if len(key) > 1:
# Multiple keys
series = df[key]
else:
# Single key
series = df[key[0]]
elif method == 'dhdl_all':
if method == 'all':
series = df.sum(axis=1)
elif method == 'dE':
# Using the same logic as alchemical-analysis
Expand Down
10 changes: 4 additions & 6 deletions src/alchemlyb/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,26 +420,24 @@ def test_equilibrium_detection(self, dhdl):
assert new_dhdl.attrs['temperature'] == 310
assert new_dhdl.attrs['energy_unit'] == 'kT'

@pytest.mark.parametrize(('method', 'size'), [('dhdl', 2001),
('dhdl_all', 2001),
@pytest.mark.parametrize(('method', 'size'), [('all', 2001),
('dE', 2001)])
def test_decorrelate_u_nk_single_l(gmx_benzene_u_nk_fixture, method, size):
assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method=method,
drop_duplicates=True,
sort=True)) == size

def test_decorrelate_u_nk_burnin(gmx_benzene_u_nk_fixture):
assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method='dhdl',
assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method='dE',
drop_duplicates=True,
sort=True, remove_burnin=True)) == 2743
sort=True, remove_burnin=True)) == 2849

def test_decorrelate_dhdl_burnin(gmx_benzene_dHdl_fixture):
assert len(decorrelate_dhdl(gmx_benzene_dHdl_fixture,
drop_duplicates=True,
sort=True, remove_burnin=True)) == 2848

@pytest.mark.parametrize(('method', 'size'), [('dhdl', 501),
('dhdl_all', 1001),
@pytest.mark.parametrize(('method', 'size'), [('all', 1001),
('dE', 334)])
def test_decorrelate_u_nk_multiple_l(gmx_ABFE_u_nk, method, size):
assert len(decorrelate_u_nk(gmx_ABFE_u_nk, method=method,)) == size
Expand Down
6 changes: 3 additions & 3 deletions src/alchemlyb/tests/test_workflow_ABFE.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def workflow(tmp_path_factory):
dir = os.path.dirname(load_ABFE()['data']['complex'][0])
workflow = ABFE(units='kcal/mol', software='GROMACS', dir=dir,
prefix='dhdl', suffix='xvg', T=310, outdirectory=str(outdir))
workflow.run(skiptime=10, uncorr='dhdl', threshold=50,
workflow.run(skiptime=10, uncorr='dE', threshold=50,
estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
breakdown=True, forwrev=10)
return workflow
Expand Down Expand Up @@ -94,7 +94,7 @@ def workflow(tmp_path_factory):
suffix='xvg', T=310, outdirectory=str(outdir))
workflow.update_units('kcal/mol')
workflow.read()
workflow.preprocess(skiptime=10, uncorr='dhdl', threshold=50)
workflow.preprocess(skiptime=10, uncorr='dE', threshold=50)
workflow.estimate(estimators=('MBAR', 'BAR', 'TI'))
workflow.plot_overlap_matrix(overlap='O_MBAR.pdf')
workflow.plot_ti_dhdl(dhdl_TI='dhdl_TI.pdf')
Expand Down Expand Up @@ -161,7 +161,7 @@ def workflow(tmp_path_factory):
workflow = ABFE(units='kcal/mol', software='GROMACS', dir=dir,
prefix='dhdl', suffix='bz2', T=310,
outdirectory=outdir)
workflow.run(skiptime=0, uncorr='dhdl', threshold=50,
workflow.run(skiptime=0, uncorr='dE', threshold=50,
estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
breakdown=True, forwrev=10)
return workflow
Expand Down
12 changes: 4 additions & 8 deletions src/alchemlyb/workflows/abfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def read(self, read_u_nk=True, read_dHdl=True):
self.dHdl_list = []


def run(self, skiptime=0, uncorr='dhdl', threshold=50,
def run(self, skiptime=0, uncorr='dE', threshold=50,
estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
breakdown=True, forwrev=None, *args, **kwargs):
''' The method for running the automatic analysis.
Expand All @@ -178,9 +178,7 @@ def run(self, skiptime=0, uncorr='dhdl', threshold=50,
Discard data prior to this specified time as 'equilibration' data.
Units are specified by the corresponding MD Engine. Default: 0.
uncorr : str
The observable to be used for the autocorrelation analysis; 'dhdl'
(obtained as a sum over those energy components that are changing).
Specify as `None` will not uncorrelate the data. Default: 'dhdl'.
The observable to be used for the autocorrelation analysis; 'dE'.
threshold : int
Proceed with correlated samples if the number of uncorrelated samples is
found to be less than this number. If 0 is given, the time series
Expand Down Expand Up @@ -273,7 +271,7 @@ def update_units(self, units=None):
self.logger.info(f'Set unit to {units}.')
self.units = units or None

def preprocess(self, skiptime=0, uncorr='dhdl', threshold=50):
def preprocess(self, skiptime=0, uncorr='dE', threshold=50):
'''Preprocess the data by removing the equilibration time and
decorrelate the date.
Expand All @@ -283,9 +281,7 @@ def preprocess(self, skiptime=0, uncorr='dhdl', threshold=50):
Discard data prior to this specified time as 'equilibration' data.
Units are specified by the corresponding MD Engine. Default: 0.
uncorr : str
The observable to be used for the autocorrelation analysis; 'dhdl'
(obtained as a sum over those energy components that are changing).
Default: 'dhdl'
The observable to be used for the autocorrelation analysis; 'dE'.
threshold : int
Proceed with correlated samples if the number of uncorrelated
samples is found to be less than this number. If 0 is given, the
Expand Down

0 comments on commit 9da8da6

Please sign in to comment.