Remove the 'dhdl' method in decorrelating the u_nk (#250)

* update * update * update * Apply suggestions from code review Co-authored-by: Zhiyi Wu <zwu@exscientia.co.uk> Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
alchemistry · Oct 19, 2022 · 9da8da6 · 9da8da6
1 parent be3e73a
commit 9da8da6
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 40 deletions.
diff --git a/CHANGES b/CHANGES
@@ -49,6 +49,9 @@ Fixes
     "dHdl" and "u_nk" data (or None). THe AMBER parser when using this function will read the 
     file just once, extracting all data at once. (issue #222, PR #240)
   - Fix dhdl2series and u_nk2series would not reattach the unit. (PR #248)
+  - Removed the 'dhdl' keyword for uncorrelating the u_nk (see `u_nk2series()`). Use 'dE'
+    as an alternative or use 'all' (instead of the deprecated 'dhdl_all')  (PR #250).
+
 
 07/22/2022 xiki-tempula, IAlibay, dotsdl, orbeckst, ptmerz
 

diff --git a/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst b/docs/preprocessing/alchemlyb.preprocessing.subsampling.rst
@@ -43,7 +43,7 @@ the autocorrection analysis. The series could be generated with
     >>>     dhdl2series, statistical_inefficiency, equilibrium_detection)
     >>> bz = load_benzene().data
     >>> u_nk = extract_u_nk(bz['Coulomb'], T=300)
-    >>> u_nk_series = u_nk2series(u_nk, method='dhdl')
+    >>> u_nk_series = u_nk2series(u_nk, method='dE')
     >>> decorrelate_u_nk = statistical_inefficiency(u_nk, series=u_nk_series)
     >>> decorrelate_u_nk = equilibrium_detection(u_nk, series=u_nk_series)
     >>> dhdl = extract_dHdl(bz['Coulomb'], T=300)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
@@ -1,18 +1,19 @@
 """Functions for subsampling datasets.
 
 """
+import warnings
+
 import pandas as pd
 from pymbar.timeseries import (statisticalInefficiency,
                                detectEquilibration,
                                subsampleCorrelatedData, )
 from .. import pass_attrs
 
-def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
+def decorrelate_u_nk(df, method='dE', drop_duplicates=True,
                      sort=True, remove_burnin=False, **kwargs):
-    """Subsample a u_nk DataFrame based on the selected method.
-    The method can be either 'dhdl_all' (obtained as a sum over all energy
-    components) or 'dhdl' (obtained as the energy components that are
-    changing; default) or 'dE'. In the latter case the energy differences
+    """Subsample an u_nk DataFrame based on the selected method.
+    The method can be either 'all' (obtained as a sum over all energy
+    components) or 'dE'. In the latter case the energy differences
     dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.
     This is a wrapper function around the function
     :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or
@@ -22,7 +23,7 @@ def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
     ----------
     df : DataFrame
         DataFrame to be subsampled according to the selected method.
-    method : {'dhdl', 'dhdl_all', 'dE'}
+    method : {'all', 'dE'}
         Method for decorrelating the data.
     drop_duplicates : bool
         Drop the duplicated lines based on time.
@@ -48,6 +49,7 @@ def decorrelate_u_nk(df, method='dhdl', drop_duplicates=True,
     .. versionadded:: 0.6.0
     .. versionchanged:: 1.0.0
        Add the remove_burnin keyword to allow unequilibrated frames to be removed.
+       Rename value 'dhdl_all' to 'all' and deprecate the 'dhdl'.
     """
     kwargs['drop_duplicates'] = drop_duplicates
     kwargs['sort'] = sort
@@ -108,20 +110,19 @@ def decorrelate_dhdl(df, drop_duplicates=True, sort=True,
         return statistical_inefficiency(df, series, **kwargs)
 
 @pass_attrs
-def u_nk2series(df, method='dhdl'):
+def u_nk2series(df, method='dE'):
     """Convert an u_nk DataFrame into a series based on the selected method
     for subsampling.
 
-    The method can be either 'dhdl_all' (obtained as a sum over all energy
-    components) or 'dhdl' (obtained as the energy components that are
-    changing; default) or 'dE'. In the latter case the energy differences
+    The method can be either 'all' (obtained as a sum over all energy
+    components) or 'dE'. In the latter case the energy differences
     dE_{i,i+1} (dE_{i,i-1} for the last lambda) are used.
 
     Parameters
     ----------
     df : DataFrame
         DataFrame to be converted according to the selected method.
-    method : {'dhdl', 'dhdl_all', 'dE'}
+    method : {'all', 'dE'}
         Method for converting the data.
 
     Returns
@@ -136,6 +137,13 @@ def u_nk2series(df, method='dhdl'):
     .. versionadded:: 1.0.0
     """
     # Check if the input is u_nk
+    if method == 'dhdl': # pragma: no cover
+        warnings.warn(DeprecationWarning("Method 'dhdl' has been deprecated, using 'dE' instead"))
+        method = 'dE'
+    elif method == 'dhdl_all': # pragma: no cover
+        warnings.warn(DeprecationWarning("Method 'dhdl_all' has been deprecated, using 'all' instead"))
+        method = 'all'
+
     try:
         key = df.index.values[0][1:]
         if len(key) == 1:
@@ -144,17 +152,7 @@ def u_nk2series(df, method='dhdl'):
     except KeyError:
         raise ValueError('The input should be u_nk')
 
-    if method == 'dhdl':
-        # Find the current column index
-        # Select the first row and remove the first column (Time)
-        key = df.index.values[0][1:]
-        if len(key) > 1:
-            # Multiple keys
-            series = df[key]
-        else:
-            # Single key
-            series = df[key[0]]
-    elif method == 'dhdl_all':
+    if method == 'all':
         series = df.sum(axis=1)
     elif method == 'dE':
         # Using the same logic as alchemical-analysis

diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py
@@ -420,26 +420,24 @@ def test_equilibrium_detection(self, dhdl):
         assert new_dhdl.attrs['temperature'] == 310
         assert new_dhdl.attrs['energy_unit'] == 'kT'
 
-@pytest.mark.parametrize(('method', 'size'), [('dhdl', 2001),
-                                              ('dhdl_all', 2001),
+@pytest.mark.parametrize(('method', 'size'), [('all', 2001),
                                               ('dE', 2001)])
 def test_decorrelate_u_nk_single_l(gmx_benzene_u_nk_fixture, method, size):
     assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method=method,
                                 drop_duplicates=True,
                                 sort=True)) == size
 
 def test_decorrelate_u_nk_burnin(gmx_benzene_u_nk_fixture):
-    assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method='dhdl',
+    assert len(decorrelate_u_nk(gmx_benzene_u_nk_fixture, method='dE',
                                 drop_duplicates=True,
-                                sort=True, remove_burnin=True)) == 2743
+                                sort=True, remove_burnin=True)) == 2849
 
 def test_decorrelate_dhdl_burnin(gmx_benzene_dHdl_fixture):
     assert len(decorrelate_dhdl(gmx_benzene_dHdl_fixture,
                                 drop_duplicates=True,
                                 sort=True, remove_burnin=True)) == 2848
 
-@pytest.mark.parametrize(('method', 'size'), [('dhdl', 501),
-                                              ('dhdl_all', 1001),
+@pytest.mark.parametrize(('method', 'size'), [('all', 1001),
                                               ('dE', 334)])
 def test_decorrelate_u_nk_multiple_l(gmx_ABFE_u_nk, method, size):
     assert len(decorrelate_u_nk(gmx_ABFE_u_nk, method=method,)) == size

diff --git a/src/alchemlyb/tests/test_workflow_ABFE.py b/src/alchemlyb/tests/test_workflow_ABFE.py
@@ -17,7 +17,7 @@ def workflow(tmp_path_factory):
         dir = os.path.dirname(load_ABFE()['data']['complex'][0])
         workflow = ABFE(units='kcal/mol', software='GROMACS', dir=dir,
                         prefix='dhdl', suffix='xvg', T=310, outdirectory=str(outdir))
-        workflow.run(skiptime=10, uncorr='dhdl', threshold=50,
+        workflow.run(skiptime=10, uncorr='dE', threshold=50,
                      estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
                      breakdown=True, forwrev=10)
         return workflow
@@ -94,7 +94,7 @@ def workflow(tmp_path_factory):
                         suffix='xvg', T=310, outdirectory=str(outdir))
         workflow.update_units('kcal/mol')
         workflow.read()
-        workflow.preprocess(skiptime=10, uncorr='dhdl', threshold=50)
+        workflow.preprocess(skiptime=10, uncorr='dE', threshold=50)
         workflow.estimate(estimators=('MBAR', 'BAR', 'TI'))
         workflow.plot_overlap_matrix(overlap='O_MBAR.pdf')
         workflow.plot_ti_dhdl(dhdl_TI='dhdl_TI.pdf')
@@ -161,7 +161,7 @@ def workflow(tmp_path_factory):
         workflow = ABFE(units='kcal/mol', software='GROMACS', dir=dir,
                         prefix='dhdl', suffix='bz2', T=310,
                         outdirectory=outdir)
-        workflow.run(skiptime=0, uncorr='dhdl', threshold=50,
+        workflow.run(skiptime=0, uncorr='dE', threshold=50,
                         estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
                         breakdown=True, forwrev=10)
         return workflow

diff --git a/src/alchemlyb/workflows/abfe.py b/src/alchemlyb/workflows/abfe.py
@@ -167,7 +167,7 @@ def read(self, read_u_nk=True, read_dHdl=True):
             self.dHdl_list = []
 
 
-    def run(self, skiptime=0, uncorr='dhdl', threshold=50,
+    def run(self, skiptime=0, uncorr='dE', threshold=50,
             estimators=('MBAR', 'BAR', 'TI'), overlap='O_MBAR.pdf',
             breakdown=True, forwrev=None, *args, **kwargs):
         ''' The method for running the automatic analysis.
@@ -178,9 +178,7 @@ def run(self, skiptime=0, uncorr='dhdl', threshold=50,
             Discard data prior to this specified time as 'equilibration' data.
             Units are specified by the corresponding MD Engine. Default: 0.
         uncorr : str
-            The observable to be used for the autocorrelation analysis; 'dhdl'
-            (obtained as a sum over those energy components that are changing).
-            Specify as `None` will not uncorrelate the data. Default: 'dhdl'.
+            The observable to be used for the autocorrelation analysis; 'dE'.
         threshold : int
             Proceed with correlated samples if the number of uncorrelated samples is
             found to be less than this number. If 0 is given, the time series
@@ -273,7 +271,7 @@ def update_units(self, units=None):
             self.logger.info(f'Set unit to {units}.')
             self.units = units or None
 
-    def preprocess(self, skiptime=0, uncorr='dhdl', threshold=50):
+    def preprocess(self, skiptime=0, uncorr='dE', threshold=50):
         '''Preprocess the data by removing the equilibration time and
         decorrelate the date.
 
@@ -283,9 +281,7 @@ def preprocess(self, skiptime=0, uncorr='dhdl', threshold=50):
             Discard data prior to this specified time as 'equilibration' data.
             Units are specified by the corresponding MD Engine. Default: 0.
         uncorr : str
-            The observable to be used for the autocorrelation analysis; 'dhdl'
-            (obtained as a sum over those energy components that are changing).
-            Default: 'dhdl'
+            The observable to be used for the autocorrelation analysis; 'dE'.
         threshold : int
             Proceed with correlated samples if the number of uncorrelated
             samples is found to be less than this number. If 0 is given, the