diff --git a/CHANGES b/CHANGES index a6fad7df..d235ba1b 100644 --- a/CHANGES +++ b/CHANGES @@ -41,6 +41,8 @@ Enhancements - Add remove_burnin keyword to decorrelate_u_nk and decorrelate_dhdl (PR #218). - Add a base class for workflows (PR #188). - Add the ABFE workflow (PR #114). + - Add R_c and A_c for "fractional equilibration time" convergence analysis + (issue #104, PR #239) - Add the keyword arg final_error to plot_convergence (#249) diff --git a/docs/convergence.rst b/docs/convergence.rst index 55a4e270..34dda71f 100644 --- a/docs/convergence.rst +++ b/docs/convergence.rst @@ -33,6 +33,52 @@ Will give a plot looks like this A convergence plot of showing that the forward and backward has converged fully. +Fractional equilibration time +----------------------------- + +Another way of assessing whether the simulation has converged is to check the +energy files. In [Fan2021]_, :math:`R_c` and +:math:`A_c` are two criteria of checking the +convergence. :func:`~alchemlyb.convergence.fwdrev_cumavg_Rc` takes a decorrelated +:class:`pandas.Series` as input and gives the metric +:math:`R_c`, which is 0 for fully-equilibrated +simulation and 1 for fully-unequilibrated simulation. :: + + >>> from alchemtest.gmx import load_ABFE + >>> from alchemlyb.parsing.gmx import extract_dHdl + >>> from alchemlyb.preprocessing import decorrelate_dhdl, dhdl2series + >>> from alchemlyb.convergence import fwdrev_cumavg_Rc + >>> from alchemlyb.visualisation import plot_convergence + + >>> file = load_ABFE().data['ligand'][0] + >>> dhdl = extract_dHdl(file, T=300) + >>> decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True) + >>> R_c, running_average = fwdrev_cumavg_Rc(dhdl2series(decorrelated), tol=2) + >>> print(R_c) + 0.04 + >>> ax = plot_convergence(running_average, final_error=2) + >>> ax.set_ylabel("$\partial H/\partial\lambda$ (in kT)") + + +Will give a plot like this. + +.. figure:: images/R_c.png + + +The :func:`~alchemlyb.convergence.A_c` on the other hand, takes in a list of +decorrelated :class:`pandas.Series` and gives a metric of how converged the set +is, where 0 fully-unequilibrated and 1.0 is fully-equilibrated. :: + + >>> from alchemlyb.convergence import A_c + >>> dhdl_list = [] + >>> for file in load_ABFE().data['ligand']: + >>> dhdl = extract_dHdl(file, T=300) + >>> decorrelated = decorrelate_dhdl(dhdl, remove_burnin=True) + >>> decorrelated = dhdl2series(decorrelated) + >>> dhdl_list.append(decorrelated) + >>> value = A_c(dhdl_list) + 0.7085 + Convergence functions --------------------- @@ -44,5 +90,14 @@ The currently available connvergence functions: .. autosummary:: :toctree: convergence - convergence + forward_backward_convergence + fwdrev_cumavg_Rc + A_c + +References +---------- +.. [Fan2021] Fan, S., Nedev, H., Vijayan, R., Iorga, B.I., and Beckstein, O. + (2021). Precise force-field-based calculations of octanol-water partition + coefficients for the SAMPL7 molecules. Journal of Computer-Aided Molecular + Design 35, 853–87 diff --git a/docs/images/R_c.png b/docs/images/R_c.png new file mode 100644 index 00000000..8b249225 Binary files /dev/null and b/docs/images/R_c.png differ diff --git a/src/alchemlyb/convergence/__init__.py b/src/alchemlyb/convergence/__init__.py index e8dd32b3..d829581f 100644 --- a/src/alchemlyb/convergence/__init__.py +++ b/src/alchemlyb/convergence/__init__.py @@ -1 +1 @@ -from .convergence import forward_backward_convergence +from .convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c diff --git a/src/alchemlyb/convergence/convergence.py b/src/alchemlyb/convergence/convergence.py index bc6bbdf3..bfe7402a 100644 --- a/src/alchemlyb/convergence/convergence.py +++ b/src/alchemlyb/convergence/convergence.py @@ -7,6 +7,7 @@ from ..estimators import BAR, TI, FEP_ESTIMATORS, TI_ESTIMATORS from ..estimators import AutoMBAR as MBAR from .. import concat +from ..postprocessors.units import to_kT def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs): @@ -63,7 +64,7 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs): .. versionadded:: 0.6.0 .. versionchanged:: 1.0.0 The ``estimator`` accepts uppercase input. - The default for using ``estimator='MBAR'`` was changed from + The default for using ``estimator='MBAR'`` was changed from :class:`~alchemlyb.estimators.MBAR` to :class:`~alchemlyb.estimators.AutoMBAR`. ''' @@ -137,3 +138,175 @@ def forward_backward_convergence(df_list, estimator='MBAR', num=10, **kwargs): 'data_fraction': [i / num for i in range(1, num + 1)]}) convergence.attrs = df_list[0].attrs return convergence + +def _cummean(vals, out_length): + '''The cumulative mean of an array. + + This function computes the cumulative mean and shapes the result to the + desired length. + + Parameters + ---------- + vals : numpy.array + The one-dimensional input array. + out_length : int + The length of the output array. + + Returns + ------- + numpy.array + The one-dimensional input array with length of total. + + Note + ---- + If the length of the input series is smaller than the ``out_length``, the + length of the output array is the same as the input series. + + + .. versionadded:: 1.0.0 + + ''' + in_length = len(vals) + if in_length < out_length: + out_length = in_length + block = in_length // out_length + reshape = vals[: block*out_length].reshape(block, out_length) + mean = np.mean(reshape, axis=0) + result = np.cumsum(mean) / np.arange(1, out_length+1) + return result + +def fwdrev_cumavg_Rc(series, precision=0.01, tol=2): + '''Generate the convergence criteria :math:`R_c` for a single simulation. + + The input will be pandas.Series generated by + :func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or + :func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`. + + The output will the float :math:`R_c`. :math:`R_c = 0` indicates that the system is well + equilibrated right from the beginning while :math:`R_c = 1` signifies that the + whole trajectory is not equilibrated. + + Parameters + ---------- + series : pandas.Series + The input energy array. + precision : float + The precision of the output :math:`R_c`. To speed the calculation up, the data + has been block-averaged before doing the calculation, the size of the + block is controlled by the desired precision. + tol : float + Tolerance of the convergence check in :math:`kT`. + + Returns + ------- + float + Convergence time fraction. + DataFrame + The DataFrame with moving average. :: + + Forward Backward data_fraction + 0 3.016442 3.065176 0.1 + 1 3.078106 3.078567 0.2 + 2 3.072561 3.047357 0.3 + 3 3.048325 3.057527 0.4 + 4 3.049769 3.037454 0.5 + 5 3.034078 3.040484 0.6 + 6 3.043274 3.032495 0.7 + 7 3.035460 3.036670 0.8 + 8 3.042032 3.046597 0.9 + 9 3.044149 3.044385 1.0 + + Note + ---- + This function computes :math:`R_c` from equation 16 from + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD16. The code is + modified based on Shujie Fan's (@VOD555) work. + + Please cite [Fan2021]_ when using this function. + + + .. versionadded:: 1.0.0 + ''' + series = to_kT(series) + array = series.to_numpy() + out_length = int(1 / precision) + g_forward = _cummean(array, out_length) + g_backward = _cummean(array[::-1], out_length) + length = len(g_forward) + + convergence = pd.DataFrame( + {'Forward': g_forward, + 'Backward': g_backward, + 'data_fraction': [i / length for i in range(1, length + 1)]}) + convergence.attrs = series.attrs + + # Final value + g = g_forward[-1] + conv = np.logical_and(np.abs(g_forward - g) < tol, np.abs(g_backward - g) < tol) + for i in range(out_length): + if all(conv[i:]): + return i / length, convergence + else: + # This branch exists as we are truncating the dataset to speed the + # calculation up. For example if the dataset has 21 points and the + # precision is 0.05. The last point of g_forward is computed using + # data[0:20] while the last point of g_backward is computed using + # data[1:21]. Thus, the last point of g_forward and g_backward are not + # the same as this branch will be triggered. + return 1.0, convergence + +def A_c(series_list, precision=0.01, diff=2): + '''Generate the ensemble convergence criteria :math:`A_c` for a set of simulations. + + The input is a list of pandas.Series generated by + :func:`~alchemlyb.preprocessing.subsampling.decorrelate_u_nk` or + :func:`~alchemlyb.preprocessing.subsampling.decorrelate_dhdl`. + + The output will the float :math:`A_c`. :math:`A_c` is a number between 0 and 1 that can be + interpreted as the ratio of the total equilibrated simulation time to the + whole simulation time for a full set of simulations. :math:`A_c = 1` means that all + simulation time frames in all windows can be considered equilibrated, while + :math:`A_c = 0` indicates that nothing is equilibrated. + + Parameters + ---------- + series_list : list + A list of pandas.Series energy array. + precision : float + The precision of the output :math:`A_c`. To speed the calculation up, the data + has been block-averaged before doing the calculation, the size of the + block is controlled by the desired precision. + diff : float + Tolerance of the convergence check in :math:`kT`. + + Returns + ------- + float + The area under curve for convergence time fraction. + + Note + ---- + This function computes :math:`A_c` from equation 18 from + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8397498/#FD18. See also + :func:`~alchemlyb.convergence.fwdrev_cumavg_Rc`. + + Please cite [Fan2021]_ when using this function. + + + .. versionadded:: 1.0.0 + ''' + logger = logging.getLogger('alchemlyb.convergence.A_c') + n_R_c = len(series_list) + R_c_list = [fwdrev_cumavg_Rc(series, precision, diff)[0] for series in series_list] + logger.info(f'R_c list: {R_c_list}') + # Integrate the R_c_list <= R_c over the range of 0 to 1 + array_01 = np.hstack((R_c_list, [0, 1])) + sorted_array = np.sort(np.unique(array_01)) + result = 0 + for i, element in enumerate(sorted_array[::-1]): + if i == 0: + continue + else: + d_R_c = sorted_array[-i] - sorted_array[-i-1] + result += d_R_c * sum(R_c_list <= element) / n_R_c + return result diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index c9aef56d..a58fe01e 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -494,9 +494,9 @@ def equilibrium_detection(df, series=None, lower=None, upper=None, step=None, .. versionchanged:: 1.0.0 - Add the drop_duplicates and sort keyword to unify the behaviour between - :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or - :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`. + Add the drop_duplicates and sort keyword to unify the behaviour between + :func:`~alchemlyb.preprocessing.subsampling.statistical_inefficiency` or + :func:`~alchemlyb.preprocessing.subsampling.equilibrium_detection`. """ df, series = _prepare_input(df, series, drop_duplicates, sort) diff --git a/src/alchemlyb/tests/test_convergence.py b/src/alchemlyb/tests/test_convergence.py index 6520713c..91f70fb1 100644 --- a/src/alchemlyb/tests/test_convergence.py +++ b/src/alchemlyb/tests/test_convergence.py @@ -1,8 +1,12 @@ +import numpy as np +import pandas as pd import pytest from alchemtest.gmx import load_benzene from alchemlyb.parsing import gmx -from alchemlyb.convergence import forward_backward_convergence +from alchemlyb.convergence import forward_backward_convergence, fwdrev_cumavg_Rc, A_c +from alchemlyb.convergence.convergence import _cummean + @pytest.fixture() def gmx_benzene(): @@ -60,3 +64,46 @@ def test_convergence_method(gmx_benzene): dHdl, u_nk = gmx_benzene convergence = forward_backward_convergence(u_nk, 'MBAR', num=2, method='adaptive') assert len(convergence) == 2 + +def test_cummean_short(): + '''Test the case where the input is shorter than the expected output''' + value = _cummean(np.empty(10), 100) + assert len(value) == 10 + +def test_cummean_long(): + '''Test the case where the input is longer than the expected output''' + value = _cummean(np.empty(20), 10) + assert len(value) == 10 + +def test_cummean_long_none_integter(): + '''Test the case where the input is not a integer multiple of the expected output''' + value = _cummean(np.empty(25), 10) + assert len(value) == 10 + +def test_R_c_converged(): + data = pd.Series(data=[0,]*100) + data.attrs['temperature'] = 310 + data.attrs['energy_unit'] = 'kcal/mol' + value, running_average = fwdrev_cumavg_Rc(data) + np.testing.assert_allclose(value, 0.0) + +def test_R_c_notconverged(): + data = pd.Series(data=range(21)) + data.attrs['temperature'] = 310 + data.attrs['energy_unit'] = 'kcal/mol' + value, running_average = fwdrev_cumavg_Rc(data, tol=0.1, precision=0.05) + np.testing.assert_allclose(value, 1.0) + +def test_R_c_real(): + data = pd.Series(data=np.hstack((range(10), [4.5,]*10))) + data.attrs['temperature'] = 310 + data.attrs['energy_unit'] = 'kcal/mol' + value, running_average = fwdrev_cumavg_Rc(data) + np.testing.assert_allclose(value, 0.35) + +def test_A_c_real(): + data = pd.Series(data=np.hstack((range(10), [4.5,]*10))) + data.attrs['temperature'] = 310 + data.attrs['energy_unit'] = 'kcal/mol' + value = A_c([data, ] * 2) + np.testing.assert_allclose(value, 0.65)